Multiple Regression in \(R\)

We’ll use the Credit data frame from the ISLR package to demonstrate multiple regression with:

A numerical outcome variable \(y\), in this case credit card balance.

Two explanatory variables:

  1. A first numerical explanatory variable \(x_1\): the cardholder’s credit limit.

  2. A second numerical explanatory variable \(x_2\): the cardholder’s income (in thousands of dollars).

Note: This dataset is not based on actual individuals; it is simulated data used for educational purposes.

Analogous to simple linear regression, where the regression function \(E\{Y\} = \beta_0 + \beta_1X_1\) is a line, the multiple regression function \(E\{Y\} = \beta_0 + \beta_1X_1 + \beta_2X_2\) is a plane.

library(ISLR)
library(plotly)

data(Credit)
head(Credit)
  ID  Income Limit Rating Cards Age Education Gender Student Married Ethnicity
1  1  14.891  3606    283     2  34        11   Male      No     Yes Caucasian
2  2 106.025  6645    483     3  82        15 Female     Yes     Yes     Asian
3  3 104.593  7075    514     4  71        11   Male      No      No     Asian
4  4 148.924  9504    681     3  36        11 Female      No      No     Asian
5  5  55.882  4897    357     2  68        16   Male      No     Yes Caucasian
6  6  80.180  8047    569     4  77        10   Male      No      No Caucasian
  Balance
1     333
2     903
3     580
4     964
5     331
6    1151
# draw 3D scatterplot
p <- plot_ly(data = Credit, z = ~Balance, x = ~Income, y = ~Limit, opacity = 0.6, color = ~Balance) %>%
  add_markers()
p
# Fit a multiple linear regression model
# Syntax: model_name <- lm(y ~ x1 + x2 + ... + xn, data = data_frame_name)

Balance_model <- lm(Balance ~ Limit + Income, data = Credit)
Balance_model

Call:
lm(formula = Balance ~ Limit + Income, data = Credit)

Coefficients:
(Intercept)        Limit       Income  
  -385.1793       0.2643      -7.6633  
range(Credit$Limit)
[1]   855 13913
range(Credit$Income)
[1]  10.354 186.634
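
The ranges above delimit the region of predictor values over which the fitted plane \(\hat{y} = -385.18 + 0.2643\,\mbox{Limit} - 7.6633\,\mbox{Income}\) is supported by data. As a minimal sketch (the 50-point grid is an arbitrary choice), the plane can be overlaid on the 3D scatterplot using plotly's add_surface():

bhat <- coef(Balance_model)
lim_seq <- seq(855, 13913, length.out = 50)      # grid over the range of Limit
inc_seq <- seq(10.354, 186.634, length.out = 50) # grid over the range of Income
# plotly surfaces expect z as a matrix with rows indexed by y (Limit)
# and columns indexed by x (Income)
z_grid <- outer(lim_seq, inc_seq, function(lim, inc)
  bhat["(Intercept)"] + bhat["Limit"] * lim + bhat["Income"] * inc)
plot_ly() %>%
  add_markers(data = Credit, x = ~Income, y = ~Limit, z = ~Balance, opacity = 0.6) %>%
  add_surface(x = inc_seq, y = lim_seq, z = z_grid, showscale = FALSE)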

Scatterplot matrix for the Dwaine Studios example

library(ggplot2)
library(plotly)
library(GGally)
DwaineData <- read.table("https://people.stat.sc.edu/Hitchcock/studiodata.txt", sep ="" , header = FALSE)


colnames(DwaineData) <- c("TARGTPOP", "DISPOINC", "SALES")
head(DwaineData)
##   TARGTPOP DISPOINC SALES
## 1     68.5     16.7 174.4
## 2     45.2     16.8 164.4
## 3     91.3     18.2 244.2
## 4     47.8     16.3 154.6
## 5     46.9     17.3 181.6
## 6     66.1     18.2 207.5
dim(DwaineData)
## [1] 21  3
#pairs(DwaineData) # Default Scatterplot matrix. Simple to make

p <- ggpairs(DwaineData, title="Scatterplot matrix")
ggplotly(p)

Correlation matrix for the Dwaine Studios example

cor(DwaineData)
##           TARGTPOP  DISPOINC     SALES
## TARGTPOP 1.0000000 0.7812993 0.9445543
## DISPOINC 0.7812993 1.0000000 0.8358025
## SALES    0.9445543 0.8358025 1.0000000

Correlation Test for Normality
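
Kutner et al.'s correlation test for normality correlates each residual with its expected value under normality, \(\sqrt{MSE}\, z\left[\frac{k - 0.375}{n + 0.25}\right]\) for the \(k\)th smallest residual. Here is a minimal sketch for the Dwaine Studios model; compare the resulting \(r\) with the critical value from Table B.6 of the text:

Dwaine_model <- lm(SALES ~ TARGTPOP + DISPOINC, data = DwaineData)
e <- resid(Dwaine_model)
n <- length(e)
MSE <- sum(e^2) / df.residual(Dwaine_model)
# expected value of each residual under normality, matched by rank
exp_e <- sqrt(MSE) * qnorm((rank(e) - 0.375) / (n + 0.25))
cor(e, exp_e) # values near 1 are consistent with normal errors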

Breusch-Pagan Test for Constancy of Error Variance
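
A sketch using bptest() from the lmtest package (this assumes lmtest is installed). Setting studentize = FALSE reproduces the classical Breusch-Pagan statistic used in Kutner et al.; the default studentized version is more robust:

library(lmtest)
Dwaine_model <- lm(SALES ~ TARGTPOP + DISPOINC, data = DwaineData)
bptest(Dwaine_model, studentize = FALSE) # H0: error variance is constant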

F test for lack of fit

# F test for lack of fit

#Reduced <- lm(SALES ~ TARGTPOP + DISPOINC, data = DwaineData) # This is our usual model
#summary(Reduced)

#Full <-  lm(SALES ~ factor(TARGTPOP)*factor(DISPOINC), data = DwaineData) # This is the full model
#summary(Full)

#anova (Reduced, Full)
# Problem 5 from exercises

p5Data <- read.table("http://www.cnachtsheim-text.csom.umn.edu/Kutner/Chapter%20%206%20Data%20Sets/CH06PR05.txt", sep = "", header = FALSE)

colnames(p5Data) <- c("y", "x1", "x2")
p5Data
     y x1 x2
1   64  4  2
2   73  4  4
3   61  4  2
4   76  4  4
5   72  6  2
6   80  6  4
7   71  6  2
8   83  6  4
9   83  8  2
10  89  8  4
11  86  8  2
12  93  8  4
13  88 10  2
14  95 10  4
15  94 10  2
16 100 10  4
dim(p5Data)
[1] 16  3
Reduced <- lm(y ~ x1 + x2, data = p5Data) # reduced model: the usual first-order regression
summary(Reduced)

Call:
lm(formula = y ~ x1 + x2, data = p5Data)

Residuals:
   Min     1Q Median     3Q    Max 
-4.400 -1.762  0.025  1.587  4.200 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  37.6500     2.9961  12.566 1.20e-08 ***
x1            4.4250     0.3011  14.695 1.78e-09 ***
x2            4.3750     0.6733   6.498 2.01e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.693 on 13 degrees of freedom
Multiple R-squared:  0.9521,    Adjusted R-squared:  0.9447 
F-statistic: 129.1 on 2 and 13 DF,  p-value: 2.658e-09
Full <- lm(y ~ factor(x1)*factor(x2), data = p5Data) # full model: a separate mean for each distinct (x1, x2) combination
summary(Full)

Call:
lm(formula = y ~ factor(x1) * factor(x2), data = p5Data)

Residuals:
   Min     1Q Median     3Q    Max 
  -3.0   -1.5    0.0    1.5    3.0 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)                62.500      1.887  33.113 7.55e-10 ***
factor(x1)6                 9.000      2.669   3.372  0.00976 ** 
factor(x1)8                22.000      2.669   8.242 3.52e-05 ***
factor(x1)10               28.500      2.669  10.677 5.19e-06 ***
factor(x2)4                12.000      2.669   4.496  0.00201 ** 
factor(x1)6:factor(x2)4    -2.000      3.775  -0.530  0.61063    
factor(x1)8:factor(x2)4    -5.500      3.775  -1.457  0.18322    
factor(x1)10:factor(x2)4   -5.500      3.775  -1.457  0.18322    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.669 on 8 degrees of freedom
Multiple R-squared:  0.971, Adjusted R-squared:  0.9457 
F-statistic:  38.3 on 7 and 8 DF,  p-value: 1.56e-05

Testing the hypothesis:

\(H_0: \mbox{the reduced model is adequate (no lack of fit)}\) \(\quad\) \(H_a: \mbox{the reduced model is not adequate; the full model is needed}\)

anova (Reduced, Full)
Analysis of Variance Table

Model 1: y ~ x1 + x2
Model 2: y ~ factor(x1) * factor(x2)
  Res.Df  RSS Df Sum of Sq     F Pr(>F)
1     13 94.3                          
2      8 57.0  5      37.3 1.047  0.453

Since the p-value (0.453) is large, we fail to reject \(H_0\): there is no evidence of lack of fit for the first-order model.

F Test for Regression Relation, Problem 6.6 in the exercises

library(Matrix) # not strictly needed here; base R provides t(), %*%, and solve()

p5Data <- read.table("http://www.cnachtsheim-text.csom.umn.edu/Kutner/Chapter%20%206%20Data%20Sets/CH06PR05.txt", sep = "", header = FALSE)

colnames(p5Data) <- c("y", "x1", "x2")
#dim(p5Data)

######

n <- nrow(p5Data)


Y <- p5Data$y

Y <- as.matrix(Y) 
Y                       # This is your Y
      [,1]
 [1,]   64
 [2,]   73
 [3,]   61
 [4,]   76
 [5,]   72
 [6,]   80
 [7,]   71
 [8,]   83
 [9,]   83
[10,]   89
[11,]   86
[12,]   93
[13,]   88
[14,]   95
[15,]   94
[16,]  100
X <- p5Data[,-1]        # remove y to form X
X <- as.matrix(X)
X                       # Not quite the X we need
      x1 x2
 [1,]  4  2
 [2,]  4  4
 [3,]  4  2
 [4,]  4  4
 [5,]  6  2
 [6,]  6  4
 [7,]  6  2
 [8,]  6  4
 [9,]  8  2
[10,]  8  4
[11,]  8  2
[12,]  8  4
[13,] 10  2
[14,] 10  4
[15,] 10  2
[16,] 10  4
X <- cbind(rep(1,n), X) # Adding a column of ones to make the X matrix
X                       # This is your X
        x1 x2
 [1,] 1  4  2
 [2,] 1  4  4
 [3,] 1  4  2
 [4,] 1  4  4
 [5,] 1  6  2
 [6,] 1  6  4
 [7,] 1  6  2
 [8,] 1  6  4
 [9,] 1  8  2
[10,] 1  8  4
[11,] 1  8  2
[12,] 1  8  4
[13,] 1 10  2
[14,] 1 10  4
[15,] 1 10  2
[16,] 1 10  4
# Calculating SSTO 

YtY <- t(Y) %*% Y
YtY
       [,1]
[1,] 108896
J <- matrix(1, n, n)
#J

YtJY <- t(Y) %*% J %*% Y
YtJY
        [,1]
[1,] 1710864
SSTO <- YtY - (1/n) * YtJY # SSTO = Y'Y - (1/n) Y'JY
SSTO
     [,1]
[1,] 1967
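
As a quick check, SSTO is simply \((n-1)\) times the sample variance of \(y\):

(n - 1) * var(p5Data$y) # should reproduce SSTO = 1967
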
# Calculating SSE

b <- solve(t(X) %*% X) %*% t(X) %*% Y # b = (X'X)^{-1} X'Y, the least squares estimates

SSE <- YtY - t(b) %*% t(X) %*% Y # SSE = Y'Y - b'X'Y
SSE
     [,1]
[1,] 94.3
# Calculating SSR

SSR <- SSTO - SSE
SSR
       [,1]
[1,] 1872.7
# Calculating F statistic

p <- 3 # number of regression parameters (intercept plus two predictors)

MSR <- SSR/(p-1)
MSE <- SSE/(n-p)

Fstat <- MSR/MSE
Fstat  
         [,1]
[1,] 129.0832
# P-value
pf(Fstat, p-1, n-p, lower.tail = FALSE)
             [,1]
[1,] 2.658261e-09
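
Equivalently, compare \(F^*\) with the critical value \(F(0.95;\, p-1,\, n-p)\):

qf(0.95, p - 1, n - p) # about 3.81; F* = 129.08 is far beyond it, so reject H0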

Verifying the above results

p5model <- lm(y~x1+x2, data = p5Data)
summary(p5model) # Read the very last line of the output 

Call:
lm(formula = y ~ x1 + x2, data = p5Data)

Residuals:
   Min     1Q Median     3Q    Max 
-4.400 -1.762  0.025  1.587  4.200 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  37.6500     2.9961  12.566 1.20e-08 ***
x1            4.4250     0.3011  14.695 1.78e-09 ***
x2            4.3750     0.6733   6.498 2.01e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.693 on 13 degrees of freedom
Multiple R-squared:  0.9521,    Adjusted R-squared:  0.9447 
F-statistic: 129.1 on 2 and 13 DF,  p-value: 2.658e-09

Test for individual \(\beta\)’s

p5model <- lm(y~x1+x2, data = p5Data)
summary(p5model) 

Call:
lm(formula = y ~ x1 + x2, data = p5Data)

Residuals:
   Min     1Q Median     3Q    Max 
-4.400 -1.762  0.025  1.587  4.200 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  37.6500     2.9961  12.566 1.20e-08 ***
x1            4.4250     0.3011  14.695 1.78e-09 ***
x2            4.3750     0.6733   6.498 2.01e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.693 on 13 degrees of freedom
Multiple R-squared:  0.9521,    Adjusted R-squared:  0.9447 
F-statistic: 129.1 on 2 and 13 DF,  p-value: 2.658e-09
?confint


# Note: confint.lm() has no "method" argument; it is silently absorbed by "...",
# so both calls below return the same unadjusted intervals.
confint(p5model, method = "none")
                2.5 %    97.5 %
(Intercept) 31.177312 44.122688
x1           3.774470  5.075530
x2           2.920372  5.829628
confint(p5model, method = c("none","bonferroni"))
                2.5 %    97.5 %
(Intercept) 31.177312 44.122688
x1           3.774470  5.075530
x2           2.920372  5.829628
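
confint() itself provides no Bonferroni adjustment, so a minimal sketch of joint intervals is to shrink the per-interval error rate by hand: with \(g = 2\) simultaneous intervals for the two slopes, run each at level \(1 - \alpha/g\):

g <- 2 # number of simultaneous intervals
confint(p5model, parm = c("x1", "x2"), level = 1 - 0.05 / g)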

Interval Estimation of \(\beta_k\)

confint(p5model, level = .95)
                2.5 %    97.5 %
(Intercept) 31.177312 44.122688
x1           3.774470  5.075530
x2           2.920372  5.829628

Prediction of New Observation \(Y_{h(new)}\)

p5model <- lm(y~x1+x2, data = p5Data)

#prediction interval for the new observation x1 = 2, x2 = 4

xnew <- t(c(x1 = 2, x2 = 4))
xnew <- data.frame(xnew)
xnew
  x1 x2
1  2  4
predict(p5model, xnew, interval = "pred", level = .95) # prediction interval
  fit      lwr      upr
1  64 57.02385 70.97615
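
As a check, the same interval can be built from the matrix formulas of the Problem 6.6 section, using \(s^2\{pred\} = MSE\,(1 + X_h'(X'X)^{-1}X_h)\); this sketch assumes X, b, MSE, n, and p from that section are still in the workspace:

xh <- c(1, 2, 4) # intercept, x1 = 2, x2 = 4
fit_h <- drop(t(xh) %*% b) # Yhat_h = xh'b
s2pred <- drop(MSE) * (1 + drop(t(xh) %*% solve(t(X) %*% X) %*% xh))
fit_h + c(-1, 1) * qt(0.975, n - p) * sqrt(s2pred) # matches predict() above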

Prediction of Mean of m New Observations at \(X_h\)

p5model <- lm(y~x1+x2, data = p5Data)

# interval at the new point x1 = 2, x2 = 4

xnew <- matrix(c(2,4), nrow = 1, byrow = TRUE)
xnew <- data.frame(xnew)
xnew
  X1 X2
1  2  4
colnames(xnew) <- c("x1", "x2")
xnew
  x1 x2
1  2  4
predict(p5model, xnew, interval = "conf", level = .95) # confidence interval for the mean response
  fit      lwr      upr
1  64 60.15142 67.84858
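
Note that interval = "conf" gives a confidence interval for the mean response \(E\{Y_h\}\). For the mean of \(m\) new observations at \(X_h\), Kutner et al. widen this to \(s^2\{predmean\} = MSE\,(1/m + X_h'(X'X)^{-1}X_h)\). A sketch with an arbitrary \(m = 3\), again assuming X, b, MSE, n, and p from the matrix section are in the workspace:

m <- 3 # number of new observations at X_h (illustrative choice)
xh <- c(1, 2, 4)
fit_h <- drop(t(xh) %*% b)
s2predmean <- drop(MSE) * (1 / m + drop(t(xh) %*% solve(t(X) %*% X) %*% xh))
fit_h + c(-1, 1) * qt(0.975, n - p) * sqrt(s2predmean)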