NOTE: Use this script for Ch23, ONLY if the question uses raw data

1 Checking assumptions

AtlReg <- read.csv("AtlantaRegression.csv")
AtlReg

               TO Distance Fare
1       Baltimore      568  219
2          Boston      933  222
3          Dallas      720  249
4          Denver     1190  308
5         Detroit      602  249
6     Kansas City      683  141
7       Las Vegas     1719  252
8           Miami      589  229
9         Memphis      327  183
10    Minneapolis      894  209
11    New Orleans      419  199
12         NYCity      749  248
13  Oklahoma City      749  301
14        Orlando      392  238
15   Philadelphia      657  205
16      St. Louis      461  232
17 Salt Lake City     1565  371
18        Seattle     2150  343

# Figure out what your x and y are
x <- AtlReg$Distance
y <- AtlReg$Fare

############################################################################
# Checking Assumption 1:
############################################################################

# Create a scatterplot of x vs. y
plot(x, y, xlab="Distance", ylab="Fare", main = "Scatterplot of Fare vs. Distance", col = "red")

# Find the correlation between x and y
r <- cor(x,y)
r

[1] 0.6942721

# Find the R^2 value
Rsquared <- cor(x,y)^2
Rsquared

[1] 0.4820137

# Everything looks good? 

############################################################################
# Checking Assumption 2 and 3:
############################################################################

# We have to create the linear model to get the residual plot
# we use the lm function to create the linear model

model <- lm(y ~ x) 

# It is time to examine the residual

plot(x, model$residuals) # residuals vs x - Residual plot

# a better residual plot?
library(MASS)
stdresvals <- stdres(model) # standardized residuals
plot(x, stdresvals) # This residual plot is easier to read than the previous

############################################################################
# Checking Assumption 4:
############################################################################

hist(model$residuals) # histogram of residuals - To check if residuals are normal

qqnorm(model$residuals) # QQplot- To check if residuals are normal

2 More about the model

AtlReg <- read.csv("AtlantaRegression.csv")
AtlReg

               TO Distance Fare
1       Baltimore      568  219
2          Boston      933  222
3          Dallas      720  249
4          Denver     1190  308
5         Detroit      602  249
6     Kansas City      683  141
7       Las Vegas     1719  252
8           Miami      589  229
9         Memphis      327  183
10    Minneapolis      894  209
11    New Orleans      419  199
12         NYCity      749  248
13  Oklahoma City      749  301
14        Orlando      392  238
15   Philadelphia      657  205
16      St. Louis      461  232
17 Salt Lake City     1565  371
18        Seattle     2150  343

# Figure out what your x and y are
x <- AtlReg$Distance
y <- AtlReg$Fare

model <- lm(y ~ x)

# plot of x vs. y 
plot(x, y, xlab="Distance", ylab="Fare", main = "Scatterplot of Fare vs. Distance", col = "red")

#We can add the line to the plot as a visual aid
abline(lm(y~x))

summary(model)


Call:
lm(formula = y ~ x)

Residuals:
    Min      1Q  Median      3Q     Max 
-89.911 -22.881   1.304  22.978  70.747 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 177.21452   19.99315   8.864 1.43e-07 ***
x             0.07862    0.02037   3.859  0.00139 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 41.82 on 16 degrees of freedom
Multiple R-squared:  0.482, Adjusted R-squared:  0.4496 
F-statistic: 14.89 on 1 and 16 DF,  p-value: 0.00139

names(model)

 [1] "coefficients"  "residuals"     "effects"       "rank"         
 [5] "fitted.values" "assign"        "qr"            "df.residual"  
 [9] "xlevels"       "call"          "terms"         "model"

model$coefficients  #[1] is the intercept, [2] is the slope

 (Intercept)            x 
177.21452251   0.07861903

model$fitted.values

       1        2        3        4        5        6        7        8 
221.8701 250.5661 233.8202 270.7712 224.5432 230.9113 312.3606 223.5211 
       9       10       11       12       13       14       15       16 
202.9229 247.4999 210.1559 236.1002 236.1002 208.0332 228.8672 213.4579 
      17       18 
300.2533 346.2454

model$residuals

         1          2          3          4          5          6          7 
 -2.870130 -28.566075  15.179778  37.228835  24.456823 -89.911318 -60.360631 
         8          9         10         11         12         13         14 
  5.478870 -19.922944 -38.499933 -11.155895  11.899826  64.899826  29.966819 
        15         16         17         18 
-23.867224  18.542106  70.746700  -3.245432

#--------------------------------------------------
# Optional: Creating an easily read table of obsreved values, fits and residuals:

numbers <- cbind(AtlReg[,3], model$fitted.values, model$residuals)
colnames(numbers) <- c("Observed Fare", "Fits", "Residuals")
rownames(numbers) <- AtlReg[,1]

output <- as.table(numbers)
output

               Observed Fare       Fits  Residuals
Baltimore         219.000000 221.870130  -2.870130
Boston            222.000000 250.566075 -28.566075
Dallas            249.000000 233.820222  15.179778
Denver            308.000000 270.771165  37.228835
Detroit           249.000000 224.543177  24.456823
Kansas City       141.000000 230.911318 -89.911318
Las Vegas         252.000000 312.360631 -60.360631
Miami             229.000000 223.521130   5.478870
Memphis           183.000000 202.922944 -19.922944
Minneapolis       209.000000 247.499933 -38.499933
New Orleans       199.000000 210.155895 -11.155895
NYCity            248.000000 236.100174  11.899826
Oklahoma City     301.000000 236.100174  64.899826
Orlando           238.000000 208.033181  29.966819
Philadelphia      205.000000 228.867224 -23.867224
St. Louis         232.000000 213.457894  18.542106
Salt Lake City    371.000000 300.253300  70.746700
Seattle           343.000000 346.245432  -3.245432

#--------------End Table---------------------------

3 Write out the model equation

summary(model) # Use this to get the coefficients


Call:
lm(formula = y ~ x)

Residuals:
    Min      1Q  Median      3Q     Max 
-89.911 -22.881   1.304  22.978  70.747 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 177.21452   19.99315   8.864 1.43e-07 ***
x             0.07862    0.02037   3.859  0.00139 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 41.82 on 16 degrees of freedom
Multiple R-squared:  0.482, Adjusted R-squared:  0.4496 
F-statistic: 14.89 on 1 and 16 DF,  p-value: 0.00139

\[\hat{y} = 177.21452 + 0.07862 \times x\] \[\hat{Fare} = 177.21452 + 0.07862 \times Distance\] Back to top

4 Hypothesis test for \(\beta\)’s

model <- lm(y ~ x)
summary(model)


Call:
lm(formula = y ~ x)

Residuals:
    Min      1Q  Median      3Q     Max 
-89.911 -22.881   1.304  22.978  70.747 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 177.21452   19.99315   8.864 1.43e-07 ***
x             0.07862    0.02037   3.859  0.00139 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 41.82 on 16 degrees of freedom
Multiple R-squared:  0.482, Adjusted R-squared:  0.4496 
F-statistic: 14.89 on 1 and 16 DF,  p-value: 0.00139

5 CI for \(\beta\)’s

model <- lm(y ~ x)

confint(model, level=.95) # 95% CI for betas

                   2.5 %     97.5 %
(Intercept) 134.83094693 219.598098
x             0.03542601   0.121812

6 PI for a new observation

model <- lm(y ~ x)

#prediction interval for the new observation 700
predict(model, data.frame(x=700),interval="pred",level=.95) #prediction interval

       fit      lwr    upr
1 232.2478 140.9256 323.57

7 CI for the mean value for a new observation

model <- lm(y ~ x)

#confidence interval for the new observation 700
predict(model, data.frame(x=700),interval="conf",level=.95) #confidence interval

       fit     lwr      upr
1 232.2478 210.323 254.1727

R codes for Ch 23 (with raw data)

Dr. Hasthika Rupasinghe

Oct 27, 2021 at 09:26:36 PM

1 Checking assumptions

2 More about the model

3 Write out the model equation

4 Hypothesis test for \(\beta\)’s

5 CI for \(\beta\)’s

6 PI for a new observation

7 CI for the mean value for a new observation