NOTE: Use this script for Ch23, ONLY if the question uses raw data
AtlReg <- read.csv("AtlantaRegression.csv")
AtlReg
TO Distance Fare
1 Baltimore 568 219
2 Boston 933 222
3 Dallas 720 249
4 Denver 1190 308
5 Detroit 602 249
6 Kansas City 683 141
7 Las Vegas 1719 252
8 Miami 589 229
9 Memphis 327 183
10 Minneapolis 894 209
11 New Orleans 419 199
12 NYCity 749 248
13 Oklahoma City 749 301
14 Orlando 392 238
15 Philadelphia 657 205
16 St. Louis 461 232
17 Salt Lake City 1565 371
18 Seattle 2150 343
# Figure out what your x and y are
x <- AtlReg$Distance
y <- AtlReg$Fare
############################################################################
# Checking Assumption 1:
############################################################################
# Create a scatterplot of x vs. y
plot(x, y, xlab="Distance", ylab="Fare", main = "Scatterplot of Fare vs. Distance", col = "red")
# Find the correlation between x and y
r <- cor(x,y)
r
[1] 0.6942721
# Find the R^2 value
Rsquared <- cor(x,y)^2
Rsquared
[1] 0.4820137
# Everything looks good?
############################################################################
# Checking Assumption 2 and 3:
############################################################################
# We have to create the linear model to get the residual plot
# we use the lm function to create the linear model
model <- lm(y ~ x)
# It is time to examine the residual
plot(x, model$residuals) # residuals vs x - Residual plot
# a better residual plot?
library(MASS)
stdresvals <- stdres(model) # standardized residuals
plot(x, stdresvals) # This residual plot is easier to read than the previous
############################################################################
# Checking Assumption 4:
############################################################################
hist(model$residuals) # histogram of residuals - To check if residuals are normal
qqnorm(model$residuals) # QQplot- To check if residuals are normal
AtlReg <- read.csv("AtlantaRegression.csv")
AtlReg
TO Distance Fare
1 Baltimore 568 219
2 Boston 933 222
3 Dallas 720 249
4 Denver 1190 308
5 Detroit 602 249
6 Kansas City 683 141
7 Las Vegas 1719 252
8 Miami 589 229
9 Memphis 327 183
10 Minneapolis 894 209
11 New Orleans 419 199
12 NYCity 749 248
13 Oklahoma City 749 301
14 Orlando 392 238
15 Philadelphia 657 205
16 St. Louis 461 232
17 Salt Lake City 1565 371
18 Seattle 2150 343
# Figure out what your x and y are
x <- AtlReg$Distance
y <- AtlReg$Fare
model <- lm(y ~ x)
# plot of x vs. y
plot(x, y, xlab="Distance", ylab="Fare", main = "Scatterplot of Fare vs. Distance", col = "red")
#We can add the line to the plot as a visual aid
abline(lm(y~x))
summary(model)
Call:
lm(formula = y ~ x)
Residuals:
Min 1Q Median 3Q Max
-89.911 -22.881 1.304 22.978 70.747
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 177.21452 19.99315 8.864 1.43e-07 ***
x 0.07862 0.02037 3.859 0.00139 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 41.82 on 16 degrees of freedom
Multiple R-squared: 0.482, Adjusted R-squared: 0.4496
F-statistic: 14.89 on 1 and 16 DF, p-value: 0.00139
names(model)
[1] "coefficients" "residuals" "effects" "rank"
[5] "fitted.values" "assign" "qr" "df.residual"
[9] "xlevels" "call" "terms" "model"
model$coefficients #[1] is the intercept, [2] is the slope
(Intercept) x
177.21452251 0.07861903
model$fitted.values
1 2 3 4 5 6 7 8
221.8701 250.5661 233.8202 270.7712 224.5432 230.9113 312.3606 223.5211
9 10 11 12 13 14 15 16
202.9229 247.4999 210.1559 236.1002 236.1002 208.0332 228.8672 213.4579
17 18
300.2533 346.2454
model$residuals
1 2 3 4 5 6 7
-2.870130 -28.566075 15.179778 37.228835 24.456823 -89.911318 -60.360631
8 9 10 11 12 13 14
5.478870 -19.922944 -38.499933 -11.155895 11.899826 64.899826 29.966819
15 16 17 18
-23.867224 18.542106 70.746700 -3.245432
#--------------------------------------------------
# Optional: Creating an easily read table of obsreved values, fits and residuals:
numbers <- cbind(AtlReg[,3], model$fitted.values, model$residuals)
colnames(numbers) <- c("Observed Fare", "Fits", "Residuals")
rownames(numbers) <- AtlReg[,1]
output <- as.table(numbers)
output
Observed Fare Fits Residuals
Baltimore 219.000000 221.870130 -2.870130
Boston 222.000000 250.566075 -28.566075
Dallas 249.000000 233.820222 15.179778
Denver 308.000000 270.771165 37.228835
Detroit 249.000000 224.543177 24.456823
Kansas City 141.000000 230.911318 -89.911318
Las Vegas 252.000000 312.360631 -60.360631
Miami 229.000000 223.521130 5.478870
Memphis 183.000000 202.922944 -19.922944
Minneapolis 209.000000 247.499933 -38.499933
New Orleans 199.000000 210.155895 -11.155895
NYCity 248.000000 236.100174 11.899826
Oklahoma City 301.000000 236.100174 64.899826
Orlando 238.000000 208.033181 29.966819
Philadelphia 205.000000 228.867224 -23.867224
St. Louis 232.000000 213.457894 18.542106
Salt Lake City 371.000000 300.253300 70.746700
Seattle 343.000000 346.245432 -3.245432
#--------------End Table---------------------------
summary(model) # Use this to get the coefficients
Call:
lm(formula = y ~ x)
Residuals:
Min 1Q Median 3Q Max
-89.911 -22.881 1.304 22.978 70.747
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 177.21452 19.99315 8.864 1.43e-07 ***
x 0.07862 0.02037 3.859 0.00139 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 41.82 on 16 degrees of freedom
Multiple R-squared: 0.482, Adjusted R-squared: 0.4496
F-statistic: 14.89 on 1 and 16 DF, p-value: 0.00139
\[\hat{y} = 177.21452 + 0.07862 \times x\] \[\hat{Fare} = 177.21452 + 0.07862 \times Distance\] Back to top
model <- lm(y ~ x)
summary(model)
Call:
lm(formula = y ~ x)
Residuals:
Min 1Q Median 3Q Max
-89.911 -22.881 1.304 22.978 70.747
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 177.21452 19.99315 8.864 1.43e-07 ***
x 0.07862 0.02037 3.859 0.00139 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 41.82 on 16 degrees of freedom
Multiple R-squared: 0.482, Adjusted R-squared: 0.4496
F-statistic: 14.89 on 1 and 16 DF, p-value: 0.00139
model <- lm(y ~ x)
confint(model, level=.95) # 95% CI for betas
2.5 % 97.5 %
(Intercept) 134.83094693 219.598098
x 0.03542601 0.121812
model <- lm(y ~ x)
#prediction interval for the new observation 700
predict(model, data.frame(x=700),interval="pred",level=.95) #prediction interval
fit lwr upr
1 232.2478 140.9256 323.57
model <- lm(y ~ x)
#confidence interval for the new observation 700
predict(model, data.frame(x=700),interval="conf",level=.95) #confidence interval
fit lwr upr
1 232.2478 210.323 254.1727