Variable descriptions were obtained from King County, Department of Assessments. All feature engineering should be done in the first code chunks of your document.
housedata
) and Data to send predictions (housedataT
)# Data Set
housedata <- read.csv("https://lasanthi-asu.github.io/STT3851ClassRepo/Rmarkdown/Data/housedata.csv",
colClasses = c(id = "character", date = "character",
yr_built = "character", zipcode = "factor", grade = "factor"))
housedata$date <- as.Date(housedata$date, "%Y%m%d")
housedata$waterfront <- factor(housedata$waterfront, labels = c("No", "Yes"))
housedata$condition <- factor(housedata$condition, labels = c("poor", "fair", "average", "good", "very good"))
housedata$yr_renovated <- ifelse(housedata$yr_renovated == 0, housedata$yr_built, housedata$yr_renovated)
housedata$yr_built <- as.Date(ISOdate(housedata$yr_built, 9, 1)) # Complete Year, Sept 1
housedata$yr_renovated <- as.Date(ISOdate(housedata$yr_renovated, 9, 1)) # Last renovated Year, Sept 1
housedata <- housedata[, -1]
attach(housedata)
# Test Data Set (only use this to send predictions)
#### Perform same steps with test set
housedataT <- read.csv("https://lasanthi-asu.github.io/STT3851ClassRepo/Rmarkdown/Data/housedataTEST.csv",
colClasses = c(id = "character", date = "character",
yr_built = "character", zipcode = "factor", grade = "factor"))
housedataT$date <- as.Date(housedataT$date, "%Y%m%d")
housedataT$waterfront <- factor(housedataT$waterfront, labels = c("No", "Yes"))
housedataT$condition <- factor(housedataT$condition, labels = c("poor", "fair", "average", "good", "very good"))
housedataT$yr_renovated <- ifelse(housedataT$yr_renovated == 0, housedataT$yr_built, housedataT$yr_renovated)
housedataT$yr_built <- as.Date(ISOdate(housedataT$yr_built, 9, 1)) # Complete Year, Sept 1
housedataT$yr_renovated <- as.Date(ISOdate(housedataT$yr_renovated, 9, 1)) # Last renovated Year, Sept 1
housedataT <- housedataT[, -1]
attach(housedataT)
library(DT)
datatable(housedata[, ], rownames = FALSE)
Here is an example of a correlation matrix with some of the predictors
# Correlation matrix
cor(housedata1)
price bedrooms bathrooms sqft_living sqft_lot
price 1.00000000 0.31284286 0.52334477 0.70291635 0.088238107
bedrooms 0.31284286 1.00000000 0.52923162 0.59105983 0.030179053
bathrooms 0.52334477 0.52923162 1.00000000 0.75455302 0.082139581
sqft_living 0.70291635 0.59105983 0.75455302 1.00000000 0.166967283
sqft_lot 0.08823811 0.03017905 0.08213958 0.16696728 1.000000000
floors 0.25235756 0.18028523 0.50066694 0.35267511 -0.002951851
view 0.39102268 0.07884375 0.18312596 0.27981310 0.069978368
sqft_above 0.60527752 0.49174312 0.68455295 0.87631944 0.176005462
sqft_basement 0.33122956 0.31056084 0.29077234 0.44288611 0.018691884
lat 0.30948443 -0.01002422 0.02676418 0.05693083 -0.085417697
long 0.02131272 0.13604729 0.22151426 0.23737409 0.225347502
sqft_living15 0.58348082 0.40306677 0.56816564 0.75627424 0.147707827
sqft_lot15 0.08080643 0.02784234 0.08467962 0.17830644 0.727774079
floors view sqft_above sqft_basement lat
price 0.252357558 0.391022681 0.605277522 0.33122956 0.309484427
bedrooms 0.180285231 0.078843754 0.491743119 0.31056084 -0.010024220
bathrooms 0.500666944 0.183125959 0.684552945 0.29077234 0.026764178
sqft_living 0.352675112 0.279813103 0.876319445 0.44288611 0.056930825
sqft_lot -0.002951851 0.069978368 0.176005462 0.01869188 -0.085417697
floors 1.000000000 0.026258735 0.522710921 -0.24145116 0.049952734
view 0.026258735 1.000000000 0.163954243 0.27514730 0.008885553
sqft_above 0.522710921 0.163954243 1.000000000 -0.04379916 0.001422037
sqft_basement -0.241451164 0.275147303 -0.043799158 1.00000000 0.115419784
lat 0.049952734 0.008885553 0.001422037 0.11541978 1.000000000
long 0.125918561 -0.076033506 0.341128260 -0.14261452 -0.133311159
sqft_living15 0.280417017 0.279937620 0.732554007 0.20500421 0.049938206
sqft_lot15 -0.007389463 0.068809179 0.188503973 0.01894595 -0.089826111
long sqft_living15 sqft_lot15
price 0.02131272 0.58348082 0.080806426
bedrooms 0.13604729 0.40306677 0.027842339
bathrooms 0.22151426 0.56816564 0.084679619
sqft_living 0.23737409 0.75627424 0.178306444
sqft_lot 0.22534750 0.14770783 0.727774079
floors 0.12591856 0.28041702 -0.007389463
view -0.07603351 0.27993762 0.068809179
sqft_above 0.34112826 0.73255401 0.188503973
sqft_basement -0.14261452 0.20500421 0.018945951
lat -0.13331116 0.04993821 -0.089826111
long 1.00000000 0.33399885 0.253890095
sqft_living15 0.33399885 1.00000000 0.184561578
sqft_lot15 0.25389009 0.18456158 1.000000000
# Corrplot
library(corrplot)
corrplot(cor(housedata1))
Consider predicting the price (price
) of a house based on a certain feature (sqft_living
). Start by graphing the relationship.
library(ggplot2)
p1 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
geom_point() +
theme_bw()
p1
Overplotting is problematic. What should we do?
alpha
).alpha
(optional)p2 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
geom_point(alpha = 0.05, color = "blue") +
theme_bw()
p2
p3 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
stat_bin2d(bins = 50) +
theme_bw()
p3
p4 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
stat_bin2d(bins = 50) +
scale_fill_gradient(low = "lightblue", high = "red",
limits = c(0, 1000)) +
theme_bw()
p4
p5 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
stat_binhex(bins = 50) +
scale_fill_gradient(low = "lightblue", high = "red",
limits = c(0, 800), breaks = seq(0, 800, by = 200)) +
theme_bw()
p5
**Note* For both stat_bin2d
and stat_binhex
, if you manually specify the range, and there is a bin that falls outside that range because it has too many or too few points, that bin will show up as grey rather than the color at the high or low end of the range. Observe the gray hexagons in the lower left corner of the above graph.
p6 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) +
stat_binhex(bins = 50) +
scale_fill_gradient(low = "lightblue", high = "red",
limits = c(0, 1000), breaks = seq(0, 1000, by = 200)) +
theme_bw()
p6
Use a simple linear model to predict the price of a house with 2,500 \(\text{ft}^2\).
p6 + geom_smooth(method = "lm") +
geom_vline(xintercept = 2500,linetype = "dashed", color = "red") +
geom_hline(yintercept = predict(slm, newdata = data.frame(sqft_living = 2500)), linetype = "dashed", color = "red") +
labs(x = "Living Space (square feet)", y = "Price ($)")
mod.zip
below).mod.zip <- lm(price ~ 1, data = housedata)
summary(mod.zip)
Call:
lm(formula = price ~ 1, data = housedata)
Residuals:
Min 1Q Median 3Q Max
-464367 -219367 -89367 100633 7160633
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 539367 2804 192.4 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 369700 on 17383 degrees of freedom
sqft_basement
. (named mod.all
below).mod.all <- lm(price ~ . - sqft_basement, data = housedata)
summary(mod.all)
Call:
lm(formula = price ~ . - sqft_basement, data = housedata)
Residuals:
Min 1Q Median 3Q Max
-1539648 -60120 2700 56110 3478116
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.135e+07 6.410e+06 -6.450 1.15e-10 ***
date 1.145e+02 1.016e+01 11.269 < 2e-16 ***
bedrooms -1.431e+04 1.708e+03 -8.377 < 2e-16 ***
bathrooms 2.394e+04 2.764e+03 8.663 < 2e-16 ***
sqft_living 1.170e+02 3.714e+00 31.495 < 2e-16 ***
sqft_lot 2.574e-01 4.059e-02 6.342 2.32e-10 ***
floors -2.963e+04 3.344e+03 -8.862 < 2e-16 ***
waterfrontYes 6.641e+05 1.465e+04 45.331 < 2e-16 ***
view 4.953e+04 1.828e+03 27.102 < 2e-16 ***
conditionfair 9.809e+04 3.414e+04 2.873 0.004075 **
conditionaverage 1.042e+05 3.174e+04 3.282 0.001032 **
conditiongood 1.318e+05 3.175e+04 4.151 3.32e-05 ***
conditionvery good 1.814e+05 3.194e+04 5.677 1.39e-08 ***
grade10 1.173e+05 1.549e+05 0.757 0.448960
grade11 3.143e+05 1.552e+05 2.025 0.042869 *
grade12 7.525e+05 1.561e+05 4.821 1.44e-06 ***
grade13 1.661e+06 1.615e+05 10.287 < 2e-16 ***
grade3 -7.935e+02 1.776e+05 -0.004 0.996434
grade4 -9.356e+04 1.588e+05 -0.589 0.555737
grade5 -1.035e+05 1.548e+05 -0.668 0.503898
grade6 -1.122e+05 1.547e+05 -0.725 0.468200
grade7 -1.091e+05 1.547e+05 -0.705 0.480711
grade8 -8.819e+04 1.548e+05 -0.570 0.568766
grade9 -1.235e+04 1.548e+05 -0.080 0.936432
sqft_above 5.176e+01 3.858e+00 13.416 < 2e-16 ***
yr_built -3.312e+00 2.798e-01 -11.838 < 2e-16 ***
yr_renovated 2.591e+00 2.847e-01 9.103 < 2e-16 ***
zipcode98002 1.909e+04 1.516e+04 1.259 0.207972
zipcode98003 -1.207e+04 1.341e+04 -0.900 0.368288
zipcode98004 7.176e+05 2.455e+04 29.229 < 2e-16 ***
zipcode98005 2.534e+05 2.626e+04 9.650 < 2e-16 ***
zipcode98006 2.205e+05 2.140e+04 10.305 < 2e-16 ***
zipcode98007 2.136e+05 2.691e+04 7.938 2.18e-15 ***
zipcode98008 2.371e+05 2.569e+04 9.230 < 2e-16 ***
zipcode98010 1.108e+05 2.280e+04 4.860 1.18e-06 ***
zipcode98011 5.783e+04 3.358e+04 1.722 0.085099 .
zipcode98014 1.005e+05 3.709e+04 2.711 0.006722 **
zipcode98019 7.545e+04 3.613e+04 2.088 0.036804 *
zipcode98022 6.976e+04 2.010e+04 3.470 0.000521 ***
zipcode98023 -5.426e+04 1.240e+04 -4.376 1.21e-05 ***
zipcode98024 1.806e+05 3.184e+04 5.672 1.43e-08 ***
zipcode98027 1.737e+05 2.203e+04 7.885 3.33e-15 ***
zipcode98028 3.778e+04 3.257e+04 1.160 0.246053
zipcode98029 2.355e+05 2.510e+04 9.383 < 2e-16 ***
zipcode98030 1.126e+04 1.465e+04 0.769 0.441829
zipcode98031 1.564e+04 1.537e+04 1.018 0.308915
zipcode98032 -1.284e+04 1.792e+04 -0.717 0.473687
zipcode98033 2.958e+05 2.789e+04 10.608 < 2e-16 ***
zipcode98034 1.264e+05 2.990e+04 4.228 2.37e-05 ***
zipcode98038 7.791e+04 1.655e+04 4.708 2.53e-06 ***
zipcode98039 1.101e+06 3.245e+04 33.942 < 2e-16 ***
zipcode98040 4.680e+05 2.194e+04 21.327 < 2e-16 ***
zipcode98042 2.810e+04 1.418e+04 1.982 0.047456 *
zipcode98045 1.814e+05 3.066e+04 5.917 3.33e-09 ***
zipcode98052 1.967e+05 2.842e+04 6.922 4.62e-12 ***
zipcode98053 1.808e+05 3.037e+04 5.954 2.67e-09 ***
zipcode98055 2.146e+04 1.718e+04 1.249 0.211728
zipcode98056 6.367e+04 1.868e+04 3.409 0.000653 ***
zipcode98058 3.359e+04 1.620e+04 2.073 0.038186 *
zipcode98059 6.952e+04 1.837e+04 3.784 0.000155 ***
zipcode98065 1.346e+05 2.815e+04 4.782 1.75e-06 ***
zipcode98070 -7.192e+04 2.176e+04 -3.305 0.000950 ***
zipcode98072 9.011e+04 3.321e+04 2.713 0.006671 **
zipcode98074 1.588e+05 2.686e+04 5.912 3.45e-09 ***
zipcode98075 1.598e+05 2.576e+04 6.204 5.64e-10 ***
zipcode98077 6.050e+04 3.468e+04 1.745 0.081073 .
zipcode98092 -1.196e+03 1.326e+04 -0.090 0.928155
zipcode98102 4.390e+05 2.935e+04 14.959 < 2e-16 ***
zipcode98103 2.424e+05 2.709e+04 8.948 < 2e-16 ***
zipcode98105 3.997e+05 2.773e+04 14.414 < 2e-16 ***
zipcode98106 5.587e+04 2.006e+04 2.785 0.005359 **
zipcode98107 2.487e+05 2.780e+04 8.948 < 2e-16 ***
zipcode98108 6.070e+04 2.237e+04 2.714 0.006652 **
zipcode98109 4.148e+05 2.861e+04 14.498 < 2e-16 ***
zipcode98112 5.380e+05 2.544e+04 21.151 < 2e-16 ***
zipcode98115 2.480e+05 2.744e+04 9.040 < 2e-16 ***
zipcode98116 2.080e+05 2.232e+04 9.320 < 2e-16 ***
zipcode98117 2.180e+05 2.782e+04 7.834 4.99e-15 ***
zipcode98118 1.124e+05 1.948e+04 5.771 8.02e-09 ***
zipcode98119 4.068e+05 2.701e+04 15.061 < 2e-16 ***
zipcode98122 2.721e+05 2.421e+04 11.235 < 2e-16 ***
zipcode98125 1.097e+05 2.968e+04 3.695 0.000220 ***
zipcode98126 1.206e+05 2.068e+04 5.831 5.61e-09 ***
zipcode98133 5.335e+04 3.061e+04 1.743 0.081338 .
zipcode98136 1.768e+05 2.100e+04 8.420 < 2e-16 ***
zipcode98144 2.204e+05 2.244e+04 9.823 < 2e-16 ***
zipcode98146 3.043e+04 1.876e+04 1.622 0.104828
zipcode98148 3.650e+04 2.635e+04 1.385 0.165989
zipcode98155 4.353e+04 3.191e+04 1.364 0.172629
zipcode98166 5.831e+03 1.725e+04 0.338 0.735412
zipcode98168 9.246e+03 1.825e+04 0.507 0.612501
zipcode98177 1.079e+05 3.190e+04 3.382 0.000720 ***
zipcode98178 -5.908e+03 1.861e+04 -0.317 0.750888
zipcode98188 -4.184e+03 1.898e+04 -0.220 0.825497
zipcode98198 -3.087e+04 1.448e+04 -2.132 0.033009 *
zipcode98199 2.724e+05 2.639e+04 10.323 < 2e-16 ***
lat 2.233e+05 6.632e+04 3.367 0.000761 ***
long -2.361e+05 4.727e+04 -4.995 5.95e-07 ***
sqft_living15 1.521e+01 3.029e+00 5.023 5.15e-07 ***
sqft_lot15 -1.159e-01 6.429e-02 -1.803 0.071344 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 150200 on 17284 degrees of freedom
Multiple R-squared: 0.8359, Adjusted R-squared: 0.835
F-statistic: 889.4 on 99 and 17284 DF, p-value: < 2.2e-16
anova(mod.all)
Analysis of Variance Table
Response: price
Df Sum Sq Mean Sq F value Pr(>F)
date 1 2.9956e+10 2.9956e+10 1.3282 0.2491485
bedrooms 1 2.3249e+14 2.3249e+14 10307.9306 < 2.2e-16 ***
bathrooms 1 4.2285e+14 4.2285e+14 18748.2537 < 2.2e-16 ***
sqft_living 1 5.5816e+14 5.5816e+14 24747.5215 < 2.2e-16 ***
sqft_lot 1 3.8694e+12 3.8694e+12 171.5587 < 2.2e-16 ***
floors 1 9.0483e+10 9.0483e+10 4.0118 0.0451986 *
waterfront 1 8.3044e+13 8.3044e+13 3681.9764 < 2.2e-16 ***
view 1 3.8892e+13 3.8892e+13 1724.3793 < 2.2e-16 ***
condition 4 1.6341e+13 4.0854e+12 181.1348 < 2.2e-16 ***
grade 11 1.7804e+14 1.6185e+13 717.6222 < 2.2e-16 ***
sqft_above 1 7.0222e+12 7.0222e+12 311.3450 < 2.2e-16 ***
yr_built 1 8.4946e+13 8.4946e+13 3766.2921 < 2.2e-16 ***
yr_renovated 1 4.7609e+11 4.7609e+11 21.1086 4.370e-06 ***
zipcode 69 3.5814e+14 5.1905e+12 230.1340 < 2.2e-16 ***
lat 1 3.1253e+11 3.1253e+11 13.8567 0.0001979 ***
long 1 6.0257e+11 6.0257e+11 26.7164 2.382e-07 ***
sqft_living15 1 5.4648e+11 5.4648e+11 24.2294 8.630e-07 ***
sqft_lot15 1 7.3351e+10 7.3351e+10 3.2522 0.0713445 .
Residuals 17284 3.8983e+14 2.2554e+10
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Your goal is to create a model with as small a test error as possible.
Submit a vector named (Yourlastname_Yourfirstname
) with the predicted house prices for your model using the data frame housedataT
.
Suppose your final model is mod.all
.
dim(housedataT)
[1] 4229 19
Yourlastname_Yourfirstname <- predict(mod.all, newdata = housedataT)
#Yourlastname_Yourfirstname
write.csv()
to create the vector Yourlastname_Yourfirstname.csv
write.csv(Yourlastname_Yourfirstname, file = "Yourlastname_Yourfirstname.csv") # this part will due May 3rd
price
values for the data set housedataT
.[1] 105493.4
Note: When we create lasso or ridge models, we use the function model.matrix
to convert data into a model matrix. This function turns all of the factor variables into dummy variables. Because of this, for this data set, once the models are created, we will have 100 predictors (we initially had only 19). But the issue is that the housedataT
data set does not have 100 predictors (only 19). The fix is very simple: we convert housedataT
into a model matrix as bellow:
housedataT_new <- model.matrix(~., housedataT)[,-1] # Use this to send me predictions for only lasso or ridge like models
#Assume you created a Lasso model and named it lasso_model
Yourlastname_Yourfirstname <- predict(lasso_model ,s = bestlam, newx = housedataT_new)