1 Variable descriptions

Variable descriptions were obtained from King County, Department of Assessments. All feature engineering should be done in the first code chunks of your document.

2 Data and feature engineering

2.1 Data Set (housedata) and Data to send predictions (housedataT)

# Data Set

housedata <- read.csv("https://lasanthi-asu.github.io/STT3851ClassRepo/Rmarkdown/Data/housedata.csv", 
                      colClasses = c(id = "character", date = "character", 
                                     yr_built = "character", zipcode = "factor", grade = "factor"))

housedata$date <- as.Date(housedata$date, "%Y%m%d")
housedata$waterfront <- factor(housedata$waterfront, labels = c("No", "Yes"))
housedata$condition <- factor(housedata$condition, labels = c("poor", "fair", "average", "good", "very good"))
housedata$yr_renovated <- ifelse(housedata$yr_renovated == 0, housedata$yr_built, housedata$yr_renovated)
housedata$yr_built <- as.Date(ISOdate(housedata$yr_built, 9, 1))  # Complete Year, Sept 1
housedata$yr_renovated <- as.Date(ISOdate(housedata$yr_renovated, 9, 1))  # Last renovated Year, Sept 1
housedata <- housedata[, -1]
attach(housedata)

# Test Data Set (only use this to send predictions)

#### Perform same steps with test set

housedataT <- read.csv("https://lasanthi-asu.github.io/STT3851ClassRepo/Rmarkdown/Data/housedataTEST.csv", 
                      colClasses = c(id = "character", date = "character", 
                                     yr_built = "character", zipcode = "factor", grade = "factor"))

housedataT$date <- as.Date(housedataT$date, "%Y%m%d")
housedataT$waterfront <- factor(housedataT$waterfront, labels = c("No", "Yes"))
housedataT$condition <- factor(housedataT$condition, labels = c("poor", "fair", "average", "good", "very good"))
housedataT$yr_renovated <- ifelse(housedataT$yr_renovated == 0, housedataT$yr_built, housedataT$yr_renovated)
housedataT$yr_built <- as.Date(ISOdate(housedataT$yr_built, 9, 1))  # Complete Year, Sept 1
housedataT$yr_renovated <- as.Date(ISOdate(housedataT$yr_renovated, 9, 1))  # Last renovated Year, Sept 1
housedataT <- housedataT[, -1]
attach(housedataT)
library(DT)
datatable(housedata[, ], rownames = FALSE)

3 Where to start

4 Correlation Matrix

Here is an example of a correlation matrix with some of the predictors

# Correlation matrix
cor(housedata1)
                   price    bedrooms  bathrooms sqft_living     sqft_lot
price         1.00000000  0.31284286 0.52334477  0.70291635  0.088238107
bedrooms      0.31284286  1.00000000 0.52923162  0.59105983  0.030179053
bathrooms     0.52334477  0.52923162 1.00000000  0.75455302  0.082139581
sqft_living   0.70291635  0.59105983 0.75455302  1.00000000  0.166967283
sqft_lot      0.08823811  0.03017905 0.08213958  0.16696728  1.000000000
floors        0.25235756  0.18028523 0.50066694  0.35267511 -0.002951851
view          0.39102268  0.07884375 0.18312596  0.27981310  0.069978368
sqft_above    0.60527752  0.49174312 0.68455295  0.87631944  0.176005462
sqft_basement 0.33122956  0.31056084 0.29077234  0.44288611  0.018691884
lat           0.30948443 -0.01002422 0.02676418  0.05693083 -0.085417697
long          0.02131272  0.13604729 0.22151426  0.23737409  0.225347502
sqft_living15 0.58348082  0.40306677 0.56816564  0.75627424  0.147707827
sqft_lot15    0.08080643  0.02784234 0.08467962  0.17830644  0.727774079
                    floors         view   sqft_above sqft_basement          lat
price          0.252357558  0.391022681  0.605277522    0.33122956  0.309484427
bedrooms       0.180285231  0.078843754  0.491743119    0.31056084 -0.010024220
bathrooms      0.500666944  0.183125959  0.684552945    0.29077234  0.026764178
sqft_living    0.352675112  0.279813103  0.876319445    0.44288611  0.056930825
sqft_lot      -0.002951851  0.069978368  0.176005462    0.01869188 -0.085417697
floors         1.000000000  0.026258735  0.522710921   -0.24145116  0.049952734
view           0.026258735  1.000000000  0.163954243    0.27514730  0.008885553
sqft_above     0.522710921  0.163954243  1.000000000   -0.04379916  0.001422037
sqft_basement -0.241451164  0.275147303 -0.043799158    1.00000000  0.115419784
lat            0.049952734  0.008885553  0.001422037    0.11541978  1.000000000
long           0.125918561 -0.076033506  0.341128260   -0.14261452 -0.133311159
sqft_living15  0.280417017  0.279937620  0.732554007    0.20500421  0.049938206
sqft_lot15    -0.007389463  0.068809179  0.188503973    0.01894595 -0.089826111
                     long sqft_living15   sqft_lot15
price          0.02131272    0.58348082  0.080806426
bedrooms       0.13604729    0.40306677  0.027842339
bathrooms      0.22151426    0.56816564  0.084679619
sqft_living    0.23737409    0.75627424  0.178306444
sqft_lot       0.22534750    0.14770783  0.727774079
floors         0.12591856    0.28041702 -0.007389463
view          -0.07603351    0.27993762  0.068809179
sqft_above     0.34112826    0.73255401  0.188503973
sqft_basement -0.14261452    0.20500421  0.018945951
lat           -0.13331116    0.04993821 -0.089826111
long           1.00000000    0.33399885  0.253890095
sqft_living15  0.33399885    1.00000000  0.184561578
sqft_lot15     0.25389009    0.18456158  1.000000000
# Corrplot
library(corrplot)
corrplot(cor(housedata1))

Consider predicting the price (price) of a house based on a certain feature (sqft_living). Start by graphing the relationship.

library(ggplot2)
p1 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
  geom_point() + 
  theme_bw()
p1

4.1 Making Scatterplots more useful (optional)

Overplotting is problematic. What should we do?

  • Consider making the plotting shape smaller.
  • Make the points semitransparent (alpha).
  • Bin the data into rectangles.
  • Bin the data into hexagons.

4.1.1 Using alpha (optional)

p2 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
        geom_point(alpha = 0.05, color = "blue") + 
        theme_bw() 
p2

4.1.2 Using rectangles (optional)

p3 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
        stat_bin2d(bins = 50) + 
        theme_bw()
p3

p4 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
        stat_bin2d(bins = 50) + 
        scale_fill_gradient(low = "lightblue", high = "red", 
                            limits = c(0, 1000)) +
        theme_bw()
p4

4.1.3 Using hexagons (optional)

p5 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
        stat_binhex(bins = 50) + 
        scale_fill_gradient(low = "lightblue", high = "red", 
                            limits = c(0, 800), breaks = seq(0, 800, by = 200)) +
        theme_bw()
p5

**Note* For both stat_bin2d and stat_binhex, if you manually specify the range, and there is a bin that falls outside that range because it has too many or too few points, that bin will show up as grey rather than the color at the high or low end of the range. Observe the gray hexagons in the lower left corner of the above graph.

p6 <- ggplot(data = housedata, aes(x = sqft_living, y = price)) + 
        stat_binhex(bins = 50) + 
        scale_fill_gradient(low = "lightblue", high = "red", 
                            limits = c(0, 1000), breaks = seq(0, 1000, by = 200)) +
        theme_bw()
p6

4.1.4 What features might be visible in a scatterplot? (optional)

  • Causal relationships (linear and nonlinear) - One variable may have a direct influence on another in some way.
  • Associations - Variables may be associated with on another without being directly causally related.
  • Outliers of groups of outliers - Cases can be outliers in two dimensions without being outliers in either dimension separately.
  • Clusters - Sometimes there are groups of cases which are separate from the rest of the data.
  • Gaps - Occasionally, particular combinations of values do not occur.
  • Barriers - Some combinations of values may not be possible
  • Conditional relationship - Sometimes the relationship between two variables is better summarized by a conditional description that by a function.

Use a simple linear model to predict the price of a house with 2,500 \(\text{ft}^2\).

p6 + geom_smooth(method = "lm") + 
  geom_vline(xintercept = 2500,linetype = "dashed", color = "red") +
  geom_hline(yintercept = predict(slm, newdata = data.frame(sqft_living = 2500)), linetype = "dashed", color = "red") + 
  labs(x = "Living Space (square feet)", y = "Price ($)")

5 Building models

5.1 Most basic model (named mod.zip below).

mod.zip <- lm(price ~ 1, data = housedata)
summary(mod.zip)

Call:
lm(formula = price ~ 1, data = housedata)

Residuals:
    Min      1Q  Median      3Q     Max 
-464367 -219367  -89367  100633 7160633 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   539367       2804   192.4   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 369700 on 17383 degrees of freedom

5.2 Model with all current features except sqft_basement. (named mod.all below).

mod.all <- lm(price ~ . - sqft_basement, data = housedata)
summary(mod.all)

Call:
lm(formula = price ~ . - sqft_basement, data = housedata)

Residuals:
     Min       1Q   Median       3Q      Max 
-1539648   -60120     2700    56110  3478116 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -4.135e+07  6.410e+06  -6.450 1.15e-10 ***
date                1.145e+02  1.016e+01  11.269  < 2e-16 ***
bedrooms           -1.431e+04  1.708e+03  -8.377  < 2e-16 ***
bathrooms           2.394e+04  2.764e+03   8.663  < 2e-16 ***
sqft_living         1.170e+02  3.714e+00  31.495  < 2e-16 ***
sqft_lot            2.574e-01  4.059e-02   6.342 2.32e-10 ***
floors             -2.963e+04  3.344e+03  -8.862  < 2e-16 ***
waterfrontYes       6.641e+05  1.465e+04  45.331  < 2e-16 ***
view                4.953e+04  1.828e+03  27.102  < 2e-16 ***
conditionfair       9.809e+04  3.414e+04   2.873 0.004075 ** 
conditionaverage    1.042e+05  3.174e+04   3.282 0.001032 ** 
conditiongood       1.318e+05  3.175e+04   4.151 3.32e-05 ***
conditionvery good  1.814e+05  3.194e+04   5.677 1.39e-08 ***
grade10             1.173e+05  1.549e+05   0.757 0.448960    
grade11             3.143e+05  1.552e+05   2.025 0.042869 *  
grade12             7.525e+05  1.561e+05   4.821 1.44e-06 ***
grade13             1.661e+06  1.615e+05  10.287  < 2e-16 ***
grade3             -7.935e+02  1.776e+05  -0.004 0.996434    
grade4             -9.356e+04  1.588e+05  -0.589 0.555737    
grade5             -1.035e+05  1.548e+05  -0.668 0.503898    
grade6             -1.122e+05  1.547e+05  -0.725 0.468200    
grade7             -1.091e+05  1.547e+05  -0.705 0.480711    
grade8             -8.819e+04  1.548e+05  -0.570 0.568766    
grade9             -1.235e+04  1.548e+05  -0.080 0.936432    
sqft_above          5.176e+01  3.858e+00  13.416  < 2e-16 ***
yr_built           -3.312e+00  2.798e-01 -11.838  < 2e-16 ***
yr_renovated        2.591e+00  2.847e-01   9.103  < 2e-16 ***
zipcode98002        1.909e+04  1.516e+04   1.259 0.207972    
zipcode98003       -1.207e+04  1.341e+04  -0.900 0.368288    
zipcode98004        7.176e+05  2.455e+04  29.229  < 2e-16 ***
zipcode98005        2.534e+05  2.626e+04   9.650  < 2e-16 ***
zipcode98006        2.205e+05  2.140e+04  10.305  < 2e-16 ***
zipcode98007        2.136e+05  2.691e+04   7.938 2.18e-15 ***
zipcode98008        2.371e+05  2.569e+04   9.230  < 2e-16 ***
zipcode98010        1.108e+05  2.280e+04   4.860 1.18e-06 ***
zipcode98011        5.783e+04  3.358e+04   1.722 0.085099 .  
zipcode98014        1.005e+05  3.709e+04   2.711 0.006722 ** 
zipcode98019        7.545e+04  3.613e+04   2.088 0.036804 *  
zipcode98022        6.976e+04  2.010e+04   3.470 0.000521 ***
zipcode98023       -5.426e+04  1.240e+04  -4.376 1.21e-05 ***
zipcode98024        1.806e+05  3.184e+04   5.672 1.43e-08 ***
zipcode98027        1.737e+05  2.203e+04   7.885 3.33e-15 ***
zipcode98028        3.778e+04  3.257e+04   1.160 0.246053    
zipcode98029        2.355e+05  2.510e+04   9.383  < 2e-16 ***
zipcode98030        1.126e+04  1.465e+04   0.769 0.441829    
zipcode98031        1.564e+04  1.537e+04   1.018 0.308915    
zipcode98032       -1.284e+04  1.792e+04  -0.717 0.473687    
zipcode98033        2.958e+05  2.789e+04  10.608  < 2e-16 ***
zipcode98034        1.264e+05  2.990e+04   4.228 2.37e-05 ***
zipcode98038        7.791e+04  1.655e+04   4.708 2.53e-06 ***
zipcode98039        1.101e+06  3.245e+04  33.942  < 2e-16 ***
zipcode98040        4.680e+05  2.194e+04  21.327  < 2e-16 ***
zipcode98042        2.810e+04  1.418e+04   1.982 0.047456 *  
zipcode98045        1.814e+05  3.066e+04   5.917 3.33e-09 ***
zipcode98052        1.967e+05  2.842e+04   6.922 4.62e-12 ***
zipcode98053        1.808e+05  3.037e+04   5.954 2.67e-09 ***
zipcode98055        2.146e+04  1.718e+04   1.249 0.211728    
zipcode98056        6.367e+04  1.868e+04   3.409 0.000653 ***
zipcode98058        3.359e+04  1.620e+04   2.073 0.038186 *  
zipcode98059        6.952e+04  1.837e+04   3.784 0.000155 ***
zipcode98065        1.346e+05  2.815e+04   4.782 1.75e-06 ***
zipcode98070       -7.192e+04  2.176e+04  -3.305 0.000950 ***
zipcode98072        9.011e+04  3.321e+04   2.713 0.006671 ** 
zipcode98074        1.588e+05  2.686e+04   5.912 3.45e-09 ***
zipcode98075        1.598e+05  2.576e+04   6.204 5.64e-10 ***
zipcode98077        6.050e+04  3.468e+04   1.745 0.081073 .  
zipcode98092       -1.196e+03  1.326e+04  -0.090 0.928155    
zipcode98102        4.390e+05  2.935e+04  14.959  < 2e-16 ***
zipcode98103        2.424e+05  2.709e+04   8.948  < 2e-16 ***
zipcode98105        3.997e+05  2.773e+04  14.414  < 2e-16 ***
zipcode98106        5.587e+04  2.006e+04   2.785 0.005359 ** 
zipcode98107        2.487e+05  2.780e+04   8.948  < 2e-16 ***
zipcode98108        6.070e+04  2.237e+04   2.714 0.006652 ** 
zipcode98109        4.148e+05  2.861e+04  14.498  < 2e-16 ***
zipcode98112        5.380e+05  2.544e+04  21.151  < 2e-16 ***
zipcode98115        2.480e+05  2.744e+04   9.040  < 2e-16 ***
zipcode98116        2.080e+05  2.232e+04   9.320  < 2e-16 ***
zipcode98117        2.180e+05  2.782e+04   7.834 4.99e-15 ***
zipcode98118        1.124e+05  1.948e+04   5.771 8.02e-09 ***
zipcode98119        4.068e+05  2.701e+04  15.061  < 2e-16 ***
zipcode98122        2.721e+05  2.421e+04  11.235  < 2e-16 ***
zipcode98125        1.097e+05  2.968e+04   3.695 0.000220 ***
zipcode98126        1.206e+05  2.068e+04   5.831 5.61e-09 ***
zipcode98133        5.335e+04  3.061e+04   1.743 0.081338 .  
zipcode98136        1.768e+05  2.100e+04   8.420  < 2e-16 ***
zipcode98144        2.204e+05  2.244e+04   9.823  < 2e-16 ***
zipcode98146        3.043e+04  1.876e+04   1.622 0.104828    
zipcode98148        3.650e+04  2.635e+04   1.385 0.165989    
zipcode98155        4.353e+04  3.191e+04   1.364 0.172629    
zipcode98166        5.831e+03  1.725e+04   0.338 0.735412    
zipcode98168        9.246e+03  1.825e+04   0.507 0.612501    
zipcode98177        1.079e+05  3.190e+04   3.382 0.000720 ***
zipcode98178       -5.908e+03  1.861e+04  -0.317 0.750888    
zipcode98188       -4.184e+03  1.898e+04  -0.220 0.825497    
zipcode98198       -3.087e+04  1.448e+04  -2.132 0.033009 *  
zipcode98199        2.724e+05  2.639e+04  10.323  < 2e-16 ***
lat                 2.233e+05  6.632e+04   3.367 0.000761 ***
long               -2.361e+05  4.727e+04  -4.995 5.95e-07 ***
sqft_living15       1.521e+01  3.029e+00   5.023 5.15e-07 ***
sqft_lot15         -1.159e-01  6.429e-02  -1.803 0.071344 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 150200 on 17284 degrees of freedom
Multiple R-squared:  0.8359,    Adjusted R-squared:  0.835 
F-statistic: 889.4 on 99 and 17284 DF,  p-value: < 2.2e-16
anova(mod.all)
Analysis of Variance Table

Response: price
                 Df     Sum Sq    Mean Sq    F value    Pr(>F)    
date              1 2.9956e+10 2.9956e+10     1.3282 0.2491485    
bedrooms          1 2.3249e+14 2.3249e+14 10307.9306 < 2.2e-16 ***
bathrooms         1 4.2285e+14 4.2285e+14 18748.2537 < 2.2e-16 ***
sqft_living       1 5.5816e+14 5.5816e+14 24747.5215 < 2.2e-16 ***
sqft_lot          1 3.8694e+12 3.8694e+12   171.5587 < 2.2e-16 ***
floors            1 9.0483e+10 9.0483e+10     4.0118 0.0451986 *  
waterfront        1 8.3044e+13 8.3044e+13  3681.9764 < 2.2e-16 ***
view              1 3.8892e+13 3.8892e+13  1724.3793 < 2.2e-16 ***
condition         4 1.6341e+13 4.0854e+12   181.1348 < 2.2e-16 ***
grade            11 1.7804e+14 1.6185e+13   717.6222 < 2.2e-16 ***
sqft_above        1 7.0222e+12 7.0222e+12   311.3450 < 2.2e-16 ***
yr_built          1 8.4946e+13 8.4946e+13  3766.2921 < 2.2e-16 ***
yr_renovated      1 4.7609e+11 4.7609e+11    21.1086 4.370e-06 ***
zipcode          69 3.5814e+14 5.1905e+12   230.1340 < 2.2e-16 ***
lat               1 3.1253e+11 3.1253e+11    13.8567 0.0001979 ***
long              1 6.0257e+11 6.0257e+11    26.7164 2.382e-07 ***
sqft_living15     1 5.4648e+11 5.4648e+11    24.2294 8.630e-07 ***
sqft_lot15        1 7.3351e+10 7.3351e+10     3.2522 0.0713445 .  
Residuals     17284 3.8983e+14 2.2554e+10                         
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Your goal is to create a model with as small a test error as possible.

5.3 Prediction

  • Submit a vector named (Yourlastname_Yourfirstname) with the predicted house prices for your model using the data frame housedataT.

  • Suppose your final model is mod.all.

dim(housedataT)
[1] 4229   19
Yourlastname_Yourfirstname <- predict(mod.all, newdata = housedataT)
#Yourlastname_Yourfirstname
  • Use write.csv() to create the vector Yourlastname_Yourfirstname.csv
write.csv(Yourlastname_Yourfirstname, file = "Yourlastname_Yourfirstname.csv") # this part will due May 3rd
  • I will compute your \(\sqrt{MSPE}\) as I have have the price values for the data set housedataT.
[1] 105493.4

Note: When we create lasso or ridge models, we use the function model.matrix to convert data into a model matrix. This function turns all of the factor variables into dummy variables. Because of this, for this data set, once the models are created, we will have 100 predictors (we initially had only 19). But the issue is that the housedataT data set does not have 100 predictors (only 19). The fix is very simple: we convert housedataT into a model matrix as bellow:

housedataT_new <- model.matrix(~., housedataT)[,-1] # Use this to send me predictions for only lasso or ridge like models

#Assume you created a Lasso model and named it lasso_model
Yourlastname_Yourfirstname <- predict(lasso_model ,s = bestlam, newx = housedataT_new)