Finding LS estimates in \(R\)

Read data from a URL

The toluca data set comes from Table 1 in Chapter 1 (CH01TA01.txt). To access the data sets, go to this URL: http://www.cnachtsheim-text.csom.umn.edu/Chapter%20%201%20Data%20Sets.html. Use the following code to load the toluca data set into R:

#Download the whitespace-delimited toluca file; it has no header row
toluca <- read.table(
  file = "http://www.cnachtsheim-text.csom.umn.edu/Kutner/Chapter%20%201%20Data%20Sets/CH01TA01.txt",
  header = FALSE,
  sep = ""
)


#Preview the first 6 rows of the data as read in
head(toluca, n = 6)
  V1  V2
1 80 399
2 30 121
3 50 221
4 90 376
5 70 361
6 60 224

Rename columns

#Replace the default V1/V2 labels with descriptive column names
names(toluca) <- c("lotSize", "hours")

#Preview the first 6 rows again to confirm the new column names
head(toluca, n = 6)
  lotSize hours
1      80   399
2      30   121
3      50   221
4      90   376
5      70   361
6      60   224
#View(toluca)

Creating a scatter plot

library(ggplot2)

#Scatter plot of work hours (y) against lot size (x)
ggplot(data = toluca, mapping = aes(x = lotSize, y = hours)) +
  geom_point() +
  labs(
    title = "Toluca example scatter plot",
    x = "Lot Size",
    y = "Work Hours"
  ) +
  theme_bw()

Note: Lot Size and Work Hours have a strong, positive, linear association.

Creating a scatter plot, LS line added

#Same scatter plot, with the least-squares line drawn over the points
#(se = FALSE leaves out the confidence band)
ggplot(data = toluca, mapping = aes(x = lotSize, y = hours)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Toluca example, LS line added",
    x = "Lot Size",
    y = "Work Hours"
  ) +
  theme_bw()

Finding the LS estimates

#Fit the simple linear regression of hours on lotSize by least squares
toluca_LS_model <- lm(formula = hours ~ lotSize, data = toluca)

#Keep the summary object; later steps read the residuals from it
summary_toluca_LS_model <- summary(toluca_LS_model)
print(summary_toluca_LS_model)

Call:
lm(formula = hours ~ lotSize, data = toluca)

Residuals:
    Min      1Q  Median      3Q     Max 
-83.876 -34.088  -5.982  38.826 103.528 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   62.366     26.177   2.382   0.0259 *  
lotSize        3.570      0.347  10.290 4.45e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 48.82 on 23 degrees of freedom
Multiple R-squared:  0.8215,    Adjusted R-squared:  0.8138 
F-statistic: 105.9 on 1 and 23 DF,  p-value: 4.449e-10

Finding fitted values \(\hat{y}_i\) and residuals \(e_i = y_i-\hat{y}_i\)

#Method 1
#library(moderndive)
#Fittedandresiduals <-get_regression_points(toluca_LS_model)
#Fittedandresiduals

#Method 2
#Use the extractor functions on the fitted lm object.
#The summary object stores the residuals but has no fitted-values component,
#so the fitted values are taken from the model itself.
y_hat <- fitted(toluca_LS_model)
e <- residuals(toluca_LS_model)
e
          1           2           3           4           5           6 
 51.0179798 -48.4719192 -19.8759596  -7.6840404  48.7200000 -52.5779798 
          7           8           9          10          11          12 
 55.2098990   4.0179798 -66.3860606 -83.8759596 -45.1739394 -60.2800000 
         13          14          15          16          17          18 
  5.3159596 -20.7698990 -20.0880808   0.6139394  42.5280808  27.1240404 
         19          20          21          22          23          24 
 -6.6840404 -34.0880808 103.5280808  84.3159596  38.8260606  -5.9820202 
         25 
 10.7200000 

Calculating \(SSE = \sum (y_i-\hat{y}_i)^2\)

#Using method 1 above
#sum_of_square_of_residuals <- sum(Fittedandresiduals$residual^2)
#sum_of_square_of_residuals

#Using method 2: square each residual stored in the summary object, then add them up
sum_of_square_of_residuals <- sum(summary_toluca_LS_model[["residuals"]]^2)
print(sum_of_square_of_residuals)
[1] 54825.46

Calculating \(MSE =SSE/(n-2)\)

#Divide SSE by n - 2, where n is the number of rows in toluca
Mean_Square_Error <- sum_of_square_of_residuals / (nrow(toluca) - 2)
print(Mean_Square_Error)
[1] 2383.716

Calculating the residual standard error (the estimator of the standard deviation \(\sigma\)): \(s = \sqrt{MSE}\)

#Residual standard error: the square root of the MSE
s <- sqrt(Mean_Square_Error)
print(s)
[1] 48.82331