library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
GrinnellHouses <- read.csv("https://grinnell-sta-295-s24.github.io/data/GrinnellHouses.csv")
Codebook for data:
https://grinnell-sta-295-s24.github.io/data/GrinnellHouses_codebook.txt
We start with data exploration:
# Quick structural overview: column names, types, and first values
GrinnellHouses %>% glimpse()
## Rows: 929
## Columns: 17
## $ Date <int> 16695, 16880, 16875, 16833, 16667, 16583, 16602, 16687, 16…
## $ Address <chr> "1510 First Ave #112 ", "1020 Center St ", "918 Chatterton…
## $ Bedrooms <int> 2, 3, 4, 3, 3, 3, 2, 2, 3, 3, 5, 4, 3, 2, 2, 2, 3, 4, 4, 2…
## $ Baths <dbl> 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.50, 2.00, 2.00…
## $ SquareFeet <int> 1120, 1224, 1540, 1154, 1277, 1079, 768, 1152, 912, 1488, …
## $ LotSize <dbl> NA, 0.1721763, NA, NA, 0.2066116, 0.1993572, NA, NA, 0.218…
## $ YearBuilt <int> 1993, 1900, 1970, 1900, 1900, 1900, 1955, 1900, 1900, 1900…
## $ YearSold <int> 2005, 2006, 2006, 2006, 2005, 2005, 2005, 2005, 2005, 2005…
## $ MonthSold <int> 9, 3, 3, 2, 8, 5, 6, 9, 9, 9, 8, 3, 3, 12, 8, 5, 1, 3, 4, …
## $ DaySold <int> 16, 20, 15, 1, 19, 27, 15, 8, 21, 23, 30, 17, 22, 2, 16, 2…
## $ CostPerSqFt <dbl> 6.25, 22.06, 18.18, 26.00, 24.08, 38.92, 59.90, 42.97, 54.…
## $ OrigPrice <int> 17000, 35000, 54000, 65000, 35000, 45900, 62500, 54500, 59…
## $ ListPrice <int> 10500, 35000, 47000, 49000, 35000, 45900, 56500, 52500, 52…
## $ SalePrice <int> 7000, 27000, 28000, 30000, 30750, 42000, 46000, 49500, 500…
## $ SPLPPct <dbl> 66.67, 77.14, 59.57, 61.22, 87.86, 91.50, 81.42, 94.29, 95…
## $ Latitude <dbl> 41.73880, 41.74558, 41.74404, 41.74503, 41.74041, 41.81942…
## $ Longitude <dbl> -92.71378, -92.73168, -92.71308, -92.72896, -92.73002, -92…
Let’s plot Latitude and Longitude
# Map house locations, coloring each point by its sale price
GrinnellHouses %>%
  ggplot(aes(x = Longitude, y = Latitude, color = SalePrice)) +
  geom_point() +
  viridis::scale_color_viridis()
Removing houses with erroneous coordinates:
# Drop rows whose coordinates were recorded as 0 — clearly data-entry errors
GrinnellHouses <- GrinnellHouses %>%
  filter(Latitude != 0 & Longitude != 0)
And plotting again:
# Re-plot after removing the (0, 0) coordinates
GrinnellHouses %>%
  ggplot(aes(x = Longitude, y = Latitude, color = SalePrice)) +
  geom_point() +
  viridis::scale_color_viridis()
We still have some houses with atypical coordinates, but these are less likely to be data errors. Still, it may be best to remove them.
# Restrict to a bounding box around Grinnell, dropping far-flung outliers
GrinnellHouses <- GrinnellHouses %>%
  filter(Longitude > -94 & Longitude < -92.5 & Latitude < 41.85)
Plotting once more:
# Final check of the cleaned coordinates
GrinnellHouses %>%
  ggplot(aes(x = Longitude, y = Latitude, color = SalePrice)) +
  geom_point() +
  viridis::scale_color_viridis()
Do we have any NA data?
# Count missing values in each column
GrinnellHouses %>% is.na() %>% colSums()
## Date Address Bedrooms Baths SquareFeet LotSize
## 0 0 0 0 18 187
## YearBuilt YearSold MonthSold DaySold CostPerSqFt OrigPrice
## 0 0 0 0 0 0
## ListPrice SalePrice SPLPPct Latitude Longitude
## 0 0 0 0 0
This is fine, but we should be cautious depending on which models we use.
Goal: write a function that will build a model on a supplied training set.
Suggestions: KNN model
This function will take as input a training data set, a test data set, the number of neighbors desired, and a number corresponding to the distance metric, and will output the knn model object, which can be used to find fitted values.
#' Fit a KNN regression of SalePrice on Latitude and Longitude.
#'
#' @param the_training_data Data frame the model is fit on.
#' @param the_test_data Data frame for which fitted values are produced.
#' @param how_many_neighbors Number of nearest neighbors (k).
#' @param distance_number Minkowski distance parameter
#'   (1 = Manhattan, 2 = Euclidean).
#' @return A `kknn` model object; its `fitted.values` component holds the
#'   predictions for `the_test_data`.
get_that_model <- function(the_training_data, the_test_data, how_many_neighbors, distance_number){
  # Namespaced call instead of library(kknn) inside the body: attaching a
  # package is a global side effect a model-fitting helper should not have.
  kknn::kknn(SalePrice ~ Latitude + Longitude,
             train = the_training_data,
             test = the_test_data,
             distance = distance_number,
             k = how_many_neighbors,
             kernel = "rectangular")
}
We now split our data into test and training sets using functions
from rsample
# Reproducible train/test partition via rsample
set.seed(100)
library(rsample)
grinnell_split <- initial_split(GrinnellHouses)
Grinnell_train <- training(grinnell_split)
Grinnell_test <- testing(grinnell_split)
Now, let’s quickly build our models for a variety of different k’s, along with different distance metrics
# k = 15 neighbors, Euclidean distance
knn_out_15k <- get_that_model(Grinnell_train, Grinnell_test,
                              how_many_neighbors = 15, distance_number = 2)
# k = 5 neighbors, Euclidean distance
knn_out_5k <- get_that_model(Grinnell_train, Grinnell_test,
                             how_many_neighbors = 5, distance_number = 2)
# k = 5 neighbors, Manhattan distance
knn_out_5k_man <- get_that_model(Grinnell_train, Grinnell_test,
                                 how_many_neighbors = 5, distance_number = 1)
We can arrange the predictions from these models as columns in a data frame
# Collect each model's fitted values as columns of one data frame
pred_data_frame <- data.frame(
  the_5     = knn_out_5k$fitted.values,
  the_15    = knn_out_15k$fitted.values,
  the_5_man = knn_out_5k_man$fitted.values
)
head(pred_data_frame)
## the_5 the_15 the_5_man
## 1 99400 93577.93 89400
## 2 57260 59663.33 57260
## 3 62000 67000.00 62400
## 4 73200 83800.00 73200
## 5 62160 68133.33 62160
## 6 72060 97626.67 72060
Now, we use rsample
to perform 10-fold cross validation
using the vfold_cv
function.
# Fix the RNG state so the fold assignment is reproducible
set.seed(9432)
# Partition the training data into 10 cross-validation folds
my_fold <- vfold_cv(Grinnell_train, v = 10)
Note that the output of the vfold_cv
function is a data
frame; the first element is a “Split” indicating how data was
partitioned, and the second element is the “Fold” used for
validation.
Let’s look at the first element of splits:
splits
is a list, so to access the first
element, we need to use the [[ ]]
notation, rather than the
[ ]
notation used for vectors or data frames.

my_fold$splits[[1]]
## <Analysis/Assess/Total>
## <619/69/688>
To get training set, apply analysis
function to one of
splits:
# First rows of the training (analysis) portion of fold 1
my_fold$splits[[1]] %>% analysis() %>% head()
## Date Address Bedrooms Baths SquareFeet LotSize YearBuilt YearSold
## 1 19327 408 16th Ave 3 3.5 2399 0.2203857 1989 2012
## 2 18624 1211 Hamlin 2 1.0 1152 0.3575758 1986 2010
## 3 17310 1923 Reed St 4 3.0 2330 NA 1999 2007
## 4 19047 1909 Reed St 3 3.0 4240 0.2647383 2001 2012
## 5 19277 4259 20th St 4 3.0 3922 21.9100000 1991 2012
## 6 18032 1120 West St 3 2.0 1799 0.2840909 1900 2009
## MonthSold DaySold CostPerSqFt OrigPrice ListPrice SalePrice SPLPPct Latitude
## 1 11 30 116.72 299900 285000 280000 98.25 41.76062
## 2 12 28 23.52 30000 30000 27100 90.33 41.74810
## 3 5 24 163.09 399900 399900 380000 95.02 41.75820
## 4 2 24 70.17 359000 324000 297500 91.82 41.75767
## 5 10 11 108.36 455000 455000 425000 93.41 41.67986
## 6 5 15 49.47 102000 89900 89000 99.00 41.74700
## Longitude
## 1 -92.73036
## 2 -92.72959
## 3 -92.73296
## 4 -92.73294
## 5 -92.74646
## 6 -92.72720
To get test set, instead apply assessment
function
# First rows of the held-out (assessment) portion of fold 1
my_fold$splits[[1]] %>% assessment() %>% head()
## Date Address Bedrooms Baths SquareFeet LotSize YearBuilt
## 1 17353 321 4th Ave 5 2.50 2050 0.1928375 1900
## 2 16832 409 Park Street 3 1.75 1064 NA 1971
## 3 18375 1417 Michael Avenue 4 3.00 2905 0.2871901 2003
## 4 17198 409 5th Ave 3 2.00 1678 NA 1900
## 5 19792 1623 3rd 4 2.00 1576 0.0355831 1900
## 6 16982 12 Melrose Ln 3 2.00 1341 0.2375803 2006
## YearSold MonthSold DaySold CostPerSqFt OrigPrice ListPrice SalePrice SPLPPct
## 1 2007 7 6 68.29 148000 148000 140000 94.59
## 2 2006 1 31 106.67 115900 115900 113500 97.93
## 3 2010 4 23 101.38 299900 299900 294500 98.20
## 4 2007 2 1 53.93 94900 89000 90500 101.69
## 5 2014 3 10 38.07 80000 80000 60000 75.00
## 6 2006 6 30 132.74 161900 161900 178000 109.94
## Latitude Longitude
## 1 41.74337 -92.73204
## 2 41.73642 -92.72261
## 3 41.72700 -92.71491
## 4 41.74485 -92.73120
## 5 41.74186 -92.71235
## 6 41.73429 -92.71336
Now let’s use functions to build a model on splits. We’ll create prediction on the test set and then compute the test RMSE. Our function will output this RMSE
#' Fit a linear model on the analysis portion of one rsample split and
#' return the RMSE of its predictions on the assessment (held-out) portion.
#'
#' @param split An `rsplit` object, e.g. one element of `vfold_cv()$splits`.
#' @return A single number: the validation RMSE.
my_cv_model <- function(split){
  fit <- lm(SalePrice ~ Latitude + Longitude, data = analysis(split))
  holdout <- assessment(split)
  errors <- holdout$SalePrice - predict(fit, newdata = holdout)
  sqrt(mean(errors^2))
}
Let's apply it to the first split to test functionality:
# Sanity check on the first fold
my_fold$splits[[1]] %>% my_cv_model()
## [1] 94376.73
To apply a function to each element of a list or vector, use the
map_dbl
from the purrr
package
library(purrr)
# One validation RMSE per fold
my_fold$splits %>% map_dbl(my_cv_model)
## [1] 94376.73 86427.95 72153.23 76523.70 77138.36 71094.67 66483.30 78088.23
## [9] 62486.97 67344.22
Let’s attach this to the data frame of splits:
# Store each fold's validation RMSE alongside its split
my_fold$rmse <- my_fold$splits %>% map_dbl(my_cv_model)
And look at the results:
# Show the fold tibble with the new rmse column
print(my_fold)
## # 10-fold cross-validation
## # A tibble: 10 × 3
## splits id rmse
## <list> <chr> <dbl>
## 1 <split [619/69]> Fold01 94377.
## 2 <split [619/69]> Fold02 86428.
## 3 <split [619/69]> Fold03 72153.
## 4 <split [619/69]> Fold04 76524.
## 5 <split [619/69]> Fold05 77138.
## 6 <split [619/69]> Fold06 71095.
## 7 <split [619/69]> Fold07 66483.
## 8 <split [619/69]> Fold08 78088.
## 9 <split [620/68]> Fold09 62487.
## 10 <split [620/68]> Fold10 67344.
To get cross-validated RMSE, take average of the rmse column:
# Cross-validated RMSE: average of the per-fold RMSEs
my_fold$rmse %>% mean()
## [1] 75211.74