for loops are a programming tool for iteratively performing an operation, which may depend on the iteration number
Ex: Create a dataframe of 10 rows and 4 columns, where the ith column contains the first 10 multiples of i.
# Start from a 10 x 4 placeholder frame, then overwrite each column in turn.
my_df <- as.data.frame(matrix(0, nrow = 10, ncol = 4))
for (col_idx in seq_len(ncol(my_df))) {
  # Column `col_idx` holds the first ten multiples of `col_idx`.
  my_df[, col_idx] <- seq_len(10) * col_idx
}
Let’s rename the columns so they are of the form: multiples.i
# Label each column "multiples.<i>" after the multiplier it stores.
names(my_df) <- paste0("multiples.", seq_len(4))
# Show the first rows to check the result.
head(my_df)
map functions take a vector (or list) as input, apply a specific function to each element, and return a vector as output:
Let’s compute the mean of each column of the data frame:
library(purrr)
# Apply `mean` to every column of my_df; the result is one double per column.
map_dbl(.x = my_df, .f = mean)
## multiples.1 multiples.2 multiples.3 multiples.4
## 5.5 11.0 16.5 22.0
Goal: Use cross-validation to determine the best k for knn
# Load the housing data; the path is relative to the current working directory.
GrinnellHousesClean <- read.csv(file = "data/GrinnellHousesClean.csv")
Create an initial split
library(rsample)
# Fix the RNG state so the random train/test partition, and every RMSE
# computed from it, is the same on each run.
set.seed(331)
my_initial_split <- initial_split(GrinnellHousesClean)
# Pull the two pieces of the split out as separate data frames.
Grinnell_train <- training(my_initial_split)
Grinnell_test <- testing(my_initial_split)
Build a template for a function to make a knn model
library(kknn)
# Template call: a 5-nearest-neighbour fit of sale price on location.
# `distance` is kknn's Minkowski distance parameter; the "rectangular"
# kernel is the unweighted (standard) knn.
my_knn_mod <- kknn(
  formula = SalePrice ~ Latitude + Longitude,
  train = Grinnell_train,
  test = Grinnell_test,
  k = 5,
  distance = 1,
  kernel = "rectangular"
)
Build a function which can take k as an input and return a knn model as output
# Fit a kknn model of SalePrice on Latitude + Longitude for a given k.
#
# Args:
#   k:     number of neighbours; a single number >= 1.
#   train: data frame the model is fitted on (defaults to Grinnell_train).
#   test:  data frame to predict for (defaults to Grinnell_test).
# Returns:
#   The object returned by kknn().
get_knn <- function(k, train = Grinnell_train, test = Grinnell_test) {
  # Fail early with a clear message instead of deep inside kknn().
  stopifnot(is.numeric(k), length(k) == 1, !is.na(k), k >= 1)
  kknn(
    SalePrice ~ Latitude + Longitude,
    train = train,
    test = test,
    k = k,
    distance = 1,
    kernel = "rectangular"
  )
}
# Example: the 10-nearest-neighbour model on the default train/test split.
my_knn10 <- get_knn(10)
Build a function that will instead output the rmse for the knn model
# Fit a kknn model for a given k and return its RMSE on the test data.
#
# Args:
#   k:     number of neighbours; a single number >= 1.
#   train: data frame the model is fitted on (defaults to Grinnell_train).
#   test:  data frame to predict for and score against; must contain
#          SalePrice (defaults to Grinnell_test).
# Returns:
#   A single numeric: sqrt(mean((observed - predicted)^2)) over `test`.
get_knn_rmse <- function(k, train = Grinnell_train, test = Grinnell_test) {
  # Fail early with a clear message instead of deep inside kknn().
  stopifnot(is.numeric(k), length(k) == 1, !is.na(k), k >= 1)
  knn_fit <- kknn(
    SalePrice ~ Latitude + Longitude,
    train = train,
    test = test,
    k = k,
    distance = 1,
    kernel = "rectangular"
  )
  # fitted.values holds the model's predictions for the rows of `test`.
  pred_errors <- test$SalePrice - knn_fit$fitted.values
  sqrt(mean(pred_errors^2))
}
# Example: test RMSE of the 10-nearest-neighbour model.
get_knn_rmse(10)
## [1] 54538.87
Use a for loop to compute test rmse for k between 1 and 30
# Preallocate one slot per candidate k, then fill slot k with its test RMSE.
my_rmse <- numeric(30)
for (k_val in seq_along(my_rmse)) {
  my_rmse[k_val] <- get_knn_rmse(k_val)
}
head(my_rmse)
## [1] 53573.16 47559.38 53368.88 54615.40 54576.33 54433.50
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Line-and-point plot of test RMSE against k.
data.frame(k = seq_len(30), my_rmse = my_rmse) %>%
  ggplot(mapping = aes(x = k, y = my_rmse)) +
  geom_line() +
  geom_point()
Use cross-validation to compare the values of k
# Seed the RNG so the fold assignment is reproducible.
set.seed(331)
library(rsample)
# Ten folds: vfold_cv()'s default, stated explicitly here.
my_cv <- vfold_cv(GrinnellHousesClean, v = 10)
Let’s rebuild our get_knn_rmse function so that it takes a split as input:
# RMSE of a k-nearest-neighbour fit for a single resampling split.
#
# Args:
#   k:     number of neighbours passed to kknn().
#   split: a split object from which analysis() and assessment() extract
#          the fitting and held-out data.
# Returns:
#   A single numeric: the RMSE of the predictions on the held-out rows.
get_knn_rmse_split <- function(k, split) {
  fold_train <- analysis(split)
  fold_holdout <- assessment(split)
  fold_fit <- kknn(
    SalePrice ~ Latitude + Longitude,
    train = fold_train,
    test = fold_holdout,
    k = k,
    distance = 1,
    kernel = "rectangular"
  )
  # Observed minus predicted sale price on the held-out rows, squared.
  squared_err <- (fold_holdout$SalePrice - fold_fit$fitted.values)^2
  sqrt(mean(squared_err))
}
Now, use map_dbl to create a vector of rmse for each fold:
library(purrr)
# One RMSE per fold at k = 3; each element of my_cv$splits is passed as `split`.
map_dbl(my_cv$splits, function(fold) get_knn_rmse_split(k = 3, split = fold))
## [1] 45758.35 50363.51 54908.40 39963.55 52671.92 65881.48 50499.79 47328.87
## [9] 57953.32 55120.20
Now, we use a for-loop to iterate across all k between 1 and 30:
# One row per CV fold, one column per candidate k (1 to 30).
# The row count comes from the resampling object rather than a literal 10,
# so changing the number of folds does not break the column assignment below.
my_rmse_mat <- as.data.frame(
  matrix(NA_real_, nrow = length(my_cv$splits), ncol = 30)
)
for (i in seq_len(ncol(my_rmse_mat))) {
  # Column i holds the per-fold RMSEs for k = i.
  my_rmse_mat[, i] <- map_dbl(my_cv$splits, get_knn_rmse_split, k = i)
}
Now, we rename columns:
# Name each column "knn<k>" after the k it was computed with.
names(my_rmse_mat) <- paste0("knn", seq_len(30))
head(my_rmse_mat)
Finally, we can get our CV RMSE estimates by applying the mean function to each column using map_dbl:
# Average each column of my_rmse_mat to get one CV RMSE estimate per column.
cv_rmse <- map_dbl(.x = my_rmse_mat, .f = mean)
Finally, we plot to summarize:
# Line-and-point plot of the cross-validated RMSE against k.
data.frame(k = seq_len(30), cv_rmse = cv_rmse) %>%
  ggplot(mapping = aes(x = k, y = cv_rmse)) +
  geom_line() +
  geom_point()