As before, we use a subset of the PDX Trees data:
library(pdxTrees)
library(dplyr)
library(tidyr)
my_pdxTrees <- get_pdxTrees_parks(park = c("Berkeley Park", "Woodstock Park",
                                           "Westmoreland Park", "Mt Scott Park",
                                           "Powell Park", "Kennilworth Park",
                                           "Sellwood Park",
                                           "Crystal Springs Rhododendron Garden",
                                           "Laurelhurst Park")) %>%
  mutate_if(is.character, as.factor) %>%
  dplyr::select(DBH, Condition, Tree_Height, Crown_Width_NS, Crown_Width_EW,
                Crown_Base_Height, Functional_Type, Mature_Size,
                Carbon_Sequestration_lb) %>%
  drop_na()
names(my_pdxTrees)
## [1] "DBH" "Condition"
## [3] "Tree_Height" "Crown_Width_NS"
## [5] "Crown_Width_EW" "Crown_Base_Height"
## [7] "Functional_Type" "Mature_Size"
## [9] "Carbon_Sequestration_lb"
dim(my_pdxTrees)
## [1] 3015 9
set.seed(1)
library(rsample)
my_pdxTrees_split <- initial_split(my_pdxTrees)
my_pdxTrees_train <- training(my_pdxTrees_split)
my_pdxTrees_test <- testing(my_pdxTrees_split)
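By default, initial_split() reserves about 25% of the rows for testing (prop = 3/4 in rsample), which can be confirmed by checking the sizes of the two pieces (output not shown):
# Roughly 75% of the 3015 rows go to training, 25% to testing
nrow(my_pdxTrees_train)
nrow(my_pdxTrees_test)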
The general tuning strategy is:
1. Choose a relatively high initial learning rate; 0.1 is a reasonable starting point.
2. Determine the optimal number of trees for this learning rate using cross-validation (sketched below).
3. Fix the other tree-specific parameters and tune the learning rate, assessing both computation time and model accuracy.
4. Tune the tree-specific parameters for the chosen learning rate.
5. Once the tree-specific parameters have been found, lower the learning rate and increase the number of trees to see whether accuracy improves.
Warning! This search can take considerable time (minutes to hours), depending on computing power, the number of variables in the model, and the number of observations. DO NOT ATTEMPT ON RSTUDIO SERVER!!
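As a smaller-scale sketch of steps 1 and 2 before the full search (gbm_step12 is just an illustrative name and is not used later), a single model can be fit at a learning rate of 0.1 with cross-validation, and the optimal number of trees read off with gbm.perf():
library(gbm)
set.seed(1)
# Step 1: fix a relatively high learning rate (shrinkage = 0.1).
# Step 2: use 10-fold CV to find the number of trees minimizing CV error.
gbm_step12 <- gbm(Carbon_Sequestration_lb ~ ., data = my_pdxTrees_train,
                  distribution = "gaussian",
                  n.trees = 5000,
                  interaction.depth = 3,
                  shrinkage = 0.1,
                  cv.folds = 10,
                  n.minobsinnode = 10)
# gbm.perf() with method = "cv" plots training/CV error by iteration
# and returns the CV-optimal number of trees.
gbm.perf(gbm_step12, method = "cv")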
Now we tune across the learning rate, considering a variety of small and large rates. system.time() is used to assess whether certain models take longer to fit than others. For each learning rate, we compute the RMSE (estimated by 10-fold cross-validation), along with the number of trees producing the minimal RMSE.
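As a minimal aside (t_demo is just a throwaway name for this illustration), system.time() returns user, system, and elapsed times in seconds; the loop below records the "elapsed" (wall-clock) component:
# Toy illustration: the "elapsed" entry is wall-clock time in seconds
# (here roughly 1 second, from the sleep).
t_demo <- system.time({ Sys.sleep(1) })
t_demo[["elapsed"]]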
library(gbm)
learning_rate <- c(0.3, 0.1, 0.05, 0.01, 0.005, 0.001)
rMSE <- rep(NA, 6)
n_trees <- rep(NA, 6)
Time <- rep(NA, 6)
set.seed(122)
for (i in 1:6){
  # Time the cross-validated fit at this learning rate
  train_time <- system.time({
    gbm_mod <- gbm(Carbon_Sequestration_lb ~ ., my_pdxTrees_train,
                   distribution = "gaussian",
                   n.trees = 5000,
                   interaction.depth = 3,
                   shrinkage = learning_rate[i],
                   cv.folds = 10,
                   n.cores = 8,
                   n.minobsinnode = 10) })
  # CV-estimated RMSE and the number of trees that minimizes it
  rMSE[i] <- sqrt(min(gbm_mod$cv.error))
  n_trees[i] <- which.min(gbm_mod$cv.error)
  Time[i] <- train_time[["elapsed"]]
}
data.frame(learning_rate, n_trees, Time, rMSE) %>% arrange(rMSE) %>% head()
## learning_rate n_trees Time rMSE
## 1 0.010 4689 12.370 9.718694
## 2 0.100 333 12.445 9.730369
## 3 0.050 970 12.251 9.759684
## 4 0.005 4501 12.543 9.995629
## 5 0.300 237 12.601 10.406424
## 6 0.001 5000 12.235 11.572594
(This took approximately 1 minute on my 2023 MacBook Pro.)
Here, we fix the shrinkage (learning rate) determined earlier, 0.01, and tune the tree-specific parameters over a grid.
my_grid <- expand.grid(
  n.trees = c(5000),
  shrinkage = 0.01,
  interaction.depth = c(3, 5, 7),
  n.minobsinnode = c(5, 10, 15)
)
model_fit <- function(n.trees, shrinkage, interaction.depth, n.minobsinnode){
  set.seed(40)
  library(gbm)
  # Fit a boosted tree with the given parameters and return the
  # minimum 10-fold CV RMSE
  gbm_mod <- gbm(Carbon_Sequestration_lb ~ ., my_pdxTrees_train,
                 distribution = "gaussian",
                 n.trees = n.trees,
                 interaction.depth = interaction.depth,
                 shrinkage = shrinkage,
                 cv.folds = 10,
                 n.cores = 8,
                 n.minobsinnode = n.minobsinnode)
  rMSE <- sqrt(min(gbm_mod$cv.error))
  rMSE
}
We now use the pmap_dbl() function from purrr, which iterates over multiple inputs simultaneously, in this case the columns of our tuning grid, and returns a numeric vector with one result per row.
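A minimal illustration of how pmap_dbl() iterates (the toy inputs here are made up purely for the example), followed by our grid search:
library(purrr)
# pmap_dbl() walks the parallel inputs element by element: ..1 is the
# current value of a, ..2 the current value of b.
pmap_dbl(list(a = 1:3, b = c(10, 20, 30)), ~ ..1 + ..2)
# returns 11 22 33
Because the column names of my_grid match the argument names of model_fit(), pmap_dbl(my_grid, model_fit) should also work, matching arguments by name rather than by position.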
library(purrr)
set.seed(10)
my_grid$rmse <- pmap_dbl(
  my_grid,
  ~ model_fit(
    n.trees = ..1,
    shrinkage = ..2,
    interaction.depth = ..3,
    n.minobsinnode = ..4))
(This took about 5 minutes on my 2023 MacBook Pro.)
We can then view results of the exhaustive search:
head(arrange(my_grid, rmse))
## n.trees shrinkage interaction.depth n.minobsinnode rmse
## 1 5000 0.01 7 5 9.389038
## 2 5000 0.01 7 10 9.486088
## 3 5000 0.01 5 5 9.557491
## 4 5000 0.01 7 15 9.610702
## 5 5000 0.01 5 10 9.708710
## 6 5000 0.01 5 15 9.708723
We take the top-performing parameters and then increase the number of trees, to see whether any further accuracy gains can be obtained.
set.seed(10101)
library(gbm)
cv_boosted_tree_2 <- gbm(Carbon_Sequestration_lb ~ ., my_pdxTrees_train,
                         distribution = "gaussian",
                         n.trees = 10000,
                         interaction.depth = 7,
                         n.minobsinnode = 5,
                         shrinkage = 0.01,
                         cv.folds = 10,
                         n.cores = 8)
my_errors2 <- cv_boosted_tree_2$cv.error
best_n2 <- which.min(cv_boosted_tree_2$cv.error)
data.frame(best_n2, cv_error = sqrt(my_errors2[best_n2]))
## best_n2 cv_error
## 1 2107 8.956275
We also check whether we can realize any gains from decreasing the learning rate:
set.seed(10101)
cv_boosted_tree_3 <- gbm(Carbon_Sequestration_lb ~ ., my_pdxTrees_train,
                         distribution = "gaussian",
                         n.trees = 10000,
                         interaction.depth = 7,
                         n.minobsinnode = 5,
                         shrinkage = 0.005,
                         cv.folds = 10,
                         n.cores = 8)
my_errors3 <- cv_boosted_tree_3$cv.error
best_n3 <- which.min(cv_boosted_tree_3$cv.error)
data.frame(best_n3, cv_error = sqrt(my_errors3[best_n3]))
## best_n3 cv_error
## 1 4292 8.978074
Lowering the learning rate did not improve the cross-validated error, so we now fit the best gbm model (shrinkage = 0.01, interaction.depth = 7, n.minobsinnode = 5, and 2107 trees):
set.seed(698)
gbm_best <- gbm(Carbon_Sequestration_lb ~ ., my_pdxTrees_train,
                distribution = "gaussian",
                n.trees = 2107,
                interaction.depth = 7,
                shrinkage = 0.01,
                n.minobsinnode = 5)
my_preds_bt_best <- predict(gbm_best, my_pdxTrees_test)
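One note on predict() for gbm objects: when n.trees is not supplied, predict.gbm() chooses a number of trees based on the fitted object and prints a message saying how many it used. To be explicit, the number of trees can be passed directly (preds_explicit is just an illustrative name; it should match my_preds_bt_best above):
# Explicitly request all 2107 trees from the fitted model
preds_explicit <- predict(gbm_best, my_pdxTrees_test, n.trees = 2107)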
And compare it on the test set against a less-tuned boosted tree, a random forest, a linear model, and a pruned tree:
library(randomForest)
library(rpart)
set.seed(10101)
boosted_tree <- gbm(Carbon_Sequestration_lb ~ ., my_pdxTrees_train,
                    distribution = "gaussian",
                    n.trees = 400,
                    interaction.depth = 3,
                    shrinkage = 0.1)
rfmodels200 <- randomForest(Carbon_Sequestration_lb ~ ., data = my_pdxTrees_train,
                            ntree = 200, mtry = 4, na.action = na.roughfix)
my_lm <- lm(Carbon_Sequestration_lb ~ ., data = my_pdxTrees_train)
my_tree <- rpart(Carbon_Sequestration_lb ~ ., data = my_pdxTrees_train,
                 control = rpart.control(cp = 0.005))
my_preds_rf <- predict(rfmodels200, my_pdxTrees_test)
my_preds_bt <- predict(boosted_tree, my_pdxTrees_test)
my_preds_lm <- predict(my_lm, my_pdxTrees_test)
my_preds_tr <- predict(my_tree, my_pdxTrees_test)
results <- data.frame(obs = my_pdxTrees_test$Carbon_Sequestration_lb,
                      random_forest = my_preds_rf,
                      boosted_tree_best = my_preds_bt_best,
                      boosted_tree = my_preds_bt,
                      linear_model = my_preds_lm,
                      pruned_tree = my_preds_tr) %>%
  pivot_longer(!obs, names_to = "model", values_to = "preds")
library(yardstick)
options(pillar.sigfig=5)
results %>% group_by(model) %>% rmse(truth = obs, estimate = preds) %>% arrange(.estimate)
## # A tibble: 5 × 4
## model .metric .estimator .estimate
## <chr> <chr> <chr> <dbl>
## 1 random_forest rmse standard 10.773
## 2 boosted_tree_best rmse standard 11.161
## 3 boosted_tree rmse standard 11.479
## 4 pruned_tree rmse standard 13.704
## 5 linear_model rmse standard 17.652
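As a sanity check on what yardstick::rmse() computes, the same numbers can be obtained directly from the definition RMSE = sqrt(mean((obs - preds)^2)); a quick sketch (output not shown, but it should reproduce the table above):
# RMSE "by hand": square the errors, average within each model, take the root
results %>%
  group_by(model) %>%
  summarize(rmse_by_hand = sqrt(mean((obs - preds)^2))) %>%
  arrange(rmse_by_hand)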