Load Data

As before, we use a subset of the PDX Trees data:

library(pdxTrees)
library(dplyr)
library(tidyr)
my_pdxTrees <- get_pdxTrees_parks(park = c("Berkeley Park", "Woodstock Park",
                                           "Westmoreland Park", "Mt Scott Park",
                                           "Powell Park", "Kennilworth Park",
                                           "Sellwood Park",
                                           "Crystal Springs Rhododendron Garden",
                                           "Laurelhurst Park")) %>%
  mutate_if(is.character, as.factor) %>%
  dplyr::select(DBH, Condition, Tree_Height, Crown_Width_NS, Crown_Width_EW,
                Crown_Base_Height, Functional_Type, Mature_Size,
                Carbon_Sequestration_lb) %>%
  drop_na()
names(my_pdxTrees)
## [1] "DBH"                     "Condition"              
## [3] "Tree_Height"             "Crown_Width_NS"         
## [5] "Crown_Width_EW"          "Crown_Base_Height"      
## [7] "Functional_Type"         "Mature_Size"            
## [9] "Carbon_Sequestration_lb"
dim(my_pdxTrees)
## [1] 3015    9
set.seed(1)
library(rsample)
my_pdxTrees_split <- initial_split(my_pdxTrees)
my_pdxTrees_train <- training(my_pdxTrees_split)
my_pdxTrees_test <- testing(my_pdxTrees_split)

General Strategy for Finding the Best Parameters

  1. Choose a relatively high initial learning rate. A rate of 0.1 is a reasonable starting point.

  2. Determine the optimal number of trees for this learning rate using cross-validation.

  3. Fix the tree-specific parameters and tune the learning rate, assessing both computation speed and model accuracy.

  4. Tune the tree-specific parameters for this fixed learning rate.

  5. Once the tree-specific parameters have been found, lower the learning rate and increase the number of trees to see whether accuracy improves.

Warning! This search can take considerable time (minutes to hours), depending on computing power, the number of variables in the model, and the number of observations. DO NOT ATTEMPT ON RSTUDIO SERVER!!

Tune Learning Rate

Now we tune the learning rate, considering a range of small and large values.

system.time() is used to assess whether certain models take longer to fit than others.

For each learning rate, we compute the RMSE (estimated by cross-validation), along with the number of trees that produces the minimal RMSE.

library(gbm)

learning_rate <- c(0.3, 0.1, 0.05, 0.01, 0.005, 0.001)

# Storage for the CV RMSE, best tree count, and fit time for each rate
rMSE <- rep(NA, 6)
n_trees <- rep(NA, 6)
Time <- rep(NA, 6)
set.seed(122)

for (i in 1:6) {
  train_time <- system.time({
    gbm_mod <- gbm(Carbon_Sequestration_lb ~ ., data = my_pdxTrees_train,
                   distribution = "gaussian",
                   n.trees = 5000,
                   interaction.depth = 3,
                   shrinkage = learning_rate[i],
                   cv.folds = 10,
                   n.cores = 8,
                   n.minobsinnode = 10)
  })
  rMSE[i] <- sqrt(min(gbm_mod$cv.error))     # best cross-validated RMSE
  n_trees[i] <- which.min(gbm_mod$cv.error)  # number of trees achieving it
  Time[i] <- train_time[["elapsed"]]         # elapsed fit time (seconds)
}

data.frame(learning_rate, n_trees, Time, rMSE) %>% arrange(rMSE) %>% head()
##   learning_rate n_trees   Time      rMSE
## 1         0.010    4689 12.370  9.718694
## 2         0.100     333 12.445  9.730369
## 3         0.050     970 12.251  9.759684
## 4         0.005    4501 12.543  9.995629
## 5         0.300     237 12.601 10.406424
## 6         0.001    5000 12.235 11.572594

(This took approximately 1 minute on my 2023 MacBook Pro.)

Cross-Validation along a Grid

Here we fix the shrinkage penalty (learning rate) chosen above and tune the tree-specific parameters over a grid; a sketch for actually running the search follows the helper function.

my_grid <- expand.grid(
  n.trees = c(5000),
  shrinkage = 0.01,
  interaction.depth = c(3,5,7),
  n.minobsinnode = c(5,10,15)
)
model_fit <- function(n.trees, shrinkage, interaction.depth, n.minobsinnode) {
  set.seed(40)  # same seed at every grid point, so the CV folds are comparable
  gbm_mod <- gbm(Carbon_Sequestration_lb ~ ., data = my_pdxTrees_train,
                 distribution = "gaussian",
                 n.trees = n.trees,
                 interaction.depth = interaction.depth,
                 shrinkage = shrinkage,
                 cv.folds = 10,
                 n.cores = 8,
                 n.minobsinnode = n.minobsinnode)
  sqrt(min(gbm_mod$cv.error))  # return the minimum CV RMSE
}
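The chunk above only defines the grid and the fitting function. To actually run the search, we can map model_fit over the rows of my_grid, a sketch using purrr (expect this to take a while, since it fits nine cross-validated models). The next section proceeds with the winning combination, interaction.depth = 7 and n.minobsinnode = 5.

library(purrr)
# Fit one cross-validated gbm per row of the grid and record its CV RMSE
my_grid$rMSE <- pmap_dbl(my_grid, model_fit)
my_grid %>% arrange(rMSE)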

Increase Trees

We take the top-performing parameters and increase the number of trees, to see whether any accuracy gains can be obtained.

set.seed(10101)
library(gbm)
cv_boosted_tree_2 <- gbm(Carbon_Sequestration_lb ~ ., data = my_pdxTrees_train,
                         distribution = "gaussian",
                         n.trees = 10000,
                         interaction.depth = 7,
                         n.minobsinnode = 5,
                         shrinkage = 0.01,
                         cv.folds = 10,
                         n.cores = 8)
my_errors2 <- cv_boosted_tree_2$cv.error
best_n2 <- which.min(cv_boosted_tree_2$cv.error)
data.frame(best_n2, cv_error = sqrt(my_errors2[best_n2]))
##   best_n2 cv_error
## 1    2107 8.956275
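gbm also provides gbm.perf(), which plots the training and cross-validation error curves and returns the iteration minimizing the CV error; it should agree with best_n2 above:

# Plot error curves and return the CV-optimal number of trees
gbm.perf(cv_boosted_tree_2, method = "cv")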

Lower Rate

We also see if we can realize any gains from decreasing the learning rate:

set.seed(10101)
cv_boosted_tree_3 <- gbm(Carbon_Sequestration_lb ~ ., data = my_pdxTrees_train,
                         distribution = "gaussian",
                         n.trees = 10000,
                         interaction.depth = 7,
                         n.minobsinnode = 5,
                         shrinkage = 0.005,
                         cv.folds = 10,
                         n.cores = 8)
my_errors3 <- cv_boosted_tree_3$cv.error
best_n3 <- which.min(cv_boosted_tree_3$cv.error)
data.frame(best_n3, cv_error = sqrt(my_errors3[best_n3]))
##   best_n3 cv_error
## 1    4292 8.978074
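To see the two learning rates side by side, we can overlay their CV RMSE curves (a quick base-R sketch):

plot(sqrt(my_errors2), type = "l",
     xlab = "Number of trees", ylab = "CV RMSE")
lines(sqrt(my_errors3), col = "blue")
legend("topright", legend = c("shrinkage = 0.01", "shrinkage = 0.005"),
       col = c("black", "blue"), lty = 1)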

Comparison

Now we fit the best gbm model, using the tuned parameters and the CV-selected number of trees:

set.seed(698)
gbm_best <- gbm(Carbon_Sequestration_lb ~ ., data = my_pdxTrees_train,
                distribution = "gaussian",
                n.trees = 2107,
                interaction.depth = 7,
                shrinkage = 0.01,
                n.minobsinnode = 5)

my_preds_bt_best <- predict(gbm_best, my_pdxTrees_test, n.trees = 2107)
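As a quick check, we can compute the test RMSE of this model directly; it should match the boosted_tree_best entry in the comparison table at the end:

# Root mean squared error on the test set
sqrt(mean((my_pdxTrees_test$Carbon_Sequestration_lb - my_preds_bt_best)^2))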

And compare it to a less heavily tuned boosted tree, a random forest, a linear model, and a pruned tree:

library(randomForest)
library(rpart)

set.seed(10101)

boosted_tree <- gbm(Carbon_Sequestration_lb ~ ., data = my_pdxTrees_train,
                    distribution = "gaussian",
                    n.trees = 400,
                    interaction.depth = 3,
                    shrinkage = 0.1)

rfmodels200 <- randomForest(Carbon_Sequestration_lb ~ ., data = my_pdxTrees_train,
                            ntree = 200, mtry = 4, na.action = na.roughfix)
my_lm <- lm(Carbon_Sequestration_lb ~ ., data = my_pdxTrees_train)
my_tree <- rpart(Carbon_Sequestration_lb ~ ., data = my_pdxTrees_train,
                 control = rpart.control(cp = 0.005))

my_preds_rf <- predict(rfmodels200, my_pdxTrees_test)
my_preds_bt <- predict(boosted_tree, my_pdxTrees_test, n.trees = 400)
my_preds_lm <- predict(my_lm, my_pdxTrees_test)
my_preds_tr <- predict(my_tree, my_pdxTrees_test)
results <- data.frame(obs = my_pdxTrees_test$Carbon_Sequestration_lb,
                      random_forest = my_preds_rf,
                      boosted_tree_best = my_preds_bt_best,
                      boosted_tree = my_preds_bt,
                      linear_model = my_preds_lm,
                      pruned_tree = my_preds_tr) %>%
  pivot_longer(!obs, names_to = "model", values_to = "preds")
library(yardstick)

options(pillar.sigfig=5)
results %>% group_by(model) %>% rmse(truth = obs, estimate = preds) %>% arrange(.estimate)
## # A tibble: 5 × 4
##   model             .metric .estimator .estimate
##   <chr>             <chr>   <chr>          <dbl>
## 1 random_forest     rmse    standard      10.773
## 2 boosted_tree_best rmse    standard      11.161
## 3 boosted_tree      rmse    standard      11.479
## 4 pruned_tree       rmse    standard      13.704
## 5 linear_model      rmse    standard      17.652