Load tidymodels Packages

library(tidymodels)

Load Data

Here, we have a subset of the Ames housing data:

library(dplyr)
house <- read.csv("data/house.csv") %>% relocate(SalePrice, Id)
names(house)
##  [1] "SalePrice"     "Id"            "Functional"    "BldgType"     
##  [5] "Foundation"    "LotShape"      "LandSlope"     "SaleCondition"
##  [9] "RoofMatl"      "ScreenPorch"   "MSSubClass"    "GarageCars"   
## [13] "BedroomAbvGr"  "TotalBsmtSF"   "LotArea"       "OpenPorchSF"  
## [17] "BsmtFullBath"  "WoodDeckSF"    "OverallCond"   "YrSold"       
## [21] "GrLivArea"     "MoSold"        "TotRmsAbvGrd"  "PoolArea"     
## [25] "YearBuilt"     "GarageArea"    "OverallQual"   "Fireplaces"   
## [29] "EnclosedPorch" "FullBath"      "HalfBath"
dim(house)
## [1] 200  31
library(rsample)
set.seed(325)
data_split <- initial_split(house, prop = 3/4, strata = SalePrice)
train_data <- training(data_split)
test_data <- testing(data_split)
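
Before building the recipe, it does not hurt to confirm the split sizes; with 200 rows and prop = 3/4 we expect roughly 150 training and 50 test observations, though stratification can shift the exact counts slightly (output not shown):

# Quick sanity check on the split sizes
dim(train_data)
dim(test_data)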

Create Pre-processing Recipe

house_rec <- recipe(SalePrice ~ ., data = train_data) %>% 
  update_role(Id, new_role = "ID") %>%                    # keep Id around, but not as a predictor
  step_log(LotArea, base = 10) %>%                        # log10-transform the skewed lot area
  step_mutate(TotalBath = FullBath + 0.5 * HalfBath) %>%  # combine bathroom counts
  step_rm(FullBath, HalfBath) %>% 
  step_novel(all_nominal()) %>%                           # handle factor levels unseen in training
  step_dummy(all_nominal(), -all_outcomes()) 
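
Before tuning, it can be worth prepping the recipe on the training data and baking it to confirm the steps behave as intended; a quick check (output not shown):

house_rec %>% 
  prep() %>% 
  bake(new_data = NULL) %>% 
  glimpse()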

Specify the Model

Here, we specify a boosted tree model and flag the hyperparameters we want to tune via cross-validation:

boosted_mod <- boost_tree(
  trees = tune(),       # number of boosting iterations
  tree_depth = tune(),  # maximum depth of each tree
  min_n = tune(),       # minimum data points in a node for it to be split further
  learn_rate = tune()   # shrinkage applied at each iteration
) %>% 
  set_engine("xgboost") %>% 
  set_mode("regression")

Specify Parameter Grid

The crossing() function from tidyr (attached with tidymodels) creates a regular grid from every combination of the supplied parameter values:

trees <- c(500, 1000, 2000)
tree_depth <- c(3, 5, 7)
min_n <- c(5, 10, 20)
learn_rate <- c(0.001, 0.005, 0.01, 0.1)

xgb_grid <- crossing(trees, tree_depth, min_n, learn_rate)
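
This gives 3 × 3 × 3 × 4 = 108 parameter combinations, which is why collect_metrics() later returns 216 rows (two metrics per combination):

nrow(xgb_grid)
## [1] 108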

Set Workflow

xgb_wf <- workflow() %>% 
  add_model(boosted_mod) %>% 
  add_recipe(house_rec)

Prepare Folds

set.seed(132)
my_folds <- vfold_cv(train_data, strata = SalePrice)
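
By default vfold_cv() creates 10 folds, which is where the n = 10 column in the resampled metrics below comes from; a quick check (output not shown):

nrow(my_folds)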

Tuning Time

set.seed(336)
my_time <- system.time(
  xgb_res <- tune_grid(
    xgb_wf,
    resamples = my_folds,
    grid = xgb_grid,
    control = control_grid(save_pred = TRUE)
  )
)
my_time["elapsed"]
## elapsed 
## 203.463

This took a little under three and a half minutes on my 2023 MacBook Pro.
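
If the grid grows, tune_grid() can process the resamples in parallel once a backend is registered; a minimal sketch, assuming the doParallel package is installed (not run here):

library(doParallel)
registerDoParallel(cores = parallel::detectCores() - 1)  # leave one core free
# Re-running tune_grid() now spreads the work across the registered workers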

Explore Results

collect_metrics(xgb_res)
## # A tibble: 216 × 10
##    trees min_n tree_depth learn_rate .metric .estimator       mean     n std_err
##    <dbl> <dbl>      <dbl>      <dbl> <chr>   <chr>           <dbl> <int>   <dbl>
##  1   500     5          3      0.001 rmse    standard   119896.       10 3.55e+3
##  2   500     5          3      0.001 rsq     standard        0.745    10 6.10e-2
##  3  1000     5          3      0.001 rmse    standard    78198.       10 3.01e+3
##  4  1000     5          3      0.001 rsq     standard        0.786    10 5.61e-2
##  5  2000     5          3      0.001 rmse    standard    40371.       10 2.47e+3
##  6  2000     5          3      0.001 rsq     standard        0.815    10 4.88e-2
##  7   500     5          3      0.005 rmse    standard    33410.       10 2.35e+3
##  8   500     5          3      0.005 rsq     standard        0.819    10 4.64e-2
##  9  1000     5          3      0.005 rmse    standard    27362.       10 2.21e+3
## 10  1000     5          3      0.005 rsq     standard        0.831    10 4.04e-2
## # ℹ 206 more rows
## # ℹ 1 more variable: .config <chr>
show_best(xgb_res, metric = "rmse")
## # A tibble: 5 × 10
##   trees min_n tree_depth learn_rate .metric .estimator   mean     n std_err
##   <dbl> <dbl>      <dbl>      <dbl> <chr>   <chr>       <dbl> <int>   <dbl>
## 1  2000     5          3      0.005 rmse    standard   26578.    10   2077.
## 2  1000     5          3      0.01  rmse    standard   26724.    10   2072.
## 3  2000    10          3      0.005 rmse    standard   26806.    10   2130.
## 4  1000    10          3      0.01  rmse    standard   26840.    10   2127.
## 5   500     5          3      0.1   rmse    standard   26936.    10   1968.
## # ℹ 1 more variable: .config <chr>
show_best(xgb_res, metric = "rsq")
## # A tibble: 5 × 10
##   trees min_n tree_depth learn_rate .metric .estimator  mean     n std_err
##   <dbl> <dbl>      <dbl>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
## 1  2000     5          3      0.005 rsq     standard   0.837    10  0.0393
## 2  1000     5          3      0.01  rsq     standard   0.834    10  0.0403
## 3   500     5          3      0.1   rsq     standard   0.833    10  0.0413
## 4  1000     5          3      0.1   rsq     standard   0.833    10  0.0414
## 5  2000     5          3      0.1   rsq     standard   0.833    10  0.0414
## # ℹ 1 more variable: .config <chr>

Visualize Parameters

Plotting the cross-validated metrics against the tuned parameters, with one panel per learning rate, makes the trends easier to spot:

xgb_res %>% collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  ggplot(aes(x = tree_depth, y = mean, shape = as.factor(trees), color = as.factor(min_n))) +
  geom_jitter(width = 0.35, size = 2, alpha = 0.9) + 
  facet_wrap(~ as.factor(learn_rate), scales = "free") +
  labs(title = "RMSE")

xgb_res %>% collect_metrics() %>% 
  filter(.metric == "rsq") %>% 
  ggplot(aes(x = tree_depth, y = mean, shape = as.factor(trees), color = as.factor(min_n))) +
  geom_jitter(width = 0.35, size = 2, alpha = 0.9) + 
  facet_wrap(~ as.factor(learn_rate), scales = "free") +
  labs(title = "RSQ")

Select Best Parameters

best_rmse <- select_best(xgb_res, metric = "rmse")
best_rmse
## # A tibble: 1 × 5
##   trees min_n tree_depth learn_rate .config               
##   <dbl> <dbl>      <dbl>      <dbl> <chr>                 
## 1  2000     5          3      0.005 Preprocessor1_Model006
best_rsq <- select_best(xgb_res, metric = "rsq")
best_rsq
## # A tibble: 1 × 5
##   trees min_n tree_depth learn_rate .config               
##   <dbl> <dbl>      <dbl>      <dbl> <chr>                 
## 1  2000     5          3      0.005 Preprocessor1_Model006
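
Both metrics point to the same configuration here. If we instead wanted the simplest model whose performance is within one standard error of the numerically best one, select_by_one_std_err() is an alternative; a sketch, sorting so that fewer and shallower trees count as simpler (output not shown):

select_by_one_std_err(xgb_res, trees, tree_depth, metric = "rmse")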

Finalize

finalize_workflow() plugs the selected parameters back into the workflow, and last_fit() then fits that workflow on the full training set and evaluates it on the held-out test set:

final_xgb_rmse <- finalize_workflow(xgb_wf, best_rmse)

final_xgb_rmse
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 5 Recipe Steps
## 
## • step_log()
## • step_mutate()
## • step_rm()
## • step_novel()
## • step_dummy()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   trees = 2000
##   min_n = 5
##   tree_depth = 3
##   learn_rate = 0.005
## 
## Computational engine: xgboost
final_xgb_rsq <- finalize_workflow(xgb_wf, best_rsq)

final_xgb_rsq
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 5 Recipe Steps
## 
## • step_log()
## • step_mutate()
## • step_rm()
## • step_novel()
## • step_dummy()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   trees = 2000
##   min_n = 5
##   tree_depth = 3
##   learn_rate = 0.005
## 
## Computational engine: xgboost
final_res_rmse <- last_fit(final_xgb_rmse, data_split)
collect_metrics(final_res_rmse)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard   27234.    Preprocessor1_Model1
## 2 rsq     standard       0.865 Preprocessor1_Model1
final_res_rsq <- last_fit(final_xgb_rsq, data_split)
collect_metrics(final_res_rsq)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard   27234.    Preprocessor1_Model1
## 2 rsq     standard       0.865 Preprocessor1_Model1
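
Because select_best() chose the same configuration for both metrics, the two final fits are identical. last_fit() also saves the test-set predictions, so we can finish with a predicted-versus-observed plot; a quick sketch:

final_res_rmse %>% 
  collect_predictions() %>% 
  ggplot(aes(x = SalePrice, y = .pred)) +
  geom_abline(linetype = "dashed") +
  geom_point(alpha = 0.6) +
  labs(x = "Observed SalePrice", y = "Predicted SalePrice")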