library(tidymodels)
Here, we have a subset of the Ames housing data:
library(dplyr)
house <- read.csv("data/house.csv") %>%
  relocate(SalePrice, Id)   # move the outcome and ID columns to the front
names(house)
## [1] "SalePrice" "Id" "Functional" "BldgType"
## [5] "Foundation" "LotShape" "LandSlope" "SaleCondition"
## [9] "RoofMatl" "ScreenPorch" "MSSubClass" "GarageCars"
## [13] "BedroomAbvGr" "TotalBsmtSF" "LotArea" "OpenPorchSF"
## [17] "BsmtFullBath" "WoodDeckSF" "OverallCond" "YrSold"
## [21] "GrLivArea" "MoSold" "TotRmsAbvGrd" "PoolArea"
## [25] "YearBuilt" "GarageArea" "OverallQual" "Fireplaces"
## [29] "EnclosedPorch" "FullBath" "HalfBath"
dim(house)
## [1] 200 31
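Before splitting, it can help to skim the column types with dplyr's glimpse() (output not shown):
glimpse(house)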
library(rsample)
set.seed(325)
data_split <- initial_split(house, prop = 3/4, strata = SalePrice)
train_data <- training(data_split)
test_data <- testing(data_split)
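As a quick sanity check on the split sizes: with prop = 3/4 on 200 rows, we expect roughly 150 training and 50 test observations (exact counts depend on the stratification):
nrow(train_data)   # roughly 150
nrow(test_data)    # roughly 50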
house_rec <- recipe(SalePrice ~ ., data = train_data) %>%
  update_role(Id, new_role = "ID") %>%                 # keep Id for bookkeeping, not as a predictor
  step_log(LotArea, base = 10) %>%                     # log10-transform the skewed lot area
  step_mutate(TotalBath = FullBath + 0.5 * HalfBath) %>%
  step_rm(FullBath, HalfBath) %>%
  step_novel(all_nominal()) %>%                        # guard against factor levels unseen in training
  step_dummy(all_nominal(), -all_outcomes())           # dummy-encode the categorical predictors
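To verify the preprocessing does what we expect before tuning, we can prep the recipe on the training data and bake a few rows (output not shown):
house_rec %>%
  prep() %>%                          # estimate the recipe steps on the training data
  bake(new_data = head(train_data))   # apply them to a few rows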
Next, we specify a boosted tree model, marking the parameters we want to tune via cross-validation with tune():
boosted_mod <- boost_tree(
  trees = tune(),
  tree_depth = tune(),
  min_n = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")
The crossing() function from tidyr creates a regular grid containing every combination of the supplied parameter values:
trees <- c(500, 1000, 2000)
tree_depth <- c(3, 5, 7)
min_n <- c(5, 10, 20)
learn_rate <- c(0.001, 0.005, 0.01, 0.1)
xgb_grid <- crossing(trees, tree_depth, min_n, learn_rate)
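Because crossing() forms all combinations, the grid contains 3 × 3 × 3 × 4 = 108 candidate models:
nrow(xgb_grid)   # 3 * 3 * 3 * 4 = 108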
Next, we bundle the recipe and the model specification into a workflow:
xgb_wf <- workflow() %>%
  add_model(boosted_mod) %>%
  add_recipe(house_rec)
We evaluate each candidate with cross-validation, stratified on the outcome:
set.seed(132)
my_folds <- vfold_cv(train_data, strata = SalePrice)   # v = 10 folds by default
set.seed(336)
my_time <- system.time(
  xgb_res <- tune_grid(
    xgb_wf,
    resamples = my_folds,
    grid = xgb_grid,
    control = control_grid(save_pred = TRUE)   # keep the out-of-fold predictions
  )
)
my_time["elapsed"]
## elapsed
## 203.463
This took about three and a half minutes on my 2023 MacBook Pro.
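With 108 candidates evaluated on 10 folds, that elapsed time covers 1,080 model fits, so it is worth knowing that tune_grid() can use a parallel backend. A minimal sketch with doParallel (assuming that package is installed); register the cluster before calling tune_grid():
library(doParallel)
cl <- makePSOCKcluster(parallel::detectCores() - 1)   # leave one core free
registerDoParallel(cl)
# ... re-run tune_grid() as above ...
stopCluster(cl)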
Each of the 108 candidates is summarized with two metrics (RMSE and R²), giving 216 rows:
collect_metrics(xgb_res)
## # A tibble: 216 × 10
## trees min_n tree_depth learn_rate .metric .estimator mean n std_err
## <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 500 5 3 0.001 rmse standard 119896. 10 3.55e+3
## 2 500 5 3 0.001 rsq standard 0.745 10 6.10e-2
## 3 1000 5 3 0.001 rmse standard 78198. 10 3.01e+3
## 4 1000 5 3 0.001 rsq standard 0.786 10 5.61e-2
## 5 2000 5 3 0.001 rmse standard 40371. 10 2.47e+3
## 6 2000 5 3 0.001 rsq standard 0.815 10 4.88e-2
## 7 500 5 3 0.005 rmse standard 33410. 10 2.35e+3
## 8 500 5 3 0.005 rsq standard 0.819 10 4.64e-2
## 9 1000 5 3 0.005 rmse standard 27362. 10 2.21e+3
## 10 1000 5 3 0.005 rsq standard 0.831 10 4.04e-2
## # ℹ 206 more rows
## # ℹ 1 more variable: .config <chr>
show_best() lists the top five candidates for a given metric:
show_best(xgb_res, metric = "rmse")
## # A tibble: 5 × 10
## trees min_n tree_depth learn_rate .metric .estimator mean n std_err
## <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 2000 5 3 0.005 rmse standard 26578. 10 2077.
## 2 1000 5 3 0.01 rmse standard 26724. 10 2072.
## 3 2000 10 3 0.005 rmse standard 26806. 10 2130.
## 4 1000 10 3 0.01 rmse standard 26840. 10 2127.
## 5 500 5 3 0.1 rmse standard 26936. 10 1968.
## # ℹ 1 more variable: .config <chr>
show_best(xgb_res, metric = "rsq")
## # A tibble: 5 × 10
## trees min_n tree_depth learn_rate .metric .estimator mean n std_err
## <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl>
## 1 2000 5 3 0.005 rsq standard 0.837 10 0.0393
## 2 1000 5 3 0.01 rsq standard 0.834 10 0.0403
## 3 500 5 3 0.1 rsq standard 0.833 10 0.0413
## 4 1000 5 3 0.1 rsq standard 0.833 10 0.0414
## 5 2000 5 3 0.1 rsq standard 0.833 10 0.0414
## # ℹ 1 more variable: .config <chr>
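The top candidates sit within about one standard error of each other. If we prefer a less complex model, tune's select_by_one_std_err() chooses the simplest candidate within one standard error of the numerically best one; here we rank simplicity by the number of trees (output not shown):
select_by_one_std_err(xgb_res, trees, metric = "rmse")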
We can visualize how RMSE and R² vary across the grid:
xgb_res %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  ggplot(aes(x = tree_depth, y = mean,
             shape = as.factor(trees), color = as.factor(min_n))) +
  geom_jitter(width = 0.35, size = 2, alpha = 0.9) +
  facet_wrap(~ as.factor(learn_rate), scales = "free") +
  labs(title = "RMSE")
xgb_res %>%
  collect_metrics() %>%
  filter(.metric == "rsq") %>%
  ggplot(aes(x = tree_depth, y = mean,
             shape = as.factor(trees), color = as.factor(min_n))) +
  geom_jitter(width = 0.35, size = 2, alpha = 0.9) +
  facet_wrap(~ as.factor(learn_rate), scales = "free") +
  labs(title = "RSQ")
select_best() returns the single best candidate for each metric:
best_rmse <- select_best(xgb_res, metric = "rmse")
best_rmse
## # A tibble: 1 × 5
## trees min_n tree_depth learn_rate .config
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2000 5 3 0.005 Preprocessor1_Model006
best_rsq <- select_best(xgb_res, metric = "rsq")
best_rsq
## # A tibble: 1 × 5
## trees min_n tree_depth learn_rate .config
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2000 5 3 0.005 Preprocessor1_Model006
Both metrics point to the same candidate (Preprocessor1_Model006), so finalizing the workflow with either selection yields the same model:
final_xgb_rmse <- finalize_workflow(xgb_wf, best_rmse)
final_xgb_rmse
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 5 Recipe Steps
##
## • step_log()
## • step_mutate()
## • step_rm()
## • step_novel()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
##
## Main Arguments:
## trees = 2000
## min_n = 5
## tree_depth = 3
## learn_rate = 0.005
##
## Computational engine: xgboost
final_xgb_rsq <- finalize_workflow(xgb_wf, best_rsq)
As expected, this workflow is identical to final_xgb_rmse, so its printout is omitted.
Finally, last_fit() fits the finalized workflow on the full training set and evaluates it once on the held-out test set:
final_res_rmse <- last_fit(final_xgb_rmse, data_split)
collect_metrics(final_res_rmse)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 27234. Preprocessor1_Model1
## 2 rsq standard 0.865 Preprocessor1_Model1
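Because last_fit() retains the test-set predictions, we can also plot predicted versus observed sale prices:
final_res_rmse %>%
  collect_predictions() %>%
  ggplot(aes(x = SalePrice, y = .pred)) +
  geom_abline(lty = 2) +            # perfect-prediction reference line
  geom_point(alpha = 0.6) +
  labs(x = "Observed sale price", y = "Predicted sale price")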
Because the two finalized workflows are identical, the test-set results match exactly:
final_res_rsq <- last_fit(final_xgb_rsq, data_split)
collect_metrics(final_res_rsq)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 27234. Preprocessor1_Model1
## 2 rsq standard 0.865 Preprocessor1_Model1
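The test-set RMSE (about 27,000) and R² (0.865) are consistent with the resampling estimates, so the tuning process does not appear to have overfit. To reuse the model, we can extract the fitted workflow from the last_fit() result and, assuming the vip package is installed, inspect which predictors the boosted model relies on:
final_fit <- extract_workflow(final_res_rmse)   # workflow fitted on the full training set

library(vip)                                    # assumes vip is installed
final_res_rmse %>%
  extract_fit_parsnip() %>%
  vip(num_features = 10)                        # top 10 predictors by xgboost importance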