This vignette demonstrates how to use the DALEX
package with models created with the xgboost package.
In this example we are going to use the wine
dataset from the breakDown
package. The wine quality will be predicted based on other features.
# breakDown ships the `wine` dataset used throughout this example.
library("breakDown")
# Preview the data: eleven numeric predictors plus the `quality` target.
head(wine)
#> fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
#> 1 7.0 0.27 0.36 20.7 0.045
#> 2 6.3 0.30 0.34 1.6 0.049
#> 3 8.1 0.28 0.40 6.9 0.050
#> 4 7.2 0.23 0.32 8.5 0.058
#> 5 7.2 0.23 0.32 8.5 0.058
#> 6 8.1 0.28 0.40 6.9 0.050
#> free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
#> 1 45 170 1.0010 3.00 0.45 8.8
#> 2 14 132 0.9940 3.30 0.49 9.5
#> 3 30 97 0.9951 3.26 0.44 10.1
#> 4 47 186 0.9956 3.19 0.40 9.9
#> 5 47 186 0.9956 3.19 0.40 9.9
#> 6 30 97 0.9951 3.26 0.44 10.1
#> quality
#> 1 6
#> 2 6
#> 3 6
#> 4 6
#> 5 6
#> 6 6
Model building
Let’s build a model. We need to prepare xgb.DMatrix
first.
library("xgboost")
# Encode all predictors into a numeric matrix; `- 1` drops the intercept
# column. xgboost consumes numeric matrices, not data frames.
# NOTE(review): "martix" is a typo, but the name is reused verbatim in the
# later chunks, so it is kept as-is here.
model_martix_train <- model.matrix(quality ~ . - 1, wine)
# Wrap features and label into xgboost's native data structure.
data_train <- xgb.DMatrix(model_martix_train, label = wine$quality)
# NOTE(review): `silent` is deprecated in recent xgboost (use `verbosity`),
# and "reg:linear" has been renamed "reg:squarederror"; both are kept here
# to match the recorded output below.
param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
objective = "reg:linear")
# Train a shallow (max_depth = 2) booster for 50 rounds.
wine_xgb_model <- xgb.train(param, data_train, nrounds = 50)
wine_xgb_model
#> ##### xgb.Booster
#> raw: 20.1 Kb
#> call:
#> xgb.train(params = param, data = data_train, nrounds = 50)
#> params (as set within xgb.train):
#> max_depth = "2", eta = "1", silent = "1", nthread = "2", objective = "reg:linear", silent = "1"
#> xgb.attributes:
#> niter
#> callbacks:
#> cb.print.evaluation(period = print_every_n)
#> # of features: 11
#> niter: 50
#> nfeatures : 11
Explainer
Now we can create an explainer.
library("DALEX")
# Wrap the fitted booster in a DALEX explainer. `data` must be the same
# numeric model matrix the model was trained on, and `y` the true target,
# so residuals can be computed. `colorize = FALSE` disables ANSI colors in
# the progress messages (friendlier for rendered documents).
explainer_xgb <- explain(wine_xgb_model,
data = model_martix_train,
y = wine$quality,
label = "xgboost",
colorize = FALSE)
#> Preparation of a new explainer is initiated
#> -> model label : xgboost
#> -> data : 4898 rows 11 cols
#> -> target variable : 4898 values
#> -> predict function : yhat.default will be used ( default )
#> -> predicted values : numerical, min = 2.869188 , mean = 5.878132 , max = 8.078749
#> -> model_info : package Model of class: xgb.Booster package unrecognized , ver. Unknown , task regression ( default )
#> -> residual function : difference between y and yhat ( default )
#> -> residuals : numerical, min = -3.251447 , mean = -0.0002230403 , max = 3.005342
#> A new explainer has been created!
explainer_xgb
#> Model label: xgboost
#> Model class: xgb.Booster
#> Data head :
#> fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
#> 1 7.0 0.27 0.36 20.7 0.045
#> 2 6.3 0.30 0.34 1.6 0.049
#> free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
#> 1 45 170 1.001 3.0 0.45 8.8
#> 2 14 132 0.994 3.3 0.49 9.5
Single variable
For a continuous variable:
# Partial-dependence profile for the continuous variable "alcohol".
# Renamed from `sv_xgb_satisfaction_level` — that name was a copy-paste
# leftover from the HR example below and did not match the variable
# actually profiled here. The object is only used within this chunk.
sv_xgb_alcohol <- model_profile(explainer_xgb,
                                variable = "alcohol",
                                type = "partial")
plot(sv_xgb_alcohol)
Single prediction
# Take the first training row as a 1 x p matrix; `drop = FALSE` prevents
# the single row from collapsing to a plain vector.
nobs <- model_martix_train[1, , drop = FALSE]
# Break-down decomposition of this one prediction into additive
# per-variable contributions.
sp_xgb <- predict_parts(explainer_xgb,
new_observation = nobs,
type = "break_down")
head(sp_xgb)
#> contribution
#> xgboost: intercept 5.878
#> xgboost: residual.sugar = 20.7 0.332
#> xgboost: alcohol = 8.8 -0.045
#> xgboost: density = 1.001 -0.429
#> xgboost: volatile.acidity = 0.27 -0.297
#> xgboost: free.sulfur.dioxide = 45 -0.040
plot(sp_xgb)
Variable importance
# Variable importance; the output reports mean dropout loss per variable
# (`type = "raw"` — presumably untransformed losses; see ?model_parts).
vd_xgb <- model_parts(explainer_xgb, type = "raw")
head(vd_xgb)
#> variable mean_dropout_loss label
#> 1 _full_model_ 0.6295067 xgboost
#> 2 fixed.acidity 0.6391484 xgboost
#> 3 sulphates 0.6471640 xgboost
#> 4 citric.acid 0.6538835 xgboost
#> 5 total.sulfur.dioxide 0.6552513 xgboost
#> 6 chlorides 0.6691735 xgboost
plot(vd_xgb)
Classification
In this example we are going to use the HR_data
dataset from the breakDown
package. The model will predict the probability that someone will leave the company.
# breakDown also ships the HR_data dataset (already attached above, but
# the call is repeated so this chunk is self-contained).
library("breakDown")
# Preview: numeric features plus factors `sales` and `salary`, and the
# binary `left` target.
head(HR_data)
#> satisfaction_level last_evaluation number_project average_montly_hours
#> 1 0.38 0.53 2 157
#> 2 0.80 0.86 5 262
#> 3 0.11 0.88 7 272
#> 4 0.72 0.87 5 223
#> 5 0.37 0.52 2 159
#> 6 0.41 0.50 2 153
#> time_spend_company Work_accident left promotion_last_5years sales salary
#> 1 3 0 1 0 sales low
#> 2 6 0 1 0 sales medium
#> 3 4 0 1 0 sales medium
#> 4 5 0 1 0 sales low
#> 5 3 0 1 0 sales low
#> 6 3 0 1 0 sales low
Model building
Let’s build a model. We need to prepare xgb.DMatrix
first.
library("xgboost")
# Dummy-code all predictors (`sales` and `salary` expand to indicator
# columns, giving 19 features in total); `- 1` drops the intercept.
# Note this reassigns `model_martix_train`, replacing the wine matrix.
model_martix_train <- model.matrix(left ~ . - 1, HR_data)
data_train <- xgb.DMatrix(model_martix_train, label = HR_data$left)
# Binary classification objective, evaluated by AUC.
# NOTE(review): `silent` is deprecated in recent xgboost (use `verbosity`);
# kept to match the recorded output below.
param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
objective = "binary:logistic", eval_metric = "auc")
HR_xgb_model <- xgb.train(param, data_train, nrounds = 50)
HR_xgb_model
#> ##### xgb.Booster
#> raw: 19.5 Kb
#> call:
#> xgb.train(params = param, data = data_train, nrounds = 50)
#> params (as set within xgb.train):
#> max_depth = "2", eta = "1", silent = "1", nthread = "2", objective = "binary:logistic", eval_metric = "auc", silent = "1"
#> xgb.attributes:
#> niter
#> callbacks:
#> cb.print.evaluation(period = print_every_n)
#> # of features: 19
#> niter: 50
#> nfeatures : 19
Explainer
Now we can create an explainer.
library("DALEX")
# Custom predict function for the explainer.
#
# With objective = "binary:logistic", predict() on an xgb.Booster already
# returns probabilities on the response scale, so they are passed through
# unchanged. The previous version applied the logistic function a second
# time, which squeezed every prediction into [plogis(0), plogis(1)] =
# [0.5, 0.7310584] -- exactly the range recorded in the explainer output
# below (min = 0.5, max = 0.7310584).
predict_logit <- function(model, x) {
  predict(model, x)
}

# NOTE(review): despite its name, this computes the *inverse* logit
# (sigmoid), not the logit. The name is kept unchanged because it is
# referenced as `link = logit` in the explain() call below.
logit <- function(x) exp(x)/(1+exp(x))
# Explainer for the classification model. Unlike the regression example,
# a custom predict_function and link are supplied explicitly here.
# NOTE(review): the recorded predicted values span exactly
# [0.5, 0.7310584] = plogis(0) .. plogis(1), which suggests the supplied
# predict_function applies the sigmoid to values that are already
# probabilities (the model uses objective = "binary:logistic") -- verify.
explainer_xgb <- explain(HR_xgb_model,
data = model_martix_train,
y = HR_data$left,
predict_function = predict_logit,
link = logit,
label = "xgboost",
colorize = FALSE)
#> Preparation of a new explainer is initiated
#> -> model label : xgboost
#> -> data : 14999 rows 19 cols
#> -> target variable : 14999 values
#> -> predict function : predict_logit
#> -> predicted values : numerical, min = 0.5 , mean = 0.5555972 , max = 0.7310584
#> -> model_info : package Model of class: xgb.Booster package unrecognized , ver. Unknown , task regression ( default )
#> -> residual function : difference between y and yhat ( default )
#> -> residuals : numerical, min = -0.7296657 , mean = -0.3175147 , max = 0.4997965
#> A new explainer has been created!
explainer_xgb
#> Model label: xgboost
#> Model class: xgb.Booster
#> Data head :
#> satisfaction_level last_evaluation number_project average_montly_hours
#> 1 0.38 0.53 2 157
#> 2 0.80 0.86 5 262
#> time_spend_company Work_accident promotion_last_5years salesaccounting
#> 1 3 0 0 0
#> 2 6 0 0 0
#> saleshr salesIT salesmanagement salesmarketing salesproduct_mng salesRandD
#> 1 0 0 0 0 0 0
#> 2 0 0 0 0 0 0
#> salessales salessupport salestechnical salarylow salarymedium
#> 1 1 0 0 1 0
#> 2 1 0 0 0 1
Single variable
For a continuous variable:
# Partial-dependence profile for the continuous variable
# "satisfaction_level".
sv_xgb_satisfaction_level <- model_profile(explainer_xgb,
variable = "satisfaction_level",
type = "partial")
plot(sv_xgb_satisfaction_level)
Single prediction
# First observation as a 1 x p matrix; `drop = FALSE` keeps the matrix
# shape instead of collapsing to a vector.
nobs <- model_martix_train[1, , drop = FALSE]
# Break-down decomposition of this single prediction.
sp_xgb <- predict_parts(explainer_xgb,
new_observation = nobs,
type = "break_down")
head(sp_xgb)
#> contribution
#> xgboost: intercept 0.556
#> xgboost: time_spend_company = 3 -0.013
#> xgboost: satisfaction_level = 0.38 0.012
#> xgboost: last_evaluation = 0.53 0.020
#> xgboost: average_montly_hours = 157 0.061
#> xgboost: salarylow = 1 0.019
plot(sp_xgb)
Variable importance
# Variable importance for the classification model; reported as mean
# dropout loss per variable (`type = "raw"` -- presumably untransformed
# losses; see ?model_parts).
vd_xgb <- model_parts(explainer_xgb, type = "raw")
head(vd_xgb)
#> variable mean_dropout_loss label
#> 1 _full_model_ 0.4641699 xgboost
#> 2 salarymedium 0.4640913 xgboost
#> 3 salesaccounting 0.4641699 xgboost
#> 4 salesIT 0.4641699 xgboost
#> 5 salesmanagement 0.4641699 xgboost
#> 6 salesmarketing 0.4641699 xgboost
plot(vd_xgb)
Session info
# Record the R and package versions used to render this document.
sessionInfo()
#> R version 3.6.3 (2020-02-29)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 18363)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=Polish_Poland.1250 LC_CTYPE=Polish_Poland.1250
#> [3] LC_MONETARY=Polish_Poland.1250 LC_NUMERIC=C
#> [5] LC_TIME=Polish_Poland.1250
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] DALEX_2.0.1 xgboost_1.0.0.2 breakDown_0.2.0
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.4 pillar_1.4.3 compiler_3.6.3 ingredients_2.0
#> [5] tools_3.6.3 digest_0.6.25 evaluate_0.14 lifecycle_0.2.0
#> [9] tibble_2.1.3 gtable_0.3.0 lattice_0.20-38 pkgconfig_2.0.3
#> [13] rlang_0.4.6 Matrix_1.2-18 yaml_2.2.1 xfun_0.12
#> [17] stringr_1.4.0 dplyr_1.0.0 knitr_1.28 generics_0.0.2
#> [21] vctrs_0.3.1 grid_3.6.3 tidyselect_1.1.0 glue_1.3.2
#> [25] data.table_1.12.8 R6_2.4.1 iBreakDown_1.3.1 rmarkdown_2.1
#> [29] farver_2.0.3 ggplot2_3.3.0 purrr_0.3.3 magrittr_1.5
#> [33] scales_1.1.0 htmltools_0.4.0 colorspace_1.4-1 labeling_0.3
#> [37] stringi_1.4.6 munsell_0.5.0 crayon_1.3.4
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4