Extreme Gradient Boosting in R
Premium
Extreme Gradient Boosting has a very efficient implementation. Unlike SVM and Random Forest, we can tune its parameters on the whole down-sampled set. We focus on varying the Ridge and Lasso regularization terms and the learning rate, and we set aside 10% of the data for validating the tuning parameters.
set.seed(400)

# Down-sample again so that we get more information when stacking
samp <- downSample(
  data_train[-getIndexsOfColumns(data_train, c("loan_status"))],
  data_train$loan_status,
  yname = "loan_status"
)

# Hold out a small slice (10%) of the down-sampled data for validating
# the tuning parameters; the remaining 90% is used for training.
train_index_tuning <- createDataPartition(
  samp$loan_status,
  p = 0.1,
  list = FALSE,
  times = 1
)
6
# Tuning grid: learning rate (eta), Lasso (alpha) and Ridge (lambda) penalties
etas <- c(0.1, 0.3)
alphas <- c(0, 0.5, 1)
lambdas <- c(0, 0.5, 1)

# Install only when missing. The original line unconditionally re-installed
# on every run and ended with a non-ASCII closing quote ("xgboost”), which
# is a syntax error in R.
if (!requireNamespace("xgboost", quietly = TRUE)) {
  install.packages("xgboost")
}
library(xgboost)

# The 10% hold-out, wrapped as a DMatrix watchlist so xgb.train reports
# validation AUC after every boosting round.
test_watchlist <- list(
  test = xgb.DMatrix(
    data = as.matrix(samp[train_index_tuning, ][getNumericColumns(samp)]),
    label = as.numeric(samp[train_index_tuning, "loan_status"]) - 1
  )
)

# One row of (eta, alpha, lambda, auc) per grid point
gbm_perf <- data.frame(
  eta = numeric(0),
  alpha = numeric(0),
  lambda = numeric(0),
  auc = numeric(0)
)

for (eta in etas) {
  for (alpha in alphas) {
    for (lambda in lambdas) {
      model <- xgb.train(
        data = xgb.DMatrix(
          data = as.matrix(samp[-train_index_tuning, ][getNumericColumns(samp)]),
          label = as.numeric(samp[-train_index_tuning, "loan_status"]) - 1
        ),
        objective = "binary:logistic",
        nrounds = 350,
        watchlist = test_watchlist,
        eval_metric = "auc",
        # The original spelled this "early.stop.rounds" (extra "s"), an
        # unknown argument that xgb.train silently ignores — so early
        # stopping never actually triggered during tuning.
        early_stopping_rounds = 10,
        alpha = alpha,
        lambda = lambda,
        eta = eta
      )
      # "bestScore" is not an element of the booster; the stored attribute
      # is "best_score" (it appears under xgb.attributes when the model is
      # printed), so the original recorded NULL/NA for every grid point.
      gbm_perf[nrow(gbm_perf) + 1, ] <- c(eta, alpha, lambda, model$best_score)
    }
  }
}

# Rank grid points by validation AUC, best first
gbm_perf %>% arrange(desc(auc))
38
The best tuning parameters are eta = 0.1, alpha = 0.5, and lambda = 1.0. We retrain the model with these values here, in case readers did not run the tuning code, and collect its performance.
set.seed(400)

# Validation slice (the 10% hold-out) wrapped as a DMatrix watchlist so
# training reports test AUC each round and can stop early.
tuning_features <- as.matrix(samp[train_index_tuning, ][getNumericColumns(samp)])
tuning_labels <- as.numeric(samp[train_index_tuning, "loan_status"]) - 1
test_watchlist <- list(
  test = xgb.DMatrix(data = tuning_features, label = tuning_labels)
)

# Training slice: every down-sampled row outside the validation hold-out.
train_features <- as.matrix(samp[-train_index_tuning, ][getNumericColumns(samp)])
train_labels <- as.numeric(samp[-train_index_tuning, "loan_status"]) - 1

# Refit with the winning grid point: eta = 0.1, alpha = 0.5, lambda = 1.0
xgb_model <- xgb.train(
  data = xgb.DMatrix(data = train_features, label = train_labels),
  objective = "binary:logistic",
  nrounds = 350,
  watchlist = test_watchlist,
  eval_metric = "auc",
  early.stop.round = 10,
  alpha = 0.5,
  lambda = 1.0,
  eta = 0.1
)
22
1[1] test-auc:0.656015
2Will train until test_auc hasn't improved in 10 rounds.
3
4[2] test-auc:0.672381
5[3] test-auc:0.679274
6[4] test-auc:0.681095
7[5] test-auc:0.689003
8[6] test-auc:0.690731
9[7] test-auc:0.692656
10[8] test-auc:0.693467
11[9] test-auc:0.695073
12[10] test-auc:0.695977
13[11] test-auc:0.697777
14[12] test-auc:0.698707
15[13] test-auc:0.698882
16[14] test-auc:0.701135
17[15] test-auc:0.702984
18[16] test-auc:0.704485
19[17] test-auc:0.705495
20[18] test-auc:0.706659
21[19] test-auc:0.707099
22[20] test-auc:0.708352
23[21] test-auc:0.708617
24[22] test-auc:0.709497
25[23] test-auc:0.710382
26[24] test-auc:0.710718
27[25] test-auc:0.711810
28[26] test-auc:0.712518
29[27] test-auc:0.712553
30[28] test-auc:0.713433
31[29] test-auc:0.713817
32[30] test-auc:0.713927
33[31] test-auc:0.714554
34[32] test-auc:0.714980
35[33] test-auc:0.715179
36[34] test-auc:0.715924
37[35] test-auc:0.716299
38[36] test-auc:0.716700
39[37] test-auc:0.716733
40[38] test-auc:0.717246
41[39] test-auc:0.717266
42[40] test-auc:0.717660
43[41] test-auc:0.718041
44[42] test-auc:0.717934
45[43] test-auc:0.718888
46[44] test-auc:0.719207
47[45] test-auc:0.719219
48[46] test-auc:0.719322
49[47] test-auc:0.719694
50[48] test-auc:0.719976
51[49] test-auc:0.720586
52[50] test-auc:0.721058
53[51] test-auc:0.721292
54[52] test-auc:0.721402
55[53] test-auc:0.721236
56[54] test-auc:0.721692
57[55] test-auc:0.721584
58[56] test-auc:0.721585
59[57] test-auc:0.721289
60[58] test-auc:0.721552
61[59] test-auc:0.721499
62[60] test-auc:0.721309
63[61] test-auc:0.721516
64[62] test-auc:0.721776
65[63] test-auc:0.721996
66[64] test-auc:0.721674
67[65] test-auc:0.722132
68[66] test-auc:0.722179
69[67] test-auc:0.721972
70[68] test-auc:0.721933
71[69] test-auc:0.722067
72[70] test-auc:0.722049
73[71] test-auc:0.722391
74[72] test-auc:0.722212
75[73] test-auc:0.722595
76[74] test-auc:0.722503
77[75] test-auc:0.722345
78[76] test-auc:0.722503
79[77] test-auc:0.722589
80[78] test-auc:0.722731
81[79] test-auc:0.722534
82[80] test-auc:0.722708
83[81] test-auc:0.722812
84[82] test-auc:0.722784
85[83] test-auc:0.723001
86[84] test-auc:0.723074
87[85] test-auc:0.722978
88[86] test-auc:0.722794
89[87] test-auc:0.723016
90[88] test-auc:0.723130
91[89] test-auc:0.723050
92[90] test-auc:0.722762
93[91] test-auc:0.722684
94[92] test-auc:0.723375
95[93] test-auc:0.723760
96[94] test-auc:0.723965
97[95] test-auc:0.724011
98[96] test-auc:0.724273
99[97] test-auc:0.724057
100[98] test-auc:0.723959
101[99] test-auc:0.724219
102[100] test-auc:0.724262
103[101] test-auc:0.724202
104[102] test-auc:0.724573
105[103] test-auc:0.724432
106[104] test-auc:0.724790
107[105] test-auc:0.724792
108[106] test-auc:0.725028
109[107] test-auc:0.725214
110[108] test-auc:0.725203
111[109] test-auc:0.725292
112[110] test-auc:0.725163
113[111] test-auc:0.725213
114[112] test-auc:0.725229
115[113] test-auc:0.724999
116[114] test-auc:0.725181
117[115] test-auc:0.725151
118[116] test-auc:0.725140
119[117] test-auc:0.725079
120[118] test-auc:0.725306
121[119] test-auc:0.725115
122[120] test-auc:0.725141
123[121] test-auc:0.725279
124[122] test-auc:0.725290
125[123] test-auc:0.725333
126[124] test-auc:0.725351
127[125] test-auc:0.725092
128[126] test-auc:0.725250
129[127] test-auc:0.725163
130[128] test-auc:0.725266
131[129] test-auc:0.725420
132[130] test-auc:0.725525
133[131] test-auc:0.725444
134[132] test-auc:0.725441
135[133] test-auc:0.725484
136[134] test-auc:0.725423
137[135] test-auc:0.725419
138[136] test-auc:0.725665
139[137] test-auc:0.725749
140[138] test-auc:0.725820
141[139] test-auc:0.725869
142[140] test-auc:0.725578
143[141] test-auc:0.725665
144[142] test-auc:0.725927
145[143] test-auc:0.725749
146[144] test-auc:0.725693
147[145] test-auc:0.725440
148[146] test-auc:0.725374
149[147] test-auc:0.725685
150[148] test-auc:0.725883
151[149] test-auc:0.725466
152[150] test-auc:0.725497
153[151] test-auc:0.725454
154[152] test-auc:0.725437
155Stopping. Best iteration:
156[142] test-auc:0.725927
157
1> xgb_model
2##### xgb.Booster
3raw: 480.9 Kb
4call:
5 xgb.train(data = xgb.DMatrix(data = as.matrix(samp[-train_index_tuning,
6 ][getNumericColumns(samp)]), label = as.numeric(samp[-train_index_tuning,
7 "loan_status"]) - 1), nrounds = 350, watchlist = test_watchlist,
8 objective = "binary:logistic", eval_metric = "auc", early.stop.round = 10,
9 alpha = 0.5, lambda = 1, eta = 0.1)
10params (as set within xgb.train):
11 objective = "binary:logistic", eval_metric = "auc", early_stop_round = "10", alpha = "0.5", lambda = "1", eta = "0.1", silent = "1"
12xgb.attributes:
13 best_iteration, best_msg, best_ntreelimit, best_score, niter
14callbacks:
15 cb.print.evaluation(period = print_every_n)
16 cb.evaluation.log()
17 cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize,
18 verbose = verbose)
19niter: 152
20best_iteration: 142
21best_ntreelimit: 142
22best_score: 0.725927
23evaluation_log:
24 iter test_auc
25 1 0.656015
26 2 0.672381
27---
28 151 0.725454
29 152 0.725437
30>
31
# Score the untouched test set with the tuned booster; predictions are
# probabilities of the positive class under "binary:logistic".
test_features <- as.matrix(data_test[getNumericColumns(data_test)])
predict_loan_status_xgb <- predict(xgb_model, test_features)

# Build the ROC curve on the held-out test labels and extract its AUC.
rocCurve_xgb <- roc(
  response = data_test$loan_status,
  predictor = predict_loan_status_xgb
)
auc_curve <- auc(rocCurve_xgb)

# Plot with 1 - specificity on the x-axis and the AUC printed on the curve.
plot(
  rocCurve_xgb,
  legacy.axes = TRUE,
  print.auc = TRUE,
  col = "red",
  main = "ROC(XGB)"
)
9
1> rocCurve_xgb
2Call:
3roc.default(response = data_test$loan_status, predictor = predict_loan_status_xgb)
4Data: predict_loan_status_xgb in 5358 controls (data_test$loan_status Default) < 12602 cases (data_test$loan_status Fully.Paid).
5Area under the curve: 0.706
6>
7
# Threshold the predicted probabilities at 0.5 to get hard class labels.
# confusionMatrix() in current caret requires `data` and `reference` to be
# factors with identical levels; the original passed a raw character
# vector, which errors with "`data` and `reference` should be factors
# with the same levels."
predict_loan_status_label <- factor(
  ifelse(predict_loan_status_xgb < 0.5, "Default", "Fully.Paid"),
  levels = levels(data_test$loan_status)
)

# NOTE(review): `c` shadows base::c as a variable name. Function calls
# below still resolve to base::c(), but a clearer name (e.g. `cm`) would
# be safer; kept as `c` to avoid breaking any later chunks.
c <- confusionMatrix(
  predict_loan_status_label,
  data_test$loan_status,
  positive = "Fully.Paid"
)

# Append the XGB row to the running model-comparison table.
table_perf[4, ] <- c(
  "XGB",
  round(auc_curve, 3),
  as.numeric(round(c$overall["Accuracy"], 3)),
  as.numeric(round(c$byClass["Sensitivity"], 3)),
  as.numeric(round(c$byClass["Specificity"], 3)),
  as.numeric(round(c$overall["Kappa"], 3))
)
11
The model's performance is as follows:
1> tail(table_perf,1)
2 model auc accuracy sensitivity specificity kappa
34 XGB 0.706 0.636 0.618 0.68 0.255
4>
5
Unlock Premium Content
Upgrade your account to access the full article, downloads, and exercises.
You'll get access to:
- Access complete tutorials and examples
- Download source code and resources
- Follow along with practical exercises
- Get in-depth explanations