Extreme Gradient Boosting in R

Premium

Extreme Gradient Boosting has a very efficient implementation. Unlike SVM and Random Forest, we can tune its parameters using the whole down-sampled set. We focus on varying the Ridge & Lasso regularization strengths and the learning rate. We use 10% of the data for validating the tuning parameters.

set.seed(400)
# Down-sample again so that we get more information when stacking.
samp <- downSample(
  data_train[-getIndexsOfColumns(data_train, c("loan_status"))],
  data_train$loan_status,
  yname = "loan_status"
)
# Hold out a small slice (10%) of the balanced set for validating the tuning
# parameters; the remaining 90% is used for training during the grid search.
train_index_tuning <- createDataPartition(
  samp$loan_status,
  p = 0.1,
  list = FALSE,
  times = 1
)
6
# Grid of tuning parameters: learning rate (eta), L1 regularization (alpha,
# "Lasso") and L2 regularization (lambda, "Ridge").
etas <- c(0.1, 0.3)
alphas <- c(0, 0.5, 1)
lambdas <- c(0, 0.5, 1)

# Install only when missing; the original had a smart quote ("xgboost”) which
# is a syntax error, and would re-install the package on every run.
if (!requireNamespace("xgboost", quietly = TRUE)) {
  install.packages("xgboost")
}
library(xgboost)

# Validation set watched during training; early stopping monitors its AUC.
test_watchlist <- list(
  test = xgb.DMatrix(
    data = as.matrix(samp[train_index_tuning, ][getNumericColumns(samp)]),
    label = as.numeric(samp[train_index_tuning, "loan_status"]) - 1
  )
)

# One row per (eta, alpha, lambda) combination with its validation AUC.
gbm_perf <- data.frame(
  eta = numeric(0),
  alpha = numeric(0),
  lambda = numeric(0),
  auc = numeric(0)
)
for (eta in etas) {
  for (alpha in alphas) {
    for (lambda in lambdas) {
      model <- xgb.train(
        data = xgb.DMatrix(
          data = as.matrix(samp[-train_index_tuning, ][getNumericColumns(samp)]),
          label = as.numeric(samp[-train_index_tuning, "loan_status"]) - 1
        ),
        objective = "binary:logistic",
        nrounds = 350,
        watchlist = test_watchlist,
        eval_metric = "auc",
        # NOTE: the original passed `early.stop.rounds`, which xgb.train does
        # not recognize (even the deprecated spelling was `early.stop.round`),
        # so early stopping was silently disabled here. The supported argument
        # is `early_stopping_rounds`.
        early_stopping_rounds = 10,
        alpha = alpha,
        lambda = lambda,
        eta = eta
      )
      # `best_score` is the validation AUC at the best (early-stopped)
      # iteration; the original's `model$bestScore` is NULL in current xgboost
      # (the attribute is `best_score`, as the printed model output shows).
      gbm_perf[nrow(gbm_perf) + 1, ] <- c(eta, alpha, lambda, model$best_score)
    }
  }
}

# Rank the parameter combinations by validation AUC, best first.
gbm_perf %>% arrange(desc(auc))
38

The best tuning parameters are eta = 0.1, alpha = 0.5, and lambda = 1.0. We retrain the model here in case readers didn’t run the tuning code, and collect its performance.

set.seed(400)
# Validation set watched during training (rebuilt here so this chunk is
# self-contained for readers who skipped the tuning chunk).
test_watchlist <- list(
  test = xgb.DMatrix(
    data = as.matrix(samp[train_index_tuning, ][getNumericColumns(samp)]),
    label = as.numeric(samp[train_index_tuning, "loan_status"]) - 1
  )
)

# Retrain with the best tuning parameters: eta = 0.1, alpha = 0.5, lambda = 1.0.
xgb_model <- xgb.train(
  data = xgb.DMatrix(
    data = as.matrix(samp[-train_index_tuning, ][getNumericColumns(samp)]),
    label = as.numeric(samp[-train_index_tuning, "loan_status"]) - 1
  ),
  objective = "binary:logistic",
  nrounds = 350,
  watchlist = test_watchlist,
  eval_metric = "auc",
  # `early.stop.round` is the deprecated spelling; the supported argument is
  # `early_stopping_rounds` (stop after 10 rounds without AUC improvement).
  early_stopping_rounds = 10,
  alpha = 0.5,
  lambda = 1.0,
  eta = 0.1
)
22
1[1]	test-auc:0.656015 
2Will train until test_auc hasn't improved in 10 rounds.
3
4[2]	test-auc:0.672381 
5[3]	test-auc:0.679274 
6[4]	test-auc:0.681095 
7[5]	test-auc:0.689003 
8[6]	test-auc:0.690731 
9[7]	test-auc:0.692656 
10[8]	test-auc:0.693467 
11[9]	test-auc:0.695073 
12[10]	test-auc:0.695977 
13[11]	test-auc:0.697777 
14[12]	test-auc:0.698707 
15[13]	test-auc:0.698882 
16[14]	test-auc:0.701135 
17[15]	test-auc:0.702984 
18[16]	test-auc:0.704485 
19[17]	test-auc:0.705495 
20[18]	test-auc:0.706659 
21[19]	test-auc:0.707099 
22[20]	test-auc:0.708352 
23[21]	test-auc:0.708617 
24[22]	test-auc:0.709497 
25[23]	test-auc:0.710382 
26[24]	test-auc:0.710718 
27[25]	test-auc:0.711810 
28[26]	test-auc:0.712518 
29[27]	test-auc:0.712553 
30[28]	test-auc:0.713433 
31[29]	test-auc:0.713817 
32[30]	test-auc:0.713927 
33[31]	test-auc:0.714554 
34[32]	test-auc:0.714980 
35[33]	test-auc:0.715179 
36[34]	test-auc:0.715924 
37[35]	test-auc:0.716299 
38[36]	test-auc:0.716700 
39[37]	test-auc:0.716733 
40[38]	test-auc:0.717246 
41[39]	test-auc:0.717266 
42[40]	test-auc:0.717660 
43[41]	test-auc:0.718041 
44[42]	test-auc:0.717934 
45[43]	test-auc:0.718888 
46[44]	test-auc:0.719207 
47[45]	test-auc:0.719219 
48[46]	test-auc:0.719322 
49[47]	test-auc:0.719694 
50[48]	test-auc:0.719976 
51[49]	test-auc:0.720586 
52[50]	test-auc:0.721058 
53[51]	test-auc:0.721292 
54[52]	test-auc:0.721402 
55[53]	test-auc:0.721236 
56[54]	test-auc:0.721692 
57[55]	test-auc:0.721584 
58[56]	test-auc:0.721585 
59[57]	test-auc:0.721289 
60[58]	test-auc:0.721552 
61[59]	test-auc:0.721499 
62[60]	test-auc:0.721309 
63[61]	test-auc:0.721516 
64[62]	test-auc:0.721776 
65[63]	test-auc:0.721996 
66[64]	test-auc:0.721674 
67[65]	test-auc:0.722132 
68[66]	test-auc:0.722179 
69[67]	test-auc:0.721972 
70[68]	test-auc:0.721933 
71[69]	test-auc:0.722067 
72[70]	test-auc:0.722049 
73[71]	test-auc:0.722391 
74[72]	test-auc:0.722212 
75[73]	test-auc:0.722595 
76[74]	test-auc:0.722503 
77[75]	test-auc:0.722345 
78[76]	test-auc:0.722503 
79[77]	test-auc:0.722589 
80[78]	test-auc:0.722731 
81[79]	test-auc:0.722534 
82[80]	test-auc:0.722708 
83[81]	test-auc:0.722812 
84[82]	test-auc:0.722784 
85[83]	test-auc:0.723001 
86[84]	test-auc:0.723074 
87[85]	test-auc:0.722978 
88[86]	test-auc:0.722794 
89[87]	test-auc:0.723016 
90[88]	test-auc:0.723130 
91[89]	test-auc:0.723050 
92[90]	test-auc:0.722762 
93[91]	test-auc:0.722684 
94[92]	test-auc:0.723375 
95[93]	test-auc:0.723760 
96[94]	test-auc:0.723965 
97[95]	test-auc:0.724011 
98[96]	test-auc:0.724273 
99[97]	test-auc:0.724057 
100[98]	test-auc:0.723959 
101[99]	test-auc:0.724219 
102[100]	test-auc:0.724262 
103[101]	test-auc:0.724202 
104[102]	test-auc:0.724573 
105[103]	test-auc:0.724432 
106[104]	test-auc:0.724790 
107[105]	test-auc:0.724792 
108[106]	test-auc:0.725028 
109[107]	test-auc:0.725214 
110[108]	test-auc:0.725203 
111[109]	test-auc:0.725292 
112[110]	test-auc:0.725163 
113[111]	test-auc:0.725213 
114[112]	test-auc:0.725229 
115[113]	test-auc:0.724999 
116[114]	test-auc:0.725181 
117[115]	test-auc:0.725151 
118[116]	test-auc:0.725140 
119[117]	test-auc:0.725079 
120[118]	test-auc:0.725306 
121[119]	test-auc:0.725115 
122[120]	test-auc:0.725141 
123[121]	test-auc:0.725279 
124[122]	test-auc:0.725290 
125[123]	test-auc:0.725333 
126[124]	test-auc:0.725351 
127[125]	test-auc:0.725092 
128[126]	test-auc:0.725250 
129[127]	test-auc:0.725163 
130[128]	test-auc:0.725266 
131[129]	test-auc:0.725420 
132[130]	test-auc:0.725525 
133[131]	test-auc:0.725444 
134[132]	test-auc:0.725441 
135[133]	test-auc:0.725484 
136[134]	test-auc:0.725423 
137[135]	test-auc:0.725419 
138[136]	test-auc:0.725665 
139[137]	test-auc:0.725749 
140[138]	test-auc:0.725820 
141[139]	test-auc:0.725869 
142[140]	test-auc:0.725578 
143[141]	test-auc:0.725665 
144[142]	test-auc:0.725927 
145[143]	test-auc:0.725749 
146[144]	test-auc:0.725693 
147[145]	test-auc:0.725440 
148[146]	test-auc:0.725374 
149[147]	test-auc:0.725685 
150[148]	test-auc:0.725883 
151[149]	test-auc:0.725466 
152[150]	test-auc:0.725497 
153[151]	test-auc:0.725454 
154[152]	test-auc:0.725437 
155Stopping. Best iteration:
156[142]	test-auc:0.725927
157
1> xgb_model
2##### xgb.Booster
3raw: 480.9 Kb 
4call:
5  xgb.train(data = xgb.DMatrix(data = as.matrix(samp[-train_index_tuning, 
6    ][getNumericColumns(samp)]), label = as.numeric(samp[-train_index_tuning, 
7    "loan_status"]) - 1), nrounds = 350, watchlist = test_watchlist, 
8    objective = "binary:logistic", eval_metric = "auc", early.stop.round = 10, 
9    alpha = 0.5, lambda = 1, eta = 0.1)
10params (as set within xgb.train):
11  objective = "binary:logistic", eval_metric = "auc", early_stop_round = "10", alpha = "0.5", lambda = "1", eta = "0.1", silent = "1"
12xgb.attributes:
13  best_iteration, best_msg, best_ntreelimit, best_score, niter
14callbacks:
15  cb.print.evaluation(period = print_every_n)
16  cb.evaluation.log()
17  cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, 
18    verbose = verbose)
19niter: 152
20best_iteration: 142
21best_ntreelimit: 142
22best_score: 0.725927
23evaluation_log:
24    iter test_auc
25       1 0.656015
26       2 0.672381
27---              
28     151 0.725454
29     152 0.725437
30> 
31
# Score the held-out test set; predict() on a "binary:logistic" booster
# returns the predicted probability of the positive class (Fully.Paid).
predict_loan_status_xgb <- predict(
  xgb_model,
  as.matrix(data_test[getNumericColumns(data_test)])
)

# ROC curve of the predicted probabilities against the true labels.
rocCurve_xgb <- roc(
  response = data_test$loan_status,
  predictor = predict_loan_status_xgb
)

auc_curve <- auc(rocCurve_xgb)

plot(rocCurve_xgb, legacy.axes = TRUE, print.auc = TRUE, col = "red",
     main = "ROC(XGB)")
9
1> rocCurve_xgb
2Call:
3roc.default(response = data_test$loan_status, predictor = predict_loan_status_xgb)
4Data: predict_loan_status_xgb in 5358 controls (data_test$loan_status Default) < 12602 cases (data_test$loan_status Fully.Paid).
5Area under the curve: 0.706
6>
7
# Threshold the predicted probabilities at 0.5 to get hard class labels.
predict_loan_status_label <- ifelse(predict_loan_status_xgb < 0.5,
                                    "Default", "Fully.Paid")
# confusionMatrix() expects factors; coerce the predictions with the
# reference's levels so both sides share the same level order (passing a bare
# character vector errors in recent caret versions). Also avoid binding the
# result to `c`, which shadows base::c and is used as a function just below.
cm <- confusionMatrix(
  factor(predict_loan_status_label, levels = levels(data_test$loan_status)),
  data_test$loan_status,
  positive = "Fully.Paid"
)

# Append the XGB row to the running model-comparison table.
table_perf[4, ] <- c(
  "XGB",
  round(auc_curve, 3),
  as.numeric(round(cm$overall["Accuracy"], 3)),
  as.numeric(round(cm$byClass["Sensitivity"], 3)),
  as.numeric(round(cm$byClass["Specificity"], 3)),
  as.numeric(round(cm$overall["Kappa"], 3))
)
11

The model’s performance is as follows:

1> tail(table_perf,1)
2  model   auc accuracy sensitivity specificity kappa
34   XGB 0.706    0.636       0.618        0.68 0.255
4> 
5