Extreme Gradient Boosting in R

Extreme Gradient Boosting (XGBoost) has a very efficient implementation. Unlike SVM and RandomForest, we can tune its parameters using the whole down-sampled set. We focus on varying the Ridge and Lasso regularization strengths and the learning rate. We hold out 10% of the data to validate the tuning parameters.

# Fix the RNG so the down-sampling and the partition below are reproducible.
set.seed(400)

# Down-sample again so that we get more information when stacking.
# Predictors are every column except loan_status; the class column is
# re-attached by downSample() under the name given in `yname`.
samp <- downSample(
  x = data_train[-getIndexsOfColumns(data_train, "loan_status")],
  y = data_train$loan_status,
  yname = "loan_status"
)

# Pick a small (10%) slice of the down-sampled data for validating the tuning.
train_index_tuning <- createDataPartition(
  y = samp$loan_status,
  times = 1,
  p = 0.1,
  list = FALSE
)

# Hyper-parameter grid: learning rate (eta) and the two regularization
# weights (alpha, lambda) passed to xgboost.
etas <- c(0.1, 0.3)
alphas <- c(0, 0.5, 1)
lambdas <- c(0, 0.5, 1)

# Install xgboost only when it is missing, then attach it.
# (The original line closed the string with a typographic quote, which is a
# parse error, and re-installed the package on every run.)
if (!requireNamespace("xgboost", quietly = TRUE)) {
  install.packages("xgboost")
}
library(xgboost)

# Watchlist with a single evaluation set named "test", built from the rows
# selected by train_index_tuning. Features are the columns reported by
# getNumericColumns(samp); the factor label is shifted to start at 0.
test_watchlist <- local({
  holdout_rows <- samp[train_index_tuning, ]
  holdout_features <- as.matrix(holdout_rows[getNumericColumns(samp)])
  holdout_labels <- as.numeric(holdout_rows[, "loan_status"]) - 1
  list(test = xgb.DMatrix(data = holdout_features, label = holdout_labels))
})

# Grid search over eta / alpha / lambda.
# Each combination is trained on the rows NOT in train_index_tuning and scored
# (AUC) on test_watchlist; the best AUC seen before early stopping is recorded.

# The training matrix is the same for every grid point, so build it once
# instead of rebuilding it inside the innermost loop.
tuning_dtrain <- xgb.DMatrix(
  data = as.matrix(samp[-train_index_tuning, ][getNumericColumns(samp)]),
  label = as.numeric(samp[-train_index_tuning, "loan_status"]) - 1
)

gbm_perf <- data.frame(
  eta = numeric(0),
  alpha = numeric(0),
  lambda = numeric(0),
  auc = numeric(0)
)

for (eta in etas) {
  for (alpha in alphas) {
    for (lambda in lambdas) {
      model <- xgb.train(
        params = list(
          objective = "binary:logistic",
          eval_metric = "auc",
          alpha = alpha,   # L1 regularization weight
          lambda = lambda, # L2 regularization weight
          eta = eta        # learning rate
        ),
        data = tuning_dtrain,
        nrounds = 350,
        watchlist = test_watchlist,
        # Was `early.stop.rounds`, which is not a recognised argument name and
        # so did not enable early stopping; this matches the retraining step.
        early_stopping_rounds = 10
      )
      # Was `model$bestScore`; the booster exposes the value as `best_score`.
      # A NULL here would silently shorten the row being appended.
      gbm_perf[nrow(gbm_perf) + 1, ] <- c(eta, alpha, lambda, model$best_score)
    }
  }
}

# Show the grid ordered from best to worst validation AUC.
gbm_perf %>% arrange(desc(auc))

The best tuning parameters are eta = 0.1, alpha = 0.5, and lambda = 1.0. We retrain the model with them here in case readers didn’t run the tuning code, and then collect its performance.

# Retrain the final model with the selected hyper-parameters
# (eta = 0.1, alpha = 0.5, lambda = 1.0) so this section can run without the
# grid search above.
set.seed(400)

# Evaluation set: the rows selected by train_index_tuning.
test_watchlist <- list(
  test = xgb.DMatrix(
    data = as.matrix(samp[train_index_tuning, ][getNumericColumns(samp)]),
    label = as.numeric(samp[train_index_tuning, "loan_status"]) - 1
  )
)

xgb_model <- xgb.train(
  params = list(
    objective = "binary:logistic",
    eval_metric = "auc",
    alpha = 0.5,  # L1 regularization weight
    lambda = 1.0, # L2 regularization weight
    eta = 0.1     # learning rate
  ),
  # Training set: every row NOT selected by train_index_tuning.
  data = xgb.DMatrix(
    data = as.matrix(samp[-train_index_tuning, ][getNumericColumns(samp)]),
    label = as.numeric(samp[-train_index_tuning, "loan_status"]) - 1
  ),
  nrounds = 350,
  watchlist = test_watchlist,
  # Replaces the deprecated alias `early.stop.round`, which was also being
  # forwarded to the booster as a stray `early_stop_round` parameter.
  early_stopping_rounds = 10
)
[1] test-auc:0.656015 
Will train until test_auc hasn't improved in 10 rounds.

[2] test-auc:0.672381 
[3] test-auc:0.679274 
[4] test-auc:0.681095 
[5] test-auc:0.689003 
[6] test-auc:0.690731 
[7] test-auc:0.692656 
[8] test-auc:0.693467 
[9] test-auc:0.695073 
[10]    test-auc:0.695977 
[11]    test-auc:0.697777 
[12]    test-auc:0.698707 
[13]    test-auc:0.698882 
[14]    test-auc:0.701135 
[15]    test-auc:0.702984 
[16]    test-auc:0.704485 
[17]    test-auc:0.705495 
[18]    test-auc:0.706659 
[19]    test-auc:0.707099 
[20]    test-auc:0.708352 
[21]    test-auc:0.708617 
[22]    test-auc:0.709497 
[23]    test-auc:0.710382 
[24]    test-auc:0.710718 
[25]    test-auc:0.711810 
[26]    test-auc:0.712518 
[27]    test-auc:0.712553 
[28]    test-auc:0.713433 
[29]    test-auc:0.713817 
[30]    test-auc:0.713927 
[31]    test-auc:0.714554 
[32]    test-auc:0.714980 
[33]    test-auc:0.715179 
[34]    test-auc:0.715924 
[35]    test-auc:0.716299 
[36]    test-auc:0.716700 
[37]    test-auc:0.716733 
[38]    test-auc:0.717246 
[39]    test-auc:0.717266 
[40]    test-auc:0.717660 
[41]    test-auc:0.718041 
[42]    test-auc:0.717934 
[43]    test-auc:0.718888 
[44]    test-auc:0.719207 
[45]    test-auc:0.719219 
[46]    test-auc:0.719322 
[47]    test-auc:0.719694 
[48]    test-auc:0.719976 
[49]    test-auc:0.720586 
[50]    test-auc:0.721058 
[51]    test-auc:0.721292 
[52]    test-auc:0.721402 
[53]    test-auc:0.721236 
[54]    test-auc:0.721692 
[55]    test-auc:0.721584 
[56]    test-auc:0.721585 
[57]    test-auc:0.721289 
[58]    test-auc:0.721552 
[59]    test-auc:0.721499 
[60]    test-auc:0.721309 
[61]    test-auc:0.721516 
[62]    test-auc:0.721776 
[63]    test-auc:0.721996 
[64]    test-auc:0.721674 
[65]    test-auc:0.722132 
[66]    test-auc:0.722179 
[67]    test-auc:0.721972 
[68]    test-auc:0.721933 
[69]    test-auc:0.722067 
[70]    test-auc:0.722049 
[71]    test-auc:0.722391 
[72]    test-auc:0.722212 
[73]    test-auc:0.722595 
[74]    test-auc:0.722503 
[75]    test-auc:0.722345 
[76]    test-auc:0.722503 
[77]    test-auc:0.722589 
[78]    test-auc:0.722731 
[79]    test-auc:0.722534 
[80]    test-auc:0.722708 
[81]    test-auc:0.722812 
[82]    test-auc:0.722784 
[83]    test-auc:0.723001 
[84]    test-auc:0.723074 
[85]    test-auc:0.722978 
[86]    test-auc:0.722794 
[87]    test-auc:0.723016 
[88]    test-auc:0.723130 
[89]    test-auc:0.723050 
[90]    test-auc:0.722762 
[91]    test-auc:0.722684 
[92]    test-auc:0.723375 
[93]    test-auc:0.723760 
[94]    test-auc:0.723965 
[95]    test-auc:0.724011 
[96]    test-auc:0.724273 
[97]    test-auc:0.724057 
[98]    test-auc:0.723959 
[99]    test-auc:0.724219 
[100]   test-auc:0.724262 
[101]   test-auc:0.724202 
[102]   test-auc:0.724573 
[103]   test-auc:0.724432 
[104]   test-auc:0.724790 
[105]   test-auc:0.724792 
[106]   test-auc:0.725028 
[107]   test-auc:0.725214 
[108]   test-auc:0.725203 
[109]   test-auc:0.725292 
[110]   test-auc:0.725163 
[111]   test-auc:0.725213 
[112]   test-auc:0.725229 
[113]   test-auc:0.724999 
[114]   test-auc:0.725181 
[115]   test-auc:0.725151 
[116]   test-auc:0.725140 
[117]   test-auc:0.725079 
[118]   test-auc:0.725306 
[119]   test-auc:0.725115 
[120]   test-auc:0.725141 
[121]   test-auc:0.725279 
[122]   test-auc:0.725290 
[123]   test-auc:0.725333 
[124]   test-auc:0.725351 
[125]   test-auc:0.725092 
[126]   test-auc:0.725250 
[127]   test-auc:0.725163 
[128]   test-auc:0.725266 
[129]   test-auc:0.725420 
[130]   test-auc:0.725525 
[131]   test-auc:0.725444 
[132]   test-auc:0.725441 
[133]   test-auc:0.725484 
[134]   test-auc:0.725423 
[135]   test-auc:0.725419 
[136]   test-auc:0.725665 
[137]   test-auc:0.725749 
[138]   test-auc:0.725820 
[139]   test-auc:0.725869 
[140]   test-auc:0.725578 
[141]   test-auc:0.725665 
[142]   test-auc:0.725927 
[143]   test-auc:0.725749 
[144]   test-auc:0.725693 
[145]   test-auc:0.725440 
[146]   test-auc:0.725374 
[147]   test-auc:0.725685 
[148]   test-auc:0.725883 
[149]   test-auc:0.725466 
[150]   test-auc:0.725497 
[151]   test-auc:0.725454 
[152]   test-auc:0.725437 
Stopping. Best iteration:
[142]   test-auc:0.725927
> xgb_model
##### xgb.Booster
raw: 480.9 Kb 
call:
  xgb.train(data = xgb.DMatrix(data = as.matrix(samp[-train_index_tuning, 
    ][getNumericColumns(samp)]), label = as.numeric(samp[-train_index_tuning, 
    "loan_status"]) - 1), nrounds = 350, watchlist = test_watchlist, 
    objective = "binary:logistic", eval_metric = "auc", early.stop.round = 10, 
    alpha = 0.5, lambda = 1, eta = 0.1)
params (as set within xgb.train):
  objective = "binary:logistic", eval_metric = "auc", early_stop_round = "10", alpha = "0.5", lambda = "1", eta = "0.1", silent = "1"
xgb.attributes:
  best_iteration, best_msg, best_ntreelimit, best_score, niter
callbacks:
  cb.print.evaluation(period = print_every_n)
  cb.evaluation.log()
  cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, 
    verbose = verbose)
niter: 152
best_iteration: 142
best_ntreelimit: 142
best_score: 0.725927
evaluation_log:
    iter test_auc
       1 0.656015
       2 0.672381
---              
     151 0.725454
     152 0.725437
> 
# Score the test set with the retrained booster, using the columns reported by
# getNumericColumns(data_test) as the feature matrix.
predict_loan_status_xgb <- predict(
  xgb_model,
  as.matrix(data_test[getNumericColumns(data_test)])
)

# ROC curve of the predicted scores against the true loan status.
rocCurve_xgb <- roc(
  predictor = predict_loan_status_xgb,
  response = data_test$loan_status
)

# Area under that curve; stored for the performance table.
auc_curve <- auc(rocCurve_xgb)

plot(
  rocCurve_xgb,
  main = "ROC(XGB)",
  col = "red",
  print.auc = TRUE,
  legacy.axes = TRUE
)

> rocCurve_xgb
Call:
roc.default(response = data_test$loan_status, predictor = predict_loan_status_xgb)
Data: predict_loan_status_xgb in 5358 controls (data_test$loan_status Default) < 12602 cases (data_test$loan_status Fully.Paid).
Area under the curve: 0.706
>
# Turn the predicted scores into class labels with a 0.5 cut-off.
# confusionMatrix() needs factors sharing the reference's levels, so the labels
# are built as a factor rather than left as a character vector.
# NOTE(review): assumes data_test$loan_status is a factor with levels
# "Default" and "Fully.Paid" - confirm against the data preparation step.
predict_loan_status_label <- factor(
  ifelse(predict_loan_status_xgb < 0.5, "Default", "Fully.Paid"),
  levels = levels(data_test$loan_status)
)

# NOTE(review): `c` shadows base::c as a variable name. Calls such as c(...)
# below still resolve to the function, but a rename is worth considering; the
# name is kept here in case other parts of the script read it.
c <- confusionMatrix(
  data = predict_loan_status_label,
  reference = data_test$loan_status,
  positive = "Fully.Paid"
)

# Row 4 of the performance table: model name followed by AUC, accuracy,
# sensitivity, specificity and kappa, each rounded to 3 decimals.
table_perf[4, ] <- c(
  "XGB",
  round(auc_curve, 3),
  as.numeric(round(c$overall["Accuracy"], 3)),
  as.numeric(round(c$byClass["Sensitivity"], 3)),
  as.numeric(round(c$byClass["Specificity"], 3)),
  as.numeric(round(c$overall["Kappa"], 3))
)

The model’s performance is as follows:

> tail(table_perf,1)
  model   auc accuracy sensitivity specificity kappa
4   XGB 0.706    0.636       0.618        0.68 0.255
> 
Finance Train Subscription

Unlock full access to Finance Train and see the entire library of member-only content and resources.