Extreme Gradient Boosting in R

Extreme Gradient Boosting (XGBoost) has a very efficient implementation. Unlike SVM and Random Forest, we can afford to tune its parameters on the whole downsampled set. We focus on varying the learning rate (eta) together with the Lasso (alpha) and Ridge (lambda) regularization terms, holding out 10% of the data to validate each parameter combination.

set.seed(400)
# downsample again so that we get more information when stacking
samp = downSample(data_train[-getIndexsOfColumns(data_train, c( "loan_status") )],data_train$loan_status,yname="loan_status")
# hold out a small slice (10%) for validating the tuning parameters
train_index_tuning = createDataPartition(samp$loan_status,p = 0.1,list=FALSE,times=1)
# tuning grid: learning rate, L1 (alpha) and L2 (lambda) regularization
etas = c(0.1,0.3)
alphas = c(0,0.5,1)
lambdas = c(0,0.5,1)

install.packages("xgboost")
library(xgboost)

# validation watchlist: xgboost evaluates the eval metric on this set
# after every boosting round
test_watchlist = list(
    test = xgb.DMatrix(
        data = as.matrix(samp[train_index_tuning,][getNumericColumns(samp)]),
        # binary:logistic needs a 0/1 label: "Default" -> 0, "Fully.Paid" -> 1
        label = as.numeric(samp[train_index_tuning,"loan_status"])-1
    )
)
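
The label conversion relies on the factor level order, so it is worth confirming the mapping once. A quick sanity check (assuming the two levels are "Default" and "Fully.Paid", as in this data set):

# cross-tabulate the 0/1 labels against the factor levels;
# the expected mapping is Default -> 0, Fully.Paid -> 1
table(as.numeric(samp$loan_status)-1, samp$loan_status)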

# grid search: train one model per (eta, alpha, lambda) combination
# and record its best validation AUC on the 10% hold-out
gbm_perf = data.frame(eta=numeric(0),alpha=numeric(0),lambda=numeric(0),auc=numeric(0))
for(eta in etas){
    for(alpha in alphas){
        for(lambda in lambdas){
            model = xgb.train(
                data= xgb.DMatrix(
                    data = as.matrix(samp[-train_index_tuning,][getNumericColumns(samp)]),
                    label = as.numeric(samp[-train_index_tuning,"loan_status"])-1
                ),
                objective = "binary:logistic",
                nrounds = 350,
                watchlist = test_watchlist,
                eval_metric = "auc",
                early.stop.round = 10,
                alpha = alpha,
                lambda = lambda,
                eta = eta)
            gbm_perf[nrow(gbm_perf)+1,] = c(eta,alpha,lambda,model$bestScore)
        }
    }
}

gbm_perf %>% arrange(desc(auc))
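
If you prefer to pick the winning combination programmatically rather than reading it off the sorted table, a one-line dplyr sketch (dplyr is assumed to be loaded already, since %>% is used above):

# keep only the top row of the sorted grid-search results
best_params = gbm_perf %>% arrange(desc(auc)) %>% head(1)
best_params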

The best tuning parameters are eta = 0.1, alpha = 0.5, and lambda = 1.0. We retrain the model with these values here, in case readers did not run the tuning code, and collect its performance.

set.seed(400)
# rebuild the validation watchlist so this block runs standalone
test_watchlist = list(
    test = xgb.DMatrix(
        data = as.matrix(samp[train_index_tuning,][getNumericColumns(samp)]),
        label = as.numeric(samp[train_index_tuning,"loan_status"])-1
    )
)

# retrain with the best tuning parameters found above
xgb_model = xgb.train(
                data= xgb.DMatrix(
                    data = as.matrix(samp[-train_index_tuning,][getNumericColumns(samp)]),
                    label = as.numeric(samp[-train_index_tuning,"loan_status"])-1
                ),
                objective = "binary:logistic",
                nrounds = 350,
                watchlist = test_watchlist,
                eval_metric = "auc",
                early.stop.round = 10,
                alpha = 0.5,
                lambda = 1.0,
                eta = 0.1)
[1] test-auc:0.656015 
Will train until test_auc hasn't improved in 10 rounds.

[2] test-auc:0.672381 
[3] test-auc:0.679274 
[4] test-auc:0.681095 
[5] test-auc:0.689003 
[6] test-auc:0.690731 
[7] test-auc:0.692656 
[8] test-auc:0.693467 
[9] test-auc:0.695073 
[10]    test-auc:0.695977 
[11]    test-auc:0.697777 
[12]    test-auc:0.698707 
[13]    test-auc:0.698882 
[14]    test-auc:0.701135 
[15]    test-auc:0.702984 
[16]    test-auc:0.704485 
[17]    test-auc:0.705495 
[18]    test-auc:0.706659 
[19]    test-auc:0.707099 
[20]    test-auc:0.708352 
[21]    test-auc:0.708617 
[22]    test-auc:0.709497 
[23]    test-auc:0.710382 
[24]    test-auc:0.710718 
[25]    test-auc:0.711810 
[26]    test-auc:0.712518 
[27]    test-auc:0.712553 
[28]    test-auc:0.713433 
[29]    test-auc:0.713817 
[30]    test-auc:0.713927 
[31]    test-auc:0.714554 
[32]    test-auc:0.714980 
[33]    test-auc:0.715179 
[34]    test-auc:0.715924 
[35]    test-auc:0.716299 
[36]    test-auc:0.716700 
[37]    test-auc:0.716733 
[38]    test-auc:0.717246 
[39]    test-auc:0.717266 
[40]    test-auc:0.717660 
[41]    test-auc:0.718041 
[42]    test-auc:0.717934 
[43]    test-auc:0.718888 
[44]    test-auc:0.719207 
[45]    test-auc:0.719219 
[46]    test-auc:0.719322 
[47]    test-auc:0.719694 
[48]    test-auc:0.719976 
[49]    test-auc:0.720586 
[50]    test-auc:0.721058 
[51]    test-auc:0.721292 
[52]    test-auc:0.721402 
[53]    test-auc:0.721236 
[54]    test-auc:0.721692 
[55]    test-auc:0.721584 
[56]    test-auc:0.721585 
[57]    test-auc:0.721289 
[58]    test-auc:0.721552 
[59]    test-auc:0.721499 
[60]    test-auc:0.721309 
[61]    test-auc:0.721516 
[62]    test-auc:0.721776 
[63]    test-auc:0.721996 
[64]    test-auc:0.721674 
[65]    test-auc:0.722132 
[66]    test-auc:0.722179 
[67]    test-auc:0.721972 
[68]    test-auc:0.721933 
[69]    test-auc:0.722067 
[70]    test-auc:0.722049 
[71]    test-auc:0.722391 
[72]    test-auc:0.722212 
[73]    test-auc:0.722595 
[74]    test-auc:0.722503 
[75]    test-auc:0.722345 
[76]    test-auc:0.722503 
[77]    test-auc:0.722589 
[78]    test-auc:0.722731 
[79]    test-auc:0.722534 
[80]    test-auc:0.722708 
[81]    test-auc:0.722812 
[82]    test-auc:0.722784 
[83]    test-auc:0.723001 
[84]    test-auc:0.723074 
[85]    test-auc:0.722978 
[86]    test-auc:0.722794 
[87]    test-auc:0.723016 
[88]    test-auc:0.723130 
[89]    test-auc:0.723050 
[90]    test-auc:0.722762 
[91]    test-auc:0.722684 
[92]    test-auc:0.723375 
[93]    test-auc:0.723760 
[94]    test-auc:0.723965 
[95]    test-auc:0.724011 
[96]    test-auc:0.724273 
[97]    test-auc:0.724057 
[98]    test-auc:0.723959 
[99]    test-auc:0.724219 
[100]   test-auc:0.724262 
[101]   test-auc:0.724202 
[102]   test-auc:0.724573 
[103]   test-auc:0.724432 
[104]   test-auc:0.724790 
[105]   test-auc:0.724792 
[106]   test-auc:0.725028 
[107]   test-auc:0.725214 
[108]   test-auc:0.725203 
[109]   test-auc:0.725292 
[110]   test-auc:0.725163 
[111]   test-auc:0.725213 
[112]   test-auc:0.725229 
[113]   test-auc:0.724999 
[114]   test-auc:0.725181 
[115]   test-auc:0.725151 
[116]   test-auc:0.725140 
[117]   test-auc:0.725079 
[118]   test-auc:0.725306 
[119]   test-auc:0.725115 
[120]   test-auc:0.725141 
[121]   test-auc:0.725279 
[122]   test-auc:0.725290 
[123]   test-auc:0.725333 
[124]   test-auc:0.725351 
[125]   test-auc:0.725092 
[126]   test-auc:0.725250 
[127]   test-auc:0.725163 
[128]   test-auc:0.725266 
[129]   test-auc:0.725420 
[130]   test-auc:0.725525 
[131]   test-auc:0.725444 
[132]   test-auc:0.725441 
[133]   test-auc:0.725484 
[134]   test-auc:0.725423 
[135]   test-auc:0.725419 
[136]   test-auc:0.725665 
[137]   test-auc:0.725749 
[138]   test-auc:0.725820 
[139]   test-auc:0.725869 
[140]   test-auc:0.725578 
[141]   test-auc:0.725665 
[142]   test-auc:0.725927 
[143]   test-auc:0.725749 
[144]   test-auc:0.725693 
[145]   test-auc:0.725440 
[146]   test-auc:0.725374 
[147]   test-auc:0.725685 
[148]   test-auc:0.725883 
[149]   test-auc:0.725466 
[150]   test-auc:0.725497 
[151]   test-auc:0.725454 
[152]   test-auc:0.725437 
Stopping. Best iteration:
[142]   test-auc:0.725927
> xgb_model
##### xgb.Booster
raw: 480.9 Kb 
call:
  xgb.train(data = xgb.DMatrix(data = as.matrix(samp[-train_index_tuning, 
    ][getNumericColumns(samp)]), label = as.numeric(samp[-train_index_tuning, 
    "loan_status"]) - 1), nrounds = 350, watchlist = test_watchlist, 
    objective = "binary:logistic", eval_metric = "auc", early.stop.round = 10, 
    alpha = 0.5, lambda = 1, eta = 0.1)
params (as set within xgb.train):
  objective = "binary:logistic", eval_metric = "auc", early_stop_round = "10", alpha = "0.5", lambda = "1", eta = "0.1", silent = "1"
xgb.attributes:
  best_iteration, best_msg, best_ntreelimit, best_score, niter
callbacks:
  cb.print.evaluation(period = print_every_n)
  cb.evaluation.log()
  cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, 
    verbose = verbose)
niter: 152
best_iteration: 142
best_ntreelimit: 142
best_score: 0.725927
evaluation_log:
    iter test_auc
       1 0.656015
       2 0.672381
---              
     151 0.725454
     152 0.725437
> 
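
Before scoring the hold-out set, it can be useful to glance at which features the booster actually relies on. A minimal sketch using xgboost's xgb.importance (this assumes getNumericColumns returns column names rather than indices):

# gain-based importance of the numeric features used in training
importance_matrix = xgb.importance(
    feature_names = getNumericColumns(samp),
    model = xgb_model)
head(importance_matrix)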
# score the untouched test set with the fitted booster
predict_loan_status_xgb = predict(xgb_model,as.matrix(data_test[getNumericColumns(data_test)]))

rocCurve_xgb = roc(response = data_test$loan_status,
               predictor = predict_loan_status_xgb)

auc_curve = auc(rocCurve_xgb)

plot(rocCurve_xgb,legacy.axes = TRUE,print.auc = TRUE,col="red",main="ROC(XGB)")

> rocCurve_xgb
Call:
roc.default(response = data_test$loan_status, predictor = predict_loan_status_xgb)
Data: predict_loan_status_xgb in 5358 controls (data_test$loan_status Default) < 12602 cases (data_test$loan_status Fully.Paid).
Area under the curve: 0.706
>
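
The confusion matrix below uses a fixed 0.5 cutoff. If you want a data-driven alternative, pROC can report the cutoff that maximizes Youden's J statistic; a short sketch:

# threshold maximizing sensitivity + specificity - 1 (Youden's J)
coords(rocCurve_xgb, "best", best.method = "youden")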
predict_loan_status_label = ifelse(predict_loan_status_xgb < 0.5,"Default","Fully.Paid")
# confusionMatrix() expects factors, so align the predicted labels
# with the levels of the observed outcome
cm = confusionMatrix(factor(predict_loan_status_label,levels = levels(data_test$loan_status)),
                     data_test$loan_status,positive = "Fully.Paid")

table_perf[4,] = c("XGB",
  round(auc_curve,3),
  as.numeric(round(cm$overall["Accuracy"],3)),
  as.numeric(round(cm$byClass["Sensitivity"],3)),
  as.numeric(round(cm$byClass["Specificity"],3)),
  as.numeric(round(cm$overall["Kappa"],3))
  )

The model's performance is as follows:

> tail(table_perf,1)
  model   auc accuracy sensitivity specificity kappa
4   XGB 0.706    0.636       0.618        0.68 0.255
> 
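
To put these numbers next to the earlier models, print the accumulated comparison table (assuming table_perf still holds the rows added in the previous lessons):

table_perf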
