Random Forest Model in R


Now we will tune the Random Forest model. As with SVM, we tune the parameter on 5% of the downsampled data. The procedure is exactly the same as for the SVM model; the code for the Random Forest model is reproduced below.
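The code below drops columns with the getIndexsOfColumns() helper introduced earlier in this series. If you are running this section in isolation, a minimal sketch of what that helper is assumed to do (return the integer positions of the named columns) is:

# Minimal sketch of the helper assumed from earlier parts of this series:
# return the integer positions of the named columns in a data frame.
getIndexsOfColumns = function(data_set, column_names){
    which(colnames(data_set) %in% column_names)
}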

set.seed(300)
# Down-sample again so that we retain more information when stacking
samp = downSample(data_train[-getIndexsOfColumns(data_train, c( "loan_status") )],data_train$loan_status,yname="loan_status")
# Choose a small subset (5%) for tuning
train_index_tuning = createDataPartition(samp$loan_status,p = 0.05,list=FALSE,times=1)
# Choose a slightly larger subset (10%) for re-training
train_index_training = createDataPartition(samp$loan_status,p = 0.1,list=FALSE,times=1)
# Candidate mtry values, evenly spaced from 2 up to the number of columns
rfGrid = expand.grid(
                .mtry = as.integer(seq(2,ncol(samp), (ncol(samp) - 2)/4))
                )
# Load the random forest package
library(randomForest)
rfTuned = train(
    samp[train_index_tuning,-getIndexsOfColumns(samp,"loan_status")],
    y = samp[train_index_tuning,"loan_status"],
    method = "rf",
    tuneGrid = rfGrid,
    metric = "ROC",
    trControl = ctrl,
    preProcess = NULL,
    ntree = 100
    )
plot(rfTuned)
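The train() call also reuses the ctrl object defined in the SVM section. Its exact definition appears there; judging from the resampling summary below (3-fold cross-validation reporting ROC, Sens and Spec), it is presumably something along these lines:

# Assumed shape of ctrl from the SVM section: 3-fold cross-validation with
# class probabilities so that ROC can be used as the tuning metric.
library(caret)
ctrl = trainControl(method = "cv",
                    number = 3,
                    classProbs = TRUE,
                    summaryFunction = twoClassSummary)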
> rfTuned
Random Forest 

1268 samples
  70 predictor
   2 classes: 'Default', 'Fully.Paid' 

No pre-processing
Resampling: Cross-Validated (3 fold) 
Summary of sample sizes: 845, 845, 846 
Resampling results across tuning parameters:

  mtry  ROC        Sens       Spec     
   2    0.7028532  0.6909073  0.6199440
  19    0.6832394  0.6451832  0.6088706
  36    0.6706683  0.6231333  0.5820516
  53    0.6748038  0.6263003  0.6026111
  71    0.6751421  0.6609511  0.5962622

ROC was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
> 

The best tuning parameter is mtry (the number of predictors sampled at each split) = 2. As with SVM, we fit the model on 10% of the downsampled data using this value.

# Re-train on the 10% sample with the tuned mtry and more trees
rf_model = randomForest(loan_status ~ . ,data = samp[train_index_training,],mtry = 2,ntree=400)
# Predict class probabilities on the test set and keep P(Fully.Paid)
predict_loan_status_rf = predict(rf_model,data_test,"prob")
predict_loan_status_rf = as.data.frame(predict_loan_status_rf)$Fully.Paid
# ROC curve and AUC on the test set
rocCurve_rf = roc(response = data_test$loan_status,
               predictor = predict_loan_status_rf)
auc_curve = auc(rocCurve_rf)
plot(rocCurve_rf,legacy.axes = TRUE,print.auc = TRUE,col="red",main="ROC(RandomForest)")
> rocCurve_rf

Call:
roc.default(response = data_test$loan_status, predictor = predict_loan_status_rf)

Data: predict_loan_status_rf in 5358 controls (data_test$loan_status Default) < 12602 cases (data_test$loan_status Fully.Paid).
Area under the curve: 0.705
> 
# Convert probabilities to class labels at the 0.5 threshold
predict_loan_status_label = ifelse(predict_loan_status_rf<0.5,"Default","Fully.Paid")
# confusionMatrix expects factors with matching levels
c = confusionMatrix(factor(predict_loan_status_label,levels = levels(data_test$loan_status)),data_test$loan_status,positive="Fully.Paid")

# Store the Random Forest results in the performance summary table
table_perf[3,] = c("RandomForest",
  round(auc_curve,3),
  as.numeric(round(c$overall["Accuracy"],3)),
  as.numeric(round(c$byClass["Sensitivity"],3)),
  as.numeric(round(c$byClass["Specificity"],3)),
  as.numeric(round(c$overall["Kappa"],3))
  )
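table_perf is the performance summary table built up over the previous model sections, with one row per model. If you are running this section on its own, a placeholder with the structure implied by the columns used above (a guess, since the original initialisation is not shown here) would look like:

# Hypothetical initialisation of table_perf: one row per model in this
# series. Values end up stored as text because c() coerces the mixed row.
table_perf = data.frame(model = character(3),
                        auc = character(3),
                        accuracy = character(3),
                        sensitivity = character(3),
                        specificity = character(3),
                        kappa = character(3),
                        stringsAsFactors = FALSE)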

The model’s performance is as follows:

> tail(table_perf,1)
         model   auc accuracy sensitivity specificity kappa
3 RandomForest 0.705    0.657       0.666       0.635 0.268
> 