Lessons
- Credit Risk Modelling - Case Studies
- Classification vs. Regression Models
- Case Study - German Credit - Steps to Build a Predictive Model
- Import Credit Data Set in R
- German Credit Data : Data Preprocessing and Feature Selection in R
- Credit Modelling: Training and Test Data Sets
- Build the Predictive Model
- Logistic Regression Model in R
- Measure Model Performance in R Using ROCR Package
- Create a Confusion Matrix in R
- Credit Risk Modelling - Case Study- Lending Club Data
- Explore Loan Data in R - Loan Grade and Interest Rate
- Credit Risk Modelling - Required R Packages
- Loan Data - Training and Test Data Sets
- Data Cleaning in R - Part 1
- Data Cleaning in R - Part 2
- Data Cleaning in R - Part 3
- Data Cleaning in R - Part 5
- Remove Dimensions By Fitting Logistic Regression
- Create a Function and Prepare Test Data in R
- Building Credit Risk Model
- Credit Risk - Logistic Regression Model in R
- Support Vector Machine (SVM) Model in R
- Random Forest Model in R
- Extreme Gradient Boosting in R
- Predictive Modelling: Averaging Results from Multiple Models
- Predictive Modelling: Comparing Model Results
- How Insurance Companies Calculate Risk
Create a Function and Prepare Test Data in R
When we build the model, the test data must contain the same set of columns as the training data, and every transformation that we applied to the training data must also be applied to the test data.
Kept Columns
> str_c("'",paste(colnames(data_train),collapse="','"),"'")
[1] "'term_60_months','sub_gradeA2','sub_gradeA3','sub_gradeA4','sub_gradeA5','sub_gradeB1','sub_gradeB2','sub_gradeB3','sub_gradeB4','sub_gradeB5','sub_gradeC1','sub_gradeC2','sub_gradeC3','sub_gradeC4','sub_gradeC5','sub_gradeD1','sub_gradeD2','sub_gradeD3','sub_gradeD4','sub_gradeD5','sub_gradeE1','sub_gradeE2','sub_gradeE3','sub_gradeE4','sub_gradeE5','sub_gradeF1','sub_gradeF2','sub_gradeF3','sub_gradeF4','sub_gradeF5','sub_gradeG1','sub_gradeG2','sub_gradeG3','sub_gradeG4','sub_gradeG5','emp_lengthn_a','home_ownershipRENT','verification_statusSource_Verified','verification_statusVerified','purposecredit_card','purposedebt_consolidation','purposehome_improvement','purposemajor_purchase','purposemedical','purposemoving','purposeother','purposesmall_business','purposevacation','dti','delinq_2yrs','earliest_cr_line','mths_since_last_delinq','application_typeJoint_App','max_bal_bc','all_util','inq_fi','total_cu_tl','avg_cur_bal','bc_open_to_buy','mort_acc','mths_since_recent_bc','num_actv_bc_tl','num_bc_tl','num_tl_90g_dpd_24m','percent_bc_gt_75','is_ny','is_pa','is_nj','is_co','is_ga','loan_status'"
>
# Names of the columns retained for modelling, in the order printed from
# colnames(data_train) above. The response column, loan_status, comes last.
keep_columns <- c(
  "term_60_months",
  # Sub-grade dummies A2, A3, ..., G5 (A1 is not in the list, hence the [-1]).
  paste0("sub_grade", rep(LETTERS[1:7], each = 5), 1:5)[-1],
  "emp_lengthn_a",
  "home_ownershipRENT",
  "verification_statusSource_Verified",
  "verification_statusVerified",
  "purposecredit_card",
  "purposedebt_consolidation",
  "purposehome_improvement",
  "purposemajor_purchase",
  "purposemedical",
  "purposemoving",
  "purposeother",
  "purposesmall_business",
  "purposevacation",
  "dti",
  "delinq_2yrs",
  "earliest_cr_line",
  "mths_since_last_delinq",
  "application_typeJoint_App",
  "max_bal_bc",
  "all_util",
  "inq_fi",
  "total_cu_tl",
  "avg_cur_bal",
  "bc_open_to_buy",
  "mort_acc",
  "mths_since_recent_bc",
  "num_actv_bc_tl",
  "num_bc_tl",
  "num_tl_90g_dpd_24m",
  "percent_bc_gt_75",
  "is_ny",
  "is_pa",
  "is_nj",
  "is_co",
  "is_ga",
  "loan_status"
)
Create Function
#' Apply the feature transformations to a loan data set
#'
#' Runs the full preparation pipeline on `dt`: consolidates `loan_status`,
#' parses percentage strings, imputes, derives date/state/ratio features,
#' dummy-encodes, centres and scales, and finally keeps only the requested
#' columns.
#'
#' Uses `str_detect`/`str_c`/`str_replace_all` (presumably stringr),
#' `parse_date_time` (presumably lubridate) and `predict` methods for the
#' supplied model objects (presumably caret); these must already be attached.
#'
#' @param dt Data frame of raw loans. The code reads the columns
#'   `loan_status`, `int_rate`, `revol_util`, `issue_d`, `earliest_cr_line`,
#'   `addr_state`, `funded_amnt`, `annual_inc`, `revol_bal`, `avg_cur_bal`,
#'   `bc_open_to_buy` and `purpose`.
#' @param use_kept_column Character vector of column names to keep in the
#'   result (must include `loan_status`).
#' @param use_median_impute_model Fitted imputation model applied with
#'   `predict()`.
#' @param use_dummy_model Fitted dummy-variable model applied with `predict()`.
#' @param use_trans_model Fitted centre/scale model applied with `predict()`.
#' @return A data frame restricted to `use_kept_column`, with `loan_status`
#'   as a factor with levels `"Default"` and `"Fully.Paid"`.
applyFeatureTransformations <- function(dt,
                                        use_kept_column = keep_columns,
                                        use_median_impute_model = median_impute_model,
                                        use_dummy_model = dummy_model,
                                        use_trans_model = trans_model) {
  # Consolidate loan status: any status containing "Paid" is kept as is,
  # everything else is collapsed into "Default".
  dt$loan_status <- ifelse(str_detect(dt$loan_status, "Paid"), dt$loan_status, "Default")

  # Parse int_rate: strip the "%" sign and convert to numeric.
  dt$int_rate <- as.numeric(gsub(pattern = "%", replacement = "", x = dt$int_rate))

  # Impute with the model that was passed in. The previous code ignored the
  # parameter and always read the global `median_impute_model`.
  dt <- predict(use_median_impute_model, dt)

  # Parse revol_util from its own column. The previous code read dt$int_rate
  # here, which silently overwrote revol_util with the interest rate.
  dt$revol_util <- as.numeric(gsub(pattern = "%", replacement = "", x = dt$revol_util))

  # Replace earliest_cr_line with the number of days between the first day of
  # the earliest-credit-line month and the first day of the issue month.
  dt$earliest_cr_line <- parse_date_time(str_c("01", dt$issue_d), "dmy") -
    parse_date_time(str_c("01", dt$earliest_cr_line), "dmy")
  dt$earliest_cr_line <- as.numeric(dt$earliest_cr_line, units = "days")

  # Binary indicator columns (is_ny, is_pa, ...) for selected states. All ten
  # are created, in the original order, even though only some are kept at the
  # end, because the later predict() steps may expect them to exist.
  flagged_states <- c("NY", "PA", "NJ", "OH", "FL", "CO", "GA", "VA", "AZ", "CA")
  for (state in flagged_states) {
    dt[[paste0("is_", tolower(state))]] <- ifelse(dt$addr_state == state, 1, 0)
  }

  # Express monetary amounts as a ratio of the funded loan amount.
  ratio_columns <- c("annual_inc", "revol_bal", "avg_cur_bal", "bc_open_to_buy")
  for (column in ratio_columns) {
    dt[[column]] <- dt[[column]] / dt$funded_amnt
  }

  # Map any purpose outside the recognised values to "other".
  all_purpose <- c(
    "debt_consolidation", "small_business", "other", "credit_card",
    "major_purchase", "moving", "home_improvement", "house", "car",
    "medical", "renewable_energy", "vacation", "wedding"
  )
  dt$purpose <- ifelse(dt$purpose %in% all_purpose, dt$purpose, "other")

  # Create dummy variables. loan_status is set aside and re-attached because
  # the dummy-model prediction does not carry it through.
  loan_status <- dt$loan_status
  dt <- as.data.frame(predict(use_dummy_model, dt))
  dt$loan_status <- loan_status

  # Centre and scale with the supplied model. The previous code fitted a new
  # preProcess on `dt` itself and never used `use_trans_model`, so the data was
  # standardised with its own statistics.
  # NOTE(review): assumes use_trans_model was fitted on the dummy-encoded
  # columns before they are renamed below -- confirm against the training code.
  dt <- predict(use_trans_model, dt)

  # Make column names syntactic, then drop every feature that is not kept.
  colnames(dt) <- str_replace_all(colnames(dt), " ", "_")
  colnames(dt) <- str_replace_all(colnames(dt), "<", "_")
  colnames(dt) <- str_replace_all(colnames(dt), "/", "_")
  dt <- dt[use_kept_column]

  # 'Fully Paid' is the positive class. Any status other than "Default" or
  # "Fully Paid" becomes NA because it is not among the factor levels.
  dt$loan_status <- ifelse(dt$loan_status == "Fully Paid", "Fully.Paid", dt$loan_status)
  dt$loan_status <- factor(dt$loan_status, levels = c("Default", "Fully.Paid"))

  dt
}
Prepare Test Data
We will now take our test data and apply our data transformations to it.
data_test
This content is for paid members only.
Join our membership for lifelong unlimited access to all our data science learning content and resources.