- Credit Risk Modelling - Case Studies
- Classification vs. Regression Models
- Case Study - German Credit - Steps to Build a Predictive Model
- Import Credit Data Set in R
- German Credit Data : Data Preprocessing and Feature Selection in R
- Credit Modelling: Training and Test Data Sets
- Build the Predictive Model
- Logistic Regression Model in R
- Measure Model Performance in R Using ROCR Package
- Create a Confusion Matrix in R
- Credit Risk Modelling - Case Study- Lending Club Data
- Explore Loan Data in R - Loan Grade and Interest Rate
- Credit Risk Modelling - Required R Packages
- Loan Data - Training and Test Data Sets
- Data Cleaning in R - Part 1
- Data Cleaning in R - Part 2
- Data Cleaning in R - Part 3
- Data Cleaning in R - Part 5
- Remove Dimensions By Fitting Logistic Regression
- Create a Function and Prepare Test Data in R
- Building Credit Risk Model
- Credit Risk - Logistic Regression Model in R
- Support Vector Machine (SVM) Model in R
- Random Forest Model in R
- Extreme Gradient Boosting in R
- Predictive Modelling: Averaging Results from Multiple Models
- Predictive Modelling: Comparing Model Results
- How Insurance Companies Calculate Risk
Create a Function and Prepare Test Data in R
When we build the model, the test data must contain the same set of columns as the training data, and we must apply to the test data all the same transformations that we applied to the training data.
Kept Columns
> str_c("'",paste(colnames(data_train),collapse="','"),"'")
[1] "'term_60_months','sub_gradeA2','sub_gradeA3','sub_gradeA4','sub_gradeA5','sub_gradeB1','sub_gradeB2','sub_gradeB3','sub_gradeB4','sub_gradeB5','sub_gradeC1','sub_gradeC2','sub_gradeC3','sub_gradeC4','sub_gradeC5','sub_gradeD1','sub_gradeD2','sub_gradeD3','sub_gradeD4','sub_gradeD5','sub_gradeE1','sub_gradeE2','sub_gradeE3','sub_gradeE4','sub_gradeE5','sub_gradeF1','sub_gradeF2','sub_gradeF3','sub_gradeF4','sub_gradeF5','sub_gradeG1','sub_gradeG2','sub_gradeG3','sub_gradeG4','sub_gradeG5','emp_lengthn_a','home_ownershipRENT','verification_statusSource_Verified','verification_statusVerified','purposecredit_card','purposedebt_consolidation','purposehome_improvement','purposemajor_purchase','purposemedical','purposemoving','purposeother','purposesmall_business','purposevacation','dti','delinq_2yrs','earliest_cr_line','mths_since_last_delinq','application_typeJoint_App','max_bal_bc','all_util','inq_fi','total_cu_tl','avg_cur_bal','bc_open_to_buy','mort_acc','mths_since_recent_bc','num_actv_bc_tl','num_bc_tl','num_tl_90g_dpd_24m','percent_bc_gt_75','is_ny','is_pa','is_nj','is_co','is_ga','loan_status'"
>
# Names of the columns retained in the final data set, in the order printed
# from colnames(data_train) above; the last entry is the loan_status column.
keep_columns <- c(
  # loan term
  "term_60_months",
  # sub-grade dummy columns
  "sub_gradeA2", "sub_gradeA3", "sub_gradeA4", "sub_gradeA5",
  "sub_gradeB1", "sub_gradeB2", "sub_gradeB3", "sub_gradeB4", "sub_gradeB5",
  "sub_gradeC1", "sub_gradeC2", "sub_gradeC3", "sub_gradeC4", "sub_gradeC5",
  "sub_gradeD1", "sub_gradeD2", "sub_gradeD3", "sub_gradeD4", "sub_gradeD5",
  "sub_gradeE1", "sub_gradeE2", "sub_gradeE3", "sub_gradeE4", "sub_gradeE5",
  "sub_gradeF1", "sub_gradeF2", "sub_gradeF3", "sub_gradeF4", "sub_gradeF5",
  "sub_gradeG1", "sub_gradeG2", "sub_gradeG3", "sub_gradeG4", "sub_gradeG5",
  # employment, home ownership and verification dummy columns
  "emp_lengthn_a",
  "home_ownershipRENT",
  "verification_statusSource_Verified",
  "verification_statusVerified",
  # loan purpose dummy columns
  "purposecredit_card",
  "purposedebt_consolidation",
  "purposehome_improvement",
  "purposemajor_purchase",
  "purposemedical",
  "purposemoving",
  "purposeother",
  "purposesmall_business",
  "purposevacation",
  # numeric credit-history columns
  "dti",
  "delinq_2yrs",
  "earliest_cr_line",
  "mths_since_last_delinq",
  "application_typeJoint_App",
  "max_bal_bc",
  "all_util",
  "inq_fi",
  "total_cu_tl",
  "avg_cur_bal",
  "bc_open_to_buy",
  "mort_acc",
  "mths_since_recent_bc",
  "num_actv_bc_tl",
  "num_bc_tl",
  "num_tl_90g_dpd_24m",
  "percent_bc_gt_75",
  # state indicator columns
  "is_ny", "is_pa", "is_nj", "is_co", "is_ga",
  # loan outcome
  "loan_status"
)
Create Function
# Apply the feature transformations to a loan data set so that it ends up
# with the columns listed in `use_kept_column`.
#
# Arguments:
#   dt                      Data frame of loan records. The body reads the
#                           columns loan_status, int_rate, revol_util, issue_d,
#                           earliest_cr_line, addr_state, annual_inc, revol_bal,
#                           avg_cur_bal, bc_open_to_buy, funded_amnt and purpose.
#   use_kept_column         Character vector of column names to keep.
#   use_median_impute_model Fitted model; predict(model, dt) must return dt
#                           with missing values imputed.
#   use_dummy_model         Fitted model; predict(model, dt) must return the
#                           dummy-encoded feature columns.
#   use_trans_model         Fitted center/scale model; predict(model, dt) must
#                           return dt with the numeric columns transformed.
#                           NOTE(review): assumed to have been fitted on the
#                           dummy-encoded training data -- confirm.
#
# Returns:
#   A data frame restricted to `use_kept_column`, in which loan_status is a
#   factor with levels "Default" and "Fully.Paid".
applyFeatureTransformations <- function(dt,
                                        use_kept_column = keep_columns,
                                        use_median_impute_model = median_impute_model,
                                        use_dummy_model = dummy_model,
                                        use_trans_model = trans_model) {
  # consolidate loan status: keep every status containing "Paid",
  # map everything else to "Default"
  dt$loan_status <- ifelse(
    str_detect(dt$loan_status, "Paid"), dt$loan_status, "Default"
  )

  # parse int_rate: strip the "%" sign and convert to numeric
  dt$int_rate <- as.numeric(
    gsub(pattern = "%", replacement = "", x = dt$int_rate)
  )

  # impute median with the model passed in by the caller
  # (previously the global median_impute_model was used and the
  # use_median_impute_model argument was ignored)
  dt <- predict(use_median_impute_model, dt)

  # parse revol_util from its own column
  # (previously this was computed from int_rate by mistake)
  dt$revol_util <- as.numeric(
    gsub(pattern = "%", replacement = "", x = dt$revol_util)
  )

  # replace earliest_cr_line by the number of days between the earliest
  # credit line and the issue date; "01" is prepended before parsing both
  # values as day-month-year
  issue_date <- parse_date_time(str_c("01", dt$issue_d), "dmy")
  first_credit_date <- parse_date_time(str_c("01", dt$earliest_cr_line), "dmy")
  dt$earliest_cr_line <- as.numeric(
    issue_date - first_credit_date, units = "days"
  )

  # binary variables for addr_state (1 when the loan is from that state)
  state_flags <- c(
    is_ny = "NY", is_pa = "PA", is_nj = "NJ", is_oh = "OH", is_fl = "FL",
    is_co = "CO", is_ga = "GA", is_va = "VA", is_az = "AZ", is_ca = "CA"
  )
  for (flag in names(state_flags)) {
    dt[[flag]] <- ifelse(dt$addr_state == state_flags[[flag]], 1, 0)
  }

  # transform transactions: express each amount relative to the funded amount
  for (amount_col in c("annual_inc", "revol_bal", "avg_cur_bal", "bc_open_to_buy")) {
    dt[[amount_col]] <- dt[[amount_col]] / dt$funded_amnt
  }

  # if purpose falling outside of recognized values, treat it as "other"
  all_purpose <- c(
    "debt_consolidation", "small_business", "other", "credit_card",
    "major_purchase", "moving", "home_improvement", "house", "car",
    "medical", "renewable_energy", "vacation", "wedding"
  )
  dt$purpose <- ifelse(dt$purpose %in% all_purpose, dt$purpose, "other")

  # create dummy variables; loan_status is set aside first and re-attached
  # afterwards
  loan_status <- dt$loan_status
  dt <- as.data.frame(predict(use_dummy_model, dt))
  dt$loan_status <- loan_status

  # center,scale data with the model passed in by the caller
  # (previously a new model was fitted on dt itself, so the data was scaled
  # with its own statistics and use_trans_model was never used)
  dt <- predict(use_trans_model, dt)

  # make column names match use_kept_column: " ", "<" and "/" become "_"
  colnames(dt) <- str_replace_all(colnames(dt), "[ </]", "_")

  # remove all unused features
  dt <- dt[use_kept_column]

  # set loan with status 'Fully Paid' as a positive sample; any status other
  # than "Default" or "Fully Paid" becomes NA in the factor
  dt$loan_status <- ifelse(
    dt$loan_status == "Fully Paid", "Fully.Paid", dt$loan_status
  )
  dt$loan_status <- factor(dt$loan_status, levels = c("Default", "Fully.Paid"))

  dt
}
Prepare Test Data
We will now take our test data and apply our data transformations to it.
# Print the test data set to inspect it before applying the transformations
data_test
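Before we apply the transformations, we will save a few columns from the test dataset in separate variables. These columns are not among the kept columns, so they will be dropped by the transformations, but we will need them later to evaluate the investment.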
# Copies of the test-set columns used later to evaluate the investment
data_test_grade <- data_test$grade
data_test_funded_amnt <- data_test$funded_amnt
data_test_total_pymnt <- data_test$total_pymnt
We will now apply the data transformations using the function we created.
# Replace the test data with its transformed version
data_test <- applyFeatureTransformations(dt = data_test)
Related Downloads
Data Science in Finance: 9-Book Bundle
Master R and Python for financial data science with our comprehensive bundle of 9 ebooks.
What's Included:
- Getting Started with R
- R Programming for Data Science
- Data Visualization with R
- Financial Time Series Analysis with R
- Quantitative Trading Strategies with R
- Derivatives with R
- Credit Risk Modelling With R
- Python for Data Science
- Machine Learning in Finance using Python
Each book includes PDFs, explanations, instructions, data files, and R code for all examples.
Get the Bundle for $39 (Regular $57)
Free Guides - Getting Started with R and Python
Enter your name and email address below and we will email you the guides for R programming and Python.