Create a Function and Prepare Test Data in R

Before we can evaluate the model on the test data, the test data must contain the same set of columns as the training data, and we must apply to it all the same transformations that we applied to the training data.

Kept Columns

> str_c("'",paste(colnames(data_train),collapse="','"),"'")
[1] "'term_60_months','sub_gradeA2','sub_gradeA3','sub_gradeA4','sub_gradeA5','sub_gradeB1','sub_gradeB2','sub_gradeB3','sub_gradeB4','sub_gradeB5','sub_gradeC1','sub_gradeC2','sub_gradeC3','sub_gradeC4','sub_gradeC5','sub_gradeD1','sub_gradeD2','sub_gradeD3','sub_gradeD4','sub_gradeD5','sub_gradeE1','sub_gradeE2','sub_gradeE3','sub_gradeE4','sub_gradeE5','sub_gradeF1','sub_gradeF2','sub_gradeF3','sub_gradeF4','sub_gradeF5','sub_gradeG1','sub_gradeG2','sub_gradeG3','sub_gradeG4','sub_gradeG5','emp_lengthn_a','home_ownershipRENT','verification_statusSource_Verified','verification_statusVerified','purposecredit_card','purposedebt_consolidation','purposehome_improvement','purposemajor_purchase','purposemedical','purposemoving','purposeother','purposesmall_business','purposevacation','dti','delinq_2yrs','earliest_cr_line','mths_since_last_delinq','application_typeJoint_App','max_bal_bc','all_util','inq_fi','total_cu_tl','avg_cur_bal','bc_open_to_buy','mort_acc','mths_since_recent_bc','num_actv_bc_tl','num_bc_tl','num_tl_90g_dpd_24m','percent_bc_gt_75','is_ny','is_pa','is_nj','is_co','is_ga','loan_status'"
>
# Columns retained for modelling, in the same order as colnames(data_train)
# printed above. The last entry, loan_status, is the target.
keep_columns <- c(
  "term_60_months",
  # Sub-grade dummies A2..G5; A1 is not part of the list, so drop it.
  paste0("sub_grade", rep(LETTERS[1:7], each = 5), 1:5)[-1],
  "emp_lengthn_a",
  "home_ownershipRENT",
  "verification_statusSource_Verified",
  "verification_statusVerified",
  # Loan purpose dummies.
  "purposecredit_card",
  "purposedebt_consolidation",
  "purposehome_improvement",
  "purposemajor_purchase",
  "purposemedical",
  "purposemoving",
  "purposeother",
  "purposesmall_business",
  "purposevacation",
  # Numeric credit attributes.
  "dti",
  "delinq_2yrs",
  "earliest_cr_line",
  "mths_since_last_delinq",
  "application_typeJoint_App",
  "max_bal_bc",
  "all_util",
  "inq_fi",
  "total_cu_tl",
  "avg_cur_bal",
  "bc_open_to_buy",
  "mort_acc",
  "mths_since_recent_bc",
  "num_actv_bc_tl",
  "num_bc_tl",
  "num_tl_90g_dpd_24m",
  "percent_bc_gt_75",
  # State indicator flags.
  "is_ny",
  "is_pa",
  "is_nj",
  "is_co",
  "is_ga",
  # Target.
  "loan_status"
)

Create Function

#' Apply the fitted feature transformations to a raw loan data set
#'
#' Runs a raw loan data frame through the feature-engineering steps (status
#' consolidation, percentage parsing, imputation, derived features, dummy
#' encoding, centering/scaling, column selection) using models that were
#' fitted beforehand, so held-out data is transformed consistently.
#'
#' @param dt Data frame of raw loans. The columns `loan_status`, `int_rate`,
#'   `revol_util`, `issue_d`, `earliest_cr_line`, `addr_state`, `annual_inc`,
#'   `revol_bal`, `avg_cur_bal`, `bc_open_to_buy`, `funded_amnt` and `purpose`
#'   are read directly.
#' @param use_kept_column Character vector of column names to keep in the
#'   result (after column-name sanitising).
#' @param use_median_impute_model Fitted model whose `predict()` method imputes
#'   missing values (presumably a caret `preProcess` median-impute model;
#'   confirm against the training script).
#' @param use_dummy_model Fitted model whose `predict()` method expands the
#'   categorical columns (presumably a caret `dummyVars` object; confirm).
#' @param use_trans_model Fitted centering/scaling model applied with
#'   `predict()`.
#' @return A data frame containing only `use_kept_column`, with `loan_status`
#'   converted to a factor with levels `"Default"` and `"Fully.Paid"`.
applyFeatureTransformations <- function(dt,
                                        use_kept_column = keep_columns,
                                        use_median_impute_model = median_impute_model,
                                        use_dummy_model = dummy_model,
                                        use_trans_model = trans_model) {
  # Strip the "%" sign from a percentage column and convert it to numeric.
  parse_percent <- function(x) {
    as.numeric(gsub(pattern = "%", replacement = "", x = x, fixed = TRUE))
  }
  # Parse a "month-year" value as the first day of that month.
  first_of_month <- function(x) {
    parse_date_time(str_c("01", x), "dmy")
  }

  # Consolidate loan status: every status that does not mention "Paid" becomes
  # "Default". as.character() guards against factor input, for which ifelse()
  # would otherwise return the integer level codes.
  status <- as.character(dt$loan_status)
  dt$loan_status <- ifelse(str_detect(status, "Paid"), status, "Default")

  # Parse int_rate.
  dt$int_rate <- parse_percent(dt$int_rate)

  # Impute missing values with the model passed in by the caller (the original
  # code ignored the argument and always used the global model).
  dt <- predict(use_median_impute_model, dt)

  # Parse revol_util from its own column (it was previously parsed from
  # int_rate, which silently overwrote revol_util with the interest rate).
  dt$revol_util <- parse_percent(dt$revol_util)

  # Replace earliest_cr_line with the length of the credit history in days at
  # the time the loan was issued.
  credit_age <- first_of_month(dt$issue_d) - first_of_month(dt$earliest_cr_line)
  dt$earliest_cr_line <- as.numeric(credit_age, units = "days")

  # Binary indicator variables for selected addr_state values.
  flagged_states <- c("NY", "PA", "NJ", "OH", "FL", "CO", "GA", "VA", "AZ", "CA")
  for (state in flagged_states) {
    dt[[paste0("is_", tolower(state))]] <- ifelse(dt$addr_state == state, 1, 0)
  }

  # Express monetary amounts relative to the funded loan amount.
  for (amount_col in c("annual_inc", "revol_bal", "avg_cur_bal", "bc_open_to_buy")) {
    dt[[amount_col]] <- dt[[amount_col]] / dt$funded_amnt
  }

  # Map any purpose outside the recognised values to "other".
  all_purpose <- c(
    "debt_consolidation", "small_business", "other", "credit_card",
    "major_purchase", "moving", "home_improvement", "house", "car",
    "medical", "renewable_energy", "vacation", "wedding"
  )
  purpose <- as.character(dt$purpose)
  dt$purpose <- ifelse(purpose %in% all_purpose, purpose, "other")

  # Create dummy variables. The target is set aside first and re-attached
  # afterwards so that it survives the encoding untouched.
  loan_status <- dt$loan_status
  dt <- as.data.frame(predict(use_dummy_model, dt))
  dt$loan_status <- loan_status

  # Center and scale with the supplied, already-fitted model. Re-fitting the
  # scaler on the incoming data (as was done before) would standardise test
  # data with its own statistics instead of the training ones.
  # NOTE(review): assumes use_trans_model was fitted on columns named as they
  # are at this point (before the renaming below) -- confirm against training.
  dt <- predict(use_trans_model, dt)

  # Sanitise column names: spaces, "<" and "/" all become "_".
  colnames(dt) <- str_replace_all(colnames(dt), "[ </]", "_")

  # Remove all unused features, failing early with a readable message.
  missing_cols <- setdiff(use_kept_column, colnames(dt))
  if (length(missing_cols) > 0) {
    stop(
      "Columns missing after transformation: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }
  dt <- dt[use_kept_column]

  # Set loans with status 'Fully Paid' as the positive class. Any status other
  # than "Default" or "Fully Paid" becomes NA because it is not a factor level.
  dt$loan_status <- ifelse(dt$loan_status == "Fully Paid", "Fully.Paid", dt$loan_status)
  dt$loan_status <- factor(dt$loan_status, levels = c("Default", "Fully.Paid"))
  dt
}

Prepare Test Data

We will now take our test data and apply our data transformations to it.

data_test

Before we apply the transformations, we will save a few columns from the test dataset into separate variables. The transformation function drops these columns, and we will need them later to evaluate the investment.

# Set aside the raw grade, funded amount and total payment of each test loan;
# they are needed later to evaluate the investment.
data_test_grade <- data_test[["grade"]]
data_test_funded_amnt <- data_test[["funded_amnt"]]
data_test_total_pymnt <- data_test[["total_pymnt"]]

We will now apply the data transformations using the function we created.

# Replace the raw test set with its transformed, model-ready version.
data_test <- applyFeatureTransformations(dt = data_test)

Related Downloads

Data Science in Finance: 9-Book Bundle

Data Science in Finance Book Bundle

Master R and Python for financial data science with our comprehensive bundle of 9 ebooks.

What's Included:

  • Getting Started with R
  • R Programming for Data Science
  • Data Visualization with R
  • Financial Time Series Analysis with R
  • Quantitative Trading Strategies with R
  • Derivatives with R
  • Credit Risk Modelling With R
  • Python for Data Science
  • Machine Learning in Finance using Python

Each book includes PDFs, explanations, instructions, data files, and R code for all examples.

Get the Bundle for $39 (Regular $57)
JOIN 30,000 DATA PROFESSIONALS

Free Guides - Getting Started with R and Python

Enter your name and email address below and we will email you the guides for R programming and Python.

Data Science in Finance: 9-Book Bundle

Data Science in Finance Book Bundle

Master R and Python for financial data science with our comprehensive bundle of 9 ebooks.

What's Included:

  • Getting Started with R
  • R Programming for Data Science
  • Data Visualization with R
  • Financial Time Series Analysis with R
  • Quantitative Trading Strategies with R
  • Derivatives with R
  • Credit Risk Modelling With R
  • Python for Data Science
  • Machine Learning in Finance using Python

Each book comes with PDFs, detailed explanations, step-by-step instructions, data files, and complete downloadable R code for all examples.