Create a Function and Prepare Test Data in R
Premium
When we build the model, the test data must contain the same set of columns as the training data, and we must apply to the test data all the same transformations that we applied to the training data.
Kept Columns
1> str_c("'",paste(colnames(data_train),collapse="','"),"'")
2[1] "'term_60_months','sub_gradeA2','sub_gradeA3','sub_gradeA4','sub_gradeA5','sub_gradeB1','sub_gradeB2','sub_gradeB3','sub_gradeB4','sub_gradeB5','sub_gradeC1','sub_gradeC2','sub_gradeC3','sub_gradeC4','sub_gradeC5','sub_gradeD1','sub_gradeD2','sub_gradeD3','sub_gradeD4','sub_gradeD5','sub_gradeE1','sub_gradeE2','sub_gradeE3','sub_gradeE4','sub_gradeE5','sub_gradeF1','sub_gradeF2','sub_gradeF3','sub_gradeF4','sub_gradeF5','sub_gradeG1','sub_gradeG2','sub_gradeG3','sub_gradeG4','sub_gradeG5','emp_lengthn_a','home_ownershipRENT','verification_statusSource_Verified','verification_statusVerified','purposecredit_card','purposedebt_consolidation','purposehome_improvement','purposemajor_purchase','purposemedical','purposemoving','purposeother','purposesmall_business','purposevacation','dti','delinq_2yrs','earliest_cr_line','mths_since_last_delinq','application_typeJoint_App','max_bal_bc','all_util','inq_fi','total_cu_tl','avg_cur_bal','bc_open_to_buy','mort_acc','mths_since_recent_bc','num_actv_bc_tl','num_bc_tl','num_tl_90g_dpd_24m','percent_bc_gt_75','is_ny','is_pa','is_nj','is_co','is_ga','loan_status'"
3>
4
# Columns retained for modelling, in the same order as colnames(data_train).
# Sub-grade dummies run A2..G5 (A1 has no column of its own).
sub_grade_columns <- paste0("sub_grade", rep(LETTERS[1:7], each = 5), 1:5)[-1]
purpose_columns <- paste0(
  "purpose",
  c(
    "credit_card", "debt_consolidation", "home_improvement",
    "major_purchase", "medical", "moving", "other",
    "small_business", "vacation"
  )
)
keep_columns <- c(
  "term_60_months",
  sub_grade_columns,
  "emp_lengthn_a",
  "home_ownershipRENT",
  "verification_statusSource_Verified",
  "verification_statusVerified",
  purpose_columns,
  "dti",
  "delinq_2yrs",
  "earliest_cr_line",
  "mths_since_last_delinq",
  "application_typeJoint_App",
  "max_bal_bc",
  "all_util",
  "inq_fi",
  "total_cu_tl",
  "avg_cur_bal",
  "bc_open_to_buy",
  "mort_acc",
  "mths_since_recent_bc",
  "num_actv_bc_tl",
  "num_bc_tl",
  "num_tl_90g_dpd_24m",
  "percent_bc_gt_75",
  "is_ny", "is_pa", "is_nj", "is_co", "is_ga",
  "loan_status"
)
2
Create Function
# Apply the feature-engineering steps to a raw loan data frame so that it ends
# up with the columns listed in `use_kept_column`.
#
# Arguments:
#   dt                       Data frame of raw loans. The body reads the columns
#                            loan_status, int_rate, revol_util, issue_d,
#                            earliest_cr_line, addr_state, annual_inc,
#                            revol_bal, avg_cur_bal, bc_open_to_buy,
#                            funded_amnt and purpose.
#   use_kept_column          Columns to keep in the result
#                            (default: the global `keep_columns`).
#   use_median_impute_model  Fitted model handed to predict() for imputation
#                            (default: the global `median_impute_model`).
#   use_dummy_model          Fitted model handed to predict() to build the
#                            dummy variables (default: the global `dummy_model`).
#   use_trans_model          Accepted for interface compatibility; not used by
#                            the body (see the review note at the scaling step).
#
# Returns:
#   `dt` restricted to `use_kept_column`, with `loan_status` converted to a
#   factor with levels "Default" and "Fully.Paid".
applyFeatureTransformations <- function(dt,
                                        use_kept_column = keep_columns,
                                        use_median_impute_model = median_impute_model,
                                        use_dummy_model = dummy_model,
                                        use_trans_model = trans_model) {
  # consolidate loan status: anything that does not mention "Paid" is a default
  dt$loan_status <- ifelse(
    str_detect(dt$loan_status, "Paid"), dt$loan_status, "Default"
  )

  # parse int_rate ("12.5%" -> 12.5)
  dt$int_rate <- as.numeric(gsub(pattern = "%", replacement = "", x = dt$int_rate))

  # impute with the model passed by the caller (previously the global
  # `median_impute_model` was used and the argument was silently ignored)
  dt <- predict(use_median_impute_model, dt)

  # parse revol_util from its own column (previously read from int_rate,
  # which overwrote revol_util with the interest rate)
  dt$revol_util <- as.numeric(gsub(pattern = "%", replacement = "", x = dt$revol_util))

  # credit history length in days: issue date minus earliest credit line,
  # both parsed as the first day of their month
  dt$earliest_cr_line <- parse_date_time(str_c("01", dt$issue_d), "dmy") -
    parse_date_time(str_c("01", dt$earliest_cr_line), "dmy")
  dt$earliest_cr_line <- as.numeric(dt$earliest_cr_line, units = "days")

  # binary variables for addr_state (is_ny, is_pa, ...), created in this order
  flagged_states <- c("NY", "PA", "NJ", "OH", "FL", "CO", "GA", "VA", "AZ", "CA")
  for (state in flagged_states) {
    dt[[paste0("is_", tolower(state))]] <- ifelse(dt$addr_state == state, 1, 0)
  }

  # express monetary amounts relative to the funded amount
  for (amount_column in c("annual_inc", "revol_bal", "avg_cur_bal", "bc_open_to_buy")) {
    dt[[amount_column]] <- dt[[amount_column]] / dt$funded_amnt
  }

  # map any purpose outside the recognized values to "other"
  all_purpose <- c(
    "debt_consolidation", "small_business", "other", "credit_card",
    "major_purchase", "moving", "home_improvement", "house", "car",
    "medical", "renewable_energy", "vacation", "wedding"
  )
  dt$purpose <- ifelse(dt$purpose %in% all_purpose, dt$purpose, "other")

  # create dummy variables; loan_status is set aside and re-attached afterwards
  loan_status <- dt$loan_status
  dt <- as.data.frame(predict(use_dummy_model, dt))
  dt$loan_status <- loan_status

  # center, scale data
  # NOTE(review): the scaler is refit on `dt` itself and `use_trans_model` is
  # never applied, so this data is standardized with its own means/sds.
  # Confirm whether predict(use_trans_model, dt) was intended before changing it.
  trans_model_test <- preProcess(dt, method = c("center", "scale"))
  dt <- predict(trans_model_test, dt)

  # make column names syntactic: spaces, "<" and "/" become "_"
  colnames(dt) <- str_replace_all(colnames(dt), "[ </]", "_")

  # remove all unused features
  if (is.character(use_kept_column)) {
    missing_columns <- setdiff(use_kept_column, colnames(dt))
    if (length(missing_columns) > 0) {
      stop(
        "Columns missing after transformation: ",
        paste(missing_columns, collapse = ", "),
        call. = FALSE
      )
    }
  }
  dt <- dt[use_kept_column]

  # set loan with status 'Fully Paid' as a positive sample
  dt$loan_status <- ifelse(dt$loan_status == "Fully Paid", "Fully.Paid", dt$loan_status)
  dt$loan_status <- factor(dt$loan_status, levels = c("Default", "Fully.Paid"))
  dt
}
48
Prepare Test Data
We will now take our test data and apply our data transformations to it.
data_test
Before we apply the transformations, we will create some variables with some data from the test dataset which we will use later for evaluating the investment.
# Set aside raw test-set columns now, before data_test is transformed;
# they are used later to evaluate the investment.
data_test_grade <- data_test$grade
data_test_funded_amnt <- data_test$funded_amnt
data_test_total_pymnt <- data_test$total_pymnt
5
We will now apply the data transformations using the function we created.
# Overwrite data_test with the result of applyFeatureTransformations(),
# using the function's default arguments.
data_test = applyFeatureTransformations(data_test)