• Skip to primary navigation
  • Skip to main content
  • Skip to primary sidebar
  • Skip to footer
Finance Train

Finance Train

High Quality tutorials for finance, risk, data science

  • Home
  • Data Science
  • CFA® Exam
  • PRM Exam
  • Tutorials
  • Careers
  • Products
  • Login

Create a Function and Prepare Test Data in R

Data Science, Risk Management

This lesson is part 20 of 28 in the course Credit Risk Modelling in R

When we build the model, the test data must contain the same set of columns as the training data, and we must apply to the test data all the same transformations that we applied to the training data.

Kept Columns

> # Print the training columns as one quoted, comma-separated string that can be pasted into c(...).
> str_c("'",paste(colnames(data_train),collapse="','"),"'")
[1] "'term_60_months','sub_gradeA2','sub_gradeA3','sub_gradeA4','sub_gradeA5','sub_gradeB1','sub_gradeB2','sub_gradeB3','sub_gradeB4','sub_gradeB5','sub_gradeC1','sub_gradeC2','sub_gradeC3','sub_gradeC4','sub_gradeC5','sub_gradeD1','sub_gradeD2','sub_gradeD3','sub_gradeD4','sub_gradeD5','sub_gradeE1','sub_gradeE2','sub_gradeE3','sub_gradeE4','sub_gradeE5','sub_gradeF1','sub_gradeF2','sub_gradeF3','sub_gradeF4','sub_gradeF5','sub_gradeG1','sub_gradeG2','sub_gradeG3','sub_gradeG4','sub_gradeG5','emp_lengthn_a','home_ownershipRENT','verification_statusSource_Verified','verification_statusVerified','purposecredit_card','purposedebt_consolidation','purposehome_improvement','purposemajor_purchase','purposemedical','purposemoving','purposeother','purposesmall_business','purposevacation','dti','delinq_2yrs','earliest_cr_line','mths_since_last_delinq','application_typeJoint_App','max_bal_bc','all_util','inq_fi','total_cu_tl','avg_cur_bal','bc_open_to_buy','mort_acc','mths_since_recent_bc','num_actv_bc_tl','num_bc_tl','num_tl_90g_dpd_24m','percent_bc_gt_75','is_ny','is_pa','is_nj','is_co','is_ga','loan_status'"
>
# Columns retained for modelling, in the order printed from colnames(data_train)
# above. The last entry, loan_status, is the response variable.
keep_columns <- c(
    # loan term
    "term_60_months",
    # sub-grade dummies
    "sub_gradeA2", "sub_gradeA3", "sub_gradeA4", "sub_gradeA5",
    "sub_gradeB1", "sub_gradeB2", "sub_gradeB3", "sub_gradeB4", "sub_gradeB5",
    "sub_gradeC1", "sub_gradeC2", "sub_gradeC3", "sub_gradeC4", "sub_gradeC5",
    "sub_gradeD1", "sub_gradeD2", "sub_gradeD3", "sub_gradeD4", "sub_gradeD5",
    "sub_gradeE1", "sub_gradeE2", "sub_gradeE3", "sub_gradeE4", "sub_gradeE5",
    "sub_gradeF1", "sub_gradeF2", "sub_gradeF3", "sub_gradeF4", "sub_gradeF5",
    "sub_gradeG1", "sub_gradeG2", "sub_gradeG3", "sub_gradeG4", "sub_gradeG5",
    # employment, home ownership and verification dummies
    "emp_lengthn_a",
    "home_ownershipRENT",
    "verification_statusSource_Verified",
    "verification_statusVerified",
    # loan purpose dummies
    "purposecredit_card",
    "purposedebt_consolidation",
    "purposehome_improvement",
    "purposemajor_purchase",
    "purposemedical",
    "purposemoving",
    "purposeother",
    "purposesmall_business",
    "purposevacation",
    # numeric features
    "dti",
    "delinq_2yrs",
    "earliest_cr_line",
    "mths_since_last_delinq",
    "application_typeJoint_App",
    "max_bal_bc",
    "all_util",
    "inq_fi",
    "total_cu_tl",
    "avg_cur_bal",
    "bc_open_to_buy",
    "mort_acc",
    "mths_since_recent_bc",
    "num_actv_bc_tl",
    "num_bc_tl",
    "num_tl_90g_dpd_24m",
    "percent_bc_gt_75",
    # state indicator flags
    "is_ny", "is_pa", "is_nj", "is_co", "is_ga",
    # response
    "loan_status"
)

Create Function

#' Apply the feature-engineering pipeline to a loan data set.
#'
#' Consolidates the loan status, parses percentage columns, imputes medians,
#' derives credit-history length and state flags, rescales balances by the
#' funded amount, expands dummy variables, centres/scales, and finally keeps
#' only the requested columns.
#'
#' @param dt Data frame of loans. The body reads the columns loan_status,
#'   int_rate, revol_util, issue_d, earliest_cr_line, addr_state, annual_inc,
#'   revol_bal, avg_cur_bal, bc_open_to_buy, funded_amnt and purpose.
#'   loan_status and purpose are presumably character (not factor) columns,
#'   since they are passed through ifelse() -- TODO confirm against the loader.
#' @param use_kept_column Character vector of column names to keep in the
#'   result (must include "loan_status"). Defaults to `keep_columns`.
#' @param use_median_impute_model Fitted imputation model applied with
#'   predict(). Defaults to `median_impute_model`.
#' @param use_dummy_model Fitted dummy-variable model applied with predict().
#'   Defaults to `dummy_model`.
#' @param use_trans_model Currently unused (see the NOTE(review) in the body).
#' @return A data frame restricted to `use_kept_column`, with loan_status as
#'   a factor with levels "Default" and "Fully.Paid".
applyFeatureTransformations <- function(dt,
                                        use_kept_column = keep_columns,
                                        use_median_impute_model = median_impute_model,
                                        use_dummy_model = dummy_model,
                                        use_trans_model = trans_model) {
    # Consolidate loan status: every status not containing "Paid" becomes "Default".
    dt$loan_status <- ifelse(str_detect(dt$loan_status, "Paid"), dt$loan_status, "Default")

    # Parse int_rate: strip the "%" sign and convert to numeric.
    dt$int_rate <- as.numeric(gsub(pattern = "%", replacement = "", x = dt$int_rate))

    # Impute medians with the model supplied by the caller. The previous code
    # read the global median_impute_model directly, silently ignoring the
    # use_median_impute_model argument.
    dt <- predict(use_median_impute_model, dt)

    # Parse revol_util from its own column. The previous code parsed
    # dt$int_rate here, overwriting revol_util with the interest rate.
    dt$revol_util <- as.numeric(gsub(pattern = "%", replacement = "", x = dt$revol_util))

    # Replace earliest_cr_line by the number of days between it and issue_d.
    # Both are month-level values, so "01" is prepended to form a parsable
    # day-month-year string.
    issue_date <- parse_date_time(str_c("01", dt$issue_d), "dmy")
    first_credit_date <- parse_date_time(str_c("01", dt$earliest_cr_line), "dmy")
    dt$earliest_cr_line <- as.numeric(issue_date - first_credit_date, units = "days")

    # Binary variables for addr_state: is_ny, is_pa, ... (1 if the loan is from that state).
    flagged_states <- c("NY", "PA", "NJ", "OH", "FL", "CO", "GA", "VA", "AZ", "CA")
    for (state in flagged_states) {
        dt[[str_c("is_", tolower(state))]] <- ifelse(dt$addr_state == state, 1, 0)
    }

    # Express income and balances relative to the funded amount.
    dt$annual_inc <- dt$annual_inc / dt$funded_amnt
    dt$revol_bal <- dt$revol_bal / dt$funded_amnt
    dt$avg_cur_bal <- dt$avg_cur_bal / dt$funded_amnt
    dt$bc_open_to_buy <- dt$bc_open_to_buy / dt$funded_amnt

    # Map any purpose outside the recognized values to "other".
    all_purpose <- c(
        "debt_consolidation", "small_business", "other", "credit_card",
        "major_purchase", "moving", "home_improvement", "house", "car",
        "medical", "renewable_energy", "vacation", "wedding"
    )
    dt$purpose <- ifelse(dt$purpose %in% all_purpose, dt$purpose, "other")

    # Create dummy variables. loan_status is set aside first and re-attached
    # afterwards so it is carried through unchanged.
    loan_status <- dt$loan_status
    dt <- as.data.frame(predict(use_dummy_model, dt))
    dt$loan_status <- loan_status

    # Center and scale the data.
    # NOTE(review): the scaler is re-fitted on the incoming data, so
    # use_trans_model is never used. If the intent is to reuse the training
    # scaler, this should become predict(use_trans_model, dt) -- confirm the
    # training-time column set matches before switching.
    trans_model_test <- preProcess(dt, method = c("center", "scale"))
    dt <- predict(trans_model_test, dt)

    # Normalise column names (" ", "<" and "/" become "_"), then drop all unused features.
    colnames(dt) <- str_replace_all(colnames(dt), "[ </]", "_")
    dt <- dt[use_kept_column]

    # Set loans with status "Fully Paid" as the positive sample: second factor level.
    dt$loan_status <- ifelse(dt$loan_status == "Fully Paid", "Fully.Paid", dt$loan_status)
    dt$loan_status <- factor(dt$loan_status, levels = c("Default", "Fully.Paid"))

    dt
}

Prepare Test Data

We will now take our test data and apply our data transformations to it.

# Evaluating the bare name auto-prints the test set at the console for inspection.
data_test

Before we apply the transformations, we will save a few columns from the test dataset into separate variables, which we will use later to evaluate the investment.

# Set aside the raw columns used later to evaluate the investment. They are
# not in keep_columns, so they are no longer available on data_test once the
# feature transformations have been applied.
data_test_grade <- data_test$grade
data_test_funded_amnt <- data_test$funded_amnt
data_test_total_pymnt <- data_test$total_pymnt

We will now apply the data transformations using the function we created.

# Overwrite the raw test set with its transformed, model-ready version.
data_test <- applyFeatureTransformations(data_test)
Previous Lesson

‹ Remove Dimensions By Fitting Logistic Regression

Next Lesson

Building Credit Risk Model ›

Join Our Facebook Group - Finance, Risk and Data Science

Posts You May Like

How to Improve your Financial Health

CFA® Exam Overview and Guidelines (Updated for 2021)

Changing Themes (Look and Feel) in ggplot2 in R

Coordinates in ggplot2 in R

Facets for ggplot2 Charts in R (Faceting Layer)

Reader Interactions

Leave a Reply Cancel reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.

Primary Sidebar

In this Course

  • Credit Risk Modelling – Case Studies
  • Classification vs. Regression Models
  • Case Study – German Credit – Steps to Build a Predictive Model
  • Import Credit Data Set in R
  • German Credit Data : Data Preprocessing and Feature Selection in R
  • Credit Modelling: Training and Test Data Sets
  • Build the Predictive Model
  • Logistic Regression Model in R
  • Measure Model Performance in R Using ROCR Package
  • Create a Confusion Matrix in R
  • Credit Risk Modelling – Case Study- Lending Club Data
  • Explore Loan Data in R – Loan Grade and Interest Rate
  • Credit Risk Modelling – Required R Packages
  • Loan Data – Training and Test Data Sets
  • Data Cleaning in R – Part 1
  • Data Cleaning in R – Part 2
  • Data Cleaning in R – Part 3
  • Data Cleaning in R – Part 5
  • Remove Dimensions By Fitting Logistic Regression
  • Create a Function and Prepare Test Data in R
  • Building Credit Risk Model
  • Credit Risk – Logistic Regression Model in R
  • Support Vector Machine (SVM) Model in R
  • Random Forest Model in R
  • Extreme Gradient Boosting in R
  • Predictive Modelling: Averaging Results from Multiple Models
  • Predictive Modelling: Comparing Model Results
  • How Insurance Companies Calculate Risk

Latest Tutorials

    • Data Visualization with R
    • Derivatives with R
    • Machine Learning in Finance Using Python
    • Credit Risk Modelling in R
    • Quantitative Trading Strategies in R
    • Financial Time Series Analysis in R
    • VaR Mapping
    • Option Valuation
    • Financial Reporting Standards
    • Fraud
Facebook Group

Membership

Unlock full access to Finance Train and see the entire library of member-only content and resources.

Subscribe

Footer

Recent Posts

  • How to Improve your Financial Health
  • CFA® Exam Overview and Guidelines (Updated for 2021)
  • Changing Themes (Look and Feel) in ggplot2 in R
  • Coordinates in ggplot2 in R
  • Facets for ggplot2 Charts in R (Faceting Layer)

Products

  • Level I Authority for CFA® Exam
  • CFA Level I Practice Questions
  • CFA Level I Mock Exam
  • Level II Question Bank for CFA® Exam
  • PRM Exam 1 Practice Question Bank
  • All Products

Quick Links

  • Privacy Policy
  • Contact Us

CFA Institute does not endorse, promote or warrant the accuracy or quality of Finance Train. CFA® and Chartered Financial Analyst® are registered trademarks owned by CFA Institute.

Copyright © 2021 Finance Train. All rights reserved.

  • About Us
  • Privacy Policy
  • Contact Us