Data Cleaning in R - Part 1
Discarding Attributes
LendingClub also provides a data dictionary that contains details of all attributes of out dataset. We can use that dictionary to understand more about the data columns we have and remove columns that may not impact the loan default.
Discard Attributes
We can use the data dictionary to identify and discard some attributes which we think are irrelevant or will have little impact on loan default.
1discard_column = c("collection_recovery_fee","emp_title",
2 "funded_amnt_inv","id",
3 "installment","last_credit_pull_d",
4 "last_fico_range_high","last_fico_range_low",
5 "last_pymnt_amnt","last_pymnt_d",
6 "loan_amnt","member_id",
7 "next_pymnt_d","num_tl_120dpd_2m",
8 "num_tl_30dpd","out_prncp",
9 "out_prncp_inv","recoveries",
10 "total_pymnt","total_pymnt_inv",
11 "total_rec_int","total_rec_late_fee",
12 "total_rec_prncp","url",
13 "zip_code"
14 )
15
1> data_train = (data_train[,!(names(data_train) %in% discard_column)])
2> dim(data_train)
3[1] 41909 122
4>
5
Discard Grade Attribute
We will also drop the grade
attribute as the grade information is also available in sub_grade
.
> data_train$grade = NULL
Discard Columns with Too Many NAs
We will also discard columns with too many NAs. We will discard columns where the percentage is higher than 50% NAs.
1> data_train <- data_train[, -which(colMeans(is.na(data_train)) > 0.5)]
2
For the rest of the columns in NAs, it is reasonable to impute missing values with the median value. We will store median_impute_model for further use.
To do so, we will first store the percentage of NA values in each column in tmp
variable.
1> tmp = sort(sapply(data_train, function(x) sum(length(which(is.na(x)))))/nrow(data_train),decreasing = TRUE)
2
3> median_impute_model = preProcess(data_train[names(tmp)],method="medianImpute")
4> data_train = predict(median_impute_model,data_train)
5
Before proceeding, we will check our data once more to ensure that ensure that there are no attributes with NA values.
1> sort(sapply(data_train, function(x) sum(length(which(is.na(x)))))/nrow(data_train),decreasing = TRUE)
2 funded_amnt term int_rate
3 0 0 0
4 sub_grade emp_length home_ownership
5 0 0 0
6 annual_inc verification_status issue_d
7 0 0 0
8 loan_status pymnt_plan desc
9 0 0 0
10 purpose title addr_state
11 0 0 0
12 dti delinq_2yrs earliest_cr_line
13 0 0 0
14 inq_last_6mths mths_since_last_delinq open_acc
15 0 0 0
16 pub_rec revol_bal revol_util
17 0 0 0
18 total_acc initial_list_status collections_12_mths_ex_med
19 0 0 0
20 policy_code application_type verification_status_joint
21 0 0 0
22 acc_now_delinq tot_coll_amt tot_cur_bal
23 0 0 0
24 open_acc_6m open_act_il open_il_12m
25 0 0 0
26 open_il_24m mths_since_rcnt_il total_bal_il
27 0 0 0
28 il_util open_rv_12m open_rv_24m
29 0 0 0
30 max_bal_bc all_util total_rev_hi_lim
31 0 0 0
32 inq_fi total_cu_tl inq_last_12m
33 0 0 0
34 acc_open_past_24mths avg_cur_bal bc_open_to_buy
35 0 0 0
36 bc_util chargeoff_within_12_mths delinq_amnt
37 0 0 0
38 mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op
39 0 0 0
40 mo_sin_rcnt_tl mort_acc mths_since_recent_bc
41 0 0 0
42 mths_since_recent_inq num_accts_ever_120_pd num_actv_bc_tl
43 0 0 0
44 num_actv_rev_tl num_bc_sats num_bc_tl
45 0 0 0
46 num_il_tl num_op_rev_tl num_rev_accts
47 0 0 0
48 num_rev_tl_bal_gt_0 num_sats num_tl_90g_dpd_24m
49 0 0 0
50 num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75
51 0 0 0
52 pub_rec_bankruptcies tax_liens tot_hi_cred_lim
53 0 0 0
54 total_bal_ex_mort total_bc_limit total_il_high_credit_limit
55 0 0 0
56 hardship_flag hardship_type hardship_reason
57 0 0 0
58 hardship_status hardship_start_date hardship_end_date
59 0 0 0
60 payment_plan_start_date hardship_loan_status disbursement_method
61 0 0 0
62 debt_settlement_flag debt_settlement_flag_date settlement_status
63 0 0 0
64 settlement_date
65 0
66>
67
Transformations
Let’s take a look at the remaining features:
str(data_train)
1> str(data_train)
2'data.frame': 41909 obs. of 94 variables:
3 $ funded_amnt : int 10000 35000 14400 7250 10000 10000 25000 8400 6950 16000 ...
4 $ term : chr " 60 months" " 60 months" " 36 months" " 36 months" ...
5 $ int_rate : num 19.53 20.75 10.75 13.67 5.32 ...
6 $ sub_grade : chr "D5" "E2" "B4" "C3" ...
7 $ emp_length : chr "4 years" "3 years" "10+ years" "3 years" ...
8 $ home_ownership : chr "OWN" "MORTGAGE" "MORTGAGE" "MORTGAGE" ...
9 $ annual_inc : num 52000 85000 85000 72000 45000 ...
10 $ verification_status : chr "Source Verified" "Source Verified" "Not Verified" "Source Verified" ...
11 $ issue_d : chr "Mar-2016" "Mar-2016" "Mar-2016" "Mar-2016" ...
12 $ loan_status : chr "Fully Paid" "Fully Paid" "Fully Paid" "Fully Paid" ...
13 $ pymnt_plan : chr "n" "n" "n" "n" ...
14 $ desc : chr "" "" "" "" ...
15 $ purpose : chr "other" "debt_consolidation" "small_business" "debt_consolidation" ...
16 $ title : chr "Other" "Debt consolidation" "Business" "Debt consolidation" ...
17 $ addr_state : chr "GA" "NY" "NH" "IN" ...
18 $ dti : num 15 24.98 28.11 23.93 8.03 ...
19 $ delinq_2yrs : int 0 0 0 1 0 0 0 0 0 0 ...
20 $ earliest_cr_line : chr "Oct-2000" "Dec-2008" "Feb-2000" "Jan-1985" ...
21 $ inq_last_6mths : int 1 1 0 0 0 1 1 1 0 1 ...
22 $ mths_since_last_delinq : num 44 31 72 20 31 31 31 60 55 25 ...
23 $ open_acc : int 4 9 16 12 7 9 9 7 5 10 ...
24 $ pub_rec : int 2 0 0 1 1 0 0 0 4 0 ...
25 $ revol_bal : int 1077 10167 37582 12220 471 10139 47954 11059 7096 9891 ...
26 $ revol_util : chr "35.9%" "46.6%" "85.8%" "65%" ...
27 $ total_acc : int 12 19 33 24 15 14 22 22 26 16 ...
28 $ initial_list_status : chr "w" "w" "w" "w" ...
29 $ collections_12_mths_ex_med: int 0 0 0 0 0 0 0 1 0 0 ...
30 $ policy_code : int 1 1 1 1 1 1 1 1 1 1 ...
31 $ application_type : chr "Individual" "Individual" "Individual" "Individual" ...
32 $ verification_status_joint : chr "" "" "" "" ...
33 $ acc_now_delinq : int 0 0 0 0 0 0 0 0 0 0 ...
34 $ tot_coll_amt : int 622 0 0 0 0 0 0 0 0 2017 ...
35 $ tot_cur_bal : int 15886 161643 190166 366483 11970 42767 431227 157051 7096 22551 ...
36 $ open_acc_6m : num 2 0 0 1 0 1 2 1 0 0 ...
37 $ open_act_il : num 1 3 3 2 1 2 2 1 0 2 ...
38 $ open_il_12m : num 4 0 1 1 0 1 2 1 0 2 ...
39 $ open_il_24m : num 5 3 1 3 1 1 2 1 0 2 ...
40 $ mths_since_rcnt_il : num 2 14 12 3 23 6 6 10 91 9 ...
41 $ total_bal_il : num 14809 73863 22387 40343 11499 ...
42 $ il_util : num 99 83 47 92 72 73 61 91 76 67 ...
43 $ open_rv_12m : num 0 0 0 1 1 1 1 2 0 3 ...
44 $ open_rv_24m : num 0 1 0 1 2 1 1 2 1 4 ...
45 $ max_bal_bc : num 1007 5109 12211 3694 325 ...
46 $ all_util : num 88 71 66 84 54 55 60 86 59 50 ...
47 $ total_rev_hi_lim : int 3000 21800 43800 18800 6400 33000 81500 14100 12000 26700 ...
48 $ inq_fi : num 3 5 0 0 2 2 0 1 0 1 ...
49 $ total_cu_tl : num 0 1 0 0 2 0 1 1 0 1 ...
50 $ inq_last_12m : num 2 2 0 1 0 3 4 3 1 4 ...
51 $ acc_open_past_24mths : int 5 4 1 4 3 2 3 4 1 6 ...
52 $ avg_cur_bal : int 3972 17960 11885 30540 1710 4752 47914 22436 1419 2506 ...
53 $ bc_open_to_buy : num 1623 4833 3393 997 4329 ...
54 $ bc_util : num 39.9 67.8 87.4 90.1 9.8 50.7 35.8 90.6 69.8 49.5 ...
55 $ chargeoff_within_12_mths : int 0 0 0 0 0 0 0 0 0 0 ...
56 $ delinq_amnt : int 0 0 0 0 0 0 0 0 0 0 ...
57 $ mo_sin_old_il_acct : num 101 87 145 132 135 65 258 129 153 113 ...
58 $ mo_sin_old_rev_tl_op : int 185 65 193 374 310 65 246 129 205 184 ...
59 $ mo_sin_rcnt_rev_tl_op : int 25 22 26 8 10 10 4 2 17 7 ...
60 $ mo_sin_rcnt_tl : int 2 14 12 3 10 6 4 2 17 7 ...
61 $ mort_acc : int 0 1 6 4 2 0 4 2 8 0 ...
62 $ mths_since_recent_bc : num 25 22 32 59 10 10 4 89 31 7 ...
63 $ mths_since_recent_inq : num 4 5 20 9 23 6 4 2 11 0 ...
64 $ num_accts_ever_120_pd : int 2 0 0 3 0 0 0 2 1 2 ...
65 $ num_actv_bc_tl : int 2 3 5 3 2 4 3 2 3 4 ...
66 $ num_actv_rev_tl : int 2 3 11 5 2 4 4 4 4 6 ...
67 $ num_bc_sats : int 2 3 5 3 4 4 5 2 3 5 ...
68 $ num_bc_tl : int 3 4 9 4 5 4 7 7 7 5 ...
69 $ num_il_tl : int 7 9 7 8 4 4 8 2 3 7 ...
70 $ num_op_rev_tl : int 3 5 11 9 6 7 6 5 5 8 ...
71 $ num_rev_accts : int 5 9 20 12 9 10 10 12 15 9 ...
72 $ num_rev_tl_bal_gt_0 : int 2 3 11 5 2 4 4 4 4 6 ...
73 $ num_sats : int 4 9 16 12 7 9 9 7 5 10 ...
74 $ num_tl_90g_dpd_24m : int 0 0 0 1 0 0 0 0 0 0 ...
75 $ num_tl_op_past_12m : int 4 0 1 2 1 2 3 3 0 5 ...
76 $ pct_tl_nvr_dlq : num 83.3 100 93.9 83.3 100 100 100 86.4 91.3 93.3 ...
77 $ percent_bc_gt_75 : num 50 33.3 100 100 0 0 0 100 33.3 25 ...
78 $ pub_rec_bankruptcies : int 0 0 0 0 1 0 0 0 3 0 ...
79 $ tax_liens : int 0 0 0 1 0 0 0 0 1 0 ...
80 $ tot_hi_cred_lim : int 18000 193751 232277 374306 22363 77555 499524 166260 12000 45520 ...
81 $ total_bal_ex_mort : int 15886 84030 59969 52563 11970 42767 67575 30290 7096 22551 ...
82 $ total_bc_limit : int 2700 15000 27000 10100 4800 20000 52500 6200 9700 18800 ...
83 $ total_il_high_credit_limit: int 15000 87951 47352 43666 15963 44555 32024 21079 0 18820 ...
84 $ hardship_flag : chr "N" "N" "N" "N" ...
85 $ hardship_type : chr "" "" "" "" ...
86 $ hardship_reason : chr "" "" "" "" ...
87 $ hardship_status : chr "" "" "" "" ...
88 $ hardship_start_date : chr "" "" "" "" ...
89 $ hardship_end_date : chr "" "" "" "" ...
90 $ payment_plan_start_date : chr "" "" "" "" ...
91 $ hardship_loan_status : chr "" "" "" "" ...
92 $ disbursement_method : chr "Cash" "Cash" "Cash" "Cash" ...
93 $ debt_settlement_flag : chr "N" "N" "N" "N" ...
94 $ debt_settlement_flag_date : chr "" "" "" "" ...
95 $ settlement_status : chr "" "" "" "" ...
96 $ settlement_date : chr "" "" "" "" ...
97>
98
Remove More Columns
1discard_column = c("hardship_flag","hardship_type",
2 "hardship_reason","hardship_status",
3 "hardship_start_date","hardship_end_date",
4 "payment_plan_start_date","hardship_loan_status",
5 "disbursement_method","debt_settlement_flag",
6 "debt_settlement_flag_date","settlement_status",
7 "settlement_date"
8 )
9> data_train = (data_train[,!(names(data_train) %in% discard_column)])
10
revol_util
We will convert revol_util
to numeric.
1data_train$revol_util = (as.numeric(gsub(pattern = "%",replacement = "",x = data_train$int_rate)))
2
earliest_cr_line
earliest_cr_line
is transformed to the number of days before the loan is issued.
1data_train$earliest_cr_line = parse_date_time(str_c("01",data_train$issue_d),"dmy" ) - parse_date_time(str_c("01",data_train$earliest_cr_line),"dmy" )
2data_train$earliest_cr_line = as.numeric(data_train$earliest_cr_line,units = "days")
3
Extract Issued Month
We can see that the default rate doesn’t vary much by the month it is issued. We will drop issue_d
1data_train$issue_m = sapply( data_train$issue_d ,function(x){str_split(x,"-")[[1]][1]})
2tmp = data_train %>% filter(loan_status=="Default") %>% group_by(issue_m) %>% summarise(default_count = n())
3tmp2 = data_train %>% group_by(issue_m) %>% summarise(count = n())
4tmp2 %>% left_join(tmp) %>% mutate(default_rate = default_count/count)
5
1Joining, by = "issue_m"
2# A tibble: 3 x 4
3 issue_m count default_count default_rate
4
51 Feb 12470 3779 0.303
62 Jan 10664 3090 0.290
73 Mar 18775 5761 0.307
8>
9
1data_train$issue_m = NULL
2data_train$issue_d = NULL
3rm(tmp,tmp2)
4
Unlock Premium Content
Upgrade your account to access the full article, downloads, and exercises.
You'll get access to:
- Access complete tutorials and examples
- Download source code and resources
- Follow along with practical exercises
- Get in-depth explanations