Data Cleaning in R - Part 1

Premium

Discarding Attributes

LendingClub also provides a data dictionary that contains details of all attributes of out dataset. We can use that dictionary to understand more about the data columns we have and remove columns that may not impact the loan default.

Discard Attributes

We can use the data dictionary to identify and discard some attributes which we think are irrelevant or will have little impact on loan default.

1discard_column = c("collection_recovery_fee","emp_title",
2                   "funded_amnt_inv","id",
3                   "installment","last_credit_pull_d",
4                   "last_fico_range_high","last_fico_range_low",
5                   "last_pymnt_amnt","last_pymnt_d",
6                   "loan_amnt","member_id",
7                   "next_pymnt_d","num_tl_120dpd_2m",
8                   "num_tl_30dpd","out_prncp",
9                   "out_prncp_inv","recoveries",
10                   "total_pymnt","total_pymnt_inv",
11                   "total_rec_int","total_rec_late_fee",
12                   "total_rec_prncp","url",
13                   "zip_code"
14                   )
15
1> data_train = (data_train[,!(names(data_train) %in% discard_column)])
2> dim(data_train)
3[1] 41909   122
4>
5

Discard Grade Attribute

We will also drop the grade attribute as the grade information is also available in sub_grade.

> data_train$grade = NULL

Discard Columns with Too Many NAs

We will also discard columns with too many NAs. We will discard columns where the percentage is higher than 50% NAs.

1> data_train <- data_train[, -which(colMeans(is.na(data_train)) > 0.5)]
2

For the rest of the columns in NAs, it is reasonable to impute missing values with the median value. We will store median_impute_model for further use.

To do so, we will first store the percentage of NA values in each column in tmp variable.

1> tmp = sort(sapply(data_train, function(x) sum(length(which(is.na(x)))))/nrow(data_train),decreasing = TRUE)
2
3> median_impute_model = preProcess(data_train[names(tmp)],method="medianImpute")
4> data_train = predict(median_impute_model,data_train)
5

Before proceeding, we will check our data once more to ensure that ensure that there are no attributes with NA values.

1> sort(sapply(data_train, function(x) sum(length(which(is.na(x)))))/nrow(data_train),decreasing = TRUE)
2               funded_amnt                       term                   int_rate 
3                         0                          0                          0 
4                 sub_grade                 emp_length             home_ownership 
5                         0                          0                          0 
6                annual_inc        verification_status                    issue_d 
7                         0                          0                          0 
8               loan_status                 pymnt_plan                       desc 
9                         0                          0                          0 
10                   purpose                      title                 addr_state 
11                         0                          0                          0 
12                       dti                delinq_2yrs           earliest_cr_line 
13                         0                          0                          0 
14            inq_last_6mths     mths_since_last_delinq                   open_acc 
15                         0                          0                          0 
16                   pub_rec                  revol_bal                 revol_util 
17                         0                          0                          0 
18                 total_acc        initial_list_status collections_12_mths_ex_med 
19                         0                          0                          0 
20               policy_code           application_type  verification_status_joint 
21                         0                          0                          0 
22            acc_now_delinq               tot_coll_amt                tot_cur_bal 
23                         0                          0                          0 
24               open_acc_6m                open_act_il                open_il_12m 
25                         0                          0                          0 
26               open_il_24m         mths_since_rcnt_il               total_bal_il 
27                         0                          0                          0 
28                   il_util                open_rv_12m                open_rv_24m 
29                         0                          0                          0 
30                max_bal_bc                   all_util           total_rev_hi_lim 
31                         0                          0                          0 
32                    inq_fi                total_cu_tl               inq_last_12m 
33                         0                          0                          0 
34      acc_open_past_24mths                avg_cur_bal             bc_open_to_buy 
35                         0                          0                          0 
36                   bc_util   chargeoff_within_12_mths                delinq_amnt 
37                         0                          0                          0 
38        mo_sin_old_il_acct       mo_sin_old_rev_tl_op      mo_sin_rcnt_rev_tl_op 
39                         0                          0                          0 
40            mo_sin_rcnt_tl                   mort_acc       mths_since_recent_bc 
41                         0                          0                          0 
42     mths_since_recent_inq      num_accts_ever_120_pd             num_actv_bc_tl 
43                         0                          0                          0 
44           num_actv_rev_tl                num_bc_sats                  num_bc_tl 
45                         0                          0                          0 
46                 num_il_tl              num_op_rev_tl              num_rev_accts 
47                         0                          0                          0 
48       num_rev_tl_bal_gt_0                   num_sats         num_tl_90g_dpd_24m 
49                         0                          0                          0 
50        num_tl_op_past_12m             pct_tl_nvr_dlq           percent_bc_gt_75 
51                         0                          0                          0 
52      pub_rec_bankruptcies                  tax_liens            tot_hi_cred_lim 
53                         0                          0                          0 
54         total_bal_ex_mort             total_bc_limit total_il_high_credit_limit 
55                         0                          0                          0 
56             hardship_flag              hardship_type            hardship_reason 
57                         0                          0                          0 
58           hardship_status        hardship_start_date          hardship_end_date 
59                         0                          0                          0 
60   payment_plan_start_date       hardship_loan_status        disbursement_method 
61                         0                          0                          0 
62      debt_settlement_flag  debt_settlement_flag_date          settlement_status 
63                         0                          0                          0 
64           settlement_date 
65                         0 
66> 
67

Transformations

Let’s take a look at the remaining features:

str(data_train)
1> str(data_train)
2'data.frame':    41909 obs. of  94 variables:
3 $ funded_amnt               : int  10000 35000 14400 7250 10000 10000 25000 8400 6950 16000 ...
4 $ term                      : chr  " 60 months" " 60 months" " 36 months" " 36 months" ...
5 $ int_rate                  : num  19.53 20.75 10.75 13.67 5.32 ...
6 $ sub_grade                 : chr  "D5" "E2" "B4" "C3" ...
7 $ emp_length                : chr  "4 years" "3 years" "10+ years" "3 years" ...
8 $ home_ownership            : chr  "OWN" "MORTGAGE" "MORTGAGE" "MORTGAGE" ...
9 $ annual_inc                : num  52000 85000 85000 72000 45000 ...
10 $ verification_status       : chr  "Source Verified" "Source Verified" "Not Verified" "Source Verified" ...
11 $ issue_d                   : chr  "Mar-2016" "Mar-2016" "Mar-2016" "Mar-2016" ...
12 $ loan_status               : chr  "Fully Paid" "Fully Paid" "Fully Paid" "Fully Paid" ...
13 $ pymnt_plan                : chr  "n" "n" "n" "n" ...
14 $ desc                      : chr  "" "" "" "" ...
15 $ purpose                   : chr  "other" "debt_consolidation" "small_business" "debt_consolidation" ...
16 $ title                     : chr  "Other" "Debt consolidation" "Business" "Debt consolidation" ...
17 $ addr_state                : chr  "GA" "NY" "NH" "IN" ...
18 $ dti                       : num  15 24.98 28.11 23.93 8.03 ...
19 $ delinq_2yrs               : int  0 0 0 1 0 0 0 0 0 0 ...
20 $ earliest_cr_line          : chr  "Oct-2000" "Dec-2008" "Feb-2000" "Jan-1985" ...
21 $ inq_last_6mths            : int  1 1 0 0 0 1 1 1 0 1 ...
22 $ mths_since_last_delinq    : num  44 31 72 20 31 31 31 60 55 25 ...
23 $ open_acc                  : int  4 9 16 12 7 9 9 7 5 10 ...
24 $ pub_rec                   : int  2 0 0 1 1 0 0 0 4 0 ...
25 $ revol_bal                 : int  1077 10167 37582 12220 471 10139 47954 11059 7096 9891 ...
26 $ revol_util                : chr  "35.9%" "46.6%" "85.8%" "65%" ...
27 $ total_acc                 : int  12 19 33 24 15 14 22 22 26 16 ...
28 $ initial_list_status       : chr  "w" "w" "w" "w" ...
29 $ collections_12_mths_ex_med: int  0 0 0 0 0 0 0 1 0 0 ...
30 $ policy_code               : int  1 1 1 1 1 1 1 1 1 1 ...
31 $ application_type          : chr  "Individual" "Individual" "Individual" "Individual" ...
32 $ verification_status_joint : chr  "" "" "" "" ...
33 $ acc_now_delinq            : int  0 0 0 0 0 0 0 0 0 0 ...
34 $ tot_coll_amt              : int  622 0 0 0 0 0 0 0 0 2017 ...
35 $ tot_cur_bal               : int  15886 161643 190166 366483 11970 42767 431227 157051 7096 22551 ...
36 $ open_acc_6m               : num  2 0 0 1 0 1 2 1 0 0 ...
37 $ open_act_il               : num  1 3 3 2 1 2 2 1 0 2 ...
38 $ open_il_12m               : num  4 0 1 1 0 1 2 1 0 2 ...
39 $ open_il_24m               : num  5 3 1 3 1 1 2 1 0 2 ...
40 $ mths_since_rcnt_il        : num  2 14 12 3 23 6 6 10 91 9 ...
41 $ total_bal_il              : num  14809 73863 22387 40343 11499 ...
42 $ il_util                   : num  99 83 47 92 72 73 61 91 76 67 ...
43 $ open_rv_12m               : num  0 0 0 1 1 1 1 2 0 3 ...
44 $ open_rv_24m               : num  0 1 0 1 2 1 1 2 1 4 ...
45 $ max_bal_bc                : num  1007 5109 12211 3694 325 ...
46 $ all_util                  : num  88 71 66 84 54 55 60 86 59 50 ...
47 $ total_rev_hi_lim          : int  3000 21800 43800 18800 6400 33000 81500 14100 12000 26700 ...
48 $ inq_fi                    : num  3 5 0 0 2 2 0 1 0 1 ...
49 $ total_cu_tl               : num  0 1 0 0 2 0 1 1 0 1 ...
50 $ inq_last_12m              : num  2 2 0 1 0 3 4 3 1 4 ...
51 $ acc_open_past_24mths      : int  5 4 1 4 3 2 3 4 1 6 ...
52 $ avg_cur_bal               : int  3972 17960 11885 30540 1710 4752 47914 22436 1419 2506 ...
53 $ bc_open_to_buy            : num  1623 4833 3393 997 4329 ...
54 $ bc_util                   : num  39.9 67.8 87.4 90.1 9.8 50.7 35.8 90.6 69.8 49.5 ...
55 $ chargeoff_within_12_mths  : int  0 0 0 0 0 0 0 0 0 0 ...
56 $ delinq_amnt               : int  0 0 0 0 0 0 0 0 0 0 ...
57 $ mo_sin_old_il_acct        : num  101 87 145 132 135 65 258 129 153 113 ...
58 $ mo_sin_old_rev_tl_op      : int  185 65 193 374 310 65 246 129 205 184 ...
59 $ mo_sin_rcnt_rev_tl_op     : int  25 22 26 8 10 10 4 2 17 7 ...
60 $ mo_sin_rcnt_tl            : int  2 14 12 3 10 6 4 2 17 7 ...
61 $ mort_acc                  : int  0 1 6 4 2 0 4 2 8 0 ...
62 $ mths_since_recent_bc      : num  25 22 32 59 10 10 4 89 31 7 ...
63 $ mths_since_recent_inq     : num  4 5 20 9 23 6 4 2 11 0 ...
64 $ num_accts_ever_120_pd     : int  2 0 0 3 0 0 0 2 1 2 ...
65 $ num_actv_bc_tl            : int  2 3 5 3 2 4 3 2 3 4 ...
66 $ num_actv_rev_tl           : int  2 3 11 5 2 4 4 4 4 6 ...
67 $ num_bc_sats               : int  2 3 5 3 4 4 5 2 3 5 ...
68 $ num_bc_tl                 : int  3 4 9 4 5 4 7 7 7 5 ...
69 $ num_il_tl                 : int  7 9 7 8 4 4 8 2 3 7 ...
70 $ num_op_rev_tl             : int  3 5 11 9 6 7 6 5 5 8 ...
71 $ num_rev_accts             : int  5 9 20 12 9 10 10 12 15 9 ...
72 $ num_rev_tl_bal_gt_0       : int  2 3 11 5 2 4 4 4 4 6 ...
73 $ num_sats                  : int  4 9 16 12 7 9 9 7 5 10 ...
74 $ num_tl_90g_dpd_24m        : int  0 0 0 1 0 0 0 0 0 0 ...
75 $ num_tl_op_past_12m        : int  4 0 1 2 1 2 3 3 0 5 ...
76 $ pct_tl_nvr_dlq            : num  83.3 100 93.9 83.3 100 100 100 86.4 91.3 93.3 ...
77 $ percent_bc_gt_75          : num  50 33.3 100 100 0 0 0 100 33.3 25 ...
78 $ pub_rec_bankruptcies      : int  0 0 0 0 1 0 0 0 3 0 ...
79 $ tax_liens                 : int  0 0 0 1 0 0 0 0 1 0 ...
80 $ tot_hi_cred_lim           : int  18000 193751 232277 374306 22363 77555 499524 166260 12000 45520 ...
81 $ total_bal_ex_mort         : int  15886 84030 59969 52563 11970 42767 67575 30290 7096 22551 ...
82 $ total_bc_limit            : int  2700 15000 27000 10100 4800 20000 52500 6200 9700 18800 ...
83 $ total_il_high_credit_limit: int  15000 87951 47352 43666 15963 44555 32024 21079 0 18820 ...
84 $ hardship_flag             : chr  "N" "N" "N" "N" ...
85 $ hardship_type             : chr  "" "" "" "" ...
86 $ hardship_reason           : chr  "" "" "" "" ...
87 $ hardship_status           : chr  "" "" "" "" ...
88 $ hardship_start_date       : chr  "" "" "" "" ...
89 $ hardship_end_date         : chr  "" "" "" "" ...
90 $ payment_plan_start_date   : chr  "" "" "" "" ...
91 $ hardship_loan_status      : chr  "" "" "" "" ...
92 $ disbursement_method       : chr  "Cash" "Cash" "Cash" "Cash" ...
93 $ debt_settlement_flag      : chr  "N" "N" "N" "N" ...
94 $ debt_settlement_flag_date : chr  "" "" "" "" ...
95 $ settlement_status         : chr  "" "" "" "" ...
96 $ settlement_date           : chr  "" "" "" "" ...
97>
98

Remove More Columns

1discard_column = c("hardship_flag","hardship_type",
2                   "hardship_reason","hardship_status",
3                   "hardship_start_date","hardship_end_date",
4                   "payment_plan_start_date","hardship_loan_status",
5                   "disbursement_method","debt_settlement_flag",
6                   "debt_settlement_flag_date","settlement_status",
7                   "settlement_date"
8                   )
9> data_train = (data_train[,!(names(data_train) %in% discard_column)])
10

revol_util

We will convert revol_util to numeric.

1data_train$revol_util = (as.numeric(gsub(pattern = "%",replacement = "",x = data_train$int_rate)))
2

earliest_cr_line

earliest_cr_line is transformed to the number of days before the loan is issued.

1data_train$earliest_cr_line = parse_date_time(str_c("01",data_train$issue_d),"dmy" ) - parse_date_time(str_c("01",data_train$earliest_cr_line),"dmy" )
2data_train$earliest_cr_line = as.numeric(data_train$earliest_cr_line,units = "days")
3

Extract Issued Month

We can see that the default rate doesn’t vary much by the month it is issued. We will drop issue_d

1data_train$issue_m = sapply( data_train$issue_d ,function(x){str_split(x,"-")[[1]][1]})
2tmp = data_train %>% filter(loan_status=="Default") %>% group_by(issue_m) %>% summarise(default_count = n())
3tmp2 = data_train %>% group_by(issue_m) %>% summarise(count = n())
4tmp2 %>% left_join(tmp) %>% mutate(default_rate = default_count/count)
5
1Joining, by = "issue_m"
2# A tibble: 3 x 4
3  issue_m count default_count default_rate
4                      
51 Feb     12470          3779        0.303
62 Jan     10664          3090        0.290
73 Mar     18775          5761        0.307
8> 
9
1data_train$issue_m = NULL
2data_train$issue_d = NULL
3rm(tmp,tmp2)
4