Sigolyori’s Blog - Monthly Dacon: Credit card default prediction competition(월간 데이콘 신용카드 사용자 연체 예측 AI 경진대회)

First, load the required packages with library().

library(tidyverse)
library(tidymodels)
library(skimr)
library(lubridate)

Read the data with readr::read_csv, and chain it with mutate_if to convert the character columns to factors.

# Load the training data; every character column is converted to a factor.
df <- mutate_if(read_csv("train.csv"), is.character, as.factor)

skimr::skim is a function that shows a detailed summary of a data frame.

# Print a per-column summary of the raw training data.
df %>% skim()

-- Data Summary ------------------------
                           Values
Name                       df    
Number of rows             26457 
Number of columns          20    
_______________________          
Column type frequency:           
  factor                   8     
  numeric                  12    
________________________         
Group variables            None  

-- Variable type: factor -------------------------------------------------------
  skim_variable n_missing complete_rate ordered n_unique
1 gender                0         1     FALSE          2
2 car                   0         1     FALSE          2
3 reality               0         1     FALSE          2
4 income_type           0         1     FALSE          5
5 edu_type              0         1     FALSE          5
6 family_type           0         1     FALSE          5
7 house_type            0         1     FALSE          6
8 occyp_type         8171         0.691 FALSE         18
  top_counts                                 
1 F: 17697, M: 8760                          
2 N: 16410, Y: 10047                         
3 Y: 17830, N: 8627                          
4 Wor: 13645, Com: 6202, Pen: 4449, Sta: 2154
5 Sec: 17995, Hig: 7162, Inc: 1020, Low: 257 
6 Mar: 18196, Sin: 3496, Civ: 2123, Sep: 1539
7 Hou: 23653, Wit: 1257, Mun: 818, Ren: 429  
8 Lab: 4512, Cor: 2646, Sal: 2539, Man: 2167 

-- Variable type: numeric ------------------------------------------------------
   skim_variable n_missing complete_rate        mean         sd     p0    p25
 1 index                 0             1  13228        7638.         0   6614
 2 child_num             0             1      0.429       0.747      0      0
 3 income_total          0             1 187307.     101878.     27000 121500
 4 DAYS_BIRTH            0             1 -15958.       4202.    -25152 -19431
 5 DAYS_EMPLOYED         0             1  59069.     137475.    -15713  -3153
 6 FLAG_MOBIL            0             1      1           0          1      1
 7 work_phone            0             1      0.225       0.417      0      0
 8 phone                 0             1      0.294       0.456      0      0
 9 email                 0             1      0.0913      0.288      0      0
10 family_size           0             1      2.20        0.917      1      2
11 begin_month           0             1    -26.1        16.6      -60    -39
12 credit                0             1      1.52        0.702      0      1
      p50    p75    p100 hist                            
 1  13228  19842   26456 "\u2587\u2587\u2587\u2587\u2587"
 2      0      1      19 "\u2587\u2581\u2581\u2581\u2581"
 3 157500 225000 1575000 "\u2587\u2581\u2581\u2581\u2581"
 4 -15547 -12446   -7705 "\u2583\u2586\u2587\u2587\u2585"
 5  -1539   -407  365243 "\u2587\u2581\u2581\u2581\u2582"
 6      1      1       1 "\u2581\u2581\u2587\u2581\u2581"
 7      0      0       1 "\u2587\u2581\u2581\u2581\u2582"
 8      0      1       1 "\u2587\u2581\u2581\u2581\u2583"
 9      0      0       1 "\u2587\u2581\u2581\u2581\u2581"
10      2      3      20 "\u2587\u2581\u2581\u2581\u2581"
11    -24    -12       0 "\u2585\u2586\u2586\u2587\u2587"
12      2      2       2 "\u2582\u2581\u2583\u2581\u2587"

A skim_df: 20 × 15
	skim_type	skim_variable	n_missing	complete_rate	factor.ordered	factor.n_unique	factor.top_counts	numeric.mean	numeric.sd	numeric.p0	numeric.p25	numeric.p50	numeric.p75	numeric.p100	numeric.hist
	<chr>	<chr>	<int>	<dbl>	<lgl>	<int>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>
1	factor	gender	0	1.0000000	FALSE	2	F: 17697, M: 8760	NA	NA	NA	NA	NA	NA	NA	NA
2	factor	car	0	1.0000000	FALSE	2	N: 16410, Y: 10047	NA	NA	NA	NA	NA	NA	NA	NA
3	factor	reality	0	1.0000000	FALSE	2	Y: 17830, N: 8627	NA	NA	NA	NA	NA	NA	NA	NA
4	factor	income_type	0	1.0000000	FALSE	5	Wor: 13645, Com: 6202, Pen: 4449, Sta: 2154	NA	NA	NA	NA	NA	NA	NA	NA
5	factor	edu_type	0	1.0000000	FALSE	5	Sec: 17995, Hig: 7162, Inc: 1020, Low: 257	NA	NA	NA	NA	NA	NA	NA	NA
6	factor	family_type	0	1.0000000	FALSE	5	Mar: 18196, Sin: 3496, Civ: 2123, Sep: 1539	NA	NA	NA	NA	NA	NA	NA	NA
7	factor	house_type	0	1.0000000	FALSE	6	Hou: 23653, Wit: 1257, Mun: 818, Ren: 429	NA	NA	NA	NA	NA	NA	NA	NA
8	factor	occyp_type	8171	0.6911592	FALSE	18	Lab: 4512, Cor: 2646, Sal: 2539, Man: 2167	NA	NA	NA	NA	NA	NA	NA	NA
9	numeric	index	0	1.0000000	NA	NA	NA	1.322800e+04	7.637622e+03	0	6614	13228	19842	26456	<U+2587><U+2587><U+2587><U+2587><U+2587>
10	numeric	child_num	0	1.0000000	NA	NA	NA	4.286578e-01	7.473264e-01	0	0	0	1	19	<U+2587><U+2581><U+2581><U+2581><U+2581>
11	numeric	income_total	0	1.0000000	NA	NA	NA	1.873065e+05	1.018784e+05	27000	121500	157500	225000	1575000	<U+2587><U+2581><U+2581><U+2581><U+2581>
12	numeric	DAYS_BIRTH	0	1.0000000	NA	NA	NA	-1.595805e+04	4.201589e+03	-25152	-19431	-15547	-12446	-7705	<U+2583><U+2586><U+2587><U+2587><U+2585>
13	numeric	DAYS_EMPLOYED	0	1.0000000	NA	NA	NA	5.906875e+04	1.374754e+05	-15713	-3153	-1539	-407	365243	<U+2587><U+2581><U+2581><U+2581><U+2582>
14	numeric	FLAG_MOBIL	0	1.0000000	NA	NA	NA	1.000000e+00	0.000000e+00	1	1	1	1	1	<U+2581><U+2581><U+2587><U+2581><U+2581>
15	numeric	work_phone	0	1.0000000	NA	NA	NA	2.247420e-01	4.174202e-01	0	0	0	0	1	<U+2587><U+2581><U+2581><U+2581><U+2582>
16	numeric	phone	0	1.0000000	NA	NA	NA	2.942510e-01	4.557140e-01	0	0	0	1	1	<U+2587><U+2581><U+2581><U+2581><U+2583>
17	numeric	email	0	1.0000000	NA	NA	NA	9.128019e-02	2.880126e-01	0	0	0	0	1	<U+2587><U+2581><U+2581><U+2581><U+2581>
18	numeric	family_size	0	1.0000000	NA	NA	NA	2.196848e+00	9.167167e-01	1	2	2	3	20	<U+2587><U+2581><U+2581><U+2581><U+2581>
19	numeric	begin_month	0	1.0000000	NA	NA	NA	-2.612329e+01	1.655955e+01	-60	-39	-24	-12	0	<U+2585><U+2586><U+2586><U+2587><U+2587>
20	numeric	credit	0	1.0000000	NA	NA	NA	1.519560e+00	7.022828e-01	0	1	2	2	2	<U+2582><U+2581><U+2583><U+2581><U+2587>

Split your training dataset into a training set and a validation set. Holding out a validation set helps prevent data leakage and makes it possible to assess your prediction accuracy.

# Fix the RNG seed so the random train/validation split is reproducible.
set.seed(42)

# Keep 75% of the rows for training and hold out the rest for validation,
# stratifying the sampling on the target column `credit`.
split <- initial_split(df, prop = 0.75, strata = "credit")

tr <- training(split)
vl <- testing(split)

Record your preprocessing steps with a recipe. The specific steps are listed below.

Set credit as the target variable and convert it to a factor.
Convert the day-based features (DAYS_BIRTH and DAYS_EMPLOYED) to years.
Drop index, DAYS_BIRTH, and DAYS_EMPLOYED.
Replace missing values in occyp_type with ‘unknown’.
Encode the categorical features as integers.
Center all predictors so that each has a mean of 0.

# Preprocessing recipe built from the training split, with `credit` as the
# outcome and every other column as a predictor.
rec <- tr %>%
  recipe(credit ~ .) %>%
  # Classification needs a factor outcome. skip = TRUE is set so this step is
  # not applied when new data is baked (the test set has no `credit` to use).
  step_mutate(credit = as.factor(credit), skip = TRUE) %>%
  # DAYS_BIRTH is negative in the data summary, so negate to get positive years.
  # NOTE(review): DAYS_EMPLOYED also has large positive values (max 365243 in
  # the summary), which turn into large negative yrs_employed values here;
  # presumably a placeholder for "not employed" -- confirm before relying on it.
  step_mutate(
    yrs_birth = -ceiling(DAYS_BIRTH / 365),
    yrs_employed = -ceiling(DAYS_EMPLOYED / 365)
  ) %>%
  # Drop the row id and the raw day counts now that the year features exist.
  step_rm(index, DAYS_BIRTH, DAYS_EMPLOYED) %>%
  # Missing occupation becomes its own "unknown" factor level.
  step_unknown(occyp_type) %>%
  # Encode every remaining factor predictor as an integer code.
  step_integer(all_nominal(), -all_outcomes()) %>%
  # Center each predictor on its training mean (no rescaling is done).
  step_center(all_predictors())

print(rec)

Recipe

Inputs:

      role #variables
   outcome          1
 predictor         19

Training data contained 26457 data points and 8171 incomplete rows. 

Operations:

Variables removed index [trained]
Variable mutation for ~factor(credit) [trained]
Variable mutation for ~factor(gender), ~factor(car), ~factor(rea... [trained]
Variable mutation for ~ifelse(DAYS_EMPLOYED > 0, 0, DAYS_EMPLOYED) [trained]
Unknown factor level assignment for occyp_type [trained]
Yeo-Johnson transformation on DAYS_BIRTH, DAYS_EMPLOYED, begin_month [trained]
Centering and scaling for child_num, family_size [trained]
Log transformation on income_total [trained]

You can view the prepped data with juice.

# Estimate the recipe on the training split, then extract the processed
# training data from the prepped recipe.
rec_tr <- juice(prep(rec, training = tr))

head(rec_tr)

A tibble: 6 × 19
gender	car	reality	child_num	income_total	income_type	edu_type	family_type	house_type	FLAG_MOBIL	work_phone	phone	email	occyp_type	family_size	begin_month	credit	yrs_birth	yrs_employed
<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<fct>	<dbl>	<dbl>
-0.3319726	0.6210563	0.3278399	0.5708094	-29379.44	-2.400867	0.9036891	-0.370376	-0.2791049	0	-0.2271948	-0.2958371	-0.09081746	-10.8347445	0.8033968	-33.948191	0	-11.192924	165.8533
0.6680274	0.6210563	0.3278399	1.5708094	-29379.44	1.599133	0.9036891	-0.370376	3.7208951	0	-0.2271948	0.7041629	-0.09081746	-6.8347445	1.8033968	-33.948191	0	-8.192924	171.8533
0.6680274	0.6210563	0.3278399	-0.4291906	-51879.44	-2.400867	-2.0963109	-0.370376	-0.2791049	0	0.7728052	-0.2958371	0.90918254	-2.8347445	-0.1966032	8.051809	0	-11.192924	169.8533
-0.3319726	-0.3789437	0.3278399	-0.4291906	-51879.44	1.599133	-2.0963109	-0.370376	-0.2791049	0	-0.2271948	-0.2958371	-0.09081746	-2.8347445	-0.1966032	24.051809	0	-7.192924	164.8533
0.6680274	0.6210563	0.3278399	0.5708094	83120.56	1.599133	0.9036891	-0.370376	-0.2791049	0	-0.2271948	0.7041629	-0.09081746	-2.8347445	0.8033968	4.051809	0	-4.192924	164.8533
0.6680274	-0.3789437	-0.6721601	0.5708094	83120.56	1.599133	0.9036891	-0.370376	-0.2791049	0	0.7728052	-0.2958371	-0.09081746	-0.8347445	0.8033968	-8.948191	0	-16.192924	161.8533

As you can see below, all features are now numeric. Also, the mean of each feature is (numerically) 0 because we used step_center above.

# Print a per-column summary of the preprocessed training data.
rec_tr %>% skim()

-- Data Summary ------------------------
                           Values
Name                       rec_tr
Number of rows             19842 
Number of columns          19    
_______________________          
Column type frequency:           
  factor                   1     
  numeric                  18    
________________________         
Group variables            None  

-- Variable type: factor -------------------------------------------------------
  skim_variable n_missing complete_rate ordered n_unique
1 credit                0             1 FALSE          3
  top_counts                
1 2: 12726, 1: 4700, 0: 2416

-- Variable type: numeric ------------------------------------------------------
   skim_variable n_missing complete_rate      mean         sd           p0
 1 gender                0             1 -1.03e-16      0.471      -0.332 
 2 car                   0             1  8.70e-17      0.485      -0.379 
 3 reality               0             1  9.21e-17      0.469      -0.672 
 4 child_num             0             1 -2.68e-17      0.743      -0.429 
 5 income_total          0             1 -8.67e-12 101083.    -159879.    
 6 income_type           0             1 -3.82e-17      1.74       -2.40  
 7 edu_type              0             1  3.28e-16      1.34       -3.10  
 8 family_type           0             1  4.62e-17      0.950      -1.37  
 9 house_type            0             1 -1.01e-16      0.943      -1.28  
10 FLAG_MOBIL            0             1  0             0           0     
11 work_phone            0             1 -6.31e-18      0.419      -0.227 
12 phone                 0             1  4.65e-18      0.456      -0.296 
13 email                 0             1 -9.23e-18      0.287      -0.0908
14 occyp_type            0             1 -7.37e-16      5.99      -10.8   
15 family_size           0             1  5.77e-17      0.914      -1.20  
16 begin_month           0             1  7.42e-16     16.5       -33.9   
17 yrs_birth             0             1 -1.24e-15     11.5       -22.2   
18 yrs_employed          0             1  7.96e-15    375.       -840.    
           p25         p50        p75        p100
 1     -0.332      -0.332      0.668        0.668
 2     -0.379      -0.379      0.621        0.621
 3     -0.672       0.328      0.328        0.328
 4     -0.429      -0.429      0.571       13.6  
 5 -65379.     -29379.     38121.     1388121.   
 6     -1.40        1.60       1.60         1.60 
 7     -2.10        0.904      0.904        0.904
 8     -0.370      -0.370     -0.370        2.63 
 9     -0.279      -0.279     -0.279        3.72 
10      0           0          0            0    
11     -0.227      -0.227     -0.227        0.773
12     -0.296      -0.296      0.704        0.704
13     -0.0908     -0.0908    -0.0908       0.909
14     -4.83       -0.835      7.17         7.17 
15     -0.197      -0.197      0.803       12.8  
16    -12.9         2.05      14.1         26.1  
17     -9.19       -1.19       9.81        24.8  
18    162.        165.       169.         204.   
   hist                            
 1 "\u2587\u2581\u2581\u2581\u2583"
 2 "\u2587\u2581\u2581\u2581\u2585"
 3 "\u2583\u2581\u2581\u2581\u2587"
 4 "\u2587\u2581\u2581\u2581\u2581"
 5 "\u2587\u2581\u2581\u2581\u2581"
 6 "\u2583\u2582\u2581\u2581\u2587"
 7 "\u2581\u2583\u2581\u2581\u2587"
 8 "\u2581\u2587\u2581\u2582\u2581"
 9 "\u2587\u2581\u2581\u2581\u2581"
10 "\u2581\u2581\u2587\u2581\u2581"
11 "\u2587\u2581\u2581\u2581\u2582"
12 "\u2587\u2581\u2581\u2581\u2583"
13 "\u2587\u2581\u2581\u2581\u2581"
14 "\u2585\u2582\u2586\u2583\u2587"
15 "\u2587\u2581\u2581\u2581\u2581"
16 "\u2585\u2586\u2586\u2587\u2587"
17 "\u2585\u2587\u2587\u2586\u2583"
18 "\u2582\u2581\u2581\u2581\u2587"

A skim_df: 19 × 15
	skim_type	skim_variable	n_missing	complete_rate	factor.ordered	factor.n_unique	factor.top_counts	numeric.mean	numeric.sd	numeric.p0	numeric.p25	numeric.p50	numeric.p75	numeric.p100	numeric.hist
	<chr>	<chr>	<int>	<dbl>	<lgl>	<int>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>
1	factor	credit	0	1	FALSE	3	2: 12726, 1: 4700, 0: 2416	NA	NA	NA	NA	NA	NA	NA	NA
2	numeric	gender	0	1	NA	NA	NA	-1.033030e-16	4.709331e-01	-3.319726e-01	-3.319726e-01	-3.319726e-01	6.680274e-01	6.680274e-01	<U+2587><U+2581><U+2581><U+2581><U+2583>
3	numeric	car	0	1	NA	NA	NA	8.698253e-17	4.851363e-01	-3.789437e-01	-3.789437e-01	-3.789437e-01	6.210563e-01	6.210563e-01	<U+2587><U+2581><U+2581><U+2581><U+2585>
4	numeric	reality	0	1	NA	NA	NA	9.209199e-17	4.694380e-01	-6.721601e-01	-6.721601e-01	3.278399e-01	3.278399e-01	3.278399e-01	<U+2583><U+2581><U+2581><U+2581><U+2587>
5	numeric	child_num	0	1	NA	NA	NA	-2.679652e-17	7.430613e-01	-4.291906e-01	-4.291906e-01	-4.291906e-01	5.708094e-01	1.357081e+01	<U+2587><U+2581><U+2581><U+2581><U+2581>
6	numeric	income_total	0	1	NA	NA	NA	-8.665421e-12	1.010829e+05	-1.598794e+05	-6.537944e+04	-2.937944e+04	3.812056e+04	1.388121e+06	<U+2587><U+2581><U+2581><U+2581><U+2581>
7	numeric	income_type	0	1	NA	NA	NA	-3.816574e-17	1.737955e+00	-2.400867e+00	-1.400867e+00	1.599133e+00	1.599133e+00	1.599133e+00	<U+2583><U+2582><U+2581><U+2581><U+2587>
8	numeric	edu_type	0	1	NA	NA	NA	3.279951e-16	1.341700e+00	-3.096311e+00	-2.096311e+00	9.036891e-01	9.036891e-01	9.036891e-01	<U+2581><U+2583><U+2581><U+2581><U+2587>
9	numeric	family_type	0	1	NA	NA	NA	4.615038e-17	9.498038e-01	-1.370376e+00	-3.703760e-01	-3.703760e-01	-3.703760e-01	2.629624e+00	<U+2581><U+2587><U+2581><U+2582><U+2581>
10	numeric	house_type	0	1	NA	NA	NA	-1.005445e-16	9.431263e-01	-1.279105e+00	-2.791049e-01	-2.791049e-01	-2.791049e-01	3.720895e+00	<U+2587><U+2581><U+2581><U+2581><U+2581>
11	numeric	FLAG_MOBIL	0	1	NA	NA	NA	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	0.000000e+00	<U+2581><U+2581><U+2587><U+2581><U+2581>
12	numeric	work_phone	0	1	NA	NA	NA	-6.307236e-18	4.190301e-01	-2.271948e-01	-2.271948e-01	-2.271948e-01	-2.271948e-01	7.728052e-01	<U+2587><U+2581><U+2581><U+2581><U+2582>
13	numeric	phone	0	1	NA	NA	NA	4.649305e-18	4.564296e-01	-2.958371e-01	-2.958371e-01	-2.958371e-01	7.041629e-01	7.041629e-01	<U+2587><U+2581><U+2581><U+2581><U+2583>
14	numeric	email	0	1	NA	NA	NA	-9.229684e-18	2.873566e-01	-9.081746e-02	-9.081746e-02	-9.081746e-02	-9.081746e-02	9.091825e-01	<U+2587><U+2581><U+2581><U+2581><U+2581>
15	numeric	occyp_type	0	1	NA	NA	NA	-7.371003e-16	5.986895e+00	-1.083474e+01	-4.834744e+00	-8.347445e-01	7.165256e+00	7.165256e+00	<U+2585><U+2582><U+2586><U+2583><U+2587>
16	numeric	family_size	0	1	NA	NA	NA	5.774248e-17	9.136404e-01	-1.196603e+00	-1.966032e-01	-1.966032e-01	8.033968e-01	1.280340e+01	<U+2587><U+2581><U+2581><U+2581><U+2581>
17	numeric	begin_month	0	1	NA	NA	NA	7.423105e-16	1.653955e+01	-3.394819e+01	-1.294819e+01	2.051809e+00	1.405181e+01	2.605181e+01	<U+2585><U+2586><U+2586><U+2587><U+2587>
18	numeric	yrs_birth	0	1	NA	NA	NA	-1.243295e-15	1.149912e+01	-2.219292e+01	-9.192924e+00	-1.192924e+00	9.807076e+00	2.480708e+01	<U+2585><U+2587><U+2587><U+2586><U+2583>
19	numeric	yrs_employed	0	1	NA	NA	NA	7.955240e-15	3.753039e+02	-8.401467e+02	1.618533e+02	1.648533e+02	1.688533e+02	2.038533e+02	<U+2582><U+2581><U+2581><U+2581><U+2587>

# Count the missing values in each column and list only the columns that
# still contain any.
rec_tr %>%
  map_int(~ sum(is.na(.x))) %>%
  enframe(name = "variable", value = "na_count") %>%
  filter(na_count > 0)

A tibble: 0 × 2
variable	na_count
<chr>	<int>

No missing values in any column. Good to go now!

# Number of worker threads: all detected cores but one, and never fewer than
# one. detectCores() may return NA, and on a single-core machine the
# subtraction alone would give 0.
cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
cores

Parallel processing is available in ranger. Use the num.threads parameter for this.

# Random forest classifier with 100 trees, fitted by the ranger engine on
# `cores` threads.
m <- rand_forest(mode = "classification", trees = 100) %>%
  set_engine("ranger", num.threads = cores)

# Bundle the preprocessing recipe and the model specification in one workflow.
wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(m)

workflows::workflow is a great way to bundle your model and recipe in a single object. It helps you experiment with many models while reducing the danger of messing things up.

# Fit the recipe and the model on the training split.
fit_wf <- fit(wf, data = tr)

# Predicted class probabilities for the validation split.
preds <- predict(fit_wf, new_data = vl, type = "prob")

# Name each probability column after its class by stripping the `.pred_`
# prefix (instead of overwriting names by position), then attach the observed
# class as a factor so it can be used as the truth column.
t <- preds %>%
  rename_with(~ str_remove(.x, "^\\.pred_")) %>%
  mutate(y_true = as.factor(vl$credit))

Log loss is the metric used for this competition. It measures the quality of the predicted class probabilities in a multiclass classification problem.

# Multiclass log loss on the validation split: `y_true` is the observed class
# and columns `0` to `2` hold the predicted class probabilities.
t %>% mn_log_loss(truth = "y_true", `0`:`2`)

A tibble: 1 × 3
.metric	.estimator	.estimate
<chr>	<chr>	<dbl>
mn_log_loss	multiclass	0.722948

You are ready to make your first submission! Load your test set, make predictions with predict, and write them to a csv file.

# Load the test data the same way as the training data.
ts <- read_csv("test.csv") %>% mutate_if(is.character, as.factor)

# Predicted class probabilities for the test set.
preds <- predict(fit_wf, new_data = ts, type = "prob")

# Build the submission: the test-set index followed by one probability column
# per class, each named after its class by stripping the `.pred_` prefix
# rather than assigning names by position.
submission <- bind_cols(
  index = ts$index,
  rename_with(preds, ~ str_remove(.x, "^\\.pred_"))
)
write_csv(submission, "ss.csv")