library(tidyverse)
library(tidymodels)
library(skimr)
library(lubridate)
First, load the required packages with `library`.
Read the data with `readr::read_csv`, and chain `mutate_if` to convert the character columns to factors.
# Load the training data and convert every character column to a factor.
df <- read_csv("train.csv") %>%
  mutate_if(is.character, as.factor)
`skimr::skim` is a function that shows a detailed summary of a data frame.
# Summarise every column of the raw training data (type, missing values, distribution).
skim(df)
-- Data Summary ------------------------
Values
Name df
Number of rows 26457
Number of columns 20
_______________________
Column type frequency:
factor 8
numeric 12
________________________
Group variables None
-- Variable type: factor -------------------------------------------------------
skim_variable n_missing complete_rate ordered n_unique
1 gender 0 1 FALSE 2
2 car 0 1 FALSE 2
3 reality 0 1 FALSE 2
4 income_type 0 1 FALSE 5
5 edu_type 0 1 FALSE 5
6 family_type 0 1 FALSE 5
7 house_type 0 1 FALSE 6
8 occyp_type 8171 0.691 FALSE 18
top_counts
1 F: 17697, M: 8760
2 N: 16410, Y: 10047
3 Y: 17830, N: 8627
4 Wor: 13645, Com: 6202, Pen: 4449, Sta: 2154
5 Sec: 17995, Hig: 7162, Inc: 1020, Low: 257
6 Mar: 18196, Sin: 3496, Civ: 2123, Sep: 1539
7 Hou: 23653, Wit: 1257, Mun: 818, Ren: 429
8 Lab: 4512, Cor: 2646, Sal: 2539, Man: 2167
-- Variable type: numeric ------------------------------------------------------
skim_variable n_missing complete_rate mean sd p0 p25
1 index 0 1 13228 7638. 0 6614
2 child_num 0 1 0.429 0.747 0 0
3 income_total 0 1 187307. 101878. 27000 121500
4 DAYS_BIRTH 0 1 -15958. 4202. -25152 -19431
5 DAYS_EMPLOYED 0 1 59069. 137475. -15713 -3153
6 FLAG_MOBIL 0 1 1 0 1 1
7 work_phone 0 1 0.225 0.417 0 0
8 phone 0 1 0.294 0.456 0 0
9 email 0 1 0.0913 0.288 0 0
10 family_size 0 1 2.20 0.917 1 2
11 begin_month 0 1 -26.1 16.6 -60 -39
12 credit 0 1 1.52 0.702 0 1
p50 p75 p100 hist
1 13228 19842 26456 "\u2587\u2587\u2587\u2587\u2587"
2 0 1 19 "\u2587\u2581\u2581\u2581\u2581"
3 157500 225000 1575000 "\u2587\u2581\u2581\u2581\u2581"
4 -15547 -12446 -7705 "\u2583\u2586\u2587\u2587\u2585"
5 -1539 -407 365243 "\u2587\u2581\u2581\u2581\u2582"
6 1 1 1 "\u2581\u2581\u2587\u2581\u2581"
7 0 0 1 "\u2587\u2581\u2581\u2581\u2582"
8 0 1 1 "\u2587\u2581\u2581\u2581\u2583"
9 0 0 1 "\u2587\u2581\u2581\u2581\u2581"
10 2 3 20 "\u2587\u2581\u2581\u2581\u2581"
11 -24 -12 0 "\u2585\u2586\u2586\u2587\u2587"
12 2 2 2 "\u2582\u2581\u2583\u2581\u2587"
skim_type | skim_variable | n_missing | complete_rate | factor.ordered | factor.n_unique | factor.top_counts | numeric.mean | numeric.sd | numeric.p0 | numeric.p25 | numeric.p50 | numeric.p75 | numeric.p100 | numeric.hist | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <int> | <dbl> | <lgl> | <int> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | |
1 | factor | gender | 0 | 1.0000000 | FALSE | 2 | F: 17697, M: 8760 | NA | NA | NA | NA | NA | NA | NA | NA |
2 | factor | car | 0 | 1.0000000 | FALSE | 2 | N: 16410, Y: 10047 | NA | NA | NA | NA | NA | NA | NA | NA |
3 | factor | reality | 0 | 1.0000000 | FALSE | 2 | Y: 17830, N: 8627 | NA | NA | NA | NA | NA | NA | NA | NA |
4 | factor | income_type | 0 | 1.0000000 | FALSE | 5 | Wor: 13645, Com: 6202, Pen: 4449, Sta: 2154 | NA | NA | NA | NA | NA | NA | NA | NA |
5 | factor | edu_type | 0 | 1.0000000 | FALSE | 5 | Sec: 17995, Hig: 7162, Inc: 1020, Low: 257 | NA | NA | NA | NA | NA | NA | NA | NA |
6 | factor | family_type | 0 | 1.0000000 | FALSE | 5 | Mar: 18196, Sin: 3496, Civ: 2123, Sep: 1539 | NA | NA | NA | NA | NA | NA | NA | NA |
7 | factor | house_type | 0 | 1.0000000 | FALSE | 6 | Hou: 23653, Wit: 1257, Mun: 818, Ren: 429 | NA | NA | NA | NA | NA | NA | NA | NA |
8 | factor | occyp_type | 8171 | 0.6911592 | FALSE | 18 | Lab: 4512, Cor: 2646, Sal: 2539, Man: 2167 | NA | NA | NA | NA | NA | NA | NA | NA |
9 | numeric | index | 0 | 1.0000000 | NA | NA | NA | 1.322800e+04 | 7.637622e+03 | 0 | 6614 | 13228 | 19842 | 26456 | <U+2587><U+2587><U+2587><U+2587><U+2587> |
10 | numeric | child_num | 0 | 1.0000000 | NA | NA | NA | 4.286578e-01 | 7.473264e-01 | 0 | 0 | 0 | 1 | 19 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
11 | numeric | income_total | 0 | 1.0000000 | NA | NA | NA | 1.873065e+05 | 1.018784e+05 | 27000 | 121500 | 157500 | 225000 | 1575000 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
12 | numeric | DAYS_BIRTH | 0 | 1.0000000 | NA | NA | NA | -1.595805e+04 | 4.201589e+03 | -25152 | -19431 | -15547 | -12446 | -7705 | <U+2583><U+2586><U+2587><U+2587><U+2585> |
13 | numeric | DAYS_EMPLOYED | 0 | 1.0000000 | NA | NA | NA | 5.906875e+04 | 1.374754e+05 | -15713 | -3153 | -1539 | -407 | 365243 | <U+2587><U+2581><U+2581><U+2581><U+2582> |
14 | numeric | FLAG_MOBIL | 0 | 1.0000000 | NA | NA | NA | 1.000000e+00 | 0.000000e+00 | 1 | 1 | 1 | 1 | 1 | <U+2581><U+2581><U+2587><U+2581><U+2581> |
15 | numeric | work_phone | 0 | 1.0000000 | NA | NA | NA | 2.247420e-01 | 4.174202e-01 | 0 | 0 | 0 | 0 | 1 | <U+2587><U+2581><U+2581><U+2581><U+2582> |
16 | numeric | phone | 0 | 1.0000000 | NA | NA | NA | 2.942510e-01 | 4.557140e-01 | 0 | 0 | 0 | 1 | 1 | <U+2587><U+2581><U+2581><U+2581><U+2583> |
17 | numeric | email | 0 | 1.0000000 | NA | NA | NA | 9.128019e-02 | 2.880126e-01 | 0 | 0 | 0 | 0 | 1 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
18 | numeric | family_size | 0 | 1.0000000 | NA | NA | NA | 2.196848e+00 | 9.167167e-01 | 1 | 2 | 2 | 3 | 20 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
19 | numeric | begin_month | 0 | 1.0000000 | NA | NA | NA | -2.612329e+01 | 1.655955e+01 | -60 | -39 | -24 | -12 | 0 | <U+2585><U+2586><U+2586><U+2587><U+2587> |
20 | numeric | credit | 0 | 1.0000000 | NA | NA | NA | 1.519560e+00 | 7.022828e-01 | 0 | 1 | 2 | 2 | 2 | <U+2582><U+2581><U+2583><U+2581><U+2587> |
Split the training dataset into a training set and a validation set. Holding out a validation set helps prevent data leakage and makes it possible to assess your prediction accuracy.
# Fix the RNG seed so that the random train/validation split is reproducible.
set.seed(123)

# Keep 75% of the rows for training and hold out the rest for validation,
# stratifying the split on `credit`.
split <- df %>% initial_split(prop = 0.75, strata = "credit")
tr <- split %>% training() # training set
vl <- split %>% testing() # validation set
Record your preprocessing steps with `recipe`. The specific steps are below.
- Set `credit` as the target variable.
- Convert the features measured in days (`DAYS_BIRTH` and `DAYS_EMPLOYED`) to years.
- Drop `index`, `DAYS_BIRTH`, and `DAYS_EMPLOYED`.
- Replace missing values in `occyp_type` with 'unknown'.
- Convert the categorical predictors to integer codes.
- Center the predictors so that each one has mean 0.
# Preprocessing recipe for predicting `credit` from all other columns of the
# training set.
rec <- tr %>%
  recipe(credit ~ .) %>%
  # Turn the target into a factor. skip = TRUE applies this step when the
  # recipe is prepped but skips it when the recipe is baked on new data.
  step_mutate(credit = as.factor(credit), skip = TRUE) %>%
  # Convert day counts to whole years; the sign is flipped so that negative
  # day counts become positive years.
  # NOTE(review): DAYS_EMPLOYED also contains large positive values (max 365243
  # in the summary above), which turn into large negative yrs_employed values.
  # Consider handling them explicitly.
  step_mutate(
    yrs_birth = -ceiling(DAYS_BIRTH / 365),
    yrs_employed = -ceiling(DAYS_EMPLOYED / 365)
  ) %>%
  # Drop the row id and the raw day columns replaced by the year features.
  step_rm(index, DAYS_BIRTH, DAYS_EMPLOYED) %>%
  # Give missing occyp_type values their own "unknown" level.
  step_unknown(occyp_type) %>%
  # Encode the nominal predictors as integers.
  step_integer(all_nominal(), -all_outcomes()) %>%
  # Centre every predictor on mean 0.
  step_center(all_predictors(), -all_outcomes())

# The recipe object is named `rec`; `credit_rec` is not defined in this script.
print(rec)
Recipe
Inputs:
role #variables
outcome 1
predictor 19
Training data contained 26457 data points and 8171 incomplete rows.
Operations:
Variables removed index [trained]
Variable mutation for ~factor(credit) [trained]
Variable mutation for ~factor(gender), ~factor(car), ~factor(rea... [trained]
Variable mutation for ~ifelse(DAYS_EMPLOYED > 0, 0, DAYS_EMPLOYED) [trained]
Unknown factor level assignment for occyp_type [trained]
Yeo-Johnson transformation on DAYS_BIRTH, DAYS_EMPLOYED, begin_month [trained]
Centering and scaling for child_num, family_size [trained]
Log transformation on income_total [trained]
You can view the preprocessed training data with `juice`.
# Estimate the recipe on the training set and extract the processed training
# data.
rec_tr <- rec %>%
  prep(tr) %>%
  juice()

head(rec_tr)
gender | car | reality | child_num | income_total | income_type | edu_type | family_type | house_type | FLAG_MOBIL | work_phone | phone | email | occyp_type | family_size | begin_month | credit | yrs_birth | yrs_employed |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <fct> | <dbl> | <dbl> |
-0.3319726 | 0.6210563 | 0.3278399 | 0.5708094 | -29379.44 | -2.400867 | 0.9036891 | -0.370376 | -0.2791049 | 0 | -0.2271948 | -0.2958371 | -0.09081746 | -10.8347445 | 0.8033968 | -33.948191 | 0 | -11.192924 | 165.8533 |
0.6680274 | 0.6210563 | 0.3278399 | 1.5708094 | -29379.44 | 1.599133 | 0.9036891 | -0.370376 | 3.7208951 | 0 | -0.2271948 | 0.7041629 | -0.09081746 | -6.8347445 | 1.8033968 | -33.948191 | 0 | -8.192924 | 171.8533 |
0.6680274 | 0.6210563 | 0.3278399 | -0.4291906 | -51879.44 | -2.400867 | -2.0963109 | -0.370376 | -0.2791049 | 0 | 0.7728052 | -0.2958371 | 0.90918254 | -2.8347445 | -0.1966032 | 8.051809 | 0 | -11.192924 | 169.8533 |
-0.3319726 | -0.3789437 | 0.3278399 | -0.4291906 | -51879.44 | 1.599133 | -2.0963109 | -0.370376 | -0.2791049 | 0 | -0.2271948 | -0.2958371 | -0.09081746 | -2.8347445 | -0.1966032 | 24.051809 | 0 | -7.192924 | 164.8533 |
0.6680274 | 0.6210563 | 0.3278399 | 0.5708094 | 83120.56 | 1.599133 | 0.9036891 | -0.370376 | -0.2791049 | 0 | -0.2271948 | 0.7041629 | -0.09081746 | -2.8347445 | 0.8033968 | 4.051809 | 0 | -4.192924 | 164.8533 |
0.6680274 | -0.3789437 | -0.6721601 | 0.5708094 | 83120.56 | 1.599133 | 0.9036891 | -0.370376 | -0.2791049 | 0 | 0.7728052 | -0.2958371 | -0.09081746 | -0.8347445 | 0.8033968 | -8.948191 | 0 | -16.192924 | 161.8533 |
As you can see below, all features are now numeric. Also, every mean is (numerically) 0 because we applied `step_center` above.
# Summarise the preprocessed training data to check column types and means.
skim(rec_tr)
-- Data Summary ------------------------
Values
Name rec_tr
Number of rows 19842
Number of columns 19
_______________________
Column type frequency:
factor 1
numeric 18
________________________
Group variables None
-- Variable type: factor -------------------------------------------------------
skim_variable n_missing complete_rate ordered n_unique
1 credit 0 1 FALSE 3
top_counts
1 2: 12726, 1: 4700, 0: 2416
-- Variable type: numeric ------------------------------------------------------
skim_variable n_missing complete_rate mean sd p0
1 gender 0 1 -1.03e-16 0.471 -0.332
2 car 0 1 8.70e-17 0.485 -0.379
3 reality 0 1 9.21e-17 0.469 -0.672
4 child_num 0 1 -2.68e-17 0.743 -0.429
5 income_total 0 1 -8.67e-12 101083. -159879.
6 income_type 0 1 -3.82e-17 1.74 -2.40
7 edu_type 0 1 3.28e-16 1.34 -3.10
8 family_type 0 1 4.62e-17 0.950 -1.37
9 house_type 0 1 -1.01e-16 0.943 -1.28
10 FLAG_MOBIL 0 1 0 0 0
11 work_phone 0 1 -6.31e-18 0.419 -0.227
12 phone 0 1 4.65e-18 0.456 -0.296
13 email 0 1 -9.23e-18 0.287 -0.0908
14 occyp_type 0 1 -7.37e-16 5.99 -10.8
15 family_size 0 1 5.77e-17 0.914 -1.20
16 begin_month 0 1 7.42e-16 16.5 -33.9
17 yrs_birth 0 1 -1.24e-15 11.5 -22.2
18 yrs_employed 0 1 7.96e-15 375. -840.
p25 p50 p75 p100
1 -0.332 -0.332 0.668 0.668
2 -0.379 -0.379 0.621 0.621
3 -0.672 0.328 0.328 0.328
4 -0.429 -0.429 0.571 13.6
5 -65379. -29379. 38121. 1388121.
6 -1.40 1.60 1.60 1.60
7 -2.10 0.904 0.904 0.904
8 -0.370 -0.370 -0.370 2.63
9 -0.279 -0.279 -0.279 3.72
10 0 0 0 0
11 -0.227 -0.227 -0.227 0.773
12 -0.296 -0.296 0.704 0.704
13 -0.0908 -0.0908 -0.0908 0.909
14 -4.83 -0.835 7.17 7.17
15 -0.197 -0.197 0.803 12.8
16 -12.9 2.05 14.1 26.1
17 -9.19 -1.19 9.81 24.8
18 162. 165. 169. 204.
hist
1 "\u2587\u2581\u2581\u2581\u2583"
2 "\u2587\u2581\u2581\u2581\u2585"
3 "\u2583\u2581\u2581\u2581\u2587"
4 "\u2587\u2581\u2581\u2581\u2581"
5 "\u2587\u2581\u2581\u2581\u2581"
6 "\u2583\u2582\u2581\u2581\u2587"
7 "\u2581\u2583\u2581\u2581\u2587"
8 "\u2581\u2587\u2581\u2582\u2581"
9 "\u2587\u2581\u2581\u2581\u2581"
10 "\u2581\u2581\u2587\u2581\u2581"
11 "\u2587\u2581\u2581\u2581\u2582"
12 "\u2587\u2581\u2581\u2581\u2583"
13 "\u2587\u2581\u2581\u2581\u2581"
14 "\u2585\u2582\u2586\u2583\u2587"
15 "\u2587\u2581\u2581\u2581\u2581"
16 "\u2585\u2586\u2586\u2587\u2587"
17 "\u2585\u2587\u2587\u2586\u2583"
18 "\u2582\u2581\u2581\u2581\u2587"
skim_type | skim_variable | n_missing | complete_rate | factor.ordered | factor.n_unique | factor.top_counts | numeric.mean | numeric.sd | numeric.p0 | numeric.p25 | numeric.p50 | numeric.p75 | numeric.p100 | numeric.hist | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <int> | <dbl> | <lgl> | <int> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | |
1 | factor | credit | 0 | 1 | FALSE | 3 | 2: 12726, 1: 4700, 0: 2416 | NA | NA | NA | NA | NA | NA | NA | NA |
2 | numeric | gender | 0 | 1 | NA | NA | NA | -1.033030e-16 | 4.709331e-01 | -3.319726e-01 | -3.319726e-01 | -3.319726e-01 | 6.680274e-01 | 6.680274e-01 | <U+2587><U+2581><U+2581><U+2581><U+2583> |
3 | numeric | car | 0 | 1 | NA | NA | NA | 8.698253e-17 | 4.851363e-01 | -3.789437e-01 | -3.789437e-01 | -3.789437e-01 | 6.210563e-01 | 6.210563e-01 | <U+2587><U+2581><U+2581><U+2581><U+2585> |
4 | numeric | reality | 0 | 1 | NA | NA | NA | 9.209199e-17 | 4.694380e-01 | -6.721601e-01 | -6.721601e-01 | 3.278399e-01 | 3.278399e-01 | 3.278399e-01 | <U+2583><U+2581><U+2581><U+2581><U+2587> |
5 | numeric | child_num | 0 | 1 | NA | NA | NA | -2.679652e-17 | 7.430613e-01 | -4.291906e-01 | -4.291906e-01 | -4.291906e-01 | 5.708094e-01 | 1.357081e+01 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
6 | numeric | income_total | 0 | 1 | NA | NA | NA | -8.665421e-12 | 1.010829e+05 | -1.598794e+05 | -6.537944e+04 | -2.937944e+04 | 3.812056e+04 | 1.388121e+06 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
7 | numeric | income_type | 0 | 1 | NA | NA | NA | -3.816574e-17 | 1.737955e+00 | -2.400867e+00 | -1.400867e+00 | 1.599133e+00 | 1.599133e+00 | 1.599133e+00 | <U+2583><U+2582><U+2581><U+2581><U+2587> |
8 | numeric | edu_type | 0 | 1 | NA | NA | NA | 3.279951e-16 | 1.341700e+00 | -3.096311e+00 | -2.096311e+00 | 9.036891e-01 | 9.036891e-01 | 9.036891e-01 | <U+2581><U+2583><U+2581><U+2581><U+2587> |
9 | numeric | family_type | 0 | 1 | NA | NA | NA | 4.615038e-17 | 9.498038e-01 | -1.370376e+00 | -3.703760e-01 | -3.703760e-01 | -3.703760e-01 | 2.629624e+00 | <U+2581><U+2587><U+2581><U+2582><U+2581> |
10 | numeric | house_type | 0 | 1 | NA | NA | NA | -1.005445e-16 | 9.431263e-01 | -1.279105e+00 | -2.791049e-01 | -2.791049e-01 | -2.791049e-01 | 3.720895e+00 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
11 | numeric | FLAG_MOBIL | 0 | 1 | NA | NA | NA | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | <U+2581><U+2581><U+2587><U+2581><U+2581> |
12 | numeric | work_phone | 0 | 1 | NA | NA | NA | -6.307236e-18 | 4.190301e-01 | -2.271948e-01 | -2.271948e-01 | -2.271948e-01 | -2.271948e-01 | 7.728052e-01 | <U+2587><U+2581><U+2581><U+2581><U+2582> |
13 | numeric | phone | 0 | 1 | NA | NA | NA | 4.649305e-18 | 4.564296e-01 | -2.958371e-01 | -2.958371e-01 | -2.958371e-01 | 7.041629e-01 | 7.041629e-01 | <U+2587><U+2581><U+2581><U+2581><U+2583> |
14 | numeric | email | 0 | 1 | NA | NA | NA | -9.229684e-18 | 2.873566e-01 | -9.081746e-02 | -9.081746e-02 | -9.081746e-02 | -9.081746e-02 | 9.091825e-01 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
15 | numeric | occyp_type | 0 | 1 | NA | NA | NA | -7.371003e-16 | 5.986895e+00 | -1.083474e+01 | -4.834744e+00 | -8.347445e-01 | 7.165256e+00 | 7.165256e+00 | <U+2585><U+2582><U+2586><U+2583><U+2587> |
16 | numeric | family_size | 0 | 1 | NA | NA | NA | 5.774248e-17 | 9.136404e-01 | -1.196603e+00 | -1.966032e-01 | -1.966032e-01 | 8.033968e-01 | 1.280340e+01 | <U+2587><U+2581><U+2581><U+2581><U+2581> |
17 | numeric | begin_month | 0 | 1 | NA | NA | NA | 7.423105e-16 | 1.653955e+01 | -3.394819e+01 | -1.294819e+01 | 2.051809e+00 | 1.405181e+01 | 2.605181e+01 | <U+2585><U+2586><U+2586><U+2587><U+2587> |
18 | numeric | yrs_birth | 0 | 1 | NA | NA | NA | -1.243295e-15 | 1.149912e+01 | -2.219292e+01 | -9.192924e+00 | -1.192924e+00 | 9.807076e+00 | 2.480708e+01 | <U+2585><U+2587><U+2587><U+2586><U+2583> |
19 | numeric | yrs_employed | 0 | 1 | NA | NA | NA | 7.955240e-15 | 3.753039e+02 | -8.401467e+02 | 1.618533e+02 | 1.648533e+02 | 1.688533e+02 | 2.038533e+02 | <U+2582><U+2581><U+2581><U+2581><U+2587> |
# Count the missing values in each column and list only the columns that have
# at least one.
rec_tr %>%
  map_df(~ sum(is.na(.))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "na_count"
  ) %>%
  filter(na_count > 0)
variable | na_count |
---|---|
<chr> | <int> |
No column contains missing values. Good to go!
# Number of threads for model fitting: leave one core free, but never go below
# one. detectCores() can return NA, and subtracting 1 on a single-core machine
# would give 0.
cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
cores
Parallel processing is available in `ranger`. Use the `num.threads` parameter to enable it.
# Random forest classifier with 100 trees, fitted by the ranger engine using
# `cores` threads.
m <- rand_forest(trees = 100) %>%
  set_engine("ranger", num.threads = cores) %>%
  set_mode("classification")

# Bundle the model and the preprocessing recipe into a single workflow object.
wf <- workflow() %>%
  add_model(m) %>%
  add_recipe(rec)
`workflows::workflow` is a great way to bundle your model and recipe into a single object. It lets you experiment with many models while reducing the risk of mixing things up.
# Fit the workflow (recipe + model) on the training set.
fit_wf <- wf %>% fit(data = tr)

# Predicted class probabilities for the validation set.
preds <- predict(fit_wf, vl, type = "prob")

# Put the probabilities next to the observed class. Naming the column in
# bind_cols() avoids the automatic rename of the unnamed vector.
t <- bind_cols(preds, y_true = vl$credit)
colnames(t) <- c("0", "1", "2", "y_true")

# The truth column is converted to a factor before it is scored.
t <- t %>% mutate(y_true = as.factor(y_true))
Log loss is the evaluation metric for this competition. It measures the quality of the predicted class probabilities in a multiclass classification problem.
# Multiclass log loss on the validation set: columns `0` to `2` hold the
# predicted class probabilities and `y_true` holds the observed class.
mn_log_loss(t, `0`:`2`, truth='y_true')
.metric | .estimator | .estimate |
---|---|---|
<chr> | <chr> | <dbl> |
mn_log_loss | multiclass | 0.722948 |
You are now ready to make your first submission! Load the test set, make predictions with `predict`, and write them to a CSV file.
# Load the test set with the same character-to-factor conversion used for the
# training data.
ts <- read_csv("test.csv") %>%
  mutate_if(is.character, as.factor)

# Predicted class probabilities for the test rows.
preds <- predict(fit_wf, ts, type = "prob")

# Submission table: the row index followed by one probability column per class.
submission <- bind_cols(index = ts$index, preds)
colnames(submission) <- c("index", "0", "1", "2")
write_csv(submission, "ss.csv")