import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Pandas display settings: make wide frames render in full instead of being
# truncated (wide console, up to 500 rows/columns, 4 decimal places).
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)
# None means "do not truncate cell contents". The former -1 sentinel has been
# deprecated since pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
# Read the training applications CSV into a DataFrame, then preview its
# first five rows (the last expression is what a notebook cell displays).
data_path = "home-credit-default-risk/application_train.csv"
pdf_data = pd.read_csv(filepath_or_buffer=data_path)
pdf_data.head(n=5)
| | SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.0188 | -9461 | -637 | -3648.0 | -2120 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | WEDNESDAY | 10 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 0.0830 | 0.2629 | 0.1394 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | 0.9722 | 0.6341 | 0.0144 | 0.0000 | 0.0690 | 0.0833 | 0.1250 | 0.0377 | 0.022 | 0.0198 | 0.0 | 0.0 | 0.0250 | 0.0369 | 0.9722 | 0.6243 | 0.0144 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0375 | 0.0205 | 0.0193 | 0.0000 | 0.00 | reg oper account | block of flats | 0.0149 | Stone, brick | No | 2.0 | 2.0 | 2.0 | 2.0 | -1134.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | Family | State servant | Higher education | Married | House / apartment | 0.0035 | -16765 | -1188 | -1186.0 | -291 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Core staff | 2.0 | 1 | 1 | MONDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | School | 0.3113 | 0.6222 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | 0.9851 | 0.8040 | 0.0497 | 0.0806 | 0.0345 | 0.2917 | 0.3333 | 0.0128 | 0.079 | 0.0554 | 0.0 | 0.0 | 0.0968 | 0.0529 | 0.9851 | 0.7987 | 0.0608 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0132 | 0.0787 | 0.0558 | 0.0039 | 0.01 | reg oper account | block of flats | 0.0714 | Block | No | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | 135000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.0100 | -19046 | -225 | -4260.0 | -2531 | 26.0 | 1 | 1 | 1 | 1 | 1 | 0 | Laborers | 1.0 | 2 | 2 | MONDAY | 9 | 0 | 0 | 0 | 0 | 0 | 0 | Government | NaN | 0.5559 | 0.7296 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -815.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | 297000.0 | Unaccompanied | Working | Secondary / secondary special | Civil marriage | House / apartment | 0.0080 | -19005 | -3039 | -9833.0 | -2437 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Laborers | 2.0 | 2 | 2 | WEDNESDAY | 17 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | NaN | 0.6504 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 0.0 | 2.0 | 0.0 | -617.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | 513000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.0287 | -19932 | -3038 | -4311.0 | -3458 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Core staff | 1.0 | 2 | 2 | THURSDAY | 11 | 0 | 0 | 0 | 0 | 1 | 1 | Religion | NaN | 0.3227 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -1106.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
# Read the previously generated column-profile report for the training data
# and show the whole frame (bare expression = notebook cell output).
meta_path = "../02_pandas/reports/report_application_train.csv"
pdf_meta = pd.read_csv(filepath_or_buffer=meta_path)
pdf_meta
| | name | sub_type | n_distinct | n_miss | n_negative | n_zeros | 25% | 50% | 75% | count | max | mean | min | std | sample_0 | sample_1 | sample_2 | sample_3 | sample_4 | sample_5 | sample_6 | sample_7 | sample_8 | sample_9 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | SK_ID_CURR | int64 | 307511 (100.00%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 189145.5000 | 278202.0000 | 367142.5000 | 307511.0 | 4.5626e+05 | 2.7818e+05 | 1.0000e+05 | 102790.1753 | 326682 | 414578 | 432657 | 346257 | 169928 | 228494 | 305986 | 450918 | 393627 | 121604 |
1 | TARGET | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 282686 (91.93%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 8.0729e-02 | 0.0000e+00 | 0.2724 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
2 | NAME_CONTRACT_TYPE | object | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | Cash loans | Cash loans | Revolving loans | Cash loans | Cash loans | Cash loans | Cash loans | Cash loans | Cash loans | Cash loans |
3 | CODE_GENDER | object | 3 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | M | M | M | M | M | F | M | F | F | F |
4 | FLAG_OWN_CAR | object | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | Y | Y | N | N | N | N | N | Y | N | N |
5 | FLAG_OWN_REALTY | object | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | Y | N | Y | Y | Y | Y | Y | N | Y | Y |
6 | CNT_CHILDREN | int64 | 15 (0.00%) | 0 (0.00%) | 0 (0.00%) | 215371 (70.04%) | 0.0000 | 0.0000 | 1.0000 | 307511.0 | 1.9000e+01 | 4.1705e-01 | 0.0000e+00 | 0.7221 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 |
7 | AMT_INCOME_TOTAL | float64 | 2548 (0.83%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 112500.0000 | 147150.0000 | 202500.0000 | 307511.0 | 1.1700e+08 | 1.6880e+05 | 2.5650e+04 | 237123.1463 | 166500.0 | 450000.0 | 157500.0 | 135000.0 | 202500.0 | 117000.0 | 90000.0 | 180000.0 | 157500.0 | 90000.0 |
8 | AMT_CREDIT | float64 | 5603 (1.82%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 270000.0000 | 513531.0000 | 808650.0000 | 307511.0 | 4.0500e+06 | 5.9903e+05 | 4.5000e+04 | 402490.7770 | 254700.0 | 1381113.0 | 450000.0 | 1764000.0 | 203760.0 | 254700.0 | 538704.0 | 630000.0 | 679500.0 | 755190.0 |
9 | AMT_ANNUITY | float64 | 13672 (4.45%) | 12 (0.00%) | 0 (0.00%) | 0 (0.00%) | 16524.0000 | 24903.0000 | 34596.0000 | 307499.0 | 2.5803e+05 | 2.7109e+04 | 1.6155e+03 | 14493.7373 | 25191.0 | 39712.5 | 22500.0 | 48510.0 | 16227.0 | 25191.0 | 26046.0 | 23274.0 | 36202.5 | 36459.0 |
10 | AMT_GOODS_PRICE | float64 | 1002 (0.33%) | 278 (0.09%) | 0 (0.00%) | 0 (0.00%) | 238500.0000 | 450000.0000 | 679500.0000 | 307233.0 | 4.0500e+06 | 5.3840e+05 | 4.0500e+04 | 369446.4605 | 225000.0 | 1206000.0 | 450000.0 | 1764000.0 | 180000.0 | 225000.0 | 481500.0 | 630000.0 | 679500.0 | 675000.0 |
11 | NAME_TYPE_SUITE | object | 7 (0.00%) | 1292 (0.42%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | Unaccompanied | Unaccompanied | Unaccompanied | Unaccompanied | Unaccompanied | Unaccompanied | Unaccompanied | Unaccompanied | Family | Family |
12 | NAME_INCOME_TYPE | object | 8 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | State servant | Working | Working | Commercial associate | Working | Pensioner | Working | Working | Working | Working |
13 | NAME_EDUCATION_TYPE | object | 5 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | Higher education | Incomplete higher | Secondary / secondary special | Secondary / secondary special | Secondary / secondary special | Secondary / secondary special | Secondary / secondary special | Secondary / secondary special | Secondary / secondary special | Secondary / secondary special |
14 | NAME_FAMILY_STATUS | object | 6 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | Married | Married | Married | Married | Separated | Married | Single / not married | Married | Married | Married |
15 | NAME_HOUSING_TYPE | object | 6 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | House / apartment | House / apartment | House / apartment | House / apartment | House / apartment | House / apartment | House / apartment | House / apartment | House / apartment | With parents |
16 | REGION_POPULATION_RELATIVE | float64 | 81 (0.03%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0.0100 | 0.0188 | 0.0287 | 307511.0 | 7.2508e-02 | 2.0868e-02 | 2.9000e-04 | 0.0138 | 0.02461 | 0.018634 | 0.015221 | 0.008019 | 0.0105 | 0.008865999999999999 | 0.018634 | 0.0105 | 0.01885 | 0.030755 |
17 | DAYS_BIRTH | int64 | 17460 (5.68%) | 0 (0.00%) | 307511 (100.00%) | 0 (0.00%) | -19682.0000 | -15750.0000 | -12413.0000 | 307511.0 | -7.4890e+03 | -1.6037e+04 | -2.5229e+04 | 4363.9886 | -21882 | -14026 | -9905 | -11946 | -10493 | -24264 | -15251 | -14263 | -11179 | -10477 |
18 | DAYS_EMPLOYED | int64 | 12574 (4.09%) | 0 (0.00%) | 252135 (81.99%) | 2 (0.00%) | -2760.0000 | -1213.0000 | -289.0000 | 307511.0 | 3.6524e+05 | 6.3815e+04 | -1.7912e+04 | 141275.7665 | -2987 | -270 | -2691 | -1526 | -656 | 365243 | -1984 | -481 | -687 | -1400 |
19 | DAYS_REGISTRATION | float64 | 15688 (5.10%) | 0 (0.00%) | 307431 (99.97%) | 80 (0.03%) | -7479.5000 | -4504.0000 | -2010.0000 | 307511.0 | 0.0000e+00 | -4.9861e+03 | -2.4672e+04 | 3522.8863 | -11125.0 | -1625.0 | -4725.0 | -1513.0 | -2389.0 | -87.0 | -6933.0 | -1315.0 | -1491.0 | -5034.0 |
20 | DAYS_ID_PUBLISH | int64 | 6168 (2.01%) | 0 (0.00%) | 307495 (99.99%) | 16 (0.01%) | -4299.0000 | -3254.0000 | -1720.0000 | 307511.0 | 0.0000e+00 | -2.9942e+03 | -7.1970e+03 | 1509.4504 | -3984 | -4768 | -2549 | -4392 | -2526 | -4388 | -4396 | -4830 | -2742 | -1625 |
21 | OWN_CAR_AGE | float64 | 62 (0.02%) | 202929 (65.99%) | 0 (0.00%) | 2134 (0.69%) | 5.0000 | 9.0000 | 15.0000 | 104582.0 | 9.1000e+01 | 1.2061e+01 | 0.0000e+00 | 11.9448 | 4.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | 18.0 | NaN | NaN |
22 | FLAG_MOBIL | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 1 (0.00%) | 1.0000 | 1.0000 | 1.0000 | 307511.0 | 1.0000e+00 | 1.0000e+00 | 0.0000e+00 | 0.0018 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
23 | FLAG_EMP_PHONE | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 55386 (18.01%) | 1.0000 | 1.0000 | 1.0000 | 307511.0 | 1.0000e+00 | 8.1989e-01 | 0.0000e+00 | 0.3843 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
24 | FLAG_WORK_PHONE | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 246203 (80.06%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 1.9937e-01 | 0.0000e+00 | 0.3995 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 |
25 | FLAG_CONT_MOBILE | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 574 (0.19%) | 1.0000 | 1.0000 | 1.0000 | 307511.0 | 1.0000e+00 | 9.9813e-01 | 0.0000e+00 | 0.0432 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
26 | FLAG_PHONE | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 221080 (71.89%) | 0.0000 | 0.0000 | 1.0000 | 307511.0 | 1.0000e+00 | 2.8107e-01 | 0.0000e+00 | 0.4495 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
27 | FLAG_EMAIL | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 290069 (94.33%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 5.6720e-02 | 0.0000e+00 | 0.2313 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
28 | OCCUPATION_TYPE | object | 18 (0.01%) | 96391 (31.35%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | NaN | Drivers | Laborers | NaN | Laborers | NaN | Laborers | NaN | Managers | Laborers |
29 | CNT_FAM_MEMBERS | float64 | 17 (0.01%) | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 2.0000 | 2.0000 | 3.0000 | 307509.0 | 2.0000e+01 | 2.1527e+00 | 1.0000e+00 | 0.9107 | 2.0 | 3.0 | 2.0 | 2.0 | 1.0 | 2.0 | 1.0 | 3.0 | 4.0 | 2.0 |
30 | REGION_RATING_CLIENT | int64 | 3 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 2.0000 | 2.0000 | 2.0000 | 307511.0 | 3.0000e+00 | 2.0525e+00 | 1.0000e+00 | 0.5090 | 2 | 2 | 2 | 2 | 3 | 2 | 2 | 3 | 2 | 2 |
31 | REGION_RATING_CLIENT_W_CITY | int64 | 3 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 2.0000 | 2.0000 | 2.0000 | 307511.0 | 3.0000e+00 | 2.0315e+00 | 1.0000e+00 | 0.5027 | 2 | 2 | 2 | 2 | 3 | 2 | 2 | 3 | 2 | 2 |
32 | WEEKDAY_APPR_PROCESS_START | object | 7 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | THURSDAY | MONDAY | WEDNESDAY | FRIDAY | WEDNESDAY | MONDAY | FRIDAY | MONDAY | TUESDAY | FRIDAY |
33 | HOUR_APPR_PROCESS_START | int64 | 24 (0.01%) | 0 (0.00%) | 0 (0.00%) | 40 (0.01%) | 10.0000 | 12.0000 | 14.0000 | 307511.0 | 2.3000e+01 | 1.2063e+01 | 0.0000e+00 | 3.2658 | 17 | 10 | 9 | 10 | 15 | 17 | 9 | 13 | 12 | 13 |
34 | REG_REGION_NOT_LIVE_REGION | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 302854 (98.49%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 1.5144e-02 | 0.0000e+00 | 0.1221 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
35 | REG_REGION_NOT_WORK_REGION | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 291899 (94.92%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 5.0769e-02 | 0.0000e+00 | 0.2195 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
36 | LIVE_REGION_NOT_WORK_REGION | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 295008 (95.93%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 4.0659e-02 | 0.0000e+00 | 0.1975 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
37 | REG_CITY_NOT_LIVE_CITY | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 283472 (92.18%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 7.8173e-02 | 0.0000e+00 | 0.2684 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
38 | REG_CITY_NOT_WORK_CITY | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 236644 (76.95%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 2.3045e-01 | 0.0000e+00 | 0.4211 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
39 | LIVE_CITY_NOT_WORK_CITY | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 252296 (82.04%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 1.7955e-01 | 0.0000e+00 | 0.3838 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
40 | ORGANIZATION_TYPE | object | 58 (0.02%) | 0 (0.00%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | Business Entity Type 2 | Business Entity Type 3 | Business Entity Type 2 | Business Entity Type 3 | Self-employed | XNA | Self-employed | Business Entity Type 3 | Trade: type 2 | Medicine |
41 | EXT_SOURCE_1 | float64 | 114584 (37.26%) | 173378 (56.38%) | 0 (0.00%) | 0 (0.00%) | 0.3340 | 0.5060 | 0.6751 | 134133.0 | 9.6269e-01 | 5.0213e-01 | 1.4568e-02 | 0.2111 | NaN | 0.3569777836552319 | NaN | 0.4949180654446097 | NaN | NaN | NaN | 0.5392576763343839 | 0.6575317720639661 | NaN |
42 | EXT_SOURCE_2 | float64 | 119831 (38.97%) | 660 (0.21%) | 0 (0.00%) | 0 (0.00%) | 0.3925 | 0.5660 | 0.6636 | 306851.0 | 8.5500e-01 | 5.1439e-01 | 8.1736e-08 | 0.1911 | 0.5784756531591329 | 0.5152771000292551 | 0.4248161058463641 | 0.6670352420910267 | 0.0962461218101204 | 0.39473825561809134 | 0.7881138083280189 | 0.5579841354415206 | 0.5916561691175296 | 0.6675741251599911 |
43 | EXT_SOURCE_3 | float64 | 814 (0.26%) | 60965 (19.83%) | 0 (0.00%) | 0 (0.00%) | 0.3706 | 0.5353 | 0.6691 | 246546.0 | 8.9601e-01 | 5.1085e-01 | 5.2727e-04 | 0.1948 | 0.5971924268337128 | 0.5226973172821112 | 0.31547215492577346 | NaN | 0.4170996682522097 | 0.6058362647264226 | NaN | 0.7490217048463391 | 0.4956658291397297 | 0.7394117535524816 |
44 | APARTMENTS_AVG | float64 | 2339 (0.76%) | 156061 (50.75%) | 0 (0.00%) | 751 (0.24%) | 0.0577 | 0.0876 | 0.1485 | 151450.0 | 1.0000e+00 | 1.1744e-01 | 0.0000e+00 | 0.1082 | NaN | 0.1485 | NaN | NaN | NaN | 0.2216 | 0.0629 | 0.0825 | 0.1031 | 0.1825 |
45 | BASEMENTAREA_AVG | float64 | 3780 (1.23%) | 179943 (58.52%) | 0 (0.00%) | 14745 (4.79%) | 0.0442 | 0.0763 | 0.1122 | 127568.0 | 1.0000e+00 | 8.8442e-02 | 0.0000e+00 | 0.0824 | NaN | 0.0991 | NaN | NaN | NaN | 0.0776 | 0.0756 | 0.0788 | NaN | 0.1322 |
46 | YEARS_BEGINEXPLUATATION_AVG | float64 | 285 (0.09%) | 150007 (48.78%) | 0 (0.00%) | 514 (0.17%) | 0.9767 | 0.9816 | 0.9866 | 157504.0 | 1.0000e+00 | 9.7773e-01 | 0.0000e+00 | 0.0592 | 0.9856 | 0.9871 | NaN | NaN | NaN | 0.9826 | 0.9831 | 0.9786 | 0.9771 | 0.9861 |
47 | YEARS_BUILD_AVG | float64 | 149 (0.05%) | 204488 (66.50%) | 0 (0.00%) | 102 (0.03%) | 0.6872 | 0.7552 | 0.8232 | 103023.0 | 1.0000e+00 | 7.5247e-01 | 0.0000e+00 | 0.1133 | NaN | 0.8232 | NaN | NaN | NaN | NaN | 0.7688 | 0.7076 | 0.6872 | 0.8096 |
48 | COMMONAREA_AVG | float64 | 3181 (1.03%) | 214865 (69.87%) | 0 (0.00%) | 8442 (2.75%) | 0.0078 | 0.0211 | 0.0515 | 92646.0 | 1.0000e+00 | 4.4621e-02 | 0.0000e+00 | 0.0760 | NaN | 0.0889 | NaN | NaN | NaN | NaN | NaN | 0.0079 | NaN | 0.0378 |
49 | ELEVATORS_AVG | float64 | 257 (0.08%) | 163891 (53.30%) | 0 (0.00%) | 85718 (27.87%) | 0.0000 | 0.0000 | 0.1200 | 143620.0 | 1.0000e+00 | 7.8942e-02 | 0.0000e+00 | 0.1346 | 0.08 | 0.16 | NaN | NaN | NaN | 0.08 | 0.0 | 0.0 | NaN | 0.2 |
50 | ENTRANCES_AVG | float64 | 285 (0.09%) | 154828 (50.35%) | 0 (0.00%) | 323 (0.11%) | 0.0690 | 0.1379 | 0.2069 | 152683.0 | 1.0000e+00 | 1.4972e-01 | 0.0000e+00 | 0.1000 | 0.069 | 0.1379 | NaN | NaN | NaN | 0.0345 | 0.1379 | 0.1379 | 0.2069 | 0.1724 |
51 | FLOORSMAX_AVG | float64 | 403 (0.13%) | 153020 (49.76%) | 0 (0.00%) | 2938 (0.96%) | 0.1667 | 0.1667 | 0.3333 | 154491.0 | 1.0000e+00 | 2.2628e-01 | 0.0000e+00 | 0.1446 | 0.3333 | 0.3333 | NaN | NaN | NaN | 0.3333 | 0.1667 | 0.1667 | 0.1667 | 0.3333 |
52 | FLOORSMIN_AVG | float64 | 305 (0.10%) | 208642 (67.85%) | 0 (0.00%) | 2320 (0.75%) | 0.0833 | 0.2083 | 0.3750 | 98869.0 | 1.0000e+00 | 2.3189e-01 | 0.0000e+00 | 0.1614 | NaN | 0.375 | NaN | NaN | NaN | NaN | NaN | 0.2083 | NaN | 0.375 |
53 | LANDAREA_AVG | float64 | 3527 (1.15%) | 182590 (59.38%) | 0 (0.00%) | 15600 (5.07%) | 0.0187 | 0.0481 | 0.0856 | 124921.0 | 1.0000e+00 | 6.6333e-02 | 0.0000e+00 | 0.0812 | NaN | 0.1127 | NaN | NaN | NaN | 0.0911 | 0.0151 | 0.0203 | NaN | 0.1238 |
54 | LIVINGAPARTMENTS_AVG | float64 | 1868 (0.61%) | 210199 (68.35%) | 0 (0.00%) | 418 (0.14%) | 0.0504 | 0.0756 | 0.1210 | 97312.0 | 1.0000e+00 | 1.0077e-01 | 0.0000e+00 | 0.0926 | NaN | 0.121 | NaN | NaN | NaN | NaN | 0.0504 | 0.0672 | 0.0807 | 0.1488 |
55 | LIVINGAREA_AVG | float64 | 5199 (1.69%) | 154350 (50.19%) | 0 (0.00%) | 284 (0.09%) | 0.0453 | 0.0745 | 0.1299 | 153161.0 | 1.0000e+00 | 1.0740e-01 | 0.0000e+00 | 0.1106 | 0.0739 | 0.0915 | NaN | NaN | NaN | 0.0582 | 0.0556 | 0.0703 | 0.0841 | 0.1824 |
56 | NONLIVINGAPARTMENTS_AVG | float64 | 386 (0.13%) | 213514 (69.43%) | 0 (0.00%) | 54549 (17.74%) | 0.0000 | 0.0000 | 0.0039 | 93997.0 | 1.0000e+00 | 8.8087e-03 | 0.0000e+00 | 0.0477 | NaN | 0.0 | NaN | NaN | NaN | NaN | 0.0039 | 0.0 | 0.0154 | 0.0 |
57 | NONLIVINGAREA_AVG | float64 | 3290 (1.07%) | 169682 (55.18%) | 0 (0.00%) | 58735 (19.10%) | 0.0000 | 0.0036 | 0.0277 | 137829.0 | 1.0000e+00 | 2.8358e-02 | 0.0000e+00 | 0.0695 | NaN | 0.0 | NaN | NaN | NaN | 0.2242 | 0.0188 | 0.0 | 0.0128 | 0.0022 |
58 | APARTMENTS_MODE | float64 | 760 (0.25%) | 156061 (50.75%) | 0 (0.00%) | 976 (0.32%) | 0.0525 | 0.0840 | 0.1439 | 151450.0 | 1.0000e+00 | 1.1423e-01 | 0.0000e+00 | 0.1079 | NaN | 0.1513 | NaN | NaN | NaN | 0.2258 | 0.0641 | 0.084 | 0.105 | 0.1859 |
59 | BASEMENTAREA_MODE | float64 | 3841 (1.25%) | 179943 (58.52%) | 0 (0.00%) | 16598 (5.40%) | 0.0407 | 0.0746 | 0.1124 | 127568.0 | 1.0000e+00 | 8.7543e-02 | 0.0000e+00 | 0.0843 | NaN | 0.1028 | NaN | NaN | NaN | 0.0806 | 0.0785 | 0.0818 | NaN | 0.1372 |
60 | YEARS_BEGINEXPLUATATION_MODE | float64 | 221 (0.07%) | 150007 (48.78%) | 0 (0.00%) | 142 (0.05%) | 0.9767 | 0.9816 | 0.9866 | 157504.0 | 1.0000e+00 | 9.7707e-01 | 0.0000e+00 | 0.0646 | 0.9856 | 0.9871 | NaN | NaN | NaN | 0.9826 | 0.9831 | 0.9786 | 0.9772 | 0.9861 |
61 | YEARS_BUILD_MODE | float64 | 154 (0.05%) | 204488 (66.50%) | 0 (0.00%) | 103 (0.03%) | 0.6994 | 0.7648 | 0.8236 | 103023.0 | 1.0000e+00 | 7.5964e-01 | 0.0000e+00 | 0.1101 | NaN | 0.8301 | NaN | NaN | NaN | NaN | 0.7779 | 0.7190000000000001 | 0.6994 | 0.8171 |
62 | COMMONAREA_MODE | float64 | 3128 (1.02%) | 214865 (69.87%) | 0 (0.00%) | 9690 (3.15%) | 0.0072 | 0.0190 | 0.0490 | 92646.0 | 1.0000e+00 | 4.2553e-02 | 0.0000e+00 | 0.0744 | NaN | 0.0897 | NaN | NaN | NaN | NaN | NaN | 0.008 | NaN | 0.0382 |
63 | ELEVATORS_MODE | float64 | 26 (0.01%) | 163891 (53.30%) | 0 (0.00%) | 89498 (29.10%) | 0.0000 | 0.0000 | 0.1208 | 143620.0 | 1.0000e+00 | 7.4490e-02 | 0.0000e+00 | 0.1323 | 0.0806 | 0.1611 | NaN | NaN | NaN | 0.0806 | 0.0 | 0.0 | NaN | 0.2014 |
64 | ENTRANCES_MODE | float64 | 30 (0.01%) | 154828 (50.35%) | 0 (0.00%) | 387 (0.13%) | 0.0690 | 0.1379 | 0.2069 | 152683.0 | 1.0000e+00 | 1.4519e-01 | 0.0000e+00 | 0.1010 | 0.069 | 0.1379 | NaN | NaN | NaN | 0.0345 | 0.1379 | 0.1379 | 0.2069 | 0.1724 |
65 | FLOORSMAX_MODE | float64 | 25 (0.01%) | 153020 (49.76%) | 0 (0.00%) | 3415 (1.11%) | 0.1667 | 0.1667 | 0.3333 | 154491.0 | 1.0000e+00 | 2.2232e-01 | 0.0000e+00 | 0.1437 | 0.3333 | 0.3333 | NaN | NaN | NaN | 0.3333 | 0.1667 | 0.1667 | 0.1667 | 0.3333 |
66 | FLOORSMIN_MODE | float64 | 25 (0.01%) | 208642 (67.85%) | 0 (0.00%) | 2517 (0.82%) | 0.0833 | 0.2083 | 0.3750 | 98869.0 | 1.0000e+00 | 2.2806e-01 | 0.0000e+00 | 0.1612 | NaN | 0.375 | NaN | NaN | NaN | NaN | NaN | 0.2083 | NaN | 0.375 |
67 | LANDAREA_MODE | float64 | 3563 (1.16%) | 182590 (59.38%) | 0 (0.00%) | 17453 (5.68%) | 0.0166 | 0.0458 | 0.0841 | 124921.0 | 1.0000e+00 | 6.4958e-02 | 0.0000e+00 | 0.0818 | NaN | 0.1153 | NaN | NaN | NaN | 0.0932 | 0.0155 | 0.0207 | NaN | 0.1266 |
68 | LIVINGAPARTMENTS_MODE | float64 | 736 (0.24%) | 210199 (68.35%) | 0 (0.00%) | 519 (0.17%) | 0.0542 | 0.0771 | 0.1313 | 97312.0 | 1.0000e+00 | 1.0564e-01 | 0.0000e+00 | 0.0979 | NaN | 0.1322 | NaN | NaN | NaN | NaN | 0.0551 | 0.0735 | 0.0882 | 0.1625 |
69 | LIVINGAREA_MODE | float64 | 5301 (1.72%) | 154350 (50.19%) | 0 (0.00%) | 444 (0.14%) | 0.0427 | 0.0731 | 0.1252 | 153161.0 | 1.0000e+00 | 1.0598e-01 | 0.0000e+00 | 0.1118 | 0.077 | 0.0953 | NaN | NaN | NaN | 0.0606 | 0.0579 | 0.0733 | 0.0876 | 0.1901 |
70 | NONLIVINGAPARTMENTS_MODE | float64 | 167 (0.05%) | 213514 (69.43%) | 0 (0.00%) | 59255 (19.27%) | 0.0000 | 0.0000 | 0.0039 | 93997.0 | 1.0000e+00 | 8.0764e-03 | 0.0000e+00 | 0.0463 | NaN | 0.0 | NaN | NaN | NaN | NaN | 0.0039 | 0.0 | 0.0156 | 0.0 |
71 | NONLIVINGAREA_MODE | float64 | 3327 (1.08%) | 169682 (55.18%) | 0 (0.00%) | 67126 (21.83%) | 0.0000 | 0.0011 | 0.0231 | 137829.0 | 1.0000e+00 | 2.7022e-02 | 0.0000e+00 | 0.0703 | NaN | 0.0 | NaN | NaN | NaN | 0.2373 | 0.0199 | 0.0 | 0.0136 | 0.0023 |
72 | APARTMENTS_MEDI | float64 | 1148 (0.37%) | 156061 (50.75%) | 0 (0.00%) | 771 (0.25%) | 0.0583 | 0.0864 | 0.1489 | 151450.0 | 1.0000e+00 | 1.1785e-01 | 0.0000e+00 | 0.1091 | NaN | 0.1499 | NaN | NaN | NaN | 0.2238 | 0.0635 | 0.0833 | 0.1041 | 0.1842 |
73 | BASEMENTAREA_MEDI | float64 | 3772 (1.23%) | 179943 (58.52%) | 0 (0.00%) | 14991 (4.87%) | 0.0437 | 0.0758 | 0.1116 | 127568.0 | 1.0000e+00 | 8.7955e-02 | 0.0000e+00 | 0.0822 | NaN | 0.0991 | NaN | NaN | NaN | 0.0776 | 0.0756 | 0.0788 | NaN | 0.1322 |
74 | YEARS_BEGINEXPLUATATION_MEDI | float64 | 245 (0.08%) | 150007 (48.78%) | 0 (0.00%) | 548 (0.18%) | 0.9767 | 0.9816 | 0.9866 | 157504.0 | 1.0000e+00 | 9.7775e-01 | 0.0000e+00 | 0.0599 | 0.9856 | 0.9871 | NaN | NaN | NaN | 0.9826 | 0.9831 | 0.9786 | 0.9771 | 0.9861 |
75 | YEARS_BUILD_MEDI | float64 | 151 (0.05%) | 204488 (66.50%) | 0 (0.00%) | 101 (0.03%) | 0.6914 | 0.7585 | 0.8256 | 103023.0 | 1.0000e+00 | 7.5575e-01 | 0.0000e+00 | 0.1121 | NaN | 0.8256 | NaN | NaN | NaN | NaN | 0.7719 | 0.7115 | 0.6914 | 0.8121 |
76 | COMMONAREA_MEDI | float64 | 3202 (1.04%) | 214865 (69.87%) | 0 (0.00%) | 8691 (2.83%) | 0.0079 | 0.0208 | 0.0513 | 92646.0 | 1.0000e+00 | 4.4595e-02 | 0.0000e+00 | 0.0761 | NaN | 0.0895 | NaN | NaN | NaN | NaN | NaN | 0.008 | NaN | 0.0381 |
77 | ELEVATORS_MEDI | float64 | 46 (0.01%) | 163891 (53.30%) | 0 (0.00%) | 87026 (28.30%) | 0.0000 | 0.0000 | 0.1200 | 143620.0 | 1.0000e+00 | 7.8078e-02 | 0.0000e+00 | 0.1345 | 0.08 | 0.16 | NaN | NaN | NaN | 0.08 | 0.0 | 0.0 | NaN | 0.2 |
78 | ENTRANCES_MEDI | float64 | 46 (0.01%) | 154828 (50.35%) | 0 (0.00%) | 329 (0.11%) | 0.0690 | 0.1379 | 0.2069 | 152683.0 | 1.0000e+00 | 1.4921e-01 | 0.0000e+00 | 0.1004 | 0.069 | 0.1379 | NaN | NaN | NaN | 0.0345 | 0.1379 | 0.1379 | 0.2069 | 0.1724 |
79 | FLOORSMAX_MEDI | float64 | 49 (0.02%) | 153020 (49.76%) | 0 (0.00%) | 2995 (0.97%) | 0.1667 | 0.1667 | 0.3333 | 154491.0 | 1.0000e+00 | 2.2590e-01 | 0.0000e+00 | 0.1451 | 0.3333 | 0.3333 | NaN | NaN | NaN | 0.3333 | 0.1667 | 0.1667 | 0.1667 | 0.3333 |
80 | FLOORSMIN_MEDI | float64 | 47 (0.02%) | 208642 (67.85%) | 0 (0.00%) | 2351 (0.76%) | 0.0833 | 0.2083 | 0.3750 | 98869.0 | 1.0000e+00 | 2.3162e-01 | 0.0000e+00 | 0.1619 | NaN | 0.375 | NaN | NaN | NaN | NaN | NaN | 0.2083 | NaN | 0.375 |
81 | LANDAREA_MEDI | float64 | 3560 (1.16%) | 182590 (59.38%) | 0 (0.00%) | 15919 (5.18%) | 0.0187 | 0.0487 | 0.0868 | 124921.0 | 1.0000e+00 | 6.7169e-02 | 0.0000e+00 | 0.0822 | NaN | 0.1147 | NaN | NaN | NaN | 0.0927 | 0.0154 | 0.0206 | NaN | 0.126 |
82 | LIVINGAPARTMENTS_MEDI | float64 | 1097 (0.36%) | 210199 (68.35%) | 0 (0.00%) | 433 (0.14%) | 0.0513 | 0.0761 | 0.1231 | 97312.0 | 1.0000e+00 | 1.0195e-01 | 0.0000e+00 | 0.0936 | NaN | 0.1231 | NaN | NaN | NaN | NaN | 0.0513 | 0.0684 | 0.0821 | 0.1513 |
83 | LIVINGAREA_MEDI | float64 | 5281 (1.72%) | 154350 (50.19%) | 0 (0.00%) | 299 (0.10%) | 0.0457 | 0.0749 | 0.1303 | 153161.0 | 1.0000e+00 | 1.0861e-01 | 0.0000e+00 | 0.1123 | 0.0752 | 0.0931 | NaN | NaN | NaN | 0.0592 | 0.0566 | 0.0716 | 0.0856 | 0.1857 |
84 | NONLIVINGAPARTMENTS_MEDI | float64 | 214 (0.07%) | 213514 (69.43%) | 0 (0.00%) | 56097 (18.24%) | 0.0000 | 0.0000 | 0.0039 | 93997.0 | 1.0000e+00 | 8.6510e-03 | 0.0000e+00 | 0.0474 | NaN | 0.0 | NaN | NaN | NaN | NaN | 0.0039 | 0.0 | 0.0155 | 0.0 |
85 | NONLIVINGAREA_MEDI | float64 | 3323 (1.08%) | 169682 (55.18%) | 0 (0.00%) | 60954 (19.82%) | 0.0000 | 0.0031 | 0.0266 | 137829.0 | 1.0000e+00 | 2.8236e-02 | 0.0000e+00 | 0.0702 | NaN | 0.0 | NaN | NaN | NaN | 0.2289 | 0.0192 | 0.0 | 0.0131 | 0.0022 |
86 | FONDKAPREMONT_MODE | object | 4 (0.00%) | 210295 (68.39%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | NaN | reg oper account | NaN | NaN | NaN | NaN | reg oper spec account | reg oper account | reg oper account | reg oper account |
87 | HOUSETYPE_MODE | object | 3 (0.00%) | 154297 (50.18%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | NaN | block of flats | NaN | NaN | NaN | block of flats | block of flats | block of flats | block of flats | block of flats |
88 | TOTALAREA_MODE | float64 | 5116 (1.66%) | 148431 (48.27%) | 0 (0.00%) | 582 (0.19%) | 0.0412 | 0.0688 | 0.1276 | 159080.0 | 1.0000e+00 | 1.0255e-01 | 0.0000e+00 | 0.1075 | 0.0581 | 0.1206 | NaN | NaN | NaN | 0.0945 | 0.0475 | 0.0574 | 0.0689 | 0.1644 |
89 | WALLSMATERIAL_MODE | object | 7 (0.00%) | 156341 (50.84%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | NaN | Panel | NaN | NaN | NaN | Stone, brick | Stone, brick | Panel | Stone, brick | Panel |
90 | EMERGENCYSTATE_MODE | object | 2 (0.00%) | 145755 (47.40%) | 0 (0.00%) | 0 (0.00%) | 0.0000 | 0.0000 | 0.0000 | 0.0 | 0.0000e+00 | 0.0000e+00 | 0.0000e+00 | 0.0000 | No | No | NaN | NaN | NaN | No | No | No | No | No |
91 | OBS_30_CNT_SOCIAL_CIRCLE | float64 | 33 (0.01%) | 1021 (0.33%) | 0 (0.00%) | 163910 (53.30%) | 0.0000 | 0.0000 | 2.0000 | 306490.0 | 3.4800e+02 | 1.4222e+00 | 0.0000e+00 | 2.4010 | 0.0 | 2.0 | 1.0 | 0.0 | 0.0 | 1.0 | 2.0 | 3.0 | 9.0 | 0.0 |
92 | DEF_30_CNT_SOCIAL_CIRCLE | float64 | 10 (0.00%) | 1021 (0.33%) | 0 (0.00%) | 271324 (88.23%) | 0.0000 | 0.0000 | 0.0000 | 306490.0 | 3.4000e+01 | 1.4342e-01 | 0.0000e+00 | 0.4467 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
93 | OBS_60_CNT_SOCIAL_CIRCLE | float64 | 33 (0.01%) | 1021 (0.33%) | 0 (0.00%) | 164666 (53.55%) | 0.0000 | 0.0000 | 2.0000 | 306490.0 | 3.4400e+02 | 1.4053e+00 | 0.0000e+00 | 2.3798 | 0.0 | 2.0 | 1.0 | 0.0 | 0.0 | 1.0 | 2.0 | 3.0 | 9.0 | 0.0 |
94 | DEF_60_CNT_SOCIAL_CIRCLE | float64 | 9 (0.00%) | 1021 (0.33%) | 0 (0.00%) | 280721 (91.29%) | 0.0000 | 0.0000 | 0.0000 | 306490.0 | 2.4000e+01 | 1.0005e-01 | 0.0000e+00 | 0.3623 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
95 | DAYS_LAST_PHONE_CHANGE | float64 | 3773 (1.23%) | 1 (0.00%) | 269838 (87.75%) | 37672 (12.25%) | -1570.0000 | -757.0000 | -274.0000 | 307510.0 | 0.0000e+00 | -9.6286e+02 | -4.2920e+03 | 826.8085 | -3143.0 | -2.0 | -1523.0 | -1224.0 | 0.0 | -201.0 | -1128.0 | -2959.0 | -1634.0 | -1258.0 |
96 | FLAG_DOCUMENT_2 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 307498 (100.00%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 4.2275e-05 | 0.0000e+00 | 0.0065 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
97 | FLAG_DOCUMENT_3 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 89171 (29.00%) | 0.0000 | 1.0000 | 1.0000 | 307511.0 | 1.0000e+00 | 7.1002e-01 | 0.0000e+00 | 0.4538 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
98 | FLAG_DOCUMENT_4 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 307486 (99.99%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 8.1298e-05 | 0.0000e+00 | 0.0090 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
99 | FLAG_DOCUMENT_5 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 302863 (98.49%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 1.5115e-02 | 0.0000e+00 | 0.1220 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
100 | FLAG_DOCUMENT_6 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 280433 (91.19%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 8.8055e-02 | 0.0000e+00 | 0.2834 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
101 | FLAG_DOCUMENT_7 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 307452 (99.98%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 1.9186e-04 | 0.0000e+00 | 0.0139 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
102 | FLAG_DOCUMENT_8 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 282487 (91.86%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 8.1376e-02 | 0.0000e+00 | 0.2734 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
103 | FLAG_DOCUMENT_9 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 306313 (99.61%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 3.8958e-03 | 0.0000e+00 | 0.0623 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
104 | FLAG_DOCUMENT_10 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 307504 (100.00%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 2.2763e-05 | 0.0000e+00 | 0.0048 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
105 | FLAG_DOCUMENT_11 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 306308 (99.61%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 3.9121e-03 | 0.0000e+00 | 0.0624 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
106 | FLAG_DOCUMENT_12 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 307509 (100.00%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 6.5038e-06 | 0.0000e+00 | 0.0026 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
107 | FLAG_DOCUMENT_13 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 306427 (99.65%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 3.5251e-03 | 0.0000e+00 | 0.0593 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
108 | FLAG_DOCUMENT_14 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 306608 (99.71%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 2.9365e-03 | 0.0000e+00 | 0.0541 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
109 | FLAG_DOCUMENT_15 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 307139 (99.88%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 1.2097e-03 | 0.0000e+00 | 0.0348 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
110 | FLAG_DOCUMENT_16 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 304458 (99.01%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 9.9281e-03 | 0.0000e+00 | 0.0991 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
111 | FLAG_DOCUMENT_17 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 307429 (99.97%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 2.6666e-04 | 0.0000e+00 | 0.0163 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
112 | FLAG_DOCUMENT_18 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 305011 (99.19%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 8.1298e-03 | 0.0000e+00 | 0.0898 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
113 | FLAG_DOCUMENT_19 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 307328 (99.94%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 5.9510e-04 | 0.0000e+00 | 0.0244 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
114 | FLAG_DOCUMENT_20 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 307355 (99.95%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 5.0730e-04 | 0.0000e+00 | 0.0225 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
115 | FLAG_DOCUMENT_21 | int64 | 2 (0.00%) | 0 (0.00%) | 0 (0.00%) | 307408 (99.97%) | 0.0000 | 0.0000 | 0.0000 | 307511.0 | 1.0000e+00 | 3.3495e-04 | 0.0000e+00 | 0.0183 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
116 | AMT_REQ_CREDIT_BUREAU_HOUR | float64 | 5 (0.00%) | 41519 (13.50%) | 0 (0.00%) | 264366 (85.97%) | 0.0000 | 0.0000 | 0.0000 | 265992.0 | 4.0000e+00 | 6.4024e-03 | 0.0000e+00 | 0.0838 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN | 0.0 | 0.0 | 0.0 |
117 | AMT_REQ_CREDIT_BUREAU_DAY | float64 | 9 (0.00%) | 41519 (13.50%) | 0 (0.00%) | 264503 (86.01%) | 0.0000 | 0.0000 | 0.0000 | 265992.0 | 9.0000e+00 | 7.0002e-03 | 0.0000e+00 | 0.1108 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN | 0.0 | 0.0 | 0.0 |
118 | AMT_REQ_CREDIT_BUREAU_WEEK | float64 | 9 (0.00%) | 41519 (13.50%) | 0 (0.00%) | 257456 (83.72%) | 0.0000 | 0.0000 | 0.0000 | 265992.0 | 8.0000e+00 | 3.4362e-02 | 0.0000e+00 | 0.2047 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN | 0.0 | 0.0 | 0.0 |
119 | AMT_REQ_CREDIT_BUREAU_MON | float64 | 24 (0.01%) | 41519 (13.50%) | 0 (0.00%) | 222233 (72.27%) | 0.0000 | 0.0000 | 0.0000 | 265992.0 | 2.7000e+01 | 2.6740e-01 | 0.0000e+00 | 0.9160 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN | 0.0 | 0.0 | 0.0 |
120 | AMT_REQ_CREDIT_BUREAU_QRT | float64 | 11 (0.00%) | 41519 (13.50%) | 0 (0.00%) | 215417 (70.05%) | 0.0000 | 0.0000 | 0.0000 | 265992.0 | 2.6100e+02 | 2.6547e-01 | 0.0000e+00 | 0.7941 | 0.0 | 0.0 | 0.0 | NaN | 1.0 | 0.0 | NaN | 0.0 | 1.0 | 0.0 |
121 | AMT_REQ_CREDIT_BUREAU_YEAR | float64 | 25 (0.01%) | 41519 (13.50%) | 0 (0.00%) | 71801 (23.35%) | 0.0000 | 1.0000 | 3.0000 | 265992.0 | 2.5000e+01 | 1.9000e+00 | 0.0000e+00 | 1.8693 | 1.0 | 0.0 | 5.0 | NaN | 1.0 | 3.0 | NaN | 1.0 | 1.0 | 1.0 |
#
def grid_bar_charts(pdf, ls_cname, ncols=3):
    """
    Draw one bar chart per column, arranged on a grid.

    The grid has ``ncols`` columns and as many rows as needed to hold every
    name in ``ls_cname``; cells are filled row by row. Each chart shows the
    number of rows per distinct value of the column, sorted in descending
    order. Missing values are not counted.

    Parameters
    ----------
    pdf : DataFrame containing the columns to plot.
    ls_cname : list of column names to plot, one chart per name.
    ncols : number of grid columns (default 3).
    """
    n_cat = len(ls_cname)
    if n_cat == 0:
        # nothing to plot; plt.subplots would reject a grid with zero rows
        return
    # number of grid rows needed to hold every chart
    nrows = int(math.ceil(n_cat * 1.0 / ncols))
    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column, so the grid can always be flattened and walked uniformly
    fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=(15, 3 * nrows), squeeze=False
    )
    fig.set_tight_layout(False)
    flat_axes = axes.ravel()
    # fill the grid row by row
    for ax, cname in zip(flat_axes, ls_cname):
        # groupby leaves out missing keys, so no explicit NaN filter is needed
        counts = pdf.groupby(cname).size()
        # sort counts in descending order before plotting
        counts.sort_values(ascending=False).plot.bar(ax=ax, rot=45)
    # hide the unused cells at the end of the last row
    for ax in flat_axes[n_cat:]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()
#
def plot_wordcloud(pdf, ls_cname):
    """
    Draw a word cloud for each categorical column with many distinct values.

    One figure is produced per name in ``ls_cname``. Each distinct category
    value is treated as a single term whose size reflects how many rows carry
    that value. Missing values are ignored, and a column without any
    non-missing value is skipped.

    Parameters
    ----------
    pdf : DataFrame containing the columns to plot.
    ls_cname : list of column names to plot, one word cloud per name.
    """
    for cname in ls_cname:
        # count occurrences of each category value (as strings)
        freqs = pdf[cname].dropna().astype(str).value_counts().to_dict()
        if not freqs:
            # WordCloud cannot be generated from an empty set of terms
            continue
        # Feed the counts directly instead of a space-joined text: generating
        # from text tokenizes multi-word categories into separate words, so
        # the cloud would no longer represent the categories themselves.
        wordcloud = WordCloud(
            background_color="white", width=1600, height=800
        ).generate_from_frequencies(freqs)
        fig, ax = plt.subplots(figsize=(15, 15))
        fig.set_tight_layout(False)
        # render the word cloud image without axis ticks
        ax.imshow(wordcloud, interpolation="bilinear")
        ax.axis("off")
        ax.set_title("Word cloud of {}".format(cname), fontsize=20)
        plt.tight_layout()
        plt.show()
#
def grid_histogram(pdf, ls_cname, ncols=3):
    """
    Draw one histogram per column, arranged on a grid.

    The grid has ``ncols`` columns and as many rows as needed to hold every
    name in ``ls_cname``; cells are filled row by row. Missing values are
    dropped before plotting, and each histogram is titled with its column
    name.

    Parameters
    ----------
    pdf : DataFrame containing the columns to plot.
    ls_cname : list of column names to plot, one histogram per name.
    ncols : number of grid columns (default 3).
    """
    n_cat = len(ls_cname)
    if n_cat == 0:
        # nothing to plot; plt.subplots would reject a grid with zero rows
        return
    # number of grid rows needed to hold every chart
    nrows = int(math.ceil(n_cat * 1.0 / ncols))
    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column, so the grid can always be flattened and walked uniformly
    fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=(15, 4 * nrows), squeeze=False
    )
    fig.set_tight_layout(False)
    flat_axes = axes.ravel()
    # fill the grid row by row
    for ax, cname in zip(flat_axes, ls_cname):
        pdf[cname].dropna().plot(kind="hist", ax=ax, rot=45, title=cname)
    # hide the unused cells at the end of the last row
    for ax in flat_axes[n_cat:]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()
#
def plot_continuous_data(s00, title):
    """
    Inspect continuous data with a histogram and a box plot side by side.

    Parameters
    ----------
    s00 : the values to plot; it must provide ``.hist`` and ``.plot``
        (presumably a pandas Series -- confirm against callers).
    title : text shown as the title of the whole figure.
    """
    # one row, two panels: histogram on the left, box plot on the right
    fig, panels = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
    hist_ax, box_ax = panels
    s00.hist(bins=30, ax=hist_ax)
    s00.plot(kind="box", ax=box_ax)
    # title for the figure as a whole
    fig.suptitle(title)
    plt.show()
# collect the names of the categorical attributes (those whose sub_type is "object")
ls_cat_name = pdf_meta.loc[pdf_meta["sub_type"] == "object", "name"].tolist()
ls_cat_name
['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
# If a column has few categories a bar chart works well; otherwise use a word cloud.
# Take an explicit copy so that adding the "chart" column below does not write
# into a slice of pdf_meta (which triggers pandas' SettingWithCopyWarning).
pdf_meta00 = pdf_meta.loc[pdf_meta["name"].isin(ls_cat_name), ["name", "n_distinct"]].copy()
# n_distinct is a string such as "18 (0.01%)"; its leading integer is the distinct count
pdf_meta00["chart"] = pdf_meta00["n_distinct"].apply(lambda x: "wordcloud" if int(x.split()[0]) > 10 else "bar")
pdf_meta00
name | n_distinct | chart | |
---|---|---|---|
2 | NAME_CONTRACT_TYPE | 2 (0.00%) | bar |
3 | CODE_GENDER | 3 (0.00%) | bar |
4 | FLAG_OWN_CAR | 2 (0.00%) | bar |
5 | FLAG_OWN_REALTY | 2 (0.00%) | bar |
11 | NAME_TYPE_SUITE | 7 (0.00%) | bar |
12 | NAME_INCOME_TYPE | 8 (0.00%) | bar |
13 | NAME_EDUCATION_TYPE | 5 (0.00%) | bar |
14 | NAME_FAMILY_STATUS | 6 (0.00%) | bar |
15 | NAME_HOUSING_TYPE | 6 (0.00%) | bar |
28 | OCCUPATION_TYPE | 18 (0.01%) | wordcloud |
32 | WEEKDAY_APPR_PROCESS_START | 7 (0.00%) | bar |
40 | ORGANIZATION_TYPE | 58 (0.02%) | wordcloud |
86 | FONDKAPREMONT_MODE | 4 (0.00%) | bar |
87 | HOUSETYPE_MODE | 3 (0.00%) | bar |
89 | WALLSMATERIAL_MODE | 7 (0.00%) | bar |
90 | EMERGENCYSTATE_MODE | 2 (0.00%) | bar |
# columns flagged "bar" go onto a grid of bar charts
ls_cat_bar = pdf_meta00.loc[pdf_meta00["chart"] == "bar", "name"].tolist()
grid_bar_charts(pdf_data, ls_cat_bar)
# columns flagged "wordcloud" each get their own word cloud
ls_cat_wordcloud = pdf_meta00.loc[pdf_meta00["chart"] == "wordcloud", "name"].tolist()
plot_wordcloud(pdf_data, ls_cat_wordcloud)