This is an app that predicts whether someone makes more or less than $50k/year from a set of demographic and employment features. It can be used when that information is unavailable or confidential, for example during a loan or car-financing application at a financial institution, to give a better financial picture of the applicant. In this notebook, the goal is a classification model with precision above 80% (i.e., of the people predicted to earn more than $50k, more than 80% actually do).
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
from IPython.core.display import HTML, display
from pandas_profiling import ProfileReport
from pathlib import Path
from scipy.stats import probplot, chi2_contingency, chi2
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_val_predict
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.inspection import permutation_importance
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, roc_curve, roc_auc_score
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from imblearn.over_sampling import SMOTE
import joblib
import os
import pickle
import re
import streamlit as st
#from scipy.stats import norm
%matplotlib inline
train_original = pd.read_csv('datasets/train.csv')
test_original = pd.read_csv('datasets/test.csv')
full_data = pd.concat([train_original, test_original], axis=0)
# shuffle the data
full_data = full_data.sample(frac=1).reset_index(drop=True)
full_data.shape
(44856, 15)
# split the data into train and test
def data_split(df, test_size):
train_df, test_df = train_test_split(df, test_size=test_size, random_state=42)
return train_df.reset_index(drop=True), test_df.reset_index(drop=True)
train_original, test_original = data_split(full_data, 0.2)
train_original.shape
(35884, 15)
test_original.shape
(8972, 15)
train_copy = train_original.copy()
test_copy = test_original.copy()
train_copy.head()
age | workclass | fnlwgt | education | educational-num | marital-status | occupation | relationship | race | gender | capital-gain | capital-loss | hours-per-week | native-country | income_>50K | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 38 | Private | 110402 | Assoc-voc | 11 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 40 | United-States | 0.0 |
1 | 26 | Private | 58371 | HS-grad | 9 | Never-married | Adm-clerical | Own-child | White | Male | 0 | 0 | 40 | United-States | 0.0 |
2 | 43 | Private | 173728 | Bachelors | 13 | Separated | Prof-specialty | Unmarried | White | Female | 0 | 0 | 40 | United-States | 0.0 |
3 | 36 | Private | 207789 | HS-grad | 9 | Divorced | Exec-managerial | Not-in-family | White | Male | 0 | 0 | 52 | United-States | 0.0 |
4 | 31 | Private | 234537 | HS-grad | 9 | Never-married | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 60 | United-States | 0.0 |
train_copy.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35884 entries, 0 to 35883
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   age              35884 non-null  int64
 1   workclass        33866 non-null  object
 2   fnlwgt           35884 non-null  int64
 3   education        35884 non-null  object
 4   educational-num  35884 non-null  int64
 5   marital-status   35884 non-null  object
 6   occupation       33861 non-null  object
 7   relationship     35884 non-null  object
 8   race             35884 non-null  object
 9   gender           35884 non-null  object
 10  capital-gain     35884 non-null  int64
 11  capital-loss     35884 non-null  int64
 12  hours-per-week   35884 non-null  int64
 13  native-country   35300 non-null  object
 14  income_>50K      35167 non-null  float64
dtypes: float64(1), int64(6), object(8)
memory usage: 4.1+ MB
train_copy.describe()
age | fnlwgt | educational-num | capital-gain | capital-loss | hours-per-week | income_>50K | |
---|---|---|---|---|---|---|---|
count | 35884.000000 | 3.588400e+04 | 35884.000000 | 35884.000000 | 35884.000000 | 35884.000000 | 35167.000000 |
mean | 38.577249 | 1.899824e+05 | 10.081150 | 1129.073459 | 88.232499 | 40.401098 | 0.240083 |
std | 13.721873 | 1.057441e+05 | 2.569686 | 7806.554154 | 404.264410 | 12.427052 | 0.427140 |
min | 17.000000 | 1.349200e+04 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
25% | 28.000000 | 1.176060e+05 | 9.000000 | 0.000000 | 0.000000 | 40.000000 | 0.000000 |
50% | 37.000000 | 1.782820e+05 | 10.000000 | 0.000000 | 0.000000 | 40.000000 | 0.000000 |
75% | 48.000000 | 2.379462e+05 | 12.000000 | 0.000000 | 0.000000 | 45.000000 | 0.000000 |
max | 90.000000 | 1.490400e+06 | 16.000000 | 99999.000000 | 4356.000000 | 99.000000 | 1.000000 |
msno.matrix(train_copy)
plt.show()
msno.bar(train_copy)
plt.show()
profile_report = ProfileReport(train_copy, explorative=True, dark_mode=True)
profile_report_file_path = Path('pandas_profile_file/income_class_profile.html')
try:
profile_report_file_path.resolve(strict=True)
except FileNotFoundError:
profile_report.to_file("pandas_profile_file/income_class_profile.html")
else:
pass
#Function that will return the value count and frequency of each observation within a column
def value_cnt_norm_cal(df,feature):
ftr_value_cnt = df[feature].value_counts()
ftr_value_cnt_norm = df[feature].value_counts(normalize=True) * 100
ftr_value_cnt_concat = pd.concat([ftr_value_cnt, ftr_value_cnt_norm], axis=1)
ftr_value_cnt_concat.columns = ['Count', 'Frequency (%)']
return ftr_value_cnt_concat
train_copy['age'].head()
0 38 1 26 2 43 3 36 4 31 Name: age, dtype: int64
train_copy['age'].describe()
count 35884.000000 mean 38.577249 std 13.721873 min 17.000000 25% 28.000000 50% 37.000000 75% 48.000000 max 90.000000 Name: age, dtype: float64
train_copy['age'].dtype
dtype('int64')
train_copy['age'].isnull().sum()
0
age_value_cnt_norm = train_copy['age'].value_counts(normalize=True) * 100
age_value_cnt = train_copy['age'].value_counts()
value_cnt_norm_cal(train_copy, 'age')
Count | Frequency (%) | |
---|---|---|
36 | 1006 | 2.803478 |
33 | 1005 | 2.800691 |
31 | 994 | 2.770037 |
34 | 990 | 2.758890 |
23 | 982 | 2.736596 |
... | ... | ... |
88 | 5 | 0.013934 |
85 | 4 | 0.011147 |
87 | 2 | 0.005574 |
89 | 1 | 0.002787 |
86 | 1 | 0.002787 |
74 rows × 2 columns
fig, ax = plt.subplots(figsize=(18,10))
sns.histplot(train_copy['age'])
plt.title('Age Distribution (histogram)')
plt.show()
fig, ax = plt.subplots(figsize=(6,10))
sns.boxplot(y=train_copy['age'])
plt.title('Age Distribution (boxplot)')
plt.show()
train_copy['workclass'].dtype
dtype('O')
train_copy['workclass'].head()
0 Private 1 Private 2 Private 3 Private 4 Private Name: workclass, dtype: object
value_cnt_norm_cal(train_copy,'workclass')
Count | Frequency (%) | |
---|---|---|
Private | 25014 | 73.861690 |
Self-emp-not-inc | 2806 | 8.285596 |
Local-gov | 2335 | 6.894821 |
State-gov | 1407 | 4.154609 |
Self-emp-inc | 1237 | 3.652631 |
Federal-gov | 1044 | 3.082738 |
Without-pay | 18 | 0.053151 |
Never-worked | 5 | 0.014764 |
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'workclass').index, y=value_cnt_norm_cal(train_copy,'workclass')['Frequency (%)'])
plt.title('Workclass Distribution (Frequency)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'workclass').index, y=value_cnt_norm_cal(train_copy,'workclass')['Count'])
plt.title('Workclass Distribution (Count)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
train_copy['workclass'].isnull().sum()
2018
fig, ax = plt.subplots(figsize=(18,10))
plt.pie(train_copy['workclass'].value_counts(), labels=train_copy['workclass'].value_counts().index, autopct='%1.2f%%')
plt.title('Workclass Distribution (Pie Chart)')
plt.legend(loc='upper right')
plt.show()
fnlwgt ("final weight") is a weight assigned by the Census Bureau. The idea is that two samples with the same (or similar) fnlwgt have similar demographic characteristics: if samples 7, 12 and 33 have similar fnlwgt values, they are more likely to be of the same race and have a similar educational and social background. If you plan to use it, however, be aware that the weights are not standardized across states/countries, so someone in one country might share a similar fnlwgt with somebody in another country and yet have an entirely different socio-economic background.
train_copy['fnlwgt'].dtype
dtype('int64')
train_copy['fnlwgt'].head()
0 110402 1 58371 2 173728 3 207789 4 234537 Name: fnlwgt, dtype: int64
pd.set_option('display.float_format', lambda x: '%.3f' % x)
train_copy['fnlwgt'].describe()
count 35884.000 mean 189982.359 std 105744.110 min 13492.000 25% 117606.000 50% 178282.000 75% 237946.250 max 1490400.000 Name: fnlwgt, dtype: float64
fig, ax = plt.subplots(figsize=(18,10))
ax.ticklabel_format(style='plain')
sns.histplot(train_copy['fnlwgt'],bins=100)
plt.title('Final weight Distribution (histogram)')
plt.xlabel('Final weight')
plt.show()
train_copy['education'].dtype
dtype('O')
train_copy['education'].head()
0 Assoc-voc 1 HS-grad 2 Bachelors 3 HS-grad 4 HS-grad Name: education, dtype: object
train_copy['education'].isnull().sum()
0
value_cnt_norm_cal(train_copy,'education')
Count | Frequency (%) | |
---|---|---|
HS-grad | 11592 | 32.304 |
Some-college | 7985 | 22.252 |
Bachelors | 5933 | 16.534 |
Masters | 1947 | 5.426 |
Assoc-voc | 1476 | 4.113 |
11th | 1336 | 3.723 |
Assoc-acdm | 1200 | 3.344 |
10th | 1042 | 2.904 |
7th-8th | 702 | 1.956 |
Prof-school | 606 | 1.689 |
9th | 556 | 1.549 |
12th | 470 | 1.310 |
Doctorate | 438 | 1.221 |
5th-6th | 359 | 1.000 |
1st-4th | 182 | 0.507 |
Preschool | 60 | 0.167 |
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'education').index, y=value_cnt_norm_cal(train_copy,'education')['Frequency (%)'])
plt.title('Education level (Frequency)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'education').index, y=value_cnt_norm_cal(train_copy,'education')['Count'])
plt.title('Education level (Count)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
train_copy['educational-num'].dtype
dtype('int64')
train_copy['educational-num'].head()
0 11 1 9 2 13 3 9 4 9 Name: educational-num, dtype: int64
train_copy['educational-num'].value_counts()
9 11592 10 7985 13 5933 14 1947 11 1476 7 1336 12 1200 6 1042 4 702 15 606 5 556 8 470 16 438 3 359 2 182 1 60 Name: educational-num, dtype: int64
train_copy[['education','educational-num']].nunique()
education 16 educational-num 16 dtype: int64
train_copy[['education','educational-num']].value_counts()
education educational-num HS-grad 9 11592 Some-college 10 7985 Bachelors 13 5933 Masters 14 1947 Assoc-voc 11 1476 11th 7 1336 Assoc-acdm 12 1200 10th 6 1042 7th-8th 4 702 Prof-school 15 606 9th 5 556 12th 8 470 Doctorate 16 438 5th-6th 3 359 1st-4th 2 182 Preschool 1 60 dtype: int64
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'educational-num').index, y=value_cnt_norm_cal(train_copy,'educational-num')['Count'])
plt.title('Education level (Count)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
train_copy['marital-status'].dtype
dtype('O')
train_copy['marital-status'].head()
0 Married-civ-spouse 1 Never-married 2 Separated 3 Divorced 4 Never-married Name: marital-status, dtype: object
train_copy['marital-status'].isnull().sum()
0
value_cnt_norm_cal(train_copy,'marital-status')
Count | Frequency (%) | |
---|---|---|
Married-civ-spouse | 16405 | 45.717 |
Never-married | 11846 | 33.012 |
Divorced | 4946 | 13.783 |
Separated | 1110 | 3.093 |
Widowed | 1086 | 3.026 |
Married-spouse-absent | 467 | 1.301 |
Married-AF-spouse | 24 | 0.067 |
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'marital-status').index, y=value_cnt_norm_cal(train_copy,'marital-status')['Frequency (%)'])
plt.title('Marital status (Frequency)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'marital-status').index, y=value_cnt_norm_cal(train_copy,'marital-status')['Count'])
plt.title('Marital status (Count)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
fig, ax = plt.subplots(figsize=(18,10))
plt.pie(train_copy['marital-status'].value_counts(), labels=train_copy['marital-status'].value_counts().index, autopct='%1.2f%%')
plt.title('Marital status (Pie Chart)')
plt.legend(loc='upper right')
plt.show()
train_copy['occupation'].dtype
dtype('O')
train_copy['occupation'].head()
0 Machine-op-inspct 1 Adm-clerical 2 Prof-specialty 3 Exec-managerial 4 Handlers-cleaners Name: occupation, dtype: object
train_copy['occupation'].isnull().sum()
2023
train_copy['occupation'].value_counts()
Craft-repair 4550 Prof-specialty 4517 Exec-managerial 4425 Sales 4090 Adm-clerical 4082 Other-service 3666 Machine-op-inspct 2198 Transport-moving 1714 Handlers-cleaners 1537 Tech-support 1098 Farming-fishing 1058 Protective-serv 734 Priv-house-serv 180 Armed-Forces 12 Name: occupation, dtype: int64
train_copy['occupation'].nunique()
14
value_cnt_norm_cal(train_copy,'occupation')
Count | Frequency (%) | |
---|---|---|
Craft-repair | 4550 | 13.437 |
Prof-specialty | 4517 | 13.340 |
Exec-managerial | 4425 | 13.068 |
Sales | 4090 | 12.079 |
Adm-clerical | 4082 | 12.055 |
Other-service | 3666 | 10.827 |
Machine-op-inspct | 2198 | 6.491 |
Transport-moving | 1714 | 5.062 |
Handlers-cleaners | 1537 | 4.539 |
Tech-support | 1098 | 3.243 |
Farming-fishing | 1058 | 3.125 |
Protective-serv | 734 | 2.168 |
Priv-house-serv | 180 | 0.532 |
Armed-Forces | 12 | 0.035 |
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'occupation').index, y=value_cnt_norm_cal(train_copy,'occupation')['Frequency (%)'])
plt.title('Occupation (Frequency)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'occupation').index, y=value_cnt_norm_cal(train_copy,'occupation')['Count'])
plt.title('Occupation (Count)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
train_copy['relationship'].dtype
dtype('O')
train_copy['relationship'].head()
0 Husband 1 Own-child 2 Unmarried 3 Not-in-family 4 Not-in-family Name: relationship, dtype: object
train_copy['relationship'].value_counts()
Husband 14459 Not-in-family 9260 Own-child 5572 Unmarried 3794 Wife 1700 Other-relative 1099 Name: relationship, dtype: int64
train_copy['relationship'].isnull().sum()
0
value_cnt_norm_cal(train_copy,'relationship')
Count | Frequency (%) | |
---|---|---|
Husband | 14459 | 40.294 |
Not-in-family | 9260 | 25.805 |
Own-child | 5572 | 15.528 |
Unmarried | 3794 | 10.573 |
Wife | 1700 | 4.737 |
Other-relative | 1099 | 3.063 |
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'relationship').index, y=value_cnt_norm_cal(train_copy,'relationship')['Frequency (%)'])
plt.title('relationship (Frequency)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'relationship').index, y=value_cnt_norm_cal(train_copy,'relationship')['Count'])
plt.title('relationship (Count)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
fig, ax = plt.subplots(figsize=(18,10))
plt.pie(train_copy['relationship'].value_counts(), labels=train_copy['relationship'].value_counts().index, autopct='%1.2f%%')
plt.title('relationship (Pie Chart)')
plt.legend(loc='upper right')
plt.show()
train_copy['race'].dtype
dtype('O')
train_copy['race'].isnull().sum()
0
train_copy['race'].head()
0 White 1 White 2 White 3 White 4 White Name: race, dtype: object
value_cnt_norm_cal(train_copy,'race')
Count | Frequency (%) | |
---|---|---|
White | 30652 | 85.420 |
Black | 3482 | 9.703 |
Asian-Pac-Islander | 1104 | 3.077 |
Amer-Indian-Eskimo | 353 | 0.984 |
Other | 293 | 0.817 |
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'race').index, y=value_cnt_norm_cal(train_copy,'race')['Count'])
plt.title('race (Count)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'race').index, y=value_cnt_norm_cal(train_copy,'race')['Frequency (%)'])
plt.title('race (Frequency)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
fig, ax = plt.subplots(figsize=(18,10))
plt.pie(train_copy['race'].value_counts(), labels=train_copy['race'].value_counts().index, autopct='%1.2f%%')
plt.title('race (Pie Chart)')
plt.legend(loc='upper right')
plt.show()
train_copy['gender'].dtype
dtype('O')
train_copy['race'].isnull().sum()
0
value_cnt_norm_cal(train_copy,'gender')
Count | Frequency (%) | |
---|---|---|
Male | 23937 | 66.707 |
Female | 11947 | 33.293 |
fig, ax = plt.subplots(figsize=(18,10))
plt.pie(train_copy['gender'].value_counts(), labels=train_copy['gender'].value_counts().index, autopct='%1.2f%%')
plt.title('gender (Pie Chart)')
plt.legend(loc='upper right')
plt.show()
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'gender').index, y=value_cnt_norm_cal(train_copy,'gender')['Frequency (%)'])
plt.title('gender (Frequency)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'gender').index, y=value_cnt_norm_cal(train_copy,'gender')['Count'])
plt.title('gender (Count)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
train_copy['capital-gain'].dtype
dtype('int64')
train_copy['capital-gain'].describe()
count 35884.000 mean 1129.073 std 7806.554 min 0.000 25% 0.000 50% 0.000 75% 0.000 max 99999.000 Name: capital-gain, dtype: float64
train_copy['capital-gain'].isnull().sum()
0
train_copy['capital-gain'].value_counts()
0 32968 15024 378 7688 301 7298 265 99999 199 ... 1111 1 6612 1 1731 1 2993 1 22040 1 Name: capital-gain, Length: 121, dtype: int64
train_copy['capital-gain'].shape
(35884,)
fig, ax = plt.subplots(figsize=(18,10))
sns.histplot(train_copy['capital-gain'],bins=100)
plt.title('Capital gain (histogram)')
plt.show()
train_copy['capital-loss'].dtype
dtype('int64')
train_copy['capital-loss'].describe()
count 35884.000 mean 88.232 std 404.264 min 0.000 25% 0.000 50% 0.000 75% 0.000 max 4356.000 Name: capital-loss, dtype: float64
train_copy['capital-loss'].isnull().sum()
0
train_copy['capital-loss'].value_counts()
0 34193 1902 231 1977 191 1887 159 2415 50 ... 1411 1 323 1 2201 1 2163 1 3683 1 Name: capital-loss, Length: 95, dtype: int64
train_copy['capital-loss'].shape
(35884,)
value_cnt_norm_cal(train_copy,'capital-loss')
Count | Frequency (%) | |
---|---|---|
0 | 34193 | 95.288 |
1902 | 231 | 0.644 |
1977 | 191 | 0.532 |
1887 | 159 | 0.443 |
2415 | 50 | 0.139 |
... | ... | ... |
1411 | 1 | 0.003 |
323 | 1 | 0.003 |
2201 | 1 | 0.003 |
2163 | 1 | 0.003 |
3683 | 1 | 0.003 |
95 rows × 2 columns
fig, ax = plt.subplots(figsize=(18,10))
sns.histplot(train_copy['capital-loss'],bins=100)
plt.title('Capital loss (histogram)')
plt.show()
train_copy['hours-per-week'].dtype
dtype('int64')
train_copy['hours-per-week'].describe()
count 35884.000 mean 40.401 std 12.427 min 1.000 25% 40.000 50% 40.000 75% 45.000 max 99.000 Name: hours-per-week, dtype: float64
value_cnt_norm_cal(train_copy,'hours-per-week')
Count | Frequency (%) | |
---|---|---|
40 | 16723 | 46.603 |
50 | 3090 | 8.611 |
45 | 1995 | 5.560 |
60 | 1590 | 4.431 |
35 | 1489 | 4.149 |
... | ... | ... |
88 | 1 | 0.003 |
94 | 1 | 0.003 |
87 | 1 | 0.003 |
79 | 1 | 0.003 |
86 | 1 | 0.003 |
95 rows × 2 columns
train_copy['hours-per-week'].isnull().sum()
0
#checking if hours-per-week is normally distributed
fig, ax = plt.subplots(figsize=(18,10))
probplot(train_copy['hours-per-week'], dist="norm", plot=ax)
plt.show()
fig, ax = plt.subplots(figsize=(18,10))
sns.histplot(train_copy['hours-per-week'],bins=100,kde=True)
plt.title('Hours per week Distribution (histogram)')
plt.show()
train_copy['native-country'].dtype
dtype('O')
train_copy['native-country'].isnull().sum()
584
value_cnt_norm_cal(train_copy,'native-country')
Count | Frequency (%) | |
---|---|---|
United-States | 32250 | 91.360 |
Mexico | 711 | 2.014 |
Philippines | 218 | 0.618 |
Puerto-Rico | 149 | 0.422 |
Germany | 136 | 0.385 |
Canada | 132 | 0.374 |
El-Salvador | 117 | 0.331 |
India | 110 | 0.312 |
England | 108 | 0.306 |
Cuba | 95 | 0.269 |
South korea | 90 | 0.255 |
China | 87 | 0.246 |
Jamaica | 83 | 0.235 |
Dominican-Republic | 80 | 0.227 |
Italy | 75 | 0.212 |
Japan | 70 | 0.198 |
Guatemala | 65 | 0.184 |
Vietnam | 61 | 0.173 |
Poland | 58 | 0.164 |
Haiti | 57 | 0.161 |
Columbia | 53 | 0.150 |
Portugal | 52 | 0.147 |
Taiwan | 51 | 0.144 |
Iran | 43 | 0.122 |
Nicaragua | 40 | 0.113 |
Ecuador | 36 | 0.102 |
Peru | 35 | 0.099 |
Greece | 33 | 0.093 |
Ireland | 25 | 0.071 |
France | 24 | 0.068 |
Thailand | 21 | 0.059 |
Hong-Kong | 21 | 0.059 |
Cambodia | 21 | 0.059 |
Trinadad&Tobago | 19 | 0.054 |
Outlying-US(Guam-USVI-etc) | 16 | 0.045 |
Yugoslavia | 16 | 0.045 |
Honduras | 15 | 0.042 |
Hungary | 14 | 0.040 |
Laos | 12 | 0.034 |
Netherlands | 1 | 0.003 |
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'native-country').index, y=value_cnt_norm_cal(train_copy,'native-country')['Count'])
plt.title('Native country (Count)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'native-country').index, y=value_cnt_norm_cal(train_copy,'native-country')['Frequency (%)'])
plt.title('Native country (Frequency)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
train_copy['income_>50K'].dtype
dtype('float64')
train_copy['income_>50K'].isnull().sum()
717
value_cnt_norm_cal(train_copy,'income_>50K')
Count | Frequency (%) | |
---|---|---|
0.000 | 26724 | 75.992 |
1.000 | 8443 | 24.008 |
fig, ax = plt.subplots(figsize=(18,10))
sns.barplot(x=value_cnt_norm_cal(train_copy,'income_>50K').index, y=value_cnt_norm_cal(train_copy,'income_>50K')['Frequency (%)'])
plt.title('Income >50K (Frequency)')
plt.grid(True,axis='y')
plt.xticks(rotation=45,ha='right')
plt.show()
fig, ax = plt.subplots(figsize=(18,10))
plt.pie(train_copy['income_>50K'].value_counts(), labels=train_copy['income_>50K'].value_counts().index, autopct='%1.2f%%')
plt.title('Income >50K (Pie Chart)')
plt.legend(loc='upper right')
plt.show()
sns.pairplot(train_copy)
plt.show()
sns.jointplot(x='age', y='hours-per-week', data=train_copy, kind="kde", color="#f50f0f")
plt.grid()
plt.show()
sns.jointplot(x='age', y='educational-num', data=train_copy, kind="kde", color="g")
plt.yticks(np.arange(1, train_copy['educational-num'].nunique()+1, 1))
plt.grid()
plt.show()
education_education_num_relation = pd.DataFrame(train_copy[['education','educational-num']].value_counts()).sort_values(by=['educational-num'], ascending=True).drop([0],axis=1)
education_education_num_relation
education | educational-num |
---|---|
Preschool | 1 |
1st-4th | 2 |
5th-6th | 3 |
7th-8th | 4 |
9th | 5 |
10th | 6 |
11th | 7 |
12th | 8 |
HS-grad | 9 |
Some-college | 10 |
Assoc-voc | 11 |
Assoc-acdm | 12 |
Bachelors | 13 |
Masters | 14 |
Prof-school | 15 |
Doctorate | 16 |
sns.jointplot(x='educational-num', y='hours-per-week', data=train_copy, kind="kde", color="b")
plt.grid()
sns.jointplot(x='educational-num', y='age', data=train_copy, kind="kde", color="y")
plt.show()
def chi2_test(feature_1,feature_2,show_contingency_table=False):
# creating a cross tabulation of the two features
feat_1_2_crosstab = pd.crosstab(train_copy[feature_1], train_copy[feature_2])
stat, p, dof, cont_table = chi2_contingency(feat_1_2_crosstab)
cont_table_df = pd.DataFrame(cont_table, index=feat_1_2_crosstab.index, columns=feat_1_2_crosstab.columns)
# toggling the boolean to show the contingency table
if show_contingency_table:
print(cont_table_df)
print('\n')
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
if abs(stat) >= critical:
print('Dependent (reject H0)')
else:
print('Independent (fail to reject H0)')
print('\n')
# interpret p-value
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
print('Dependent (reject H0)')
else:
print('Independent (fail to reject H0)')
# Different implementation of chi2 test
# df_ft_1 = df[feature_1].dropna(axis=0, how='any')
# df_ft_2 = df[feature_2].dropna(axis=0, how='any')
# data_crosstab = pd.crosstab(df_ft_1,df_ft_2,margins=True, margins_name="Total")
# # significance level
# alpha = 0.05
# Calculation of Chisquare test statistics
# chi_square = 0
# rows = df_ft_1.unique()
# columns = df_ft_2.unique()
# for i in columns:
# for j in rows:
# O = data_crosstab[i][j]
# E = data_crosstab[i]['Total'] * data_crosstab['Total'][j] / data_crosstab['Total']['Total']
# chi_square += (O-E)**2/E
# # The p-value approach
# print("Approach 1: The p-value approach to hypothesis testing in the decision rule")
# p_value = 1 - norm.cdf(chi_square, (len(rows)-1)*(len(columns)-1))
# conclusion = "Failed to reject the null hypothesis."
# if p_value <= alpha:
# conclusion = "Null Hypothesis is rejected."
# print("chisquare-score is:", chi_square, " and p value is:", p_value)
# print(conclusion)
# # The critical value approach
# print("\n--------------------------------------------------------------------------------------")
# print("Approach 2: The critical value approach to hypothesis testing in the decision rule")
# critical_value = chi2.ppf(1-alpha, (len(rows)-1)*(len(columns)-1))
# conclusion = "Failed to reject the null hypothesis."
# if chi_square > critical_value:
# conclusion = "Null Hypothesis is rejected."
# print("chisquare-score is:", chi_square, " and p value is:", critical_value)
# print(conclusion)
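Before running the test over every pair of categorical features, here is a single illustrative call (the feature pair is chosen here only as an example) that also prints the table of expected frequencies:
# illustrative single pair: is education associated with the income label?
chi2_test('education', 'income_>50K', show_contingency_table=True)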
all_cat_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income_>50K']
for i in all_cat_features:
for j in all_cat_features:
chi2_test(i,j)
probability=0.950, critical=66.339, stat=237062.000 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=129.918, stat=2500.713 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=58.124, stat=1210.640 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=99.617, stat=9370.526 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=49.802, stat=1351.090 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=41.337, stat=475.627 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=14.067, stat=710.261 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=312.538, stat=486.891 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=14.067, stat=895.241 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=129.918, stat=2500.713 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=260.992, stat=538260.000 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=113.145, stat=1714.905 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=228.580, stat=17033.585 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=96.217, stat=2680.998 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=79.082, stat=706.061 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=24.996, stat=279.039 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=642.377, stat=9049.430 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=24.996, stat=4674.157 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=58.124, stat=1210.640 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=113.145, stat=1714.905 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=50.998, stat=215304.000 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=99.617, stat=3499.032 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=43.773, stat=42590.780 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=36.415, stat=965.515 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=12.592, stat=7577.369 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=270.684, stat=973.841 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=12.592, stat=7110.553 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=99.617, stat=9370.526 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=228.580, stat=17033.585 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) 
probability=0.950, critical=99.617, stat=3499.032 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=200.334, stat=440193.000 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=84.821, stat=5314.426 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=69.832, stat=902.008 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=22.362, stat=6472.442 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=560.490, stat=2238.992 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=22.362, stat=3960.808 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=49.802, stat=1351.090 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=96.217, stat=2680.998 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=43.773, stat=42590.780 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=84.821, stat=5314.426 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=37.652, stat=179420.000 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=31.410, stat=1379.199 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=11.070, stat=15004.363 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=228.580, stat=1095.406 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=11.070, stat=7313.732 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=41.337, stat=475.627 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=79.082, stat=706.061 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=36.415, stat=965.515 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=69.832, stat=902.008 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=31.410, stat=1379.199 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=26.296, stat=143536.000 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=9.488, stat=471.792 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=186.146, stat=23963.259 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=9.488, stat=337.226 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=14.067, stat=710.261 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=24.996, stat=279.039 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=12.592, stat=7577.369 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=22.362, stat=6472.442 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) 
probability=0.950, critical=11.070, stat=15004.363 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=9.488, stat=471.792 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=3.841, stat=35879.497 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=54.572, stat=155.096 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=3.841, stat=1627.603 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=312.538, stat=486.891 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=642.377, stat=9049.430 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=270.684, stat=973.841 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=560.490, stat=2238.992 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=228.580, stat=1095.406 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=186.146, stat=23963.259 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=54.572, stat=155.096 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=1612.844, stat=1376700.000 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=54.572, stat=344.482 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=14.067, stat=895.241 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=24.996, stat=4674.157 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=12.592, stat=7110.553 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=22.362, stat=3960.808 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=11.070, stat=7313.732 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=9.488, stat=337.226 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=3.841, stat=1627.603 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=54.572, stat=344.482 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0) probability=0.950, critical=3.841, stat=35161.519 Dependent (reject H0) significance=0.050, p=0.000 Dependent (reject H0)
all_cat_features
['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income_>50K']
fig, axes = plt.subplots(3,3,figsize=(25,15),dpi=180)
fig.tight_layout(pad=5.0)
# one boxplot of age per categorical feature, laid out on the 3x3 grid
for cat_ft, ax in zip(all_cat_features, axes.flat):
    sns.boxplot(ax=ax, x=train_copy[cat_ft], y=train_copy['age'])
    ax.set_title(cat_ft + " vs age")
    plt.sca(ax)
    plt.xticks(rotation=45, ha='right')
plt.show()
train_copy_corr = train_copy.corr()
fig, ax = plt.subplots(figsize=(18,10))
sns.heatmap(train_copy_corr, annot=True, cmap='coolwarm')
plt.show()
mask = np.zeros_like(train_copy_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
train_copy_corr[mask] = np.nan
(train_copy_corr
.style
.background_gradient(cmap='bwr', axis=None, vmin=-1, vmax=1)
.highlight_null(null_color='#f1f1f1')
)
age | fnlwgt | educational-num | capital-gain | capital-loss | hours-per-week | income_>50K | |
---|---|---|---|---|---|---|---|
age | nan | nan | nan | nan | nan | nan | nan |
fnlwgt | -0.079167 | nan | nan | nan | nan | nan | nan |
educational-num | 0.034597 | -0.042468 | nan | nan | nan | nan | nan |
capital-gain | 0.077325 | -0.002336 | 0.130593 | nan | nan | nan | nan |
capital-loss | 0.054293 | -0.007103 | 0.078856 | -0.031567 | nan | nan | nan |
hours-per-week | 0.068691 | -0.014005 | 0.146415 | 0.082703 | 0.055377 | nan | nan |
income_>50K | 0.228892 | -0.010216 | 0.332821 | 0.225526 | 0.143284 | 0.229856 | nan |
Age:
Workclass:
Fnlwgt:
Education:
Education-num:
Marital-status:
Occupation:
Relationship:
Race:
Gender:
Capital gain:
Capital loss:
Hours per week:
Native country:
GDP Category (extracted from the native country):
Income (Target):
# GDP data that will be used to group the countries by their GDP
# GDP data of countries in 1990
# Using 1990 data because the train and test data come from an old dataset. How do I know that? One of the country names is Yugoslavia, which has not existed since the mid-1990s.
gdp_data = pd.read_csv('datasets/GDP.csv')
gdp_data.sort_values(by='1990' , inplace=True,ascending=False)
gdp_data.reset_index(inplace=True, drop=True)
gdp_data.rename(columns={'Country Name':'native-country','1990':'GDP_1990'},inplace=True)
gdp_data
native-country | GDP_1990 | |
---|---|---|
0 | Monaco | 84303.880 |
1 | United Arab Emirates | 71359.170 |
2 | Brunei | 54713.545 |
3 | Liechtenstein | 49373.610 |
4 | Luxembourg | 29664.795 |
... | ... | ... |
191 | Afghanistan | 292.000 |
192 | Mozambique | 244.601 |
193 | Eritrea | 147.000 |
194 | Somalia | 126.920 |
195 | Cambodia | 100.000 |
196 rows × 2 columns
def add_gdp_data(train_copy,test_copy,gdp_data):
full_data_copy = pd.concat([train_copy,test_copy],ignore_index=True)
gdp_group = []
for idx in gdp_data.index:
if idx <= 65:
gdp_group.append('High GDP')
elif idx >= 65 and idx <= 130:
gdp_group.append('Medium GDP')
else:
gdp_group.append('Low GDP')
# concatenate the gdp_data with the gdp_group list
gdp_data = pd.concat([gdp_data.rename(columns={'country':'native-country'}), pd.Series(gdp_group, name='GDP Group')], axis=1)
# we no longer need the GDP column, so let's drop it
gdp_data.drop(['GDP_1990'],axis=1,inplace=True)
# we need to merge the gdp_data with X dataframe
full_data_copy = pd.merge(full_data_copy, gdp_data, on='native-country', how='left')
# make income_>50K the last column
new_col_order = [col for col in full_data_copy.columns if col != 'income_>50K'] + ['income_>50K']
return full_data_copy[new_col_order]
full_data_copy = add_gdp_data(train_copy,test_copy,gdp_data)
train_copy, test_copy = data_split(full_data_copy,0.2)
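As a quick sanity check (illustrative, reusing the value_cnt_norm_cal helper defined earlier), the distribution of the new GDP Group column and the number of rows that did not match any country in GDP.csv can be inspected:
# illustrative check: share of each GDP group after the merge
value_cnt_norm_cal(train_copy, 'GDP Group')
# rows whose native-country did not match an entry in GDP.csv end up as NaN
train_copy['GDP Group'].isnull().sum()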
class OutlierHandler(BaseEstimator, TransformerMixin):
def __init__(self,col_with_outliers = ['age']):
self.col_with_outliers = col_with_outliers
def fit(self,X,y=None):
return self
def transform(self,X,y=None):
if (set(self.col_with_outliers).issubset(X.columns)):
Q1 = X[self.col_with_outliers].quantile(.25)
Q3 = X[self.col_with_outliers].quantile(.75)
IQR = Q3 - Q1
outlier_condition = (X[self.col_with_outliers] < (Q1 - 1.5 * IQR)) | (X[self.col_with_outliers] > (Q3 + 1.5 * IQR))
index_to_keep = X[~outlier_condition.any(axis=1)].index
return X.loc[index_to_keep]
else:
print("Columns not found")
return X
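A minimal standalone sketch of what this transformer does (illustrative only; in the notebook it is applied inside the preprocessing pipeline further below):
# illustrative: dropping IQR outliers on 'age' removes a small number of rows
outlier_handler = OutlierHandler(col_with_outliers=['age'])
train_no_outliers = outlier_handler.fit_transform(train_copy)
print(train_copy.shape, '->', train_no_outliers.shape)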
msno.bar(train_copy)
plt.show()
class MissingValHandler(BaseEstimator, TransformerMixin):
def __init__(self):
pass
def fit(self,X,y=None):
return self
def transform(self,X,y=None):
# drop all the rows with missing values in X
X.dropna(inplace=True)
X.reset_index(inplace=True, drop=True)
return X
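A quick illustrative check of how many rows this transformer would drop (it removes any row with at least one missing value):
# illustrative: rows with at least one missing value in the training copy
print(train_copy.shape[0] - train_copy.dropna().shape[0], 'rows contain at least one missing value')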
For more information, refer to this post
class SkewnessHandler(BaseEstimator, TransformerMixin):
def __init__(self,col_with_skewness=['age','capital-gain','capital-loss']):
self.col_with_skewness = col_with_skewness
def fit(self,X,y=None):
return self
def transform(self,X,y=None):
if (set(self.col_with_skewness).issubset(X.columns)):
# Handle skewness with cubic root transformation
X[self.col_with_skewness] = np.cbrt(X[self.col_with_skewness])
return X
else:
print('One or more skewed columns are not found')
return X
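A minimal sketch to confirm that the cube-root transform actually reduces skewness (illustrative; it calls pandas' skew() on a throwaway copy so the working frame is untouched):
# illustrative: skewness before and after the cube-root transform
skewed_cols = ['age', 'capital-gain', 'capital-loss']
print(train_copy[skewed_cols].skew())
print(SkewnessHandler().fit_transform(train_copy.copy())[skewed_cols].skew())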
class OversampleSMOTE(BaseEstimator,TransformerMixin):
def __init__(self, perform_oversampling = True):
self.perform_oversampling = perform_oversampling
def fit(self,X,y=None):
return self
def transform(self,X,y=None):
# function to oversample the minority class
if self.perform_oversampling:
smote = SMOTE()
X_bal, y_bal = smote.fit_resample(X.iloc[:,:-1],X.iloc[:,-1])
X_y_bal = pd.concat([pd.DataFrame(X_bal),pd.DataFrame(y_bal)],axis=1)
return X_y_bal
else:
print("No oversampling performed")
return X
def smote_pipeline_fuc(df):
smote_pipeline = Pipeline([
('smote', OversampleSMOTE()) # default: perform_oversampling = True
])
smote_pip_result = smote_pipeline.fit_transform(df)
return smote_pip_result
def concat_fuc(df_ordinal_minmax, df_onehot, df_target):
concat_df = pd.concat([df_ordinal_minmax, df_onehot, df_target], axis=1)
return concat_df
def one_hot_enc_fuc(df):
columns_to_one_hot_enc = ['race', 'gender', 'workclass', 'occupation','marital-status', 'relationship']
one_hot_enc = OneHotEncoder()
one_hot_enc.fit(df[columns_to_one_hot_enc])
# get the result of the one hot encoding columns names
cols_names_one_hot_enc = one_hot_enc.get_feature_names_out(columns_to_one_hot_enc)
# change the array of the one hot encoding to a dataframe with the column names
one_hot_result_with_names_col = pd.DataFrame(one_hot_enc.transform(df[columns_to_one_hot_enc]).toarray(),columns=cols_names_one_hot_enc)
return one_hot_result_with_names_col
def ordinal_minmax_scaler_fuc(df):
columns_to_ordinal_enc = ['education', 'GDP Group']
columns_to_scale = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']
col_transformer = ColumnTransformer([
('Ordinal encoder',OrdinalEncoder(),columns_to_ordinal_enc), # ordinal encoding for education and GDP Group because they are ranked
('Min max scaler',MinMaxScaler(),columns_to_scale)]) # scaling for age, capital-gain, capital-loss, hours-per-week
ordinal_minmax_scaler_result = col_transformer.fit_transform(df)
ordinal_minmax_scaler_result_with_names_col = pd.DataFrame(ordinal_minmax_scaler_result,columns=columns_to_ordinal_enc+columns_to_scale)
return ordinal_minmax_scaler_result_with_names_col
def extract_target_col(df):
target = df.iloc[:,-1].to_frame().reset_index(drop=True)
return target
def initial_pipeline_fuc(df):
init_pipeline = Pipeline([
('Missing values handler', MissingValHandler()), # drop missing values in the whole dataset
('Outliers handler', OutlierHandler()),
('Skewness handler', SkewnessHandler()), # columns with skewness are 'age','capital-gain','capital-loss'
])
init_pip_result = init_pipeline.fit_transform(df)
return init_pip_result
def full_pipeline_fuc(df):
# initial pipeline
init_pip_result = initial_pipeline_fuc(df)
#extracting the target variable
target = extract_target_col(init_pip_result)
# column transformers to apply ordinal and minmax transformation on specific columns
ordinal_minmax_result = ordinal_minmax_scaler_fuc(init_pip_result)
#one hot encoding
one_hot_enc_result = one_hot_enc_fuc(init_pip_result)
# concat the result from the ordinal and minmax transformation and one hot encoding with the target variable
encoded_concat_result = concat_fuc(ordinal_minmax_result,one_hot_enc_result,target)
# balance the imbalance data with smote function
smote_pip_result = smote_pipeline_fuc(encoded_concat_result)
return smote_pip_result
train_copy.head(2)
age | workclass | fnlwgt | education | educational-num | marital-status | occupation | relationship | race | gender | capital-gain | capital-loss | hours-per-week | native-country | GDP Group | income_>50K | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 51 | State-gov | 105943 | HS-grad | 9 | Married-civ-spouse | Adm-clerical | Wife | White | Female | 3908 | 0 | 40 | United-States | High GDP | 0.000 |
1 | 38 | Local-gov | 34744 | Some-college | 10 | Divorced | Adm-clerical | Unmarried | White | Female | 0 | 0 | 38 | United-States | High GDP | 0.000 |
train_copy_prep = full_pipeline_fuc(train_copy)
train_copy_prep.shape
train_copy_prep.head(2)
education | GDP Group | age | capital-gain | capital-loss | hours-per-week | race_Amer-Indian-Eskimo | race_Asian-Pac-Islander | race_Black | race_Other | ... | marital-status_Never-married | marital-status_Separated | marital-status_Widowed | relationship_Husband | relationship_Not-in-family | relationship_Other-relative | relationship_Own-child | relationship_Unmarried | relationship_Wife | income_>50K | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 11.000 | 0.000 | 0.691 | 0.339 | 0.000 | 0.398 | 0.000 | 0.000 | 0.000 | 0.000 | ... | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 |
1 | 15.000 | 0.000 | 0.480 | 0.000 | 0.000 | 0.378 | 0.000 | 0.000 | 0.000 | 0.000 | ... | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 |
2 rows × 48 columns
pd.set_option('display.max_columns', None)
train_copy_prep
education | GDP Group | age | capital-gain | capital-loss | hours-per-week | race_Amer-Indian-Eskimo | race_Asian-Pac-Islander | race_Black | race_Other | race_White | gender_Female | gender_Male | workclass_Federal-gov | workclass_Local-gov | workclass_Private | workclass_Self-emp-inc | workclass_Self-emp-not-inc | workclass_State-gov | workclass_Without-pay | occupation_Adm-clerical | occupation_Armed-Forces | occupation_Craft-repair | occupation_Exec-managerial | occupation_Farming-fishing | occupation_Handlers-cleaners | occupation_Machine-op-inspct | occupation_Other-service | occupation_Priv-house-serv | occupation_Prof-specialty | occupation_Protective-serv | occupation_Sales | occupation_Tech-support | occupation_Transport-moving | marital-status_Divorced | marital-status_Married-AF-spouse | marital-status_Married-civ-spouse | marital-status_Married-spouse-absent | marital-status_Never-married | marital-status_Separated | marital-status_Widowed | relationship_Husband | relationship_Not-in-family | relationship_Other-relative | relationship_Own-child | relationship_Unmarried | relationship_Wife | income_>50K | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 11.000 | 0.000 | 0.691 | 0.339 | 0.000 | 0.398 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 |
1 | 15.000 | 0.000 | 0.480 | 0.000 | 0.000 | 0.378 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 |
2 | 11.000 | 0.000 | 0.238 | 0.000 | 0.000 | 0.296 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 |
3 | 11.000 | 0.000 | 0.260 | 0.000 | 0.764 | 0.398 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
4 | 15.000 | 1.000 | 0.166 | 0.000 | 0.000 | 0.398 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
48859 | 14.000 | 0.000 | 0.799 | 0.000 | 0.844 | 0.287 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 |
48860 | 15.000 | 0.000 | 0.614 | 0.000 | 0.775 | 0.500 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 |
48861 | 7.000 | 0.000 | 0.642 | 0.000 | 0.000 | 0.420 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 |
48862 | 15.000 | 0.000 | 0.496 | 0.000 | 0.000 | 0.398 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 1.000 |
48863 | 11.000 | 0.000 | 0.523 | 0.000 | 0.000 | 0.398 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 |
48864 rows × 48 columns
test_copy_prep = full_pipeline_fuc(test_copy)
test_copy_prep.shape
(12022, 48)
X_train_copy_prep = train_copy_prep.iloc[:,:-1]
X_train_copy_prep.shape
(48864, 47)
y_train_copy_prep = train_copy_prep.iloc[:,-1]
y_train_copy_prep.shape
(48864,)
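Because the pipeline ends with SMOTE, both classes should now appear in equal numbers; a quick illustrative check:
# illustrative: the oversampled target should be perfectly balanced
y_train_copy_prep.value_counts()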
X_test_copy_prep = test_copy_prep.iloc[:,:-1]
X_test_copy_prep.shape
(12022, 47)
y_test_copy_prep = test_copy_prep.iloc[:,-1]
y_test_copy_prep.shape
(12022,)
def create_summary_table(summary_data):
summary_table_col = [
'Model name',
'Precision score (Validation set)',
'Recall score (Validation set)',
'F1 score (Validation set)',
'Accuracy score (Validation set)',
'AUC score (Validation set)',
'AUC score (Train set)',
'Has overfit (AUC score (Train set) > AUC score (Validation set))',
'Confusion matrix',
'Roc auc curve',
'Top 10 important features',
'Top 10 useless features',
'Top 10 important features plot',
'Top 10 useless features plot',
]
print('\n *************** Metrics Summary Table ***************\n')
# print the summary for all models and set the width and height of the embedded plot images
summary_df = pd.DataFrame(summary_data, columns=summary_table_col).iloc[[4,6,7,9,10]].style.set_properties(subset=['Confusion matrix','Roc auc curve', 'Top 10 important features plot', 'Top 10 useless features plot'], **{'width': '600px','height': '600px'})
# print only knn, random forest, NN, bagging, gradient boosting
display(HTML(summary_df.to_html()))
#return summary_df
def top_and_worst_feat_fuc(col_with_coef):
top_10_feat, worst_10_feat = col_with_coef[:10], col_with_coef[-10:]
top_10_feat_str = ""
worst_10_feat_str = ""
for count,feat in enumerate(top_10_feat, start=1):
# top 10 features string formatting
top_10_feat_str += "{0}. feature name: {1}".format(count,feat[0])+ "<br>" + "coefficient: {:.4f}".format(feat[1]) + "<br>"
for count,feat in enumerate(worst_10_feat, start=1):
# worst 10 features string formatting
worst_10_feat_str += "{0}. feature name: {1}".format(count,feat[0])+ "<br>" + "coefficient: {:.4f}".format(feat[1]) + "<br>"
return top_10_feat_str, worst_10_feat_str
def check_overfit(auc_score_val_set, auc_score_train_set):
# if the auc score of the training set is higher than the validation set by more than 0.03, then the model is overfitting
if (auc_score_train_set - auc_score_val_set) > 0.03:
return True
else:
return False
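For example (illustrative numbers only), a train/validation AUC gap of 0.05 is flagged while a gap of 0.02 is not:
# illustrative: a gap of 0.05 exceeds the 0.03 threshold, a gap of 0.02 does not
print(check_overfit(auc_score_val_set=0.90, auc_score_train_set=0.95))  # True
print(check_overfit(auc_score_val_set=0.90, auc_score_train_set=0.92))  # False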
summary_data = []
def growing_summary_table_fuc(model_name,precision_score,recall_score,f1_score, accuracy_score, auc_score_val_set, auc_score_train_set, is_overfitting, img_conf_matrix, img_roc_auc, col_with_coef, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html):
if col_with_coef == 'No coeficient or feature importance for this model':
each_clf_data = [model_name,precision_score,recall_score,f1_score, accuracy_score, auc_score_val_set, auc_score_train_set, is_overfitting, img_conf_matrix, img_roc_auc, col_with_coef, col_with_coef, col_with_coef, col_with_coef]
summary_data.append(each_clf_data)
else:
top_10_feat, worst_10_feat = top_and_worst_feat_fuc(col_with_coef)
each_clf_data = [model_name,precision_score,recall_score,f1_score, accuracy_score, auc_score_val_set, auc_score_train_set, is_overfitting, img_conf_matrix, img_roc_auc, top_10_feat, worst_10_feat, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html]
summary_data.append(each_clf_data)
def roc_curve_fuc(model_trn,model_name,X_train_copy_prep,y_train_copy_prep):
# path to save the roc curve
roc_curve_path = Path('saved_preliminary_models/{0}/{0}_ROC_curve.jpg'.format(model_name))
try:
roc_curve_path.resolve(strict=True)
except FileNotFoundError:
print('\n ROC curve\n')
lower_than_50k_probs = [0 for _ in range(len(y_train_copy_prep))]
higher_than_50k_probs = model_trn.predict_proba(X_train_copy_prep)
higher_than_50k_probs_pos_outcome = higher_than_50k_probs[:,1]
lower_than_50k_auc = roc_auc_score(y_train_copy_prep,lower_than_50k_probs)
higher_than_50k_probs_auc = roc_auc_score(y_train_copy_prep,higher_than_50k_probs_pos_outcome)
#save the auc
with open('saved_preliminary_models/{0}/lower_than_50k_auc_{0}.pickle'.format(model_name),'wb') as handle:
pickle.dump(lower_than_50k_auc,handle)
with open('saved_preliminary_models/{0}/higher_than_50k_probs_auc_{0}.pickle'.format(model_name),'wb') as handle:
pickle.dump(higher_than_50k_probs_auc,handle)
# print the auc
print('Income lower than 50k: ROC AUC=%.3f' % (lower_than_50k_auc))
print('Income higher than 50k: ROC AUC=%.3f' % (higher_than_50k_probs_auc))
lower_than_50k_false_pos_rate, lower_than_50k_true_pos_rate, _ = roc_curve(y_train_copy_prep,lower_than_50k_probs)
higher_than_50k_false_pos_rate, higher_than_50k_true_pos_rate, _ = roc_curve(y_train_copy_prep,higher_than_50k_probs_pos_outcome)
plt.plot(lower_than_50k_false_pos_rate, lower_than_50k_true_pos_rate, linestyle='--', label='Income lower than 50k')
plt.plot(higher_than_50k_false_pos_rate, higher_than_50k_true_pos_rate, marker='.', label='Income higher than 50k')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.title('ROC curve')
# show the legend
plt.legend()
# save the plot
plt.savefig('saved_preliminary_models/{0}/{0}_ROC_curve.jpg'.format(model_name))
# show the plot
plt.show()
# creating the html roc auc curve image
img_roc_auc = '<img src="'+ 'saved_preliminary_models/{0}/{0}_ROC_curve.jpg'.format(model_name) + '">'
return higher_than_50k_probs_auc, img_roc_auc
else:
# if roc curve path exists, load the auc first
with open('saved_preliminary_models/{0}/lower_than_50k_auc_{0}.pickle'.format(model_name),'rb') as handle:
lower_than_50k_auc = pickle.load(handle)
with open('saved_preliminary_models/{0}/higher_than_50k_probs_auc_{0}.pickle'.format(model_name),'rb') as handle:
higher_than_50k_probs_auc = pickle.load(handle)
# print the auc
print('Income lower than 50k: ROC AUC=%.3f' % (lower_than_50k_auc))
print('Income higher than 50k: ROC AUC=%.3f' % (higher_than_50k_probs_auc))
# read the ROC image
img_roc = mpimg.imread('saved_preliminary_models/{0}/{0}_ROC_curve.jpg'.format(model_name))
# plot the ROC image
img_roc_plot = plt.imshow(img_roc)
#remove the axis
plt.axis('off')
# show the plot
plt.show()
# creating the html roc auc curve image
img_roc_auc = '<img src="'+ 'saved_preliminary_models/{0}/{0}_ROC_curve.jpg'.format(model_name) + '">'
return higher_than_50k_probs_auc, img_roc_auc
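Note that the "Income lower than 50k: ROC AUC=0.500" line printed for every model below is just the no-skill baseline: a constant prediction always scores an AUC of 0.5, so it only serves as a reference point for the model's own AUC. A minimal sketch (with a made-up label vector) showing why:
from sklearn.metrics import roc_auc_score
y_toy = [0, 1, 0, 1, 1, 0]             # hypothetical labels
baseline_scores = [0] * len(y_toy)      # constant "no-skill" prediction, as used in roc_curve_fuc
roc_auc_score(y_toy, baseline_scores)   # 0.5, regardless of the labels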
def confusion_matrix_fuc(model_name,y_train_copy_prep,y_train_copy_pred):
#path to save the confusion matrix
confusion_matrix_path = Path('saved_preliminary_models/{0}/{0}_confusion_matrix.jpg'.format(model_name))
try:
#check if the path exists
confusion_matrix_path.resolve(strict=True)
except FileNotFoundError:
print('\n Confusion Matrix\n')
#plot confusion matrix
confusion_matrix = ConfusionMatrixDisplay.from_predictions(y_train_copy_prep,y_train_copy_pred, cmap='Blues',values_format='d')
#give the plot a title
plt.title('Confusion Matrix')
#save the plot as jpg
plt.savefig('saved_preliminary_models/{0}/{0}_confusion_matrix.jpg'.format(model_name))
#show the plot
plt.show()
#img_conf_matrix = 'saved_preliminary_models/{0}/{0}_confusion_matrix.jpg'.format(model_name)
img_conf_matrix_html = '<img src="' + 'saved_preliminary_models/{0}/{0}_confusion_matrix.jpg'.format(model_name) + '">'
return img_conf_matrix_html
else:
img_conf_matrix_html = '<img src="' + 'saved_preliminary_models/{0}/{0}_confusion_matrix.jpg'.format(model_name) + '">'
img_conf_matrix = mpimg.imread('saved_preliminary_models/{0}/{0}_confusion_matrix.jpg'.format(model_name))
# plot the confusion matrix image
img_conf_matrix_plot = plt.imshow(img_conf_matrix)
# disable the axis
plt.axis('off')
plt.show()
return img_conf_matrix_html
def scores_cal_fuc(model,model_name,X_train_copy_prep,y_train_copy_prep):
score_list = ['precision','recall','f1','accuracy','roc_auc']
scores = []
scores_mean_for_each_type = []
scores_mean = 0
scores_std = 0
# path to save the model folder
model_dir_path = Path('saved_preliminary_models/{0}/'.format(model_name))
files_start_with_score_path = []
# check whether any file containing 'score' in its name already exists in the model folder
for i in os.listdir(model_dir_path):
if os.path.isfile(os.path.join(model_dir_path,i)) and 'score' in i:
files_start_with_score_path.append(os.path.join(model_dir_path,i))
# score files found: load the per-fold scores, their mean and std from pickle
if files_start_with_score_path:
for score_type in score_list:
# load the scores list
with open('saved_preliminary_models/{0}/score_{1}_list.pickle'.format(model_name,score_type),'rb') as handle:
scores = pickle.load(handle)
# load the mean score
with open('saved_preliminary_models/{0}/score_{1}_mean.pickle'.format(model_name,score_type),'rb') as handle:
scores_mean = pickle.load(handle)
scores_mean_for_each_type.append(scores_mean)
# load the std score
with open('saved_preliminary_models/{0}/score_{1}_std.pickle'.format(model_name,score_type),'rb') as handle:
scores_std = pickle.load(handle)
print('\n {} score\n'.format(score_type))
print('Scores: {}\n'.format(scores))
print('Mean of the scores: {}\n'.format(scores_mean))
print('Standard deviation of the scores: {}\n\n'.format(scores_std))
return scores_mean_for_each_type
# no score files in the model folder: compute them with cross validation and save them
else:
for score_type in score_list:
# calculate the scores for each score type using kfold cross validation
scores = cross_val_score(model,X_train_copy_prep,y_train_copy_prep,scoring=score_type,cv=10,n_jobs=-1)
scores_mean = scores.mean()
scores_mean_for_each_type.append(scores_mean)
scores_std = scores.std()
print('\n {} score\n'.format(score_type))
print('Scores: {}\n'.format(scores))
print('Mean of the scores: {}\n'.format(scores_mean))
print('Standard deviation of the scores: {}\n\n'.format(scores_std))
# save the scores using pickle
with open('saved_preliminary_models/{0}/score_{1}_list.pickle'.format(model_name,score_type),'wb') as handle:
pickle.dump(scores,handle)
# save the mean scores using pickle
with open('saved_preliminary_models/{0}/score_{1}_mean.pickle'.format(model_name,score_type),'wb') as handle:
pickle.dump(scores_mean,handle)
# save the standard deviation scores using pickle
with open('saved_preliminary_models/{0}/score_{1}_std.pickle'.format(model_name,score_type),'wb') as handle:
pickle.dump(scores_std,handle)
return scores_mean_for_each_type
def classification_report_fuc(model_name,y_train_copy_prep,y_train_copy_pred):
# path to save the classification report
class_rep_path = Path('saved_preliminary_models/{0}/class_rep_{0}.pickle'.format(model_name))
try:
#check if the path exists
class_rep_path.resolve(strict=True)
except FileNotFoundError:
print('\n Classification Report\n')
# build the classification report from the cross-validated (out-of-fold) predictions
cls_rep = classification_report(y_train_copy_prep,y_train_copy_pred)
print(cls_rep)
# save the classification report
with open('saved_preliminary_models/{0}/class_rep_{0}.pickle'.format(model_name),'wb') as handle:
pickle.dump(cls_rep,handle)
return cls_rep
else:
# if it exist load the classification report
with open('saved_preliminary_models/{0}/class_rep_{0}.pickle'.format(model_name),'rb') as handle:
cls_rep = pickle.load(handle)
print(' {} Classification Report\n'.format(model_name))
print(cls_rep)
return cls_rep
def load_coef_and_plot(model_name):
with open('saved_preliminary_models/{0}/coef_{0}.pickle'.format(model_name),'rb') as handle:
col_with_coef = pickle.load(handle)
# print the coefficients of the model
# print("\nCoefficients for feature importance:\n")
# [print(i) for i in col_with_coef]
# print('\n')
# load top 10 features image and plot it
img_col_with_coef_df_top_10 = mpimg.imread('saved_preliminary_models/{0}/{0}_top_10.jpg'.format(model_name))
# display the top 10 features image
img_col_with_coef_df_top_10_plot = plt.imshow(img_col_with_coef_df_top_10)
#remove the axis
plt.axis('off')
plt.show()
# load bottom 10 features image and plot it
img_col_with_coef_df_bottom_10 = mpimg.imread('saved_preliminary_models/{0}/{0}_bottom_10.jpg'.format(model_name))
# display the bottom 10 features image
img_col_with_coef_df_bottom_10_plot = plt.imshow(img_col_with_coef_df_bottom_10)
#remove the axis
plt.axis('off')
plt.show()
# save the top 10 features plot to a html tag
col_with_coef_df_top_10_html = '<img src="' + 'saved_preliminary_models/{0}/{0}_top_10.jpg'.format(model_name) + '">'
# save the bottom 10 features plot to a html tag
col_with_coef_df_bottom_10_html = '<img src="' + 'saved_preliminary_models/{0}/{0}_bottom_10.jpg'.format(model_name) + '">'
return col_with_coef, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html
def fit_and_save_coef_and_plot(model_name, X_train_copy_prep, coef):
columns_names = X_train_copy_prep.columns
col_with_coef = list(zip(columns_names,coef))
col_with_coef.sort(key=lambda x: x[1],reverse=True)
# print("\nCoefficients for feature importance:\n")
# [print(i) for i in col_with_coef]
# print('\n')
# horizontal bar plot of the top 10 features
col_with_coef_df_top_10 = pd.DataFrame(col_with_coef[:10], columns=['Columns','Coefficients'])
# horizontal bar plot of the bottom 10 features
col_with_coef_df_bottom_10 = pd.DataFrame(col_with_coef[-10:], columns=['Columns','Coefficients'])
sns.barplot(y=col_with_coef_df_top_10['Columns'],x=col_with_coef_df_top_10['Coefficients'])
# plot title top 10
plt.title('Top 10 most predictive features')
# save the plot to a jpg file
plt.savefig('saved_preliminary_models/{0}/{0}_top_10.jpg'.format(model_name))
plt.show()
sns.barplot(y=col_with_coef_df_bottom_10['Columns'],x=col_with_coef_df_bottom_10['Coefficients'])
# plot title bottom 10
plt.title('Top 10 least predictive features')
# save the plot to a jpg file
plt.savefig('saved_preliminary_models/{0}/{0}_bottom_10.jpg'.format(model_name))
plt.show()
# save the coefficients of the model to pickle
with open('saved_preliminary_models/{0}/coef_{0}.pickle'.format(model_name),'wb') as handle:
pickle.dump(col_with_coef,handle)
# save the top 10 features plot to a html tag
col_with_coef_df_top_10_html = '<img src="' + 'saved_preliminary_models/{0}/{0}_top_10.jpg'.format(model_name) + '">'
# save the bottom 10 features plot to a html tag
col_with_coef_df_bottom_10_html = '<img src="' + 'saved_preliminary_models/{0}/{0}_bottom_10.jpg'.format(model_name) + '">'
return col_with_coef, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html
col_with_coef = []
def folder_and_file_model_check(model, model_name, X_train_copy_prep, y_train_copy_prep):
col_with_coef_df_top_10_html = ''
col_with_coef_df_bottom_10_html = ''
# check if the folder with the model name exist and if not create them
if not os.path.exists('saved_preliminary_models/{}'.format(model_name)):
os.makedirs('saved_preliminary_models/{}'.format(model_name))
# check if the model file exist and if not create, train and save it
model_file_path = Path('saved_preliminary_models/{0}/{0}_model.sav'.format(model_name))
try:
model_file_path.resolve(strict=True)
except FileNotFoundError:
model_trn = model.fit(X_train_copy_prep,y_train_copy_prep)
joblib.dump(model_trn,model_file_path)
# extract the model coefficients / feature importances with a match statement (new in Python 3.10)
match model_name:
# for sgd, logistic regression and linear discriminant analysis, use coef_
case 'SGD' | 'Logistic_regression' | 'Linear_discriminant_analysis':
coef_of_each_feat = model_trn.coef_[0]
col_with_coef, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html = fit_and_save_coef_and_plot(model_name, X_train_copy_prep, coef_of_each_feat)
# the kernel SVC exposes no coef_, and computing an importance for it would take too long, so it is skipped
case 'Support_vector_machine':
# no coefficients or feature importance
col_with_coef = 'No coeficient or feature importance for this model'
pass
# for decision tree, random forest, gradient boosting, adaboost and Extra_trees, use feature_importances_
case 'Decision_tree' | 'Random_forest' | 'Gradient_boosting' | 'AdaBoost' | 'Extra_trees':
coef_of_each_feat = model_trn.feature_importances_
col_with_coef, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html = fit_and_save_coef_and_plot(model_name, X_train_copy_prep, coef_of_each_feat)
# GaussianNB does not offer an intrinsic measure of feature importance (see https://stackoverflow.com/questions/62933365/how-to-get-the-feature-importance-in-gaussian-naive-bayes), so permutation_importance is used instead
case 'Gaussian_naive_bayes':
#Gaussian naive uses the permutation importance method to evaluate feature importance
imps = permutation_importance(model_trn, X_train_copy_prep, y_train_copy_prep)
coef_of_each_feat = imps.importances_mean
col_with_coef, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html = fit_and_save_coef_and_plot(model_name, X_train_copy_prep, coef_of_each_feat)
# Feature importance is not defined for the KNN Classification algorithm
case 'K-Nearest_neighbors':
# no coefficients or feature importance
col_with_coef = 'No coeficient or feature importance for this model'
pass
case 'Bagging':
coef_of_each_feat = np.mean([tree.feature_importances_ for tree in model_trn.estimators_], axis=0)
col_with_coef, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html = fit_and_save_coef_and_plot(model_name, X_train_copy_prep, coef_of_each_feat)
# Feature importance is not defined for the Neural Network Classification algorithm
case 'Neural_network':
# no coefficients or feature importance
col_with_coef = 'No coeficient or feature importance for this model'
pass
else:
# if it exist load the model
model_trn = joblib.load(model_file_path)
#load the coefficients of the model from pickle
match model_name:
case 'SGD' | 'Logistic_regression' | 'Linear_discriminant_analysis':
col_with_coef, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html = load_coef_and_plot(model_name)
case 'Support_vector_machine':
col_with_coef = 'No coeficient or feature importance for this model'
pass
case 'Decision_tree' | 'Random_forest' | 'Gradient_boosting' | 'AdaBoost' | 'Extra_trees':
col_with_coef, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html = load_coef_and_plot(model_name)
case 'Gaussian_naive_bayes':
col_with_coef, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html = load_coef_and_plot(model_name)
case 'K-Nearest_neighbors':
col_with_coef = 'No coeficient or feature importance for this model'
pass
case 'Bagging':
col_with_coef, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html = load_coef_and_plot(model_name)
case 'Neural_network':
col_with_coef = 'No coeficient or feature importance for this model'
pass
# check if the cross-validated predictions (y_train_copy_pred) exist and if not create them
y_train_copy_pred_path = Path('saved_preliminary_models/{0}/y_train_copy_pred_{0}.sav'.format(model_name))
try:
y_train_copy_pred_path.resolve(strict=True)
except FileNotFoundError:
#cross validation prediction with kfold = 10
y_train_copy_pred = cross_val_predict(model_trn,X_train_copy_prep,y_train_copy_prep,cv=10,n_jobs=-1)
#save the predictions
joblib.dump(y_train_copy_pred,y_train_copy_pred_path)
return y_train_copy_pred, model_trn, col_with_coef, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html
else:
# if it exist load the predictions
y_train_copy_pred = joblib.load(y_train_copy_pred_path)
return y_train_copy_pred, model_trn, col_with_coef, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html
def evaluate_model(model,model_name,X_train_copy_prep,y_train_copy_prep):
print('\n *************** {} ***************\n'.format(model_name))
#create the folder and the model file if they don't exist
y_train_copy_pred,model_trn, col_with_coef, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html = folder_and_file_model_check(model,model_name,X_train_copy_prep,y_train_copy_prep)
# return the classification report
classification_report_fuc(model_name,y_train_copy_prep,y_train_copy_pred)
# print the scores by score type, mean scores and std scores and return the mean scores
scores_mean = scores_cal_fuc(model,model_name,X_train_copy_prep,y_train_copy_prep)
# return the confusion matrix
img_conf_matrix = confusion_matrix_fuc(model_name,y_train_copy_prep,y_train_copy_pred)
# plot or load the ROC curve; returns the training-set AUC and the ROC image html tag
auc_score_train, img_roc_auc = roc_curve_fuc(model_trn,model_name,X_train_copy_prep,y_train_copy_prep)
# check if the model has overfit
is_overfitting = check_overfit(scores_mean[4],auc_score_train)
# create a comparison summary table
growing_summary_table_fuc(model_name, scores_mean[0], scores_mean[1], scores_mean[2], scores_mean[3], scores_mean[4], auc_score_train, is_overfitting, img_conf_matrix, img_roc_auc, col_with_coef, col_with_coef_df_top_10_html, col_with_coef_df_bottom_10_html)
model_dict = {
'SGD':SGDClassifier(random_state=42,loss='log'), # 'log' loss makes SGD probabilistic so predict_proba works in roc_curve_fuc
'Logistic_regression':LogisticRegression(random_state=42,max_iter=1000),
'Support_vector_machine':SVC(random_state=42,probability=True), # probability=True enables predict_proba for the ROC curve
'Decision_tree':DecisionTreeClassifier(random_state=42),
'Random_forest':RandomForestClassifier(random_state=42),
'Gaussian_naive_bayes':GaussianNB(),
'K-Nearest_neighbors':KNeighborsClassifier(),
'Gradient_boosting':GradientBoostingClassifier(random_state=42),
'Linear_discriminant_analysis':LinearDiscriminantAnalysis(),
'Bagging':BaggingClassifier(random_state=42),
'Neural_network':MLPClassifier(random_state=42,max_iter=1000),
'AdaBoost':AdaBoostClassifier(random_state=42),
'Extra_trees':ExtraTreesClassifier(random_state=42),
}
def model_evaluation_df(model_dict, X_train_copy_prep, y_train_copy_prep):
for model_name,model in model_dict.items():
evaluate_model(model,model_name,X_train_copy_prep,y_train_copy_prep)
model_evaluation_df(model_dict,X_train_copy_prep, y_train_copy_prep)
*************** SGD ***************
SGD Classification Report precision recall f1-score support 0.0 0.84 0.74 0.79 24355 1.0 0.77 0.86 0.81 24355 accuracy 0.80 48710 macro avg 0.81 0.80 0.80 48710 weighted avg 0.81 0.80 0.80 48710 precision score Scores: [0.77293659 0.71717172 0.76694619 0.87867178 0.76108628 0.76272948 0.78172969 0.7593985 0.76288301 0.79200308] Mean of the scores: 0.7755556306651717 Standard deviation of the scores: 0.03902148182704336 recall score Scores: [0.89573071 0.93267652 0.90106732 0.56486043 0.908867 0.90431211 0.85749487 0.91252567 0.89979466 0.84599589] Mean of the scores: 0.8623325173728771 Standard deviation of the scores: 0.1020757288798986 f1 score Scores: [0.82981555 0.81084939 0.82861457 0.68765617 0.82843779 0.82750846 0.81786134 0.82894982 0.8257019 0.81810961] Mean of the scores: 0.8103504615169751 Standard deviation of the scores: 0.041335667606863 accuracy score Scores: [0.81625949 0.78238555 0.81359064 0.74337918 0.81174297 0.81153767 0.80907411 0.81174297 0.8101006 0.81194827] Mean of the scores: 0.8021761445288442 Standard deviation of the scores: 0.021572995928904982 roc_auc score Scores: [0.8936248 0.88152406 0.89703523 0.89240128 0.89275801 0.88998122 0.88750788 0.8912173 0.89109979 0.89664133] Mean of the scores: 0.8913790911818952 Standard deviation of the scores: 0.004264255374658238
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.893
*************** Logistic_regression ***************
Logistic_regression Classification Report precision recall f1-score support 0.0 0.83 0.78 0.81 24355 1.0 0.80 0.84 0.82 24355 accuracy 0.81 48710 macro avg 0.81 0.81 0.81 48710 weighted avg 0.81 0.81 0.81 48710 precision score Scores: [0.8017412 0.78974158 0.80124951 0.80195695 0.79187621 0.79380635 0.79248238 0.7981904 0.79457364 0.78947368] Mean of the scores: 0.7955091904805255 Standard deviation of the scores: 0.004651659062771395 recall score Scores: [0.8316913 0.82799672 0.84236453 0.841133 0.84031199 0.83162218 0.8312115 0.83326489 0.84188912 0.85010267] Mean of the scores: 0.8371587886021787 Standard deviation of the scores: 0.006616499906219255 f1 score Scores: [0.81644167 0.80841683 0.82129278 0.82107794 0.81537542 0.81227437 0.81138505 0.81535061 0.81754736 0.81866719] Mean of the scores: 0.815782922126188 Standard deviation of the scores: 0.003956328713365685 accuracy score Scores: [0.81297475 0.8037364 0.81667009 0.81667009 0.80969 0.80784233 0.80681585 0.81133238 0.81215356 0.81174297] Mean of the scores: 0.8109628413056867 Standard deviation of the scores: 0.003900420311899132 roc_auc score Scores: [0.8938642 0.88615042 0.89735622 0.89752396 0.89336813 0.89083309 0.88761595 0.89185961 0.89221129 0.89687634] Mean of the scores: 0.8927659205011752 Standard deviation of the scores: 0.003705754655283074
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.893
*************** Support_vector_machine *************** Support_vector_machine Classification Report precision recall f1-score support 0.0 0.83 0.76 0.79 24355 1.0 0.78 0.84 0.81 24355 accuracy 0.80 48710 macro avg 0.80 0.80 0.80 48710 weighted avg 0.80 0.80 0.80 48710 precision score Scores: [0.78557729 0.77589526 0.77294867 0.78454511 0.77997716 0.7743295 0.77726751 0.78106964 0.77883881 0.77304702] Mean of the scores: 0.7783495974903237 Standard deviation of the scores: 0.004260915904273536 recall score Scores: [0.8407225 0.8271757 0.84688013 0.84605911 0.841133 0.82997947 0.83408624 0.83367556 0.83737166 0.85749487] Mean of the scores: 0.8394578246224498 Standard deviation of the scores: 0.008577520424725805 f1 score Scores: [0.81221495 0.80071528 0.80822723 0.81414181 0.80940154 0.8011893 0.80467512 0.80651569 0.80704532 0.81308411] Mean of the scores: 0.8077210352765105 Standard deviation of the scores: 0.004429400343103379 accuracy score Scores: [0.80558407 0.79408746 0.79901458 0.80681585 0.80188873 0.79408746 0.7975775 0.80004106 0.79983576 0.80291521] Mean of the scores: 0.8001847669882981 Standard deviation of the scores: 0.004076108059984443 roc_auc score Scores: [0.88666014 0.87994861 0.88927602 0.89097453 0.88745613 0.88296699 0.88342066 0.88701881 0.88687383 0.89049145] Mean of the scores: 0.886508717627106 Standard deviation of the scores: 0.0033150033248867746
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.888
*************** Decision_tree ***************
Decision_tree Classification Report precision recall f1-score support 0.0 0.84 0.86 0.85 24355 1.0 0.85 0.84 0.85 24355 accuracy 0.85 48710 macro avg 0.85 0.85 0.85 48710 weighted avg 0.85 0.85 0.85 48710 precision score Scores: [0.85 0.83900929 0.83704453 0.85761453 0.8623298 0.8451514 0.85818882 0.86607143 0.86192308 0.86140147] Mean of the scores: 0.8538734353430595 Standard deviation of the scores: 0.009846927492459533 recall score Scores: [0.68390805 0.66748768 0.67898194 0.85303777 0.93596059 0.90554415 0.91457906 0.91622177 0.92032854 0.9137577 ] Mean of the scores: 0.8389807237771553 Standard deviation of the scores: 0.10815763683815853 f1 score Scores: [0.75796178 0.74348422 0.74977335 0.85532002 0.8976378 0.87430611 0.88548708 0.89044103 0.89016882 0.88680749] Mean of the scores: 0.8431387698630253 Standard deviation of the scores: 0.0617598941079285 accuracy score Scores: [0.78156436 0.76965715 0.77335249 0.85567645 0.89324574 0.86984192 0.88174913 0.88729214 0.88647095 0.8833915 ] Mean of the scores: 0.8482241839458016 Standard deviation of the scores: 0.049113812313280644 roc_auc score Scores: [0.79159047 0.78260108 0.7902679 0.86593298 0.90325937 0.88258076 0.89123382 0.89692363 0.89832028 0.89394959] Mean of the scores: 0.859665987261576 Standard deviation of the scores: 0.04786261226652227
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.999
*************** Random_forest ***************
Random_forest Classification Report precision recall f1-score support 0.0 0.89 0.75 0.81 24355 1.0 0.78 0.91 0.84 24355 accuracy 0.83 48710 macro avg 0.84 0.83 0.83 48710 weighted avg 0.84 0.83 0.83 48710 precision score Scores: [0.7853461 0.77720398 0.77970386 0.78558813 0.78519036 0.77661431 0.78773092 0.79137447 0.79143258 0.77766393] Mean of the scores: 0.7837848656437945 Standard deviation of the scores: 0.005368155520012824 recall score Scores: [0.87561576 0.86494253 0.886289 0.91297209 0.9228243 0.9137577 0.92813142 0.92689938 0.92566735 0.93511294] Mean of the scores: 0.9092212466661946 Standard deviation of the scores: 0.023337689926362273 f1 score Scores: [0.82802795 0.81872936 0.82958694 0.84450351 0.84846197 0.83962264 0.85218703 0.85379232 0.85330305 0.84915159] Mean of the scores: 0.8417366361971291 Standard deviation of the scores: 0.011685660638545033 accuracy score Scores: [0.81810716 0.80845822 0.81790187 0.83186204 0.83514679 0.82549784 0.83904742 0.84130569 0.84089509 0.83391501] Mean of the scores: 0.8292137138164648 Standard deviation of the scores: 0.010675208847656577 roc_auc score Scores: [0.90298618 0.8960157 0.90475364 0.91872005 0.92139536 0.91904703 0.92053017 0.92550677 0.92499014 0.92435912] Mean of the scores: 0.9158304167804628 Standard deviation of the scores: 0.010011574334813289
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.920
*************** Gaussian_naive_bayes ***************
Gaussian_naive_bayes Classification Report precision recall f1-score support 0.0 0.87 0.66 0.75 24355 1.0 0.73 0.90 0.80 24355 accuracy 0.78 48710 macro avg 0.80 0.78 0.78 48710 weighted avg 0.80 0.78 0.78 48710 precision score Scores: [0.72600927 0.72544418 0.72055427 0.73025435 0.72099901 0.72733311 0.7239894 0.72668701 0.73043767 0.72697368] Mean of the scores: 0.7258681957893447 Standard deviation of the scores: 0.0031596398301239124 recall score Scores: [0.90065681 0.88834154 0.89655172 0.89573071 0.90065681 0.89938398 0.8973306 0.88008214 0.90472279 0.90759754] Mean of the scores: 0.8971054645748409 Standard deviation of the scores: 0.0075323992601607674 f1 score Scores: [0.80395749 0.79867134 0.79897567 0.80457227 0.80087607 0.80426001 0.80139373 0.79606241 0.80829206 0.80730594] Mean of the scores: 0.8024366986317366 Standard deviation of the scores: 0.0037216150210196753 accuracy score Scores: [0.78033258 0.77602135 0.77437898 0.78238555 0.77602135 0.78115377 0.77766372 0.77458427 0.785465 0.78341203] Mean of the scores: 0.7791418599876823 Standard deviation of the scores: 0.003732561914141176 roc_auc score Scores: [0.86468493 0.85735823 0.8641672 0.86733478 0.86562691 0.86884843 0.86079259 0.86114401 0.86695841 0.8721599 ] Mean of the scores: 0.8649075385305295 Standard deviation of the scores: 0.004091810176145528
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.865
*************** K-Nearest_neighbors *************** K-Nearest_neighbors Classification Report precision recall f1-score support 0.0 0.88 0.79 0.83 24355 1.0 0.81 0.89 0.85 24355 accuracy 0.84 48710 macro avg 0.85 0.84 0.84 48710 weighted avg 0.85 0.84 0.84 48710 precision score Scores: [0.80837859 0.80660194 0.81588167 0.81002244 0.81172161 0.79878701 0.80468178 0.82259259 0.80811007 0.80611511] Mean of the scores: 0.809289281035513 Standard deviation of the scores: 0.006147642674653737 recall score Scores: [0.85550082 0.85262726 0.86042693 0.88916256 0.90968801 0.91950719 0.90349076 0.91211499 0.91663244 0.92032854] Mean of the scores: 0.8939479504894077 Standard deviation of the scores: 0.02620074943360306 f1 score Scores: [0.83127244 0.82897625 0.83756244 0.84774951 0.85791715 0.85490645 0.85122848 0.86504382 0.85895709 0.85944391] Mean of the scores: 0.8493057538017748 Standard deviation of the scores: 0.011965176735247296 accuracy score Scores: [0.82631903 0.82406077 0.83309382 0.8402792 0.84931226 0.84397454 0.84212687 0.85772942 0.84951755 0.84951755] Mean of the scores: 0.8415931020324369 Standard deviation of the scores: 0.010325503313498302 roc_auc score Scores: [0.89390575 0.89075394 0.89814495 0.90669416 0.91588839 0.90818295 0.90979894 0.92222489 0.91327529 0.91890803] Mean of the scores: 0.9077777300114974 Standard deviation of the scores: 0.010045223582988625
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.964
*************** Gradient_boosting ***************
Gradient_boosting Classification Report precision recall f1-score support 0.0 0.89 0.82 0.85 24355 1.0 0.83 0.90 0.86 24355 accuracy 0.86 48710 macro avg 0.86 0.86 0.86 48710 weighted avg 0.86 0.86 0.86 48710 precision score Scores: [0.83354193 0.81529699 0.8254354 0.82915718 0.83253939 0.82415177 0.83461963 0.83832778 0.83685577 0.83309144] Mean of the scores: 0.8303017263935084 Standard deviation of the scores: 0.006612571779948256 recall score Scores: [0.82019704 0.8226601 0.83661741 0.89655172 0.93267652 0.92772074 0.93264887 0.93059548 0.92689938 0.94291581] Mean of the scores: 0.8969483078935745 Standard deviation of the scores: 0.04763366247012269 f1 score Scores: [0.82681564 0.81896199 0.83098879 0.86153846 0.87976767 0.87287481 0.88091544 0.88205527 0.87957911 0.88460798] Mean of the scores: 0.861810515856692 Standard deviation of the scores: 0.024628973229388777 accuracy score Scores: [0.8281667 0.81810716 0.82980907 0.85588175 0.87251078 0.8649148 0.87394785 0.87559023 0.87312667 0.8770273 ] Mean of the scores: 0.8569082323958119 Standard deviation of the scores: 0.02163733233872181 roc_auc score Scores: [0.91419729 0.90615266 0.91584607 0.93978895 0.94812574 0.95045485 0.94915251 0.95297295 0.95222568 0.95368168] Mean of the scores: 0.9382598378868648 Standard deviation of the scores: 0.01768799138062214
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.941
*************** Linear_discriminant_analysis ***************
Linear_discriminant_analysis Classification Report precision recall f1-score support 0.0 0.83 0.76 0.80 24355 1.0 0.78 0.85 0.81 24355 accuracy 0.80 48710 macro avg 0.81 0.80 0.80 48710 weighted avg 0.81 0.80 0.80 48710 precision score Scores: [0.78552279 0.77646611 0.78741048 0.78479576 0.78190691 0.77837014 0.77570449 0.78217822 0.78078078 0.77662338] Mean of the scores: 0.7809759059416234 Standard deviation of the scores: 0.0039099889899369454 recall score Scores: [0.84195402 0.83702791 0.85755337 0.85180624 0.85509031 0.83942505 0.83655031 0.84353183 0.85420945 0.85954825] Mean of the scores: 0.8476696742564476 Standard deviation of the scores: 0.008416562783895857 f1 score Scores: [0.81276006 0.80561043 0.82098644 0.81692913 0.81686275 0.8077455 0.80497925 0.81169729 0.81584624 0.81598441] Mean of the scores: 0.81294015072365 Standard deviation of the scores: 0.0050932951548407265 accuracy score Scores: [0.80599466 0.79798809 0.81297475 0.80907411 0.80825293 0.80024636 0.7973722 0.80435229 0.80722644 0.80619996] Mean of the scores: 0.804968179018682 Standard deviation of the scores: 0.004780371263654884 roc_auc score Scores: [0.89128364 0.88237593 0.89404838 0.89266799 0.88786933 0.88558211 0.88480493 0.88670456 0.8902754 0.89294978] Mean of the scores: 0.8888562055141394 Standard deviation of the scores: 0.003753428366703254
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.889
*************** Bagging ***************
Bagging Classification Report precision recall f1-score support 0.0 0.86 0.88 0.87 24355 1.0 0.88 0.86 0.87 24355 accuracy 0.87 48710 macro avg 0.87 0.87 0.87 48710 weighted avg 0.87 0.87 0.87 48710 precision score Scores: [0.87336473 0.85942327 0.85894207 0.88398018 0.87984791 0.87089381 0.88305215 0.88746605 0.88417431 0.89000386] Mean of the scores: 0.8771148337925846 Standard deviation of the scores: 0.010530766085909948 recall score Scores: [0.68513957 0.68513957 0.6999179 0.87889984 0.9499179 0.93634497 0.94579055 0.93921971 0.94989733 0.94702259] Mean of the scores: 0.8617289932329231 Standard deviation of the scores: 0.11415338466639222 f1 score Scores: [0.7678859 0.76244861 0.77131871 0.88143269 0.91354126 0.9024342 0.91334523 0.91260974 0.91585825 0.91762833] Mean of the scores: 0.8658502898656746 Standard deviation of the scores: 0.06534887047971946 accuracy score Scores: [0.79285568 0.78649148 0.79244508 0.88174913 0.91008007 0.89878875 0.91028536 0.91008007 0.91274892 0.91500719] Mean of the scores: 0.8710531718332991 Standard deviation of the scores: 0.053464266478599407 roc_auc score Scores: [0.89113511 0.88191071 0.89104332 0.94283396 0.96490738 0.96198366 0.96354477 0.96662519 0.96301406 0.96583208] Mean of the scores: 0.9392830253251198 Standard deviation of the scores: 0.034242736074440616
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.997
*************** Neural_network *************** Neural_network Classification Report precision recall f1-score support 0.0 0.89 0.80 0.84 24355 1.0 0.82 0.90 0.86 24355 accuracy 0.85 48710 macro avg 0.85 0.85 0.85 48710 weighted avg 0.85 0.85 0.85 48710 precision score Scores: [0.82311516 0.79018359 0.83143219 0.82537879 0.85283474 0.80678206 0.808839 0.83508246 0.81947913 0.78250901] Mean of the scores: 0.8175636127573244 Standard deviation of the scores: 0.020024279254790008 recall score Scores: [0.81568144 0.86576355 0.80788177 0.89449918 0.87068966 0.93798768 0.94702259 0.91498973 0.94332649 0.98110883] Mean of the scores: 0.8978950917618338 Standard deviation of the scores: 0.054724104883165564 f1 score Scores: [0.81938144 0.82624878 0.81948782 0.85855004 0.86166971 0.86745158 0.87249338 0.87321184 0.87705231 0.87062682] Mean of the scores: 0.8546173715371264 Standard deviation of the scores: 0.022215504017905195 accuracy score Scores: [0.82016013 0.81790187 0.8220078 0.852597 0.86019298 0.85670294 0.86163006 0.86717307 0.86778896 0.85423938] Mean of the scores: 0.8480394169575037 Standard deviation of the scores: 0.018936721801716936 roc_auc score Scores: [0.90566882 0.90557449 0.91031634 0.92781886 0.93566944 0.93330484 0.93965618 0.94216729 0.93710732 0.94078125] Mean of the scores: 0.9278064825023685 Standard deviation of the scores: 0.014084740591121225
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.947
*************** AdaBoost ***************
AdaBoost Classification Report precision recall f1-score support 0.0 0.86 0.82 0.84 24355 1.0 0.83 0.87 0.85 24355 accuracy 0.84 48710 macro avg 0.85 0.84 0.84 48710 weighted avg 0.85 0.84 0.84 48710 precision score Scores: [0.82581189 0.81544885 0.8239466 0.83260611 0.82690854 0.82424242 0.84074217 0.84603361 0.83587786 0.83333333] Mean of the scores: 0.8304951394737436 Standard deviation of the scores: 0.008550426693125727 recall score Scores: [0.80377668 0.80172414 0.81075534 0.86165846 0.89819376 0.8936345 0.89322382 0.88911704 0.89938398 0.90349076] Mean of the scores: 0.8654958477053641 Standard deviation of the scores: 0.04082535096646606 f1 score Scores: [0.81464531 0.80852826 0.81729774 0.8468832 0.86107832 0.85753695 0.86618877 0.86704045 0.86646884 0.86699507] Mean of the scores: 0.8472662900108331 Standard deviation of the scores: 0.022948470911779945 accuracy score Scores: [0.81708068 0.8101006 0.81872305 0.84417984 0.85506056 0.85157052 0.86204065 0.86368302 0.86142476 0.86142476] Mean of the scores: 0.8445288441798399 Standard deviation of the scores: 0.020024381343844655 roc_auc score Scores: [0.90717126 0.89823658 0.9074668 0.92693563 0.93385165 0.93206539 0.93579875 0.93777787 0.93733356 0.93831449] Mean of the scores: 0.9254951986459103 Standard deviation of the scores: 0.01442360177194457
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.928
*************** Extra_trees ***************
Extra_trees Classification Report precision recall f1-score support 0.0 0.89 0.87 0.88 24355 1.0 0.87 0.89 0.88 24355 accuracy 0.88 48710 macro avg 0.88 0.88 0.88 48710 weighted avg 0.88 0.88 0.88 48710 precision score Scores: [0.86946494 0.86832413 0.86857936 0.88014383 0.87758945 0.86263531 0.87608943 0.8847032 0.87537651 0.88286581] Mean of the scores: 0.8745771961852187 Standard deviation of the scores: 0.006767883091768545 recall score Scores: [0.77380952 0.77422003 0.77052545 0.9043514 0.95648604 0.94907598 0.94948665 0.95482546 0.95482546 0.95646817] Mean of the scores: 0.8944074171479823 Standard deviation of the scores: 0.08090910344437814 f1 score Scores: [0.81885317 0.81857639 0.81661953 0.89208342 0.9153408 0.90379351 0.91131257 0.91842781 0.91337655 0.91819436] Mean of the scores: 0.8826578110385158 Standard deviation of the scores: 0.042952680352943286 accuracy score Scores: [0.82878259 0.828372 0.82693492 0.89057688 0.91151714 0.89899405 0.90761651 0.91521248 0.90946418 0.91480189] Mean of the scores: 0.8832272633956066 Standard deviation of the scores: 0.0368080125539174 roc_auc score Scores: [0.88657113 0.88800176 0.89205045 0.93667945 0.95763699 0.94882217 0.95863907 0.96187947 0.9557446 0.95811215] Mean of the scores: 0.9344137222969625 Standard deviation of the scores: 0.030573753266218293
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.999
create_summary_table(summary_data)
*************** Metrics Summary Table ***************
Model name | Precision (validation) | Recall (validation) | F1 (validation) | Accuracy (validation) | AUC (validation) | AUC (train) | Has overfit (train AUC exceeds validation AUC by more than 0.03)
---|---|---|---|---|---|---|---
Random_forest | 0.783785 | 0.909221 | 0.841737 | 0.829214 | 0.915830 | 0.919659 | False
K-Nearest_neighbors | 0.809289 | 0.893948 | 0.849306 | 0.841593 | 0.907778 | 0.964316 | True
Gradient_boosting | 0.830302 | 0.896948 | 0.861811 | 0.856908 | 0.938260 | 0.940931 | False
Bagging | 0.877115 | 0.861729 | 0.865850 | 0.871053 | 0.939283 | 0.997220 | True
Neural_network | 0.817564 | 0.897895 | 0.854617 | 0.848039 | 0.927806 | 0.946898 | False

The confusion matrix, ROC curve and feature-importance plots for each model are embedded as images in the notebook output. K-Nearest_neighbors and Neural_network expose no coefficient or feature importance; the importances for the other three models are listed below.

Random_forest, top 10 most important features: marital-status_Married-civ-spouse (0.3072), relationship_Husband (0.1443), capital-gain (0.1067), hours-per-week (0.1008), age (0.0956), education (0.0619), marital-status_Never-married (0.0506), occupation_Prof-specialty (0.0296), relationship_Wife (0.0271), occupation_Exec-managerial (0.0227)

Random_forest, 10 least important features: occupation_Handlers-cleaners (0.0001), workclass_Federal-gov (0.0001), marital-status_Married-AF-spouse (0.0001), race_Amer-Indian-Eskimo (0.0000), occupation_Protective-serv (0.0000), marital-status_Married-spouse-absent (0.0000), race_Other (0.0000), workclass_Without-pay (0.0000), occupation_Armed-Forces (0.0000), occupation_Priv-house-serv (0.0000)

Gradient_boosting, top 10 most important features: marital-status_Married-civ-spouse (0.4506), capital-gain (0.1232), age (0.1118), hours-per-week (0.1063), education (0.0584), occupation_Prof-specialty (0.0387), capital-loss (0.0264), occupation_Exec-managerial (0.0255), occupation_Other-service (0.0122), relationship_Not-in-family (0.0083)

Gradient_boosting, 10 least important features: race_Other (0.0000), workclass_Without-pay (0.0000), occupation_Adm-clerical (0.0000), occupation_Armed-Forces (0.0000), occupation_Craft-repair (0.0000), occupation_Priv-house-serv (0.0000), marital-status_Divorced (0.0000), marital-status_Married-spouse-absent (0.0000), marital-status_Never-married (0.0000), marital-status_Widowed (0.0000)

Bagging, top 10 most important features: marital-status_Married-civ-spouse (0.2948), age (0.1905), hours-per-week (0.1229), education (0.0876), capital-gain (0.0789), capital-loss (0.0222), occupation_Prof-specialty (0.0198), occupation_Exec-managerial (0.0184), workclass_Private (0.0123), occupation_Sales (0.0106)

Bagging, 10 least important features: relationship_Unmarried (0.0016), relationship_Other-relative (0.0015), race_Amer-Indian-Eskimo (0.0015), marital-status_Widowed (0.0014), marital-status_Married-spouse-absent (0.0011), race_Other (0.0007), marital-status_Married-AF-spouse (0.0002), workclass_Without-pay (0.0002), occupation_Priv-house-serv (0.0001), occupation_Armed-Forces (0.0000)
Drop the 10 least predictive features for Random forest, Gradient boosting and Bagging, then retrain the models.
#changing the summary data to a numpy array for easier manipulation
np_summary_data = np.array(summary_data)
pd.set_option('display.max_colwidth', 2000)
model_ft_to_drop_df_all = pd.DataFrame(np_summary_data[:,[0,11]], columns=['Model name','Least predictive feat'])
# extract the most promising models: Random forest, Bagging, Gradient boosting, KNN, and Neural network
model_ft_to_drop_df = model_ft_to_drop_df_all.iloc[[4,6,7,9,10]]
# regular expression to extract the feature names
patterns = r':\s*([^:.<]+)<'
# extract the feature names as a list from the model_ft_to_drop_df
def extract_ft_names(model_ft_to_drop_df):
for index, row in model_ft_to_drop_df.iterrows():
if row['Least predictive feat'] != 'No coeficient or feature importance for this model':
# write the parsed feature names back into the dataframe (assigning to the row returned by iterrows is not guaranteed to persist)
model_ft_to_drop_df.at[index,'Least predictive feat'] = re.findall(patterns,row['Least predictive feat'])
extract_ft_names(model_ft_to_drop_df)
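As a quick illustration (a sketch on a made-up snippet in the same format as the summary strings), the pattern pulls out only the feature names and skips the coefficient values:
sample = "1. feature name: race_Other<br>coefficient: 0.0000<br>2. feature name: workclass_Without-pay<br>coefficient: 0.0000<br>"
re.findall(patterns, sample)
# ['race_Other', 'workclass_Without-pay']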
# Implement a feature selection function that takes the least predictive features for each model, drops them and retrains the model automatically.
def drop_least_useful_ft(model_name,feat_list):
X_train_copy_prep_drop_ft = X_train_copy_prep.drop(feat_list,axis=1)
X_train_copy_prep_drop_ft_path = Path('saved_preliminary_models/{0}/X_train_copy_prep_drop_ft_{0}.sav'.format(model_name))
try:
#check if the path exists
X_train_copy_prep_drop_ft_path.resolve(strict=True)
except FileNotFoundError:
joblib.dump(X_train_copy_prep_drop_ft,X_train_copy_prep_drop_ft_path)
model_dict_drop_ft = {model_name:model_dict[model_name]}
# retrain the model with the least predictive features dropped
model_evaluation_df(model_dict_drop_ft, X_train_copy_prep_drop_ft, y_train_copy_prep)
def drop_ft_retrain(model_ft_to_drop_df):
# drop the least predictive features
for indexes, row in model_ft_to_drop_df.iterrows():
if row['Least predictive feat'] != 'No coeficient or feature importance for this model':
#X_train_copy_prep.drop(row['Least predictive feat'],axis=1,inplace=True)
#print(row['Model name'],row['Least predictive feat'])
drop_least_useful_ft(row['Model name'],row['Least predictive feat'])
drop_ft_retrain(model_ft_to_drop_df)
*************** Random_forest ***************
Random_forest Classification Report precision recall f1-score support 0.0 0.89 0.75 0.81 24355 1.0 0.78 0.91 0.84 24355 accuracy 0.83 48710 macro avg 0.84 0.83 0.83 48710 weighted avg 0.84 0.83 0.83 48710 precision score Scores: [0.7853461 0.77720398 0.77970386 0.78558813 0.78519036 0.77661431 0.78773092 0.79137447 0.79143258 0.77766393] Mean of the scores: 0.7837848656437945 Standard deviation of the scores: 0.005368155520012824 recall score Scores: [0.87561576 0.86494253 0.886289 0.91297209 0.9228243 0.9137577 0.92813142 0.92689938 0.92566735 0.93511294] Mean of the scores: 0.9092212466661946 Standard deviation of the scores: 0.023337689926362273 f1 score Scores: [0.82802795 0.81872936 0.82958694 0.84450351 0.84846197 0.83962264 0.85218703 0.85379232 0.85330305 0.84915159] Mean of the scores: 0.8417366361971291 Standard deviation of the scores: 0.011685660638545033 accuracy score Scores: [0.81810716 0.80845822 0.81790187 0.83186204 0.83514679 0.82549784 0.83904742 0.84130569 0.84089509 0.83391501] Mean of the scores: 0.8292137138164648 Standard deviation of the scores: 0.010675208847656577 roc_auc score Scores: [0.90298618 0.8960157 0.90475364 0.91872005 0.92139536 0.91904703 0.92053017 0.92550677 0.92499014 0.92435912] Mean of the scores: 0.9158304167804628 Standard deviation of the scores: 0.010011574334813289
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.920
*************** Gradient_boosting ***************
Gradient_boosting Classification Report precision recall f1-score support 0.0 0.89 0.82 0.85 24355 1.0 0.83 0.90 0.86 24355 accuracy 0.86 48710 macro avg 0.86 0.86 0.86 48710 weighted avg 0.86 0.86 0.86 48710 precision score Scores: [0.83354193 0.81529699 0.8254354 0.82915718 0.83253939 0.82415177 0.83461963 0.83832778 0.83685577 0.83309144] Mean of the scores: 0.8303017263935084 Standard deviation of the scores: 0.006612571779948256 recall score Scores: [0.82019704 0.8226601 0.83661741 0.89655172 0.93267652 0.92772074 0.93264887 0.93059548 0.92689938 0.94291581] Mean of the scores: 0.8969483078935745 Standard deviation of the scores: 0.04763366247012269 f1 score Scores: [0.82681564 0.81896199 0.83098879 0.86153846 0.87976767 0.87287481 0.88091544 0.88205527 0.87957911 0.88460798] Mean of the scores: 0.861810515856692 Standard deviation of the scores: 0.024628973229388777 accuracy score Scores: [0.8281667 0.81810716 0.82980907 0.85588175 0.87251078 0.8649148 0.87394785 0.87559023 0.87312667 0.8770273 ] Mean of the scores: 0.8569082323958119 Standard deviation of the scores: 0.02163733233872181 roc_auc score Scores: [0.91419729 0.90615266 0.91584607 0.93978895 0.94812574 0.95045485 0.94915251 0.95297295 0.95222568 0.95368168] Mean of the scores: 0.9382598378868648 Standard deviation of the scores: 0.01768799138062214
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.941
*************** Bagging ***************
Bagging Classification Report precision recall f1-score support 0.0 0.86 0.88 0.87 24355 1.0 0.88 0.86 0.87 24355 accuracy 0.87 48710 macro avg 0.87 0.87 0.87 48710 weighted avg 0.87 0.87 0.87 48710 precision score Scores: [0.87336473 0.85942327 0.85894207 0.88398018 0.87984791 0.87089381 0.88305215 0.88746605 0.88417431 0.89000386] Mean of the scores: 0.8771148337925846 Standard deviation of the scores: 0.010530766085909948 recall score Scores: [0.68513957 0.68513957 0.6999179 0.87889984 0.9499179 0.93634497 0.94579055 0.93921971 0.94989733 0.94702259] Mean of the scores: 0.8617289932329231 Standard deviation of the scores: 0.11415338466639222 f1 score Scores: [0.7678859 0.76244861 0.77131871 0.88143269 0.91354126 0.9024342 0.91334523 0.91260974 0.91585825 0.91762833] Mean of the scores: 0.8658502898656746 Standard deviation of the scores: 0.06534887047971946 accuracy score Scores: [0.79285568 0.78649148 0.79244508 0.88174913 0.91008007 0.89878875 0.91028536 0.91008007 0.91274892 0.91500719] Mean of the scores: 0.8710531718332991 Standard deviation of the scores: 0.053464266478599407 roc_auc score Scores: [0.89113511 0.88191071 0.89104332 0.94283396 0.96490738 0.96198366 0.96354477 0.96662519 0.96301406 0.96583208] Mean of the scores: 0.9392830253251198 Standard deviation of the scores: 0.034242736074440616
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.997
We will focus on five models (Random forest, Bagging, Gradient boosting, KNN, and Neural network) since they yield the best results, and use precision as the tuning metric.
param_grid_rand_for = {
'n_estimators' : [100, 200, 300, 500, 800, 1200, 1500, 1800, 2000],
'max_features' : ['auto', 'sqrt'],
'max_depth' : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
'min_samples_split' : [2, 5, 10, 15, 100],
'min_samples_leaf' : [1, 2, 5, 10],
'bootstrap' : [True, False]
}
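A randomized search is used here rather than an exhaustive grid search because the grid above spans thousands of combinations, while RandomizedSearchCV with its default n_iter=10 samples only 10 of them. A quick count (a sketch based on the list sizes above):
from math import prod
# list sizes above: n_estimators, max_features, max_depth, min_samples_split, min_samples_leaf, bootstrap
prod([9, 2, 11, 5, 4, 2])   # 7920 candidate settings; the randomized search below tries only 10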
rand_forest_clf = model_dict['Random_forest']
randomize_search_rand_for = RandomizedSearchCV(estimator = rand_forest_clf, param_distributions = param_grid_rand_for, cv=5, n_jobs=-1, verbose=3, scoring='precision')
X_train_copy_prep_drop_ft = joblib.load('saved_preliminary_models/Random_forest/X_train_copy_prep_drop_ft_Random_forest.sav')
randomize_search_rand_for.fit(X_train_copy_prep_drop_ft, y_train_copy_prep)
Fitting 5 folds for each of 10 candidates, totalling 50 fits [CV 4/5] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=1200;, score=0.818 total time= 37.6s [CV 3/5] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=1200;, score=0.825 total time= 38.0s [CV 2/5] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=1200;, score=0.837 total time= 38.0s [CV 5/5] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=1200;, score=0.826 total time= 38.4s [CV 1/5] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=1200;, score=0.823 total time= 38.8s [CV 3/5] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=1200;, score=0.857 total time= 1.1min [CV 1/5] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=1200;, score=0.860 total time= 1.2min [CV 2/5] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=1200;, score=0.866 total time= 1.2min [CV 1/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.833 total time= 4.1s [CV 2/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.846 total time= 4.6s [CV 3/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.835 total time= 4.8s [CV 4/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.830 total time= 4.2s [CV 2/5] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=2000;, score=0.877 total time= 2.2min [CV 1/5] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=2000;, score=0.881 total time= 2.2min [CV 3/5] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=2000;, score=0.874 total time= 2.2min [CV 5/5] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.833 total time= 4.3s [CV 4/5] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=2000;, score=0.873 total time= 2.2min [CV 5/5] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=2000;, score=0.875 total time= 2.2min [CV 4/5] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=1200;, score=0.849 total time= 1.2min [CV 5/5] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=1200;, score=0.858 total time= 1.2min [CV 3/5] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=1800;, score=0.836 total time= 1.1min [CV 2/5] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=1800;, score=0.848 total time= 1.1min [CV 1/5] END bootstrap=True, max_depth=90, max_features=sqrt, 
min_samples_leaf=5, min_samples_split=2, n_estimators=1800;, score=0.832 total time= 1.2min [CV 4/5] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=1800;, score=0.831 total time= 1.1min [CV 5/5] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=1800;, score=0.835 total time= 1.1min [CV 1/5] END bootstrap=True, max_depth=70, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=1800;, score=0.832 total time= 1.2min [CV 2/5] END bootstrap=True, max_depth=70, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=1800;, score=0.848 total time= 1.2min [CV 3/5] END bootstrap=True, max_depth=70, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=1800;, score=0.836 total time= 1.1min [CV 4/5] END bootstrap=True, max_depth=70, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=1800;, score=0.831 total time= 1.1min [CV 5/5] END bootstrap=True, max_depth=70, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=1800;, score=0.835 total time= 1.2min [CV 1/5] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=300;, score=0.881 total time= 20.2s [CV 1/5] END bootstrap=True, max_depth=80, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=2000;, score=0.833 total time= 1.3min [CV 2/5] END bootstrap=True, max_depth=80, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=2000;, score=0.849 total time= 1.3min [CV 2/5] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=300;, score=0.877 total time= 20.6s [CV 3/5] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=300;, score=0.873 total time= 19.7s [CV 4/5] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=300;, score=0.874 total time= 19.1s [CV 5/5] END bootstrap=False, max_depth=80, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=300;, score=0.876 total time= 18.9s [CV 3/5] END bootstrap=True, max_depth=80, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=2000;, score=0.836 total time= 1.2min [CV 4/5] END bootstrap=True, max_depth=80, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=2000;, score=0.831 total time= 1.2min [CV 5/5] END bootstrap=True, max_depth=80, max_features=auto, min_samples_leaf=5, min_samples_split=5, n_estimators=2000;, score=0.834 total time= 1.2min [CV 1/5] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=1200;, score=0.793 total time= 39.6s [CV 2/5] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=1200;, score=0.804 total time= 37.1s [CV 3/5] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=1200;, score=0.800 total time= 36.7s [CV 4/5] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=1200;, score=0.792 total time= 36.3s [CV 5/5] END bootstrap=False, max_depth=10, max_features=auto, min_samples_leaf=10, min_samples_split=2, n_estimators=1200;, score=0.798 total time= 35.3s [CV 1/5] END bootstrap=True, max_depth=40, max_features=sqrt, 
min_samples_leaf=10, min_samples_split=2, n_estimators=800;, score=0.824 total time= 24.1s [CV 2/5] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=10, min_samples_split=2, n_estimators=800;, score=0.837 total time= 22.8s [CV 4/5] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=10, min_samples_split=2, n_estimators=800;, score=0.819 total time= 19.0s [CV 3/5] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=10, min_samples_split=2, n_estimators=800;, score=0.826 total time= 19.7s [CV 5/5] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=10, min_samples_split=2, n_estimators=800;, score=0.826 total time= 18.0s
RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1, param_distributions={'bootstrap': [True, False], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 5, 10], 'min_samples_split': [2, 5, 10, 15, 100], 'n_estimators': [100, 200, 300, 500, 800, 1200, 1500, 1800, 2000]}, scoring='precision', verbose=3)
best_rand_for_clf = randomize_search_rand_for.best_estimator_
best_rand_for_clf.fit(X_train_copy_prep_drop_ft, y_train_copy_prep)
precision_score_rand_for = np.mean(cross_val_score(best_rand_for_clf,X_train_copy_prep_drop_ft,y_train_copy_prep,scoring='precision',cv=10,n_jobs=-1,verbose=3))
[10-fold cross-validation with 8 parallel workers; per-fold precision 0.872–0.891; finished in 29.7 s]
precision_score_rand_for
0.8787312719483935
roc_curve_fuc(best_rand_for_clf, 'Random_forest', X_train_copy_prep_drop_ft, y_train_copy_prep)
Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.920
(0.9196586790350982, '<img src="saved_preliminary_models/Random_forest/Random_forest_ROC_curve.jpg">')
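The `roc_curve_fuc` helper is defined earlier in the notebook; for reference, below is a minimal sketch of a comparable helper, not the original implementation. It assumes the model exposes `predict_proba`, and the output path and return format only approximate the pattern of the saved artifacts above.
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

def roc_curve_sketch(model, model_name, X, y, out_dir='saved_preliminary_models'):
    """Hypothetical approximation of roc_curve_fuc: plot and save the ROC curve
    for the positive class (income > 50K) and return its AUC plus an <img> tag."""
    # probability of the positive class
    y_scores = model.predict_proba(X)[:, 1]
    fpr, tpr, _ = roc_curve(y, y_scores)
    auc = roc_auc_score(y, y_scores)

    plt.figure()
    plt.plot(fpr, tpr, label=f'{model_name} (AUC={auc:.3f})')
    plt.plot([0, 1], [0, 1], linestyle='--', label='chance')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend()

    img_path = Path(out_dir) / model_name / f'{model_name}_ROC_curve.jpg'
    img_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(img_path)
    return auc, f'<img src="{img_path}">'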
randomize_search.best_params_
{'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 25, 'bootstrap': False}
param_grid_nn = {
'hidden_layer_sizes': [(1,),(50,),(10,30,10),(20,),(50,50,50), (50,100,50), (100,),(500, 400, 300, 200, 100), (400, 400, 400, 400, 400)],
'activation': ['identity', 'logistic', 'tanh', 'relu'],
'solver': ['lbfgs','sgd', 'adam'],
'alpha': [0.005,0.0005, 0.05],
'learning_rate': ['constant','adaptive'],
}
Neural_network_clf = model_dict['Neural_network']
randomize_search_nn = RandomizedSearchCV(estimator = Neural_network_clf, param_distributions = param_grid_nn, cv=5, n_jobs=-1, verbose=3, scoring='precision')
randomize_search_nn.fit(X_train_copy_prep, y_train_copy_prep)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV log condensed: per-fold precision ranged from ≈0.76 (identity activation, sgd solver, (50, 100, 50) layers) to ≈0.82 (tanh or relu activation with the adam solver); the (400, 400, 400, 400, 400) lbfgs candidate needed ~40 min per fold for ≈0.79–0.80 precision]
RandomizedSearchCV(cv=5, estimator=MLPClassifier(max_iter=1000, random_state=42), n_jobs=-1, param_distributions={'activation': ['identity', 'logistic', 'tanh', 'relu'], 'alpha': [0.005, 0.0005, 0.05], 'hidden_layer_sizes': [(1,), (50,), (10, 30, 10), (20,), (50, 50, 50), (50, 100, 50), (100,), (500, 400, 300, 200, 100), (400, 400, 400, 400, 400)], 'learning_rate': ['constant', 'adaptive'], 'solver': ['lbfgs', 'sgd', 'adam']}, scoring='precision', verbose=3)
best_nn_clf = randomize_search_nn.best_estimator_
best_nn_clf.fit(X_train_copy_prep, y_train_copy_prep)
precision_score_nn = np.mean(cross_val_score(best_nn_clf,X_train_copy_prep,y_train_copy_prep,scoring='precision',cv=10,n_jobs=-1,verbose=3))
precision_score_nn
[10-fold cross-validation with 8 parallel workers; per-fold precision 0.794–0.837; finished in 2.0 min]
0.817835238910804
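To see which neural-network candidate won and how the other candidates compared, the fitted search object can be inspected directly; a short sketch (the same pattern applies to any of the RandomizedSearchCV objects in this section):
import pandas as pd

# best hyperparameters and the mean CV precision that selected them
print(randomize_search_nn.best_params_)
print(randomize_search_nn.best_score_)

# rank all sampled candidates by mean test precision
cv_results = pd.DataFrame(randomize_search_nn.cv_results_)
print(cv_results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
      .sort_values('rank_test_score'))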
param_grid_knn = {
'n_neighbors' : [5,7,9,11,13,15],
'weights' : ['uniform','distance'],
'metric' : ['minkowski','euclidean','manhattan']
}
knn_clf = model_dict['K-Nearest_neighbors']
randomize_search_knn = RandomizedSearchCV(estimator = knn_clf, param_distributions = param_grid_knn, cv=5, n_jobs=-1, verbose=3, scoring='precision')
randomize_search_knn.fit(X_train_copy_prep, y_train_copy_prep)
best_knn_clf = randomize_search_knn.best_estimator_
best_knn_clf.fit(X_train_copy_prep, y_train_copy_prep)
precision_score_knn = np.mean(cross_val_score(best_knn_clf,X_train_copy_prep,y_train_copy_prep,scoring='precision',cv=10,n_jobs=-1,verbose=3))
precision_score_knn
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV log condensed: per-fold precision ranged from ≈0.79 (manhattan, n_neighbors=13, uniform weights) to ≈0.84 (manhattan, n_neighbors=13–15, distance weights); fits took between ~11 s and ~4 min]
[10-fold cross-validation with 8 parallel workers; per-fold precision 0.823–0.844; finished in 1.5 min]
0.8375594861235637
param_grid_gb = {
"n_estimators":[5,50,250,500],
"max_depth":[1,3,5,7,9],
"learning_rate":[0.01,0.1,1,10,100]
}
gb_clf = model_dict['Gradient_boosting']
randomize_search_gb = RandomizedSearchCV(estimator = gb_clf, param_distributions = param_grid_gb, cv=5, n_jobs=-1, verbose=3, scoring='precision')
X_train_copy_prep_drop_ft = joblib.load('saved_preliminary_models/Gradient_boosting/X_train_copy_prep_drop_ft_Gradient_boosting.sav')
randomize_search_gb.fit(X_train_copy_prep_drop_ft, y_train_copy_prep)
best_gb_clf = randomize_search_gb.best_estimator_
best_gb_clf.fit(X_train_copy_prep_drop_ft, y_train_copy_prep)
precision_score_gb = np.mean(cross_val_score(best_gb_clf,X_train_copy_prep_drop_ft,y_train_copy_prep,scoring='precision',cv=10,n_jobs=-1,verbose=3))
precision_score_gb
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV log condensed: per-fold precision ranged from ≈0.30–0.51 (learning_rate=100) to ≈0.89–0.91 (learning_rate=0.1, max_depth=9, n_estimators=500); fits took between ~0.4 s and ~1.1 min]
[10-fold cross-validation with 8 parallel workers; per-fold precision 0.888–0.912; finished in 2.3 min]
0.902628326566183
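The tuned gradient-boosting model can be persisted alongside the training matrix that was loaded above; a minimal sketch using joblib (the exact file name is an assumption, chosen to match the naming pattern of the other saved artifacts):
import joblib

# persist the tuned gradient-boosting classifier next to its training matrix
joblib.dump(best_gb_clf, 'saved_preliminary_models/Gradient_boosting/best_gb_clf.sav')

# reload it later without re-running the search
best_gb_clf = joblib.load('saved_preliminary_models/Gradient_boosting/best_gb_clf.sav')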
param_grid_bag = {
'bootstrap': [True, False],
'bootstrap_features': [True, False],
'n_estimators': [5, 10, 15],
'max_samples' : [0.6, 0.8, 1.0],
'max_features' : [0.6, 0.8, 1.0]
}
bag_clf = model_dict['Bagging']
randomize_search_bag = RandomizedSearchCV(estimator = bag_clf, param_distributions = param_grid_bag, cv=5, n_jobs=-1, verbose=3, scoring='precision')
X_train_copy_prep_drop_ft = joblib.load('saved_preliminary_models/Bagging/X_train_copy_prep_drop_ft_Bagging.sav')
randomize_search_bag.fit(X_train_copy_prep_drop_ft, y_train_copy_prep)
best_bag_clf = randomize_search_bag.best_estimator_
best_bag_clf.fit(X_train_copy_prep_drop_ft, y_train_copy_prep)
precision_score_bag = np.mean(cross_val_score(best_bag_clf,X_train_copy_prep_drop_ft,y_train_copy_prep,scoring='precision',cv=10,n_jobs=-1,verbose=3))
precision_score_bag
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV log condensed: per-fold precision ranged from ≈0.84 (bootstrap=False, max_features=1.0, max_samples=1.0, n_estimators=5) to ≈0.89 (bootstrap=False, max_features=0.6, max_samples=0.8, n_estimators=15); each fit took ~0.5–2 s]
[10-fold cross-validation with 8 parallel workers; per-fold precision 0.862–0.884; finished in 3.6 s]
0.8777322951435707
roc_curve_fuc(best_bag_clf, 'Bagging', X_train_copy_prep_drop_ft, y_train_copy_prep)
ROC curve Income lower than 50k: ROC AUC=0.500 Income higher than 50k: ROC AUC=0.833
(0.8334865911184245, '<img src="saved_preliminary_models/Bagging/Bagging_ROC_curve.jpg">')
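Before settling on a final model, the mean 10-fold precision scores collected above can be gathered into a single view; a small sketch using the variables already defined in this section:
import pandas as pd

# mean 10-fold CV precision of each tuned model, highest first
precision_summary = pd.Series({
    'Random_forest': precision_score_rand_for,
    'Neural_network': precision_score_nn,
    'K-Nearest_neighbors': precision_score_knn,
    'Gradient_boosting': precision_score_gb,
    'Bagging': precision_score_bag,
}).sort_values(ascending=False)
print(precision_summary)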
We will use the random forest with the best parameters, keeping precision as the evaluation metric.
# best_parameter_rand_forest = {
# 'n_estimators': 500,
# 'min_samples_split': 10,
# 'min_samples_leaf': 1,
# 'max_features': 'sqrt',
# 'max_depth': 25,
# 'bootstrap': False}
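If the search object is no longer in memory, the tuned random forest can be rebuilt from the commented-out parameter dictionary above; a minimal sketch, assuming the same random_state=42 used elsewhere in the notebook:
from sklearn.ensemble import RandomForestClassifier

best_parameter_rand_forest = {
    'n_estimators': 500,
    'min_samples_split': 10,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'max_depth': 25,
    'bootstrap': False,
}

# rebuild the tuned model and refit it on the reduced training matrix
best_rand_for_clf = RandomForestClassifier(**best_parameter_rand_forest, random_state=42)
best_rand_for_clf.fit(X_train_copy_prep_drop_ft, y_train_copy_prep)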
X_test_copy_prep
X_train_copy_prep_drop_ft
education | GDP Group | age | capital-gain | capital-loss | hours-per-week | race_Asian-Pac-Islander | race_Black | race_White | gender_Female | gender_Male | workclass_Local-gov | workclass_Private | workclass_Self-emp-inc | workclass_Self-emp-not-inc | workclass_State-gov | occupation_Adm-clerical | occupation_Craft-repair | occupation_Exec-managerial | occupation_Farming-fishing | occupation_Machine-op-inspct | occupation_Other-service | occupation_Prof-specialty | occupation_Sales | occupation_Tech-support | occupation_Transport-moving | marital-status_Divorced | marital-status_Married-civ-spouse | marital-status_Never-married | marital-status_Separated | marital-status_Widowed | relationship_Husband | relationship_Not-in-family | relationship_Other-relative | relationship_Own-child | relationship_Unmarried | relationship_Wife | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 11.000 | 0.000 | 0.691 | 0.339 | 0.000 | 0.398 | 0.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 |
1 | 15.000 | 0.000 | 0.480 | 0.000 | 0.000 | 0.378 | 0.000 | 0.000 | 1.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 |
2 | 11.000 | 0.000 | 0.238 | 0.000 | 0.000 | 0.296 | 0.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 |
3 | 11.000 | 0.000 | 0.260 | 0.000 | 0.764 | 0.398 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
4 | 15.000 | 1.000 | 0.166 | 0.000 | 0.000 | 0.398 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
48859 | 14.000 | 0.000 | 0.799 | 0.000 | 0.844 | 0.287 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
48860 | 15.000 | 0.000 | 0.614 | 0.000 | 0.775 | 0.500 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
48861 | 7.000 | 0.000 | 0.642 | 0.000 | 0.000 | 0.420 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
48862 | 15.000 | 0.000 | 0.496 | 0.000 | 0.000 | 0.398 | 0.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 |
48863 | 11.000 | 0.000 | 0.523 | 0.000 | 0.000 | 0.398 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
48864 rows × 37 columns
# get the columns of X_test_copy_prep that are not present in X_train_copy_prep_drop_ft
col_dropped = []
for col in X_test_copy_prep.columns:
if col not in X_train_copy_prep_drop_ft.columns:
col_dropped.append(col)
col_dropped
['race_Amer-Indian-Eskimo', 'race_Other', 'workclass_Federal-gov', 'workclass_Without-pay', 'occupation_Armed-Forces', 'occupation_Handlers-cleaners', 'occupation_Priv-house-serv', 'occupation_Protective-serv', 'marital-status_Married-AF-spouse', 'marital-status_Married-spouse-absent']
X_test_copy_prep_drop_ft = X_test_copy_prep.drop(col_dropped, axis=1)
X_test_copy_prep_drop_ft
education | GDP Group | age | capital-gain | capital-loss | hours-per-week | race_Asian-Pac-Islander | race_Black | race_White | gender_Female | gender_Male | workclass_Local-gov | workclass_Private | workclass_Self-emp-inc | workclass_Self-emp-not-inc | workclass_State-gov | occupation_Adm-clerical | occupation_Craft-repair | occupation_Exec-managerial | occupation_Farming-fishing | occupation_Machine-op-inspct | occupation_Other-service | occupation_Prof-specialty | occupation_Sales | occupation_Tech-support | occupation_Transport-moving | marital-status_Divorced | marital-status_Married-civ-spouse | marital-status_Never-married | marital-status_Separated | marital-status_Widowed | relationship_Husband | relationship_Not-in-family | relationship_Other-relative | relationship_Own-child | relationship_Unmarried | relationship_Wife | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 11.000 | 0.000 | 0.570 | 0.000 | 0.000 | 0.480 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
1 | 9.000 | 0.000 | 0.210 | 0.000 | 0.000 | 0.500 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 |
2 | 8.000 | 0.000 | 0.434 | 0.000 | 0.000 | 0.398 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 |
3 | 9.000 | 0.000 | 0.676 | 0.000 | 0.000 | 0.500 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
4 | 0.000 | 0.000 | 0.785 | 0.000 | 0.000 | 0.398 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
12017 | 9.000 | 0.755 | 0.451 | 0.000 | 0.000 | 0.454 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
12018 | 10.000 | 0.000 | 0.326 | 0.339 | 0.000 | 0.732 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
12019 | 15.000 | 0.000 | 0.678 | 0.000 | 0.000 | 0.398 | 0.000 | 0.000 | 0.480 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 |
12020 | 15.000 | 0.000 | 0.641 | 0.000 | 0.000 | 0.317 | 0.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.733 | 0.000 | 0.000 | 0.267 |
12021 | 15.000 | 0.000 | 0.379 | 0.000 | 0.000 | 0.503 | 0.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
12022 rows × 37 columns
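The column drop above works, but an equivalent and slightly more robust way to align the test matrix with the training matrix is to reindex on the training columns, which also guards against columns present in training but missing from the test set; a short sketch:
# align the test set to the exact training columns;
# any column missing from the test set is added and filled with 0
X_test_copy_prep_drop_ft = X_test_copy_prep.reindex(
    columns=X_train_copy_prep_drop_ft.columns, fill_value=0)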
final_predictions = best_rand_for_clf.predict(X_test_copy_prep_drop_ft)
final_predictions
array([0., 0., 0., ..., 1., 1., 1.])
final_predictions.shape
(12022,)
y_test_copy_prep.shape
(12022,)
# fraction of correct predictions on the held-out test set (accuracy, not precision)
n_correct = sum(final_predictions == y_test_copy_prep)
print(n_correct / len(final_predictions))
0.8428714024288804
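Since the project goal is a precision above 80%, the final check should also report precision (and the full per-class breakdown) on the held-out test set; a short sketch using classification_report, which is already imported at the top of the notebook, plus precision_score as an additional import:
from sklearn.metrics import precision_score, classification_report

# precision of the >50K class on the held-out test set
print(precision_score(y_test_copy_prep, final_predictions))

# per-class precision, recall and F1
print(classification_report(y_test_copy_prep, final_predictions))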