# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from scipy import stats
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from mlxtend.plotting import plot_confusion_matrix
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils import shuffle
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score , classification_report,ConfusionMatrixDisplay,precision_score,recall_score, f1_score,roc_auc_score,roc_curve, balanced_accuracy_score
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
print('done importing')
done importing
dm_raw = pd.read_csv("/kaggle/input/health-dataset/diabetes_data.csv")
dm_raw.head()
Age | Sex | HighChol | CholCheck | BMI | Smoker | HeartDiseaseorAttack | PhysActivity | Fruits | Veggies | HvyAlcoholConsump | GenHlth | MentHlth | PhysHlth | DiffWalk | Stroke | HighBP | Diabetes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4.0 | 1.0 | 0.0 | 1.0 | 26.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 3.0 | 5.0 | 30.0 | 0.0 | 0.0 | 1.0 | 0.0 |
1 | 12.0 | 1.0 | 1.0 | 1.0 | 26.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 |
2 | 13.0 | 1.0 | 0.0 | 1.0 | 26.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 11.0 | 1.0 | 1.0 | 1.0 | 28.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 3.0 | 0.0 | 3.0 | 0.0 | 0.0 | 1.0 | 0.0 |
4 | 8.0 | 0.0 | 0.0 | 1.0 | 29.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Describe columns
Age: 13-level age category (_AGEG5YR see codebook)
1 = 18-24 / 2 = 25-29 / 3 = 30-34 / 4 = 35-39 / 5 = 40-44 / 6 = 45-49 / 7 = 50-54 / 8 = 55-59 / 9 = 60-64 / 10 = 65-69 / 11 = 70-74 / 12 = 75-79 / 13 = 80 or older
Sex: patient's gender (1: male; 0: female)
HighChol: 0 = no high cholesterol 1 = high cholesterol
CholCheck: 0 = no cholesterol check in 5 years 1 = yes cholesterol check in 5 years
BMI: Body Mass Index
Smoker: Have you smoked at least 100 cigarettes in your entire life? [Note: 5 packs = 100 cigarettes] 0 = no 1 = yes
HeartDiseaseorAttack: coronary heart disease (CHD) or myocardial infarction (MI) 0 = no 1 = yes
PhysActivity: physical activity in past 30 days - not including job 0 = no 1 = yes
Fruits: Consume Fruit 1 or more times per day 0 = no 1 = yes
Veggies: Consume Vegetables 1 or more times per day 0 = no 1 = yes
HvyAlcoholConsump: (adult men >=14 drinks per week and adult women>=7 drinks per week) 0 = no 1 = yes
GenHlth: Would you say that in general your health is: scale 1-5 1 = excellent 2 = very good 3 = good 4 = fair 5 = poor
MentHlth: days of poor mental health scale 1-30 days
PhysHlth: physical illness or injury days in past 30 days scale 1-30
DiffWalk: Do you have serious difficulty walking or climbing stairs? 0 = no 1 = yes
Stroke: you ever had a stroke. 0 = no, 1 = yes
HighBP: 0 = no high, BP 1 = high BP
Diabetes: 0 = no diabetes, 1 = diabetes
#get column names
dm_raw.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 70692 entries, 0 to 70691 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 70692 non-null float64 1 Sex 70692 non-null float64 2 HighChol 70692 non-null float64 3 CholCheck 70692 non-null float64 4 BMI 70692 non-null float64 5 Smoker 70692 non-null float64 6 HeartDiseaseorAttack 70692 non-null float64 7 PhysActivity 70692 non-null float64 8 Fruits 70692 non-null float64 9 Veggies 70692 non-null float64 10 HvyAlcoholConsump 70692 non-null float64 11 GenHlth 70692 non-null float64 12 MentHlth 70692 non-null float64 13 PhysHlth 70692 non-null float64 14 DiffWalk 70692 non-null float64 15 Stroke 70692 non-null float64 16 HighBP 70692 non-null float64 17 Diabetes 70692 non-null float64 dtypes: float64(18) memory usage: 9.7 MB
#No null variable
#select variables that are medically likely to predict diabetes
dm = dm_raw[["Age","Sex","HighChol","BMI","Smoker","PhysActivity","PhysHlth","Fruits","Veggies","HvyAlcoholConsump","Stroke","HighBP","Diabetes"]]
dm.head()
Age | Sex | HighChol | BMI | Smoker | PhysActivity | PhysHlth | Fruits | Veggies | HvyAlcoholConsump | Stroke | HighBP | Diabetes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4.0 | 1.0 | 0.0 | 26.0 | 0.0 | 1.0 | 30.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
1 | 12.0 | 1.0 | 1.0 | 26.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 |
2 | 13.0 | 1.0 | 0.0 | 26.0 | 0.0 | 1.0 | 10.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 11.0 | 1.0 | 1.0 | 28.0 | 1.0 | 1.0 | 3.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
4 | 8.0 | 0.0 | 0.0 | 29.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
dm.shape
(70692, 13)
#check unique values
unique_values = {}
for col in dm.columns:
unique_values[col] = dm[col].value_counts().shape[0]
pd.DataFrame(unique_values, index=['unique value count']).transpose()
unique value count | |
---|---|
Age | 13 |
Sex | 2 |
HighChol | 2 |
BMI | 80 |
Smoker | 2 |
PhysActivity | 2 |
PhysHlth | 31 |
Fruits | 2 |
Veggies | 2 |
HvyAlcoholConsump | 2 |
Stroke | 2 |
HighBP | 2 |
Diabetes | 2 |
#check frequency of all values in the column
# All data columns except for color
feature_cols = [x for x in dm.columns if x not in 'stroke']
plt.figure(figsize=(25,35))
# loop for subplots
for i in range(len(feature_cols)):
plt.subplot(8,5,i+1)
plt.title(feature_cols[i])
plt.xticks(rotation=90)
plt.hist(dm[feature_cols[i]],color = "deepskyblue")
plt.tight_layout()
#we should drop the columns with very small categories- (HvyAlcoholConsump and stroke)
dm.drop(['HvyAlcoholConsump','Stroke'], axis=1, inplace=True)
#check correlation of other columns with diabetes column
dm.drop('Diabetes', axis=1).corrwith(dm.Diabetes).plot(kind='bar', grid=True, figsize=(10, 6), title="Correlation with Diabetes",color="deepskyblue");
#variables with correlation less than 0.1 are Sex, Smoker, Fruits, Veggies
# Correlation between any two features
# check for possible co-variates
sns.set(rc = {'figure.figsize':(10,10)})
sns.heatmap(dm.corr(),vmin=-1, vmax=1, annot = True, fmt='.1g',cmap= 'coolwarm')
<AxesSubplot:>
#drop the variables with low correlations Sex, Smoker, Fruits, Veggies
dm.drop(['Sex','Smoker','Fruits','Veggies'], axis=1, inplace=True)
dm.head()
Age | HighChol | BMI | PhysActivity | PhysHlth | HighBP | Diabetes | |
---|---|---|---|---|---|---|---|
0 | 4.0 | 0.0 | 26.0 | 1.0 | 30.0 | 1.0 | 0.0 |
1 | 12.0 | 1.0 | 26.0 | 0.0 | 0.0 | 1.0 | 0.0 |
2 | 13.0 | 0.0 | 26.0 | 1.0 | 10.0 | 0.0 | 0.0 |
3 | 11.0 | 1.0 | 28.0 | 1.0 | 3.0 | 1.0 | 0.0 |
4 | 8.0 | 0.0 | 29.0 | 1.0 | 0.0 | 0.0 | 0.0 |
#narrowed down to 6 possible determinants
#determine which predictors are more useful
# Bivariate bar plot for categorical variables
features = [x for x in dm.columns if x not in ['Age','BMI','PhysHlth','Diabetes']]
plt.figure(figsize = (30,23))
plt.suptitle('Diabetes by categorical features')
#subplots
for i in enumerate(features):
plt.subplot(2,4, i[0]+1)
x = sns.countplot(data=dm, x=i[1], hue='Diabetes', palette = ['deepskyblue','crimson'])
for z in x.patches:
x.annotate('{:.1f}'.format((z.get_height()/dm.shape[0])*100)+'%',(z.get_x()+0.25, z.get_height()+0.01))
#for numeric variables
plt.figure(figsize=(12,5))
sns.displot(x='BMI', col='Diabetes' , data = dm, kind="kde" ,color = 'deepskyblue')
<seaborn.axisgrid.FacetGrid at 0x7f1d6d38a090>
<Figure size 1200x500 with 0 Axes>
plt.figure(figsize=(12,20))
sns.displot(data=dm,col='Diabetes',x='Age',color='deepskyblue')
<seaborn.axisgrid.FacetGrid at 0x7f1d52a7e390>
<Figure size 1200x2000 with 0 Axes>
#Check skewness
#can only be checked for numeric data
dm_skew = dm[['Age','BMI','PhysHlth']]
skew = pd.DataFrame(dm_skew.skew())
skew.columns = ['skew']
skew['too_skewed'] = skew['skew'] > .75
skew
skew | too_skewed | |
---|---|---|
Age | -0.545923 | False |
BMI | 1.719180 | True |
PhysHlth | 1.657304 | True |
#BMI and PhysHlth are skewed. It needs to be transformed
#Scaling the data for features selection using the MinMaxScaler method.
#only numeric variables apply here
mms = MinMaxScaler()
dm[['BMI']] = mms.fit_transform(dm[['BMI']])
dm[['Age']] = mms.fit_transform(dm[['Age']])
dm[['PhysHlth']] = mms.fit_transform(dm[['PhysHlth']])
dm.head()
Age | HighChol | BMI | PhysActivity | PhysHlth | HighBP | Diabetes | |
---|---|---|---|---|---|---|---|
0 | 0.250000 | 0.0 | 0.162791 | 1.0 | 1.000000 | 1.0 | 0.0 |
1 | 0.916667 | 1.0 | 0.162791 | 0.0 | 0.000000 | 1.0 | 0.0 |
2 | 1.000000 | 0.0 | 0.162791 | 1.0 | 0.333333 | 0.0 | 0.0 |
3 | 0.833333 | 1.0 | 0.186047 | 1.0 | 0.100000 | 1.0 | 0.0 |
4 | 0.583333 | 0.0 | 0.197674 | 1.0 | 0.000000 | 0.0 | 0.0 |
#Features selection -step 1
#1. Define X,y
y = (dm['Diabetes']).astype(int)
X = dm.loc[:, dm.columns != 'Diabetes'] # everything except "Diabetes"
#step 2
model = ExtraTreesClassifier()
model.fit(X,y)
print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
plt.figure(figsize=(8,6))
feat_importances.nlargest(6).plot(kind='barh')
plt.show()
[0.15190905 0.11040928 0.34254753 0.0242652 0.14311465 0.22775429]
#method 2
#apply SelectKBest class to extract top 5 best features #Do this before quantile transformation
bestfeatures = SelectKBest(score_func=chi2, k=5)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score'] #naming the dataframe columns
print(featureScores.nlargest(6,'Score')) #print 5 best features
Specs Score 5 HighBP 4491.799960 1 HighChol 2804.501278 4 PhysHlth 1864.301775 3 PhysActivity 528.494034 0 Age 490.930094 2 BMI 200.502204
#Method 3
#Create a logistic regression classifier
lr = LogisticRegression()
# Create an EFS object
efs = EFS(estimator=lr, # Use logistic regression as the classifier/estimator
min_features=1, # The minimum number of features to consider is 1
max_features=5, # The maximum number of features to consider is 5
scoring='accuracy', # The metric to use to evaluate the classifier is accuracy
cv=4) # The number of cross-validations to perform is 4
# Train EFS with our dataset
efs = efs.fit(X, y)
# Print the results
print('Best accuracy score: %.2f' % efs.best_score_) # best_score_ shows the best score
print('Best subset (indices):', efs.best_idx_) # best_idx_ shows the index of features that yield the best score
print('Best subset (corresponding names):', efs.best_feature_names_) # best_feature_names_ shows the feature names
Features: 62/62
Best accuracy score: 0.72 Best subset (indices): (0, 1, 2, 4, 5) Best subset (corresponding names): ('Age', 'HighChol', 'BMI', 'PhysHlth', 'HighBP')
#recheck the skew
dm_skew = dm[['Age','BMI','PhysHlth']]
skew = pd.DataFrame(dm_skew.skew())
skew.columns = ['skew']
skew['too_skewed'] = skew['skew'] > .75
skew
skew | too_skewed | |
---|---|---|
Age | -0.545923 | False |
BMI | 1.719180 | True |
PhysHlth | 1.657304 | True |
#use quantile tranformation
qt = QuantileTransformer(n_quantiles=500, output_distribution='normal')
dm[['BMI']] = qt.fit_transform(dm[['BMI']])
dm[['PhysHlth']] = qt.fit_transform(dm[['PhysHlth']])
#recheck the skew
dm_skew = dm[['Age','BMI','PhysHlth']]
skew = pd.DataFrame(dm_skew.skew())
skew.columns = ['skew']
skew['too_skewed'] = skew['skew'] > .75
skew
skew | too_skewed | |
---|---|---|
Age | -0.545923 | False |
BMI | 0.016868 | False |
PhysHlth | 0.693496 | False |
dm.head()
Age | HighChol | BMI | PhysActivity | PhysHlth | HighBP | Diabetes | |
---|---|---|---|---|---|---|---|
0 | 0.250000 | 0.0 | -0.505473 | 1.0 | 5.199338 | 1.0 | 0.0 |
1 | 0.916667 | 1.0 | -0.505473 | 0.0 | -5.199338 | 1.0 | 0.0 |
2 | 1.000000 | 0.0 | -0.505473 | 1.0 | 0.822449 | 0.0 | 0.0 |
3 | 0.833333 | 1.0 | -0.093065 | 1.0 | 0.468708 | 1.0 | 0.0 |
4 | 0.583333 | 0.0 | 0.065349 | 1.0 | -5.199338 | 0.0 | 0.0 |
#Data splitting
y = (dm['Diabetes']).astype(int)
X = dm.loc[:, dm.columns != 'stroke'] # everything except "stroke"
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape
(49484, 7)
X_test.shape
(21208, 7)
#Predict with Decision tree, KNN and Extra Tree
from sklearn.model_selection import GridSearchCV
# defining parameter range
param_grid = {'n_neighbors': [1,3,5,7,9,11,13,15,17,19], #odd numbers because there are 2 classes in target coulmn
'weights': ['distance', 'uniform']}
gridKNN = GridSearchCV(KNeighborsClassifier(), param_grid, refit = True, verbose = 3)
# fitting the model for grid search
gridKNN.fit(X_train, y_train)
Fitting 5 folds for each of 20 candidates, totalling 100 fits [CV 1/5] END ...n_neighbors=1, weights=distance;, score=1.000 total time= 0.3s [CV 2/5] END ...n_neighbors=1, weights=distance;, score=1.000 total time= 0.3s [CV 3/5] END ...n_neighbors=1, weights=distance;, score=1.000 total time= 0.3s [CV 4/5] END ...n_neighbors=1, weights=distance;, score=1.000 total time= 0.3s [CV 5/5] END ...n_neighbors=1, weights=distance;, score=1.000 total time= 0.3s [CV 1/5] END ....n_neighbors=1, weights=uniform;, score=1.000 total time= 0.6s [CV 2/5] END ....n_neighbors=1, weights=uniform;, score=1.000 total time= 0.5s [CV 3/5] END ....n_neighbors=1, weights=uniform;, score=1.000 total time= 0.5s [CV 4/5] END ....n_neighbors=1, weights=uniform;, score=1.000 total time= 0.6s [CV 5/5] END ....n_neighbors=1, weights=uniform;, score=1.000 total time= 0.5s [CV 1/5] END ...n_neighbors=3, weights=distance;, score=1.000 total time= 0.3s [CV 2/5] END ...n_neighbors=3, weights=distance;, score=1.000 total time= 0.3s [CV 3/5] END ...n_neighbors=3, weights=distance;, score=1.000 total time= 0.3s [CV 4/5] END ...n_neighbors=3, weights=distance;, score=1.000 total time= 0.4s [CV 5/5] END ...n_neighbors=3, weights=distance;, score=1.000 total time= 0.3s [CV 1/5] END ....n_neighbors=3, weights=uniform;, score=1.000 total time= 0.6s [CV 2/5] END ....n_neighbors=3, weights=uniform;, score=1.000 total time= 0.6s [CV 3/5] END ....n_neighbors=3, weights=uniform;, score=1.000 total time= 0.6s [CV 4/5] END ....n_neighbors=3, weights=uniform;, score=1.000 total time= 0.6s [CV 5/5] END ....n_neighbors=3, weights=uniform;, score=1.000 total time= 0.6s [CV 1/5] END ...n_neighbors=5, weights=distance;, score=1.000 total time= 0.4s [CV 2/5] END ...n_neighbors=5, weights=distance;, score=1.000 total time= 0.3s [CV 3/5] END ...n_neighbors=5, weights=distance;, score=1.000 total time= 0.3s [CV 4/5] END ...n_neighbors=5, weights=distance;, score=1.000 total time= 0.4s [CV 5/5] END ...n_neighbors=5, weights=distance;, score=1.000 total time= 0.3s [CV 1/5] END ....n_neighbors=5, weights=uniform;, score=1.000 total time= 0.6s [CV 2/5] END ....n_neighbors=5, weights=uniform;, score=1.000 total time= 0.6s [CV 3/5] END ....n_neighbors=5, weights=uniform;, score=1.000 total time= 0.6s [CV 4/5] END ....n_neighbors=5, weights=uniform;, score=1.000 total time= 0.6s [CV 5/5] END ....n_neighbors=5, weights=uniform;, score=1.000 total time= 0.6s [CV 1/5] END ...n_neighbors=7, weights=distance;, score=1.000 total time= 0.4s [CV 2/5] END ...n_neighbors=7, weights=distance;, score=1.000 total time= 0.3s [CV 3/5] END ...n_neighbors=7, weights=distance;, score=1.000 total time= 0.3s [CV 4/5] END ...n_neighbors=7, weights=distance;, score=1.000 total time= 0.4s [CV 5/5] END ...n_neighbors=7, weights=distance;, score=1.000 total time= 0.4s [CV 1/5] END ....n_neighbors=7, weights=uniform;, score=1.000 total time= 0.6s [CV 2/5] END ....n_neighbors=7, weights=uniform;, score=1.000 total time= 0.6s [CV 3/5] END ....n_neighbors=7, weights=uniform;, score=1.000 total time= 0.6s [CV 4/5] END ....n_neighbors=7, weights=uniform;, score=1.000 total time= 0.6s [CV 5/5] END ....n_neighbors=7, weights=uniform;, score=1.000 total time= 0.6s [CV 1/5] END ...n_neighbors=9, weights=distance;, score=1.000 total time= 0.4s [CV 2/5] END ...n_neighbors=9, weights=distance;, score=1.000 total time= 0.4s [CV 3/5] END ...n_neighbors=9, weights=distance;, score=1.000 total time= 0.4s [CV 4/5] END ...n_neighbors=9, weights=distance;, score=1.000 total time= 0.4s [CV 5/5] END ...n_neighbors=9, weights=distance;, score=1.000 total time= 0.4s [CV 1/5] END ....n_neighbors=9, weights=uniform;, score=1.000 total time= 0.6s [CV 2/5] END ....n_neighbors=9, weights=uniform;, score=1.000 total time= 0.6s [CV 3/5] END ....n_neighbors=9, weights=uniform;, score=1.000 total time= 0.6s [CV 4/5] END ....n_neighbors=9, weights=uniform;, score=1.000 total time= 0.7s [CV 5/5] END ....n_neighbors=9, weights=uniform;, score=1.000 total time= 0.6s [CV 1/5] END ..n_neighbors=11, weights=distance;, score=1.000 total time= 0.4s [CV 2/5] END ..n_neighbors=11, weights=distance;, score=1.000 total time= 0.4s [CV 3/5] END ..n_neighbors=11, weights=distance;, score=1.000 total time= 0.4s [CV 4/5] END ..n_neighbors=11, weights=distance;, score=1.000 total time= 0.4s [CV 5/5] END ..n_neighbors=11, weights=distance;, score=1.000 total time= 0.4s [CV 1/5] END ...n_neighbors=11, weights=uniform;, score=1.000 total time= 0.7s [CV 2/5] END ...n_neighbors=11, weights=uniform;, score=1.000 total time= 0.6s [CV 3/5] END ...n_neighbors=11, weights=uniform;, score=1.000 total time= 0.6s [CV 4/5] END ...n_neighbors=11, weights=uniform;, score=1.000 total time= 0.7s [CV 5/5] END ...n_neighbors=11, weights=uniform;, score=1.000 total time= 0.7s [CV 1/5] END ..n_neighbors=13, weights=distance;, score=1.000 total time= 0.5s [CV 2/5] END ..n_neighbors=13, weights=distance;, score=1.000 total time= 0.5s [CV 3/5] END ..n_neighbors=13, weights=distance;, score=1.000 total time= 0.5s [CV 4/5] END ..n_neighbors=13, weights=distance;, score=1.000 total time= 0.5s [CV 5/5] END ..n_neighbors=13, weights=distance;, score=1.000 total time= 0.4s [CV 1/5] END ...n_neighbors=13, weights=uniform;, score=1.000 total time= 0.7s [CV 2/5] END ...n_neighbors=13, weights=uniform;, score=1.000 total time= 0.7s [CV 3/5] END ...n_neighbors=13, weights=uniform;, score=1.000 total time= 0.7s [CV 4/5] END ...n_neighbors=13, weights=uniform;, score=1.000 total time= 0.7s [CV 5/5] END ...n_neighbors=13, weights=uniform;, score=1.000 total time= 0.7s [CV 1/5] END ..n_neighbors=15, weights=distance;, score=1.000 total time= 0.5s [CV 2/5] END ..n_neighbors=15, weights=distance;, score=1.000 total time= 0.5s [CV 3/5] END ..n_neighbors=15, weights=distance;, score=1.000 total time= 0.4s [CV 4/5] END ..n_neighbors=15, weights=distance;, score=1.000 total time= 0.5s [CV 5/5] END ..n_neighbors=15, weights=distance;, score=1.000 total time= 0.4s [CV 1/5] END ...n_neighbors=15, weights=uniform;, score=1.000 total time= 0.7s [CV 2/5] END ...n_neighbors=15, weights=uniform;, score=1.000 total time= 0.7s [CV 3/5] END ...n_neighbors=15, weights=uniform;, score=1.000 total time= 0.7s [CV 4/5] END ...n_neighbors=15, weights=uniform;, score=1.000 total time= 0.7s [CV 5/5] END ...n_neighbors=15, weights=uniform;, score=1.000 total time= 0.7s [CV 1/5] END ..n_neighbors=17, weights=distance;, score=1.000 total time= 0.5s [CV 2/5] END ..n_neighbors=17, weights=distance;, score=1.000 total time= 0.5s [CV 3/5] END ..n_neighbors=17, weights=distance;, score=1.000 total time= 0.5s [CV 4/5] END ..n_neighbors=17, weights=distance;, score=1.000 total time= 0.5s [CV 5/5] END ..n_neighbors=17, weights=distance;, score=1.000 total time= 0.4s [CV 1/5] END ...n_neighbors=17, weights=uniform;, score=1.000 total time= 0.7s [CV 2/5] END ...n_neighbors=17, weights=uniform;, score=1.000 total time= 0.7s [CV 3/5] END ...n_neighbors=17, weights=uniform;, score=1.000 total time= 0.7s [CV 4/5] END ...n_neighbors=17, weights=uniform;, score=1.000 total time= 0.7s [CV 5/5] END ...n_neighbors=17, weights=uniform;, score=1.000 total time= 0.7s [CV 1/5] END ..n_neighbors=19, weights=distance;, score=1.000 total time= 0.5s [CV 2/5] END ..n_neighbors=19, weights=distance;, score=1.000 total time= 0.5s [CV 3/5] END ..n_neighbors=19, weights=distance;, score=1.000 total time= 0.5s [CV 4/5] END ..n_neighbors=19, weights=distance;, score=1.000 total time= 0.5s [CV 5/5] END ..n_neighbors=19, weights=distance;, score=1.000 total time= 0.5s [CV 1/5] END ...n_neighbors=19, weights=uniform;, score=1.000 total time= 0.7s [CV 2/5] END ...n_neighbors=19, weights=uniform;, score=1.000 total time= 0.7s [CV 3/5] END ...n_neighbors=19, weights=uniform;, score=1.000 total time= 0.7s [CV 4/5] END ...n_neighbors=19, weights=uniform;, score=1.000 total time= 0.7s [CV 5/5] END ...n_neighbors=19, weights=uniform;, score=1.000 total time= 0.7s
GridSearchCV(estimator=KNeighborsClassifier(), param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19], 'weights': ['distance', 'uniform']}, verbose=3)
print(gridKNN.best_params_)
{'n_neighbors': 1, 'weights': 'distance'}
#predict with the best parameter
y_pred_test = gridKNN.predict(X_test)
y_pred_train = gridKNN.predict(X_train)
#Check accuracy and overfitting
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))
1.0 0.9999528479818937
#confusion matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred_test, labels=gridKNN.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=gridKNN.classes_)
fig = plt.figure(figsize=(5, 5))
disp.plot(cmap=plt.cm.Blues)
plt.grid(which='major') #remove cell gridlines
plt.gcf().set_size_inches(6, 6) # Adjust the size of the plot
plt.show()
<Figure size 500x500 with 0 Axes>
#model metrics
#function that get y_test and calculate into df all the relevant metric
def train_evaluate_model(y_test):
#fit the model instance
predictions = y_pred_test # calculate predictions
#compute metrics for evaluation
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
balanced_accuracy = balanced_accuracy_score(y_test, predictions)
auc = roc_auc_score(y_test, predictions)
#create a dataframe to visualize the results
eval_df = pd.DataFrame([[accuracy, f1, precision, recall, balanced_accuracy, auc]], columns=['accuracy', 'f1_score', 'precision', 'recall', 'balanced_accuracy', 'auc'])
return eval_df
#model metrics
results = train_evaluate_model(y_test)
results.index = ['K Nearest Neighbors - Method 1']
results.style.background_gradient(cmap = sns.color_palette("blend:green,red", as_cmap=True))
accuracy | f1_score | precision | recall | balanced_accuracy | auc | |
---|---|---|---|---|---|---|
K Nearest Neighbors - Method 1 | 0.999953 | 0.999953 | 0.999906 | 1.000000 | 0.999953 | 0.999953 |
dt = DecisionTreeClassifier(random_state=42)
dt = dt.fit(X_train, y_train)
#dt = DecisionTreeClassifier(random_state=42)
dt = dt.fit(X_train, y_train)
# defining parameter range
param_grid = {'max_depth':range(1, dt.tree_.max_depth+1, 2),
'max_features': range(1, len(dt.feature_importances_)+1)}
gridDT = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, n_jobs=-1)
# fitting the model for grid search
gridDT.fit(X_train, y_train)
GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1, param_grid={'max_depth': range(1, 2, 2), 'max_features': range(1, 8)})
print(gridDT.best_params_)
{'max_depth': 1, 'max_features': 1}
y_pred_test = gridDT.predict(X_test)
y_pred_train = gridDT.predict(X_train)
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))
1.0 1.0
#confusion matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred_test, labels=gridDT.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=gridDT.classes_)
fig = plt.figure(figsize=(5, 5))
disp.plot(cmap=plt.cm.Blues)
plt.grid(which='major') #remove cell gridlines
plt.gcf().set_size_inches(6, 6) # Adjust the size of the plot
plt.show()
<Figure size 500x500 with 0 Axes>
resultsDT = train_evaluate_model(y_test)
resultsDT.index = ['Decision Trees - Method 2']
results = results.append(resultsDT)
results.style.background_gradient(cmap = sns.color_palette("blend:red,green", as_cmap=True))
accuracy | f1_score | precision | recall | balanced_accuracy | auc | |
---|---|---|---|---|---|---|
K Nearest Neighbors - Method 1 | 0.999953 | 0.999953 | 0.999906 | 1.000000 | 0.999953 | 0.999953 |
Decision Trees - Method 2 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
RF = RandomForestClassifier(oob_score=True,
random_state=42,
warm_start=True,
n_jobs=-1)
# defining parameter range
param_grid = {'n_estimators':[15, 20, 30, 40, 50, 100, 150, 200, 300, 400]
}
gridRF = GridSearchCV(RF, param_grid)
# fitting the model for grid search
gridRF.fit(X_train, y_train)
GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=42, warm_start=True), param_grid={'n_estimators': [15, 20, 30, 40, 50, 100, 150, 200, 300, 400]})
print(gridRF.best_params_)
{'n_estimators': 15}
Prediction according to this model.
y_pred_test = gridRF.predict(X_test)
y_pred_train = gridRF.predict(X_train)
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))
1.0 1.0
#confusion matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred_test, labels=gridRF.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=gridRF.classes_)
fig = plt.figure(figsize=(5, 5))
disp.plot(cmap=plt.cm.Blues)
plt.grid(which='major') #remove cell gridlines
plt.gcf().set_size_inches(6, 6) # Adjust the size of the plot
plt.show()
<Figure size 500x500 with 0 Axes>
resultsRF = train_evaluate_model(y_test)
resultsRF.index = ['Random Forest - Method 3']
results = results.append(resultsRF)
results.style.background_gradient(cmap = sns.color_palette("blend:red,green", as_cmap=True))
accuracy | f1_score | precision | recall | balanced_accuracy | auc | |
---|---|---|---|---|---|---|
K Nearest Neighbors - Method 1 | 0.999953 | 0.999953 | 0.999906 | 1.000000 | 0.999953 | 0.999953 |
Decision Trees - Method 2 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
Random Forest - Method 3 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
Analysis by Olusola Fajobi, sholex111@gmail.com
Appreciation to SAMI OR YERMIYAHU