# 라이브러리 로드
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
# Load the raw data and discard exact duplicate rows in a single chained step.
# drop_duplicates() keeps the surviving rows' original index labels, so the
# index is no longer contiguous afterwards.
heart_d = pd.read_csv('heart_2020_cleaned.csv').drop_duplicates()
# Notebook-style echo of the de-duplicated frame.
heart_d
HeartDisease | BMI | Smoking | AlcoholDrinking | Stroke | PhysicalHealth | MentalHealth | DiffWalking | Sex | AgeCategory | Race | Diabetic | PhysicalActivity | GenHealth | SleepTime | Asthma | KidneyDisease | SkinCancer | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | No | 16.60 | Yes | No | No | 3.0 | 30.0 | No | Female | 55-59 | White | Yes | Yes | Very good | 5.0 | Yes | No | Yes |
1 | No | 20.34 | No | No | Yes | 0.0 | 0.0 | No | Female | 80 or older | White | No | Yes | Very good | 7.0 | No | No | No |
2 | No | 26.58 | Yes | No | No | 20.0 | 30.0 | No | Male | 65-69 | White | Yes | Yes | Fair | 8.0 | Yes | No | No |
3 | No | 24.21 | No | No | No | 0.0 | 0.0 | No | Female | 75-79 | White | No | No | Good | 6.0 | No | No | Yes |
4 | No | 23.71 | No | No | No | 28.0 | 0.0 | Yes | Female | 40-44 | White | No | Yes | Very good | 8.0 | No | No | No |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
319790 | Yes | 27.41 | Yes | No | No | 7.0 | 0.0 | Yes | Male | 60-64 | Hispanic | Yes | No | Fair | 6.0 | Yes | No | No |
319791 | No | 29.84 | Yes | No | No | 0.0 | 0.0 | No | Male | 35-39 | Hispanic | No | Yes | Very good | 5.0 | Yes | No | No |
319792 | No | 24.24 | No | No | No | 0.0 | 0.0 | No | Female | 45-49 | Hispanic | No | Yes | Good | 6.0 | No | No | No |
319793 | No | 32.81 | No | No | No | 0.0 | 0.0 | No | Female | 25-29 | Hispanic | No | No | Good | 12.0 | No | No | No |
319794 | No | 46.56 | No | No | No | 0.0 | 0.0 | No | Female | 80 or older | Hispanic | No | Yes | Good | 8.0 | No | No | No |
301717 rows × 18 columns
# Per-variable visualisation: start with the class balance of the target.
plt.figure(figsize=(10, 5))
g = sns.countplot(data=heart_d, x='HeartDisease')
g.set_title('HeartDisease', fontweight='bold')
plt.show()

# Column names arranged as the 5x2 grid that printCount draws.
range_col_name1 = [
    ['Smoking', 'AlcoholDrinking'],
    ['Stroke', 'DiffWalking'],
    ['Sex', 'PhysicalActivity'],
    ['Diabetic', 'Asthma'],
    ['KidneyDisease', 'SkinCancer'],
]

# Turn AgeCategory into an ordered categorical before it is plotted.
heart_d['AgeCategory'] = pd.Categorical(heart_d['AgeCategory'], ordered=True)

# Column names for the single row of three plots drawn by printCount_2.
range_col_name2 = [['AgeCategory', 'Race', 'GenHealth']]
def printCount(cols_name):
    """Draw a grid of count plots, one per column, split by HeartDisease.

    The grid shape is taken from ``cols_name`` itself, so the function is no
    longer tied to a 5x2 layout (for a 5x2 input the figure size is still
    20x30, as before).

    Args:
        cols_name: Non-empty list of equally long lists of column names of
            the module-level ``heart_d`` frame. ``cols_name[r][c]`` is drawn
            on the subplot at grid position ``(r, c)``.
    """
    n_rows = len(cols_name)
    n_cols = len(cols_name[0])
    # squeeze=False keeps ``axes`` two-dimensional even for one row or column,
    # so ``axes[row, col]`` indexing always works.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(10 * n_cols, 6 * n_rows), squeeze=False
    )
    # Spacing is a figure-wide setting: apply it once instead of per subplot.
    fig.subplots_adjust(
        left=0.075, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.35
    )
    for row, names in enumerate(cols_name):
        for col, column in enumerate(names):
            ax = axes[row, col]
            sns.countplot(ax=ax, x=heart_d[column], hue=heart_d['HeartDisease'])
            ax.set_title("Counts of {}".format(column), fontweight='bold')
def printCount_2(cols_name):
    """Draw one row of count plots, split by HeartDisease, with slanted labels.

    Only the first inner list of ``cols_name`` is used (as in the original
    single-row layout); the number of panels follows its length instead of
    being fixed at three.

    Args:
        cols_name: List whose first element is a non-empty list of column
            names of the module-level ``heart_d`` frame.
    """
    names = cols_name[0]
    # squeeze=False keeps ``axes`` 2-D so a single-column input works too.
    fig, axes = plt.subplots(1, len(names), figsize=(20, 8), squeeze=False)
    for ax, column in zip(axes[0], names):
        sns.countplot(ax=ax, x=heart_d[column], hue=heart_d['HeartDisease'])
        ax.set_title("Counts of {}".format(column), fontweight='bold')
    # Rotate the x tick labels once, after every panel has been drawn, so the
    # long category names do not overlap.
    fig.autofmt_xdate(rotation=45)
# Render the categorical overviews: the 5x2 grid first, then the
# three-panel row.
printCount(range_col_name1)
printCount_2(range_col_name2)

# Numeric columns whose distributions are plotted by printCount__.
col_name = [
    ['BMI', 'PhysicalHealth'],
    ['MentalHealth', 'SleepTime'],
]
def printCount__(cols_name):
    """Plot the distribution of each numeric column, overall and by target.

    For every column name a new 18x22 figure is created with two stacked
    histograms (bin width 1, KDE overlay): the upper one for all rows, the
    lower one split by ``HeartDisease``.

    Args:
        cols_name: List of lists of column names of the module-level
            ``heart_d`` frame. Every name in every inner list is plotted, so
            the input no longer has to be exactly 2x2.
    """
    for names in cols_name:
        for column in names:
            plt.figure(figsize=(18, 22))
            overall_ax = plt.subplot2grid((3, 4), (0, 0), colspan=2)
            by_target_ax = plt.subplot2grid((3, 4), (1, 0), colspan=2)
            # No palette here: without a hue variable seaborn has nothing to
            # map the palette onto.
            sns.histplot(
                data=heart_d, x=column, alpha=0.5, binwidth=1, kde=True,
                ax=overall_ax,
            )
            sns.histplot(
                data=heart_d, x=column, hue='HeartDisease', palette='mako',
                alpha=0.5, binwidth=1, kde=True, ax=by_target_ax,
            )


printCount__(col_name)
# Work on a copy so the frame used for plotting stays untouched.
heart_one_hot = heart_d.copy()
heart_one_hot['AgeCategory'].value_counts()

# One-hot encode Race (the only nominal feature with more than two levels).
# dtype=int keeps the indicators as 0/1 integers regardless of the pandas
# default for get_dummies.
heart_one_hot_race = pd.get_dummies(heart_one_hot['Race'], dtype=int)
heart_one_hot = heart_one_hot.drop(['Race'], axis='columns')

# Label / ordinal encodings for the remaining string-valued columns.
y_n = {"Yes":1, "No":0}
diabetic = {"Yes":2, "Yes (during pregnancy)":2, "No, borderline diabetes":1, "No":0}
sex = {"Female": 0, "Male":1}
health_map = {'Excellent' : 4, 'Very good' : 3, 'Good' : 2, 'Fair' : 1, 'Poor' : 0}
age = {"18-24":20, "25-29":25, "30-34":30, "35-39":35, "40-44":40, "45-49":45, "50-54":50, "55-59":55, "60-64":60, "65-69":65, "70-74":70, "75-79":75, "80 or older":80}


def _encode_column(column, mapping):
    """Return ``column`` recoded through ``mapping`` as a plain integer Series.

    Raises:
        ValueError: If ``column`` holds a value that ``mapping`` does not
            cover, instead of letting it pass through unencoded.
    """
    # Go through object dtype first: AgeCategory was made categorical for
    # plotting, and recoding a categorical can keep the category dtype, which
    # would hand the model a nominal feature rather than a numeric one.
    encoded = column.astype(object).map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(column[unmapped].astype(str).unique())
        raise ValueError(
            "Unmapped values in column {!r}: {}".format(column.name, unknown)
        )
    return encoded.astype(int)


# Every Yes/No column shares the same binary mapping.
for binary_column in [
    "Smoking", "AlcoholDrinking", "Stroke", "DiffWalking", "PhysicalActivity",
    "Asthma", "KidneyDisease", "HeartDisease", "SkinCancer",
]:
    heart_one_hot[binary_column] = _encode_column(heart_one_hot[binary_column], y_n)

heart_one_hot["Diabetic"] = _encode_column(heart_one_hot["Diabetic"], diabetic)
heart_one_hot["Sex"] = _encode_column(heart_one_hot["Sex"], sex)
heart_one_hot["GenHealth"] = _encode_column(heart_one_hot["GenHealth"], health_map)
heart_one_hot["AgeCategory"] = _encode_column(heart_one_hot["AgeCategory"], age)

# Append the Race indicator columns; concat aligns on the shared row index.
heart_one_hot = pd.concat([heart_one_hot,heart_one_hot_race], axis=1)
heart_one_hot
HeartDisease | BMI | Smoking | AlcoholDrinking | Stroke | PhysicalHealth | MentalHealth | DiffWalking | Sex | AgeCategory | ... | SleepTime | Asthma | KidneyDisease | SkinCancer | American Indian/Alaskan Native | Asian | Black | Hispanic | Other | White | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 16.60 | 1 | 0 | 0 | 3.0 | 30.0 | 0 | 0 | 55 | ... | 5.0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 0 | 20.34 | 0 | 0 | 1 | 0.0 | 0.0 | 0 | 0 | 80 | ... | 7.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | 0 | 26.58 | 1 | 0 | 0 | 20.0 | 30.0 | 0 | 1 | 65 | ... | 8.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
3 | 0 | 24.21 | 0 | 0 | 0 | 0.0 | 0.0 | 0 | 0 | 75 | ... | 6.0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | 0 | 23.71 | 0 | 0 | 0 | 28.0 | 0.0 | 1 | 0 | 40 | ... | 8.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
319790 | 1 | 27.41 | 1 | 0 | 0 | 7.0 | 0.0 | 1 | 1 | 60 | ... | 6.0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
319791 | 0 | 29.84 | 1 | 0 | 0 | 0.0 | 0.0 | 0 | 1 | 35 | ... | 5.0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
319792 | 0 | 24.24 | 0 | 0 | 0 | 0.0 | 0.0 | 0 | 0 | 45 | ... | 6.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
319793 | 0 | 32.81 | 0 | 0 | 0 | 0.0 | 0.0 | 0 | 0 | 25 | ... | 12.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
319794 | 0 | 46.56 | 0 | 0 | 0 | 0.0 | 0.0 | 0 | 0 | 80 | ... | 8.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
301717 rows × 23 columns
# Select the target and the features by name rather than by position, so a
# change in column order cannot silently swap the target for a feature.
y = heart_one_hot['HeartDisease']
x = heart_one_hot.drop(columns='HeartDisease')
# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)
# Modelling
def modeling(model,x_train,x_test,y_train,y_test):
    """Fit ``model`` on the training data and print its test-set metrics.

    Args:
        model: Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels (binary, positive class encoded as 1).
    """
    model.fit(x_train,y_train)
    pred = model.predict(x_test)
    # Probability of the positive class (second column of predict_proba).
    pred_proba = model.predict_proba(x_test)[:,1]
    # Pass the probabilities along so ROC AUC is computed from scores rather
    # than from thresholded labels.
    metrics(y_test,pred,pred_proba)


# Evaluation
def metrics(y_test,pred,pred_proba=None):
    """Print accuracy, recall, precision, F1, ROC AUC and the confusion matrix.

    Args:
        y_test: True binary labels.
        pred: Predicted class labels.
        pred_proba: Optional scores/probabilities for the positive class.
            When given, ROC AUC is computed from them; when omitted, the
            hard labels in ``pred`` are used, matching the old two-argument
            behaviour.
    """
    accuracy = accuracy_score(y_test,pred)
    precision = precision_score(y_test,pred)
    recall = recall_score(y_test,pred)
    f1 = f1_score(y_test,pred)
    matrix = confusion_matrix(y_test, pred)
    # ROC AUC is a ranking metric: feeding it 0/1 predictions collapses the
    # curve to a single operating point and understates the model.
    roc_input = pred if pred_proba is None else pred_proba
    roc_score = roc_auc_score(y_test,roc_input,average='macro')
    print('Accuracy: ', format(accuracy,'.2f'),'\n')
    print('Recall: ', format(recall,'.2f'),'\n')
    print('Precision: ', format(precision,'.2f'),'\n')
    print('F1_score: ', format(f1,'.2f'),'\n')
    print('Roc_score: ', format(roc_score,'.2f'),'\n')
    print('Confusion Matrix:','\n', matrix)
from lightgbm import LGBMClassifier

# Baseline run: fit LightGBM on the training split exactly as it came out of
# train_test_split (no resampling) and print the test metrics.
lgb = LGBMClassifier(
    n_estimators=1000,
    num_leaves=64,
    n_jobs=-1,
    boost_from_average=False,
)
modeling(lgb, x_train, x_test, y_train, y_test)
Accuracy: 0.91 Recall: 0.10 Precision: 0.50 F1_score: 0.17 Roc_score: 0.55 Confusion Matrix: [[54306 543] [ 4944 551]]
# Resample the training split with SMOTE; the test split is left as is.
smote = SMOTE(random_state=0)
x_train_s, y_train_s = smote.fit_resample(x_train, y_train)

# Report the shapes before/after resampling and the resulting class counts.
print('SMOTE 적용 전: ', x_train.shape, y_train.shape)
print('SMOTE 적용 후: ', x_train_s.shape, y_train_s.shape)
print('SMOTE 적용 후 0/1 값 분포: \n', pd.Series(y_train_s).value_counts())

# Refit the same classifier on the resampled data, evaluate on the test split.
modeling(lgb, x_train_s, x_test, y_train_s, y_test)
Accuracy: 0.86 Recall: 0.35 Precision: 0.27 F1_score: 0.30 Roc_score: 0.63 Confusion Matrix: [[49700 5149] [ 3595 1900]]
# Same experiment with the SVM-based SMOTE variant: resample the training
# split only, then refit and evaluate on the unchanged test split.
svmsmote = SVMSMOTE(random_state=0)
x_train_svm, y_train_svm = svmsmote.fit_resample(x_train, y_train)
modeling(lgb, x_train_svm, x_test, y_train_svm, y_test)
from imblearn.over_sampling import BorderlineSMOTE

# Same experiment with BorderlineSMOTE: resample the training split only.
bsmote = BorderlineSMOTE(random_state=42)
x_train_b, y_train_b = bsmote.fit_resample(x_train, y_train)

# Report the shapes before/after resampling and the resulting class counts.
print('SMOTE 적용 전: ', x_train.shape, y_train.shape)
print('SMOTE 적용 후: ', x_train_b.shape, y_train_b.shape)
print('SMOTE 적용 후 0/1 값 분포: \n', pd.Series(y_train_b).value_counts())

# Refit the same classifier on the resampled data, evaluate on the test split.
modeling(lgb, x_train_b, x_test, y_train_b, y_test)