import pandas as pd
# Load the survey data, discard exact duplicate rows, then summarise the columns.
df = pd.read_csv('heart_2020_cleaned.csv').drop_duplicates()
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 301717 entries, 0 to 319794 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 HeartDisease 301717 non-null object 1 BMI 301717 non-null float64 2 Smoking 301717 non-null object 3 AlcoholDrinking 301717 non-null object 4 Stroke 301717 non-null object 5 PhysicalHealth 301717 non-null float64 6 MentalHealth 301717 non-null float64 7 DiffWalking 301717 non-null object 8 Sex 301717 non-null object 9 AgeCategory 301717 non-null object 10 Race 301717 non-null object 11 Diabetic 301717 non-null object 12 PhysicalActivity 301717 non-null object 13 GenHealth 301717 non-null object 14 SleepTime 301717 non-null float64 15 Asthma 301717 non-null object 16 KidneyDisease 301717 non-null object 17 SkinCancer 301717 non-null object dtypes: float64(4), object(14) memory usage: 43.7+ MB
import missingno
missingno.matrix(df)
<AxesSubplot:>
흡연하는 사람은 흡연 안하는 사람보다 HeartDisease 발생할 확률 증가
import seaborn as sns
sns.histplot(data=df, x='Smoking', hue='HeartDisease', shrink=0.5, multiple='stack')
<AxesSubplot:xlabel='Smoking', ylabel='Count'>
len(df[(df['HeartDisease']=='Yes') & (df['Smoking']=='Yes')]) / len(df[df['Smoking']=='Yes'])
0.12540324163101918
len(df[(df['HeartDisease']=='Yes') & (df['Smoking']=='No')]) / len(df[df['Smoking']=='No'])
0.06473449905915829
HeartDisease 가진 환자의 절반 이상은 흡연을 함
sns.histplot(data=df, x='HeartDisease', hue='Smoking', shrink=0.5, multiple='stack')
<AxesSubplot:xlabel='HeartDisease', ylabel='Count'>
len(df[(df['HeartDisease']=='Yes') & (df['Smoking']=='Yes')]) / len(df[df['HeartDisease']=='Yes'])
0.586075345731998
알코올을 마시지 않는 사람들에게서 심장병 발생 확률이 좀 더 높음
sns.histplot(data=df, x='AlcoholDrinking', hue='HeartDisease', shrink=0.5, multiple='stack')
<AxesSubplot:xlabel='AlcoholDrinking', ylabel='Count'>
sns.histplot(data=df, x='HeartDisease', hue='AlcoholDrinking', shrink=0.5, multiple='stack')
<AxesSubplot:xlabel='HeartDisease', ylabel='Count'>
len(df[(df['HeartDisease']=='Yes') & (df['AlcoholDrinking']=='Yes')]) / len(df[df['AlcoholDrinking']=='Yes'])
0.052870580603308466
len(df[(df['HeartDisease']=='Yes') & (df['AlcoholDrinking']=='No')]) / len(df[df['AlcoholDrinking']=='No'])
0.09324042607876175
많은 사람이 천식이 없지만 천식 있는 사람이 심장병 걸릴 확률이 좀 더 높음
sns.histplot(data=df, x='Asthma', hue='HeartDisease', shrink=0.5, multiple='stack')
<AxesSubplot:xlabel='Asthma', ylabel='Count'>
sns.histplot(data=df, x='HeartDisease', hue='Asthma', shrink=0.5, multiple='stack')
<AxesSubplot:xlabel='HeartDisease', ylabel='Count'>
len(df[(df['HeartDisease']=='Yes') & (df['Asthma']=='Yes')]) / len(df[df['Asthma']=='Yes'])
0.11563621017092214
len(df[(df['HeartDisease']=='Yes') & (df['Asthma']=='No')]) / len(df[df['Asthma']=='No'])
0.08619039163765219
대부분의 사람은 신장질환(KidneyDisease)이 없지만, 신장질환이 있으면 심장병 발생 확률이 더 높음
sns.histplot(data=df, x='KidneyDisease', hue='HeartDisease', shrink=0.5, multiple='stack')
<AxesSubplot:xlabel='KidneyDisease', ylabel='Count'>
len(df[(df['HeartDisease']=='Yes') & (df['KidneyDisease']=='Yes')]) / len(df[df['KidneyDisease']=='Yes'])
0.29330842391304346
len(df[(df['HeartDisease']=='Yes') & (df['KidneyDisease']=='No')]) / len(df[df['KidneyDisease']=='No'])
0.08210980854725616
많은 사람이 피부암 없지만 있는 사람이 없는 사람보다 심장병 발생확률 높음
sns.histplot(data=df, x='SkinCancer', hue='HeartDisease', shrink=0.5, multiple='stack')
<AxesSubplot:xlabel='SkinCancer', ylabel='Count'>
len(df[(df['HeartDisease']=='Yes') & (df['SkinCancer']=='Yes')]) / len(df[df['SkinCancer']=='Yes'])
0.16960262187628022
len(df[(df['HeartDisease']=='Yes') & (df['SkinCancer']=='No')]) / len(df[df['SkinCancer']=='No'])
0.08183169679728365
최근 30일 동안 운동 한 사람에게서 심장병 발생할 확률이 더 낮음
sns.histplot(data=df, x='PhysicalActivity', hue='HeartDisease', shrink=0.5, multiple='stack')
<AxesSubplot:xlabel='PhysicalActivity', ylabel='Count'>
len(df[(df['HeartDisease']=='Yes') & (df['PhysicalActivity']=='Yes')]) / len(df[df['PhysicalActivity']=='Yes'])
0.07542575907504817
len(df[(df['HeartDisease']=='Yes') & (df['PhysicalActivity']=='No')]) / len(df[df['PhysicalActivity']=='No'])
0.1385877568192974
걷거나 오르내릴때 어려움이 있는 사람에게서 심장병 발생 확률 높음
sns.histplot(data=df, x='DiffWalking', hue='HeartDisease', shrink=0.5, multiple='stack')
<AxesSubplot:xlabel='DiffWalking', ylabel='Count'>
sns.histplot(data=df, x='DiffWalking', hue='HeartDisease', shrink=0.5, multiple='dodge')
<AxesSubplot:xlabel='DiffWalking', ylabel='Count'>
len(df[(df['HeartDisease']=='Yes') & (df['DiffWalking']=='Yes')]) / len(df[df['DiffWalking']=='Yes'])
0.22599481456431067
len(df[(df['HeartDisease']=='Yes') & (df['DiffWalking']=='No')]) / len(df[df['DiffWalking']=='No'])
0.06697569959823128
당뇨가 발생했던 사람들에게서 심장병 발생 확률 높음
sns.histplot(data=df, x='Diabetic', hue='HeartDisease', shrink=0.5, multiple='stack')
<AxesSubplot:xlabel='Diabetic', ylabel='Count'>
len(df[(df['HeartDisease']=='Yes') & (df['Diabetic']=='Yes')]) / len(df[df['Diabetic']=='Yes'])
0.22045381753677104
len(df[(df['HeartDisease']=='Yes') & (df['Diabetic']=='No')]) / len(df[df['Diabetic']=='No'])
0.06916710352825303
len(df[(df['HeartDisease']=='Yes') & (df['Diabetic']=='No, borderline diabetes')]) / len(df[df['Diabetic']=='No, borderline diabetes'])
0.11644037780401417
len(df[(df['HeartDisease']=='Yes') & (df['Diabetic']=='Yes (during pregnancy)')]) / len(df[df['Diabetic']=='Yes (during pregnancy)'])
0.04225352112676056
stroke 있는 사람들이 심장병 발생 확률 높음
sns.histplot(data=df, x='Stroke', hue='HeartDisease', shrink=0.5, multiple='stack')
<AxesSubplot:xlabel='Stroke', ylabel='Count'>
len(df[(df['HeartDisease']=='Yes') & (df['Stroke']=='Yes')]) / len(df[df['Stroke']=='Yes'])
0.36380968169761274
len(df[(df['HeartDisease']=='Yes') & (df['Stroke']=='No')]) / len(df[df['Stroke']=='No'])
0.07896344936872741
자신의 건강이 안좋다고 한 사람일수록 심장병 발생 확률 높음
Excellent < Very good < Good < Fair < Poor 순으로 심장병 발생 확률이 높아짐
sns.histplot(data=df, x='GenHealth', hue='HeartDisease', shrink=0.5, multiple='stack')
<AxesSubplot:xlabel='GenHealth', ylabel='Count'>
len(df[(df['HeartDisease']=='Yes') & (df['GenHealth']=='Poor')]) / len(df[df['GenHealth']=='Poor'])
0.341130604288499
len(df[(df['HeartDisease']=='Yes') & (df['GenHealth']=='Fair')]) / len(df[df['GenHealth']=='Fair'])
0.20436250324591015
len(df[(df['HeartDisease']=='Yes') & (df['GenHealth']=='Good')]) / len(df[df['GenHealth']=='Good'])
0.104176941877925
len(df[(df['HeartDisease']=='Yes') & (df['GenHealth']=='Very good')]) / len(df[df['GenHealth']=='Very good'])
0.05084163517691515
len(df[(df['HeartDisease']=='Yes') & (df['GenHealth']=='Excellent')]) / len(df[df['GenHealth']=='Excellent'])
0.02502636556907779
심장병 환자 수는 White가 가장 많지만, 발생률은 American Indian/Alaskan Native에서 가장 높음
sns.histplot(data=df, x='Race', hue='HeartDisease', shrink=0.5, multiple='stack')
<AxesSubplot:xlabel='Race', ylabel='Count'>
len(df[(df['HeartDisease']=='Yes') & (df['Race']=='White')]) / len(df[df['Race']=='White'])
0.09834273067397376
len(df[(df['HeartDisease']=='Yes') & (df['Race']=='Black')]) / len(df[df['Race']=='Black'])
0.07580008768084173
len(df[(df['HeartDisease']=='Yes') & (df['Race']=='Asian')]) / len(df[df['Race']=='Asian'])
0.03327911922932566
len(df[(df['HeartDisease']=='Yes') & (df['Race']=='American Indian/Alaskan Native')]) / len(df[df['Race']=='American Indian/Alaskan Native'])
0.10439137134052388
len(df[(df['HeartDisease']=='Yes') & (df['Race']=='Other')]) / len(df[df['Race']=='Other'])
0.08135157469470204
len(df[(df['HeartDisease']=='Yes') & (df['Race']=='Hispanic')]) / len(df[df['Race']=='Hispanic'])
0.05323348212638802
# PhysicalHealth distribution of American Indian/Alaskan Native vs. White
# respondents - unclear whether the difference is meaningful.
import matplotlib.pyplot as plt
plt.figure(figsize=(13,5))
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(df[df['Race']=='American Indian/Alaskan Native']["PhysicalHealth"], fill=True, label = 'American Indian/Alaskan Native')
sns.kdeplot(df[df['Race']=='White']["PhysicalHealth"], fill=True, label = 'White')
plt.legend()
<matplotlib.legend.Legend at 0x271d8a71640>
# BMI distribution of American Indian/Alaskan Native vs. White respondents -
# American Indian/Alaskan Native respondents tend to have a higher BMI.
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(df[df['Race']=='American Indian/Alaskan Native']["BMI"], fill=True, label = 'American Indian/Alaskan Native')
sns.kdeplot(df[df['Race']=='White']["BMI"], fill=True, label = 'White')
plt.legend()
<matplotlib.legend.Legend at 0x271d87b5fa0>
SleepTime과 HeartDisease사이의 관계는 모르겠음 - 적당한 수면시간?
# Density histogram with a KDE overlay; `histplot` replaces the deprecated `distplot`.
sns.histplot(x=df['SleepTime'], kde=True, stat='density')
C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:ylabel='Density'>
df['SleepTime'].value_counts()
8.0 90202 7.0 89445 6.0 64655 5.0 19101 9.0 15853 10.0 7782 4.0 7730 12.0 2204 3.0 1992 2.0 788 1.0 551 11.0 415 14.0 243 16.0 236 15.0 189 18.0 102 13.0 97 20.0 64 24.0 30 17.0 21 22.0 9 19.0 3 23.0 3 21.0 2 Name: SleepTime, dtype: int64
sns.boxplot(x=df['SleepTime'])
<AxesSubplot:xlabel='SleepTime'>
# Pass the series as `x=`: positional data vectors are deprecated in seaborn.
sns.violinplot(x=df['SleepTime'])
C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='SleepTime'>
# Pass the series as `x=`: positional data vectors are deprecated in seaborn.
sns.countplot(x=df['SleepTime'], color='lightblue')
C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='SleepTime', ylabel='count'>
sns.countplot(data = df, x = 'SleepTime', hue = 'HeartDisease')
<AxesSubplot:xlabel='SleepTime', ylabel='count'>
# SleepTime density for respondents without vs. with heart disease.
plt.figure(figsize=(9,5))
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(df[df["HeartDisease"]=='No']["SleepTime"], fill=True, label = 'No HeartDisease')
sns.kdeplot(df[df["HeartDisease"]=='Yes']["SleepTime"], fill=True, label = 'HeartDisease')
plt.legend()
<matplotlib.legend.Legend at 0x271d9ffb340>
BMI가 큰 영향을 미치지는 않지만 심장병 발생한 사람은 그렇지 않은 사람에 비해 BMI 지수가 높음
# Density histogram with a KDE overlay; `histplot` replaces the deprecated `distplot`.
sns.histplot(x=df['BMI'], kde=True, stat='density')
C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:ylabel='Density'>
sns.boxplot(x=df['BMI'])
<AxesSubplot:xlabel='BMI'>
sns.histplot(data=df, x='BMI', hue='HeartDisease', multiple='stack')
<AxesSubplot:xlabel='BMI', ylabel='Count'>
# Plain count histogram with 100 bins; `histplot` replaces the deprecated `distplot`.
sns.histplot(x=df['BMI'], bins=100)
C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:>
# BMI density for respondents without vs. with heart disease.
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(df[df["HeartDisease"]=='No']["BMI"], fill=True, label = 'No HeartDisease')
sns.kdeplot(df[df["HeartDisease"]=='Yes']["BMI"], fill=True, label = 'HeartDisease')
plt.legend()
<matplotlib.legend.Legend at 0x271d9c21490>
PhysicalHealth 값이 클수록 아프거나 다친 상태, 심장병 있으면 PhysicalHealth 값이 큼
# Density histogram with a KDE overlay; `histplot` replaces the deprecated `distplot`.
sns.histplot(x=df['PhysicalHealth'], kde=True, stat='density')
C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:ylabel='Density'>
# PhysicalHealth density for respondents without vs. with heart disease.
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(df[df["HeartDisease"]=='No']["PhysicalHealth"], fill=True, label = 'No HeartDisease')
sns.kdeplot(df[df["HeartDisease"]=='Yes']["PhysicalHealth"], fill=True, label = 'HeartDisease')
plt.legend()
<matplotlib.legend.Legend at 0x271d9bc42e0>
# Density histogram with a KDE overlay; `histplot` replaces the deprecated `distplot`.
sns.histplot(x=df['MentalHealth'], kde=True, stat='density')
C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:ylabel='Density'>
# MentalHealth density for respondents without vs. with heart disease.
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(df[df["HeartDisease"]=='No']["MentalHealth"], fill=True, label = 'No HeartDisease')
sns.kdeplot(df[df["HeartDisease"]=='Yes']["MentalHealth"], fill=True, label = 'HeartDisease')
plt.legend()
<matplotlib.legend.Legend at 0x271dbd9b430>
여성의 수가 더 많았으며 남성이 HeartDisease를 가질 확률이 더 높음. 심장병 환자의 약 59%가 남성
sns.histplot(data=df, x='Sex', hue='HeartDisease', shrink=0.5, multiple='stack')
<AxesSubplot:xlabel='Sex', ylabel='Count'>
len(df[(df['HeartDisease']=='Yes') & (df['Sex']=='Male')]) / len(df[df['Sex']=='Male'])
0.11297748616645312
len(df[(df['HeartDisease']=='Yes') & (df['Sex']=='Female')]) / len(df[df['Sex']=='Female'])
0.07022565149588841
len(df[(df['HeartDisease']=='Yes') & (df['Sex']=='Male')]) / len(df[df['HeartDisease']=='Yes'])
0.5886797989802282
왜 남성에서 더 많이 발생될까?
BMI에서는 유의미한 차이 모르겠음
# BMI density for female vs. male respondents.
plt.figure(figsize=(13,5))
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(df[df["Sex"]=='Female']["BMI"], fill=True, label = 'Female')
sns.kdeplot(df[df["Sex"]=='Male']["BMI"], fill=True, label = 'Male')
plt.legend()
<matplotlib.legend.Legend at 0x271db945760>
# Overlaid BMI count histograms (100 bins each) for female vs. male respondents.
plt.figure(figsize=(13,5))
# `histplot` replaces the deprecated `distplot`; explicit colors and a low
# alpha keep both overlaid histograms distinguishable.
sns.histplot(x=df[df["Sex"]=='Female']["BMI"], bins=100, color='C0', alpha=0.4, label = 'Female')
sns.histplot(x=df[df["Sex"]=='Male']["BMI"], bins=100, color='C1', alpha=0.4, label = 'Male')
plt.legend()
C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<matplotlib.legend.Legend at 0x271db8df520>
sns.barplot(x="Sex", y="BMI", hue='HeartDisease',data=df)
<AxesSubplot:xlabel='Sex', ylabel='BMI'>
'''
df_size = df.pivot_table(
index="Smoking", columns="Sex", aggfunc="size")
sns.heatmap(df_size, cmap=sns.light_palette(
"gray", as_cmap=True), annot=True, fmt="d")
plt.title("Heatmap")
plt.show()
'''
'\ndf_size = df.pivot_table(\n index="Smoking", columns="Sex", aggfunc="size")\n sns.heatmap(df_size, cmap=sns.light_palette(\n "gray", as_cmap=True), annot=True, fmt="d")\nplt.title("Heatmap")\nplt.show()\n'
#df_pivot = df.pivot("Smoking", "Sex", "BMI")
#sns.heatmap(df_pivot, annot=True, fmt="d", linewidths=1)
#sns.catplot(x="Sex", y="Smoking", col="HeartDisease", data=df, kind="bar")
# Per-sex subsets used by the side-by-side comparison plots.
df_female = df.loc[df["Sex"].eq('Female')]
df_male = df.loc[df["Sex"].eq('Male')]
남자의 흡연 비율이 더 높음 -> 심장병에 영향 가능
# Smoking vs. HeartDisease, women on the left panel and men on the right.
fig, ax = plt.subplots(ncols=2)
# Use the seaborn "paper" context with 20pt font, title and label sizes.
context_rc = {"font.size": 20, "axes.titlesize": 20, "axes.labelsize": 20}
sns.set_context("paper", rc=context_rc, font_scale=1.0)
for panel, subset in zip(ax, (df_female, df_male)):
    sns.histplot(data=subset, x='Smoking', hue='HeartDisease', stat='probability', multiple='stack', shrink=0.8, ax=panel)
fig.tight_layout()
sns.catplot(data= df, x = "Sex", col= "Smoking", hue='HeartDisease', col_wrap = 3, kind="count")
<seaborn.axisgrid.FacetGrid at 0x271dbe218e0>
#ax = sns.histplot(data=df, x='Smoking', hue='Sex', stat='probability', multiple='stack', shrink=0.5)
#for p in ax.patches:
# left, bottom, width, height = p.get_bbox().bounds
# ax.annotate("%.1f"%(height*100), xy=(left+width/2, bottom+height/2), ha='center', va='center')
#sns.set_theme(style='darkgrid')
#ax = sns.histplot(data=df, x='Smoking', hue='Sex', stat='probability', multiple='fill', shrink=0.8)
#for p in ax.patches:
# left, bottom, width, height = p.get_bbox().bounds
# ax.annotate("%.1f"%(height*100), xy=(left+width/2, bottom+height/2), ha='center', va='center')
#sns.set_context("paper", font_scale = 1.5, rc={'axes.labelsize': 17.6,
# 'axes.titlesize': 19.200000000000003,
# 'font.size': 19.200000000000003})
# AlcoholDrinking vs. HeartDisease, women on the left panel and men on the right.
fig, ax = plt.subplots(ncols=2)
for panel, subset in zip(ax, (df_female, df_male)):
    sns.histplot(data=subset, x='AlcoholDrinking', hue='HeartDisease', stat='probability', multiple='stack', shrink=0.8, ax=panel)
fig.tight_layout()
남자의 뇌졸중 발생 비율이 더 높음 -> 심장병에 영향 가능
# One two-panel figure per column: women on the left, men on the right.
for column in ('Stroke', 'DiffWalking'):
    fig, ax = plt.subplots(ncols=2)
    for panel, subset in zip(ax, (df_female, df_male)):
        sns.histplot(data=subset, x=column, hue='HeartDisease', stat='probability', multiple='stack', shrink=0.8, ax=panel)
    fig.tight_layout()
남자의 KidneyDisease 비율이 더 높음 -> 심장병에 영향 가능
# KidneyDisease vs. HeartDisease, women on the left panel and men on the right.
fig, ax = plt.subplots(ncols=2)
for panel, subset in zip(ax, (df_female, df_male)):
    sns.histplot(data=subset, x='KidneyDisease', hue='HeartDisease', stat='probability', multiple='stack', shrink=0.8, ax=panel)
fig.tight_layout()
남자의 피부암 비율이 더 높음 -> 심장병에 영향 가능
# SkinCancer vs. HeartDisease, women on the left panel and men on the right.
fig, ax = plt.subplots(ncols=2)
for panel, subset in zip(ax, (df_female, df_male)):
    sns.histplot(data=subset, x='SkinCancer', hue='HeartDisease', stat='probability', multiple='stack', shrink=0.8, ax=panel)
fig.tight_layout()
# Fold the "Yes (during pregnancy)" answers into the plain "Yes" category,
# then show the resulting category counts.
df['Diabetic'] = df['Diabetic'].replace('Yes (during pregnancy)', 'Yes')
df['Diabetic'].value_counts()
No 251796 Yes 43145 No, borderline diabetes 6776 Name: Diabetic, dtype: int64
# Rebuild the per-sex subsets from the current state of df.
df_female, df_male = (df[df["Sex"] == sex] for sex in ('Female', 'Male'))
남자의 당뇨 비율이 더 높음 -> 심장병에 영향 가능
# One two-panel figure per column: women on the left, men on the right.
for column in ('Diabetic', 'Asthma', 'PhysicalActivity', 'GenHealth'):
    fig, ax = plt.subplots(ncols=2)
    for panel, subset in zip(ax, (df_female, df_male)):
        sns.histplot(data=subset, x=column, hue='HeartDisease', stat='probability', multiple='stack', shrink=0.8, ax=panel)
    fig.tight_layout()
# PhysicalHealth density for female vs. male respondents.
plt.figure(figsize=(13,5))
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(df[df["Sex"]=='Female']["PhysicalHealth"], fill=True, label = 'Female')
sns.kdeplot(df[df["Sex"]=='Male']["PhysicalHealth"], fill=True, label = 'Male')
plt.legend()
<matplotlib.legend.Legend at 0x271dbe84850>
# MentalHealth density for female vs. male respondents.
plt.figure(figsize=(13,5))
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(df[df["Sex"]=='Female']["MentalHealth"], fill=True, label = 'Female')
sns.kdeplot(df[df["Sex"]=='Male']["MentalHealth"], fill=True, label = 'Male')
plt.legend()
<matplotlib.legend.Legend at 0x271c062f040>
import matplotlib.pyplot as plt
plt.xticks(rotation=45)
sns.histplot(sorted(df["AgeCategory"]))
<AxesSubplot:ylabel='Count'>
sns.histplot(df[df["HeartDisease"]=='Yes']["AgeCategory"])
<AxesSubplot:xlabel='AgeCategory', ylabel='Count'>
plt.xticks(rotation=45)
sns.histplot(data=df, x='AgeCategory', hue='HeartDisease', multiple='stack', shrink=1)
<AxesSubplot:xlabel='AgeCategory', ylabel='Count'>
# Turn each AgeCategory label into one representative integer age:
# "lo-hi" brackets become their midpoint, and a label without a hyphen
# (presumably "80 or older" - verify against the data) keeps its first two
# characters.
age_parts = df['AgeCategory'].str.split('-')
df['Age_temp'] = [
    bounds[0][:2] if len(bounds) == 1 else (int(bounds[0]) + int(bounds[1])) / 2
    for bounds in age_parts
]
# The column mixes digit strings and float midpoints; cast everything to int.
df['Age_temp'] = df['Age_temp'].astype('int64')
df['Age_temp'].value_counts()
67 31670 62 31219 72 29273 57 27610 52 23736 80 23352 77 20713 47 20518 21 19998 42 19837 37 19526 32 17953 27 16312 Name: Age_temp, dtype: int64
sns.histplot(data=df, x='Age_temp', hue='HeartDisease', multiple='stack')
<AxesSubplot:xlabel='Age_temp', ylabel='Count'>
# Count histogram of the representative ages; `histplot` replaces the deprecated `distplot`.
sns.histplot(x=df["Age_temp"])
C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='Age_temp'>
나이 많을수록 심장병 발생률 증가
# Age density (histogram + KDE) of respondents with heart disease;
# `histplot` replaces the deprecated `distplot`.
sns.histplot(x=df[df["HeartDisease"]=='Yes']["Age_temp"], kde=True, stat='density')
<AxesSubplot:xlabel='Age_temp', ylabel='Density'>
sns.kdeplot(df[df["HeartDisease"]=='Yes']["Age_temp"])
<AxesSubplot:xlabel='Age_temp', ylabel='Density'>
# Overlaid age densities for respondents with and without heart disease.
# `histplot` replaces the deprecated `distplot`; explicit colors and a low
# alpha keep both overlaid histograms distinguishable.
sns.histplot(x=df[df["HeartDisease"]=='Yes']["Age_temp"], kde=True, stat='density', color='C0', alpha=0.4)
sns.histplot(x=df[df["HeartDisease"]=='No']["Age_temp"], kde=True, stat='density', color='C1', alpha=0.4)
C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='Age_temp', ylabel='Density'>
# Age count histograms: without heart disease on top, with heart disease below.
fig, ax = plt.subplots(nrows=2)
# `histplot` replaces the deprecated `distplot`.
sns.histplot(x=df[df["HeartDisease"]=='No']["Age_temp"], ax=ax[0])
sns.histplot(x=df[df["HeartDisease"]=='Yes']["Age_temp"], ax=ax[1])
C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='Age_temp'>
# Age density for respondents without vs. with heart disease.
plt.figure(figsize=(13,5))
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(df[df["HeartDisease"]=='No']["Age_temp"], fill=True, label = 'No HeartDisease')
sns.kdeplot(df[df["HeartDisease"]=='Yes']["Age_temp"], fill=True, label = 'HeartDisease')
plt.legend()
<matplotlib.legend.Legend at 0x271dbe97820>
왜 나이가 많을수록 심장병 많이?
나이가 많을수록 여러 질병 많이 걸리지 않았을까?
# Age density of the respondents reporting each condition, drawn as
# unfilled curves on one shared axis.
fig, ax = plt.subplots(figsize = (14,6))
# `fill=False` is the supported spelling of the deprecated `shade=False`.
sns.kdeplot(df[df["HeartDisease"]=='Yes']["Age_temp"], fill=False, label="HeartDisease", ax = ax)
sns.kdeplot(df[df["KidneyDisease"]=='Yes']["Age_temp"], fill=False, label="KidneyDisease", ax = ax)  # kidney disease (renal failure)
sns.kdeplot(df[df["SkinCancer"]=='Yes']["Age_temp"], fill=False, label="SkinCancer", ax = ax)
sns.kdeplot(df[df["Diabetic"]=='Yes']["Age_temp"], fill=False, label="Diabetic", ax = ax)
plt.legend()
<matplotlib.legend.Legend at 0x271da978b50>
import pandas as pd
# Reload the raw data and encode it numerically for the correlation analysis.
# NOTE(review): unlike the exploratory section at the top of the file,
# duplicate rows are not dropped here - confirm that this is intended.
df = pd.read_csv('heart_2020_cleaned.csv')

# Binary Yes/No columns become 1/0 in a single pass.
categorical_col = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']
yes_no_codes = {'Yes': 1, 'No': 0}
df[categorical_col] = df[categorical_col].apply(lambda column: column.map(yes_no_codes)).astype('int64')

# Sex: Female -> 1, Male -> 0. GenHealth: ordinal scale 1 (Poor) to 5 (Excellent).
sex_codes = {'Female': 1, 'Male': 0}
gen_health_codes = {'Poor': 1, 'Fair': 2, 'Good' : 3, 'Very good' : 4, 'Excellent' : 5}
df['Sex'] = df['Sex'].map(sex_codes).astype('int64')
df['GenHealth'] = df['GenHealth'].map(gen_health_codes).astype('int64')

# Fold the "Yes (during pregnancy)" answers into the plain "Yes" category.
df.loc[df['Diabetic']=='Yes (during pregnancy)', 'Diabetic'] = 'Yes'

# Representative integer age per AgeCategory: the midpoint of "lo-hi"
# brackets, or the first two characters of a label without a hyphen.
age_parts = df['AgeCategory'].str.split('-')
df['Age_temp'] = [
    bounds[0][:2] if len(bounds) == 1 else (int(bounds[0]) + int(bounds[1])) / 2
    for bounds in age_parts
]
df['Age_temp'] = df['Age_temp'].astype('int64')

# Not encoded yet: Race, Diabetic.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 319795 entries, 0 to 319794 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 HeartDisease 319795 non-null int64 1 BMI 319795 non-null float64 2 Smoking 319795 non-null int64 3 AlcoholDrinking 319795 non-null int64 4 Stroke 319795 non-null int64 5 PhysicalHealth 319795 non-null float64 6 MentalHealth 319795 non-null float64 7 DiffWalking 319795 non-null int64 8 Sex 319795 non-null int64 9 AgeCategory 319795 non-null object 10 Race 319795 non-null object 11 Diabetic 319795 non-null object 12 PhysicalActivity 319795 non-null int64 13 GenHealth 319795 non-null int64 14 SleepTime 319795 non-null float64 15 Asthma 319795 non-null int64 16 KidneyDisease 319795 non-null int64 17 SkinCancer 319795 non-null int64 18 Age_temp 319795 non-null int64 dtypes: float64(4), int64(12), object(3) memory usage: 46.4+ MB
# Correlation heatmap of the encoded data set (AgeCategory is represented by Age_temp).
df_temp = df.drop('AgeCategory', axis=1, inplace=False)
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 10))
sns.set(font_scale=.8)
# df_temp still contains text columns (Race, Diabetic). Select the numeric
# columns explicitly, because DataFrame.corr() raises on non-numeric data
# in pandas >= 2.0.
sns.heatmap(data = df_temp.select_dtypes(include='number').corr(), square = True, linecolor = "white",cmap = plt.cm.PuBu, annot = True)
<AxesSubplot:>
import warnings
warnings.filterwarnings("ignore")
from dataprep.eda import plot, plot_correlation, create_report, plot_missing
plot(df_temp)
0%| | 0/648 [00:00<…
Number of Variables | 18 |
---|---|
Number of Rows | 319795 |
Missing Cells | 0 |
Missing Cells (%) | 0.0% |
Duplicate Rows | 18094 |
Duplicate Rows (%) | 5.7% |
Total Size in Memory | 76.3 MB |
Average Row Size in Memory | 250.2 B |
Variable Types |
|
PhysicalHealth and MentalHealth have similar distributions | Similar Distribution |
---|---|
BMI is skewed | Skewed |
PhysicalHealth is skewed | Skewed |
MentalHealth is skewed | Skewed |
SleepTime is skewed | Skewed |
Dataset has 18094 (5.66%) duplicate rows | Duplicates |
HeartDisease has constant length 1 | Constant Length |
Smoking has constant length 1 | Constant Length |
AlcoholDrinking has constant length 1 | Constant Length |
Stroke has constant length 1 | Constant Length |
DiffWalking has constant length 1 | Constant Length |
---|---|
Sex has constant length 1 | Constant Length |
PhysicalActivity has constant length 1 | Constant Length |
GenHealth has constant length 1 | Constant Length |
Asthma has constant length 1 | Constant Length |
KidneyDisease has constant length 1 | Constant Length |
SkinCancer has constant length 1 | Constant Length |
PhysicalHealth has 226589 (70.85%) zeros | Zeros |
MentalHealth has 205401 (64.23%) zeros | Zeros |
Number of plots per page:
create_report(df_temp)
0%| | 0/2261 [00:00<…
error happended in column:HeartDisease
--------------------------------------------------------------------------- ImportError Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_29784/2537952997.py in <module> ----> 1 create_report(df_temp) ~\anaconda3\lib\site-packages\dataprep\eda\create_report\__init__.py in create_report(df, config, display, title, mode, progress) 66 "resources": INLINE.render(), 67 "title": title, ---> 68 "components": format_report(df, cfg, mode, progress), 69 } 70 template_base = ENV_LOADER.get_template("base.html") ~\anaconda3\lib\site-packages\dataprep\eda\create_report\formatter.py in format_report(df, cfg, mode, progress) 76 if mode == "basic": 77 edaframe = EDAFrame(df) ---> 78 comps = format_basic(edaframe, cfg) 79 # elif mode == "full": 80 # comps = format_full(df) ~\anaconda3\lib\site-packages\dataprep\eda\create_report\formatter.py in format_basic(df, cfg) 289 290 res_overview = _format_overview(data, cfg) --> 291 res_variables = _format_variables(df, cfg, data) 292 res_interaction = _format_interaction(data, cfg) 293 res_correlations = _format_correlation(data, cfg) ~\anaconda3\lib\site-packages\dataprep\eda\create_report\formatter.py in _format_variables(df, cfg, data) 118 raise RuntimeError(f"the type of column {col} is unknown: {type(dtp)}") 119 --> 120 rndrd = render(itmdt, cfg) 121 layout = rndrd["layout"] 122 figs_var: List[Figure] = [] ~\anaconda3\lib\site-packages\dataprep\eda\distribution\render.py in render(itmdt, cfg) 2471 visual_elem = render_distribution_grid(itmdt, cfg) 2472 elif itmdt.visual_type == "categorical_column": -> 2473 visual_elem = render_cat(itmdt, cfg) 2474 elif itmdt.visual_type == "geography_column": 2475 visual_elem = render_geo(itmdt, cfg) ~\anaconda3\lib\site-packages\dataprep\eda\distribution\render.py in render_cat(itmdt, cfg) 1591 if cfg.wordcloud.enable: 1592 if data["nuniq_words_cloud"] > 0: -> 1593 tabs.append(wordcloud_viz(data["word_cnts_cloud"], plot_width, plot_height)) 1594 htgs["Word Cloud"] = 
cfg.wordcloud.how_to_guide(plot_height, plot_width) 1595 if cfg.wordfreq.enable: ~\anaconda3\lib\site-packages\dataprep\eda\distribution\render.py in wordcloud_viz(word_cnts, plot_width, plot_height) 140 ellipse_mask = np.load(f"{Path(__file__).parent.parent.parent}/assets/ellipse.npz").get("image") 141 wordcloud = WordCloud(background_color="white", mask=ellipse_mask) --> 142 wordcloud.generate_from_frequencies(word_cnts) 143 wcarr = wordcloud.to_array().astype(np.uint8) 144 ~\anaconda3\lib\site-packages\wordcloud\wordcloud.py in generate_from_frequencies(self, frequencies, max_font_size) 444 font_size = self.height 445 else: --> 446 self.generate_from_frequencies(dict(frequencies[:2]), 447 max_font_size=self.height) 448 # find font sizes ~\anaconda3\lib\site-packages\wordcloud\wordcloud.py in generate_from_frequencies(self, frequencies, max_font_size) 494 while True: 495 # try to find a position --> 496 font = ImageFont.truetype(self.font_path, font_size) 497 # transpose font optionally 498 transposed_font = ImageFont.TransposedFont( ~\anaconda3\lib\site-packages\PIL\ImageFont.py in truetype(font, size, index, encoding, layout_engine) 853 854 try: --> 855 return freetype(font) 856 except OSError: 857 if not isPath(font): ~\anaconda3\lib\site-packages\PIL\ImageFont.py in freetype(font) 850 851 def freetype(font): --> 852 return FreeTypeFont(font, size, index, encoding, layout_engine) 853 854 try: ~\anaconda3\lib\site-packages\PIL\ImageFont.py in __init__(self, font, size, index, encoding, layout_engine) 185 if layout_engine not in (LAYOUT_BASIC, LAYOUT_RAQM): 186 layout_engine = LAYOUT_BASIC --> 187 if core.HAVE_RAQM: 188 layout_engine = LAYOUT_RAQM 189 elif layout_engine == LAYOUT_RAQM and not core.HAVE_RAQM: ~\anaconda3\lib\site-packages\PIL\ImageFont.py in __getattr__(self, id) 42 # module placeholder 43 def __getattr__(self, id): ---> 44 raise ImportError("The _imagingft C module is not installed") 45 46 ImportError: The _imagingft C module is not installed
plot_correlation(df_temp)
100%|############################################################################################| 4/4 [00:00<…
Pearson | Spearman | KendallTau | |
---|---|---|---|
Highest Positive Correlation | 0.428 | 0.355 | 0.333 |
Highest Negative Correlation | -0.483 | -0.408 | -0.357 |
Lowest Correlation | 0.0 | 0.0 | 0.0 |
Mean Correlation | 0.014 | 0.013 | 0.012 |
plot_missing(df_temp)
plot(df, 'BMI', 'SleepTime')