In [1]:

import pandas as pd
df = pd.read_csv('heart_2020_cleaned.csv')

In [2]:

df = df.drop_duplicates()

In [3]:

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 301717 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      301717 non-null  object 
 1   BMI               301717 non-null  float64
 2   Smoking           301717 non-null  object 
 3   AlcoholDrinking   301717 non-null  object 
 4   Stroke            301717 non-null  object 
 5   PhysicalHealth    301717 non-null  float64
 6   MentalHealth      301717 non-null  float64
 7   DiffWalking       301717 non-null  object 
 8   Sex               301717 non-null  object 
 9   AgeCategory       301717 non-null  object 
 10  Race              301717 non-null  object 
 11  Diabetic          301717 non-null  object 
 12  PhysicalActivity  301717 non-null  object 
 13  GenHealth         301717 non-null  object 
 14  SleepTime         301717 non-null  float64
 15  Asthma            301717 non-null  object 
 16  KidneyDisease     301717 non-null  object 
 17  SkinCancer        301717 non-null  object 
dtypes: float64(4), object(14)
memory usage: 43.7+ MB

In [4]:

import missingno
missingno.matrix(df)

Out[4]:

<AxesSubplot:>

Smoking 정보¶

흡연하는 사람은 흡연 안하는 사람보다 HeartDisease 발생할 확률 증가

In [5]:

import seaborn as sns
sns.histplot(data=df, x='Smoking', hue='HeartDisease', shrink=0.5, multiple='stack')

Out[5]:

<AxesSubplot:xlabel='Smoking', ylabel='Count'>

In [6]:

len(df[(df['HeartDisease']=='Yes') & (df['Smoking']=='Yes')]) / len(df[df['Smoking']=='Yes'])

Out[6]:

0.12540324163101918

In [7]:

len(df[(df['HeartDisease']=='Yes') & (df['Smoking']=='No')]) / len(df[df['Smoking']=='No'])

Out[7]:

0.06473449905915829

HeartDisease 가진 환자의 절반 이상은 흡연을 함

In [8]:

sns.histplot(data=df, x='HeartDisease', hue='Smoking', shrink=0.5, multiple='stack')

Out[8]:

<AxesSubplot:xlabel='HeartDisease', ylabel='Count'>

In [9]:

len(df[(df['HeartDisease']=='Yes') & (df['Smoking']=='Yes')]) / len(df[df['HeartDisease']=='Yes'])

Out[9]:

0.586075345731998

AlcoholDrinking 정보¶

알코올 안마시는 사람들에게서 심장병 발생확률이 좀 더 높음

In [10]:

sns.histplot(data=df, x='AlcoholDrinking', hue='HeartDisease', shrink=0.5, multiple='stack')

Out[10]:

<AxesSubplot:xlabel='AlcoholDrinking', ylabel='Count'>

In [11]:

sns.histplot(data=df, x='HeartDisease', hue='AlcoholDrinking', shrink=0.5, multiple='stack')

Out[11]:

<AxesSubplot:xlabel='HeartDisease', ylabel='Count'>

In [14]:

len(df[(df['HeartDisease']=='Yes') & (df['AlcoholDrinking']=='Yes')]) / len(df[df['AlcoholDrinking']=='Yes'])

Out[14]:

0.052870580603308466

In [15]:

len(df[(df['HeartDisease']=='Yes') & (df['AlcoholDrinking']=='No')]) / len(df[df['AlcoholDrinking']=='No'])

Out[15]:

0.09324042607876175

Asthma, KidneyDisease, SkinCancer¶

많은 사람이 천식이 없지만 천식 있는 사람이 심장병 걸릴 확률이 좀 더 높음

In [16]:

sns.histplot(data=df, x='Asthma', hue='HeartDisease', shrink=0.5, multiple='stack')

Out[16]:

<AxesSubplot:xlabel='Asthma', ylabel='Count'>

In [17]:

sns.histplot(data=df, x='HeartDisease', hue='Asthma', shrink=0.5, multiple='stack')

Out[17]:

<AxesSubplot:xlabel='HeartDisease', ylabel='Count'>

In [18]:

len(df[(df['HeartDisease']=='Yes') & (df['Asthma']=='Yes')]) / len(df[df['Asthma']=='Yes'])

Out[18]:

0.11563621017092214

In [19]:

len(df[(df['HeartDisease']=='Yes') & (df['Asthma']=='No')]) / len(df[df['Asthma']=='No'])

Out[19]:

0.08619039163765219

대부분의 사람이 심부전이 없지만 심부전 있으면 심장병 발생 확률 좀 더 높음

In [20]:

sns.histplot(data=df, x='KidneyDisease', hue='HeartDisease', shrink=0.5, multiple='stack')

Out[20]:

<AxesSubplot:xlabel='KidneyDisease', ylabel='Count'>

In [21]:

len(df[(df['HeartDisease']=='Yes') & (df['KidneyDisease']=='Yes')]) / len(df[df['KidneyDisease']=='Yes'])

Out[21]:

0.29330842391304346

In [22]:

len(df[(df['HeartDisease']=='Yes') & (df['KidneyDisease']=='No')]) / len(df[df['KidneyDisease']=='No'])

Out[22]:

0.08210980854725616

많은 사람이 피부암 없지만 있는 사람이 없는 사람보다 심장병 발생확률 높음

In [23]:

sns.histplot(data=df, x='SkinCancer', hue='HeartDisease', shrink=0.5, multiple='stack')

Out[23]:

<AxesSubplot:xlabel='SkinCancer', ylabel='Count'>

In [24]:

len(df[(df['HeartDisease']=='Yes') & (df['SkinCancer']=='Yes')]) / len(df[df['SkinCancer']=='Yes'])

Out[24]:

0.16960262187628022

In [25]:

len(df[(df['HeartDisease']=='Yes') & (df['SkinCancer']=='No')]) / len(df[df['SkinCancer']=='No'])

Out[25]:

0.08183169679728365

PhysicalActivity¶

최근 30일 동안 운동 한 사람에게서 심장병 발생할 확률이 더 낮음

In [26]:

sns.histplot(data=df, x='PhysicalActivity', hue='HeartDisease', shrink=0.5, multiple='stack')

Out[26]:

<AxesSubplot:xlabel='PhysicalActivity', ylabel='Count'>

In [27]:

len(df[(df['HeartDisease']=='Yes') & (df['PhysicalActivity']=='Yes')]) / len(df[df['PhysicalActivity']=='Yes'])

Out[27]:

0.07542575907504817

In [28]:

len(df[(df['HeartDisease']=='Yes') & (df['PhysicalActivity']=='No')]) / len(df[df['PhysicalActivity']=='No'])

Out[28]:

0.1385877568192974

DiffWalking - 걷거나 계단 오르내릴때 어려움이 있는지¶

걷거나 오르내릴때 어려움이 있는 사람에게서 심장병 발생 확률 높음

In [29]:

sns.histplot(data=df, x='DiffWalking', hue='HeartDisease', shrink=0.5, multiple='stack')

Out[29]:

<AxesSubplot:xlabel='DiffWalking', ylabel='Count'>

In [30]:

sns.histplot(data=df, x='DiffWalking', hue='HeartDisease', shrink=0.5, multiple='dodge')

Out[30]:

<AxesSubplot:xlabel='DiffWalking', ylabel='Count'>

In [31]:

len(df[(df['HeartDisease']=='Yes') & (df['DiffWalking']=='Yes')]) / len(df[df['DiffWalking']=='Yes'])

Out[31]:

0.22599481456431067

In [32]:

len(df[(df['HeartDisease']=='Yes') & (df['DiffWalking']=='No')]) / len(df[df['DiffWalking']=='No'])

Out[32]:

0.06697569959823128

Diabetic, Stroke¶

당뇨가 발생했던 사람들에게서 심장병 발생 확률 높음

In [33]:

sns.histplot(data=df, x='Diabetic', hue='HeartDisease', shrink=0.5, multiple='stack')

Out[33]:

<AxesSubplot:xlabel='Diabetic', ylabel='Count'>

In [34]:

len(df[(df['HeartDisease']=='Yes') & (df['Diabetic']=='Yes')]) / len(df[df['Diabetic']=='Yes'])

Out[34]:

0.22045381753677104

In [35]:

len(df[(df['HeartDisease']=='Yes') & (df['Diabetic']=='No')]) / len(df[df['Diabetic']=='No'])

Out[35]:

0.06916710352825303

In [36]:

len(df[(df['HeartDisease']=='Yes') & (df['Diabetic']=='No, borderline diabetes')]) / len(df[df['Diabetic']=='No, borderline diabetes'])

Out[36]:

0.11644037780401417

In [37]:

len(df[(df['HeartDisease']=='Yes') & (df['Diabetic']=='Yes (during pregnancy)')]) / len(df[df['Diabetic']=='Yes (during pregnancy)'])

Out[37]:

0.04225352112676056

stroke 있는 사람들이 심장병 발생 확률 높음

In [38]:

sns.histplot(data=df, x='Stroke', hue='HeartDisease', shrink=0.5, multiple='stack')

Out[38]:

<AxesSubplot:xlabel='Stroke', ylabel='Count'>

In [39]:

len(df[(df['HeartDisease']=='Yes') & (df['Stroke']=='Yes')]) / len(df[df['Stroke']=='Yes'])

Out[39]:

0.36380968169761274

In [40]:

len(df[(df['HeartDisease']=='Yes') & (df['Stroke']=='No')]) / len(df[df['Stroke']=='No'])

Out[40]:

0.07896344936872741

GenHealth¶

자신의 건강이 안좋다고 한 사람일수록 심장병 발생 확률 높음
Excellent < VeryGood < Good < Fair < Poor 순으로 높음

In [41]:

sns.histplot(data=df, x='GenHealth', hue='HeartDisease', shrink=0.5, multiple='stack')

Out[41]:

<AxesSubplot:xlabel='GenHealth', ylabel='Count'>

In [42]:

len(df[(df['HeartDisease']=='Yes') & (df['GenHealth']=='Poor')]) / len(df[df['GenHealth']=='Poor'])

Out[42]:

0.341130604288499

In [43]:

len(df[(df['HeartDisease']=='Yes') & (df['GenHealth']=='Fair')]) / len(df[df['GenHealth']=='Fair'])

Out[43]:

0.20436250324591015

In [44]:

len(df[(df['HeartDisease']=='Yes') & (df['GenHealth']=='Good')]) / len(df[df['GenHealth']=='Good'])

Out[44]:

0.104176941877925

In [45]:

len(df[(df['HeartDisease']=='Yes') & (df['GenHealth']=='Very good')]) / len(df[df['GenHealth']=='Very good'])

Out[45]:

0.05084163517691515

In [46]:

len(df[(df['HeartDisease']=='Yes') & (df['GenHealth']=='Excellent')]) / len(df[df['GenHealth']=='Excellent'])

Out[46]:

0.02502636556907779

Race¶

심장병 발생 비율은 White가 가장 많지만 American Indian/Alaskan Native에서 가장 높은 발생률을 보임

In [47]:

sns.histplot(data=df, x='Race', hue='HeartDisease', shrink=0.5, multiple='stack')

Out[47]:

<AxesSubplot:xlabel='Race', ylabel='Count'>

In [48]:

len(df[(df['HeartDisease']=='Yes') & (df['Race']=='White')]) / len(df[df['Race']=='White'])

Out[48]:

0.09834273067397376

In [49]:

len(df[(df['HeartDisease']=='Yes') & (df['Race']=='Black')]) / len(df[df['Race']=='Black'])

Out[49]:

0.07580008768084173

In [50]:

len(df[(df['HeartDisease']=='Yes') & (df['Race']=='Asian')]) / len(df[df['Race']=='Asian'])

Out[50]:

0.03327911922932566

In [51]:

len(df[(df['HeartDisease']=='Yes') & (df['Race']=='American Indian/Alaskan Native')]) / len(df[df['Race']=='American Indian/Alaskan Native'])

Out[51]:

0.10439137134052388

In [52]:

len(df[(df['HeartDisease']=='Yes') & (df['Race']=='Other')]) / len(df[df['Race']=='Other'])

Out[52]:

0.08135157469470204

In [53]:

len(df[(df['HeartDisease']=='Yes') & (df['Race']=='Hispanic')]) / len(df[df['Race']=='Hispanic'])

Out[53]:

0.05323348212638802

In [55]:

# American Indian/Alaskan Native과 White의 PhysicalHealth 차이 - 유의미한 차이인지는 모르겠음
import matplotlib.pyplot as plt
plt.figure(figsize=(13,5))
sns.kdeplot(df[df['Race']=='American Indian/Alaskan Native']["PhysicalHealth"], shade=True, label = 'American Indian/Alaskan Native')
sns.kdeplot(df[df['Race']=='White']["PhysicalHealth"], shade=True, label = 'White')
plt.legend()

Out[55]:

<matplotlib.legend.Legend at 0x271d8a71640>

In [56]:

# American Indian/Alaskan Native과 White의 BMI 차이 - American Indian/Alaskan Native의 BMI가 더 높은 경향을 보임
sns.kdeplot(df[df['Race']=='American Indian/Alaskan Native']["BMI"], shade=True, label = 'American Indian/Alaskan Native')
sns.kdeplot(df[df['Race']=='White']["BMI"], shade=True, label = 'White')
plt.legend()

Out[56]:

<matplotlib.legend.Legend at 0x271d87b5fa0>

SleepTime¶

SleepTime과 HeartDisease사이의 관계는 모르겠음 - 적당한 수면시간?

In [58]:

sns.distplot(x=df['SleepTime'])

C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[58]:

<AxesSubplot:ylabel='Density'>

In [59]:

df['SleepTime'].value_counts()

Out[59]:

8.0     90202
7.0     89445
6.0     64655
5.0     19101
9.0     15853
10.0     7782
4.0      7730
12.0     2204
3.0      1992
2.0       788
1.0       551
11.0      415
14.0      243
16.0      236
15.0      189
18.0      102
13.0       97
20.0       64
24.0       30
17.0       21
22.0        9
19.0        3
23.0        3
21.0        2
Name: SleepTime, dtype: int64

In [60]:

sns.boxplot(x=df['SleepTime'])

Out[60]:

<AxesSubplot:xlabel='SleepTime'>

In [61]:

sns.violinplot(df['SleepTime'])

C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

Out[61]:

<AxesSubplot:xlabel='SleepTime'>

In [62]:

sns.countplot(df['SleepTime'], color='lightblue')

C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

Out[62]:

<AxesSubplot:xlabel='SleepTime', ylabel='count'>

In [66]:

sns.countplot(data = df, x = 'SleepTime', hue = 'HeartDisease')

Out[66]:

<AxesSubplot:xlabel='SleepTime', ylabel='count'>

In [69]:

plt.figure(figsize=(9,5))
sns.kdeplot(df[df["HeartDisease"]=='No']["SleepTime"], shade=True, label = 'No HeartDisease')
sns.kdeplot(df[df["HeartDisease"]=='Yes']["SleepTime"], shade=True, label = 'HeartDiseasee')
plt.legend()

Out[69]:

<matplotlib.legend.Legend at 0x271d9ffb340>

BMI¶

BMI가 큰 영향을 미치지는 않지만 심장병 발생한 사람은 그렇지 않은 사람에 비해 BMI 지수가 높음

In [70]:

sns.distplot(x=df['BMI'], kde=True)

C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[70]:

<AxesSubplot:ylabel='Density'>

In [71]:

sns.boxplot(x=df['BMI'])

Out[71]:

<AxesSubplot:xlabel='BMI'>

In [72]:

sns.histplot(data=df, x='BMI', hue='HeartDisease', multiple='stack')

Out[72]:

<AxesSubplot:xlabel='BMI', ylabel='Count'>

In [73]:

sns.distplot(x=df['BMI'], kde=False, bins=100)

C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[73]:

<AxesSubplot:>

In [74]:

sns.kdeplot(df[df["HeartDisease"]=='No']["BMI"], shade=True, label = 'No HeartDisease')
sns.kdeplot(df[df["HeartDisease"]=='Yes']["BMI"], shade=True, label = 'HeartDiseasee')
plt.legend()

Out[74]:

<matplotlib.legend.Legend at 0x271d9c21490>

PhysicalHealth¶

PhysicalHealth 값이 클수록 아프거나 다친 상태, 심장병 있으면 PhysicalHealth 값이 큼

In [75]:

sns.distplot(x=df['PhysicalHealth'])

C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[75]:

<AxesSubplot:ylabel='Density'>

In [76]:

sns.kdeplot(df[df["HeartDisease"]=='No']["PhysicalHealth"], shade=True, label = 'No HeartDisease')
sns.kdeplot(df[df["HeartDisease"]=='Yes']["PhysicalHealth"], shade=True, label = 'HeartDiseasee')
plt.legend()

Out[76]:

<matplotlib.legend.Legend at 0x271d9bc42e0>

MentalHealth¶

In [77]:

sns.distplot(x=df['MentalHealth'])

C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[77]:

<AxesSubplot:ylabel='Density'>

In [78]:

sns.kdeplot(df[df["HeartDisease"]=='No']["MentalHealth"], shade=True, label = 'No HeartDisease')
sns.kdeplot(df[df["HeartDisease"]=='Yes']["MentalHealth"], shade=True, label = 'HeartDiseasee')
plt.legend()

Out[78]:

<matplotlib.legend.Legend at 0x271dbd9b430>

성별 정보¶

여성의 수가 더 많았으며 남성이 HeartDisease 가질 확률이 더 높음, 심장병 환자의 58%가 남성

In [79]:

sns.histplot(data=df, x='Sex', hue='HeartDisease', shrink=0.5, multiple='stack')

Out[79]:

<AxesSubplot:xlabel='Sex', ylabel='Count'>

In [80]:

len(df[(df['HeartDisease']=='Yes') & (df['Sex']=='Male')]) / len(df[df['Sex']=='Male'])

Out[80]:

0.11297748616645312

In [81]:

len(df[(df['HeartDisease']=='Yes') & (df['Sex']=='Female')]) / len(df[df['Sex']=='Female'])

Out[81]:

0.07022565149588841

In [82]:

len(df[(df['HeartDisease']=='Yes') & (df['Sex']=='Male')]) / len(df[df['HeartDisease']=='Yes'])

Out[82]:

0.5886797989802282

왜 남성에서 더 많이 발생될까?

BMI에서는 유의미한 차이 모르겠음

In [83]:

plt.figure(figsize=(13,5))
sns.kdeplot(df[df["Sex"]=='Female']["BMI"], shade=True, label = 'Female')
sns.kdeplot(df[df["Sex"]=='Male']["BMI"], shade=True, label = 'Male')
plt.legend()

Out[83]:

<matplotlib.legend.Legend at 0x271db945760>

In [84]:

plt.figure(figsize=(13,5))
sns.distplot(df[df["Sex"]=='Female']["BMI"], kde=False,bins=100, label = 'Female')
sns.distplot(df[df["Sex"]=='Male']["BMI"], kde=False, bins=100,label = 'Male')
plt.legend()

C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[84]:

<matplotlib.legend.Legend at 0x271db8df520>

In [85]:

sns.barplot(x="Sex", y="BMI", hue='HeartDisease',data=df)

Out[85]:

<AxesSubplot:xlabel='Sex', ylabel='BMI'>

In [87]:

'''
df_size = df.pivot_table(
    index="Smoking", columns="Sex", aggfunc="size")
    sns.heatmap(df_size, cmap=sns.light_palette(
    "gray", as_cmap=True), annot=True, fmt="d")
plt.title("Heatmap")
plt.show()
'''

Out[87]:

'\ndf_size = df.pivot_table(\n    index="Smoking", columns="Sex", aggfunc="size")\n    sns.heatmap(df_size, cmap=sns.light_palette(\n    "gray", as_cmap=True), annot=True, fmt="d")\nplt.title("Heatmap")\nplt.show()\n'

In [88]:

#df_pivot = df.pivot("Smoking", "Sex", "BMI")
#sns.heatmap(df_pivot, annot=True, fmt="d", linewidths=1)
#sns.catplot(x="Sex", y="Smoking", col="HeartDisease", data=df, kind="bar")

In [86]:

df_female = df[df["Sex"]=='Female']
df_male = df[df["Sex"]=='Male']

남자의 흡연 비율이 더 높음 -> 심장병에 영향 가능

In [89]:

fig, ax = plt.subplots(ncols=2)
sns.set_context("paper", rc={"font.size":20,
                             "axes.titlesize":20,
                             "axes.labelsize":20},
                font_scale = 1.0)  

sns.histplot(data = df_female, x='Smoking', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[0])
sns.histplot(data = df_male, x='Smoking', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[1])

fig.tight_layout()

In [90]:

sns.catplot(data= df, x = "Sex", col= "Smoking", hue='HeartDisease', col_wrap = 3, kind="count")

Out[90]:

<seaborn.axisgrid.FacetGrid at 0x271dbe218e0>

In [91]:

#ax = sns.histplot(data=df, x='Smoking', hue='Sex', stat='probability', multiple='stack', shrink=0.5)
#for p in ax.patches:
#    left, bottom, width, height = p.get_bbox().bounds
#    ax.annotate("%.1f"%(height*100), xy=(left+width/2, bottom+height/2), ha='center', va='center')

In [92]:

#sns.set_theme(style='darkgrid')
#ax = sns.histplot(data=df, x='Smoking', hue='Sex', stat='probability', multiple='fill', shrink=0.8)
#for p in ax.patches:
#    left, bottom, width, height = p.get_bbox().bounds
#    ax.annotate("%.1f"%(height*100), xy=(left+width/2, bottom+height/2), ha='center', va='center')

In [93]:

#sns.set_context("paper", font_scale = 1.5, rc={'axes.labelsize': 17.6,
# 'axes.titlesize': 19.200000000000003,
# 'font.size': 19.200000000000003})

In [94]:

fig, ax = plt.subplots(ncols=2)

sns.histplot(data = df_female, x='AlcoholDrinking', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[0])
sns.histplot(data = df_male, x='AlcoholDrinking', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[1])

fig.tight_layout()

남자의 뇌졸중 발생 비율이 더 높음 -> 심장병에 영향 가능

In [94]:

fig, ax = plt.subplots(ncols=2)

sns.histplot(data = df_female, x='Stroke', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[0])
sns.histplot(data = df_male, x='Stroke', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[1])

fig.tight_layout()

In [95]:

fig, ax = plt.subplots(ncols=2)

sns.histplot(data = df_female, x='DiffWalking', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[0])
sns.histplot(data = df_male, x='DiffWalking', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[1])

fig.tight_layout()

남자의 KidneyDisease 비율이 더 높음 -> 심장병에 영향 가능

In [96]:

fig, ax = plt.subplots(ncols=2)

sns.histplot(data = df_female, x='KidneyDisease', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[0])
sns.histplot(data = df_male, x='KidneyDisease', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[1])

fig.tight_layout()

남자의 피부암 비율이 더 높음 -> 심장병에 영향 가능

In [97]:

fig, ax = plt.subplots(ncols=2)

sns.histplot(data = df_female, x='SkinCancer', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[0])
sns.histplot(data = df_male, x='SkinCancer', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[1])

fig.tight_layout()

In [98]:

df.loc[df['Diabetic']=='Yes (during pregnancy)', 'Diabetic'] = 'Yes'

In [99]:

df['Diabetic'].value_counts()

Out[99]:

No                         251796
Yes                         43145
No, borderline diabetes      6776
Name: Diabetic, dtype: int64

In [100]:

df_female = df[df["Sex"]=='Female']
df_male = df[df["Sex"]=='Male']

남자의 당뇨 비율이 더 높음 -> 심장병에 영향 가능

In [102]:

fig, ax = plt.subplots(ncols=2)

sns.histplot(data = df_female, x='Diabetic', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[0])
sns.histplot(data = df_male, x='Diabetic', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[1])

fig.tight_layout()

In [103]:

fig, ax = plt.subplots(ncols=2)

sns.histplot(data = df_female, x='Asthma', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[0])
sns.histplot(data = df_male, x='Asthma', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[1])

fig.tight_layout()

In [104]:

fig, ax = plt.subplots(ncols=2)

sns.histplot(data = df_female, x='PhysicalActivity', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[0])
sns.histplot(data = df_male, x='PhysicalActivity', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[1])

fig.tight_layout()

In [105]:

fig, ax = plt.subplots(ncols=2)

sns.histplot(data = df_female, x='GenHealth', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[0])
sns.histplot(data = df_male, x='GenHealth', hue='HeartDisease',stat='probability', multiple='stack', shrink=0.8,ax=ax[1])

fig.tight_layout()

In [106]:

plt.figure(figsize=(13,5))
sns.kdeplot(df[df["Sex"]=='Female']["PhysicalHealth"], shade=True, label = 'Female')
sns.kdeplot(df[df["Sex"]=='Male']["PhysicalHealth"], shade=True, label = 'Male')
plt.legend()

Out[106]:

<matplotlib.legend.Legend at 0x271dbe84850>

In [107]:

plt.figure(figsize=(13,5))
sns.kdeplot(df[df["Sex"]=='Female']["MentalHealth"], shade=True, label = 'Female')
sns.kdeplot(df[df["Sex"]=='Male']["MentalHealth"], shade=True, label = 'Male')
plt.legend()

Out[107]:

<matplotlib.legend.Legend at 0x271c062f040>

AgeCategory 정보¶

In [108]:

import matplotlib.pyplot as plt
plt.xticks(rotation=45)
sns.histplot(sorted(df["AgeCategory"]))

Out[108]:

<AxesSubplot:ylabel='Count'>

In [109]:

sns.histplot(df[df["HeartDisease"]=='Yes']["AgeCategory"])

Out[109]:

<AxesSubplot:xlabel='AgeCategory', ylabel='Count'>

In [110]:

plt.xticks(rotation=45)
sns.histplot(data=df, x='AgeCategory', hue='HeartDisease', multiple='stack', shrink=1)

Out[110]:

<AxesSubplot:xlabel='AgeCategory', ylabel='Count'>

In [111]:

df['Age_temp'] = df['AgeCategory'].map(lambda x:x.split('-'))
df['Age_temp'] = df['Age_temp'].map(lambda x : x[0][:2] if len(x)==1 else (int(x[0])+int(x[1]))/2)
df['Age_temp'] = df['Age_temp'].astype('int64')

In [112]:

df['Age_temp'].value_counts()

Out[112]:

67    31670
62    31219
72    29273
57    27610
52    23736
80    23352
77    20713
47    20518
21    19998
42    19837
37    19526
32    17953
27    16312
Name: Age_temp, dtype: int64

In [113]:

sns.histplot(data=df, x='Age_temp', hue='HeartDisease', multiple='stack')

Out[113]:

<AxesSubplot:xlabel='Age_temp', ylabel='Count'>

In [114]:

sns.distplot(df["Age_temp"], kde=False)

C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[114]:

<AxesSubplot:xlabel='Age_temp'>

나이 많을수록 심장병 발생률 증가

In [115]:

sns.distplot(df[df["HeartDisease"]=='Yes']["Age_temp"], kde=True)

Out[115]:

<AxesSubplot:xlabel='Age_temp', ylabel='Density'>

In [116]:

sns.kdeplot(df[df["HeartDisease"]=='Yes']["Age_temp"])

Out[116]:

<AxesSubplot:xlabel='Age_temp', ylabel='Density'>

In [117]:

sns.distplot(df[df["HeartDisease"]=='Yes']["Age_temp"])
sns.distplot(df[df["HeartDisease"]=='No']["Age_temp"])

C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[117]:

<AxesSubplot:xlabel='Age_temp', ylabel='Density'>

In [118]:

fig, ax = plt.subplots(nrows=2)
sns.distplot(df[df["HeartDisease"]=='No']["Age_temp"], kde=False, ax=ax[0])
sns.distplot(df[df["HeartDisease"]=='Yes']["Age_temp"],kde=False, ax=ax[1])

C:\Users\hyomi\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[118]:

<AxesSubplot:xlabel='Age_temp'>

In [119]:

plt.figure(figsize=(13,5))
sns.kdeplot(df[df["HeartDisease"]=='No']["Age_temp"], shade=True, label = 'No HeartDisease')
sns.kdeplot(df[df["HeartDisease"]=='Yes']["Age_temp"], shade=True, label = 'HeartDiseasee')
plt.legend()

Out[119]:

<matplotlib.legend.Legend at 0x271dbe97820>

왜 나이가 많을수록 심장병 많이?
나이가 많을수록 여러 질병 많이 걸리지 않았을까?

In [120]:

fig, ax = plt.subplots(figsize = (14,6))
sns.kdeplot(df[df["HeartDisease"]=='Yes']["Age_temp"], shade = False, label="HeartDisease", ax = ax)
sns.kdeplot(df[df["KidneyDisease"]=='Yes']["Age_temp"], shade = False, label="KidneyDisease", ax = ax) # 신부전
sns.kdeplot(df[df["SkinCancer"]=='Yes']["Age_temp"], shade = False, label="SkinCancer", ax = ax)
sns.kdeplot(df[df["Diabetic"]=='Yes']["Age_temp"], shade = False, label="Diabetic", ax = ax)
plt.legend()

Out[120]:

<matplotlib.legend.Legend at 0x271da978b50>

상관관계 heatmap¶

In [121]:

import pandas as pd
df = pd.read_csv('heart_2020_cleaned.csv')

In [122]:

categorical_col = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']
for col in categorical_col:
    df[col] = df[col].map({'Yes': 1, 'No': 0})
df[categorical_col] = df[categorical_col].astype('int64')
df['Sex'] = df['Sex'].map({'Female': 1, 'Male': 0})
df['GenHealth'] = df['GenHealth'].map({'Poor': 1, 'Fair': 2, 'Good' : 3, 'Very good' : 4, 'Excellent' : 5})
df[['Sex', 'GenHealth']] = df[['Sex', 'GenHealth']].astype('int64')

df.loc[df['Diabetic']=='Yes (during pregnancy)', 'Diabetic'] = 'Yes'
df['Age_temp'] = df['AgeCategory'].map(lambda x:x.split('-'))
df['Age_temp'] = df['Age_temp'].map(lambda x : x[0][:2] if len(x)==1 else (int(x[0])+int(x[1]))/2)
df['Age_temp'] = df['Age_temp'].astype('int64')
#race, diabetic

In [123]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  int64  
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  int64  
 3   AlcoholDrinking   319795 non-null  int64  
 4   Stroke            319795 non-null  int64  
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  int64  
 8   Sex               319795 non-null  int64  
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  int64  
 13  GenHealth         319795 non-null  int64  
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  int64  
 16  KidneyDisease     319795 non-null  int64  
 17  SkinCancer        319795 non-null  int64  
 18  Age_temp          319795 non-null  int64  
dtypes: float64(4), int64(12), object(3)
memory usage: 46.4+ MB

In [124]:

df_temp = df.drop('AgeCategory', axis=1, inplace=False)

In [130]:

import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 10))
sns.set(font_scale=.8)
sns.heatmap(data = df_temp.corr(), square = True, linecolor = "white",cmap = plt.cm.PuBu, annot = True)

Out[130]:

<AxesSubplot:>

AudoEda - dataprep¶

In [131]:

import warnings
warnings.filterwarnings("ignore")
from dataprep.eda import plot, plot_correlation, create_report, plot_missing

In [118]:

plot(df_temp)

  0%|                                                                                          | 0/648 [00:00<…

Out[118]:

DataPrep.EDA Report

Stats and Insights

Dataset Statistics

Number of Variables	18
Number of Rows	319795
Missing Cells	0
Missing Cells (%)	0.0%
Duplicate Rows	18094
Duplicate Rows (%)	5.7%
Total Size in Memory	76.3 MB
Average Row Size in Memory	250.2 B
Variable Types	Categorical: 13 Numerical: 5

Dataset Insights

PhysicalHealth and MentalHealth have similar distributions	Similar Distribution
BMI is skewed	Skewed
PhysicalHealth is skewed	Skewed
MentalHealth is skewed	Skewed
SleepTime is skewed	Skewed
Dataset has 18094 (5.66%) duplicate rows	Duplicates
HeartDisease has constant length 1	Constant Length
Smoking has constant length 1	Constant Length
AlcoholDrinking has constant length 1	Constant Length
Stroke has constant length 1	Constant Length

Dataset Insights

DiffWalking has constant length 1	Constant Length
Sex has constant length 1	Constant Length
PhysicalActivity has constant length 1	Constant Length
GenHealth has constant length 1	Constant Length
Asthma has constant length 1	Constant Length
KidneyDisease has constant length 1	Constant Length
SkinCancer has constant length 1	Constant Length
PhysicalHealth has 226589 (70.85%) zeros	Zeros
MentalHealth has 205401 (64.23%) zeros	Zeros

1
2

Number of plots per page:

In [23]:

create_report(df_temp)

  0%|                                                                                         | 0/2261 [00:00<…

error happended in column:HeartDisease

---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_29784/2537952997.py in <module>
----> 1 create_report(df_temp)

~\anaconda3\lib\site-packages\dataprep\eda\create_report\__init__.py in create_report(df, config, display, title, mode, progress)
     66         "resources": INLINE.render(),
     67         "title": title,
---> 68         "components": format_report(df, cfg, mode, progress),
     69     }
     70     template_base = ENV_LOADER.get_template("base.html")

~\anaconda3\lib\site-packages\dataprep\eda\create_report\formatter.py in format_report(df, cfg, mode, progress)
     76         if mode == "basic":
     77             edaframe = EDAFrame(df)
---> 78             comps = format_basic(edaframe, cfg)
     79         # elif mode == "full":
     80         #     comps = format_full(df)

~\anaconda3\lib\site-packages\dataprep\eda\create_report\formatter.py in format_basic(df, cfg)
    289 
    290     res_overview = _format_overview(data, cfg)
--> 291     res_variables = _format_variables(df, cfg, data)
    292     res_interaction = _format_interaction(data, cfg)
    293     res_correlations = _format_correlation(data, cfg)

~\anaconda3\lib\site-packages\dataprep\eda\create_report\formatter.py in _format_variables(df, cfg, data)
    118                 raise RuntimeError(f"the type of column {col} is unknown: {type(dtp)}")
    119 
--> 120             rndrd = render(itmdt, cfg)
    121             layout = rndrd["layout"]
    122             figs_var: List[Figure] = []

~\anaconda3\lib\site-packages\dataprep\eda\distribution\render.py in render(itmdt, cfg)
   2471         visual_elem = render_distribution_grid(itmdt, cfg)
   2472     elif itmdt.visual_type == "categorical_column":
-> 2473         visual_elem = render_cat(itmdt, cfg)
   2474     elif itmdt.visual_type == "geography_column":
   2475         visual_elem = render_geo(itmdt, cfg)

~\anaconda3\lib\site-packages\dataprep\eda\distribution\render.py in render_cat(itmdt, cfg)
   1591     if cfg.wordcloud.enable:
   1592         if data["nuniq_words_cloud"] > 0:
-> 1593             tabs.append(wordcloud_viz(data["word_cnts_cloud"], plot_width, plot_height))
   1594             htgs["Word Cloud"] = cfg.wordcloud.how_to_guide(plot_height, plot_width)
   1595     if cfg.wordfreq.enable:

~\anaconda3\lib\site-packages\dataprep\eda\distribution\render.py in wordcloud_viz(word_cnts, plot_width, plot_height)
    140     ellipse_mask = np.load(f"{Path(__file__).parent.parent.parent}/assets/ellipse.npz").get("image")
    141     wordcloud = WordCloud(background_color="white", mask=ellipse_mask)
--> 142     wordcloud.generate_from_frequencies(word_cnts)
    143     wcarr = wordcloud.to_array().astype(np.uint8)
    144 

~\anaconda3\lib\site-packages\wordcloud\wordcloud.py in generate_from_frequencies(self, frequencies, max_font_size)
    444                 font_size = self.height
    445             else:
--> 446                 self.generate_from_frequencies(dict(frequencies[:2]),
    447                                                max_font_size=self.height)
    448                 # find font sizes

~\anaconda3\lib\site-packages\wordcloud\wordcloud.py in generate_from_frequencies(self, frequencies, max_font_size)
    494             while True:
    495                 # try to find a position
--> 496                 font = ImageFont.truetype(self.font_path, font_size)
    497                 # transpose font optionally
    498                 transposed_font = ImageFont.TransposedFont(

~\anaconda3\lib\site-packages\PIL\ImageFont.py in truetype(font, size, index, encoding, layout_engine)
    853 
    854     try:
--> 855         return freetype(font)
    856     except OSError:
    857         if not isPath(font):

~\anaconda3\lib\site-packages\PIL\ImageFont.py in freetype(font)
    850 
    851     def freetype(font):
--> 852         return FreeTypeFont(font, size, index, encoding, layout_engine)
    853 
    854     try:

~\anaconda3\lib\site-packages\PIL\ImageFont.py in __init__(self, font, size, index, encoding, layout_engine)
    185         if layout_engine not in (LAYOUT_BASIC, LAYOUT_RAQM):
    186             layout_engine = LAYOUT_BASIC
--> 187             if core.HAVE_RAQM:
    188                 layout_engine = LAYOUT_RAQM
    189         elif layout_engine == LAYOUT_RAQM and not core.HAVE_RAQM:

~\anaconda3\lib\site-packages\PIL\ImageFont.py in __getattr__(self, id)
     42     # module placeholder
     43     def __getattr__(self, id):
---> 44         raise ImportError("The _imagingft C module is not installed")
     45 
     46 

ImportError: The _imagingft C module is not installed

In [119]:

plot_correlation(df_temp)

100%|############################################################################################| 4/4 [00:00<…

Out[119]:

DataPrep.EDA Report

Stats Pearson Spearman KendallTau

	Pearson	Spearman	KendallTau
Highest Positive Correlation	0.428	0.355	0.333
Highest Negative Correlation	-0.483	-0.408	-0.357
Lowest Correlation	0.0	0.0	0.0
Mean Correlation	0.014	0.013	0.012

'height': 400

Height of the plot

'width': 400

Width of the plot

Most positive correlated: (PhysicalHealth, DiffWalking)
Most negative correlated: (PhysicalHealth, GenHealth)
Least correlated: (Asthma, SkinCancer)

'height': 400

Height of the plot

'width': 400

Width of the plot

Most positive correlated: (PhysicalHealth, DiffWalking)
Most negative correlated: (PhysicalHealth, GenHealth)
Least correlated: (Asthma, SkinCancer)

'height': 400

Height of the plot

'width': 400

Width of the plot

Most positive correlated: (PhysicalHealth, DiffWalking)
Most negative correlated: (PhysicalHealth, GenHealth)
Least correlated: (Asthma, SkinCancer)

In [ ]:

plot_missing(df_temp)

In [ ]:

plot(df, 'BMI', 'SleepTime')