#Importamos las librerias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df=sns.load_dataset('iris')
df
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
df.shape
(150, 5)
#FacetGrid - sepal_length vs sepal_width
sns.FacetGrid(df,hue = 'species' , size = 5).map(plt.scatter,'sepal_length','sepal_width').add_legend();
plt.show()
/usr/local/lib/python3.7/dist-packages/seaborn/axisgrid.py:337: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
#FacetGrid - sepal_length vs petal_length
sns.FacetGrid(df,hue = 'species' , size = 5).map(plt.scatter,'sepal_length','petal_length').add_legend();
plt.show()
/usr/local/lib/python3.7/dist-packages/seaborn/axisgrid.py:337: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
#FacetGrid - sepal_length vs petal_width
sns.FacetGrid(df,hue = 'species' , size = 5).map(plt.scatter,'sepal_length','petal_width').add_legend();
plt.show()
/usr/local/lib/python3.7/dist-packages/seaborn/axisgrid.py:337: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
#FacetGrid - sepal_width vs petal_length
sns.FacetGrid(df,hue = 'species' , size = 5).map(plt.scatter,'sepal_width','petal_length').add_legend();
plt.show()
/usr/local/lib/python3.7/dist-packages/seaborn/axisgrid.py:337: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
#FacetGrid - sepal_width vs petal_width
sns.FacetGrid(df,hue = 'species' , size = 5).map(plt.scatter,'sepal_width','petal_width').add_legend();
plt.show()
/usr/local/lib/python3.7/dist-packages/seaborn/axisgrid.py:337: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
#FacetGrid - petal_length vs petal_width
sns.FacetGrid(df,hue = 'species' , size = 5).map(plt.scatter,'petal_length','petal_width').add_legend();
plt.show()
/usr/local/lib/python3.7/dist-packages/seaborn/axisgrid.py:337: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
#Pairplot
sns.pairplot(df,hue = 'species' , size = 3 , palette = 'Set2')
/usr/local/lib/python3.7/dist-packages/seaborn/axisgrid.py:2076: UserWarning: The `size` parameter has been renamed to `height`; please update your code. warnings.warn(msg, UserWarning)
<seaborn.axisgrid.PairGrid at 0x7ff538df8290>
#Importamos las librerias
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
url='https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
df =pd.read_csv(url,sep=',')
df.columns= ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Outcome']
df.head()
Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
1 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
2 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
3 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
4 | 5 | 116 | 74 | 0 | 0 | 25.6 | 0.201 | 30 | 0 |
#Reemplazar el Outcome de 1 a 'Diab' y 0 a 'Non-Diab'
df.Outcome = df.Outcome.replace({0:'Non-Diab',1:'Diab'})
df.head()
Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | Non-Diab |
1 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | Diab |
2 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | Non-Diab |
3 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | Diab |
4 | 5 | 116 | 74 | 0 | 0 | 25.6 | 0.201 | 30 | Non-Diab |
#Renombrar la columna DiabetesPedigreeFunction a DPF
df.DiabetesPedigreeFunction = df.rename({'DiabetesPedigreeFunction':'DPF'},inplace = True,axis =1)
df.head()
Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DPF | Age | Outcome | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | Non-Diab |
1 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | Diab |
2 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | Non-Diab |
3 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | Diab |
4 | 5 | 116 | 74 | 0 | 0 | 25.6 | 0.201 | 30 | Non-Diab |
#Verificamos los tipos de datos
df.dtypes
Pregnancies int64 Glucose int64 BloodPressure int64 SkinThickness int64 Insulin int64 BMI float64 DPF float64 Age int64 Outcome object dtype: object
#Convertir a la variable: DPF en Integer y verificar su correcta conversion
df["DPF"] = pd.to_numeric(df["DPF"])
#Validacion cambio de dato
df.dtypes
Pregnancies int64 Glucose int64 BloodPressure int64 SkinThickness int64 Insulin int64 BMI float64 DPF float64 Age int64 Outcome object dtype: object
#Shape
df.shape
(767, 9)
#Info
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 767 entries, 0 to 766 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pregnancies 767 non-null int64 1 Glucose 767 non-null int64 2 BloodPressure 767 non-null int64 3 SkinThickness 767 non-null int64 4 Insulin 767 non-null int64 5 BMI 767 non-null float64 6 DPF 767 non-null float64 7 Age 767 non-null int64 8 Outcome 767 non-null object dtypes: float64(2), int64(6), object(1) memory usage: 54.1+ KB
#Estadisticos básicos
df.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Pregnancies | 767.0 | 3.842243 | 3.370877 | 0.000 | 1.0000 | 3.000 | 6.000 | 17.00 |
Glucose | 767.0 | 120.859192 | 31.978468 | 0.000 | 99.0000 | 117.000 | 140.000 | 199.00 |
BloodPressure | 767.0 | 69.101695 | 19.368155 | 0.000 | 62.0000 | 72.000 | 80.000 | 122.00 |
SkinThickness | 767.0 | 20.517601 | 15.954059 | 0.000 | 0.0000 | 23.000 | 32.000 | 99.00 |
Insulin | 767.0 | 79.903520 | 115.283105 | 0.000 | 0.0000 | 32.000 | 127.500 | 846.00 |
BMI | 767.0 | 31.990482 | 7.889091 | 0.000 | 27.3000 | 32.000 | 36.600 | 67.10 |
DPF | 767.0 | 0.471674 | 0.331497 | 0.078 | 0.2435 | 0.371 | 0.625 | 2.42 |
Age | 767.0 | 33.219035 | 11.752296 | 21.000 | 24.0000 | 29.000 | 41.000 | 81.00 |
#Crear un Pairplot del dataset
plt.figure(dpi=120)
sns.pairplot(df)
plt.show()
<Figure size 720x480 with 0 Axes>
#Crear el Pairplot pero ahora con el atributo hue = 'Outcome'. Las leyendas deberán decir: 'Non Diabetic','Diabetic'
plt.figure(dpi = 120)
sns.pairplot(df,hue = 'Outcome',palette = 'plasma')
plt.legend(['Non Diabetic','Diabetic'])
plt.show()
<Figure size 720x480 with 0 Axes>
#Correlaciones
plt.figure(dpi = 120,figsize= (5,4))
mask = np.triu(np.ones_like(df.corr(),dtype = bool))
sns.heatmap(df.corr(),mask = mask, fmt = ".2f",annot=True,lw=1,cmap = 'plasma')
plt.yticks(rotation = 0)
plt.xticks(rotation = 90)
plt.title('Correlation Heatmap')
plt.show()
#JoinPlot
plt.figure(dpi = 100, figsize = (5,4))
print("Joint plot de Glucosa con otras variables ==> \n")
for i in df.columns:
if i != 'Glucose' and i != 'Outcome':
print(f"Correlacion entre Glucose y {i} ==> ",df.corr().loc['Glucose'][i])
sns.jointplot(x='Glucose',y=i,data=df,kind = 'scatter',color = 'purple')
plt.show()
Joint plot de Glucosa con otras variables ==> Correlacion entre Glucose y Pregnancies ==> 0.12884571831523273
<Figure size 500x400 with 0 Axes>
Correlacion entre Glucose y BloodPressure ==> 0.15249786357400735
Correlacion entre Glucose y SkinThickness ==> 0.05638112882079316
Correlacion entre Glucose y Insulin ==> 0.33238342971312745
Correlacion entre Glucose y BMI ==> 0.22095509998436502
Correlacion entre Glucose y DPF ==> 0.13690295573006997
Correlacion entre Glucose y Age ==> 0.2624081766852211
Insight: La glucosa muestra una asociación lineal débil positiva con las otras variables del conjunto de datos. Eso significa que al aumentar el nivel de glucosa en los pacientes, también aumentarán las otras variables.
#Crear el join plot entre BloodPressure y el resto de las variables con su correspondiente interpretación.
col = list(df.columns)
idx = col.index('BloodPressure')
plt.figure(dpi = 100, figsize = (5,4))
print("Joint plot de BloodPressure con otras variables ==> \n")
for i in range(idx+1,len(col)-1):
print(f"Correlacion entre BloodPressure y {col[i]} ==> ",df.corr().loc['BloodPressure'][col[i]])
sns.jointplot(x='BloodPressure',y=col[i],data=df,kind = 'scatter',color = 'green')
plt.show()
Joint plot de BloodPressure con otras variables ==> Correlacion entre BloodPressure y SkinThickness ==> 0.207307826179924
<Figure size 500x400 with 0 Axes>
Correlacion entre BloodPressure y Insulin ==> 0.08909775002967421
Correlacion entre BloodPressure y BMI ==> 0.2817772447500641
Correlacion entre BloodPressure y DPF ==> 0.04118003922426516
Correlacion entre BloodPressure y Age ==> 0.23957101390158442
Insight: BloodPressure muestra una asociación lineal débil positiva con las otras variables del conjunto de datos dado. Eso significa que al aumentar el nivel de BP en los pacientes, también aumentarán el resto de las variables.
#Crear el join plot entre SkinThickness y el resto de las variables con su correspondiente interpretación.
col = list(df.columns)
idx = col.index('SkinThickness')
plt.figure(dpi = 100, figsize = (5,4))
print("Joint plot de SkinThickness con otras variables ==> \n")
for i in range(idx+1,len(col)-1):
print(f"Correlacion entre SkinThickness y {col[i]} ==> ",df.corr().loc['SkinThickness'][col[i]])
sns.jointplot(x='SkinThickness',y=col[i],data=df,kind = 'reg',color = 'blue')
plt.show()
Joint plot de SkinThickness con otras variables ==> Correlacion entre SkinThickness y Insulin ==> 0.4379742769264145
<Figure size 500x400 with 0 Axes>
Correlacion entre SkinThickness y BMI ==> 0.39255322579417173
Correlacion entre SkinThickness y DPF ==> 0.18349813612676866
Correlacion entre SkinThickness y Age ==> -0.1158728591822747
Insight: SkinThickness muestra una asociación lineal débil positiva con el resto de las variables, excepto con la edad. Eso significa que al aumentar el SkinThickness en los pacientes, las otras variables también aumentarán. SkinThickness con la edad muestran una correlación negativa débil es decir, al aumentar SkinThickness, la edad disminuirá.