Ejemplo 1¶

In [1]:

#Importamos las librerias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:

df=sns.load_dataset('iris')
df

Out[4]:

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa
...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	virginica
146	6.3	2.5	5.0	1.9	virginica
147	6.5	3.0	5.2	2.0	virginica
148	6.2	3.4	5.4	2.3	virginica
149	5.9	3.0	5.1	1.8	virginica

150 rows × 5 columns

In [5]:

df.shape

Out[5]:

(150, 5)

In [6]:

#FacetGrid - sepal_length vs sepal_width
sns.FacetGrid(df,hue = 'species' , size = 5).map(plt.scatter,'sepal_length','sepal_width').add_legend();
plt.show()

/usr/local/lib/python3.7/dist-packages/seaborn/axisgrid.py:337: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

In [7]:

#FacetGrid - sepal_length vs petal_length
sns.FacetGrid(df,hue = 'species' , size = 5).map(plt.scatter,'sepal_length','petal_length').add_legend();
plt.show()

/usr/local/lib/python3.7/dist-packages/seaborn/axisgrid.py:337: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

In [8]:

#FacetGrid - sepal_length vs petal_width
sns.FacetGrid(df,hue = 'species' , size = 5).map(plt.scatter,'sepal_length','petal_width').add_legend();
plt.show()

/usr/local/lib/python3.7/dist-packages/seaborn/axisgrid.py:337: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

In [9]:

#FacetGrid - sepal_width vs petal_length
sns.FacetGrid(df,hue = 'species' , size = 5).map(plt.scatter,'sepal_width','petal_length').add_legend();
plt.show()

/usr/local/lib/python3.7/dist-packages/seaborn/axisgrid.py:337: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

In [10]:

#FacetGrid  - sepal_width vs petal_width
sns.FacetGrid(df,hue = 'species' , size = 5).map(plt.scatter,'sepal_width','petal_width').add_legend();
plt.show()

/usr/local/lib/python3.7/dist-packages/seaborn/axisgrid.py:337: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

In [11]:

#FacetGrid - petal_length vs petal_width
sns.FacetGrid(df,hue = 'species' , size = 5).map(plt.scatter,'petal_length','petal_width').add_legend();
plt.show()

/usr/local/lib/python3.7/dist-packages/seaborn/axisgrid.py:337: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

In [12]:

#Pairplot
sns.pairplot(df,hue = 'species' , size = 3 , palette = 'Set2')

/usr/local/lib/python3.7/dist-packages/seaborn/axisgrid.py:2076: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)

Out[12]:

<seaborn.axisgrid.PairGrid at 0x7ff538df8290>

Ejemplo 2¶

In [13]:

#Importamos las librerias
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')

In [25]:

url='https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
df =pd.read_csv(url,sep=',')
df.columns= ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Outcome']
df.head()

Out[25]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	1	85	66	29	0	26.6	0.351	31	0
1	8	183	64	0	0	23.3	0.672	32	1
2	1	89	66	23	94	28.1	0.167	21	0
3	0	137	40	35	168	43.1	2.288	33	1
4	5	116	74	0	0	25.6	0.201	30	0

In [26]:

#Reemplazar el Outcome de 1 a 'Diab' y 0 a 'Non-Diab'
df.Outcome = df.Outcome.replace({0:'Non-Diab',1:'Diab'})
df.head()

Out[26]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	1	85	66	29	0	26.6	0.351	31	Non-Diab
1	8	183	64	0	0	23.3	0.672	32	Diab
2	1	89	66	23	94	28.1	0.167	21	Non-Diab
3	0	137	40	35	168	43.1	2.288	33	Diab
4	5	116	74	0	0	25.6	0.201	30	Non-Diab

In [27]:

#Renombrar la columna DiabetesPedigreeFunction a DPF
df.DiabetesPedigreeFunction = df.rename({'DiabetesPedigreeFunction':'DPF'},inplace = True,axis =1)
df.head()

Out[27]:

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DPF	Age	Outcome
0	1	85	66	29	0	26.6	0.351	31	Non-Diab
1	8	183	64	0	0	23.3	0.672	32	Diab
2	1	89	66	23	94	28.1	0.167	21	Non-Diab
3	0	137	40	35	168	43.1	2.288	33	Diab
4	5	116	74	0	0	25.6	0.201	30	Non-Diab

In [28]:

#Verificamos los tipos de datos
df.dtypes

Out[28]:

Pregnancies        int64
Glucose            int64
BloodPressure      int64
SkinThickness      int64
Insulin            int64
BMI              float64
DPF              float64
Age                int64
Outcome           object
dtype: object

In [29]:

#Convertir a la variable: DPF en Integer y verificar su correcta conversion
df["DPF"] = pd.to_numeric(df["DPF"])

In [30]:

#Validacion cambio de dato
df.dtypes

Out[30]:

Pregnancies        int64
Glucose            int64
BloodPressure      int64
SkinThickness      int64
Insulin            int64
BMI              float64
DPF              float64
Age                int64
Outcome           object
dtype: object

In [31]:

#Shape
df.shape

Out[31]:

(767, 9)

In [32]:

#Info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 767 entries, 0 to 766
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pregnancies    767 non-null    int64  
 1   Glucose        767 non-null    int64  
 2   BloodPressure  767 non-null    int64  
 3   SkinThickness  767 non-null    int64  
 4   Insulin        767 non-null    int64  
 5   BMI            767 non-null    float64
 6   DPF            767 non-null    float64
 7   Age            767 non-null    int64  
 8   Outcome        767 non-null    object 
dtypes: float64(2), int64(6), object(1)
memory usage: 54.1+ KB

In [33]:

#Estadisticos básicos
df.describe().T

Out[33]:

	count	mean	std	min	25%	50%	75%	max
Pregnancies	767.0	3.842243	3.370877	0.000	1.0000	3.000	6.000	17.00
Glucose	767.0	120.859192	31.978468	0.000	99.0000	117.000	140.000	199.00
BloodPressure	767.0	69.101695	19.368155	0.000	62.0000	72.000	80.000	122.00
SkinThickness	767.0	20.517601	15.954059	0.000	0.0000	23.000	32.000	99.00
Insulin	767.0	79.903520	115.283105	0.000	0.0000	32.000	127.500	846.00
BMI	767.0	31.990482	7.889091	0.000	27.3000	32.000	36.600	67.10
DPF	767.0	0.471674	0.331497	0.078	0.2435	0.371	0.625	2.42
Age	767.0	33.219035	11.752296	21.000	24.0000	29.000	41.000	81.00

In [34]:

#Crear un Pairplot del dataset
plt.figure(dpi=120)
sns.pairplot(df)
plt.show()

<Figure size 720x480 with 0 Axes>

In [35]:

#Crear el Pairplot pero ahora con el atributo hue = 'Outcome'. Las leyendas deberán decir: 'Non Diabetic','Diabetic'
plt.figure(dpi = 120)
sns.pairplot(df,hue = 'Outcome',palette = 'plasma')
plt.legend(['Non Diabetic','Diabetic'])
plt.show()

<Figure size 720x480 with 0 Axes>

In [36]:

#Correlaciones
plt.figure(dpi = 120,figsize= (5,4))
mask = np.triu(np.ones_like(df.corr(),dtype = bool))
sns.heatmap(df.corr(),mask = mask, fmt = ".2f",annot=True,lw=1,cmap = 'plasma')
plt.yticks(rotation = 0)
plt.xticks(rotation = 90)
plt.title('Correlation Heatmap')
plt.show()

In [38]:

#JoinPlot
plt.figure(dpi = 100, figsize = (5,4))
print("Joint plot de Glucosa con otras variables ==> \n")
for i in  df.columns:
    if i != 'Glucose' and i != 'Outcome':
        print(f"Correlacion entre Glucose y {i} ==> ",df.corr().loc['Glucose'][i])
        sns.jointplot(x='Glucose',y=i,data=df,kind = 'scatter',color = 'purple')
        plt.show()

Joint plot de Glucosa con otras variables ==> 

Correlacion entre Glucose y Pregnancies ==>  0.12884571831523273

<Figure size 500x400 with 0 Axes>

Correlacion entre Glucose y BloodPressure ==>  0.15249786357400735

Correlacion entre Glucose y SkinThickness ==>  0.05638112882079316

Correlacion entre Glucose y Insulin ==>  0.33238342971312745

Correlacion entre Glucose y BMI ==>  0.22095509998436502

Correlacion entre Glucose y DPF ==>  0.13690295573006997

Correlacion entre Glucose y Age ==>  0.2624081766852211

Insight: La glucosa muestra una asociación lineal débil positiva con las otras variables del conjunto de datos. Eso significa que al aumentar el nivel de glucosa en los pacientes, también aumentarán las otras variables.

In [40]:

#Crear el join plot entre BloodPressure y el resto de las variables con su correspondiente interpretación.

col = list(df.columns)
idx = col.index('BloodPressure')

plt.figure(dpi = 100, figsize = (5,4))
print("Joint plot de BloodPressure con otras variables ==> \n")
for i in  range(idx+1,len(col)-1):
    print(f"Correlacion entre BloodPressure y {col[i]} ==> ",df.corr().loc['BloodPressure'][col[i]])
    sns.jointplot(x='BloodPressure',y=col[i],data=df,kind = 'scatter',color = 'green')
    plt.show()

Joint plot de BloodPressure con otras variables ==> 

Correlacion entre BloodPressure y SkinThickness ==>  0.207307826179924

<Figure size 500x400 with 0 Axes>

Correlacion entre BloodPressure y Insulin ==>  0.08909775002967421

Correlacion entre BloodPressure y BMI ==>  0.2817772447500641

Correlacion entre BloodPressure y DPF ==>  0.04118003922426516

Correlacion entre BloodPressure y Age ==>  0.23957101390158442

Insight: BloodPressure muestra una asociación lineal débil positiva con las otras variables del conjunto de datos dado. Eso significa que al aumentar el nivel de BP en los pacientes, también aumentarán el resto de las variables.

In [42]:

#Crear el join plot entre SkinThickness y el resto de las variables con su correspondiente interpretación.

col = list(df.columns)
idx = col.index('SkinThickness')

plt.figure(dpi = 100, figsize = (5,4))
print("Joint plot de SkinThickness con otras variables ==> \n")
for i in  range(idx+1,len(col)-1):
    print(f"Correlacion entre SkinThickness y {col[i]} ==> ",df.corr().loc['SkinThickness'][col[i]])
    sns.jointplot(x='SkinThickness',y=col[i],data=df,kind = 'reg',color = 'blue')
    plt.show()

Joint plot de SkinThickness con otras variables ==> 

Correlacion entre SkinThickness y Insulin ==>  0.4379742769264145

<Figure size 500x400 with 0 Axes>

Correlacion entre SkinThickness y BMI ==>  0.39255322579417173

Correlacion entre SkinThickness y DPF ==>  0.18349813612676866

Correlacion entre SkinThickness y Age ==>  -0.1158728591822747

Insight: SkinThickness muestra una asociación lineal débil positiva con el resto de las variables, excepto con la edad. Eso significa que al aumentar el SkinThickness en los pacientes, las otras variables también aumentarán. SkinThickness con la edad muestran una correlación negativa débil es decir, al aumentar SkinThickness, la edad disminuirá.

Created in Deepnote