# Import pandas
import pandas as pd
pd.set_option('display.max_columns', None)
# Import seaborn
import seaborn as sns
%matplotlib inline
# Apply the default theme
sns.set_theme()
# pandas.DataFrame
# Load the penguins dataset; the path is relative to the working directory.
df = pd.read_csv('../data/penguins.csv')
# Preview the first five rows.
df.head()
| | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year |
---|---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | male | 2007 |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | female | 2007 |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | female | 2007 |
3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN | 2007 |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | female | 2007 |
# Show the data type pandas inferred for each column.
df.dtypes
species object island object bill_length_mm float64 bill_depth_mm float64 flipper_length_mm float64 body_mass_g float64 sex object year int64 dtype: object
# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape
(344, 8)
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
| | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | year |
---|---|---|---|---|---|
count | 342.000000 | 342.000000 | 342.000000 | 342.000000 | 344.000000 |
mean | 43.921930 | 17.151170 | 200.915205 | 4201.754386 | 2008.029070 |
std | 5.459584 | 1.974793 | 14.061714 | 801.954536 | 0.818356 |
min | 32.100000 | 13.100000 | 172.000000 | 2700.000000 | 2007.000000 |
25% | 39.225000 | 15.600000 | 190.000000 | 3550.000000 | 2007.000000 |
50% | 44.450000 | 17.300000 | 197.000000 | 4050.000000 | 2008.000000 |
75% | 48.500000 | 18.700000 | 213.000000 | 4750.000000 | 2009.000000 |
max | 59.600000 | 21.500000 | 231.000000 | 6300.000000 | 2009.000000 |
# Number of rows recorded for each island.
df['island'].value_counts()
Biscoe 168 Dream 124 Torgersen 52 Name: island, dtype: int64
# Number of rows recorded for each species.
df['species'].value_counts()
Adelie 152 Gentoo 124 Chinstrap 68 Name: species, dtype: int64
# Number of rows per sex; value_counts() leaves out missing (NaN) entries by default.
df['sex'].value_counts()
male 168 female 165 Name: sex, dtype: int64
# Histogram of flipper length over the whole dataset.
hist = sns.displot(data=df, x="flipper_length_mm", kind="hist", aspect=1.5)
# FacetGrid.set returns the grid itself, so `hist` still refers to the figure grid.
hist = hist.set(title="Flipper Length (mm)")
# Flipper-length histogram again, with bars coloured by species and stacked.
hist = sns.displot(
    data=df,
    x="flipper_length_mm",
    hue="species",
    kind="hist",
    multiple="stack",
    aspect=1.5,
)
hist = hist.set(title="Flipper Length (mm)")
# Bill-length histogram with one bar per species placed side by side in each bin.
hist = sns.displot(
    data=df,
    x="bill_length_mm",
    hue="species",
    kind="hist",
    multiple="dodge",
    aspect=1.5,
)
hist = hist.set(title="Bill Length (mm)")
# Kernel density estimate of bill depth, one stacked curve per species.
kde = sns.displot(
    data=df,
    x="bill_depth_mm",
    hue="species",
    kind="kde",
    multiple="stack",
    aspect=1.5,
)
kde = kde.set(title="Bill Depth (mm)")
# Scatter plot of flipper length against bill length.
# Species is encoded twice (colour and marker style) so the groups remain
# distinguishable without relying on colour alone.
scatter = sns.relplot(
    data=df,
    kind="scatter",
    x="flipper_length_mm",
    y="bill_length_mm",
    hue="species",
    style="species",
    height=7,
    aspect=1.5,
)
scatter = scatter.set(title="Flipper Length (mm) vs Bill Length (mm)")
# Scatter plot with a fitted linear regression line per species.
# For a deeper treatment of linear regressions with seaborn, see
# https://seaborn.pydata.org/tutorial/regression.html
scatter = sns.lmplot(
    data=df,
    x="flipper_length_mm",
    y="bill_length_mm",
    hue="species",
    height=7,
    aspect=1.5,
)
scatter = scatter.set(title="Flipper Length (mm) vs Bill Length (mm)")
# Box plots of body mass by sex, drawn in one column (panel) per species.
# `hue` mirrors `x` so that the palette is applied through a hue mapping:
# seaborn 0.13 deprecates passing `palette` without `hue`.
# `dodge=False` keeps each box centred on its category, and `legend=False`
# suppresses a legend that would only repeat the x-axis labels.
box = sns.catplot(
    data=df,
    kind="box",
    x="sex",
    y="body_mass_g",
    hue="sex",
    col="species",
    palette="rocket",
    dodge=False,
    legend=False,
)
# Swarm plot of body mass for each species, with points coloured by sex.
swarm = sns.catplot(
    data=df,
    kind="swarm",
    x="species",
    y="body_mass_g",
    hue="sex",
    palette="rocket",
    aspect=1.5,
)
swarm = swarm.set(title="Body Mass (g)")
# Violin plot of body mass for each species.
# `split=True` draws the two sexes as the two halves of a single violin.
violin = sns.catplot(
    data=df,
    kind="violin",
    x="species",
    y="body_mass_g",
    hue="sex",
    split=True,
    palette="rocket",
    aspect=1.5,
)
violin = violin.set(title="Body Mass (g)")
# Pairwise relationships between the four body measurements, coloured by
# species with one marker shape per species.  `corner=True` draws only the
# lower triangle of the grid.
_ = sns.pairplot(
    data=df.loc[
        :,
        ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'species'],
    ],
    hue="species",
    markers=["o", "s", "D"],
    corner=True,
)
# Pairwise relationships between the four body measurements, this time
# coloured by sex with one marker shape per sex.  `corner=True` draws only
# the lower triangle of the grid.
_ = sns.pairplot(
    data=df.loc[
        :,
        ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex'],
    ],
    hue="sex",
    palette="rocket",
    markers=["o", "s"],
    corner=True,
)