#!/usr/bin/env python
# coding: utf-8

# # Introduction to Python for Data Sciences
#
# Franck Iutzeler
#
# ## Chap. 3 - Data Handling with Pandas
#

# # 3- Fancy Visualization with Seaborn
#
# ## Advanced visualization
#
# [Seaborn](https://seaborn.pydata.org/) is a package that produces somewhat
# nicer and more data-oriented plots than Matplotlib. It also gives a fresher
# look to matplotlib plots.

# In[1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Notebook-only magic: render figures inline (requires an IPython session).
get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

# Create some data: three cumulative sums of Gaussian noise sampled on [0, 10].
rng = np.random.RandomState(0)
x = np.linspace(0, 10, 500)
y = np.cumsum(rng.randn(500, 3), 0)

# In[3]:

plt.plot(x, y)
plt.legend('one two three'.split(' '));

# Let us import seaborn and change the matplotlib style with sns.set()

# In[4]:

import seaborn as sns

sns.set()

# In[5]:

# Same command but now seaborn is set
plt.plot(x, y)
plt.legend('one two three'.split(' '));

# ### Plotting Distributions
#
# Apart from the standard histograms plt.hist, Seaborn provides smoothed
# density plots based on data using sns.kdeplot or sns.displot.

# In[6]:

data = np.random.multivariate_normal([0, 1.5], [[1, 0.2], [0.2, 2]], size=2000)
data = pd.DataFrame(data, columns=['x', 'y'])

for col in 'xy':
    plt.hist(data[col], alpha=0.5)  # alpha=0.5 provides semi-transparent plots

# kdeplot provides density plots from an array or series (fill=True provides
# filled ones; the older `shade` spelling is deprecated).

# In[15]:

sns.kdeplot(data=data, x='x')
sns.kdeplot(data=data, x='y', fill=True)

# displot and histplot draw histograms of the data (pass kde=True to overlay
# a density estimate on top of the bars).

# In[16]:

sns.displot(data=data, x='x')
sns.histplot(data=data, x='y')

# Two-dimensional dataset may be represented by level sets with kdeplot.

# In[19]:

# x and y are passed by keyword: the meaning of the first positional argument
# of kdeplot is not the same across seaborn releases, so naming both variables
# keeps this a bivariate density plot.
sns.kdeplot(data=data, x='x', y='y', fill=True, thresh=0.05, cmap="Reds", cbar=True)

# Joint distribution and the marginal distributions can be displayed together
# using jointplot

# In[21]:

sns.jointplot(x="x", y="y", data=data, kind='kde');

# ### Exploring features correlations and interest to classification
#
# Seaborn provides an efficient tool for quickly exploring different features
# and classification with pairplot.
# In[22]:

# This section restates its own imports so it can be run from here;
# seaborn is included because every plot below goes through `sns`.
import pandas as pd
import numpy as np
import seaborn as sns

iris = pd.read_csv('data/iris.csv')
print(iris.shape)
iris.head()

# In[23]:

# One scatter plot per pair of columns, colored by the `species` column.
sns.pairplot(iris, hue='species')

# catplot (formerly named factorplot) also provides categorical summaries
# such as box plots.

# In[25]:

sns.catplot(x="species", y="sepal_length", data=iris, kind="box")

# ### Melting dataframes
#
# For displaying classification data, it is sometimes interesting to **melt**
# dataframes, that is, to separate
# * **id:** the classes typically, things that are not numeric, that have to
#   be kept in place (in our case with *iris*, the species)
# * **values:** the *columns* corresponding to values (in our case with
#   *iris*, the sepal_length, sepal_width, etc.)
#
# The command pd.melt returns a dataframe with the following columns: the id,
# the variable (former column) name, and the associated value.

# In[26]:

irisS = pd.melt(
    iris,
    id_vars="species",
    value_vars=["sepal_length", "sepal_width", "petal_length", "petal_width"],
)
irisS.head()

# In[28]:

# One box-plot panel per melted variable, with species on the x axis.
sns.catplot(x="species", y="value", col="variable", data=irisS, kind="box")