#!/usr/bin/env python
# coding: utf-8
#
#
#
#
#
#
#
# Chap. 3 - Data Handling with Pandas
#
#
#
#
#
# # 3- Fancy Visualization with Seaborn
#
# ## Advanced visualization
#
#
# [Seaborn](https://seaborn.pydata.org/) is a package that produces somewhat nicer and more data-oriented plots than Matplotlib. It also gives a fresher look to Matplotlib plots.
# In[1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Enable inline figures when running under IPython/Jupyter. The shebang at the top lets
# this file be launched as a plain script as well, where get_ipython is not defined, so
# the magic is simply skipped in that case.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
# In[2]:
# Synthetic data: three seeded random walks sampled on a common 500-point grid.
rng = np.random.RandomState(seed=0)
x = np.linspace(start=0, stop=10, num=500)
# Accumulating the random steps down the rows gives one walk per column.
y = rng.randn(500, 3).cumsum(axis=0)
# In[3]:
# Draw each of the three columns of y against x with Matplotlib's current style.
plt.plot(x, y)
# Label the series in column order; the trailing ';' is a notebook idiom that
# suppresses the echoed return value of the last expression.
plt.legend('one two three'.split(' '));
# Let us import seaborn and switch Matplotlib to seaborn's default theme with sns.set_theme()
# In[4]:
import seaborn as sns
# set_theme() is the documented entry point; sns.set() is only kept as an alias for it.
sns.set_theme()
# In[5]:
# Same commands as the earlier plot cell, re-run after the seaborn import and style
# call above so that the change in appearance can be compared.
plt.plot(x, y)
plt.legend('one two three'.split(' '));
# ### Plotting Distributions
#
# Apart from the standard histograms plt.hist, Seaborn provides smoothed density plots based on data using sns.kdeplot or sns.displot.
# In[6]:
# Draw 2000 correlated two-dimensional normal samples and expose the two
# coordinates as DataFrame columns 'x' and 'y'.
data = pd.DataFrame(
    np.random.multivariate_normal(mean=[0, 1.5], cov=[[1, 0.2], [0.2, 2]], size=2000),
    columns=['x', 'y'],
)
# Overlay one histogram per column ('x', then 'y') in the same figure.
for col in 'xy':
    plt.hist(data[col], alpha=0.5)  # alpha=0.5 provides semi-transparent plots
# kdeplot provides density plots from an array or series (fill=True provides filled ones).
# In[15]:
sns.kdeplot(data['x'])
# `fill` replaces the `shade` keyword, which seaborn deprecated in its favour.
sns.kdeplot(data['y'], fill=True)
# displot is a mix of the two previous ones.
# In[16]:
# Distribution plot of the 'x' column through seaborn's displot entry point.
sns.displot(data['x'])
# Histogram of the 'y' column.
# NOTE(review): this call follows displot directly with no new figure created in
# between, so it presumably draws on whatever axes displot left active — confirm
# that overlaying the two columns is intended.
sns.histplot(data['y'])
# A two-dimensional dataset may be represented by level sets (density contours) with kdeplot.
# In[19]:
# Pass both coordinates by keyword: a lone positional argument is not reliably read
# as x across seaborn releases, and the bivariate level sets need x and y together.
# `fill` replaces the deprecated `shade` keyword.
sns.kdeplot(x=data['x'], y=data['y'], fill=True, thresh=0.05, cmap="Reds", cbar=True)
# The joint distribution and the marginal distributions can be displayed together using jointplot.
# In[21]:
# Joint KDE of columns 'x' and 'y' together with their marginal distributions.
sns.jointplot(
    data=data,
    x="x",
    y="y",
    kind='kde',
);
# ### Exploring feature correlations and their relevance to classification
#
# Seaborn provides an efficient tool for quickly exploring different features and classification with pairplot.
# In[22]:
import pandas as pd
import numpy as np
# Load the iris measurements (the path is relative to the working directory).
iris = pd.read_csv('data/iris.csv')
# Report the table size as a (rows, columns) tuple, then preview the first rows.
print((len(iris.index), len(iris.columns)))
iris.head()
# In[23]:
# Pairwise feature plots of the iris table, coloured by the 'species' column.
sns.pairplot(data=iris, hue='species')
# catplot (named factorplot in older seaborn releases) also provides error plots.
# In[25]:
# Box plot of sepal length for each species.
sns.catplot(data=iris, x="species", y="sepal_length", kind="box")
# ### Melting dataframes
#
# For displaying classification data, it is sometimes interesting to **melt** dataframes, that is separating
# * **id:** the classes typically, things that are not numeric, that have to be kept in place (in our case with *iris*, the species)
# * **values:** the *columns* corresponding to values (in our case with *iris*, the sepal_length, sepal_width, etc.)
#
# The command pd.melt returns a dataframe whose columns are: the id, the variable (former column) name, and the associated value.
# In[26]:
# Reshape to long form: 'species' stays as the identifier and the four measurement
# columns are stacked into 'variable' / 'value' pairs.
irisS = iris.melt(
    id_vars="species",
    value_vars=["sepal_length", "sepal_width", "petal_length", "petal_width"],
)
irisS.head()
# In[28]:
# One box-plot panel per measurement, each comparing the melted values across species.
sns.catplot(data=irisS, x="species", y="value", col="variable", kind="box")