#!/usr/bin/env python
# coding: utf-8
# ## 2D Kernel Density Distributions Using Plotly
# ##### ABOUT THE AUTHOR:
# This notebook was contributed by [Plotly user Emilia Petrisor](https://plotly.com/~empet). You can follow Emilia on Twitter [@mathinpython](https://twitter.com/mathinpython) or [GitHub](https://github.com/empet).
# ### Introduction:
# We have two `Excel` files with two columns. We read the files into two `pandas` dataframes and plot
# for each of them an estimate of the joint distribution of the corresponding two columns. The joint distribution is calculated by the `scipy.stats.gaussian_kde` [function](http://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.gaussian_kde.html).
# In[23]:
import numpy as np
import pandas as pd
import seaborn as sns
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
# Read the first file:
# In[24]:
# Load the first workbook. The context manager closes the underlying file
# handle as soon as the sheet has been parsed instead of leaving it open.
with pd.ExcelFile("Data/CSCEng.xls") as xl:
    dfc = xl.parse("Sheet1")
# Bare expression: shows the column labels when run as a notebook cell.
dfc.columns
# and the second one:
# In[25]:
# Load the second workbook. The context manager closes the underlying file
# handle as soon as the sheet has been parsed instead of leaving it open.
with pd.ExcelFile("Data/SystEng.xls") as xl:
    dfi = xl.parse("Sheet1")
# Bare expression: shows the column labels when run as a notebook cell.
dfi.columns
# The contour plot of the joint distribution of two variables (columns) is colored with a custom colorscale:
# In[26]:
# Custom seven-stop colorscale: each entry is [normalized position, hex color],
# running from a near-white tone at 0.0 to a dark purple at 1.0.
cubehelix_cs = [
    [0.0, '#fcf9f7'],
    [0.16666666666666666, '#edcfc9'],
    [0.3333333333333333, '#daa2ac'],
    [0.5, '#bc7897'],
    [0.6666666666666666, '#925684'],
    [0.8333333333333333, '#5f3868'],
    [1.0, '#2d1e3e'],
]
# The function `kde_scipy` returns data for Plotly contour plot of the estimated 2D distribution:
# In[27]:
def kde_scipy(vals1, vals2, x_range, y_range, N):
    """Evaluate a 2D Gaussian kernel density estimate on an N x N grid.

    The two interval arguments used to be Python 2 tuple parameters
    ``(a, b)`` and ``(c, d)``; that syntax is a SyntaxError on Python 3, so
    they are now ordinary positional parameters unpacked in the body.
    Positional callers passing two-element tuples are unaffected.

    Args:
        vals1, vals2: values of two variables (lists, 1D arrays or dataframe
            columns). They are stacked in this order as the sample passed to
            ``scipy.stats.gaussian_kde``.
        x_range: ``(a, b)`` end points of the x grid; usually wider than the
            data range. The x grid is fed to the kernel as the *second*
            coordinate, i.e. it is paired with ``vals2``.
        y_range: ``(c, d)`` end points of the y grid, fed to the kernel as
            the *first* coordinate, i.e. paired with ``vals1``.
        N: number of grid points along each axis.

    Returns:
        A list ``[x, y, Z]`` where ``x`` and ``y`` are the 1D grids and ``Z``
        is an ``(N, N)`` array with ``Z[i, j]`` the estimated density at
        ``(vals1 = y[i], vals2 = x[j])``.
    """
    a, b = x_range
    c, d = y_range
    x = np.linspace(a, b, N)
    y = np.linspace(c, d, N)
    X, Y = np.meshgrid(x, y)
    # Y comes first so the evaluation points line up with the (vals1, vals2)
    # row order of `values` below.
    positions = np.vstack([Y.ravel(), X.ravel()])
    values = np.vstack([vals1, vals2])
    kernel = st.gaussian_kde(values)
    Z = np.reshape(kernel(positions).T, X.shape)
    return [x, y, Z]
# ### Contour plot of the joint distribution of data from the first file ###
# In[28]:
import plotly.plotly as py
from plotly.graph_objs import *
# In[29]:
def make_kdeplot(varX, varY, x_range, y_range, N, colorsc, title):
    """Build a Plotly figure with a filled contour plot of a 2D KDE.

    The two interval arguments used to be Python 2 tuple parameters
    ``(a, b)`` and ``(c, d)``; that syntax is a SyntaxError on Python 3, so
    they are now ordinary positional parameters unpacked in the body.
    Positional callers passing two-element tuples are unaffected.

    Args:
        varX, varY: lists, 1D numpy arrays or dataframe columns storing the
            values of the two variables.
        x_range: ``(a, b)`` grid interval and x-axis range.
        y_range: ``(c, d)`` grid interval and y-axis range.
        N: number of grid points per axis passed to ``kde_scipy``.
        colorsc: colorscale handed to the contour trace.
        title: figure title.

    Returns:
        A ``Figure`` holding one ``Contour`` trace and a fixed 650x650 layout.
    """
    a, b = x_range
    c, d = y_range
    # varY is passed first: kde_scipy receives the samples in (varY, varX)
    # order together with the (a, b) and (c, d) intervals.
    x, y, Z = kde_scipy(varY, varX, (a, b), (c, d), N)
    data = Data([
        Contour(
            z=Z,
            x=x,
            y=y,
            colorscale=colorsc,
            opacity=0.9,
            contours=Contours(
                showlines=False,
            ),
        ),
    ])
    layout = Layout(
        title=title,
        font=Font(family='Georgia, serif', color='#635F5D'),
        showlegend=False,
        autosize=False,
        width=650,
        height=650,
        xaxis=XAxis(
            range=[a, b],
            showgrid=False,
            nticks=7,
        ),
        yaxis=YAxis(
            range=[c, d],
            showgrid=False,
            nticks=7,
        ),
        margin=Margin(
            l=40,
            r=40,
            b=85,
            t=100,
        ),
    )
    return Figure(data=data, layout=layout)
# In[30]:
# Grid resolution and the common interval used for both axes of the first plot.
N = 200
a, b = 5, 11
fig = make_kdeplot(
    dfc['multiannual'],
    dfc['bachelor-th'],
    (a, b),
    (a, b),
    N,
    cubehelix_cs,
    'kde plot of two sets of data',
)
# NOTE(review): the credentials below look like placeholders; confirm before running.
py.sign_in('empet', 'my_api_key')
py.iplot(fig, filename='kde-2D-CSCE')
# ### Contour plot of the joint distribution of data from the second file ###
# In[31]:
# Common interval used for both axes of the second plot.
a, b = 4, 12
fig = make_kdeplot(
    dfi['multiannual'],
    dfi['bachelor-th'],
    (a, b),
    (a, b),
    N,
    cubehelix_cs,
    'kde plot of two sets of data',
)
py.iplot(fig, filename='kde-2D-SE')
# Note that the second contour plot illustrates a [mixture of two bivariate
# distributions](https://en.wikipedia.org/wiki/Mixture_distribution).
# Finally, we read a dataframe from a CSV file posted on Plotly's GitHub account, select the rows corresponding to `Iris-virginica`, and plot the joint distribution of two virginica features:
# In[32]:
# Fetch the iris data set and keep only the rows labelled Iris-virginica.
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/iris.csv')
virginica = df.loc[df["Name"] == "Iris-virginica"]
# Separate intervals for the x axis (a, b) and the y axis (c, d).
a, b = 5, 8.5
c, d = 2, 4
N = 100
fig = make_kdeplot(
    virginica["SepalLength"],
    virginica["SepalWidth"],
    (a, b),
    (c, d),
    N,
    cubehelix_cs,
    'kde plot of joint distribution for virginica SepalLength and SepalWidth',
)
py.iplot(fig, filename='virginica-sepal-length-vs-width')
# In[33]:
from IPython.core.display import HTML


def css_styling():
    """Return the contents of ``./custom.css`` wrapped in an ``HTML`` object."""
    # The context manager closes the file right after reading; the previous
    # open(...).read() left the handle open.
    with open("./custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)


css_styling()
# In[2]:
from IPython.display import HTML, display

# Two empty HTML fragments are displayed, exactly as in the exported cell.
display(HTML(''))
display(HTML(''))

import publisher

# Arguments are passed positionally; their meaning is defined by the
# project's `publisher` module, which is not visible here.
publisher.publish(
    '2d-kernel-density-distributions',
    '/ipython-notebooks/2d-kernel-density-distributions/',
    '2d Kernel Density Distributions',
    '2D Kernel Density Distributions Using Plotly',
)
# In[ ]: