#!/usr/bin/env python # coding: utf-8 # In[1]: import pandas as pd import seaborn as sns import matplotlib.pyplot as plt sns.set(style="darkgrid", color_codes=True) # In[2]: iris = pd.read_csv("Iris.csv") iris.head() # In[3]: iris["species"].value_counts() # In[4]: iris.plot(kind='scatter', x="sepal_length", y="sepal_width") # In[5]: sns.jointplot(x="sepal_length", y="sepal_width", data=iris, kind="scatter", size=5) plt.show() # In[6]: g = sns.JointGrid(x="sepal_length", y="sepal_width", data=iris, size=5) g = g.plot(plt.scatter, sns.kdeplot) # In[7]: sns.jointplot(x="sepal_length", y="sepal_width", data=iris, kind="hexbin", size=5) plt.show() # In[8]: g = sns.jointplot(x="sepal_length", y="sepal_width", data=iris, kind="kde", size=5) # In[9]: g = sns.jointplot(x="sepal_length", y="sepal_width", data=iris, kind="kde", color="m") g.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+") g.ax_joint.collections[0].set_alpha(0) g.set_axis_labels("$Sepal length$", "$Sepal width$"); # In[10]: # One piece of information missing in the plots above is what species each plant is # We'll use seaborn's FacetGrid to color the scatterplot by species g = sns.FacetGrid(iris, hue="species", size=5) \ .map(plt.scatter, "sepal_length", "sepal_width") \ .add_legend() # In[11]: # We can look at an individual feature in Seaborn through a boxplot sns.boxplot(x="species", y="sepal_length", data=iris) g = sns.stripplot(x="species", y="sepal_length", data=iris, jitter=True, edgecolor="gray") # In[12]: # A violin plot combines the benefits of the previous two plots and simplifies them # Denser regions of the data are fatter, and sparser thiner in a violin plot g =sns.violinplot(x="species", y="petal_length", data=iris, size=6) # In[13]: # A final seaborn plot useful for looking at univariate relations is the kdeplot, # which creates and visualizes a kernel density estimate of the underlying feature g = sns.FacetGrid(iris, hue="species", size=6) \ .map(sns.kdeplot, "petal_length") \ .add_legend() # In[14]: # Another useful seaborn plot is the pairplot, which shows the bivariate relation # between each pair of features # # From the pairplot, we'll see that the Iris-setosa species is separataed from the other # two across all feature combinations g = sns.pairplot(iris, hue="species", size=3) # In[15]: g = sns.pairplot(iris, hue="species", size=3, diag_kind="hist") # In[16]: # Now that we've covered seaborn, let's go back to some of the ones we can make with Pandas # We can quickly make a boxplot with Pandas on each feature split out by species g = iris.boxplot(by="species", figsize=(12, 6)) # In[17]: # One cool more sophisticated technique pandas has available is called Andrews Curves # Andrews Curves involve using attributes of samples as coefficients for Fourier series # and then plotting these from pandas.plotting import andrews_curves p = andrews_curves(iris, "species") # In[18]: # Another multivariate visualization technique pandas has is parallel_coordinates # Parallel coordinates plots each feature on a separate column & then draws lines # connecting the features for each data sample from pandas.plotting import parallel_coordinates p = parallel_coordinates(iris, "species") # In[19]: # A final multivariate visualization technique pandas has is radviz # Which puts each feature as a point on a 2D plane, and then simulates # having each sample attached to those points through a spring weighted by the relative value for that feature from pandas.plotting import radviz p = radviz(iris, "species")