#!/usr/bin/env python
# coding: utf-8
#
# Data For Science, Inc
#
#

# Applied Probability Theory From Scratch

#

# Simpson's Paradox

#

# Bruno Gonçalves
# www.data4sci.com
# @bgoncalves, @data4sci

#
# In[1]:

import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

import sklearn
from sklearn.linear_model import LinearRegression

import watermark

# IPython line magics produced by the notebook export; `get_ipython` is only
# defined when this file is executed under IPython/Jupyter.
get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:

get_ipython().run_line_magic('watermark', '-i -n -v -m -g -iv')


# In[3]:

plt.style.use('./d4sci.mplstyle')


# ## Load the iris dataset

# In[4]:

iris = pd.read_csv('data/iris.csv')


# In[5]:

iris


# Split the dataset across species for convenience

# In[6]:

# Only these two columns are used by the fits and plots below.
feature_columns = ['sepal_width', 'petal_width']

setosa = iris.loc[iris['species'] == 'setosa', feature_columns]
versicolor = iris.loc[iris['species'] == 'versicolor', feature_columns]
virginica = iris.loc[iris['species'] == 'virginica', feature_columns]


# ## Perform the fits

# In[7]:

def _fit_petal_on_sepal(frame):
    """Regress petal_width on sepal_width for the rows of *frame*.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'sepal_width' and 'petal_width' columns.

    Returns
    -------
    tuple
        ``(model, fitted)``: the fitted ``LinearRegression`` and its
        predictions at the sepal widths of *frame*, in row order.
    """
    # scikit-learn expects a 2-D feature matrix, hence the (n, 1) reshape.
    sepal = frame['sepal_width'].to_numpy().reshape(-1, 1)
    model = LinearRegression().fit(sepal, frame['petal_width'])
    return model, model.predict(sepal)


# One regression per species, plus one over the pooled data set.
lm_setosa, y_setosa = _fit_petal_on_sepal(setosa)
lm_versicolor, y_versicolor = _fit_petal_on_sepal(versicolor)
lm_virginica, y_virginica = _fit_petal_on_sepal(virginica)
lm_full, y_full = _fit_petal_on_sepal(iris)


# ## Generate the plot

# In[8]:

fig, axs = plt.subplots(ncols=2, sharey=True)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

species_frames = [
    ('setosa', setosa),
    ('versicolor', versicolor),
    ('virginica', virginica),
]

# Left panel: all three species with the single pooled regression line.
for (species_name, species_frame), color in zip(species_frames, colors):
    species_frame.plot.scatter(x='sepal_width', y='petal_width',
                               label=species_name, ax=axs[0], c=color)

l4, = axs[0].plot(iris['sepal_width'], y_full, '-', c=colors[3])

# Right panel: the same points, each species with its own regression line.
for (_, species_frame), color in zip(species_frames, colors):
    species_frame.plot.scatter(x='sepal_width', y='petal_width',
                               ax=axs[1], c=color)

l1, = axs[1].plot(setosa['sepal_width'], y_setosa, '-', c=colors[0])
l2, = axs[1].plot(versicolor['sepal_width'], y_versicolor, '-', c=colors[1])
l3, = axs[1].plot(virginica['sepal_width'], y_virginica, '-', c=colors[2])

axs[0].set_xlabel('Sepal Width')
axs[1].set_xlabel('Sepal Width')
axs[0].set_ylabel('Petal Width')

# Leave room under the axes for the shared legend, which is anchored to the
# left panel but lists the line handles from both panels.
fig.subplots_adjust(bottom=0.3, wspace=0.33)
axs[0].legend(handles=[l1, l2, l3, l4],
              labels=['Setosa', 'Versicolor', 'Virginica', 'Total'],
              loc='lower left', bbox_to_anchor=(0, -0.4), ncol=2,
              fancybox=True, shadow=False)


# ## Removing setosa

# In[9]:

reduced = iris[iris['species'] != 'setosa'].copy()


# In[10]:

lm_reduced, y_reduced = _fit_petal_on_sepal(reduced)


# In[11]:

# With ncols=1 `axs` is a single Axes object, not an array.
fig, axs = plt.subplots(ncols=1, sharey=True)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

versicolor.plot.scatter(x='sepal_width', y='petal_width', ax=axs, c=colors[1])
virginica.plot.scatter(x='sepal_width', y='petal_width', ax=axs, c=colors[2])

axs.plot(versicolor['sepal_width'], y_versicolor, '-',
         c=colors[1], label='versicolor')
axs.plot(virginica['sepal_width'], y_virginica, '-',
         c=colors[2], label='virginica')
axs.plot(reduced['sepal_width'], y_reduced, '-',
         c=colors[3], label='reduced')

axs.set_xlabel('Sepal Width')
axs.set_ylabel('Petal Width')
axs.legend()

#
# Data For Science, Inc #