A Brief Look at Outliers

In [12]:

from sklearn.datasets import load_boston
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline
# Load the Boston housing dataset once and keep its feature matrix and target
# vector under their own names for the cells that follow.
# NOTE(review): `load_boston` appears to be deprecated/removed in newer
# scikit-learn releases -- confirm the version this notebook is pinned to.
boston = load_boston()
boston_data, boston_target = boston.data, boston.target

In [2]:

# Quick sanity check: dimensions of the feature matrix, then the column names.
for summary in (boston_data.shape, boston.feature_names):
    print(summary)

(506, 13)
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT' 'MEDV']

In [13]:

# Column index of LSTAT in the feature matrix (it is the 13th name listed in
# boston.feature_names).
LSTAT_COL = 12

# 1-D copy of the LSTAT column.
lstat = np.squeeze(np.array(boston_data[:, LSTAT_COL]))

# Side-by-side box plots of LSTAT and the target (labelled MEDV); `bp` keeps
# the dictionary of artists that boxplot returns.
fig, axes = plt.subplots()
bp = axes.boxplot([boston_data[:, LSTAT_COL], boston_target])
axes.set_xticklabels(["LSTAT", "MEDV"])

Out[13]:

[<matplotlib.text.Text at 0x108fa5a50>, <matplotlib.text.Text at 0x10bfd62d0>]

In [14]:

def _tukey_fences(values, whis=1.5):
    """Return the ``(lower, upper)`` outlier fences for ``values``.

    The fences are ``Q1 - whis * IQR`` and ``Q3 + whis * IQR`` with the
    quartiles taken from ``np.percentile``.  Points outside them are the ones
    a box plot with the default ``whis=1.5`` draws as fliers.
    """
    q1, q3 = np.percentile(values, [25, 75])
    reach = whis * (q3 - q1)
    return q1 - reach, q3 + reach


def _extreme(values, pick):
    """Return ``pick(values)``, or ``None`` when ``values`` is empty.

    ``np.min`` / ``np.max`` raise on empty input, which happens whenever a
    variable has no outliers on the side being reported.
    """
    return pick(values) if values.size else None


# Derive the outliers from the data instead of reading them back from
# bp['fliers']: the number of flier artists per box differs between matplotlib
# versions (two per box in old releases, one per box since), so indexing
# entries [2] and [3] of a two-box plot is not portable.  Computing the fences
# here also removes the thresholds that used to be copied by hand from the
# printed output.
_, lstat_upper = _tukey_fences(lstat)
medv_lower, medv_upper = _tukey_fences(boston_target)

# As before, only the high side of LSTAT is considered; MEDV is checked on
# both sides.
lstat_top_mask = lstat > lstat_upper
medv_top_mask = boston_target > medv_upper
medv_bottom_mask = boston_target < medv_lower

# Index arrays stay 1-D even when a rule matches exactly one sample
# (np.squeeze would collapse that case to 0-d and break np.concatenate).
lo = np.where(lstat_top_mask)[0]       # LSTAT high outliers
mot = np.where(medv_top_mask)[0]       # MEDV top outliers
mob = np.where(medv_bottom_mask)[0]    # MEDV bottom outliers

lstat_top_outliers = lstat[lo]
medv_top_outliers = boston_target[mot]
medv_bottom_outliers = boston_target[mob]

print("LSTAT Outliers: >= {}".format(_extreme(lstat_top_outliers, np.min)))
print("MEDV Outliers: >= {0} and <= {1}".format(_extreme(medv_top_outliers, np.min),
                                                _extreme(medv_bottom_outliers, np.max)))

# Union of all rules: a sample flagged by more than one rule appears once.
ao = np.where(lstat_top_mask | medv_top_mask | medv_bottom_mask)[0]

lstat_nooutliers = np.delete(lstat, ao)
medv_nooutliers = np.delete(boston_target, ao)
lstat_outliers = np.take(lstat, ao)
medv_outliers = np.take(boston_target, ao)

LSTAT Outliers: >= 31.99
MEDV Outliers: >= 37.0 and <= 5.0

In [15]:

# Degree-1 least-squares fits of MEDV against LSTAT: `fit_o` uses every sample,
# `fit_no` only the samples left after the outliers were removed.
fit_o = np.polyfit(lstat, boston_target, 1)
fit_no = np.polyfit(lstat_nooutliers, medv_nooutliers, 1)
slope_o, intercept_o = fit_o
slope_no, intercept_no = fit_no

fig, axes = plt.subplots(figsize=(10, 10))

# Points first: retained samples in the default colour, outliers in red.
axes.scatter(lstat_nooutliers, medv_nooutliers)
axes.scatter(lstat_outliers, medv_outliers, c='red', label="Outliers")

# Then the two fitted lines: blue = fit without outliers, red = fit on all data.
axes.plot(lstat_nooutliers, slope_no * lstat_nooutliers + intercept_no, 'b-')
axes.plot(lstat, slope_o * lstat + intercept_o, 'r-')

axes.legend(scatterpoints=1)
axes.set_xlabel("LSTAT")
axes.set_ylabel("MEDV")

Out[15]:

<matplotlib.text.Text at 0x10d130250>

In [ ]: