# 8: Discriptive statistics, Estimation¶

### Box plots¶

Below we display the blox plot of the following data:

$$\{-30,-1, -5, -0.5, 0.5, 0.6, 0, 2, 3, 4.6, 4, 7, 18, 35\}.$$
In [119]:
# nbi:hide_in
import matplotlib.pyplot as plt
import numpy as np

# set up the figure
fig, ax = plt.subplots(1,1, figsize=(7,4))

# data
data = np.array([-30, -5,  9, 10, 13, 20, 40, 500]).astype(float)
# data = np.array([-30,-1, -5, -0.5, 0.5, 0.6, 0, 2, 3, 4.6, 4, 7, 18, 35]).astype(float)

def percentile(data, p):
"""
Compute the percentiles the way we defined in class
data : array of size N
p : percentile
"""
data = np.sort(data, axis=0)
rank = int(p * (data.shape[0] + 1) - 1) # the rank
assert rank > 0, "the rank does not exist"
alpha = p * (data.shape[0] + 1) - 1 - rank # the fractional part
return data[rank] + alpha * (data[rank + 1] - data [rank])

def box_plot(ax, data, width=0.4, showout = True, position = np.array([0.4])):
"""
ax : matplotlib ax
data : the data
width : box width
showout : show the outliers
position: the y axis of the box plot
"""
# compute the five number summary
minim = np.min(data)
maxim = np.max(data)
q1 = percentile(data, 0.25)
q2 = np.median(data)
q3 = percentile(data, 0.75)

# interquartile range
iqr = q3 - q1

# inner fences
left_innerfence = q1 - 1.5 * iqr
right_innerfence = q3 + 1.5 * iqr

# compute outliers
outliers = data[np.logical_or(data <left_innerfence, data >= right_innerfence)]

# whiskers
if showout==True:
low_whisker = np.min(data[data >= left_innerfence])
high_whisker = np.max(data[data <= right_innerfence])
else:
low_whisker = np.min(data)
high_whisker = np.max(data)

stats = [{'iqr': iqr,
'whishi': high_whisker,
'whislo': low_whisker,
'fliers': outliers,
'q1': q1,
'med': q2,
'q3': q3}]

flierprops = dict(markerfacecolor='black', markersize=5)
ax.bxp(stats, vert = False, widths=width, positions = position,
flierprops=flierprops, showfliers=showout)

if showout==True:
ax.vlines(q1-1.5*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)
ax.vlines(q3+1.5*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)

ax.vlines(q1-3*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)
ax.vlines(q3+3*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)

#
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.set_ylim(-0.1,position+0.3)
ax.set_yticks([])
plt.figtext(1,0.8,
r"$\min={:.4}$".format(minim)+"\n"+
r"$q_1={:.4}$".format(q1)+"\n"+
r"med$={:.4}$".format(q2)+"\n"+
r"$q_3={:.4}$".format(q3)+"\n"+
r"max$={:.4}$".format(maxim),
ha="left", va="top",
backgroundcolor=(0.1, 0.1, 1, 0.15),
fontsize="large")

def disp_data(ax, data):
ax.scatter(data, np.zeros(data.shape), zorder=2, s=10)
ax.set_yticks([])
#     ax.set_xticks([])
mean = np.mean(data)
ax.scatter(mean, 0, zorder=2, s=20, color="red")
ax.set_ylim(-0.01,0.1)
ax.axhline(y=0, color='k', zorder=1, linewidth=0.5)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)

ax.set_ylim(-0.1,1.5)

box_plot(ax, data, width=0.2, showout=True)

plt.show();


## Ordinary least squares regression¶

The following data is taken from here. The dataset contains systolic blood pressure for 30 people of different ages.

In [112]:
# nbi:hide_in
import pandas as pd
from IPython.display import display, HTML
display(HTML("<style>.container { width:61% !important; }</style>"))


 Age SDP 39 47 45 47 65 46 67 42 67 56 64 56 59 34 42 48 45 17 20 19 36 50 39 21 44 53 63 29 25 69 144 220 138 145 162 142 170 124 158 154 162 150 140 110 128 130 135 114 116 124 136 142 120 120 160 158 144 130 125 175
In [54]:
# nbi:hide_in
import numpy as np
import matplotlib.pyplot as plt

# plot setup
plt.figure(figsize=(7,6))
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# load data from a csv file
data = np.genfromtxt('files/blood_pressure.csv', delimiter=',')[1:,:]
age = data[:,0]
sbp = data[:,1]

# the least squares line
meanx = np.mean(age)
meany = np.mean(sbp)
varx = np.var(age)
# cov = np.cov(age, sbp)[0,1]
cov = np.inner(age-meanx, sbp-meany)/age.shape[0]

xvals = np.arange(np.min(age)-5, np.max(age)+5, 0.1)
yvals = cov*(xvals - meanx)/varx + meany

slope = cov / varx
intercept = meany - cov*meanx / varx

plt.xlabel("Age")
plt.ylabel("Systolic Blood Pressure (SDP)")
plt.plot(xvals, yvals, c="r", zorder=1)

plt.figtext(0.9,0.8, " slope = {:.4f}".format(slope) +
"\n intercept = {:.4f}".format(intercept),
ha="left", va="top",
backgroundcolor=(0.1, 0.1, 1, 0.15), fontsize="large")
plt.title("Simple linear regression")
plt.scatter(age, sbp, zorder=0);

In [50]:
# nbi:hide_in
import numpy as np
import matplotlib.pyplot as plt

# plot setup
plt.figure(figsize=(7,6))
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# load data from a csv file
data = np.genfromtxt('files/blood_pressure.csv', delimiter=',')[1:,:]
age = data[:,0]
sbp = data[:,1]

# the least squares line
meanx = np.mean(age)
meany = np.mean(sbp)
varx = np.var(age)
cov = np.inner(age-meanx, sbp-meany)/age.shape[0]

res = sbp - cov*(age - meanx)/varx - meany

plt.gca().spines['bottom'].set_position(('data',0))
plt.title("Residuals")
plt.scatter(age, res, zorder=0, alpha=0.7);


## MLE linear regression for synthetic data¶

True valus are $a=-2, b = 1$ and $\sigma^2=0.04$.

In [87]:
# nbi:hide_in
import numpy as np
import matplotlib.pyplot as plt

# plot setup
plt.figure(figsize=(7,6))
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# true parameters
a = -2
b = 1
sigma = 2
numData = 40

# generate data
x = np.random.rand(numData)*(b-a)+a
y = a * x + b
eps = np.random.randn(x.shape[0])*sigma
y = y + eps

# the least squares line
meanx = np.mean(x)
meany = np.mean(y)
varx = np.var(x)
vary = np.var(y)
cov = np.inner(x-meanx, y-meany)/x.shape[0]

# MLE
aMLE = cov / varx
bMLE = meany - cov*meanx / varx
res = y - cov*(x - meanx)/varx - meany
varMLE = np.var(res)
R2 = 1 -  varMLE / vary

# MLE line
xvals = np.arange(np.min(x)-0.3, np.max(x)+0.3, 0.1)
yvals = cov*(xvals - meanx)/varx + meany

plt.xlabel("x")
plt.ylabel("y")
plt.figtext(0.9,0.8, r" $\hat a$ = {:.4f}".format(aMLE) +
"\n $\hat b$ = {:.4f}\n".format(bMLE) +
r" $\hat\sigma^2$ = {:.4f}".format(varMLE) + "\n"
r" $R^2$ = {:.4f}".format(R2),
ha="left", va="top",
backgroundcolor=(0.1, 0.1, 1, 0.15), fontsize="large")
plt.plot(xvals, yvals, c="r", zorder=1)
plt.scatter(x, y, zorder=0, alpha=0.7);