#!/usr/bin/env python
# coding: utf-8

# # 8: Discriptive statistics, Estimation
# 
# 
#  #### [Back to main page](https://petrosyan.page/fall2020math3215)
# 

# ### Box plots

# Below we display the blox plot of the following data:
# 
# $$\{-30,-1, -5, -0.5, 0.5, 0.6, 0, 2, 3, 4.6, 4, 7, 18, 35\}.$$

# In[119]:


# nbi:hide_in
import matplotlib.pyplot as plt
import numpy as np

# set up the figure
fig, ax = plt.subplots(1,1, figsize=(7,4))

# data
data = np.array([-30, -5,  9, 10, 13, 20, 40, 500]).astype(float)
# data = np.array([-30,-1, -5, -0.5, 0.5, 0.6, 0, 2, 3, 4.6, 4, 7, 18, 35]).astype(float)


def percentile(data, p):
    """
    Compute the percentiles the way we defined in class 
    data : array of size N 
    p : percentile
    """
    data = np.sort(data, axis=0)
    rank = int(p * (data.shape[0] + 1) - 1) # the rank
    assert rank > 0, "the rank does not exist" 
    alpha = p * (data.shape[0] + 1) - 1 - rank # the fractional part
    return data[rank] + alpha * (data[rank + 1] - data [rank])

def box_plot(ax, data, width=0.4, showout = True, position = np.array([0.4])):
    """
    ax : matplotlib ax
    data : the data 
    width : box width
    showout : show the outliers   
    position: the y axis of the box plot
    """
    # compute the five number summary 
    minim = np.min(data)
    maxim = np.max(data)
    q1 = percentile(data, 0.25)
    q2 = np.median(data)
    q3 = percentile(data, 0.75)

    # interquartile range
    iqr = q3 - q1

    # inner fences
    left_innerfence = q1 - 1.5 * iqr
    right_innerfence = q3 + 1.5 * iqr

    # compute outliers 
    outliers = data[np.logical_or(data <left_innerfence, data >= right_innerfence)]
    
    # whiskers
    if showout==True:
        low_whisker = np.min(data[data >= left_innerfence])
        high_whisker = np.max(data[data <= right_innerfence])
    else:
        low_whisker = np.min(data)
        high_whisker = np.max(data)


    stats = [{'iqr': iqr,
              'whishi': high_whisker,
              'whislo': low_whisker,
              'fliers': outliers,
              'q1': q1,
              'med': q2,
              'q3': q3}]

    # add the box plot
    flierprops = dict(markerfacecolor='black', markersize=5)
    ax.bxp(stats, vert = False, widths=width, positions = position, 
           flierprops=flierprops, showfliers=showout)

    # add Tukey's fences
    if showout==True:
        ax.vlines(q1-1.5*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)
        ax.vlines(q3+1.5*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)

        ax.vlines(q1-3*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)
        ax.vlines(q3+3*iqr, position-0.2,position+0.2, linestyle="dashed", linewidth=1)

    #  
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.set_ylim(-0.1,position+0.3)
    ax.set_yticks([])
    plt.figtext(1,0.8,
                r"$\min={:.4}$".format(minim)+"\n"+
                r"$q_1={:.4}$".format(q1)+"\n"+
                r"med$={:.4}$".format(q2)+"\n"+
                r"$q_3={:.4}$".format(q3)+"\n"+
                r"max$={:.4}$".format(maxim),
                ha="left", va="top",
                backgroundcolor=(0.1, 0.1, 1, 0.15),
                fontsize="large")
    
def disp_data(ax, data):
    ax.scatter(data, np.zeros(data.shape), zorder=2, s=10)
    ax.set_yticks([])
#     ax.set_xticks([])
    mean = np.mean(data)
    ax.scatter(mean, 0, zorder=2, s=20, color="red")
    ax.set_ylim(-0.01,0.1)
    ax.axhline(y=0, color='k', zorder=1, linewidth=0.5)

    
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_visible(False)

    ax.set_ylim(-0.1,1.5)
            
box_plot(ax, data, width=0.2, showout=True)

plt.show();


# ## Ordinary least squares regression
# 
# The following data is taken from [here](https://people.sc.fsu.edu/~jburkardt/datasets/regression/x03.txt). The dataset contains systolic blood pressure for 30 people of different ages.

# In[112]:


# nbi:hide_in
import pandas as pd 
from IPython.display import display, HTML
display(HTML("<style>.container { width:61% !important; }</style>"))

pdata=pd.DataFrame.transpose(pd.read_csv('files/blood_pressure.csv').astype(int))
display(HTML(pdata.to_html(header=False)))


# In[54]:


# nbi:hide_in
import numpy as np
import matplotlib.pyplot as plt

# plot setup
plt.figure(figsize=(7,6)) 
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# load data from a csv file
data = np.genfromtxt('files/blood_pressure.csv', delimiter=',')[1:,:]
age = data[:,0]
sbp = data[:,1]

# the least squares line
meanx = np.mean(age)
meany = np.mean(sbp)
varx = np.var(age)
# cov = np.cov(age, sbp)[0,1]
cov = np.inner(age-meanx, sbp-meany)/age.shape[0]

xvals = np.arange(np.min(age)-5, np.max(age)+5, 0.1)
yvals = cov*(xvals - meanx)/varx + meany

slope = cov / varx
intercept = meany - cov*meanx / varx

plt.xlabel("Age")
plt.ylabel("Systolic Blood Pressure (SDP)")
plt.plot(xvals, yvals, c="r", zorder=1)

plt.figtext(0.9,0.8, " slope = {:.4f}".format(slope) +
            "\n intercept = {:.4f}".format(intercept),
            ha="left", va="top",
            backgroundcolor=(0.1, 0.1, 1, 0.15), fontsize="large")
plt.title("Simple linear regression")
plt.scatter(age, sbp, zorder=0);


# In[50]:


# nbi:hide_in
import numpy as np
import matplotlib.pyplot as plt

# plot setup
plt.figure(figsize=(7,6)) 
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# load data from a csv file
data = np.genfromtxt('files/blood_pressure.csv', delimiter=',')[1:,:]
age = data[:,0]
sbp = data[:,1]

# the least squares line
meanx = np.mean(age)
meany = np.mean(sbp)
varx = np.var(age)
cov = np.inner(age-meanx, sbp-meany)/age.shape[0]

res = sbp - cov*(age - meanx)/varx - meany

plt.gca().spines['bottom'].set_position(('data',0))
plt.title("Residuals")
plt.scatter(age, res, zorder=0, alpha=0.7);


# ## MLE linear regression for synthetic data
# 
# True valus are $a=-2, b = 1$ and $\sigma^2=0.04$.

# In[87]:


# nbi:hide_in
import numpy as np
import matplotlib.pyplot as plt

# plot setup
plt.figure(figsize=(7,6)) 
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# true parameters
a = -2
b = 1
sigma = 2
numData = 40

# generate data
x = np.random.rand(numData)*(b-a)+a
y = a * x + b
eps = np.random.randn(x.shape[0])*sigma
y = y + eps

# the least squares line
meanx = np.mean(x)
meany = np.mean(y)
varx = np.var(x)
vary = np.var(y)
cov = np.inner(x-meanx, y-meany)/x.shape[0]

# MLE 
aMLE = cov / varx
bMLE = meany - cov*meanx / varx
res = y - cov*(x - meanx)/varx - meany
varMLE = np.var(res)
R2 = 1 -  varMLE / vary


# MLE line
xvals = np.arange(np.min(x)-0.3, np.max(x)+0.3, 0.1)
yvals = cov*(xvals - meanx)/varx + meany

plt.xlabel("x")
plt.ylabel("y")
plt.figtext(0.9,0.8, r" $\hat a$ = {:.4f}".format(aMLE) +
            "\n $\hat b$ = {:.4f}\n".format(bMLE) +
            r" $\hat\sigma^2$ = {:.4f}".format(varMLE) + "\n"
            r" $R^2$ = {:.4f}".format(R2),
            ha="left", va="top",
            backgroundcolor=(0.1, 0.1, 1, 0.15), fontsize="large")
plt.plot(xvals, yvals, c="r", zorder=1)
plt.scatter(x, y, zorder=0, alpha=0.7);