#!/usr/bin/env python
# coding: utf-8

# # Linear regression

# This note is to remind myself about some of the basic concepts of linear
# regression.

# **Bibliography**
#
# [1] James, Witten, Hastie, Tibshirani - Introduction to Statistical Learning

# ## Setup

# In[1]:

import numpy as np
import scipy.stats
import pandas as pd
import matplotlib.pyplot as plt

# In[2]:

get_ipython().run_line_magic('matplotlib', 'inline')

# In[3]:

(blue, orange, red, green, purple, brown, pink, yellow,
 lightred, lightblue, lightorange, lightgreen, lightpurple) = \
    ('#377eb8', '#ff7f00', '#e41a1c', '#4daf4a', '#984ea3', '#a65628',
     '#f781bf', '#d2d215', '#fb9a99', '#a6cee3', '#fdbf6f', '#b2df8a',
     '#cab2d6')

# ## Creating some (noisy) toy data

# In[4]:

def create_data(b0, b1, noise_sd, n=100):
    """Return a dataframe with `n` rows and columns 'X' and 'Y'.

    The dataframe contains random data points that follow

        Y = b1 * X + b0 + eps

    where the error `eps` is drawn from a normal distribution with
    standard deviation `noise_sd`.
    """
    x = np.sort(10 * np.random.rand(n))
    y = b1 * x + b0
    noise = scipy.stats.norm(scale=noise_sd)
    y += noise.rvs(len(x))
    return pd.DataFrame({'X': x, 'Y': y})

# In[5]:

b0_true = 0.3
b1_true = 1.2
noise_sd = 2.0  # σ of the normal noise distribution

data = create_data(b0_true, b1_true, noise_sd)


def plot_data(data, *lines, show=True, **kwargs):
    fig, ax = plt.subplots(**kwargs)
    data.plot.scatter('X', 'Y', ax=ax)
    for (b0, b1, label, color) in lines:
        ax.plot(
            np.linspace(0, 10, 5), b0 + np.linspace(0, 10, 5) * b1,
            color=color, label=label)
    ax.legend()
    if show:
        plt.show()
    else:
        return fig, ax


plot_data(data, (b0_true, b1_true, 'true line', red))

# ## Linear Regression with statsmodels

# In[6]:

import statsmodels.api as sm

# In[7]:

mod = sm.OLS(data['Y'], sm.add_constant(data['X']))

# In[8]:

res = mod.fit()
res.summary()

# In[9]:

res.params

# ## Manual Linear Regression

# In[10]:

def linear_regression(data):
    """Return the least-squares estimates (b0_hat, b1_hat) for the given data.

    See [1] Eq. (3.4)
    """
    xbar = data['X'].mean()
    ybar = data['Y'].mean()
    delta_x = data['X'] - xbar
    delta_y = data['Y'] - ybar
    b1_hat = np.dot(delta_x, delta_y) / np.dot(delta_x, delta_x)
    b0_hat = ybar - b1_hat * xbar
    return b0_hat, b1_hat

# In[11]:

b0_hat, b1_hat = linear_regression(data)

# In[12]:

b0_hat, b1_hat

# In[13]:

b0_true, b1_true

# In[14]:

plot_data(
    data,
    (b0_true, b1_true, 'true line ("population regression line")', red),
    (b0_hat, b1_hat, 'fit ("least squares line")', blue)
)

# ## Goodness of Fit

# ### Standard Error

# The standard error describes how close an estimated quantity is expected to
# be to the actual quantity.
#
# It is the standard deviation of the "sampling distribution" of a statistical
# quantity: if you take a large number of independent samples, each of size
# $n$, and calculate the statistical quantity for each sample individually,
# then the standard error is the standard deviation of the distribution of
# those results. For the sample mean, it is $\sigma/\sqrt{n}$, where $\sigma$
# is the (possibly unknown) standard deviation of the original distribution.
# Standard errors go to zero as the sample size goes to infinity; standard
# deviations do not.
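# As a quick sanity check of the $\sigma/\sqrt{n}$ formula for the sample
# mean (a sketch of my own, not from [1]; the number of repetitions and the
# sample size are arbitrary): draw many independent samples from the noise
# distribution, take the mean of each, and compare the spread of those means
# with $\sigma/\sqrt{n}$.

# In[ ]:

n_rep, n = 10000, 100
means = scipy.stats.norm(scale=noise_sd).rvs((n_rep, n)).mean(axis=1)
means.std(), noise_sd / np.sqrt(n)  # both should be close to 2.0/10 = 0.2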
# In[15]:

def residuals(data, b0, b1):
    """Array of residuals from the perfect linear relation ``Y = b1*X + b0``."""
    Y_hat = b1 * data['X'] + b0
    return data['Y'] - Y_hat

# In[16]:

def RSS(data, b0, b1):
    """Residual Sum of Squares.

    How much the Y values vary around the predicted Y values.
    """
    return (residuals(data, b0, b1)**2).sum()

# In[17]:

def TSS(data):
    """Total Sum of Squares.

    How much the Y values vary around the mean Y value.
    """
    return ((data['Y'] - data['Y'].mean())**2).sum()

# In[18]:

def RSE(data, b0, b1):
    """Residual standard error.

    This is an estimate for the standard deviation σ of the true noise
    distribution (noise_sd): the y-values in the sample deviate from the
    true regression line by about RSE units on average.
    """
    return np.sqrt(RSS(data, b0, b1) / (len(data) - 2))

# In[19]:

RSS(data, b0_hat, b1_hat)

# In[20]:

RSE(data, b0_hat, b1_hat)

# In[21]:

noise_sd

# In[22]:

def linear_regression_SE(data, sig):
    """Standard errors for the estimates of b0, b1 in a linear regression
    for the given data, assuming a standard deviation of `sig` for the
    distribution of noise on Y.

    As `sig` is generally not known, it should be estimated by the RSE.

    Note that the standard error is a property of the sampling of X only:
    the Y-values do not enter!

    See [1] Eq. (3.8)
    """
    n = len(data)
    xbar = data['X'].mean()
    delta_x = data['X'] - xbar
    denom = np.dot(delta_x, delta_x)
    se_b0_sq = sig**2 * (1.0 / n + xbar**2 / denom)
    se_b1_sq = sig**2 / denom
    return np.sqrt(se_b0_sq), np.sqrt(se_b1_sq)

# In[23]:

linear_regression_SE(data, RSE(data, b0_hat, b1_hat))

# The 95% confidence interval is approximately the estimate $\pm$ two
# standard errors: across many repeated samplings, the true value lies inside
# the interval constructed this way about 95% of the time.

# ### Confidence Interval plot

# See https://stats.stackexchange.com/questions/85560/shape-of-confidence-interval-for-predicted-values-in-linear-regression/85565#85565
# for the full explanation and derivation.

# In[24]:

def regline_SE(data, sig, xgrid):
    """The standard error of the full regression line, assuming a standard
    deviation of `sig` for the distribution of noise on Y.

    As `sig` is generally not known, it should be estimated by the RSE.
    """
    n = len(data)
    return sig * np.sqrt(
        (1.0 / n) +
        ((xgrid - data['X'].mean())**2 /
         ((data['X'] - data['X'].mean())**2).sum()))

# The 95% confidence band can be calculated as:

# In[25]:

def regline_confidence_bands(data, b0_hat, b1_hat, xgrid, confidence=0.95):
    """Return two lines (arrays of y-values), the lower and upper boundary
    of the confidence band."""
    Y_hat = b0_hat + b1_hat * xgrid  # the fitted line, on the given xgrid
    n = len(data)
    sig = RSE(data, b0_hat, b1_hat)
    se = regline_SE(data, sig, xgrid)
    t_minus, t_plus = scipy.stats.t(df=n-2).interval(confidence)
    return Y_hat + t_minus * se, Y_hat + t_plus * se

# In[26]:

fig, ax = plot_data(
    data,
    (b0_true, b1_true, 'true line ("population regression line")', red),
    (b0_hat, b1_hat, 'fit ("least squares line")', blue),
    show=False, figsize=(10, 6)
)
xgrid = np.linspace(0, 10, 100)
lm, lp = regline_confidence_bands(data, b0_hat, b1_hat, xgrid)
ax.fill_between(
    xgrid, lm, lp, color='black', alpha=0.2, label='95% confidence band')
ax.legend()
plt.show()

# The confidence band should be interpreted as follows: at each fixed $x$,
# if we repeatedly drew new samples from the same population and recomputed
# the band, about 95% of those bands would contain the true regression value
# $b_0 + b_1 x$ at that point.
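# To convince myself of this coverage interpretation (a simulation sketch of
# my own, not from [1]; the number of trials is arbitrary): repeatedly create
# new toy data from the same population, refit, and count how often the
# t-based 95% confidence interval for b1 contains the true slope.

# In[ ]:

n_trials = 1000
hits = 0
for _ in range(n_trials):
    sample = create_data(b0_true, b1_true, noise_sd)
    b0, b1 = linear_regression(sample)
    se_b1 = linear_regression_SE(sample, RSE(sample, b0, b1))[1]
    t_minus, t_plus = scipy.stats.t(df=len(sample) - 2).interval(0.95)
    if b1 + t_minus * se_b1 <= b1_true <= b1 + t_plus * se_b1:
        hits += 1
hits / n_trials  # should come out close to 0.95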
# ### Testing the null hypothesis

# The null hypothesis is that Y and X have no relationship, i.e. b1 = 0.
# The $t$-value is b1_hat in units of its standard error:

# In[27]:

t = b1_hat / linear_regression_SE(data, RSE(data, b0_hat, b1_hat))[1]
t

# Under the null hypothesis, the $t$-value follows a Student's
# $t$-distribution with $n-2$ degrees of freedom, and we can calculate a
# p-value: the probability of observing a value at least as extreme as $t$
# if the true slope were 0.

# In[28]:

2 * scipy.stats.t(df=len(data)-2).sf(t)  # = 1 - (cdf(t) - cdf(-t))

# ### The $R^2$ statistic

# In[29]:

def R_sq(data, b0, b1):
    """R^2 in [0, 1] is the proportion of the total variation of Y (around
    its mean) that is explained by the regression, i.e. 1 - RSS/TSS."""
    return 1 - RSS(data, b0, b1) / TSS(data)

# In[30]:

R_sq(data, b0_hat, b1_hat)

# A value close to 1 means that the linear fit is very good.

# ### Normality of the residuals

# A graphical way to check that the residuals are normally distributed is to
# plot the quantiles of the residuals against the quantiles of a normal
# distribution, see
# https://stats.stackexchange.com/questions/321061/probability-that-residuals-are-normal/321071#321071

# In[31]:

fig, ax = plt.subplots()
# Use interior plotting positions (k - 0.5)/n; with np.linspace(0, 1, n)
# the endpoints would give ppf(0) = -inf and ppf(1) = +inf.
positions = (np.arange(1, len(data) + 1) - 0.5) / len(data)
ax.plot(
    scipy.stats.norm().ppf(positions),
    residuals(data, b0_hat, b1_hat).sort_values() / RSE(data, b0_hat, b1_hat))
ax.set_title('QQ plot')
ax.plot(
    np.linspace(-3, 3, 10), np.linspace(-3, 3, 10),
    color='black', ls='dashed')
ax.set_xlabel('theoretical quantiles')
ax.set_ylabel('sample quantiles')
plt.show()

# ## Fit with known slope

# If the true slope b1 is known, the least-squares estimate for the intercept
# is simply the mean of ``Y - b1*X``:

# In[32]:

b0_known_slope = np.mean(data['Y'] - b1_true * data['X'])

# In[33]:

b0_known_slope

# In[34]:

b0_true

# In[35]:

plot_data(
    data,
    (b0_true, b1_true, 'true line', red),
    (b0_hat, b1_hat, 'fit', blue),
    (b0_known_slope, b1_true, 'fit with known slope', orange),
    show=True,
)
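# As a final cross-check (my own addition): the manual standard error,
# t-value, p-value and R^2 above should agree with what statsmodels computed
# in the fit at the top; the results object exposes them as `res.bse`,
# `res.tvalues`, `res.pvalues` and `res.rsquared`.

# In[ ]:

print(res.bse['X'], linear_regression_SE(data, RSE(data, b0_hat, b1_hat))[1])
print(res.tvalues['X'], t)
print(res.pvalues['X'], 2 * scipy.stats.t(df=len(data)-2).sf(t))
print(res.rsquared, R_sq(data, b0_hat, b1_hat))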