#!/usr/bin/env python
# coding: utf-8

# # Simple Linear Regression
#
# We used the correlation function to measure the strength of the linear relationship between two variables. For most applications, knowing that such a linear relationship exists isn't enough. We want to understand the nature of the relationship. This is where we use simple linear regression.
#
# ## The Model
#
# $$y_i = \beta x_i + \alpha + \epsilon_i$$
#
# where
#
# - $y_i$ is the number of minutes user $i$ spends on the site daily,
# - $x_i$ is the number of friends user $i$ has,
# - $\alpha$ is the constant term, i.e. the predicted value of $y_i$ when $x_i = 0$,
# - $\epsilon_i$ is a (hopefully small) error term representing the fact that there are other factors not accounted for by this simple model.

# # Least Squares Fit
#
# The least squares method (最小二乘法):
#
# $$ y_i = X_i^T w $$
#
# The constant term can be represented by a column of 1s in $X$.
#
# The squared error can be written as:
#
# $$ \sum_{i = 1}^m (y_i - X_i^T w)^2 $$

# If we know $\alpha$ and $\beta$, then we can make predictions. Since we know the actual output $y_i$, we can compute the error for each pair. Because negative errors cancel out with positive ones, we work with squared errors instead. The least squares solution is to choose the $\alpha$ and $\beta$ that make the **sum of squared errors** as small as possible.

# This choice of beta means that when the input value increases by standard_deviation(x), the prediction increases by correlation(x, y) * standard_deviation(y).
#
# - When x and y are perfectly positively correlated, a one-standard-deviation increase in x results in a one-standard-deviation-of-y increase in the prediction.
# - When they're perfectly negatively correlated, the increase in x results in a decrease in the prediction.
# - When the correlation is zero, beta is zero, which means that changes in x don't affect the prediction at all.
#
# > In this case, the slope of the fitted line is equal to the correlation between y and x corrected by the ratio of standard deviations of these variables.
#
# $$ y_i = \alpha + \beta x_i + \varepsilon_i $$
#
# Here $a$ and $b$ denote candidate values for $\alpha$ and $\beta$, and $\hat\varepsilon_i$ is the corresponding residual:
#
# $$ \hat\varepsilon_i = y_i - a - b x_i $$
#
# $$ \text{Find } \min_{a,\, b} Q(a, b), \quad \text{for } Q(a, b) = \sum_{i=1}^n \hat\varepsilon_i^{\,2} = \sum_{i=1}^n (y_i - a - b x_i)^2 $$
#
# By expanding $Q$ into a quadratic expression in $a$ and $b$, we can derive the values of $a$ and $b$ that minimize the objective function $Q$ (these minimizing values are denoted $\hat{\alpha}$ and $\hat{\beta}$):
#
# \begin{align}
# \hat\alpha & = \bar{y} - \hat\beta\,\bar{x}, \\
# \hat\beta &= \frac{ \sum_{i=1}^n (x_i - \bar{x})(y_i - \bar{y}) }{ \sum_{i=1}^n (x_i - \bar{x})^2 } = \frac{ \operatorname{Cov}(x, y) }{ \operatorname{Var}(x) } = r_{xy} \frac{s_y}{s_x},
# \end{align}
#
# where
#
# - $r_{xy}$ is the sample correlation coefficient between x and y,
# - $s_x$ and $s_y$ are the uncorrected sample standard deviations of x and y.
#
# > Kenney, J. F. and Keeping, E. S. (1962) "Linear Regression and Correlation." Ch. 15 in *Mathematics of Statistics*, Pt. 1, 3rd ed. Princeton, NJ: Van Nostrand, pp. 252–285.

# Substituting the above expressions for $\hat{\alpha}$ and $\hat{\beta}$ into
#
# $$f = \hat{\alpha} + \hat{\beta} x$$
#
# yields
#
# $$\frac{ f - \bar{y}}{s_y} = r_{xy} \frac{ x - \bar{x}}{s_x} .$$
#
# > Kenney, J. F. and Keeping, E. S. (1962) "Linear Regression and Correlation." Ch. 15 in *Mathematics of Statistics*, Pt. 1, 3rd ed. Princeton, NJ: Van Nostrand, pp. 252–285.
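# As a minimal sketch, the closed-form estimates above translate directly into a few lines of Python. The helper names `correlation`, `least_squares_fit`, and `predict` below are illustrative one-off definitions, not from any particular library; applied to the friends/minutes data loaded in the next cell, `least_squares_fit` should reproduce the alpha and beta values used there.

# In[ ]:

from statistics import mean, pstdev

def correlation(xs, ys):
    """Pearson correlation r_xy, using uncorrected (population) standard deviations."""
    x_bar, y_bar = mean(xs), mean(ys)
    cov = sum((x - x_bar) * (y - y_bar) for x, y in zip(xs, ys)) / len(xs)
    return cov / (pstdev(xs) * pstdev(ys))

def least_squares_fit(xs, ys):
    """Return (alpha, beta) minimizing the sum of squared errors: beta = r_xy * s_y / s_x."""
    beta = correlation(xs, ys) * pstdev(ys) / pstdev(xs)
    alpha = mean(ys) - beta * mean(xs)
    return alpha, beta

def predict(alpha, beta, x_i):
    """Prediction for a single input under the fitted line."""
    return alpha + beta * x_i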
# In[2]:

num_friends_good = [49,41,40,25,21,21,19,19,18,18,16,15,15,15,15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
daily_minutes_good = [68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84]
alpha, beta = 22.9475, 0.90386

# In[4]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

plt.scatter(num_friends_good, daily_minutes_good)
plt.plot(num_friends_good, [alpha + beta*i for i in num_friends_good], 'b-')
plt.xlabel('# of friends', fontsize = 20)
plt.ylabel('minutes per day', fontsize = 20)
plt.title('simple linear regression model', fontsize = 20)
plt.show()

# Of course, we need a better way to figure out how well we've fit the data than staring at the graph.
#
# A common measure is the coefficient of determination (or R-squared), which measures the fraction of the total variation in the dependent variable that is captured by the model.
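# As a quick sketch of that measure, R-squared can be computed straight from its definition: one minus the ratio of the residual sum of squares to the total sum of squares of y around its mean. The helper name `r_squared` below is illustrative, not from any library.

# In[ ]:

def r_squared(alpha, beta, xs, ys):
    """Fraction of the variation in ys captured by the model y = alpha + beta * x."""
    y_bar = sum(ys) / len(ys)
    ss_residual = sum((y - (alpha + beta * x)) ** 2 for x, y in zip(xs, ys))
    ss_total = sum((y - y_bar) ** 2 for y in ys)
    return 1 - ss_residual / ss_total

r_squared(alpha, beta, num_friends_good, daily_minutes_good)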
# # Multiple Regression using Matrix Method
#
# Machine Learning in Action
#
# https://github.com/computational-class/machinelearninginaction/
#
# $$ y_i = X_i^T w $$
#
# The constant term can be represented by a column of 1s in $X$.
#
# The squared error can be written as:
#
# $$ \sum_{i = 1}^m (y_i - X_i^T w)^2 $$
#
# We can also write this in matrix notation as $(y-Xw)^T(y-Xw)$.
#
# If we take the derivative of this with respect to $w$, we get $-2X^T(y-Xw)$. We can set this to zero and solve for $w$ to get the following equation:
#
# $$\hat w = (X^T X)^{-1}X^T y$$

# In[16]:

# https://github.com/computational-class/machinelearninginaction/blob/master/Ch08/regression.py
import pandas as pd
import random

dat = pd.read_csv('../data/ex0.txt', sep = '\t', names = ['x1', 'x2', 'y'])
dat['x3'] = [yi*.3 + .5*random.random() for yi in dat['y']]
dat.head()

# In[17]:

from numpy import mat, linalg, corrcoef

def standRegres(xArr, yArr):
    """Ordinary least squares via the normal equations: w = (X^T X)^{-1} X^T y."""
    xMat = mat(xArr); yMat = mat(yArr).T
    xTx = xMat.T * xMat
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * yMat)
    return ws

# In[18]:

xs = [[dat.x1[i], dat.x2[i], dat.x3[i]] for i in dat.index]
y = dat.y
print(xs[:2])
ws = standRegres(xs, y)
print(ws)

# In[14]:

xMat = mat(xs)
yMat = mat(y)
yHat = xMat * ws
xCopy = xMat.copy()
xCopy.sort(0)
yHat = xCopy * ws

# In[15]:

fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xMat[:,1].flatten().A[0], yMat.T[:,0].flatten().A[0])
ax.plot(xCopy[:,1], yHat, 'r-')
plt.ylim(0, 5)
plt.show()

# In[108]:

yHat = xMat * ws
corrcoef(yHat.T, yMat)

# # Doing Statistics with statsmodels
#
# http://www.statsmodels.org/stable/index.html
#
# statsmodels is a Python module that provides classes and functions for estimating many different statistical models, as well as for conducting statistical tests and for statistical data exploration.

# In[109]:

import statsmodels.api as sm
import statsmodels.formula.api as smf

# In[121]:

dat = pd.read_csv('ex0.txt', sep = '\t', names = ['x1', 'x2', 'y'])
dat['x3'] = [yi*.3 - .1*random.random() for yi in dat['y']]
dat.head()

# In[123]:

results = smf.ols('y ~ x2 + x3', data=dat).fit()
results.summary()

# In[124]:

fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(results, fig = fig)
plt.show()

# In[135]:

import numpy as np

X = np.array(num_friends_good)
X = sm.add_constant(X, prepend=False)
mod = sm.OLS(daily_minutes_good, X)
res = mod.fit()
print(res.summary())

# In[137]:

fig = plt.figure(figsize=(6,8))
fig = sm.graphics.plot_partregress_grid(res, fig = fig)
plt.show()
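# As a closing cross-check (a sketch under the assumption that `res`, `X`, and `np` from the cells above are still in scope), the fitted parameters and R-squared can be read off the statsmodels results object and compared with the alpha and beta used at the top of the notebook, and with numpy's least squares solver, which avoids forming $(X^T X)^{-1}$ explicitly as `standRegres` does.

# In[ ]:

print(res.params)     # [slope for num_friends_good, intercept]; expected near 0.90386 and 22.9475
print(res.rsquared)   # coefficient of determination for the simple regression

w, *_ = np.linalg.lstsq(X, daily_minutes_good, rcond=None)
print(w)              # should agree with res.params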