In [1]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Seed NumPy's global RNG so the simulated data below is reproducible.
np.random.seed(123)

In [2]:

def make_data(n=20, beta=3, noise_scale=1):
    """Simulate a noisy linear relationship ``y = beta * x + noise``.

    Values are drawn from NumPy's global random state, so seed it beforehand
    for reproducible output. The draw order is fixed (first the ``n`` uniform
    values for ``x``, then the ``n`` Gaussian noise terms) so that a seeded
    call with the default arguments always produces the same sample.

    Parameters
    ----------
    n : int, default 20
        Sample size.
    beta : float, default 3
        True slope of the line; the true intercept is 0.
    noise_scale : float, default 1
        Standard deviation of the zero-mean Gaussian noise added to ``y``.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with float columns ``'x'`` (uniform on [0, 10)) and ``'y'``.
    """
    x = np.random.random(n) * 10
    noise = np.random.normal(loc=0, scale=noise_scale, size=n)
    # Build the frame column-wise instead of transposing a row-wise frame.
    return pd.DataFrame({'x': x, 'y': beta * x + noise})

In [3]:

# Draw one synthetic sample of (x, y) pairs from the generator defined above.
data = make_data()

In [4]:

# Eyeball the relationship before fitting anything.
data.plot.scatter(x='x', y='y')

Out[4]:

<AxesSubplot:xlabel='x', ylabel='y'>

Coefficient of determination (a.k.a. $R^2$)¶

In [5]:

# Regress y on x by ordinary least squares. The formula adds an intercept
# term implicitly (reported as `Intercept` in the fit summary).
results = smf.ols(formula='y ~ x', data=data).fit()

In [6]:

# In-sample predictions. The regressor is passed as a one-column DataFrame so
# the formula looks up `x` by column name instead of having to interpret a
# bare Series as either a column or a single row.
y_hat = results.predict(data[['x']])

In [7]:

# Population variances (ddof=0, NumPy's default) of the observed and the
# fitted values; both use the same ddof so their ratio is unaffected by it.
var_y = np.var(data['y'], ddof=0)
var_y_hat = np.var(y_hat, ddof=0)

In [8]:

# R² as the share of the variance of y that the fitted values reproduce.
r_squared = var_y_hat / var_y
# Squared Pearson correlation between the observed and the fitted values.
ρ_squared = np.corrcoef(data['y'], y_hat)[0, 1] ** 2

In [9]:

# The `=` specifier (Python 3.8+) prints each expression together with its value.
print(f'{var_y=:.2f}, {var_y_hat=:.2f}, {r_squared=:.4f}, {ρ_squared=:.4f}')

var_y=47.42, var_y_hat=46.22, r_squared=0.9747, ρ_squared=0.9747

In [10]:

# Sanity check: the variance-ratio `r_squared` must match the R² reported by statsmodels.
# Note: in a real implementation, degrees of freedom may matter when the sample size is small.
assert np.isclose(r_squared, results.rsquared)

Show $\rho^2 = R^2$¶

In [11]:

# The squared correlation between y and the fitted values must equal the variance-ratio R².
assert np.isclose(r_squared, ρ_squared)

More stats of the fit¶

In [12]:

# Plain-text regression table: coefficients, standard errors and fit diagnostics.
print(results.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.975
Model:                            OLS   Adj. R-squared:                  0.973
Method:                 Least Squares   F-statistic:                     693.1
Date:                Sat, 02 Jan 2021   Prob (F-statistic):           8.00e-16
Time:                        20:57:08   Log-Likelihood:                -30.204
No. Observations:                  20   AIC:                             64.41
Df Residuals:                      18   BIC:                             66.40
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.1367      0.602     -0.227      0.823      -1.401       1.128
x              2.9906      0.114     26.327      0.000       2.752       3.229
==============================================================================
Omnibus:                        1.344   Durbin-Watson:                   1.699
Prob(Omnibus):                  0.511   Jarque-Bera (JB):                0.972
Skew:                          -0.519   Prob(JB):                        0.615
Kurtosis:                       2.706   Cond. No.                         12.7
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.