In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Fixed seed for NumPy's legacy global RNG, so that re-running the notebook
# from a fresh kernel reproduces the same random draws.
SEED = 123
np.random.seed(SEED)
In [2]:
def make_data(n=20, beta=3, noise_scale=1):
    """Simulate a noisy linear relationship ``y = beta * x + noise``.

    Draws come from NumPy's legacy global RNG (``np.random``), so seed it
    beforehand for reproducible output. ``x`` is drawn before the noise.

    Parameters
    ----------
    n : int, default 20
        Sample size (number of rows).
    beta : float, default 3
        True slope of the simulated line.
    noise_scale : float, default 1
        Standard deviation of the zero-mean Gaussian noise added to ``y``.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with columns ``'x'`` (uniform on [0, 10)) and ``'y'``.
    """
    x = np.random.random(n) * 10
    noise = np.random.normal(loc=0, scale=noise_scale, size=n)
    y = beta * x + noise
    # Build the frame column-wise from a dict rather than row-wise + transpose.
    return pd.DataFrame({'x': x, 'y': y})
In [3]:
# Draw the simulated sample once; the cells below all reuse this frame.
data = make_data()
In [4]:
# Scatter plot of the simulated sample: x on the horizontal axis, y on the vertical.
data.plot.scatter(x='x', y='y')
Out[4]:
<AxesSubplot:xlabel='x', ylabel='y'>

Coefficient of determination (aka. $R^2$)

In [5]:
# Ordinary least squares of y on x; the fit includes an intercept term.
ols_model = smf.ols(formula='y ~ x', data=data)
results = ols_model.fit()
In [6]:
# In-sample predictions: evaluate the fitted line at the observed x values.
y_hat = results.predict(exog=data[['x']])
In [7]:
# Population variances (ddof=0, NumPy's default) of the observed and fitted values.
var_y = np.var(data['y'], ddof=0)
var_y_hat = np.var(y_hat, ddof=0)
In [8]:
# R² as the share of the variance of y that the fitted values reproduce.
r_squared = var_y_hat / var_y
# Squared Pearson correlation between the observed and the fitted values.
correlation_matrix = np.corrcoef(data['y'], y_hat)
ρ_squared = correlation_matrix[0, 1] ** 2
In [9]:
# The `=` specifier makes each field print as "name=value".
report = f'{var_y=:.2f}, {var_y_hat=:.2f}, {r_squared=:.4f}, {ρ_squared=:.4f}'
print(report)
var_y=47.42, var_y_hat=46.22, r_squared=0.9747, ρ_squared=0.9747
In [10]:
# Sanity check: the variance ratio should agree with the R² that statsmodels reports.
# Note: in a real implementation, degrees of freedom may matter when the sample is small.
r_squared_matches_fit = np.isclose(r_squared, results.rsquared)
assert r_squared_matches_fit

Show that $\rho^2 = R^2$, where $\rho$ is the correlation between $y$ and $\hat{y}$

In [11]:
# The squared correlation between y and y_hat should equal the variance-ratio R².
r_squared_matches_rho = np.isclose(r_squared, ρ_squared)
assert r_squared_matches_rho

More stats of the fit

In [12]:
# Full statsmodels regression report, printed as plain text.
fit_summary = results.summary()
print(fit_summary)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.975
Model:                            OLS   Adj. R-squared:                  0.973
Method:                 Least Squares   F-statistic:                     693.1
Date:                Sat, 02 Jan 2021   Prob (F-statistic):           8.00e-16
Time:                        20:57:08   Log-Likelihood:                -30.204
No. Observations:                  20   AIC:                             64.41
Df Residuals:                      18   BIC:                             66.40
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.1367      0.602     -0.227      0.823      -1.401       1.128
x              2.9906      0.114     26.327      0.000       2.752       3.229
==============================================================================
Omnibus:                        1.344   Durbin-Watson:                   1.699
Prob(Omnibus):                  0.511   Jarque-Bera (JB):                0.972
Skew:                          -0.519   Prob(JB):                        0.615
Kurtosis:                       2.706   Cond. No.                         12.7
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.