import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
summarize,
poly)
import statsmodels.api as sm
# Define grid of GPA, IQ values to calculate salary
gpa = np.linspace(0, 4, 10)
iq = np.linspace(70, 130, 10)
GPA, IQ = np.meshgrid(gpa, iq)
# Calculate salary on grid
Salary_h = 50 + 20 * GPA + 0.07 * IQ + 0.01 * GPA * IQ
Salary_c = 50 + 20 * GPA + 0.07 * IQ + 35 + 0.01 * GPA * IQ - 10 * GPA
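Both surfaces come from the same underlying model with an education dummy (Level = 1 for college, 0 for high school):

salary = 50 + 20*GPA + 0.07*IQ + 35*Level + 0.01*(GPA*IQ) - 10*(GPA*Level)

Substituting Level = 0 gives Salary_h and Level = 1 gives Salary_c above.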
fig_h, ax_h = plt.subplots(subplot_kw={"projection": "3d"})
ax_h.plot_surface(GPA, IQ, Salary_h)
ax_h.set_xlabel('GPA')
ax_h.set_ylabel('IQ')
ax_h.set_zlabel('Salary')
plt.title('Predicted salary for high school graduates')
plt.show()
fig_c, ax_c = plt.subplots(subplot_kw={"projection": "3d"})
ax_c.plot_surface(GPA, IQ, Salary_c)
ax_c.set_xlabel('GPA')
ax_c.set_ylabel('IQ')
ax_c.set_zlabel('Salary')
plt.title('Predicted salary for college graduates')
plt.show()
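The difference between the two surfaces simplifies to Salary_c - Salary_h = 35 - 10*GPA, which is positive exactly when GPA < 3.5: college graduates are predicted to out-earn high school graduates only below a GPA of 3.5. A quick numerical check on the grid, using only the arrays defined above:

# The gap between the surfaces depends only on GPA
diff = Salary_c - Salary_h
assert np.allclose(diff, 35 - 10 * GPA)
# Fraction of grid points where the college surface is higher
print((diff > 0).mean())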
# Load data
Carseats = load_data('Carseats')
# Fit model
y = Carseats['Sales']
predictors = ['Price', 'Urban', 'US']  # avoid shadowing the builtin vars()
X = MS(predictors).fit_transform(Carseats)
lin_model = sm.OLS(y, X)
lin_model.fit().summary()
Dep. Variable: | Sales | R-squared: | 0.239 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.234 |
Method: | Least Squares | F-statistic: | 41.52 |
Date: | Fri, 20 Sep 2024 | Prob (F-statistic): | 2.39e-23 |
Time: | 21:32:27 | Log-Likelihood: | -927.66 |
No. Observations: | 400 | AIC: | 1863. |
Df Residuals: | 396 | BIC: | 1879. |
Df Model: | 3 | | |
Covariance Type: | nonrobust | | |
  | coef | std err | t | P>|t| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
intercept | 13.0435 | 0.651 | 20.036 | 0.000 | 11.764 | 14.323 |
Price | -0.0545 | 0.005 | -10.389 | 0.000 | -0.065 | -0.044 |
Urban[Yes] | -0.0219 | 0.272 | -0.081 | 0.936 | -0.556 | 0.512 |
US[Yes] | 1.2006 | 0.259 | 4.635 | 0.000 | 0.691 | 1.710 |
Omnibus: | 0.676 | Durbin-Watson: | 1.912 |
---|---|---|---|
Prob(Omnibus): | 0.713 | Jarque-Bera (JB): | 0.758 |
Skew: | 0.093 | Prob(JB): | 0.684 |
Kurtosis: | 2.897 | Cond. No. | 628. |
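Reading the coefficients off the table, the fitted model (with Urban and US as 0/1 dummies) is

Sales = 13.0435 - 0.0545*Price - 0.0219*Urban[Yes] + 1.2006*US[Yes]

Price and US are highly significant, but Urban's p-value of 0.936 gives no evidence of an association with Sales, so we refit without it: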
predictors2 = ['Price', 'US']
X2 = MS(predictors2).fit_transform(Carseats)
lin_model2 = sm.OLS(y, X2)
lin_model2.fit().summary()
Dep. Variable: | Sales | R-squared: | 0.239 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.235 |
Method: | Least Squares | F-statistic: | 62.43 |
Date: | Fri, 20 Sep 2024 | Prob (F-statistic): | 2.66e-24 |
Time: | 21:32:27 | Log-Likelihood: | -927.66 |
No. Observations: | 400 | AIC: | 1861. |
Df Residuals: | 397 | BIC: | 1873. |
Df Model: | 2 | | |
Covariance Type: | nonrobust | | |
  | coef | std err | t | P>|t| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
intercept | 13.0308 | 0.631 | 20.652 | 0.000 | 11.790 | 14.271 |
Price | -0.0545 | 0.005 | -10.416 | 0.000 | -0.065 | -0.044 |
US[Yes] | 1.1996 | 0.258 | 4.641 | 0.000 | 0.692 | 1.708 |
Omnibus: | 0.666 | Durbin-Watson: | 1.912 |
---|---|---|---|
Prob(Omnibus): | 0.717 | Jarque-Bera (JB): | 0.749 |
Skew: | 0.092 | Prob(JB): | 0.688 |
Kurtosis: | 2.895 | Cond. No. | 607. |
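The 95% confidence intervals for the remaining coefficients can be read straight off the fitted results; conf_int is a standard statsmodels results method (alpha=0.05 is also its default):

# 95% confidence intervals for the reduced model
results2 = lin_model2.fit()
print(results2.conf_int(alpha=0.05))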
# Load data
Boston = load_data("Boston")
# Extract dependent variable
y = Boston['crim']
# Blank plot
fig = plt.figure(1)
# Current subplot index
i = 1
# To store coefficients from simple regressions
simple_coeff = []
# Check significance of each simple model then output scatterplot
# if there's a statistically significant association
for col in Boston.columns.drop('crim'):
    # Fit simple model, get coefficient and p-values
    X = MS([col]).fit_transform(Boston)
    simple_model = sm.OLS(y, X)
    simple_model_fit = simple_model.fit()
    # Store the slope as a scalar (avoids a deprecated float-on-Series conversion later)
    simple_coeff.append(summarize(simple_model_fit)['coef'].iloc[1])
    pvalues = simple_model_fit.pvalues
    # Check if variable is statistically significant
    if pvalues[col] < 0.05:
        # Add scatterplot as subplot
        ax = fig.add_subplot(3, 4, i)
        ax.scatter(Boston[col], y, s=0.5)
        ax.set_xlabel(col)
        # Increment index
        i += 1
# Finish plot
fig.suptitle('Scatterplots for significant variables')
fig.supylabel('crim')
fig.tight_layout()
plt.show()
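For a compact view of which predictors cleared the 0.05 threshold, the same fits can be condensed into a Series of slope p-values; a small sketch using only objects already defined:

# Slope p-value from each predictor's simple regression on crim
slope_pvalues = pd.Series({
    col: sm.OLS(y, MS([col]).fit_transform(Boston)).fit().pvalues[col]
    for col in Boston.columns.drop('crim')})
print(slope_pvalues.sort_values())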
# Fit full model, get coefficients and p-values
X_full = MS(Boston.columns.drop('crim')).fit_transform(Boston)
full_model = sm.OLS(y, X_full)
full_model_fit = full_model.fit()
multi_coeff = summarize(full_model_fit)['coef'][1:]
summarize(full_model_fit)
  | coef | std err | t | P>|t| |
---|---|---|---|---|
intercept | 13.7784 | 7.082 | 1.946 | 0.052 |
zn | 0.0457 | 0.019 | 2.433 | 0.015 |
indus | -0.0584 | 0.084 | -0.698 | 0.486 |
chas | -0.8254 | 1.183 | -0.697 | 0.486 |
nox | -9.9576 | 5.290 | -1.882 | 0.060 |
rm | 0.6289 | 0.607 | 1.036 | 0.301 |
age | -0.0008 | 0.018 | -0.047 | 0.962 |
dis | -1.0122 | 0.282 | -3.584 | 0.000 |
rad | 0.6125 | 0.088 | 6.997 | 0.000 |
tax | -0.0038 | 0.005 | -0.730 | 0.466 |
ptratio | -0.3041 | 0.186 | -1.632 | 0.103 |
lstat | 0.1388 | 0.076 | 1.833 | 0.067 |
medv | -0.2201 | 0.060 | -3.678 | 0.000 |
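Before plotting, it helps to line the two sets of slope estimates up in a single table; a minimal sketch built from the objects above (simple_coeff holds the univariate slopes in column order):

# Univariate vs. multiple-regression slope for each predictor
coef_compare = pd.DataFrame(
    {'simple': simple_coeff, 'multiple': multi_coeff.values},
    index=Boston.columns.drop('crim'))
print(coef_compare)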
fig, ax = plt.subplots()
ax.scatter(simple_coeff, multi_coeff, s=3)
ax.set_xlabel('Simple regression coefficient')
ax.set_ylabel('Multiple regression coefficient')
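To tell the points apart, each can be labeled with its predictor name; a minimal sketch using matplotlib's annotate on the axes created above:

# Label each point with the predictor it represents
for name, xv, yv in zip(Boston.columns.drop('crim'), simple_coeff, multi_coeff):
    ax.annotate(name, (xv, yv), fontsize=8)
plt.show()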
# Fit each cubic model and print summary
for col in Boston.columns.drop('crim'):
    print(col)
    X = MS([poly(col, degree=3, raw=True)]).fit_transform(Boston)
    model = sm.OLS(y, X)
    results = model.fit()
    print(summarize(results))
    print("\n-----\n")
zn
                                       coef   std err       t  P>|t|
intercept                          4.846100  0.433000  11.192  0.000
poly(zn, degree=3, raw=True)[0]   -0.332200  0.110000  -3.025  0.003
poly(zn, degree=3, raw=True)[1]    0.006500  0.004000   1.679  0.094
poly(zn, degree=3, raw=True)[2]   -0.000038  0.000031  -1.203  0.230

-----

indus
                                       coef  std err       t  P>|t|
intercept                            3.6626    1.574   2.327   0.02
poly(indus, degree=3, raw=True)[0]  -1.9652    0.482  -4.077   0.00
poly(indus, degree=3, raw=True)[1]   0.2519    0.039   6.407   0.00
poly(indus, degree=3, raw=True)[2]  -0.0070    0.001  -7.292   0.00

-----

chas
                                            coef       std err       t  P>|t|
intercept                           3.744400e+00  3.970000e-01   9.444  0.000
poly(chas, degree=3, raw=True)[0]  -4.770000e+13  1.150000e+14  -0.414  0.679
poly(chas, degree=3, raw=True)[1]   2.385000e+13  5.770000e+13   0.414  0.679
poly(chas, degree=3, raw=True)[2]   2.385000e+13  5.770000e+13   0.414  0.679

-----

nox
                                        coef  std err       t  P>|t|
intercept                           233.0866   33.643   6.928    0.0
poly(nox, degree=3, raw=True)[0]  -1279.3713  170.397  -7.508    0.0
poly(nox, degree=3, raw=True)[1]   2248.5441  279.899   8.033    0.0
poly(nox, degree=3, raw=True)[2]  -1245.7029  149.282  -8.345    0.0

-----

rm
                                      coef  std err       t  P>|t|
intercept                         112.6246   64.517   1.746  0.081
poly(rm, degree=3, raw=True)[0]   -39.1501   31.311  -1.250  0.212
poly(rm, degree=3, raw=True)[1]     4.5509    5.010   0.908  0.364
poly(rm, degree=3, raw=True)[2]    -0.1745    0.264  -0.662  0.509

-----

age
                                       coef   std err       t  P>|t|
intercept                         -2.548800  2.769000  -0.920  0.358
poly(age, degree=3, raw=True)[0]   0.273700  0.186000   1.468  0.143
poly(age, degree=3, raw=True)[1]  -0.007200  0.004000  -1.988  0.047
poly(age, degree=3, raw=True)[2]   0.000057  0.000021   2.724  0.007

-----

dis
                                      coef  std err       t  P>|t|
intercept                          30.0476    2.446  12.285    0.0
poly(dis, degree=3, raw=True)[0]  -15.5544    1.736  -8.960    0.0
poly(dis, degree=3, raw=True)[1]    2.4521    0.346   7.078    0.0
poly(dis, degree=3, raw=True)[2]   -0.1186    0.020  -5.814    0.0

-----

rad
                                      coef  std err       t  P>|t|
intercept                          -0.6055    2.050  -0.295  0.768
poly(rad, degree=3, raw=True)[0]    0.5127    1.044   0.491  0.623
poly(rad, degree=3, raw=True)[1]   -0.0752    0.149  -0.506  0.613
poly(rad, degree=3, raw=True)[2]    0.0032    0.005   0.703  0.482

-----

tax
                                            coef       std err       t  P>|t|
intercept                           1.918360e+01  1.179600e+01   1.626  0.105
poly(tax, degree=3, raw=True)[0]   -1.533000e-01  9.600000e-02  -1.602  0.110
poly(tax, degree=3, raw=True)[1]    4.000000e-04  0.000000e+00   1.488  0.137
poly(tax, degree=3, raw=True)[2]   -2.204000e-07  1.890000e-07  -1.167  0.244

-----

ptratio
                                          coef  std err       t  P>|t|
intercept                             477.1840  156.795   3.043  0.002
poly(ptratio, degree=3, raw=True)[0]  -82.3605   27.644  -2.979  0.003
poly(ptratio, degree=3, raw=True)[1]    4.6353    1.608   2.882  0.004
poly(ptratio, degree=3, raw=True)[2]   -0.0848    0.031  -2.743  0.006

-----

lstat
                                       coef  std err       t  P>|t|
intercept                            1.2010    2.029   0.592  0.554
poly(lstat, degree=3, raw=True)[0]  -0.4491    0.465  -0.966  0.335
poly(lstat, degree=3, raw=True)[1]   0.0558    0.030   1.852  0.065
poly(lstat, degree=3, raw=True)[2]  -0.0009    0.001  -1.517  0.130

-----

medv
                                       coef  std err        t  P>|t|
intercept                           53.1655    3.356   15.840    0.0
poly(medv, degree=3, raw=True)[0]   -5.0948    0.434  -11.744    0.0
poly(medv, degree=3, raw=True)[1]    0.1555    0.017    9.046    0.0
poly(medv, degree=3, raw=True)[2]   -0.0015    0.000   -7.312    0.0

-----
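Two notes on the output above. First, the huge chas coefficients are an artifact: chas is a 0/1 dummy, so chas, its square, and its cube are identical columns and the cubic design matrix is rank-deficient. Second, rather than eyeballing the p-values of the higher-order terms, a linear fit can be compared against the cubic fit with a nested-model F-test; a minimal sketch for a single predictor ('nox' is an arbitrary choice), using statsmodels' anova_lm:

# F-test: does the cubic fit improve on the linear fit for nox?
X_lin = MS([poly('nox', degree=1, raw=True)]).fit_transform(Boston)
X_cub = MS([poly('nox', degree=3, raw=True)]).fit_transform(Boston)
fit_lin = sm.OLS(y, X_lin).fit()
fit_cub = sm.OLS(y, X_cub).fit()
print(sm.stats.anova_lm(fit_lin, fit_cub))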