#!/usr/bin/env python
# coding: utf-8

# # Setup

# In[2]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)
import statsmodels.api as sm


# # Q1

# ## (b)

# In[3]:


# Define a grid of GPA and IQ values at which to evaluate predicted salary
gpa = np.linspace(0, 4, 10)
iq = np.linspace(70, 130, 10)
GPA, IQ = np.meshgrid(gpa, iq)

# Predicted salary on the grid, for high school graduates (Level = 0)
# and college graduates (Level = 1)
Salary_h = 50 + 20 * GPA + 0.07 * IQ + 0.01 * GPA * IQ
Salary_c = 50 + 20 * GPA + 0.07 * IQ + 35 + 0.01 * GPA * IQ - 10 * GPA


# In[4]:


fig_h, ax_h = plt.subplots(subplot_kw={"projection": "3d"})
ax_h.plot_surface(GPA, IQ, Salary_h)
ax_h.set_xlabel('GPA')
ax_h.set_ylabel('IQ')
ax_h.set_zlabel('Salary')
plt.title('Predicted salary for high school graduates')
plt.show()


# In[5]:


fig_c, ax_c = plt.subplots(subplot_kw={"projection": "3d"})
ax_c.plot_surface(GPA, IQ, Salary_c)
ax_c.set_xlabel('GPA')
ax_c.set_ylabel('IQ')
ax_c.set_zlabel('Salary')
plt.title('Predicted salary for college graduates')
plt.show()


# # Q4

# ## (a)

# In[6]:


# Load data
Carseats = load_data('Carseats')

# Regress Sales on Price, Urban and US
y = Carseats['Sales']
predictors = ['Price', 'Urban', 'US']
X = MS(predictors).fit_transform(Carseats)
lin_model = sm.OLS(y, X)
lin_model.fit().summary()


# ## (e)

# In[7]:


# Refit using only the predictors found significant in (a)
predictors2 = ['Price', 'US']
X2 = MS(predictors2).fit_transform(Carseats)
lin_model2 = sm.OLS(y, X2)
lin_model2.fit().summary()


# ## (g)

# # Q5

# ## (a)

# In[17]:


# Load data
Boston = load_data("Boston")

# Extract response variable
y = Boston['crim']

# Blank figure to hold the scatterplots
fig = plt.figure(1)

# Current subplot index
i = 1

# To store the slope from each simple regression
simple_coeff = []

# Fit a simple regression of crim on each predictor, and draw a
# scatterplot for every predictor with a statistically significant
# association
for col in Boston.columns.drop('crim'):
    # Fit the simple model; record its slope and p-values
    X = MS([col]).fit_transform(Boston)
    simple_model = sm.OLS(y, X)
    simple_model_fit = simple_model.fit()
    simple_coeff.append(summarize(simple_model_fit)['coef'].iloc[1])
    pvalues = simple_model_fit.pvalues

    # Check whether the predictor is statistically significant
    if pvalues[col] < 0.05:
        # Add scatterplot as a subplot
        ax = fig.add_subplot(3, 4, i)
        ax.scatter(Boston[col], y, s=0.5)
        ax.set_xlabel(col)

        # Increment subplot index
        i += 1

# Finish plot
fig.suptitle('Scatterplots for significant variables')
fig.supylabel('crim')
fig.tight_layout()
plt.show()


# ## (b)

# In[20]:


# Fit the full multiple regression model; record its coefficients
X_full = MS(Boston.columns.drop('crim')).fit_transform(Boston)
full_model = sm.OLS(y, X_full)
full_model_fit = full_model.fit()
multi_coeff = summarize(full_model_fit)['coef'][1:]
summarize(full_model_fit)


# ## (c)

# In[26]:


# Plot each predictor's simple regression slope against its
# multiple regression coefficient
fig, ax = plt.subplots()
ax.scatter(simple_coeff, multi_coeff, s=3)
ax.set_xlabel('Simple regression coefficient')
ax.set_ylabel('Multiple regression coefficient')
plt.show()


# ## (d)

# In[30]:


# Fit a cubic polynomial in each predictor and print its summary.
# 'chas' is skipped: it is binary, so its square and cube equal the
# variable itself and the cubic design matrix would be singular.
for col in Boston.columns.drop(['crim', 'chas']):
    print(col)
    X = MS([poly(col, 3, raw=True)]).fit_transform(Boston)
    model = sm.OLS(y, X)
    results = model.fit()
    print(summarize(results))
    print("\n-----\n")
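

# A quick check on Q1 (b): the two surfaces differ by 35 - 10 * GPA, so
# the college surface lies above the high school one only when GPA < 3.5.
# The cell below is a minimal sketch verifying that algebra on the grid.

# In[ ]:


# Difference between the college and high school surfaces
diff = Salary_c - Salary_h
print(np.allclose(diff, 35 - 10 * GPA))  # True: difference is 35 - 10 * GPA
print('Surfaces cross at GPA =', 35 / 10)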
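

# A tabular companion to the scatterplot in Q5 (c): a minimal sketch
# pairing each predictor's simple regression slope with its multiple
# regression coefficient. Both were collected in the same predictor
# order (both loops run over Boston.columns.drop('crim')), so they can
# be placed side by side directly.

# In[ ]:


# Side-by-side comparison of the two sets of coefficients
coeff_compare = pd.DataFrame(
    {'simple': simple_coeff,
     'multiple': multi_coeff.values},
    index=Boston.columns.drop('crim'))
print(coeff_compare)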
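

# A more formal follow-up to Q5 (d): rather than eyeballing the cubic
# coefficients, the nested linear and cubic fits can be compared with an
# F-test via sm.stats.anova_lm. This cell is a minimal sketch of that
# comparison; a small p-value is evidence of a non-linear association
# with crim.

# In[ ]:


for col in Boston.columns.drop(['crim', 'chas']):
    # Nested models: linear vs. cubic in the same predictor
    X_lin = MS([col]).fit_transform(Boston)
    X_cub = MS([poly(col, 3, raw=True)]).fit_transform(Boston)
    fit_lin = sm.OLS(y, X_lin).fit()
    fit_cub = sm.OLS(y, X_cub).fit()
    # F-test comparing the two fits
    pval = sm.stats.anova_lm(fit_lin, fit_cub)['Pr(>F)'].iloc[1]
    print(f'{col}: p-value for cubic vs. linear = {pval:.4g}')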