#!/usr/bin/env python
# coding: utf-8

# # Setup

# In[2]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)
import statsmodels.api as sm


# # Q1

# ## (b)

# In[3]:


# Define a grid of GPA and IQ values at which to evaluate predicted salary
gpa = np.linspace(0, 4, 10)
iq = np.linspace(70, 130, 10)
GPA, IQ = np.meshgrid(gpa, iq)

# Predicted salary on the grid, for high school graduates (Level = 0)
# and college graduates (Level = 1)
Salary_h = 50 + 20 * GPA + 0.07 * IQ + 0.01 * GPA * IQ
Salary_c = 50 + 20 * GPA + 0.07 * IQ + 35 + 0.01 * GPA * IQ - 10 * GPA


# In[4]:


fig_h, ax_h = plt.subplots(subplot_kw={"projection": "3d"})
ax_h.plot_surface(GPA, IQ, Salary_h)
ax_h.set_xlabel('GPA')
ax_h.set_ylabel('IQ')
ax_h.set_zlabel('Salary')
plt.title('Predicted salary for high school graduates')
plt.show()


# In[5]:


fig_c, ax_c = plt.subplots(subplot_kw={"projection": "3d"})
ax_c.plot_surface(GPA, IQ, Salary_c)
ax_c.set_xlabel('GPA')
ax_c.set_ylabel('IQ')
ax_c.set_zlabel('Salary')
plt.title('Predicted salary for college graduates')
plt.show()


# # Q4

# ## (a)

# In[6]:


# Load data
Carseats = load_data('Carseats')

# Regress Sales on Price, Urban and US
y = Carseats['Sales']
predictors = ['Price', 'Urban', 'US']
X = MS(predictors).fit_transform(Carseats)
lin_model = sm.OLS(y, X)
lin_model.fit().summary()


# ## (e)

# In[7]:


# Refit using only the predictors found significant in (a)
predictors2 = ['Price', 'US']
X2 = MS(predictors2).fit_transform(Carseats)
lin_model2 = sm.OLS(y, X2)
lin_model2.fit().summary()


# ## (g)

# # Q5

# ## (a)

# In[17]:


# Load data
Boston = load_data("Boston")

# Extract response variable
y = Boston['crim']

# Blank figure to hold the scatterplots
fig = plt.figure(1)

# Current subplot index
i = 1

# To store the slope from each simple regression
simple_coeff = []

# Fit a simple regression of crim on each predictor, and draw a
# scatterplot for every predictor with a statistically significant
# association
for col in Boston.columns.drop('crim'):
    # Fit the simple model; record its slope and p-values
    X = MS([col]).fit_transform(Boston)
    simple_model = sm.OLS(y, X)
    simple_model_fit = simple_model.fit()
    simple_coeff.append(summarize(simple_model_fit)['coef'].iloc[1])
    pvalues = simple_model_fit.pvalues

    # Check whether the predictor is statistically significant
    if pvalues[col] < 0.05:
        # Add scatterplot as a subplot
        ax = fig.add_subplot(3, 4, i)
        ax.scatter(Boston[col], y, s=0.5)
        ax.set_xlabel(col)

        # Increment subplot index
        i += 1

# Finish plot
fig.suptitle('Scatterplots for significant variables')
fig.supylabel('crim')
fig.tight_layout()
plt.show()


# ## (b)

# In[20]:


# Fit the full multiple regression model; record its coefficients
X_full = MS(Boston.columns.drop('crim')).fit_transform(Boston)
full_model = sm.OLS(y, X_full)
full_model_fit = full_model.fit()
multi_coeff = summarize(full_model_fit)['coef'][1:]
summarize(full_model_fit)


# ## (c)

# In[26]:


# Plot each predictor's simple regression slope against its
# multiple regression coefficient
fig, ax = plt.subplots()
ax.scatter(simple_coeff, multi_coeff, s=3)
ax.set_xlabel('Simple regression coefficient')
ax.set_ylabel('Multiple regression coefficient')
plt.show()


# ## (d)

# In[30]:


# Fit a cubic polynomial in each predictor and print its summary.
# 'chas' is skipped: it is binary, so its square and cube equal the
# variable itself and the cubic design matrix would be singular.
for col in Boston.columns.drop(['crim', 'chas']):
    print(col)
    X = MS([poly(col, 3, raw=True)]).fit_transform(Boston)
    model = sm.OLS(y, X)
    results = model.fit()
    print(summarize(results))
    print("\n-----\n")
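

# A quick check on Q1 (b): the two surfaces differ by 35 - 10 * GPA, so
# the college surface lies above the high school one only when GPA < 3.5.
# The cell below is a minimal sketch verifying that algebra on the grid.

# In[ ]:


# Difference between the college and high school surfaces
diff = Salary_c - Salary_h
print(np.allclose(diff, 35 - 10 * GPA))  # True: difference is 35 - 10 * GPA
print('Surfaces cross at GPA =', 35 / 10)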
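

# A tabular companion to the scatterplot in Q5 (c): a minimal sketch
# pairing each predictor's simple regression slope with its multiple
# regression coefficient. Both were collected in the same predictor
# order (both loops run over Boston.columns.drop('crim')), so they can
# be placed side by side directly.

# In[ ]:


# Side-by-side comparison of the two sets of coefficients
coeff_compare = pd.DataFrame(
    {'simple': simple_coeff,
     'multiple': multi_coeff.values},
    index=Boston.columns.drop('crim'))
print(coeff_compare)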
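

# A more formal follow-up to Q5 (d): rather than eyeballing the cubic
# coefficients, the nested linear and cubic fits can be compared with an
# F-test via sm.stats.anova_lm. This cell is a minimal sketch of that
# comparison; a small p-value is evidence of a non-linear association
# with crim.

# In[ ]:


for col in Boston.columns.drop(['crim', 'chas']):
    # Nested models: linear vs. cubic in the same predictor
    X_lin = MS([col]).fit_transform(Boston)
    X_cub = MS([poly(col, 3, raw=True)]).fit_transform(Boston)
    fit_lin = sm.OLS(y, X_lin).fit()
    fit_cub = sm.OLS(y, X_cub).fit()
    # F-test comparing the two fits
    pval = sm.stats.anova_lm(fit_lin, fit_cub)['Pr(>F)'].iloc[1]
    print(f'{col}: p-value for cubic vs. linear = {pval:.4g}')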