#!/usr/bin/env python
# coding: utf-8

# # Statistical Modeling with Python
#
# `statsmodels` is better suited for traditional stats

# statsmodels.api uses numpy array notation;
# statsmodels.formula.api uses formula notation (similar to R's formula notation)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

# ## A minimal OLS example
# Four pairs of points
x = np.array([1, 2, 3, 4])
y = np.array([2, 6, 4, 8])

# Scatter plot of the raw data
plt.scatter(x, y, marker='.')
plt.xlim(0, 5)
plt.ylim(0, 10)
plt.show()

# make a dataframe of our data
d = pd.DataFrame({'x': x, 'y': y})
print(d)

# Seaborn lmplot: scatter + fitted regression line with confidence band
sns.lmplot(x='x', y='y', data=d)
# FIX: plt.show() is required in script form -- without it the lmplot
# figure is never displayed (a notebook renders it implicitly).
plt.show()

# ## Formula notation with statsmodels
# use statsmodels.formula.api (often imported as smf)

# data is in a dataframe; 'y ~ x' means "regress y on x (with an intercept)"
model = smf.ols('y ~ x', data=d)

# estimation of coefficients is not done until you call fit() on the model
results = model.fit()

print(results.summary())

# Using the abline_plot function for plotting the fitted line over the data
sm.graphics.abline_plot(model_results=results)
plt.scatter(d.x, d.y)
plt.xlim(0, 5)
plt.ylim(0, 10)
plt.show()

# Generating an anova table
print(sm.stats.anova_lm(results))

# Making predictions for a new observation.
# FIX: wrapped in print() -- a bare expression only echoes its value in a
# notebook cell; in a script it silently produces nothing.
print(results.predict({'x': 2}))

# ## numpy array notation
# similar to sklearn's notation
print(x)

# need to add a constant for the intercept term.
X = sm.add_constant(x)
# because we are using the numpy notation, we use sm rather than smf
# FIX: print() so the design matrix is visible when run as a script
# (a bare expression only echoes in a notebook cell).
print(X)

# $$y_i = \beta_0 + \beta_1 x_i + \epsilon_i$$
#
# $$\mathbf{\hat{Y}} = \mathbf{X} \boldsymbol{\beta}$$
#
# FIX: the design matrix left-multiplies the coefficient vector
# (the original markdown had \beta X, which is inconsistent with
# the column-vector convention used by sm.OLS(y, X) below).

# OLS is capitalized in the numpy notation
model2 = sm.OLS(y, X)
results2 = model2.fit()
print(results2.summary())

# ## OLS solution (closed form, the "normal equations")
#
# $$(X^TX)^{-1}X^TY$$
print(X)

# FIX: solve the normal equations X^T X beta = X^T y directly.
# np.linalg.solve is preferred over forming the explicit inverse with
# np.linalg.inv -- it is faster and numerically more stable, and gives
# the same coefficient vector here.
print(np.linalg.solve(X.T @ X, X.T @ y))

# ## Plot Interaction of Categorical Factors
#
# https://www.statsmodels.org/dev/examples/notebooks/generated/categorical_interaction_plot.html
#
# In this example, we will visualize the interaction between categorical
# factors. First, we will create some categorical data. Then, we will plot it
# using the interaction_plot function, which internally re-codes the x-factor
# categories to integers.

# https://stackoverflow.com/questions/55663474/interaction-plot-from-statsmodels-formula-api-using-python
# NOTE: these re-imports are notebook-export artifacts; they are harmless
# no-ops and are kept so this section stands alone if pasted elsewhere.
import pandas as pd
from statsmodels.formula.api import ols

Consumption = [51, 52, 53, 54, 56, 57, 55, 56, 58, 59, 62, 63]
Gender = ["Male", "Male", "Male", "Male", "Male", "Male",
          "Female", "Female", "Female", "Female", "Female", "Female"]
Income = [80, 80, 90, 90, 100, 100, 80, 80, 90, 90, 100, 100]

df = pd.DataFrame(
    {"Consumption": Consumption, "Gender": Gender, "Income": Income})
print(df)

# NOTE(review): "Gender + Income + Gender*Income" is redundant -- patsy
# expands Gender*Income to Gender + Income + Gender:Income and drops the
# duplicated main effects, so the fitted model is identical to the shorter
# "Consumption ~ Gender * Income". Kept verbatim to preserve behavior.
Reg = ols(formula="Consumption ~ Gender + Income + Gender*Income", data=df)
Fit = Reg.fit()
# FIX: print() so the summary appears when run as a script.
print(Fit.summary())

import matplotlib.pyplot as plt
from statsmodels.graphics.factorplots import interaction_plot

plt.style.use('ggplot')
# One trace (line) per Gender level; x-axis categories are re-coded to
# integers internally by interaction_plot.
fig = interaction_plot(
    x=Income,
    trace=Gender,
    response=Fit.fittedvalues,
    colors=['red', 'blue'],
    markers=['D', '^'])
plt.xlabel('Income')
plt.ylabel('Consumption')
plt.legend().set_title(None)
plt.show()