#!/usr/bin/env python
# coding: utf-8

# This notebook is part of my [Python data science curriculum](http://www.terran.us/articles/python_curriculum.html)

# In[1]:


import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

import statsmodels.graphics.regressionplots as smrp # This name is not standard

import altair as alt
alt.renderers.enable('notebook')


# In[2]:


import seaborn as sns
tips=sns.load_dataset('tips')

from plotnine.data import diamonds
diamonds_small=diamonds.sample(1000,random_state=0)
diamonds_small.to_csv('/tmp/diamonds_small.csv')


# # Linear Regression

# There are two interfaces to statsmodels: the `statsmodels.api` interface, which takes vectors and matrices, and the `statsmodels.formula.api` interface, which uses R-style formulas built by the Patsy library.
# 
# http://www.statsmodels.org/stable/example_formulas.html  
# https://patsy.readthedocs.io/en/latest/formulas.html
# 
# Here's a port of the ISLR ch 3 labs to Python, using a mix of sklearn and statsmodels:
# 
# http://nbviewer.jupyter.org/github/JWarmenhoven/ISL-python/blob/master/Notebooks/Chapter%203.ipynb
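# 
# As a rough sketch (not in the original labs), here is the same simple model fit through both interfaces; with the array interface you build the design matrix yourself and add the intercept column explicitly:

# In[ ]:


# Formula interface: Patsy builds the design matrix (intercept included) from the formula string.
m_formula = smf.ols('tip ~ total_bill', tips).fit()

# Array interface: construct the design matrix by hand and add the intercept column.
X = sm.add_constant(tips[['total_bill']])
m_array = sm.OLS(tips['tip'], X).fit()

# The two fits should produce identical coefficients.
print(m_formula.params)
print(m_array.params)
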

# ## Single Feature

# In[3]:


m=smf.ols('I(tip/total_bill) ~ total_bill', tips).fit()
m.summary()


# It appears that the "warning" that the standard errors assume the covariance matrix of the errors is correctly specified is _always_ printed, as a general reminder; it does not mean statsmodels actually detected that your data violates the assumptions.

# ---
# 
# One thing we often want to look at is a scatter plot of actual vs predicted values:

# In[4]:


p = pd.DataFrame({'actual':tips.tip/tips.total_bill, 'predicted':m.predict()})


# In[5]:


alt.Chart(p.reset_index()).mark_point().encode(
    x='predicted',y='actual',tooltip='index'
).interactive()


# With a single feature, we can also look at the predicted values against the actual values as a function of the feature:

# In[6]:


tips2 = tips.assign(prediction=m.predict(tips))
# The constant 'data' and 'fit' columns exist only to provide labels for the color legend.
c=alt.Chart(tips2.reset_index().assign(data='data').assign(fit='fit'))
c.mark_point().encode(
    x='total_bill',y='tip_pct:Q',color='data',tooltip='index'
).transform_calculate(tip_pct='datum.tip/datum.total_bill') + \
c.mark_line().encode(
    x='total_bill',y='prediction',color='fit'
)


# ## Aside: Data Description

# In[7]:


# Correlation (only the numeric columns are included):
tips2.corr()


# In[8]:


# Descriptive Summary Statistics
tips2.describe()
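

# `describe()` only summarizes the numeric columns by default; passing `include='all'` (a small aside) pulls the categorical columns in as well:

# In[ ]:


tips2.describe(include='all')  # adds count/unique/top/freq for the categorical columns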


# ## Multiple Features
# 
# The tips data is not very satisfactory because *none* of the other features have any predictive
# value, so let's cut over to the diamonds data instead.

# In[9]:


dm = smf.ols('price ~ carat', data=diamonds_small).fit()
dm.summary()


# In[10]:


dm2 = smf.ols('price ~ carat + cut + color + clarity', diamonds_small).fit()
dm2.summary()


# ---
# 
# You can do a Type II ANOVA to look at the partial impact of each feature:

# In[11]:


sm.stats.anova_lm(dm2, typ=2)  # note: the keyword is 'typ', not 'type'


# You can also compare one model to another model:

# In[12]:


# This will produce some warnings, which I think are noise but I'm not certain
# see https://groups.google.com/forum/#!topic/pystatsmodels/-flY0cNnb3k

sm.stats.anova_lm(dm,dm2)


# ## Standard Plots

# In[13]:


import statsmodels.graphics.regressionplots as smrp

import matplotlib
matplotlib.rcParams['figure.figsize'] = (10,10)


# In[14]:


smrp.plot_regress_exog(dm2,-1);  # fit, residual, partial-regression and CCPR plots for one design-matrix column (-1 = last)


# In[15]:


smrp.influence_plot(dm2);  # studentized residuals vs. leverage, bubble size ~ Cook's distance


# In[16]:


diamonds_small.loc[[26385,39274,16504,20297],]  # inspect the high-influence points flagged in the plot above


# Seaborn also has a standard pairs plot which isn't bad, although note that it skips the categorical variables:

# In[17]:


sns.pairplot(diamonds_small)
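

# As a quick aside (not in the original), one of the categorical columns can be brought back into the picture by mapping it to the hue aesthetic:

# In[ ]:


sns.pairplot(diamonds_small, hue='cut')  # same numeric panels, colored by cut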


# # Logistic Regression
# 
# http://www.statsmodels.org/stable/glm.html  
# http://nbviewer.jupyter.org/github/JWarmenhoven/ISL-python/blob/master/Notebooks/Chapter%204.ipynb#4.3-Logistic-Regression

# In[18]:


from plotnine.data import mpg
mpg.head()


# Logistic regression is available through either `logit` or `glm`.  The lowercase functions in `statsmodels.formula.api` use the formula interface, while the capitalized classes (`Logit`, `GLM`) in `statsmodels.api` expect vectors and matrices.
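# 
# As a sketch (an addition here; the formula versions follow below), the capitalized array-interface equivalent of the same model looks roughly like this:

# In[ ]:


# Array interface: the response must be numeric, so build a 0/1 vector by hand,
# then add the intercept column to the predictors.
y = (mpg.year == 2008).astype(int)
X = sm.add_constant(mpg[['displ', 'hwy']])
sm.Logit(y, X).fit().summary()
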

# In[19]:


glm = smf.glm('year==2008 ~ displ + hwy', mpg, family=sm.families.Binomial()).fit()
glm.summary()


# A boolean cannot be used as the response with `logit`; it must be a 0/1 integer.  See https://stackoverflow.com/questions/30314188/statsmodels-broadcast-shapes-different

# In[20]:


mpg['y8']=(mpg.year==2008).astype('int8')
lm = smf.logit('y8 ~ displ + hwy', mpg).fit()
lm.summary()


# Of the two, I think I prefer the `logit()` summary output.  The `glm()` summary is missing the null deviance (or null log-likelihood), which is quite annoying.
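# 
# As a partial workaround (attribute names as I understand the statsmodels results API), the null deviance is still available on the GLM results object, so a deviance-based pseudo R-squared can be computed by hand; it should agree with the McFadden `prsquared` that the logit fit reports:

# In[ ]:


# Deviance-based pseudo R-squared from the GLM fit, alongside logit's built-in McFadden R-squared.
(1 - glm.deviance / glm.null_deviance, lm.prsquared)
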

# ## ROC curves

# In[21]:


# Apparently statsmodels has no ROC curve; use sklearn
from sklearn.metrics import roc_curve


# In[22]:


(fpr,tpr,thresh)=roc_curve(mpg.y8,lm.predict())
roc=pd.DataFrame({'fpr':fpr,'tpr':tpr,'thresh':thresh})


# In[23]:


alt.Chart(roc).mark_line().encode(
    x='fpr',y='tpr',order='thresh').interactive()
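

# For a single-number summary of the curve (an addition), sklearn's `roc_auc_score` computes the area under it:

# In[ ]:


from sklearn.metrics import roc_auc_score
roc_auc_score(mpg.y8, lm.predict())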


# # Scratch - Ignore

# In[24]:


from plotnine.data import diamonds
try:
    # patsy has no built-in poly(), so this fit is expected to fail
    smf.ols('price ~ poly(carat,4) + cut + color + clarity', diamonds).fit().summary()
except Exception as ex:
    print(repr(ex))


# In[ ]: