#!/usr/bin/env python
# coding: utf-8

# This notebook is part of my [Python data science curriculum](http://www.terran.us/articles/python_curriculum.html)

# In[1]:


import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.regressionplots as smrp  # This name is not standard
import altair as alt
alt.renderers.enable('notebook')


# In[2]:


import seaborn as sns
tips = sns.load_dataset('tips')
from plotnine.data import diamonds
diamonds_small = diamonds.sample(1000, random_state=0)
diamonds_small.to_csv('/tmp/diamonds_small.csv')


# # Linear Regression

# There are two interfaces to statsmodels: the statsmodels.api interface, which takes vectors and matrices, and the statsmodels.formula.api interface, which uses R-style formulas via the Patsy library.
# 
# http://www.statsmodels.org/stable/example_formulas.html
# https://patsy.readthedocs.io/en/latest/formulas.html
# 
# Here's a port of the ISLR ch 3 labs to Python, using a mix of sklearn and statsmodels:
# 
# http://nbviewer.jupyter.org/github/JWarmenhoven/ISL-python/blob/master/Notebooks/Chapter%203.ipynb

# ## Single Feature

# In[3]:


m = smf.ols('I(tip/total_bill) ~ total_bill', tips).fit()
m.summary()


# It appears that the "warning" about standard errors assuming the error covariance is correctly specified is _always_ printed, as a general reminder; it does not mean statsmodels actually observed your data violating that assumption.

# ---
# 
# One thing we often want to look at is a scatter plot of actual vs. predicted values:

# In[4]:


p = pd.DataFrame({'actual': tips.tip / tips.total_bill, 'predicted': m.predict()})


# In[5]:


alt.Chart(p.reset_index()).mark_point().encode(
    x='predicted', y='actual', tooltip='index'
).interactive()


# With a single feature, we can also plot the predicted and actual values as a function of that feature:

# In[6]:


tips2 = tips.assign(prediction=m.predict(tips))
c = alt.Chart(tips2.reset_index().assign(data='data').assign(fit='fit'))
c.mark_point().encode(
    x='total_bill', y='tip_pct:Q', color='data', tooltip='index'
).transform_calculate(tip_pct='datum.tip/datum.total_bill') + \
c.mark_line().encode(
    x='total_bill', y='prediction', color='fit'
)


# ## Aside: Data Description

# In[7]:


# Correlation:
tips2.corr()


# In[8]:


# Descriptive summary statistics
tips2.describe()


# ## Multiple Features
# 
# The tips data is not very satisfactory here because *no* other feature has predictive value, so let's cut over to the diamonds data instead.
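# For comparison, the `price ~ carat` fit in the next cell can also be written against the
# array interface (statsmodels.api) rather than the formula interface. A minimal sketch,
# using `sm.add_constant` to supply the intercept column that the formula interface adds
# automatically:

# In[ ]:


X = sm.add_constant(diamonds_small[['carat']])     # design matrix: intercept + carat
dm_arr = sm.OLS(diamonds_small['price'], X).fit()  # equivalent to smf.ols('price ~ carat', ...)
dm_arr.summary()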
# In[9]:


dm = smf.ols('price ~ carat', data=diamonds_small).fit()
dm.summary()


# In[10]:


dm2 = smf.ols('price ~ carat + cut + color + clarity', diamonds_small).fit()
dm2.summary()


# ---
# 
# You can do a type-2 ANOVA to look at the partial impact of each feature:

# In[11]:


sm.stats.anova_lm(dm2, typ=2)


# You can also compare one model to another:

# In[12]:


# This will produce some warnings, which I think are noise but I'm not certain;
# see https://groups.google.com/forum/#!topic/pystatsmodels/-flY0cNnb3k
sm.stats.anova_lm(dm, dm2)


# ## Standard Plots

# In[13]:


import statsmodels.graphics.regressionplots as smrp
import matplotlib
matplotlib.rcParams['figure.figsize'] = (10, 10)


# In[14]:


smrp.plot_regress_exog(dm2, -1);


# In[15]:


smrp.influence_plot(dm2);


# In[16]:


diamonds_small.loc[[26385, 39274, 16504, 20297], :]


# Seaborn also has a standard pairs plot, which isn't bad, although note that it skips the categorical variables:

# In[17]:


sns.pairplot(diamonds_small)


# # Logistic Regression
# 
# http://www.statsmodels.org/stable/glm.html
# http://nbviewer.jupyter.org/github/JWarmenhoven/ISL-python/blob/master/Notebooks/Chapter%204.ipynb#4.3-Logistic-Regression

# In[18]:


from plotnine.data import mpg
mpg.head()


# Logistic regression is available through either `logit` or `glm`. The lowercase functions (in statsmodels.formula.api) use the formula interface, while the capitalized classes (`Logit`, `GLM` in statsmodels.api) expect vectors and matrices.

# In[19]:


glm = smf.glm('year==2008 ~ displ + hwy', mpg, family=sm.families.Binomial()).fit()
glm.summary()


# A boolean cannot be used as the response with `logit`; it must be a 0/1 integer. See https://stackoverflow.com/questions/30314188/statsmodels-broadcast-shapes-different

# In[20]:


mpg['y8'] = (mpg.year == 2008).astype('int8')
lm = smf.logit('y8 ~ displ + hwy', mpg).fit()
lm.summary()


# Of the two, I think I prefer the logit() summary output. The glm() summary omits the null deviance / null log-likelihood, which is quite annoying.

# ## ROC curves

# In[21]:


# Apparently statsmodels has no ROC curve; use sklearn
from sklearn.metrics import roc_curve


# In[22]:


(fpr, tpr, thresh) = roc_curve(mpg.y8, lm.predict())
roc = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thresh': thresh})


# In[23]:


alt.Chart(roc).mark_line().encode(
    x='fpr', y='tpr', order='thresh').interactive()


# # Scratch - Ignore

# In[24]:


from plotnine.data import diamonds
try:
    smf.ols('price ~ poly(carat,4) + cut + color + clarity', diamonds).fit().summary()
except Exception as ex:
    print(repr(ex))


# In[ ]:
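# The cell above fails because Patsy has no R-style `poly()` built in: the name is resolved
# from the data and then the calling environment, and isn't found there. A sketch of one
# workaround is to spell the powers out with `I()`; note these are raw powers, not the
# orthogonal polynomials that R's `poly()` produces:

# In[ ]:


dm_poly = smf.ols('price ~ carat + I(carat**2) + I(carat**3) + I(carat**4)'
                  ' + cut + color + clarity', diamonds_small).fit()
dm_poly.summary()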