#!/usr/bin/env python
# coding: utf-8

# # Linear Modeling in Python
# 
# This notebook, which was created and presented by Skipper Seabold at SciPy 2012, introduces the use of pandas and the formula framework in statsmodels in the context of linear modeling.
# 
# **It is based heavily on Jonathan Taylor's [class notes that use R](http://www.stanford.edu/class/stats191/interactions.html)**

# In[1]:

import matplotlib.pyplot as plt
import pandas
import numpy as np

from statsmodels.formula.api import ols
from statsmodels.graphics.api import interaction_plot, abline_plot, qqplot
from statsmodels.stats.api import anova_lm

# In[2]:

get_ipython().run_line_magic('matplotlib', 'inline')

# ## Example 1: IT salary data
# 
# In this example we will first set up a linear model of salary as a function of experience, education, and management level.
# We will then test for interactions between these factors; significant interactions will be included in the model.
# Finally, we will remove outliers and plot the resulting fits.

# ### Get the data
# 
# Outcome:
# 
#     S - salaries for IT staff in a corporation
# 
# Predictors:
# 
#     X - experience, in years
#     M - management, 2 levels: 0=non-management, 1=management
#     E - education, 3 levels: 1=Bachelor's, 2=Master's, 3=Ph.D.

# In[3]:

url = 'http://stats191.stanford.edu/data/salary.table'
salary_table = pandas.read_table(url)  # needs pandas 0.7.3
#salary_table.to_csv('salary.table', index=False)

# ### Inspect the data

# In[4]:

print(salary_table.head(10))

# In[5]:

E = salary_table.E  # Education
M = salary_table.M  # Management
X = salary_table.X  # Experience
S = salary_table.S  # Salary

# Let's explore the data

# In[6]:

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, xlabel='Experience', ylabel='Salary',
                     xlim=(0, 27), ylim=(9600, 28800))

symbols = ['D', '^']
man_label = ["Non-Mgmt", "Mgmt"]
educ_label = ["Bachelors", "Masters", "PhD"]
colors = ['r', 'g', 'blue']

factor_groups = salary_table.groupby(['E', 'M'])
for values, group in factor_groups:
    i, j = values
    label = "%s - %s" % (man_label[j], educ_label[i-1])
    ax.scatter(group['X'], group['S'], marker=symbols[j], color=colors[i-1],
               s=350, label=label)

ax.legend(scatterpoints=1, markerscale=.7, labelspacing=1);

# ### Define and Fit a Linear Model
# 
# Fit a linear model
# 
# $$S_i = \beta_0 + \beta_1X_i + \beta_2E_{i2} + \beta_3E_{i3} + \beta_4M_i + \epsilon_i$$
# 
# where
# 
# $$ E_{i2}=\cases{1,&if $E_i=2$;\cr 0,&otherwise. \cr}$$
# $$ E_{i3}=\cases{1,&if $E_i=3$;\cr 0,&otherwise. \cr}$$

# In the following, the model is defined using "patsy":
# 
# - An intercept is automatically included.
# - C(variable) treats the variable as categorical.

# In[7]:

formula = 'S ~ C(E) + C(M) + X'
lm = ols(formula, salary_table).fit()
print(lm.summary())

# *Dep. Variable*:
#     the variable to be fitted; here, the salary S
# 
# *Model*:
#     OLS = ordinary least squares
# 
# *Df Residuals*:
#     the number of observations minus the number of fitted parameters
# 
# *Df Model*:
#     the degrees of freedom of the model, i.e. the dimensionality of the subspace spanned by the model (the intercept is not counted)
# 
# *R-squared*:
#     coefficient of determination, (S0-Sm)/S0, where S0 is the sum of squared residuals of the constant-only model and Sm that of the fitted model
# 
# *Adj. R-squared*:
#     the adjusted R2 coefficient, which takes the number of model parameters into account
# 
# *F-statistic*, and corresponding *Prob*:
#     F-test of whether the regression model differs significantly from the minimal model (i.e. a constant offset only)
# 
# *Log-Likelihood*:
#     the maximum log-likelihood value for the model
# 
# *AIC*:
#     Akaike's Information Criterion, for the assessment of the model
# 
# *BIC*:
#     Bayesian Information Criterion, for the assessment of the model
# 
# The values in the lowest box describe properties of the residuals ("Skew", "Kurtosis"), as well as tests on the residuals.
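# Most of these summary statistics are also available directly as attributes of
# the fitted results object; the short sketch below simply reads them off `lm`
# (the attribute names are part of the statsmodels results API, the printed
# values depend on the data):

# In[ ]:

print('R-squared:      %.3f' % lm.rsquared)
print('Adj. R-squared: %.3f' % lm.rsquared_adj)
print('F-statistic:    %.1f (p = %.3g)' % (lm.fvalue, lm.f_pvalue))
print('Log-Likelihood: %.1f' % lm.llf)
print('AIC:            %.1f' % lm.aic)
print('BIC:            %.1f' % lm.bic)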
# ### Inspect the Design Matrix, and demonstrate the Model Predictions
# 
# Look at the design matrix created for us. Every results instance has a reference to the model.

# In[8]:

lm.model.exog[:10]

# Since we initially passed in a DataFrame, we have a transformed DataFrame available.

# In[9]:

print(lm.model.data.orig_exog.head(10))

# There is a reference to the original untouched data in

# In[10]:

print(lm.model.data.frame.head(10))

# If you use the formula interface, statsmodels remembers this transformation. Say you want to know the predicted salary for someone with 12 years of experience and a Master's degree who is in a management position:

# In[12]:

lm.predict(pandas.DataFrame({'X': [12], 'M': [1], 'E': [2]}))

# ### Check the Residuals
# 
# So far we've assumed that the effect of experience is the same for each level of education and management.
# Perhaps this assumption isn't merited. We can formally test this using some interactions.
# 
# We can start by seeing if our model assumptions are met. Let's look at a residuals plot, with the groups separated.

# In[13]:

resid = lm.resid

# In[14]:

fig = plt.figure(figsize=(12, 8))
xticks = []
ax = fig.add_subplot(111, xlabel='Group (E, M)', ylabel='Residuals')

for values, group in factor_groups:
    i, j = values
    xticks.append(str((i, j)))
    group_num = i*2 + j - 1  # for plotting purposes
    x = [group_num] * len(group)
    ax.scatter(x, resid[group.index], marker=symbols[j], color=colors[i-1],
               s=144, edgecolors='black')

ax.set_xticks([1, 2, 3, 4, 5, 6])
ax.set_xticklabels(xticks)
ax.axis('tight');

# Clearly, the linear model does not capture all the structure in the data: the residuals differ systematically between the groups. To improve the model, we check whether interactions between the predictors can explain these group differences.

# ### Add an Interaction
# 
# #### Interaction Education*Experience
# 
# Add an interaction between education and experience, allowing the slope for experience to differ between education levels:
# 
# $$S_i = \beta_0+\beta_1X_i+\beta_2E_{i2}+\beta_3E_{i3}+\beta_4M_i+\beta_5E_{i2}X_i+\beta_6E_{i3}X_i+\epsilon_i$$

# In[15]:

interX_lm = ols('S ~ C(E)*X + C(M)', salary_table).fit()
print(interX_lm.summary())

# The factor Education (E) has the "Treatments" (or levels) "1", "2", and "3". Since "Treatment 1" ([T.1]) is taken as the reference, it is not listed explicitly. This is called the "corner-point" approach.
# Similarly, the factor Management (M) has the "Treatments" "0" and "1". With "0" taken as the reference, only the term C(M)[T.1] is listed in the model.
# The interactions are described by the terms "C(E)[T.i]:X".

# ##### Test the Interaction Education*Experience
# 
# Test that $\beta_5 = \beta_6 = 0$. We can use anova_lm, or we can use an F-test.

# In[16]:

print(anova_lm(lm, interX_lm))

# In[17]:

print(interX_lm.f_test('C(E)[T.2]:X = C(E)[T.3]:X = 0'))

# In[18]:

print(interX_lm.f_test([[0, 0, 0, 0, 0, 1, -1], [0, 0, 0, 0, 0, 0, 1]]))

# All of these tests show that the two models are not significantly different. In other words, there is no interaction effect between Education and Experience in the data.
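# The same comparison of nested models can also be run directly from the
# results objects. A brief cross-check (compare_f_test is part of the
# statsmodels results API; it returns the F statistic, its p-value, and the
# difference in degrees of freedom between the two models):

# In[ ]:

f_value, p_value, df_diff = interX_lm.compare_f_test(lm)
print('F = %.4f, p = %.4f, df_diff = %d' % (f_value, p_value, df_diff))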
# The contrasts are created here under the hood by patsy.
# 
# Recall that F-tests are of the form $R\beta = q$.

# In[19]:

LC = interX_lm.model.data.orig_exog.design_info.linear_constraint('C(E)[T.2]:X = C(E)[T.3]:X = 0')
print(LC.coefs)
print(LC.constants)

# #### Interact education with management

# In[20]:

interM_lm = ols('S ~ X + C(E)*C(M)', salary_table).fit()
print(interM_lm.summary())

# In[21]:

print(anova_lm(lm, interM_lm))

# The interaction effect Management*Education is highly significant!

# ##### Check the residuals of the extended model

# In[22]:

infl = interM_lm.get_influence()
resid = infl.resid_studentized_internal

# In[23]:

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='X', ylabel='standardized resids')

for values, group in factor_groups:
    i, j = values
    idx = group.index
    ax.scatter(X[idx], resid[idx], marker=symbols[j], color=colors[i-1],
               s=144, edgecolors='black')

ax.axis('tight');

# There looks to be an outlier.

# #### Find and Remove the Outlier

# In[24]:

outl = interM_lm.outlier_test('fdr_bh')
outl = outl.sort_values('unadj_p')
print(outl)

# In[25]:

idx = salary_table.index.drop(32)

# In[26]:

print(idx)

# ##### Rerun the original linear model without the outlier

# In[27]:

lm32 = ols('S ~ C(E) + X + C(M)', data=salary_table, subset=idx).fit()
print(lm32.summary())

# #### Interaction Education*Experience

# In[28]:

interX_lm32 = ols('S ~ C(E) * X + C(M)', data=salary_table, subset=idx).fit()
print(interX_lm32.summary())

# In[29]:

table3 = anova_lm(lm32, interX_lm32)
print(table3)

# Again, this interaction is not significant and can be left out of the model.

# ### Final Result
# 
# Result of the interaction tests: the model with the significant interaction Education*Management, fitted without the outlier #32.

# In[30]:

interM_lm32 = ols('S ~ X + C(E) * C(M)', data=salary_table, subset=idx).fit()
print(anova_lm(lm32, interM_lm32))

# Re-plotting the residuals

# In[31]:

resid = interM_lm32.get_influence().summary_frame()['standard_resid']

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='X[~[32]]', ylabel='standardized resids')

for values, group in factor_groups:
    i, j = values
    idx = group.index
    ax.scatter(X[idx], resid[idx], marker=symbols[j], color=colors[i-1],
               s=144, edgecolors='black')

ax.axis('tight');

# A final plot of the fitted values

# In[32]:

lm_final = ols('S ~ X + C(E)*C(M)', data=salary_table.drop([32])).fit()
mf = lm_final.model.data.orig_exog
lstyle = ['-', '--']

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='Experience', ylabel='Salary')

for values, group in factor_groups:
    i, j = values
    idx = group.index
    ax.scatter(X[idx], S[idx], marker=symbols[j], color=colors[i-1],
               s=144, edgecolors='black')
    # drop NA because there is no idx 32 in the final model
    ax.plot(mf.X[idx].dropna(), lm_final.fittedvalues[idx].dropna(),
            ls=lstyle[j], color=colors[i-1])

ax.axis('tight');
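# As with the first model, predictions from the final model can be generated
# directly from a DataFrame of predictor values, since the patsy formula
# transformation is remembered. A small sketch (the predictor values below are
# arbitrary examples):

# In[ ]:

# predicted salary for 12 years of experience, a Master's degree, management
print(lm_final.predict(pandas.DataFrame({'X': [12], 'M': [1], 'E': [2]})))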
# ##### Interaction Plot Salary | Experience
# 
# From our first look at the data, the difference between Master's and PhD in the management group is different from that in the non-management group. This is an interaction between the two qualitative variables management (M) and education (E). We can visualize this by first removing the effect of experience, and then plotting the means within each of the 6 groups using interaction_plot.

# In[33]:

U = S - X * interX_lm32.params['X']
U.name = 'Salary|X'

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
ax = interaction_plot(E, M, U, colors=['red', 'blue'], markers=['^', 'D'],
                      markersize=10, ax=ax)

# ## Example 2: Minority Employment Data - ABLine plotting
# 
#     TEST  - Job Aptitude Test Score
#     ETHN  - 1 if minority, 0 otherwise
#     JPERF - Job performance evaluation

# In[34]:

from urllib.request import urlopen

url_base = 'https://raw.githubusercontent.com/thomas-haslwanter/statsintro_python/master/ipynb/Data/data_others/'
inFile = 'minority.table'
url = url_base + inFile

minority_table = pandas.read_table(urlopen(url))
#minority_table = pandas.read_table(r'.\Data\data_others\minority.table')
minority_table.to_csv('minority.table', sep="\t", index=False)

# In[35]:

factor_group = minority_table.groupby('ETHN')

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')

colors = ['purple', 'green']
markers = ['o', 'v']
for factor, group in factor_group:
    ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
               marker=markers[factor], s=12**2)

ax.legend(['ETHN == 0', 'ETHN == 1'], scatterpoints=1)

# In[36]:

min_lm = ols('JPERF ~ TEST', data=minority_table).fit()
print(min_lm.summary())

# In[37]:

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')

for factor, group in factor_group:
    ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
               marker=markers[factor], s=12**2)

ax.legend(['ETHN == 0', 'ETHN == 1'], scatterpoints=1, loc='upper left')
fig = abline_plot(model_results=min_lm, ax=ax)

# In[38]:

min_lm2 = ols('JPERF ~ TEST + TEST:ETHN', data=minority_table).fit()
print(min_lm2.summary())

# In[39]:

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')

for factor, group in factor_group:
    ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
               marker=markers[factor], s=12**2)

fig = abline_plot(intercept=min_lm2.params['Intercept'],
                  slope=min_lm2.params['TEST'], ax=ax, color='purple')
ax = fig.axes[0]
fig = abline_plot(intercept=min_lm2.params['Intercept'],
                  slope=min_lm2.params['TEST'] + min_lm2.params['TEST:ETHN'],
                  ax=ax, color='green')
ax.legend(['ETHN == 0', 'ETHN == 1'], scatterpoints=1, loc='upper left');

# In[40]:

min_lm3 = ols('JPERF ~ TEST + ETHN', data=minority_table).fit()
print(min_lm3.summary())

# In[41]:

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')

for factor, group in factor_group:
    ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
               marker=markers[factor], s=12**2)

fig = abline_plot(intercept=min_lm3.params['Intercept'],
                  slope=min_lm3.params['TEST'], ax=ax, color='purple')
ax = fig.axes[0]
fig = abline_plot(intercept=min_lm3.params['Intercept'] + min_lm3.params['ETHN'],
                  slope=min_lm3.params['TEST'], ax=ax, color='green')
ax.legend(['ETHN == 0', 'ETHN == 1'], scatterpoints=1, loc='upper left');

# In[42]:

min_lm4 = ols('JPERF ~ TEST * ETHN', data=minority_table).fit()
print(min_lm4.summary())

# In[43]:

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, ylabel='JPERF', xlabel='TEST')

for factor, group in factor_group:
    ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
               marker=markers[factor], s=12**2)

fig = abline_plot(intercept=min_lm4.params['Intercept'],
                  slope=min_lm4.params['TEST'], ax=ax, color='purple')
ax = fig.axes[0]
fig = abline_plot(intercept=min_lm4.params['Intercept'] + min_lm4.params['ETHN'],
                  slope=min_lm4.params['TEST'] + min_lm4.params['TEST:ETHN'],
                  ax=ax, color='green')
ax.legend(['ETHN == 0', 'ETHN == 1'], scatterpoints=1, loc='upper left');

# Is there any effect of ETHN on slope or intercept?
# 
#     Y ~ TEST vs. Y ~ TEST + ETHN + ETHN:TEST

# In[44]:

table5 = anova_lm(min_lm, min_lm4)
print(table5)

# Is there any effect of ETHN on intercept?
# 
#     Y ~ TEST vs. Y ~ TEST + ETHN

# In[45]:

table6 = anova_lm(min_lm, min_lm3)
print(table6)

# Is there any effect of ETHN on slope?
# 
#     Y ~ TEST vs. Y ~ TEST + ETHN:TEST

# In[46]:

table7 = anova_lm(min_lm, min_lm2)
print(table7)

# Is it just the slope, or both?
# 
#     Y ~ TEST + ETHN:TEST vs. Y ~ TEST + ETHN + ETHN:TEST

# In[47]:

table8 = anova_lm(min_lm2, min_lm4)
print(table8)

# ## Two Way ANOVA - Kidney failure data
# 
#     Weight   - (1,2,3) - level of weight gain between treatments
#     Duration - (1,2)   - level of duration of treatment
#     Days     - time of stay in hospital

# In[48]:

try:
    kidney_table = pandas.read_table('kidney.table')
except:
    url = 'http://stats191.stanford.edu/data/kidney.table'
    kidney_table = pandas.read_table(url, delimiter=" *")
    kidney_table.to_csv("kidney.table", sep="\t", index=False)

# In[49]:

# Explore the dataset: it's a balanced design
print(kidney_table.groupby(['Weight', 'Duration']).size())

# In[50]:

kt = kidney_table
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111)
fig = interaction_plot(kt['Weight'], kt['Duration'], np.log(kt['Days']+1),
                       colors=['red', 'blue'], markers=['D', '^'], ms=10, ax=ax)

# $$Y_{ijk} = \mu + \alpha_i + \beta_j + \left(\alpha\beta\right)_{ij}+\epsilon_{ijk}$$
# 
# with
# 
# $$\epsilon_{ijk}\sim N\left(0,\sigma^2\right)$$

# In[51]:

help(anova_lm)

# Things available in the calling namespace are available in the formula evaluation namespace.

# In[52]:

kidney_lm = ols('np.log(Days+1) ~ C(Duration) * C(Weight)', data=kt).fit()

# ANOVA Type-I Sum of Squares
# 

# SS(A) for factor A.
# SS(B|A) for factor B.
# SS(AB|B, A) for interaction AB.
# In[53]:

print(anova_lm(kidney_lm))
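# The sequential (Type-I) sums of squares above can also be reproduced "by hand"
# as differences in residual sums of squares between successively larger models.
# A small sketch (the model names m_d and m_dw are new to this notebook; the
# printed values should match the sum_sq column of the Type-I table):

# In[ ]:

m_d  = ols('np.log(Days+1) ~ C(Duration)', data=kt).fit()
m_dw = ols('np.log(Days+1) ~ C(Duration) + C(Weight)', data=kt).fit()
print('SS(Duration):          %.4f' % m_d.ess)                           # SS(A)
print('SS(Weight | Duration): %.4f' % (m_d.ssr - m_dw.ssr))              # SS(B|A)
print('SS(Interaction | Duration, Weight): %.4f' % (m_dw.ssr - kidney_lm.ssr))  # SS(AB|A,B)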

# ANOVA Type-II Sum of Squares
# 
# SS(A|B) for factor A.
# SS(B|A) for factor B.
# In[54]:

print(anova_lm(kidney_lm, typ=2))
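# For a main effect, the Type-II sum of squares adjusts for the other main
# effect but not for the interaction. A small check (the model names m_w and
# m_wd are new to this notebook; the printed value should match the C(Duration)
# row of the Type-II table above):

# In[ ]:

m_w  = ols('np.log(Days+1) ~ C(Weight)', data=kt).fit()
m_wd = ols('np.log(Days+1) ~ C(Weight) + C(Duration)', data=kt).fit()
print('SS(Duration | Weight): %.4f' % (m_w.ssr - m_wd.ssr))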

# ANOVA Type-III Sum of Squares
# 
# SS(A|B, AB) for factor A.
# SS(B|A, AB) for factor B.
# Note: for Type-III sums of squares, the factors are re-coded with Sum / Poly contrasts instead of the default Treatment coding.

# In[55]:

print(anova_lm(ols('np.log(Days+1) ~ C(Duration, Sum) * C(Weight, Poly)',
                   data=kt).fit(), typ=3))

# #### Exercise: Find the 'best' model for the kidney failure dataset
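# One possible starting point for the exercise, as a sketch rather than a
# definitive answer (the model name kidney_add is new to this notebook):
# compare the additive model against the interaction model, and look at the
# information criteria.

# In[ ]:

kidney_add = ols('np.log(Days+1) ~ C(Duration) + C(Weight)', data=kt).fit()
print(anova_lm(kidney_add, kidney_lm))
print('AIC additive:    %.2f' % kidney_add.aic)
print('AIC interaction: %.2f' % kidney_lm.aic)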