#!/usr/bin/env python
# coding: utf-8

# # Multivariate Thinking
# > A Summary of lecture "Exploratory Data Analysis in Python", via datacamp
#
# - toc: true
# - badges: true
# - comments: true
# - author: Chanseok Kang
# - categories: [Python, Datacamp]
# - image: images/brfss-logreg.png

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from empiricaldist import Pmf, Cdf
from scipy.stats import linregress
import statsmodels.formula.api as smf

# ## Limits of simple regression

brfss_original = pd.read_hdf('./dataset/brfss.hdf5', 'brfss')

# ### Using StatsModels

# Run regression with linregress.
# linregress cannot handle missing data, so drop rows where either
# variable is NaN before fitting.
subset = brfss_original.dropna(subset=['INCOME2', '_VEGESU1'])
xs = subset['INCOME2']
ys = subset['_VEGESU1']
res = linregress(xs, ys)
print(res)

# Run the same regression with StatsModels; its formula API drops
# missing rows itself, so slope/intercept should match linregress.
results = smf.ols('_VEGESU1 ~ INCOME2', data=brfss_original).fit()
print(results.params)

# ## Multiple regression

gss = pd.read_hdf('./dataset/gss.hdf5', 'gss')

# ### Plot income and education

# Group by educ and compute mean income in each group.
grouped = gss.groupby('educ')
mean_income_by_educ = grouped['realinc'].mean()

# Plot mean income as a scatter plot and label the axes.
plt.plot(mean_income_by_educ, 'o', alpha=0.5)
plt.xlabel('Education (years)')
plt.ylabel('Income (1986 $)')

# ### Non-linear model of education

# Add squared terms so the linear model can capture curvature
# in the income-vs-age and income-vs-education relationships.
gss['age2'] = gss['age'] ** 2
gss['educ2'] = gss['educ'] ** 2

# Run a regression model with educ, educ2, age and age2.
results = smf.ols('realinc ~ educ + educ2 + age + age2', data=gss).fit()
print(results.params)

# ## Visualizing regression results

# ### Making predictions

results = smf.ols('realinc ~ educ + educ2 + age + age2', data=gss).fit()

# Build a prediction grid: education sweeps 0-20 years while age is
# held fixed at 30; the squared columns must match the model formula.
df = pd.DataFrame()
df['educ'] = np.linspace(0, 20)
df['age'] = 30
df['educ2'] = df['educ'] ** 2
df['age2'] = df['age'] ** 2

# Generate and preview the predictions.
pred = results.predict(df)
print(pred.head())

# ### Visualizing predictions

# Plot mean income in each education group.
grouped = gss.groupby('educ')
mean_income_by_educ = grouped['realinc'].mean()
plt.plot(mean_income_by_educ, 'o', alpha=0.5)

# Overlay the model's predictions for 30-year-olds.
pred = results.predict(df)
plt.plot(df['educ'], pred, label='Age 30')

plt.xlabel('Education (years)')
plt.ylabel('Income (1986 $)')
plt.legend()

# ## Logistic regression

# ### Predicting a binary variable

# Recode grass so the outcome is binary {0, 1}: map 2 -> 0.
# NOTE: assign the result back instead of using inplace=True on the
# extracted column — in-place replace on a column view is a
# chained-assignment pitfall and is deprecated in modern pandas.
gss['grass'] = gss['grass'].replace(2, 0)

# Run logistic regression; C(sex) treats sex as categorical.
results = smf.logit('grass ~ age + age2 + educ + educ2 + C(sex)', data=gss).fit()
print(results.params)

# Prediction grid over ages 18-89 with education fixed at 12 years.
df = pd.DataFrame()
df['age'] = np.linspace(18, 89)
df['age2'] = df['age'] ** 2
df['educ'] = 12
df['educ2'] = df['educ'] ** 2

# Generate predictions for each sex code (per the plot labels below,
# sex=1 is plotted as 'Male' and sex=2 as 'Female').
df['sex'] = 1
pred1 = results.predict(df)
df['sex'] = 2
pred2 = results.predict(df)

# Observed fraction in favor, by age, with the model curves overlaid.
grouped = gss.groupby('age')
favor_by_age = grouped['grass'].mean()

plt.plot(favor_by_age, 'o', alpha=0.5)
plt.plot(df['age'], pred1, label='Male')
plt.plot(df['age'], pred2, label='Female')

plt.xlabel('Age')
plt.ylabel('Probability of favoring legalization')
plt.legend()
plt.savefig('../images/brfss-logreg.png')