A Summary of lecture "Exploratory Data Analysis in Python", via datacamp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from empiricaldist import Pmf, Cdf
from scipy.stats import linregress
import statsmodels.formula.api as smf
brfss_original = pd.read_hdf('./dataset/brfss.hdf5', 'brfss')
# Run regression with linregress
subset = brfss_original.dropna(subset=['INCOME2', '_VEGESU1'])
xs = subset['INCOME2']
ys = subset['_VEGESU1']
res = linregress(xs, ys)
print(res)
# Run regression with StatsModels
results = smf.ols('_VEGESU1 ~ INCOME2', data=brfss_original).fit()
print(results.params)
LinregressResult(slope=0.06988048092105248, intercept=1.5287786243362973, rvalue=0.11967005884864361, pvalue=1.3785039162157718e-238, stderr=0.002110976356332355) Intercept 1.528779 INCOME2 0.069880 dtype: float64
gss = pd.read_hdf('./dataset/gss.hdf5', 'gss')
# Group by educ
grouped = gss.groupby('educ')
# Compute mean income in each group
mean_income_by_educ = grouped['realinc'].mean()
# Plot mean income as a scatter plot
plt.plot(mean_income_by_educ, 'o', alpha=0.5)
# Label the axes
plt.xlabel('Education (years)')
plt.ylabel('Income (1986 $)')
Text(0, 0.5, 'Income (1986 $)')
gss['age2'] = gss['age'] ** 2
# Add a new column with educ squared
gss['educ2'] = gss['educ'] ** 2
# Run a regression model with educ, educ2, age and age2
results = smf.ols('realinc ~ educ + educ2 + age + age2', data=gss).fit()
# Print the estimated parameters
print(results.params)
Intercept -23241.884034 educ -528.309369 educ2 159.966740 age 1696.717149 age2 -17.196984 dtype: float64
# Run a regression model with educ, educ2, age, and age2
results = smf.ols('realinc ~ educ + educ2 + age + age2', data=gss).fit()
# Make the DataFrame
df = pd.DataFrame()
df['educ'] = np.linspace(0, 20)
df['age'] = 30
df['educ2'] = df['educ'] ** 2
df['age2'] = df['age'] ** 2
# Generate and plot the predictions
pred = results.predict(df)
print(pred.head())
0 12182.344976 1 11993.358518 2 11857.672098 3 11775.285717 4 11746.199374 dtype: float64
# Plot mean income in each age group
grouped = gss.groupby('educ')
mean_income_by_educ = grouped['realinc'].mean()
plt.plot(mean_income_by_educ, 'o', alpha=0.5)
# Plot the predictions
pred = results.predict(df)
plt.plot(df['educ'], pred, label='Age 30')
# Label axes
plt.xlabel('Education (years)')
plt.ylabel('Income (1986 $)')
plt.legend()
<matplotlib.legend.Legend at 0x203b3405f08>
# Recode grass
gss['grass'].replace(2, 0, inplace=True)
# Run logistic regression
results = smf.logit('grass ~ age + age2 + educ + educ2 + C(sex)', data=gss).fit()
print(results.params)
# Make a DataFrame with a range of ages
df = pd.DataFrame()
df['age'] = np.linspace(18, 89)
df['age2'] = df['age'] ** 2
# Set the education level to 12
df['educ'] = 12
df['educ2'] = df['educ'] ** 2
# Generate predictions for men and women
df['sex'] = 1
pred1 = results.predict(df)
df['sex'] = 2
pred2 = results.predict(df)
grouped = gss.groupby('age')
favor_by_age = grouped['grass'].mean()
plt.plot(favor_by_age, 'o', alpha=0.5)
plt.plot(df['age'], pred1, label='Male')
plt.plot(df['age'], pred2, label='Female')
plt.xlabel('Age')
plt.ylabel('Probability of favoring legalization')
plt.legend()
plt.savefig('../images/brfss-logreg.png')
Optimization terminated successfully. Current function value: 0.588510 Iterations 6 Intercept -1.685223 C(sex)[T.2] -0.384611 age -0.034756 age2 0.000192 educ 0.221860 educ2 -0.004163 dtype: float64