#!/usr/bin/env python # coding: utf-8 # # Logistic Regression # # This is the first notebook for a tutorial on "Practical Bayesian Modeling with PyMC" # **Abstract** # # In this tutorial, we explore Bayesian regression using PyMC -- the primary library for Bayesian sampling in Python -- focusing on survey data and other datasets with categorical outcomes. Starting with logistic regression, we’ll build up to categorical and ordered logistic regression, showcasing how Bayesian approaches provide versatile tools for developing and evaluating complex models. Participants will leave with practical skills for implementing Bayesian regression models in PyMC, along with a deeper appreciation for the power of Bayesian inference in real-world data analysis. Participants should be familiar with Python, the SciPy ecosystem, and basic statistics, but no experience with Bayesian methods is required. # **Description** # # In this hands-on tutorial, we will dive into the world of Bayesian regression using PyMC, with a particular focus on datasets that feature categorical outcomes. PyMC provides versatile tools for iterative development of powerful models. This tutorial guides participants through the fundamentals of Bayesian regression, starting from logistic regression and extending to categorical and ordered logistic regression. It also introduces the PyMC package, its syntax and capabilities. # # Outline: # # * Logistic Regression with PyMC # # - Overview of Bayesian inference # - Modeling binary outcomes with logistic regression # - Introduction to PyMC and its capabilities # - Hands-on example: Happiness data in the General Social Survey # # * Categorical Regression # # - Extending logistic regression to multi-class outcomes # - Differences between Bayesian models and GLM # - Hands-on example: Political alignment and party affiliation (?) # # * Ordered Logistic Regression # - Introduction to ordinal outcomes and their challenges # - Implementing ordered logistic regression in PyMC # - Hands-on example: Political alignment and party affiliation (?) # # * Conclusion and Q&A # - Recap of key concepts # - Resources for further learning # - Open discussion and questions # # This tutorial is aimed at data scientists who are comfortable with Python, the SciPy ecosystem, and basic statistics but are new to Bayesian statistics and/or PyMC. By the end of the session, participants will have hands-on experience with Bayesian regression models and a solid understanding of how to apply these techniques to real-world data analysis. # # Notes: # The material for the tutorial is in this repository: # https://github.com/AllenDowney/SurveyDataPyMC/tree/main/notebooks # # The dataset from the General Social Survey (GSS) can be found here: # https://gssdataexplorer.norc.org/ # # # [The slides for the tutorial are here](COMING SOON). # [Click here to run this notebook on Colab](https://colab.research.google.com/github/AllenDowney/SurveyDataPyMC/blob/main/notebooks/01_logistic_regression.ipynb) # In[1]: get_ipython().run_line_magic('load_ext', 'autoreload') get_ipython().run_line_magic('autoreload', '2') # In[2]: # Get utils.py from os.path import basename, exists def download(url): filename = basename(url) if not exists(filename): from urllib.request import urlretrieve local, _ = urlretrieve(url, filename) print("Downloaded " + local) download("https://github.com/AllenDowney/SurveyDataPyMC/raw/main/notebooks/utils.py") # In[3]: import matplotlib.pyplot as plt import numpy as np import pandas as pd import pymc as pm import arviz as az from utils import value_counts, decorate, load_idata_or_sample # In[4]: # Make the figures smaller to save some screen real estate plt.rcParams["figure.dpi"] = 75 plt.rcParams["figure.figsize"] = [6, 3.5] plt.rcParams["axes.titlelocation"] = "left" # ## Data # # The dataset we'll use is an extract from the General Social Survey. # The following cell downloads it. # In[5]: # This dataset is prepared in GssExtract/notebooks/02_make_extract-2022_4.ipynb # It has been resampled to correct for stratified sampling DATA_PATH = "https://github.com/AllenDowney/GssExtract/raw/main/data/interim/" filename = "gss_extract_2022_4.hdf" download(DATA_PATH + filename) # The file is in HDF format, which we can read using Pandas. # In[6]: gss = pd.read_hdf(filename, "gss") gss.shape # The dataset includes one row for each respondent and one column for each variable. # # To demonstrate logistic regression, we'll use responses to [this question](https://gssdataexplorer.norc.org/variables/441/vshow). # Generally speaking, would you say that most people can be trusted or that you can't be too careful in dealing with people? # # ``` # 1 Most people can be trusted # 2 Can't be too careful # 3 Depends # ``` # Here are the counts of the responses -- there are a large number of NaN values because not every respondent was asked the question. # In[7]: value_counts(gss["trust"]) # Although there are three possible responses, we'll treat this as a binary variable. # So we'll recode the responses so `1` means "most people can be trusted" and `0` means either of the other responses. # In[8]: gss["y"] = gss["trust"].replace({1: 1, 2: 0, 3: 0}) value_counts(gss["y"]) # Here's what the percentage of affirmative responses looks like over time. # In[9]: time_series = gss.groupby("year")["y"].mean() * 100 time_series.plot(style="o") decorate(ylabel="percent", title="Can people be trusted?") # Sadly, levels of trust have been declining in the US for decades. # ## Who is most trusting? # # Who do you think is most trusting, Democrats, Republicans, or independents? # To find out, we'll use another [GSS variable](https://gssdataexplorer.norc.org/variables/141/vshow), which contains responses to this question: # # > Generally speaking, do you usually think of yourself as a Republican, Democrat, Independent, or what? # # With these options: # # ``` # 0 Strong democrat # 1 Not very strong democrat # 2 Independent, close to democrat # 3 Independent (neither, no response) # 4 Independent, close to republican # 5 Not very strong republican # 6 Strong republican # 7 Other party # ``` # To simplify the analysis, we'll consider just three groups, Democrats (strong or not), Independent (leaning either way) and Republican (strong or not). # In[10]: party_map = { 0: 0, 1: 0, 2: 1, 3: 1, 4: 1, 5: 2, 6: 2, 7: np.nan, } gss["partyid3"] = gss["partyid"].replace(party_map) value_counts(gss["partyid3"]) # Well use this dictionary to map from codes to names. # In[11]: party_names = { 0: "Democrat", 1: "Independent", 2: "Republican", } # The following table shows how the percentages in each group have changed over time. # In[12]: table = ( gss.pivot_table(index="year", columns="partyid3", values="y", aggfunc="mean") * 100 ).rename(columns=party_names) table.iloc[:10] # Here's what the columns look like as time series. # In[13]: colors = ["C0", "C4", "C3"] table.plot(color=colors) decorate(ylabel="percent", title="Percent saying people can be trusted") # It looks like Republicans were the most trusting group, historically, but that might be changing. # ## Logistic Regression # # We'll use logistic regression to model changes in these responses over time and differences between groups. # # The fundamental idea of logistic regression is that each observation unit -- like a survey respondent -- has some latent propensity to choose one of two options, and we assume: # # * The latent propensities, `z[i]`, are a linear function of explanatory variables. # # * The probability a respondent chooses a particular option is `expit(z[i])`. # # Where `expit` is the function that maps from log-odds to probability (defined in `scipy.special`, also available from PyMC as `pm.math.sigmoid`). # # As a first example, we'll make a model with `year` as an explanatory variable, so we'll assume # # ``` # z = alpha + beta * year # ``` # # In a minute we'll see how to represent this model in PyMC, but first let's prepare the data. # We'll select the rows with valid data for the response variable and `year`. # In[14]: data = gss.dropna(subset=["y", "year"]) data.shape # And we'll extract the values as NumPy arrays. # In[15]: y = data["y"].to_numpy() year = data["year"].to_numpy() # We'll shift `year` so it's centered at its mean (and we'll see why later). # In[16]: year_shift = data["year"].mean() year = year - year_shift # Now here's the key idea of PyMC (and MCMC in general): if you can describe the data-generating process, the sampler can generate a sample from the posterior distribution of the parameters. # In[17]: with pm.Model() as logistic_model: # Priors for intercept and slope alpha = pm.Normal("alpha", 0, 1) beta = pm.Normal("beta", 0, 1) # Linear predictor (log-odds) z = alpha + beta * year # The inverse logit function is called sigma p = pm.math.sigmoid(z) # Bernoulli likelihood with logit link y_obs = pm.Bernoulli("y_obs", p=p, observed=y) # Objects created in the context manager are registered as elements of the model. # Here's a graphical representation of the model. # In[18]: pm.model_to_graphviz(logistic_model) # ## The posterior distribution # # The function that samples from the posterior distribution is called `sample`. # # ``` # with logistic_model: # idata = pm.sample(draws=500, tune=500) # ``` # # `draw` indicates the number of samples we want from each chain. # `tune` is the number of samples used to tune the chains before we start saving them. # For the workshop, we'll use the following function, which loads the results if they are already saved, or runs the sampler otherwise. # In[19]: filename = "logistic_model_idata.nc" idata = load_idata_or_sample(logistic_model, filename, draws=500, tune=500) # The result is an [ArViz `InferenceData` object](https://python.arviz.org/en/stable/api/inference_data.html), which contains several groups of data stored as [xarray `DataSet` objects](https://docs.xarray.dev/en/stable/generated/xarray.Dataset.html). # In[20]: idata # ArViz provides a variety of functions for processing and visualizing the results. # In this example, the primary thing we're interested in is the posterior distributions of the coefficients `alpha` and `beta`. # In[21]: az.plot_posterior(idata) decorate() # ### Comparing Prior with Posterior # # To see what we learned from the data, we can compare the prior and posterior distributions of the coefficents. # `sample_prior_predictive` runs the model forward to generate samples from the prior distributions (and the prior predictive, which we'll talk about later). # In[22]: with logistic_model: idata_prior = pm.sample_prior_predictive(draws=1000) # The result is another `InferenceData` object. # It will be convenient to put all of the samples in one object. # In[23]: idata.extend(idata_prior) # Now we can compare the prior and posterior distributions. # In[24]: az.plot_dist_comparison(idata, var_names=["alpha"]) decorate() # In[25]: az.plot_dist_comparison(idata, var_names=["beta"]) decorate() # Looking at the posterior distributions of `alpha` and `beta`, we can see that they fall comfortably within the prior distributions of these parameters, which means that the priors didn't have much effect on the results. # # As an experiment, try increasing or decreasing `sigma_prior` and see what effect it has on the posterior distributions. # ## Sampling diagnostics # # After sampling, we want to check that the process worked well — meaning: # # * All chains have effectively explored the posterior distribution, and # # * Successive draws within each chain are only weakly correlated (each draw should provide mostly new information). # # We can check these properties visually using `plot_trace`: # # * On the left, the distribution from each chain should look similar — this is evidence that the chains all converged to the same region of the posterior. # # * On the right, the sequence of draws within each chain should look like uncorrelated noise (sometimes called "hairy caterpillars"). It shouldn't look like a random walk and there shouldn't be flat spots, which would suggest the chain got stuck. # In[26]: az.plot_trace(idata, var_names=["alpha", "beta"]) decorate() # To check these properties numerically, we can look at two key summary statistics: ESS and r-hat. # # * r-hat quantifies the difference between chains. If all chains converged to the same posterior distribution, r-hat should be close to 1. If the chains are exploring different regions, r-hat will be larger than 1, indicating failure to converge. # # * Effective Sample Size (ESS) tells us how much independent information the chains actually contribute. If successive values within a chain are highly correlated, the chain isn’t exploring the posterior efficiently, and ESS will be smaller than the total number of draws. # In[27]: az.summary(idata) # ## Generating predictions # # [Note: I cut some content here that I think would work well in the slides] # # There are two ways to generate predictions: # # * We can use the results from PyMC to compute our own predictions. # # * We can use the PyMC model. # # With this example, we'll demonstrate the first way. # With the next example, we'll demonstrate the second. # # First, we'll extract the samples of the coefficients from `idata` and convert them to NumPy arrays. # In[28]: # Collect posterior draws of alpha and beta samples = az.extract(idata, var_names=["alpha", "beta"], num_samples=100) alphas = samples["alpha"].to_numpy() betas = samples["beta"].to_numpy() betas.shape # Here's the range of years we'll predict. # In[29]: year_range = np.arange(1970, 2031) year_centered = year_range - year_shift # To generate predictions, we pretty much reimplement the model in NumPy. # In[30]: from scipy.special import expit for alpha, beta in zip(alphas, betas): zs = alpha + beta * year_centered probs = expit(zs) * 100 plt.plot(year_range, probs, color="C0", alpha=0.02) time_series.plot(style="o", label="observed") decorate(ylabel="percent", title="Percent saying people can be trusted") # It looks like the model fits the data well, and makes a plausible projection for the future. # ## Centering # # You might remember that we mean-centered the predictor, `years`. # Now here's why: because we centered `years`, the sampled slopes and intercepts are uncorrelated. # In[31]: pm.plot_pair(idata, var_names=["alpha", "beta"]) decorate() # As an experiment, try: # # * Set `year_shift=1970` and run the model again. You might get some warnings from the sampler, and `plot_pair` will show that the alphas and betas are correlated. The not-quite-centered predictor stretches the shape of the joint posterior distribution and makes it harder to sample efficiently. # # * Set `year_shift=0` and run the model again. With the predictor completely uncentered, the joint posterior distribution is so stretched that the sampler diverges frequently -- and basically fails. # # Without centering, the intercept represents the log-odds of the outcome when `year=0`, which is way outside the range of the data. # This forces the intercept and slope to compensate for each other — if the slope increases, the intercept must decrease to hit the same observed points. # # As a general rule, it's a good idea to center continuous predictors in Bayesian regression models. # ## Categorical predictors # # Now let's add political party as a predictor. # In[32]: value_counts(gss["partyid3"]) # To prepare the data, we'll select observations where all the variables in the model are valid. # In[33]: data = gss.dropna(subset=["y", "year", "partyid3"]) data.shape # And again we'll center the years and convert all data to NumPy arrays. # In[34]: y = data["y"].to_numpy() year = data["year"].to_numpy() year_shift = data["year"].mean() year = year - year_shift partyid3 = data["partyid3"].astype(int).to_numpy() # This version of the model add a separate intercept for each group, and demonstrates two additional features: # # * Data: making the input data part of the model so we can modify it to make out-of-sample predictions # # * Deterministic: saving the result of intermediate calculations as part of the `InferenceData` # # # In[35]: with pm.Model() as logistic_model2: # add the predictors to the model as mutable Data year_pt = pm.Data("year", year) party3_pt = pm.Data("party3", partyid3) # Party-specific intercepts (one for each group) alpha = pm.Normal("alpha", 0, 1, shape=3) # Shared slope for year (assuming effect of year is the same across parties) beta = pm.Normal("beta", 0, 1) # Linear predictor (log-odds) z = alpha[party3_pt] + beta * year_pt # Compute and save the probabilities p = pm.Deterministic("p", pm.math.sigmoid(z)) # Bernoulli likelihood y_obs = pm.Bernoulli("y_obs", p=p, observed=y) # Here's the graph representation of the model. # In[36]: pm.model_to_graphviz(logistic_model2) # In[37]: filename = 'logistic_model2_idata.nc' idata2 = load_idata_or_sample(logistic_model2, filename, draws=500, tune=500) # Everything marked as `Deterministic` gets saved in the `idata`. # In[38]: idata2 # So when we plot the posteriors, we'll use `var_names` to show only the coefficients. # In[39]: az.plot_posterior(idata2, var_names=["alpha", "beta"]) decorate() # Here are the trace plots. # In[40]: az.plot_trace(idata2, compact=False, var_names=["alpha", "beta"]) decorate() # And the diagnostic statistics. # In[41]: pm.summary(idata2, var_names=["alpha", "beta"]) # Now to generate predictions, we can use the PyMC model directly -- we don't have to reimplement it by hand. # # We'll use `compute_deterministics` to compute the values of `p`, conditioned on the posterior distributions of `alpha` and `beta`. # In[42]: probabilities = {} with logistic_model2: for key, party_name in party_names.items(): # Assign covariates to compute the posterior distributions of pm.set_data( { "party3": np.tile(key, len(year_centered)), "year": year_centered, } ) # Compute the posterior predictive idata = pm.compute_deterministics(idata2['posterior']) probabilities[party_name] = az.extract(idata, num_samples=100)["p"] # Here's what the results look like. # In[43]: # Plot the original data table.plot(color=colors) # Plot posterior draws for each party affiliation for key, party_name in party_names.items(): ps = probabilities[party_name].to_numpy() * 100 plt.plot(year_range, ps, color=colors[key], alpha=0.02) decorate(ylabel="percent", title="Percent saying people can be trusted") # The posterior distributions of `p` overlap for the Democrat and Independent groups, but the distribution for Republicans is notably different. # # Based on these results, we can say with some confidence that Republicans are more likely to say people can be trusted -- at least under the assumption that the changes over time are well modeled by the logistic regression model. # In[ ]: