#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')


# ---
# title: Maximum Likelihood Estimation of Custom Models in Python with StatsModels
# tags: Statistics, Python
# ---
#
# Maximum likelihood estimation is a common method for fitting statistical models. In Python, it is quite possible to fit maximum likelihood models using just [`scipy.optimize`](http://docs.scipy.org/doc/scipy/reference/optimize.html). Over time, however, I have come to prefer the convenience provided by [`statsmodels`'](http://statsmodels.sourceforge.net/) [`GenericLikelihoodModel`](http://statsmodels.sourceforge.net/devel/dev/generated/statsmodels.base.model.GenericLikelihoodModel.html). In this post, I will show how easy it is to subclass `GenericLikelihoodModel` and take advantage of much of `statsmodels`' well-developed machinery for maximum likelihood estimation of custom models.
#
# #### Zero-inflated Poisson models
#
# The model we use for this demonstration is a [zero-inflated Poisson model](http://en.wikipedia.org/wiki/Zero-inflated_model#Zero-inflated_Poisson). This is a model for count data that generalizes the Poisson model by allowing for an overabundance of zero observations.
#
# The model has two parameters: $\pi$, the proportion of excess zero observations, and $\lambda$, the mean of the Poisson distribution. We assume that observations from this model are generated as follows. First, a weighted coin with probability $\pi$ of landing on heads is flipped. If the result is heads, the observation is zero. If the result is tails, the observation is generated from a Poisson distribution with mean $\lambda$. Note that there are two ways for an observation to be zero under this model:
#
# 1. the coin is heads, and
# 2. the coin is tails, and the sample from the Poisson distribution is zero.
#
# If $X$ has a zero-inflated Poisson distribution with parameters $\pi$ and $\lambda$, its probability mass function is given by
#
# $$\begin{align*}
# P(X = 0)
#     & = \pi + (1 - \pi)\ e^{-\lambda} \\
# P(X = x)
#     & = (1 - \pi)\ e^{-\lambda}\ \frac{\lambda^x}{x!} \textrm{ for } x > 0.
# \end{align*}$$
#
# In this post, we will use the parameter values $\pi = 0.3$ and $\lambda = 2$. The probability mass function of the zero-inflated Poisson distribution is shown below, next to that of an ordinary Poisson distribution, for comparison.

# In[2]:

from __future__ import division

from matplotlib import pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns
from statsmodels.base.model import GenericLikelihoodModel


# In[3]:

np.random.seed(123456789)


# In[4]:

pi = 0.3
lambda_ = 2.


# In[5]:

def zip_pmf(x, pi=pi, lambda_=lambda_):
    if pi < 0 or pi > 1 or lambda_ <= 0:
        return np.zeros_like(x)
    else:
        return (x == 0) * pi + (1 - pi) * stats.poisson.pmf(x, lambda_)


# In[6]:

fig, ax = plt.subplots(figsize=(8, 6))

xs = np.arange(0, 10);
palette = sns.color_palette()

ax.bar(2.5 * xs, stats.poisson.pmf(xs, lambda_), width=0.9, color=palette[0], label='Poisson');
ax.bar(2.5 * xs + 1, zip_pmf(xs), width=0.9, color=palette[1], label='Zero-inflated Poisson');

ax.set_xticks(2.5 * xs + 1);
ax.set_xticklabels(xs);
ax.set_xlabel('$x$');

ax.set_ylabel('$P(X = x)$');

ax.legend();
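# Before estimating anything, a quick sanity check on `zip_pmf` is cheap (a minimal extra cell, not needed for the analysis below): the zero probability should match the closed-form value $\pi + (1 - \pi)\ e^{-\lambda} = 0.3 + 0.7\ e^{-2} \approx 0.395$, and the probabilities should sum to essentially one over a wide enough range of counts.

# In[ ]:

print(zip_pmf(np.array([0])))             # P(X = 0) from zip_pmf, roughly 0.395
print(pi + (1 - pi) * np.exp(-lambda_))   # the same value from the closed-form expression
print(zip_pmf(np.arange(100)).sum())      # total probability, essentially 1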
# #### Maximum likelihood estimation
#
# First we generate 1,000 observations from the zero-inflated model.

# In[7]:

N = 1000


# In[8]:

inflated_zero = stats.bernoulli.rvs(pi, size=N)
x = (1 - inflated_zero) * stats.poisson.rvs(lambda_, size=N)


# In[9]:

fig, ax = plt.subplots(figsize=(8, 6))

ax.hist(x, width=0.8, bins=np.arange(x.max() + 1), normed=True);

ax.set_xticks(np.arange(x.max() + 1) + 0.4);
ax.set_xticklabels(np.arange(x.max() + 1));
ax.set_xlabel('$x$');

ax.set_ylabel('Proportion of samples');


# We are now ready to estimate $\pi$ and $\lambda$ by maximum likelihood. To do so, we define a class that inherits from `statsmodels`' `GenericLikelihoodModel` as follows.

# In[10]:

class ZeroInflatedPoisson(GenericLikelihoodModel):
    def __init__(self, endog, exog=None, **kwds):
        if exog is None:
            exog = np.zeros_like(endog)

        super(ZeroInflatedPoisson, self).__init__(endog, exog, **kwds)

    def nloglikeobs(self, params):
        pi = params[0]
        lambda_ = params[1]

        return -np.log(zip_pmf(self.endog, pi=pi, lambda_=lambda_))

    def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds):
        if start_params is None:
            lambda_start = self.endog.mean()
            excess_zeros = (self.endog == 0).mean() - stats.poisson.pmf(0, lambda_start)

            start_params = np.array([excess_zeros, lambda_start])

        return super(ZeroInflatedPoisson, self).fit(start_params=start_params,
                                                    maxiter=maxiter, maxfun=maxfun, **kwds)


# The key component of this class is the method `nloglikeobs`, which returns the negative log likelihood of each observed value in `endog`. Secondarily, we must also supply reasonable initial guesses of the parameters in `fit`. Obtaining the maximum likelihood estimate is now simple.

# In[11]:

model = ZeroInflatedPoisson(x)
results = model.fit()


# We see that we have estimated the parameters fairly well.

# In[12]:

pi_mle, lambda_mle = results.params

pi_mle, lambda_mle


# There are many advantages to buying into the `statsmodels` ecosystem and subclassing `GenericLikelihoodModel`. The already-written `statsmodels` code handles storing the observations and the interaction with `scipy.optimize` for us. (It is possible to control the use of `scipy.optimize` through keyword arguments to `fit`.)
#
# We also gain access to many of `statsmodels`' built-in model analysis tools. For example, we can use bootstrap resampling to estimate the variation in our parameter estimates.

# In[13]:

boot_mean, boot_std, boot_samples = results.bootstrap(nrep=500, store=True)

boot_pis = boot_samples[:, 0]
boot_lambdas = boot_samples[:, 1]


# In[14]:

fig, (pi_ax, lambda_ax) = plt.subplots(ncols=2, figsize=(16, 6))

sns.boxplot(boot_pis, ax=pi_ax, names=['$\pi$'], color=palette[0]);
sns.boxplot(boot_lambdas, ax=lambda_ax, names=['$\lambda$'], color=palette[1]);

fig.suptitle('Bootstrap distribution of maximum likelihood estimates');


# The next time you are fitting a model using maximum likelihood, try integrating with `statsmodels` to take advantage of the significant amount of work that has gone into its ecosystem.
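# For instance, the optimizer can be swapped by passing a `method` keyword through `fit`, and the numerically estimated covariance of the estimates gives standard errors and confidence intervals. A minimal sketch is below; the exact set of supported optimizers and results attributes may vary across `statsmodels` versions.

# In[ ]:

# Refit using BFGS instead of the default Nelder-Mead optimizer; the keyword is
# forwarded to GenericLikelihoodModel.fit. (With BFGS, the Nelder-Mead-specific
# maxfun argument set in our fit method may simply be ignored.)
results_bfgs = model.fit(method='bfgs', disp=False)
print(results_bfgs.params)

# Standard errors and normal-approximation confidence intervals derived from the
# numerically estimated covariance of the maximum likelihood estimates.
print(results.bse)
print(results.conf_int())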