#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import scipy as sp
import scipy.stats
import matplotlib.pyplot as plt
import math as mt
import scipy.special
import seaborn as sns

plt.style.use("fivethirtyeight")
from statsmodels.graphics.tsaplots import plot_acf
from matplotlib import cm
import pandas as pd


# Although we have demonstrated the convenience of PyMC3, which encapsulates the MCMC machinery in a relatively isolated process, we still need to investigate the basic structure of MCMC.
#
# In this chapter, we demonstrate the inner mechanism of MCMC with the help of the previous analytical examples. No PyMC3 will be used in this chapter.

# # Markov Chain Monte Carlo

# **Markov Chain Monte Carlo** (**MCMC**) is a class of algorithms for simulating a distribution that has no closed-form expression.

# To illustrate the mechanism of MCMC, we return to the Gamma-Poisson conjugate example.
#
# Though the posterior has a closed-form expression, we can still simulate it for demonstration purposes.
#
# To use MCMC, Bayes' Theorem is commonly rewritten in proportional form, which does not affect the final result:
# $$
# P(\lambda \mid y) \propto P(y \mid \lambda) P(\lambda)
# $$
# where $\propto$ means "proportional to"; the integral in the denominator can be safely omitted since it is a constant.
#
# Here we recap the hurricane example from the last chapter. The prior elicitation uses
#
# $$
# E(\lambda) = \frac{\alpha}{\beta}\\
# \text{Var}(\lambda) = \frac{\alpha}{\beta^2}
# $$

# In[2]:


x = np.linspace(0, 10, 100)
params = [10, 2]
gamma_pdf = sp.stats.gamma.pdf(x, a=params[0], scale=1 / params[1])

fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(
    x, gamma_pdf, lw=3, label=r"$\alpha = %.1f, \beta = %.1f$" % (params[0], params[1])
)
ax.set_title("Prior")
mean = params[0] / params[1]
mode = (params[0] - 1) / params[1]
ax.axvline(mean, color="tomato", ls="--", label="mean: {}".format(mean))
ax.axvline(mode, color="red", ls="--", label="mode: {}".format(mode))
ax.legend()
plt.show()
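# As a quick aside (an addition to the original text), the elicitation formulas above can be inverted: given a target prior mean and variance, $\beta = E(\lambda)/\text{Var}(\lambda)$ and $\alpha = E(\lambda)\,\beta$. A minimal sketch follows; the target values are illustrative assumptions chosen to reproduce $\alpha=10$, $\beta=2$.

# In[ ]:


# Hedged sketch: invert E(lambda) = alpha/beta and Var(lambda) = alpha/beta**2
# to recover (alpha, beta) from an assumed prior mean and variance.
# The target values 5 and 2.5 are illustrative assumptions.
prior_mean, prior_var = 5.0, 2.5
beta_elicited = prior_mean / prior_var       # beta = E / Var
alpha_elicited = prior_mean * beta_elicited  # alpha = E * beta
print(alpha_elicited, beta_elicited)         # -> 10.0, 2.0, matching params above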
# 1. Because the posterior will also be a Gamma distribution, we start by proposing a value of the parameter
# $$
# \lambda = 8
# $$
# This is an arbitrary value, which is called the **initial value**.

# 2. Calculate the likelihood of observing $k=3$ hurricanes given $\lambda=8$.
# $$
# \mathcal{L}(3 ; 8)=\frac{\lambda^{k} e^{-\lambda}}{k !}=\frac{8^{3} e^{-8}}{3 !}=0.0286
# $$

# In[3]:


def pois_lh(k, lamda):
    lh = lamda**k * np.exp(-lamda) / mt.factorial(k)
    return lh


# In[4]:


lamda_init = 8
k = 3
pois_lh(k=k, lamda=lamda_init)


# 3. Calculate the prior density
# $$
# g(\lambda ; \alpha, \beta)=\frac{\beta^{\alpha} \lambda^{\alpha-1} e^{-\beta \lambda}}{\Gamma(\alpha)}
# $$

# In[5]:


def gamma_prior(alpha, beta, lamda):
    prior = (
        beta**alpha * lamda ** (alpha - 1) * np.exp(-beta * lamda)
    ) / sp.special.gamma(alpha)
    return prior


# In[6]:


lamda_current = lamda_init
alpha = 10
beta = 2
gamma_prior(alpha=alpha, beta=beta, lamda=lamda_current)


# 4. Calculate the (unnormalized) posterior with the first guess $\lambda=8$, which we denote $\lambda_{current}$.

# In[7]:


k = 3
posterior_current = pois_lh(k=k, lamda=lamda_current) * gamma_prior(
    alpha=10, beta=2, lamda=lamda_current
)
posterior_current


# 5. Draw a second value $\lambda_{proposed}$ from a **proposal distribution**, here a normal distribution with $\mu=\lambda_{current}$ and $\sigma = .5$. The $\sigma$ is called the **tuning parameter**; its role will become clearer in the following demonstrations.

# In[8]:


tuning_param = 0.5
lamda_prop = sp.stats.norm(loc=lamda_current, scale=tuning_param).rvs()
lamda_prop


# 6. Calculate the (unnormalized) posterior based on $\lambda_{proposed}$.

# In[9]:


posterior_prop = gamma_prior(alpha, beta, lamda=lamda_prop) * pois_lh(
    k, lamda=lamda_prop
)
posterior_prop


# 7. Now we have two posterior values. To proceed, we need a rule for discarding one of them. Here we introduce the **Metropolis algorithm**. The probability threshold for accepting $\lambda_{proposed}$ is
# $$
# P_{\text {accept }}=\min \left(\frac{P\left(\lambda_{\text {proposed }} \mid \text { data }\right)}{P\left(\lambda_{\text {current }} \mid \text { data }\right)}, 1\right)
# $$

# In[10]:


print(posterior_current)
print(posterior_prop)


# In[11]:


prob_threshold = np.min([posterior_prop / posterior_current, 1])
prob_threshold


# This means the probability of accepting $\lambda_{proposed}$ is $1$. What if the ratio $\frac{\text{posterior proposed}}{\text{posterior current}}$ is the smaller value, say $\text{prob_threshold}=.768$? The algorithm then requires a draw from a uniform distribution: if the draw is smaller than $.768$, accept $\lambda_{proposed}$; if it is larger, stay with $\lambda_{current}$.

# 8. A demonstration of this acceptance rule:

# In[12]:


if sp.stats.uniform.rvs() > 0.768:
    print("stay with current lambda")
else:
    print("accept next lambda")


# 9. If we accept $\lambda_{proposed}$, relabel it as $\lambda_{current}$, then repeat from step $2$ thousands of times.

# ## Combine All Steps

# We now join all the steps in a loop that runs for thousands of iterations (the number of repetitions depends on your time constraints and your computer's capacity).

# In[13]:


def gamma_poisson_mcmc(
    lamda_init=2, k=3, alpha=10, beta=2, tuning_param=1, chain_size=10000
):
    np.random.seed(123)
    lamda_current = lamda_init
    lamda_mcmc = []
    pass_rate = []
    post_ratio_list = []
    for i in range(chain_size):
        lh_current = pois_lh(k=k, lamda=lamda_current)
        prior_current = gamma_prior(alpha=alpha, beta=beta, lamda=lamda_current)
        posterior_current = lh_current * prior_current

        lamda_proposal = sp.stats.norm(loc=lamda_current, scale=tuning_param).rvs()
        prior_next = gamma_prior(alpha=alpha, beta=beta, lamda=lamda_proposal)
        lh_next = pois_lh(k, lamda=lamda_proposal)
        posterior_proposal = lh_next * prior_next

        post_ratio = posterior_proposal / posterior_current
        prob_next = np.min([post_ratio, 1])
        unif_draw = sp.stats.uniform.rvs()
        post_ratio_list.append(post_ratio)

        if unif_draw < prob_next:
            lamda_current = lamda_proposal
            lamda_mcmc.append(lamda_current)
            pass_rate.append("Y")
        else:
            lamda_mcmc.append(lamda_current)
            pass_rate.append("N")
    return lamda_mcmc, pass_rate


# The proposal distribution must be symmetric for this plain Metropolis acceptance rule; with an asymmetric proposal the chain will not converge to the target posterior unless the acceptance ratio is corrected (a sketch of the Metropolis-Hastings correction follows below). Also, the tuning parameter should be set at a value that keeps the acceptance rate around $30\%\sim50\%$.
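# The Metropolis-Hastings generalization multiplies the posterior ratio by the ratio of proposal densities, and it reduces to the plain Metropolis rule when the proposal is symmetric. Below is a minimal sketch (an addition, not part of the original walkthrough), using a log-normal proposal as an example of an asymmetric proposal; the function name and settings are assumptions for illustration.

# In[ ]:


# Hedged sketch of the Metropolis-Hastings correction for an ASYMMETRIC proposal.
# q(y | x) denotes the proposal density of y given the current value x.
def mh_acceptance_prob(lamda_current, lamda_proposal, k=3, alpha=10, beta=2, s=0.5):
    q = sp.stats.lognorm  # log-normal proposal: asymmetric, positive support
    post_current = pois_lh(k, lamda_current) * gamma_prior(alpha, beta, lamda_current)
    post_proposal = pois_lh(k, lamda_proposal) * gamma_prior(alpha, beta, lamda_proposal)
    # proposal densities in both directions
    q_forward = q.pdf(lamda_proposal, s=s, scale=lamda_current)   # q(proposal | current)
    q_backward = q.pdf(lamda_current, s=s, scale=lamda_proposal)  # q(current | proposal)
    # Metropolis-Hastings ratio: posterior ratio times proposal-density ratio
    ratio = (post_proposal * q_backward) / (post_current * q_forward)
    return np.min([ratio, 1])


mh_acceptance_prob(lamda_current=8, lamda_proposal=6.5)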
# In[14]:


lamda_mcmc, pass_rate = gamma_poisson_mcmc(chain_size=10000)


# In[15]:


yes = ["Pass", "Not Pass"]
counts = [pass_rate.count("Y"), pass_rate.count("N")]

x = np.linspace(0, 10, 100)
params_prior = [10, 2]
gamma_pdf_prior = sp.stats.gamma.pdf(x, a=params_prior[0], scale=1 / params_prior[1])


# We assume one year of data recording $3$ hurricanes in total. We also obtain the analytical posterior, so that we can compare the simulation with the analytical distribution.
# \begin{align}
# \alpha_{\text {posterior }}&=\alpha_{0}+\sum_{i=1}^{n} x_{i} = 10+3=13\\
# \beta_{\text {posterior }}&=\beta_{0}+n = 2+1=3
# \end{align}

# Prepare the analytical Gamma posterior.

# In[16]:


params_post = [13, 3]
gamma_pdf_post = sp.stats.gamma.pdf(x, a=params_post[0], scale=1 / params_post[1])


# Because the initial samples might not have converged to the equilibrium distribution, the first $1/10$ of the Markov chain can be safely dropped. This $1/10$ period is termed the **burn-in** period. Also, in order to reduce the _autocorrelation_ issue, we can apply a **pruning** (thinning) step that keeps only every second (or even every fifth) observation.
#
# That is why we use ```lamda_mcmc[1000::2]``` in the code below.

# In[17]:


fig, ax = plt.subplots(figsize=(12, 12), nrows=3, ncols=1)
ax[0].hist(lamda_mcmc[1000::2], bins=100, density=True)
ax[0].set_title(r"Posterior Frequency Distribution of $\lambda$")
ax[0].plot(x, gamma_pdf_prior, label="Prior")
ax[0].plot(x, gamma_pdf_post, label="Posterior")
ax[0].legend()

ax[1].plot(np.arange(len(lamda_mcmc)), lamda_mcmc, lw=1)
ax[1].set_title("Trace")

ax[2].barh(yes, counts, color=["green", "blue"], alpha=0.7)
plt.show()
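# Besides the visual comparison, we can also compare the moments of the burned-in, pruned chain with the analytical $\text{Gamma}(13, 3)$ posterior, whose mean is $13/3$ and variance $13/9$. This numerical check is an addition to the original text.

# In[ ]:


# Added check (a sketch): compare sample moments of the burned-in, pruned chain
# with the analytical Gamma(13, 3) posterior moments.
posterior_sample = np.array(lamda_mcmc[1000::2])
print("sample mean:", posterior_sample.mean(), " analytical mean:", 13 / 3)
print("sample var :", posterior_sample.var(), " analytical var :", 13 / 9)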
# # Diagnostics of MCMC

# The demonstration above was deliberately fine-tuned to avoid potential errors that are common when designing MCMC algorithms. We now demonstrate how these errors arise and what remedies are available.

# ## Invalid Proposal

# Following the Gamma-Poisson example, if we use a proposal distribution with $\mu=1$, a random draw from this proposal distribution might be negative; however, the posterior is a Gamma distribution, which only has support on the positive domain.
#
# The remedy for this type of invalid proposal is straightforward: multiply all negative draws by $-1$, i.e. reflect them back into the positive domain.

# In[18]:


x_gamma = np.linspace(-3, 12, 100)
x_norm = np.linspace(-3, 6, 100)

params_gamma = [10, 2]
gamma_pdf = sp.stats.gamma.pdf(x_gamma, a=params_gamma[0], scale=1 / params_gamma[1])

mu = 1
sigma = 1
normal_pdf = sp.stats.norm.pdf(x_norm, loc=mu, scale=sigma)

fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(
    x_gamma,
    gamma_pdf,
    lw=3,
    label=r"Prior $\alpha = %.1f, \beta = %.1f$" % (params_gamma[0], params_gamma[1]),
    color="#FF6B1A",
)
ax.plot(
    x_norm,
    normal_pdf,
    lw=3,
    label=r"Proposal $\mu=%.1f , \sigma= %.1f$" % (mu, sigma),
    color="#662400",
)
ax.text(4, 0.27, "Gamma Prior", color="#FF6B1A")
ax.text(1.7, 0.37, "Normal Proposal", color="#662400")
ax.text(0.2, -0.04, r"$\lambda_{current}=1$", color="tomato")

x_fill = np.linspace(-3, 0, 30)
y_fill = sp.stats.norm.pdf(x_fill, loc=mu, scale=sigma)
ax.fill_between(x_fill, y_fill, color="#B33F00")

ax.axvline(mu, color="red", ls="--", label=r"$\mu=${}".format(mu), alpha=0.4)
ax.legend()
plt.show()


# Two lines of code solve this issue:
# ```
# if lamda_proposal < 0:
#     lamda_proposal *= -1
# ```

# In[19]:


def gamma_poisson_mcmc_1(
    lamda_init=2, k=3, alpha=10, beta=2, tuning_param=1, chain_size=10000
):
    np.random.seed(123)
    lamda_current = lamda_init
    lamda_mcmc = []
    pass_rate = []
    post_ratio_list = []
    for i in range(chain_size):
        lh_current = pois_lh(k=k, lamda=lamda_current)
        prior_current = gamma_prior(alpha=alpha, beta=beta, lamda=lamda_current)
        posterior_current = lh_current * prior_current

        lamda_proposal = sp.stats.norm(loc=lamda_current, scale=tuning_param).rvs()
        if lamda_proposal < 0:
            lamda_proposal *= -1
        prior_next = gamma_prior(alpha=alpha, beta=beta, lamda=lamda_proposal)
        lh_next = pois_lh(k, lamda=lamda_proposal)
        posterior_proposal = lh_next * prior_next

        post_ratio = posterior_proposal / posterior_current
        prob_next = np.min([post_ratio, 1])
        unif_draw = sp.stats.uniform.rvs()
        post_ratio_list.append(post_ratio)

        if unif_draw < prob_next:
            lamda_current = lamda_proposal
            lamda_mcmc.append(lamda_current)
            pass_rate.append("Y")
        else:
            lamda_mcmc.append(lamda_current)
            pass_rate.append("N")
    return lamda_mcmc, pass_rate


# This time we can set the chain size much larger.

# In[20]:


lamda_mcmc, pass_rate = gamma_poisson_mcmc_1(chain_size=100000, tuning_param=1)


# As you can see, the frequency distribution is also much smoother.

# In[21]:


y_rate = pass_rate.count("Y") / len(pass_rate)
n_rate = pass_rate.count("N") / len(pass_rate)

yes = ["Pass", "Not Pass"]
counts = [pass_rate.count("Y"), pass_rate.count("N")]

fig, ax = plt.subplots(figsize=(12, 12), nrows=3, ncols=1)
ax[0].hist(lamda_mcmc[int(len(lamda_mcmc) / 10) :: 2], bins=100, density=True)
ax[0].set_title(r"Posterior Frequency Distribution of $\lambda$")
ax[0].plot(x, gamma_pdf_prior, label="Prior")
ax[0].plot(x, gamma_pdf_post, label="Posterior")
ax[0].legend()

ax[1].plot(np.arange(len(lamda_mcmc)), lamda_mcmc, lw=1)
ax[1].set_title("Trace")

ax[2].barh(yes, counts, color=["green", "blue"], alpha=0.7)
ax[2].text(
    counts[1] * 0.4,
    "Not Pass",
    r"${}\%$".format(np.round(n_rate * 100, 2)),
    color="tomato",
    size=28,
)
ax[2].text(
    counts[0] * 0.4,
    "Pass",
    r"${}\%$".format(np.round(y_rate * 100, 2)),
    color="tomato",
    size=28,
)
plt.show()


# ## Numerical Overflow

# If the prior and the likelihood are both extremely close to $0$, their product will be even closer to $0$. This can cause numerical underflow, because floating-point numbers have finite precision.
#
# The remedy is to use the log version of Bayes' Theorem, i.e.
# $$
# \ln{P(\lambda \mid y)} \propto \ln{P(y \mid \lambda)}+ \ln{P(\lambda)}
# $$
# The acceptance rule can also be converted into its log version
# $$
# \ln{ \left(\frac{P\left(\lambda_{proposed } \mid y \right)}{P\left(\lambda_{current} \mid y \right)}\right)}
# =\ln{P\left(\lambda_{proposed } \mid y \right)} - \ln{P\left(\lambda_{current } \mid y \right)}
# $$

# In[22]:


def gamma_poisson_mcmc_2(
    lamda_init=2, k=3, alpha=10, beta=2, tuning_param=1, chain_size=10000
):
    np.random.seed(123)
    lamda_current = lamda_init
    lamda_mcmc = []
    pass_rate = []
    post_ratio_list = []
    for i in range(chain_size):
        log_lh_current = np.log(pois_lh(k=k, lamda=lamda_current))
        log_prior_current = np.log(
            gamma_prior(alpha=alpha, beta=beta, lamda=lamda_current)
        )
        log_posterior_current = log_lh_current + log_prior_current

        lamda_proposal = sp.stats.norm(loc=lamda_current, scale=tuning_param).rvs()
        if lamda_proposal < 0:
            lamda_proposal *= -1
        log_prior_next = np.log(
            gamma_prior(alpha=alpha, beta=beta, lamda=lamda_proposal)
        )
        log_lh_next = np.log(pois_lh(k, lamda=lamda_proposal))
        log_posterior_proposal = log_lh_next + log_prior_next

        log_post_ratio = log_posterior_proposal - log_posterior_current
        post_ratio = np.exp(log_post_ratio)
        prob_next = np.min([post_ratio, 1])
        unif_draw = sp.stats.uniform.rvs()
        post_ratio_list.append(post_ratio)

        if unif_draw < prob_next:
            lamda_current = lamda_proposal
            lamda_mcmc.append(lamda_current)
            pass_rate.append("Y")
        else:
            lamda_mcmc.append(lamda_current)
            pass_rate.append("N")
    return lamda_mcmc, pass_rate


# With the log posterior and the log acceptance rule, numerical under- and overflow are unlikely to happen anymore, which means we can run a much longer Markov chain and also use a larger tuning parameter.
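# One further refinement (a sketch, not part of the original function): the acceptance test can be carried out entirely in log space, avoiding the call to `np.exp` on a possibly extreme log ratio. This is equivalent because, for $u \in (0,1)$, $u < \min(r, 1)$ holds exactly when $\ln u < \ln r$.

# In[ ]:


# Sketch: acceptance test done entirely in log space -- equivalent to
# `unif_draw < min(exp(log_post_ratio), 1)` but without exponentiating.
def log_space_accept(log_post_ratio):
    unif_draw = sp.stats.uniform.rvs()
    return np.log(unif_draw) < log_post_ratio


log_space_accept(-0.5), log_space_accept(2.0)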
# In[23]:


lamda_mcmc, pass_rate = gamma_poisson_mcmc_2(chain_size=100000, tuning_param=3)


# In[24]:


y_rate = pass_rate.count("Y") / len(pass_rate)
n_rate = pass_rate.count("N") / len(pass_rate)

yes = ["Pass", "Not Pass"]
counts = [pass_rate.count("Y"), pass_rate.count("N")]

fig, ax = plt.subplots(figsize=(12, 12), nrows=3, ncols=1)
ax[0].hist(lamda_mcmc[int(len(lamda_mcmc) / 10) :: 2], bins=100, density=True)
ax[0].set_title(r"Posterior Frequency Distribution of $\lambda$")
ax[0].plot(x, gamma_pdf_prior, label="Prior")
ax[0].plot(x, gamma_pdf_post, label="Posterior")
ax[0].legend()

ax[1].plot(np.arange(len(lamda_mcmc)), lamda_mcmc, lw=1)
ax[1].set_title("Trace")

ax[2].barh(yes, counts, color=["green", "blue"], alpha=0.7)
ax[2].text(
    counts[1] * 0.4,
    "Not Pass",
    r"${}\%$".format(np.round(n_rate * 100, 2)),
    color="tomato",
    size=28,
)
ax[2].text(
    counts[0] * 0.4,
    "Pass",
    r"${}\%$".format(np.round(y_rate * 100, 2)),
    color="tomato",
    size=28,
)
plt.show()


# The larger tuning parameter yields a lower acceptance ("Pass") rate, within the $30\%\sim50\%$ band, which is exactly what we are looking for.

# ## Pruning

# In[25]:


lamda_mcmc = np.array(lamda_mcmc)
n = 4
fig, ax = plt.subplots(ncols=1, nrows=n, figsize=(12, 10))
for i in range(1, n + 1):
    # keep every i-th draw, then plot the autocorrelation of the thinned chain
    g = plot_acf(
        lamda_mcmc[::i], ax=ax[i - 1], title="", label=r"$thin={}$".format(i), lags=30
    )
fig.suptitle("Markov chain Autocorrelation")
plt.show()
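# As a final diagnostic (a hedged addition, not in the original notebook), the autocorrelation can be summarized into a rough effective sample size: approximately the chain length divided by one plus twice the sum of the positive autocorrelations. The function name and the truncation rule below are assumptions for illustration.

# In[ ]:


from statsmodels.tsa.stattools import acf


# Rough effective-sample-size estimate (a sketch): ESS ~ N / (1 + 2 * sum(rho_k)),
# truncating the sum at the first non-positive autocorrelation.
def effective_sample_size(chain, nlags=200):
    chain = np.asarray(chain)
    rho = acf(chain, nlags=nlags, fft=True)[1:]
    cutoff = np.argmax(rho <= 0) if np.any(rho <= 0) else len(rho)
    return len(chain) / (1 + 2 * rho[:cutoff].sum())


effective_sample_size(lamda_mcmc)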