#!/usr/bin/env python
# coding: utf-8

# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fonnesbeck/Bios8366/blob/master/notebooks/Section3_2-Bootstrapping.ipynb)

# # Bootstrapping
#
# Parametric inference can be non-robust:
#
# - inaccurate if parametric assumptions are violated
# - if we rely on asymptotic results, we may not achieve an acceptable level of accuracy
#
# Parametric inference can also be difficult:
#
# - derivation of the sampling distribution may not be possible
#
# An alternative is to estimate the sampling distribution of a statistic **empirically**, without making assumptions about the form of the population.
#
# The bootstrap is a simulation tool for assessing accuracy. It is related to cross-validation, which we will encounter later in the course when we talk about tuning machine learning models.
#
# This approach is most commonly used as a non-parametric method for calculating standard errors and confidence intervals.

# In[ ]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_context('notebook')
np.random.seed(42)

DATA_URL = 'https://raw.githubusercontent.com/fonnesbeck/Bios8366/master/data/'

# ## The Jackknife
#
# A simple precursor to bootstrapping is the jackknife (Quenouille 1949), which facilitates the estimation of the bias and variance of an estimator. Recall that the bias of an estimator $\widehat{\theta}$ of $\theta$ is:
#
# $$Bias(\widehat{\theta}) = E(\widehat{\theta}) - \theta$$
#
# Consider calculating an estimate using $n-1$ values from the dataset, $\widehat{\theta}_{(-i)}$ (that is, with the $i^{th}$ value removed). If this is repeated for each observation $i=1,\ldots,n$, we can average the results to obtain:
#
# $$\bar{\theta}_{(n)} = n^{-1} \sum_i \widehat{\theta}_{(-i)}$$
#
# The **jackknife bias estimate** is:
#
# $$b_{jack} = (n-1)(\bar{\theta}_{(n)} - \widehat{\theta})$$
#
# It can be shown that $b_{jack}$ estimates the bias up to order $O(n^{-2})$.
#
# Thus, a bias-corrected estimator is:
#
# $$\widehat{\theta}_{jack} = \widehat{\theta} - b_{jack}$$

# In[ ]:

def jackknife_bias(data, func, **kwargs):
    """Jackknife estimate of the bias of `func` applied to `data`."""
    theta_hat = func(data, **kwargs)
    n = data.shape[0]
    idx = np.arange(n)
    # Average of the n leave-one-out estimates
    theta_jack = sum(func(data[idx!=i], **kwargs) for i in range(n))/n
    return (n-1) * (theta_jack - theta_hat)

# In[ ]:

x = np.random.normal(0, 2, 100)

# In[ ]:

# Plug-in estimate of the standard deviation
np.std(x)

# In[ ]:

# Jackknife bias-corrected estimate
np.std(x) - jackknife_bias(x, np.std)

# Another way of expressing $\widehat{\theta}_{jack}$ is:
#
# $$\widehat{\theta}_{jack} = n^{-1} \sum_i \widetilde{\theta}_i$$
#
# where the $\widetilde{\theta}_i$ are known as **pseudovalues**, defined as:
#
# $$\widetilde{\theta}_i = n \widehat{\theta} - (n-1) \widehat{\theta}_{(-i)}$$

# In[ ]:

def jackknife(data, func, **kwargs):
    """Bias-corrected jackknife estimate of `func` applied to `data`,
    computed as the mean of the pseudovalues."""
    theta_hat = func(data, **kwargs)
    n = data.shape[0]
    idx = np.arange(n)
    return sum(n*theta_hat - (n-1)*func(data[idx!=i], **kwargs) for i in range(n))/n

# In[ ]:

jackknife(x, np.std)

# Correspondingly, the **jackknife variance estimate** of the estimator $\widehat{\theta}$ is:
#
# $$v_{jack} = \frac{\widetilde{s}^2}{n}$$
#
# where $\widetilde{s}^2$ is the sample variance of the pseudovalues:
#
# $$\widetilde{s}^2 = \frac{\sum_i (\widetilde{\theta}_i - n^{-1}\sum_j \widetilde{\theta}_j)^2}{n-1}$$
#
# Under certain regularity conditions, $v_{jack}$ is a consistent estimator of $V(\widehat{\theta})$. It can, however, be inconsistent for statistics that are not smooth, such as the median.
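# Before moving on, here is a quick sanity check of the bias correction, as a minimal sketch using a statistic whose bias is known exactly: the plug-in variance `np.var` (with the default `ddof=0`) underestimates the population variance by a factor of $(n-1)/n$, so the jackknife-corrected value should match the unbiased estimator `np.var(x, ddof=1)`.

# In[ ]:

# Plug-in (biased) variance, jackknife-corrected variance, and the unbiased estimator.
# For the sample variance the jackknife correction is exact, so the last two
# values should agree (up to floating point).
biased_var = np.var(x)
biased_var, biased_var - jackknife_bias(x, np.var), np.var(x, ddof=1)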
# ### Exercise
#
# Write a function that implements a jackknife variance estimator.

# In[ ]:

# write your answer here

# ## The Bootstrap
#
# The bootstrap is a resampling method introduced by [Brad Efron](http://www.jstor.org/discover/10.2307/2958830?uid=3739568&uid=2&uid=4&uid=3739256&sid=21102342537691) that allows one to approximate the sampling distribution of a statistic, and thereby obtain estimates of its mean and variance.
#
# In general, consider a statistic $T_n = g(X_1, \ldots, X_n)$, which has variance $V_F(T_n)$. Note that this implies the variance is a function of the underlying distribution $F$, which is unknown.
#
# One approach is to estimate $V_F(T_n)$ with $V_{\widehat{F}_n}(T_n)$, a plug-in estimator of the variance. But $V_{\widehat{F}_n}(T_n)$ may be difficult to compute, so we can instead approximate it with a **simulation estimate** $V_{boot}$.
#
# Here is the algorithm:
#
# 1. Draw $X_{r1}^*, \ldots, X_{rn}^* \sim \widehat{F}_n$
# 2. Calculate the statistic $T^*_{rn} = g(X^*_{r1}, \ldots, X^*_{rn})$
# 3. Repeat steps 1 and 2 for $r = 1, \ldots, R$ iterations, then calculate the variance: $V_{boot} = \frac{1}{R} \sum_r \left(T^*_{rn} - \bar{T}^*_{\cdot n} \right)^2$
#
# Rather than finding a way of drawing $n$ points at random from $\widehat{F}_n$, we can instead draw a sample of size $n$ **with replacement from the original data**. So, step 1 can be modified to:
#
# > 1. Draw $X_{r1}^*, \ldots, X_{rn}^*$ with replacement from $X_{1}, \ldots, X_{n}$
#
# These are called **bootstrap samples**:
#
# $$S_1^* = \{X_{11}^*, X_{12}^*, \ldots, X_{1n}^*\}$$
#
# We regard the sample $S$ as an "estimate" of the population $P$, and treat each bootstrap sample analogously as an estimate of the sample:
#
# > population : sample :: sample : bootstrap sample

# To generate samples with replacement in Python, there are a handful of approaches. We can use NumPy to generate random integers (`np.random.randint`), and use these to index DataFrame rows with `iloc`.

# In[ ]:

try:
    prostate_data = pd.read_table('../data/prostate.data.txt', index_col=0)
except FileNotFoundError:
    prostate_data = pd.read_table(DATA_URL + 'prostate.data.txt', index_col=0)

prostate_data.head()

# In[ ]:

random_ind = np.random.randint(0, prostate_data.shape[0], 5)
prostate_data.iloc[random_ind]

# pandas' `sample` method makes this even easier, and allows for custom sampling probabilities when non-uniform sampling is desired.

# In[ ]:

prostate_data.sample(5, replace=True)

# ### Exercise
#
# Generate 20 samples with replacement from the prostate dataset, weighting samples inversely by age.

# In[ ]:

# Write your answer here

# ### Bootstrap Estimates
#
# From our bootstrapped samples, we can extract *estimates* of the expectation and its variance:
#
# $$\bar{T}^* = \hat{E}(T^*) = \frac{\sum_r T_r^*}{R}$$
#
# $$\hat{\text{Var}}(T^*) = \frac{\sum_r (T_r^* - \bar{T}^*)^2}{R-1}$$
#
# Under appropriate regularity conditions, $\frac{s^2}{\hat{\text{Var}}(T^*)} \rightarrow 1$ as $n \rightarrow \infty$.

# ## Bootstrap Confidence Intervals
#
# There are a handful of methods for constructing confidence intervals from bootstrap samples, varying in ease of calculation and accuracy.

# ### Bootstrap Normal Interval
#
# Perhaps the simplest bootstrap interval is the normal interval:
#
# $$T_n \pm z_{\alpha/2} \widehat{SE}_{boot}$$
#
# where $\widehat{SE}_{boot}$ is an estimate of the standard error using the bootstrap sample. Of course, this interval is not accurate unless the distribution of $T_n$ is close to Gaussian.
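# As a minimal sketch, we can compute a normal interval for the mean of `lpsa` in the prostate dataset, using $R = 1000$ bootstrap replicates and 1.96 as an approximation to $z_{0.025}$ for a 95% interval.

# In[ ]:

lpsa = prostate_data.lpsa
T_n = lpsa.mean()

# Bootstrap estimate of the standard error of the mean
R = 1000
boot_means = np.array([lpsa.sample(lpsa.shape[0], replace=True).mean() for _ in range(R)])
se_boot = boot_means.std(ddof=1)

# Approximate 95% normal interval
(T_n - 1.96 * se_boot, T_n + 1.96 * se_boot)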
# ### Bootstrap Pivotal Interval
#
# We can next define a **pivotal interval**. Let $\theta = T(F)$ and $\widehat{\theta}_n = T(\widehat{F}_n)$, and further define the pivot $P_n = \widehat{\theta}_n - \theta$.
#
# Let $H(p)$ denote the CDF of the pivot:
#
# $$
# H(p) = Pr_F(P_n \le p)
# $$
#
# Now define the interval $C_n = (a, b)$ where:
#
# $$\begin{aligned}
# a &= \widehat{\theta}_n - H^{-1}(1-\alpha/2) \\
# b &= \widehat{\theta}_n - H^{-1}(\alpha/2).
# \end{aligned}$$
#
# It can be shown that $C_n$ is an exact $1 - \alpha$ confidence interval for $\theta$. But clearly, $a$ and $b$ depend on the unknown $H$.
#
# A pivot confidence interval uses a bootstrap estimate of $H$ as an approximation:
#
# $$\widehat{H}(p) = \frac{1}{R} \sum_r I\left[(\widehat{\theta}^*_{nr} - \widehat{\theta}_{n}) \le p\right]$$
#
# From this, an approximate $1-\alpha$ confidence interval is $C_n = (\widehat{a}, \widehat{b})$, where:
#
# $$\begin{aligned}
# \widehat{a} &= \widehat{\theta}_n - \widehat{H}^{-1}(1-\alpha/2) = 2\widehat{\theta}_n - \theta^*_{1 - \alpha/2} \\
# \widehat{b} &= \widehat{\theta}_n - \widehat{H}^{-1}(\alpha/2) = 2\widehat{\theta}_n - \theta^*_{\alpha/2}.
# \end{aligned}$$
#
# Let's calculate the pivot interval for the standard deviation of the log-weights in the prostate dataset.

# In[ ]:

weights = prostate_data.lweight.copy()
n = weights.shape[0]
weights.std(ddof=0)

# In[ ]:

# R bootstrap samples of the log-weights, one per row
R = 1000
samples = np.array([weights.sample(n, replace=True) for r in range(R)])
samples.shape

# In[ ]:

theta_hat = weights.std(ddof=0)

# In[ ]:

# Bootstrap replicates of the statistic
estimates = samples.std(axis=1)

# In[ ]:

# (a, b): the lower bound uses the 97.5th percentile, the upper bound the 2.5th
2*theta_hat - np.percentile(estimates, [97.5, 2.5])

# An alternative approach for calculating a pivotal interval is to define:
#
# $$Z_n = \frac{T_n - \theta}{\widehat{SE}_{boot}}$$
#
# which can be approximated with:
#
# $$Z^*_{nr} = \frac{T^*_{nr} - T_n}{\widehat{SE}^*_{boot}}$$
#
# where $\widehat{SE}^*_{boot}$ is an estimate of the standard error of $T^*_{nr}$ computed within the $r^{th}$ bootstrap sample.
#
# Here, the quantiles of $Z^*_{n1}, \ldots, Z^*_{nR}$ should approximate the true quantiles of the distribution of $Z_n$. We can then calculate the following interval:
#
# $$C_n = (T_n - z^*_{1-\alpha/2}\widehat{SE}_{boot}, T_n - z^*_{\alpha/2}\widehat{SE}_{boot})$$
#
# where $z^*_{a}$ is the $a$ sample quantile of $Z^*_{n1}, \ldots, Z^*_{nR}$.
#
# This is a **studentized pivotal interval**.
#
# This interval is more computationally intensive because $\widehat{SE}^*_{boot}$ has to be calculated for each bootstrap sample, but it has better accuracy than the non-studentized interval.

# ### Exercise
#
# Write a function to calculate studentized pivotal intervals for arbitrary functions.

# In[ ]:

# Write your answer here

# ### Bootstrap Percentile Intervals
#
# An even simpler interval involves using the empirical quantiles of the bootstrapped statistics. Consider a monotone transformation $U = m(T)$ such that $U \sim N(m(\theta), c^2)$. Importantly, we don't need to know what $m$ is, only that it exists. If we let $U^*_r = m(T^*_r)$, then $U^*_{(R\alpha/2)} = m(T^*_{(R\alpha/2)})$, because a monotone transformation preserves the quantiles.
#
# Since $U \sim N(m(\theta), c^2)$, the $\alpha/2$ quantile of $U$ is $m(\theta) - z_{\alpha/2} c$. From this, it can be shown that:
#
# $$Pr(T^*_{(R\alpha/2)} \le \theta \le T^*_{(R(1 - \alpha/2))}) = Pr\left( -z_{\alpha/2} \le \frac{U - m(\theta)}{c} \le z_{\alpha/2} \right) = 1 - \alpha$$
#
# This employs the *ordered* bootstrap replicates:
#
# $$T_{(1)}^*, T_{(2)}^*, \ldots, T_{(R)}^*$$
#
# Simply extract the $100(\alpha/2)$ and $100(1-\alpha/2)$ percentiles:
#
# $$T_{[(R+1)\alpha/2]}^* \lt \theta \lt T_{[(R+1)(1-\alpha/2)]}^*$$
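# For comparison with the pivot interval above, here is a minimal sketch of the percentile interval for the standard deviation of the log-weights, re-using the bootstrap replicates in `estimates`.

# In[ ]:

# 95% percentile interval: the empirical 2.5th and 97.5th percentiles
# of the bootstrap replicates, taken directly
np.percentile(estimates, [2.5, 97.5])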
# Extract a subset of variables, and take bootstrap samples.

# In[ ]:

data_subset = prostate_data[['lcavol', 'lweight', 'lbph', 'lcp', 'lpsa']]
bootstrap_data = (data_subset.sample(data_subset.shape[0], replace=True) for _ in range(1000))

# We will use scikit-learn's `LinearRegression` model to fit a regression model to each bootstrap sample, and store the resulting coefficients.

# In[ ]:

from sklearn.linear_model import LinearRegression

regmod = LinearRegression()

# Empty array to store estimates
coefs = np.empty((1000, 4))

for i, X in enumerate(bootstrap_data):
    # Separate the response from the predictors, then fit
    y = X.pop('lpsa')
    regmod.fit(X, y)
    coefs[i] = regmod.coef_

# In[ ]:

coefs.mean(0)

# Sort the coefficients, and extract the percentiles:

# In[ ]:

coefs.sort(axis=0)
# 2.5th and 97.5th percentile bounds for each coefficient (95% percentile intervals)
boot_se = coefs[[25, 975], :].T

# In[ ]:

# Plot means
coef_means = coefs.mean(0)
plt.plot(coef_means, 'ro')

# Plot bootstrap intervals
for i in range(len(coef_means)):
    plt.errorbar(x=[i,i], y=boot_se[i], color='red')

plt.xlim(-0.5, 3.5)
plt.xticks(range(len(coef_means)), ['lcavol', 'lweight', 'lbph', 'lcp'])
plt.axhline(0, color='k', linestyle='--');

# Note, however, that the pivot confidence interval is generally more accurate than the percentile interval.
#
# Also, it's important to remember that the bootstrap is an **asymptotic method**, so the coverage of the confidence interval is expected to be $1 - \alpha + r_n$, where $r_n \propto 1/\sqrt{n}$.

# ## Parametric Bootstrap
#
# The bootstrap approaches we have outlined above are all non-parametric, though parametric inference is also possible. Suppose we have $X_1, \ldots, X_n \sim p(x|\theta)$, where $\widehat{\theta}$ is the MLE; let $\phi = g(\theta)$ and $\widehat{\phi} = g(\widehat{\theta})$.
#
# An approach for estimating the standard error of $\widehat{\phi}$ is to compute the standard deviation of the bootstrapped values $\widehat{\phi}^*_1, \ldots, \widehat{\phi}^*_R$. To do this, we draw each bootstrap sample from the **parametric** distribution $p(x|\widehat{\theta})$:
#
# $$\widehat{X}_1^*, \ldots, \widehat{X}_n^* \sim p(x|\widehat{\theta})$$

# ---
# ## References
#
# Wasserman, L. (2006). All of Nonparametric Statistics. Springer Science & Business Media.