import numpy as np
import pandas as pd
import math
from numpy.random import default_rng
rng = default_rng(42)
from scipy.stats import norm
import matplotlib as mpl
from matplotlib import pyplot as plt
import plotly.graph_objects as go
from tqdm.notebook import tqdm
mpl.rcParams['font.size'] = 18
Consider the data from Example 8.6. Find the plug-in estimate of the correlation coefficient. Estimate the standard error using the bootstrap. Find a 95 percent confidence interval using the Normal, pivotal, and percentile methods.
Solution:
Y = np.array([576, 635, 558, 578, 666, 580, 555, 661,
651, 605, 653, 575, 545, 572, 594]) # LSAT
Z = np.array([3.39, 3.30, 2.81, 3.03, 3.44, 3.07, 3.00, 3.43,
3.36, 3.13, 3.12, 2.74, 2.76, 2.88, 3.96]) # GPA
n_1 = Y.size
n_2 = Z.size
def corr(Y, Z):
Y_bar = np.mean(Y)
Z_bar = np.mean(Z)
return np.sum((Y - Y_bar) * (Z - Z_bar)) / np.sqrt(np.sum(np.power((Y - Y_bar), 2)) * np.sum(np.power((Z - Z_bar), 2)))
theta_hat = corr(Y, Z)
print(f"Estimate of correlation coefficient: {theta_hat:.3f}")
B = 10000
corr_boot = np.empty(B)
for i in range(B):
indices = rng.choice(list(range(n_1)), size=n_1, replace=True)
Y_B = Y[indices]
Z_B = Z[indices]
corr_boot[i] = corr(Y_B, Z_B)
se = math.sqrt(np.var(corr_boot))
print(f"Bootstrap estimate of standard error: {se:.3f}")
Estimate of correlation coefficient: 0.546 Bootstrap estimate of standard error: 0.196
alpha = 0.05
z = norm.ppf(1-alpha/2)
print("95% confidence intervals:")
print(f"Normal method: ({corr(Y,Z) - z * se:.3f}, {corr(Y,Z) + z * se:.3f})")
a_hat = 2 * theta_hat - np.quantile(corr_boot, 1 - alpha / 2)
b_hat = 2 * theta_hat - np.quantile(corr_boot, alpha / 2)
print(f"Pivot method: ({a_hat:.3f}, {b_hat:.3f})")
lb = np.quantile(corr_boot, alpha / 2)
ub = np.quantile(corr_boot, 1 - alpha / 2)
print(f"Percentile method: ({lb:.3f}, {ub:.3f})")
95% confidence intervals: Normal method: (0.162, 0.930) Pivot method: (0.153, 0.894) Percentile method: (0.198, 0.939)
(Computer Experiment.) Conduct a simulation to compare the various bootstrap confidence interval methods. Let $n = 50$ and let $T(F) = \int (x - \mu)^3 \, dF(x)/\sigma^3$ be the skewness. Draw $Y_1, \dots, Y_n \sim N(0,1)$ and let $X_i = e^{Y_i}$ $i=1, \dots, n$. Construct the three types of bootstrap 95 percent confidence intervals for $T(F)$ from the data $X_1, \dots, X_n$. Repeat this whole thing many times and estimate the true coverage of the three intervals.
Solution:
Note that $X_i$ has a (standard) log-normal distribution, which has skewness $(e + 2)\sqrt{e-1} \approx 6.185$
n = 50
def skew(x):
mu_bar = np.mean(x)
sigma = np.std(x, ddof=1)
return np.sum((x - mu_bar) ** 3) / (n * sigma ** 3)
Y = rng.normal(loc=0, scale=1, size=n)
X = np.exp(Y)
theta_hat = skew(X)
print(f'Estimated skewness: {theta_hat:.3f}')
Estimated skewness: 3.365
def get_bootstrap_se(X):
B = 10000
skew_boot = np.empty(B)
for i in range(B):
X_boot = rng.choice(X, size=n, replace=True)
skew_boot[i] = skew(X_boot)
return math.sqrt(np.var(skew_boot)), skew_boot
se, skew_boot = get_bootstrap_se(X)
print(f'Bootstrap standard error: {se:.3f}')
alpha = 0.05
z = norm.ppf(1-alpha/2)
print("95% confidence intervals:")
print(f"Normal method: ({theta_hat - z * se:.3f}, {theta_hat + z * se:.3f})")
a_hat = 2 * theta_hat - np.quantile(skew_boot, 1 - alpha / 2)
b_hat = 2 * theta_hat - np.quantile(skew_boot, alpha / 2)
print(f"Pivot method: ({a_hat:.3f}, {b_hat:.3f})")
lb = np.quantile(skew_boot, alpha / 2)
ub = np.quantile(skew_boot, 1 - alpha / 2)
print(f"Percentile method: ({lb:.3f}, {ub:.3f})")
Bootstrap standard error: 0.817 95% confidence intervals: Normal method: (1.764, 4.966) Pivot method: (2.168, 5.542) Percentile method: (1.188, 4.562)
Repeating many times:
m = 100 # num trials
normal_coverage = 0
pivot_coverage = 0
percentile_coverage = 0
alpha = 0.05
z = norm.ppf(1-alpha/2)
theta = (math.e + 2) * math.sqrt(math.e - 1)
for j in range(m):
Y = rng.normal(loc=0, scale=1, size=n)
X = np.exp(Y)
theta_hat = skew(X)
se, skew_boot = get_bootstrap_se(X)
if (theta_hat - z * se <= theta) and (theta <= theta_hat + z * se):
normal_coverage += 1
a_hat = 2 * theta_hat - np.quantile(skew_boot, 1 - alpha / 2)
b_hat = 2 * theta_hat - np.quantile(skew_boot, alpha / 2)
if (a_hat <= theta) and (theta <= b_hat):
pivot_coverage += 1
lb = np.quantile(skew_boot, alpha / 2)
ub = np.quantile(skew_boot, 1 - alpha / 2)
if (lb <= theta) and (theta <= ub):
percentile_coverage += 1
print(f'Normal Coverage: {100 * normal_coverage / m}%')
print(f'Pivot Coverage: {100 * pivot_coverage / m}%')
print(f'Percentile Coverage: {100 * percentile_coverage / m}%')
Normal Coverage: 11.0% Pivot Coverage: 14.0% Percentile Coverage: 1.0%
Let
$$X_1, \dots, X_n \sim t_3$$where $n = 25$. Let $\theta = T(F) = (q_{.75} - q_{.25})/1.34$ where $q_p$ denotes the $p^{\text{th}}$ quantile. Do a simulation to compare the coverage and length of the following confidence intervals for $\theta$: (i) Normal interval with standard error from the bootstrap, (ii) bootstrap percentile interval, and (iii) pivotal bootstrap interval.
Solution:
from scipy.stats import t
theta = (t.ppf(.75, df=3) - t.ppf(0.25, df=3)) / 1.34
print(f"Theta: {theta:.3f}")
n = 25
X = rng.standard_t(df=3, size=n)
def T(x):
q_b = np.quantile(x, 0.75)
q_a = np.quantile(x, 0.25)
return (q_b - q_a) / 1.34
theta_hat = T(X)
print(f"Theta_hat: {theta_hat:.3f}")
print("Getting bootstrap standard error...")
def get_bootstrap(X):
n = X.size
B = 1000
T_boot = np.empty(B)
for i in range(B):
X_boot = rng.choice(X, size=n, replace=True)
T_boot[i] = T(X_boot)
return T_boot
T_boot = get_bootstrap(X)
theta_hat = T(X)
se = math.sqrt(np.var(T_boot))
print(f"Bootstrap standard error: {se:.3f}")
def get_confidence_intervals(theta_hat, T_boot, show=False):
se = math.sqrt(np.var(T_boot))
alpha = 0.05
z = norm.ppf(1 - alpha / 2)
normal_ci = (theta_hat - z * se, theta_hat + z * se)
a_hat = 2 * theta_hat - np.quantile(T_boot, 1 - alpha / 2)
b_hat = 2 * theta_hat - np.quantile(T_boot, alpha / 2)
pivot_ci = (a_hat, b_hat)
lb = np.quantile(T_boot, alpha / 2)
ub = np.quantile(T_boot, 1 - alpha / 2)
percentile_ci = (lb, ub)
if show:
print("95% confidence intervals:")
print(f"Normal method: ({normal_ci[0]:.3f}, {normal_ci[1]:.3f})")
print(f"Pivot method: ({pivot_ci[0]:.3f}, {pivot_ci[1]:.3f})")
print(f"Percentile method: ({percentile_ci[0]:.3f}, {percentile_ci[1]:.3f})")
return normal_ci, pivot_ci, percentile_ci
normal_ci, pivot_ci, percentile_ci = get_confidence_intervals(theta_hat, T_boot, show=True)
Theta: 1.142 Theta_hat: 1.378 Getting bootstrap standard error... Bootstrap standard error: 0.330 95% confidence intervals: Normal method: (0.731, 2.025) Pivot method: (0.822, 2.027) Percentile method: (0.729, 1.935)
Investigating the average length over 100 iterations, as well as the coverage of the true value:
results = {'Normal LB':[], 'Normal UB':[], 'Pivot LB':[], 'Pivot UB':[], 'Percentile LB':[], 'Percentile UB':[]}
for m in range(100):
X = rng.standard_t(df=3, size=n)
theta_hat = T(X)
T_boot = get_bootstrap(X)
normal_ci, pivot_ci, quantile_ci = get_confidence_intervals(theta_hat, T_boot)
results['Normal LB'] += [normal_ci[0]]
results['Normal UB'] += [normal_ci[1]]
results['Pivot LB'] += [pivot_ci[0]]
results['Pivot UB'] += [pivot_ci[1]]
results['Percentile LB'] += [quantile_ci[0]]
results['Percentile UB'] += [quantile_ci[1]]
df = pd.DataFrame.from_records(results)
for method in ['Normal', 'Pivot', 'Percentile']:
df[f'{method} Length'] = df[f'{method} UB'] - df[f'{method} LB']
df[f'{method} Cover'] = (theta <= df[f'{method} UB']) & (df[f'{method} LB'] <= theta)
summary_data = []
for method in ['Normal', 'Pivot', 'Percentile']:
summary_data.append([method, df[f'{method} Length'].mean(), df[f'{method} Cover'].mean()])
summary = pd.DataFrame(summary_data, columns=['Method','Average Length','Coverage'])
summary
Method | Average Length | Coverage | |
---|---|---|---|
0 | Normal | 1.302371 | 0.97 |
1 | Pivot | 1.287781 | 0.84 |
2 | Percentile | 1.287781 | 0.97 |
Let $X_1, \dots, X_n$ be distinct observations (no ties). Show that there are
$${2n - 1 \choose n}$$distinct bootstrap samples.
Hint: Imagine putting $n$ balls into $n$ buckets.
Solution:
It is indeed a straightforward observation that the number of distinct bootstrap samples is the same as the number of ways to put $n$ balls into $n$ buckets, where balls are indistinguishable. This is, in turn, the same as the number of ways to insert $n-1$ bars in a row of $n$ stars, where bars can be placed in between two stars or on the ends (bars can also be right next to eachother). This is, in turn, the number of ways to select $n$ of a group of $2n - 1$ entities to be stars, and the rest to be bars, i.e., ${2n - 1 \choose n}$.
Let $X_1, \dots, X_n$ be distinct observations (no ties). Let $X_1^*, \dots, X_n^*$ denote a bootstrap sample and let $\bar{X}_n^* = n^{-1}\sum_{i=1}^n X_i^*$. Find $E(\bar{X}_n^* \mid X_1, \dots, X_n)$, $V(\bar{X}_n^* \mid X_1, \dots, X_n)$, $E(\bar{X}_n^*)$ and $V(\bar{X}_n^*)$.
Solution:
Given $X_1, \dots, X_n$, for all $j = 1, \dots, n$, we may regard $X_j^*$ as a discrete random variable assuming the value $X_i$ with probability $\frac{1}{n}$ for all $i = 1, \dots n$.
Observe that $E(X_i^* \mid X_1, \dots, X_n) = \frac{1}{n}\sum_{i=1}^n X_i$ for all $i =1,\dots, n$. Thus,
\begin{align*} E(\bar{X}_n^* \mid X_1, \dots X_n) &= \frac{1}{n} \sum_{i=1}^n E(X_i^* \mid X_1, \dots, X_n) \\ &= \frac{1}{n} \sum_{i=1}^n \frac{1}{n}\sum_{j=1}^n X_j = \frac{1}{n} n \bar{X} = \bar{X} \end{align*}then,
\begin{align*} E(\bar{X}_n^*) &= E\left[E(\bar{X}_n^* \mid X_1, \dots X_n) \right] \tag{Iterated Expectation} \\ &= E\left[\bar{X} \right] \\ &= \frac{1}{n} \sum_{i=1}^n E[X_i] \\ &= \mu \end{align*}Now, observe that $V(X_i^* \mid X_1, \dots, X_n) = \frac{1}{n} \sum_{i=1}^n(X_i - \bar{X})^2 = \left(\frac{n-1}{n}\right) S_n^2$. Thus,
\begin{align*} V(\bar{X}_n^* \mid X_1 \dots, X_n) &= \frac{1}{n^2} \sum_{i=1}^n V(X_i^* \mid X_1, \dots, X_n) \\ &= \frac{1}{n^2} n \left(\frac{n-1}{n}\right) S_n^2 \\ &= \left(\frac{n-1}{n^2}\right) S_n^2 \\ \end{align*}So, by Eve's Law,
\begin{align*} V(\bar{X}_n^*) &= E[V(\bar{X}_n^* \mid X_1 \dots, X_n)] + V[E(\bar{X}_n^* \mid X_1, \dots X_n)] \\ &= \left(\frac{n-1}{n^2}\right) E[S_n^2] + V[\bar{X}] \\ &= \left(\frac{n-1}{n^2}\right) \sigma^2 + \frac{\sigma^2}{n} = \left(\frac{2n - 1}{n^2}\right) \sigma^2 \end{align*}(Computer Experiment.) Let $X_1, \dots, X_n \sim N(\mu, 1)$. Let $\theta = e^{\mu}$ and let $\hat{\theta} = e ^ {\bar{X}}$. Create a data set (using $\mu = 5$) consisting of $n=100$ observations.
(a) Use the bootstrap to get the $\text{se}$ and 95 percent confidence interval for $\theta$.
(b) Plot a histogram of the bootstrap replications. This is an estimate of the distribution of $\hat{\theta}$. Compare this to the true sampling distribution of $\hat{\theta}$.
mu = 5
n = 100
theta = math.e ** mu
X = rng.normal(loc=mu, scale=1, size=n)
X_bar = np.mean(X)
math.e ** X_bar
def T(x):
x_bar = np.mean(x)
return math.e ** x_bar
B = 100000
T_boot = np.empty(B)
for i in range(B):
X_boot = rng.choice(X, size=n, replace=True)
T_boot[i] = T(X_boot)
theta_hat = T(X)
se = math.sqrt(np.var(T_boot))
alpha = 0.05
z = norm.ppf(1 - alpha / 2)
print(f"True Value: {theta:.3f}")
print(f"Estimate: {theta_hat:.3f}")
print(f"Normal 95% CI: ({theta_hat - z * se:.3f}, {theta_hat + z * se:.3f})")
True Value: 148.413 Estimate: 167.502 Normal 95% CI: (134.503, 200.500)
Note that
$$\bar{X} = \frac{1}{n} \sum_{i=1}^n X_i \sim N \left( \mu, \frac{1}{n} \right)$$Letting $F_{\bar{X}}(\cdot)$ be the CDF of the normal distribution with mean $\mu$ and variance $\frac{1}{n}$, we have
$$P(\hat{\theta} \le x) = P(e^{\bar{X}} \le x) = P(\bar{X} \le \log{x}) = F_{\bar{X}}(\log{x})$$so the CDF of $\hat{\theta}$ is $F_{\hat{\theta}}(x) = F_{\bar{X}}(\log{x})$.
def theta_cdf(x):
return norm.cdf(math.log(x), loc=5, scale = np.sqrt(1 / n))
bins = np.linspace(50, 200, 150)
theta_cdf_bins = list(map(theta_cdf, bins))
theta_cdf_bins_delta = np.empty(len(bins))
theta_cdf_bins_delta[0] = 0
theta_cdf_bins_delta[1:] = np.diff(theta_cdf_bins)
fig = go.Figure(data=go.Histogram(x=T_boot,
opacity=0.5,
histnorm='probability density',
name='Bootstrap Replications'))
fig.add_trace(go.Scatter(x=bins, y=theta_cdf_bins_delta, name='True Sampling Distribution'))
fig.update_layout(
legend=dict(
yanchor="top",
y=0.99,
xanchor="right",
x=0.99
),
)
fig.show()
Let $X_1, \dots, X_n \sim \text{Uniform}(0,\theta)$. Let $\hat{\theta} = X_{\text{max}} = \max \{X_1, \dots, X_n \}$. Generate a data set of size 50 with $\theta = 1$.
(a) Find the distribution of $\hat{\theta}$. Compare the true distribution of $\hat{\theta}$ to the histograms from the bootstrap.
(b) This is a case where the bootstrap does very poorly. In fact, we can prove that this is the case. Show that $P(\hat{\theta} = \theta) = 0$ and yet $P(\hat{\theta}^* = \hat{\theta}) \approx .632$. Hint: show that $P(\hat{\theta}^* = \hat{\theta}) = 1 - (1 - \frac{1}{n})^n$ then take the limit as $n$ gets large.
Note: my copy of the book writes "Show that $P(\hat{\theta} = \hat{\theta}) = 0$", but I don't believe I can manage this.
Solution:
If $x < 0$, $P(\hat{\theta} \le x) = 0$, and if $x \ge \theta$, $P(\hat{\theta} \le \theta) = 1$. If $0 \le x < \theta$, we have
\begin{align*} P(\hat{\theta} \le x) &= P(\text{max}\{X_1, \dots, X_n\} \le x) \\ &= P(\bigcap_{i=1}^n (X_i \le x)) \\ &= P(X_1 \le x)^n = \left(\frac{x}{\theta} \right)^n \end{align*}So the CDF of $\hat{\theta}$ is
\begin{equation*} F_{\hat{\theta}}(x) = \begin{cases} 0 & x < 0 \\ \left(\frac{x}{\theta} \right)^n & 0 \le x < \theta \\ 1 & \theta \le x \end{cases} \end{equation*}which has PDF $\frac{n}{\theta^n} x^{n-1} \mathbb{1}_{(0,\theta)}(x)$
n = 50
theta = 1
X = rng.uniform(low=0, high=theta, size=n)
theta_hat = max(X)
B = 1000
T_boot = np.empty(B)
for i in range(B):
X_boot = rng.choice(X, size=n, replace=True)
T_boot[i] = max(X_boot)
xx = np.linspace(0, theta, 100)
def theta_hat_density(x):
return (n / (theta ** n)) * (x ** (n - 1))
theta_hat_density = list(map(theta_hat_density, xx))
fig = go.Figure(data=go.Histogram(x=T_boot,
opacity=0.5,
histnorm='probability density',
name='Bootstrap Replications'))
fig.add_trace(go.Scatter(x=xx, y=theta_hat_density, name='True Sampling Distribution'))
fig.show()
(b) The probability of $X_i$ assuming any given value is 0, as it is a continuous random variable. In particular, the probability of $X_i$ equalling $\theta$ is 0, and, thus, the same is true for $\max\{X_1, \dots, X_n\} = \hat{\theta}$.
Meanwhile, we observe
\begin{align*} P(\hat{\theta}^* = \hat{\theta}) &= 1 - P(\hat{\theta}^* \ne \hat{\theta}) \\ &= 1 - P(\bigcap_{i=1}^n X_i^* \ne \max \{X_1, \dots, X_n\}) \\ &= 1 - P(X_1^* \ne X_{\max})^n \tag{$X_{\max} = \max \{X_1, \dots, X_n\}$} \\ &= 1 - (1 - P(X_1^* = X_{\max}))^n \\ &= 1 - (1 - \frac{1}{n})^n \rightarrow 1 - e^{-1} \approx 0.632 \end{align*}meaning it is likely the bootstrap estimate equals the sample estimate, which has no chance of being correct.
Let $T_n = \bar{X}_n^2$, $\mu = E(X_1)$, $\alpha_k = \int |x - \mu|^k \, dF(x)$ and $\hat{\alpha} = n^{-1} \sum_{i=1}^n |X_i - \bar{X}_n|^k$. Show that
$$v_{\text{boot}} = \frac{4 \bar{X}_n^2 \hat{\alpha}_2}{n} + \frac{4 \bar{X}_n \hat{\alpha}_3}{n^2} + \frac{\hat{\alpha}_4}{n^3}$$Solution: