In [1]:
import numpy as np
from scipy.stats import beta as Beta, binom, norm, rv_continuous
from scipy.special import gamma
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
np.random.seed(42)

1¶

Verify 11.7:

Let $X_1, \dots, X_n \sim N(\theta, \sigma^2)$. For simplicity, let us assume that $\sigma$ is known. Let us take as a prior $\theta \sim N(a, b^2)$. Show the posterior for $\theta$ is

$$\theta \mid X^n \sim N(\bar{\theta}, \tau^2)$$

where

$$\bar{\theta} = w\bar{X} + (1-w)a, \qquad w = \frac{\frac{1}{\text{se}^2}}{\frac{1}{\text{se}^2} + \frac{1}{b^2}}, \qquad \frac{1}{\tau^2} = \frac{1}{\text{se}^2} + \frac{1}{b^2},$$

and $\text{se} = \sigma / \sqrt{n}$ is the standard error of the MLE $\bar{X}$.

Solution:

Note that it suffices to show the posterior pdf is proportional to $\exp \left(-\frac{1}{2} \left(\frac{\theta - \bar{\theta}}{\tau}\right) ^ 2 \right)$. Ignoring constant factors will be a common theme in this chapter. Letting $\mathcal{L}(\theta \mid X^n)$ and $f(\theta)$ denote the likelihood and prior, respectively, we have

\begin{align*} f(\theta \mid X^n) &\propto \mathcal{L}(\theta \mid X^n) f(\theta) \\ &\propto \exp \left( -\frac{1}{2\sigma^2} \sum_i (X_i - \theta)^2 \right) \cdot \exp \left( -\frac{1}{2b^2}(\theta - a)^2 \right) \\ &= \exp \left( -\frac{1}{2} \left[ \frac{\sum_i (X_i - \theta)^2}{\sigma^2} + \frac{(\theta - a)^2}{b^2} \right] \right) \end{align*}

Now note that it suffices to show the contents of the square brackets are equal to $\left(\frac{\theta - \bar{\theta}}{\tau}\right) ^ 2 + C$ for some $C$ that doesn't depend on $\theta$. Note that $\sum_i (X_i - \theta)^2 = n(\bar{X} - \theta)^2 + nS^2$, where $S^2 = n^{-1}\sum_i (X_i - \bar{X})^2$ (shown by adding $(-\bar{X} + \bar{X})$ and expanding the square). Also note that $\frac{1}{\text{se}^2} = \frac{w}{\tau^2}$, and $\frac{1}{b^2} = \frac{1-w}{\tau^2}$.

\begin{align*} \frac{\sum_i (X_i - \theta)^2}{\sigma^2} + \frac{(\theta - a)^2}{b^2} &= \frac{n(\theta - \bar{X})^2 + nS^2}{\sigma^2} + \frac{(\theta - a)^2}{b^2} \\ &= \frac{(\theta - \bar{X})^2}{\text{se}^2} + \frac{(\theta - a)^2}{b^2} + \frac{S^2}{\text{se}^2} \\ &= \frac{\theta^2 - 2\bar{X}\theta + \bar{X}^2}{\text{se}^2} + \frac{\theta^2 - 2a\theta + a^2}{b^2} + C' \tag{$C' = \frac{S^2}{\text{se}^2}$} \\ &= \left(\frac{1}{\text{se}^2} + \frac{1}{b^2} \right)\theta^2 - 2\left(\frac{\bar{X}}{\text{se}^2} + \frac{a}{b^2}\right)\theta + \left(\frac{\bar{X}^2}{\text{se}^2} + \frac{a^2}{b^2} \right) + C'\\ &= \frac{1}{\tau^2} \theta^2 - 2 \left(\frac{w}{\tau^2}\bar{X} + \frac{1-w}{\tau^2} a\right)\theta + \left(\frac{w}{\tau^2}\bar{X}^2 + \frac{1-w}{\tau^2} a^2 \right) +C' \\ &= \frac{1}{\tau^2} \left[\theta^2 - 2 \bar{\theta}\theta + \bar{\theta}^2\right] + \frac{1}{\tau^2}\left[w\bar{X}^2 + (1-w)a^2 - \bar{\theta}^2\right] +C' \\ &= \frac{(\theta - \bar{\theta})^2}{\tau^2} + C \tag{$C = \frac{w\bar{X}^2 + (1-w)a^2 - \bar{\theta}^2}{\tau^2} + C'$} \end{align*}
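Though not required by the proof, a quick numerical sanity check (with arbitrary illustrative values of $n$, $\sigma$, $a$, and $b$) compares the closed-form $N(\bar{\theta}, \tau^2)$ density against a brute-force evaluation of likelihood times prior on a grid:

In [ ]:
# Quick numerical sanity check of the N(theta_bar, tau^2) posterior
# (illustrative values; not part of the proof).
n_chk, sigma_chk, a_chk, b_chk = 25, 2.0, 0.0, 3.0
X_chk = norm.rvs(loc=1.5, scale=sigma_chk, size=n_chk)

se2 = sigma_chk ** 2 / n_chk
w = (1 / se2) / (1 / se2 + 1 / b_chk ** 2)
tau2 = 1 / (1 / se2 + 1 / b_chk ** 2)
theta_bar = w * X_chk.mean() + (1 - w) * a_chk

# Brute-force posterior on a grid: exp(log-likelihood + log-prior), then normalize.
grid = np.linspace(theta_bar - 5 * np.sqrt(tau2), theta_bar + 5 * np.sqrt(tau2), 2001)
log_post = (-0.5 * ((X_chk[:, None] - grid) ** 2).sum(axis=0) / sigma_chk ** 2
            - 0.5 * (grid - a_chk) ** 2 / b_chk ** 2)
post = np.exp(log_post - log_post.max())
post /= post.sum() * (grid[1] - grid[0])  # normalize numerically

print("max abs difference from closed form:",
      np.max(np.abs(post - norm.pdf(grid, loc=theta_bar, scale=np.sqrt(tau2)))))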

2¶

Let $X_1, \dots, X_n \sim \text{Normal}(\mu, 1)$.

(a) Simulate a data set (using $\mu = 5$) consisting of $n=100$ observations.

In [2]:
mu = 5
n = 100
X = norm.rvs(loc=mu, scale=1, size=n)

(b) Take $f(\mu) = 1$ and find the posterior density. Plot the density.

The posterior density is given by: \begin{align*} f(\mu \mid X^n) &\propto \mathcal{L}_n(\mu \mid X^n) f(\mu) \\ &= (2 \pi)^{-n/2} \exp \left( (-1/2) \sum_i (X_i - \mu)^2 \right) \\ &= (2 \pi)^{-n/2} \exp \left( (-n/2)[(\bar{X} - \mu)^2 + S^2 ]\right) \\ &\propto \exp \left[-\frac12 \left(\frac{\bar{X} - \mu}{1 / \sqrt{n}} \right)^2 \right] \\ &\propto \sqrt{\frac{n}{2\pi}} \exp \left[-\frac12 \left(\frac{\bar{X} - \mu}{1 / \sqrt{n}} \right)^2 \right] \tag{Normalized} \end{align*}

Thus, $\mu \mid X^n \sim N(\bar{X},1/ n)$.

In [3]:
X_bar = np.mean(X)
sigma = np.sqrt(1/n)
XX = np.arange(4, 6, .01)
plt.plot(XX, norm.pdf(XX, loc=X_bar, scale=sigma))
plt.title("Posterior Density for $\mu$")
plt.grid()
plt.show()

(c) Simulate 1000 draws from the posterior. Plot a histogram of the simulated values and compare the histogram to the answers in (b).

In [4]:
k = 1000
mu_post = norm.rvs(loc=X_bar, scale=sigma, size=k)
plt.plot(XX, norm.pdf(XX, loc=X_bar, scale=np.sqrt(1/n)), label='Posterior Density')
plt.hist(mu_post, bins=30, histtype='step', density=True, label='(Normalized) Histogram of Draws')
plt.grid()
plt.legend(loc='upper right')
plt.show()

(d) Let $\theta = e^\mu$. Find the posterior density for $\theta$ analytically and by simulation.

The posterior cdf is given by: \begin{align*} H(\theta | X^n) &= P(e^\mu \le \theta | X^n) \\ &= \int_{\{\mu : e^\mu \le \theta\}} f(\mu \mid X^n) \, d\mu \\ &= \int_{-\infty}^{\log(\theta)} f(\mu \mid X^n) \, d\mu \\ &= \Phi \left( \frac{\log(\theta) - \bar{X}}{\sqrt{1/n}}\right) \end{align*} Meaning the posterior pdf is:

\begin{align*} H'(\theta | X^n) = \phi \left( \frac{\log(\theta) - \bar{X}}{\sqrt{1/n}}\right)\frac{1}{\theta \sqrt{1/n}} \end{align*}
In [5]:
TT = np.arange(100, 200, .1)
theta_pdf = norm.pdf((np.log(TT) - X_bar) / np.sqrt(1/n)) * (1 / (TT * np.sqrt(1/n)))  # analytic posterior pdf
plt.plot(TT, theta_pdf, label='Posterior Density')
theta_post = np.exp(mu_post)  # posterior draws of theta, transformed from the draws of mu
plt.hist(theta_post, bins=30, histtype='step', density=True, label='(Normalized) Histogram of Draws')
plt.grid()
plt.legend()
plt.show()

(e) Find a 95% posterior interval for $\mu$.

As $\mu \mid X^n \sim N(\bar{X}, 1/n)$, a 95% posterior interval is given by $(\bar{X} - n^{-1/2}z_{0.025},\, \bar{X} + n^{-1/2}z_{0.025})$.

In [6]:
alpha = 0.05
z = norm.ppf(1 - alpha/2)
print(f"{100 *(1-alpha):.0f}% C.I. for mu: ({X_bar - (1/np.sqrt(n)) * z:.3f}, {X_bar + (1/np.sqrt(n)) * z:.3f})")
95% C.I. for mu: (4.700, 5.092)

(f) Find a 95% posterior interval for $\theta$.

We can transform the interval for $\mu$, yielding $(\exp(\bar{X} - n^{-1/2}z_{0.025}),\exp(\bar{X} + n^{-1/2}z_{0.025}))$:

In [7]:
print(f"{100 *(1-alpha):.0f}% C.I. for theta: " \
      f"({np.exp(X_bar - (1/np.sqrt(n)) * z):.3f}," \
      f" {np.exp(X_bar + (1/np.sqrt(n)) * z):.3f})")
95% C.I. for theta: (109.964, 162.739)

or we can get it from our simulation:

In [8]:
print(f"{100 *(1-alpha):.0f}% C.I. for theta: " \
      f"({np.quantile(theta_post, 0.025):.3f}," \
      f" {np.quantile(theta_post, 0.975):.3f})")
95% C.I. for theta: (111.968, 164.394)

3¶

Let $X_1, \dots, X_n \sim \text{Uniform}(0, \theta)$. Let $f(\theta) \propto 1/\theta$. Find the posterior density.

Solution:

Let $X_{\max} = \max\{X_1, \dots, X_n\}$. Recalling $$\mathcal{L}_n(\theta \mid X^n) = \begin{cases} 0 & \theta \le X_{\max} \\ \frac{1}{\theta^n} & X_{\max} < \theta, \end{cases} $$

we have

\begin{align*} f(\theta \mid X^n) &\propto \mathcal{L}_n(\theta \mid X^n) f(\theta) \\ &= \begin{cases} 0 & \theta \le X_{\max}\\ \frac{1}{\theta^{n+1}} & X_{\max} < \theta \end{cases} \\ &= \mathbb{1}_{X_{\max} < \theta} \cdot \theta^{-(n+1)}. \end{align*}

Normalizing (the normalizing constant is $\int_{X_{\max}}^{\infty} \theta^{-(n+1)} \, d\theta = \frac{1}{n X_{\max}^n}$), we get

$$f(\theta \mid X^n) = \mathbb{1}_{X_{\max} < \theta} \cdot n (X_{\max})^n \, \theta^{-(n+1)} $$
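A brief check on simulated data (the values of $\theta$ and $n$ below are illustrative) confirms that this density integrates to one and concentrates just above $X_{\max}$:

In [ ]:
# Sanity check of the Uniform(0, theta) posterior on simulated data (illustrative values).
theta_true, n_u = 3.0, 20
X_u = np.random.uniform(0, theta_true, size=n_u)
x_max = X_u.max()

grid = np.linspace(x_max, 3 * x_max, 5000)
posterior = n_u * x_max ** n_u / grid ** (n_u + 1)  # n * X_max^n * theta^{-(n+1)}

# The tail beyond 3 * X_max contributes (1/3)^n, negligible for n = 20.
print("approximate integral:", posterior.sum() * (grid[1] - grid[0]))
plt.plot(grid, posterior)
plt.axvline(x_max, linestyle='--', color='black', label=r'$X_{\max}$')
plt.title(r"Posterior Density for $\theta$")
plt.legend()
plt.grid()
plt.show()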

4¶

Suppose that 50 people are given a placebo and 50 are given a new treatment. 30 placebo patients show improvement while 40 treated patients show improvement. Let $\tau = p_2 − p_1$ where $p_2$ is the probability of improving under treatment and $p_1$ is the probability of improving under placebo.

(a) Find the MLE of $\tau$. Find the standard error and 90 percent confidence interval using the delta method.

Let $\mathbf{p} = [p_1, p_2]^T$, and let $g(\mathbf{p}) = p_2 - p_1$. Note that the likelihood of $\mathbf{p}$ can be expressed as the product of two independent likelihoods:

\begin{align*} \mathcal{L}_n(p_1,p_2) &= {50 \choose 30}p_1^{30}(1-p_1)^{20} \cdot {50 \choose 40}p_2^{40}(1-p_2)^{10} \\ &= \mathcal{L}_n(p_1) \cdot \mathcal{L}_n(p_2) \end{align*}

Hence $\max \mathcal{L}_n(p_1,p_2) = \max \mathcal{L}_n(p_1) \cdot \max \mathcal{L}_n(p_2)$.

Maximizing the individual likelihoods:

$\ell(p_1) = 30\log(p_1) + 20\log(1-p_1)$

$\ell'(p_1) = \frac{30}{p_1} - \frac{20}{1-p_1} = 0 \Rightarrow 30(1-\hat{p}_1) = 20\hat{p}_1 \Rightarrow \hat{p}_1 = \frac{3}{5}$

Similarly, $\hat{p}_2 = \frac{4}{5}$, and so $\hat{\mathbf{p}} = [\hat{p}_1, \hat{p}_2]^T$. By the invariance property of the MLE, $\hat{\tau} = g(\hat{\mathbf{p}}) = \hat{p}_2 - \hat{p}_1 = \frac{1}{5}$.

We now seek the standard error. We have the gradient of $g$:

$\nabla g = \begin{bmatrix} -1 \\ 1 \end{bmatrix}$

the Fisher Information matrix (the negated expected value of the Hessian of the log likelihood):

$I_n = \begin{bmatrix} \frac{50}{p_1(1-p_1)} & 0 \\ 0 & \frac{50}{p_2(1-p_2)} \end{bmatrix}$

and its inverse:

$J_n = I_n^{-1} = \begin{bmatrix} \frac{p_1(1-p_1)}{50} & 0 \\ 0 & \frac{p_2(1-p_2)}{50} \end{bmatrix}.$

By Theorem 9.28, we have

$\hat{\text{se}}(\hat{\tau}) = \sqrt{(\hat{\nabla}g)^T \hat{J}_n \hat{\nabla}g} = \sqrt{\frac{\hat{p}_1(1-\hat{p}_1)}{50} + \frac{\hat{p}_2(1-\hat{p}_2)}{50}} = \frac{\sqrt{5}}{25} \approx 0.0894$

where $\hat{\nabla}g = \nabla g(\hat{\mathbf{p}})$ ($=\nabla g$, in our case), and $\hat{J}_n = J_n(\hat{\mathbf{p}})$.

Thus, a 90 percent confidence interval is given by:

$(\frac{1}{5} - z_{.05}\frac{\sqrt{5}}{25}, \frac{1}{5} + z_{0.05}\frac{\sqrt{5}}{25}) \approx (0.0529, 0.3471)$
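The same numbers can be computed in code; the gradient and the estimated covariance $\hat{J}_n$ below are hard-coded from the expressions above:

In [ ]:
# Delta-method standard error and 90% confidence interval for tau = p2 - p1.
p_1_hat, p_2_hat = 30 / 50, 40 / 50
tau_hat = p_2_hat - p_1_hat

grad_g = np.array([-1.0, 1.0])
J_hat = np.diag([p_1_hat * (1 - p_1_hat) / 50, p_2_hat * (1 - p_2_hat) / 50])
se_hat = np.sqrt(grad_g @ J_hat @ grad_g)

z = norm.ppf(1 - 0.05)  # 90% interval: 5% in each tail
print(f"MLE: {tau_hat:.3f}, standard error: {se_hat:.4f}")
print(f"90% C.I.: ({tau_hat - z * se_hat:.4f}, {tau_hat + z * se_hat:.4f})")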

(b) Find the standard error and 90 percent confidence interval using the parametric bootstrap.

In [9]:
B = 100000
p_1_hat = 3 / 5
p_2_hat = 4 / 5
tau_hat = p_2_hat - p_1_hat
tau_boot = np.empty(B)
for i in range(B):
    p_1_boot = binom.rvs(n=50, p=p_1_hat) / 50
    p_2_boot = binom.rvs(n=50, p=p_2_hat) / 50
    tau_boot[i] = p_2_boot - p_1_boot
se_boot = np.std(tau_boot)
z = norm.ppf(1-0.05)
print(f"Standard error: {se_boot:.3f}")
print(f"90% C.I.: ({tau_hat - z * se_boot:.3f},{tau_hat + z * se_boot:.3f})")
Standard error: 0.089
90% C.I.: (0.053,0.347)

(c) Use the prior $f(p_1, p_2) = 1$. Use simulation to find the posterior mean and posterior 90 percent interval for $\tau$.

We have the posterior:

\begin{align*} f(\mathbf{p} \mid X) &\propto \mathcal{L}(\mathbf{p})f(\mathbf{p}) \\ &\propto p_1^{30}(1-p_1)^{20}p_2^{40}(1-p_2)^{10} \\ &= p_1^{(30 + 1) - 1} (1-p_1)^{(20 + 1)-1}p_2^{(40 + 1) - 1} (1-p_2)^{(10 + 1)-1}, \end{align*}

which is the kernel of a product of two independent Beta densities.

Thus we can sample $p_1$ from $\text{Beta}(31, 21)$ and $p_2$ from $\text{Beta}(41, 11)$.

In [10]:
k = 100000
p_1_samples = Beta.rvs(a=31, b=21, size=k)
p_2_samples = Beta.rvs(a=41, b=11, size=k)
tau_samples = p_2_samples - p_1_samples
In [11]:
print(f"Simulated posterior mean: {np.mean(tau_samples):.3f}")
print(f"90% C.I.: " \
      f"({np.quantile(tau_samples, 0.05):.3f}," \
      f" {np.quantile(tau_samples, 0.95):.3f})")
Simulated posterior mean: 0.193
90% C.I.: (0.047, 0.336)

(d) Let $$\psi = \log \left( \left(\frac{p_1}{1-p_1}\right) \div \left(\frac{p_2}{1-p_2}\right) \right)$$

be the log-odds ratio. Note that $\psi = 0$ if $p_1 = p_2$. Find the MLE of $\psi$. Use the delta method to find a 90 percent confidence interval for $\psi$.

By the equivariance of the MLE, the MLE of $\psi$ is simply

$$\hat{\psi} = \log \left( \left(\frac{\hat{p}_1}{1-\hat{p}_1}\right) \div \left(\frac{\hat{p}_2}{1-\hat{p}_2}\right) \right) \approx -0.981$$

We now compute the standard error using the delta method: $$\hat{\text{se}}(\hat{\psi}) = \sqrt{(\hat{\nabla}g)^T \hat{J}_n \hat{\nabla}g}$$ where $\hat{J}_n$ is as in part (a) and now $$g(p_1, p_2) = \log \left( \left(\frac{p_1}{1-p_1}\right) \div \left(\frac{p_2}{1-p_2}\right) \right),$$ meaning $$\nabla g = \begin{bmatrix} \frac{1}{p_1(1-p_1)} \\ -\frac{1}{p_2(1-p_2)}\end{bmatrix}.$$

Thus, \begin{align*} \hat{\text{se}}(\hat{\psi}) &= \sqrt{\frac{1}{50\hat{p}_1(1-\hat{p}_1)} + \frac{1}{50\hat{p}_2(1-\hat{p}_2)}} \\ &= \frac{\sqrt{30}}{12} \approx 0.456 \end{align*}

In [12]:
psi_hat = np.log((p_1_hat / (1 - p_1_hat)) / (p_2_hat / (1 - p_2_hat)))
z = norm.ppf(1-0.05)
print(f"90% C.I.: ({psi_hat - z * np.sqrt(30)/12:.3f}, {psi_hat + z * np.sqrt(30)/12:.3f})")
90% C.I.: (-1.732, -0.230)

(e) Use simulation to find the posterior mean and posterior 90 percent interval for $\psi$.

In [13]:
psi_samples = np.log((p_1_samples / (1 - p_1_samples)) / (p_2_samples / (1 - p_2_samples)))
print(f"Posterior mean: {np.mean(psi_samples):.3f}")
print(f"90% C.I.: ({np.quantile(psi_samples, 0.05):.3f}, "
      f"{np.quantile(psi_samples, 0.95):.3f})")
Posterior mean: -0.954
90% C.I.: (-1.705, -0.226)

5¶

Consider the $\text{Bernoulli}(p)$ observations $$ 0\,1\,0\,1\,0\,0\,0\,0\,0\,0 $$ Plot the posterior for $p$ using these priors: $\text{Beta}(1/2,1/2)$, $\text{Beta}(1,1)$, $\text{Beta}(10,10)$, $\text{Beta}(100,100)$.

Solution:

Recall a random variable has a $\text{Beta}$ distribution with parameters $\alpha$ and $\beta$ if its density is $$ f(p; \alpha, \beta) = \frac{\Gamma(\alpha + \beta)}{\Gamma(\alpha)\Gamma(\beta)} p^{\alpha - 1}(1-p)^{\beta - 1}$$

Since

$\mathcal{L}_n(p \mid X) = \prod_i p^{X_i} (1-p)^{1-X_i} = p^2(1-p)^8$,

\begin{align*} f(p \mid X) &\propto \mathcal{L}_n(p \mid X) f(p) \\ &\propto p^{\alpha + 1}(1-p)^{\beta + 7} \\ &= p^{(\alpha + 2) - 1}(1-p)^{(\beta + 8)-1} \\ &\propto \frac{\Gamma(\alpha + \beta + 10)}{\Gamma(\alpha + 2)\Gamma(\beta + 8)} p^{(\alpha + 2) - 1}(1-p)^{(\beta + 8)-1}. \\ \end{align*}

That is, $p \mid X \sim \text{Beta}(\alpha + 2, \beta + 8)$.

In [14]:
p_hat = 2 / 10
alphas = [.5, 1, 10, 100]
betas = [.5, 1, 10, 100]
X = np.arange(0, 1, 0.01)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
for alpha, beta in zip(alphas, betas):
    ax1.plot(X, Beta.pdf(X, alpha, beta))
    ax2.plot(X, Beta.pdf(X, alpha + 2, beta + 8), label=f"a={alpha}, b={beta}")
ax1.set_title("Prior Distributions")
ax2.set_title("Posterior Distributions")
ax2.vlines(p_hat, 0, 12, linestyle='--', color='black', label='MLE')
ax1.grid(True)
ax2.grid(True)
ax2.legend()
plt.show()

6¶

Let $X_1, \dots, X_n \sim \text{Poisson}(\lambda)$.

(a) Let $\lambda \sim \text{Gamma}(\alpha, \beta)$ be the prior. Show that the posterior is also a Gamma. Find the posterior mean.

$\mathcal{L}_n(\lambda) = \prod_i e^{-\lambda}\frac{\lambda^{X_i}}{X_i!} = \frac{1}{\prod_i X_i!} e^{-n\lambda}\lambda^{(\sum_i X_i)}$

\begin{align*} f(\lambda \mid X) &\propto \mathcal{L}_n(\lambda)f(\lambda) \\ &=\frac{1}{\prod_i X_i!} e^{-n\lambda}\lambda^{(\sum_i X_i)} \frac{1}{\beta^{\alpha} \Gamma(\alpha)} \lambda^{\alpha - 1} e^{-\lambda / \beta} \\ &= \frac{1}{\beta^{\alpha} \Gamma(\alpha) \prod_i X_i!} \lambda^{(\alpha + \sum_i X_i) - 1} e^{-\lambda (n + \frac{1}{\beta})} \\ &\propto \lambda^{\alpha' - 1} e^{-\lambda/\beta'} \end{align*}

where $\alpha' = \alpha + \sum_i X_i$ and $\beta' = \frac{\beta}{n\beta + 1}$. Thus, $\lambda \mid X \sim \text{Gamma}(\alpha + \sum_i X_i, \frac{\beta}{n\beta + 1})$. The posterior mean is thus the mean of this distribution:

$$E(\lambda \mid X) = \frac{\beta(\alpha + \sum_i X_i)}{n\beta + 1}$$
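As a quick numerical check of the conjugate update (the values of $\alpha$, $\beta$, $\lambda$, and $n$ below are illustrative), we can compare the closed-form $\text{Gamma}(\alpha', \beta')$ posterior with a brute-force grid computation:

In [ ]:
from scipy.stats import poisson, gamma as Gamma

# Illustrative check that Gamma(alpha', beta') matches a brute-force grid posterior.
alpha_0, beta_0, lam_true, n_p = 2.0, 1.5, 4.0, 30
X_p = poisson.rvs(mu=lam_true, size=n_p)

alpha_post = alpha_0 + X_p.sum()
beta_post = beta_0 / (n_p * beta_0 + 1)
print(f"Posterior mean (closed form): {alpha_post * beta_post:.3f}")

# Grid posterior: exp(log-likelihood + log-prior), normalized numerically.
grid = np.linspace(0.01, 10, 2000)
log_post = (-n_p * grid + X_p.sum() * np.log(grid)           # Poisson log-likelihood (up to const.)
            + (alpha_0 - 1) * np.log(grid) - grid / beta_0)   # Gamma(alpha_0, beta_0) log-prior (up to const.)
post = np.exp(log_post - log_post.max())
post /= post.sum() * (grid[1] - grid[0])
print(f"Posterior mean (grid): {(grid * post).sum() * (grid[1] - grid[0]):.3f}")
print(f"Max density difference: "
      f"{np.max(np.abs(post - Gamma.pdf(grid, a=alpha_post, scale=beta_post))):.2e}")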

(b) Find the Jeffreys' prior. Find the posterior.

Computing the Jeffreys' prior: \begin{align*} f(x;\lambda) &= e^{-\lambda} \frac{\lambda^x}{x!} \\ \Rightarrow \log f(x;\lambda) &= -\lambda + x\log(\lambda) - \log(x!) \\ \Rightarrow s(x, \lambda) = \frac{\partial}{\partial \lambda} \log f(x;\lambda) &= -1 + \frac{x}{\lambda} \\ \Rightarrow \frac{\partial s(x, \lambda)}{\partial \lambda} &= -\frac{x}{\lambda^2} \\ \Rightarrow I(\lambda) = -\mathbb{E}_{\lambda}\left(\frac{\partial s(x, \lambda)}{\partial \lambda}\right) &= \frac{1}{\lambda} \\ \Rightarrow f(\lambda) \propto \sqrt{I(\lambda)} &= \sqrt{1 / \lambda} \\ \end{align*}

Thus, the Jeffreys' prior is $f(\lambda) \propto \sqrt{1 / \lambda}$, an improper prior.

Computing the posterior:

\begin{align*} f(\lambda \mid X) &\propto \mathcal{L}_n(\lambda)f(\lambda) \\ &=\frac{1}{\prod_i X_i!} e^{-n\lambda}\lambda^{(\sum_i X_i)} \lambda^{-1/2} \\ &=\frac{1}{\prod_i X_i!} e^{-n\lambda}\lambda^{(\sum_i X_i) - 1/2} \\ &\propto \lambda^{\alpha' - 1} e^{-\lambda/\beta'} \end{align*}

where $\alpha' = (\sum_i X_i) + 1/2$ and $\beta' = 1/n$. Hence $\lambda \mid X \sim \text{Gamma}\left(\sum_i X_i + \frac12, \frac{1}{n}\right)$.
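To visualize this, the sketch below plots the Jeffreys'-prior posterior next to the conjugate posterior from the check in part (a) (it reuses `X_p`, `n_p`, `alpha_post`, `beta_post`, and `Gamma` from that cell):

In [ ]:
# Compare the Jeffreys'-prior posterior Gamma(sum(X) + 1/2, 1/n) with the conjugate one.
grid = np.linspace(2.5, 6, 1000)
plt.plot(grid, Gamma.pdf(grid, a=X_p.sum() + 0.5, scale=1 / n_p), label="Jeffreys' prior posterior")
plt.plot(grid, Gamma.pdf(grid, a=alpha_post, scale=beta_post), linestyle='--',
         label='Gamma(2, 1.5)-prior posterior')
plt.axvline(X_p.mean(), color='black', linestyle=':', label=r'MLE $\bar{X}$')
plt.legend()
plt.grid()
plt.show()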

7¶

In Example 11.9, verify 11.11.

See Example 11.9 for definitions.

We seek to show the Horvitz-Thompson estimator:

$$\hat{\psi} = \frac{1}{n} \sum_{i=1}^n \frac{R_iY_i}{\xi_{X_i}}$$

is unbiased with variance not exceeding $\frac{1}{n\delta^2}$.

Solution: Since the triples $(X_i, R_i, Y_i)$ are i.i.d., $\mathbb{E}\left[ \frac{R_iY_i}{\xi_{X_i}} \right]$ is the same for every $i$. Conditioning on $X_i$, we may establish that $\hat{\psi}$ is unbiased: \begin{align*} \mathbb{E}(\hat{\psi}) &= \frac{1}{n} \sum_{j=1}^n \mathbb{E}\left[ \frac{R_jY_j}{\xi_{X_j}} \right] \\ &= \mathbb{E}\left[ \frac{R_iY_i}{\xi_{X_i}} \right] \\ &= \mathbb{E}\left[\mathbb{E}\left[ \frac{R_iY_i}{\xi_{X_i}} \,\middle|\, X_i\right]\right] \tag{Iterated expectation} \\ &= \mathbb{E}\left[\frac{1}{\xi_{X_i}}\,\mathbb{E}\left[R_iY_i \mid X_i\right]\right] \\ &= \mathbb{E}\left[\frac{1}{\xi_{X_i}}\,\xi_{X_i}\theta_{X_i}\right] \tag{$R_i \perp Y_i \mid X_i$} \\ &= \mathbb{E}\left[\theta_{X_i}\right] = P(Y_i = 1) = \psi. \end{align*}

Similarly noting that $\mathbb{V}\left[ \frac{R_iY_i}{\xi_{X_i}} \right] = \mathbb{V}\left[ \frac{R_jY_j}{\xi_{X_j}} \right]$ for all $i,j \in \{1,\dots, n\}$, we may bound the variance:

\begin{align*} \mathbb{V}(\hat{\psi}) &= \mathbb{V} \left[\frac{1}{n} \sum_{i=1}^n \frac{R_iY_i}{\xi_{X_i}} \right] \\ &= \frac{1}{n^2} \sum_{i=1}^n \mathbb{V} \left[ \frac{R_iY_i}{\xi_{X_i}} \right] \tag{Independence} \\ &= \frac{1}{n} \mathbb{V} \left[ \frac{R_iY_i}{\xi_{X_i}} \right] \tag{Identically distributed} \\ &= \frac{1}{n} \left[\mathbb{E}\left[ \left(\frac{R_iY_i}{\xi_{X_i}} \right)^2\right] - \psi^2\right] \\ &\le \frac{1}{n} \mathbb{E}\left[ \frac{R_i^2Y_i^2}{\xi_{X_i}^2}\right] \tag{$\psi^2 \ge 0$} \\ &\le \frac{1}{n\delta^2} \mathbb{E} [R_i^2Y_i^2] \tag{$\xi_{X_i} \ge \delta$}\\ &\le \frac{1}{n\delta^2} \tag{$R_iY_i \le 1$} \end{align*}
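As an illustration (not part of the verification), here is a small Monte Carlo sketch under an assumed concrete version of the Example 11.9 setup: $X_i$ uniform on $\{0, 1\}$; given $X_i = x$, $R_i \sim \text{Bernoulli}(\xi_x)$ and $Y_i \sim \text{Bernoulli}(\theta_x)$ independently; $\xi$ known with $\xi_x \ge \delta$. The parameter values are made up.

In [ ]:
# Monte Carlo check that the Horvitz-Thompson estimator is unbiased and that
# its variance respects the 1/(n * delta^2) bound (illustrative, assumed setup).
xi = np.array([0.4, 0.9])       # known response probabilities, so delta = 0.4
theta = np.array([0.3, 0.7])    # P(Y = 1 | X = x)
n_ht, n_sims = 50, 20000
psi_true = theta.mean()         # X uniform on {0, 1}

psi_hat_sims = np.empty(n_sims)
for s in range(n_sims):
    X_ht = np.random.randint(0, 2, size=n_ht)
    R = np.random.binomial(1, xi[X_ht])
    Y = np.random.binomial(1, theta[X_ht])
    psi_hat_sims[s] = np.mean(R * Y / xi[X_ht])

print(f"psi = {psi_true:.3f}, mean of estimates = {psi_hat_sims.mean():.3f}")
print(f"variance of estimates = {psi_hat_sims.var():.5f}, "
      f"bound 1/(n delta^2) = {1 / (n_ht * 0.4 ** 2):.5f}")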

8¶

Let $X \sim N(\mu, 1)$. Consider testing

$$H_0 : \mu = 0 \text{ versus } H_1 : \mu \ne 0$$

Take $P(H_0) = P(H_1) = \frac12$. Let the prior for $\mu$ under $H_1$ be $\mu \sim N(0, b^2)$. Find an expression for $P(H_0 \mid X = x)$. Compare $P(H_0 \mid X = x)$ to the $p$-value of the Wald test. Do the comparison numerically for a variety of values of $x$ and $b$. Now repeat the problem using a sample of size $n$. You will see that the posterior probability of $H_0$ can be large even when the $p$-value is small, especially when $n$ is large. This disagreement between Bayesian and frequentist testing is called the Jeffreys-Lindley paradox.

Solution:

The MLE is $\bar{X}$, with standard error $\sigma / \sqrt{n} = n^{-1/2}$ (since $\sigma = 1$). So the Wald statistic is $W = \sqrt{n}\bar{X}$, with $p$-value $2 \Phi(-|W|)$.

We have

\begin{align*} P(H_0 \mid X^n) &= \frac{f(X^n \mid H_0) P(H_0)}{f(X^n \mid H_0) P(H_0) + f(X^n \mid H_1) P(H_1)} \\ &= \frac{f(X^n \mid H_0)}{f(X^n \mid H_0) + f(X^n \mid H_1)} \\ &= \frac{f(X^n \mid \mu = 0)}{f(X^n \mid \mu = 0) + \int f(X^n \mid \mu) f(\mu) \, d\mu} \\ &= \frac{1}{1 + \gamma} \end{align*}

where

$$\gamma = \frac{\int f(X^n \mid \mu) f(\mu) \, d\mu}{f(X^n \mid \mu = 0)}.$$

The integrand is:

\begin{align*} f(X^n \mid \mu) f(\mu) &= (2 \pi)^{-n/2} \exp \left\{ -\frac12 \sum_i (X_i - \mu)^2 \right\} (2 \pi b^2)^{-1/2} \exp \left\{ -\frac12 (\mu / b)^2 \right\} \\ &= \frac{1}{b} (2 \pi)^{-(n+1)/2} \exp \left\{ -\frac12 \left[\sum_i (X_i - \mu)^2 + (\mu / b)^2 \right] \right\}. \end{align*}

Letting

$$\sigma^2 = \frac{b^2}{1 + nb^2},$$

we can expand and complete the square:

\begin{align*} \sum_i (X_i - \mu)^2 + (\mu / b)^2 &= \frac{1}{b^2} \mu^2 + \sum_i [X_i^2 - 2X_i\mu + \mu^2] \\ &= \left(\frac{1}{b^2} + n\right)\mu^2 - 2n\bar{X}\mu + \sum_i X_i^2 \\ &= \frac{1}{\sigma^2}\mu^2 - 2n\bar{X}\mu + \sum_i X_i^2 \\ &= \frac{1}{\sigma^2}\left(\mu^2 - 2n\sigma^2\bar{X}\mu\right) + \sum_i X_i^2 \\ &= \frac{1}{\sigma^2}\left(\mu - n\sigma^2\bar{X}\right)^2 + \sum_i X_i^2 - (n\sigma\bar{X})^2. \\ \end{align*}

Thus, we can express the integrand as the product of a constant and a density:

\begin{align*} f(X^n \mid \mu) f(\mu) &= \frac{1}{b}(2 \pi)^{-(n+1)/2} \exp \left\{ -\frac{\left(\mu - n\sigma^2\bar{X}\right)^2}{2\sigma^2} \right\} \exp \left\{ -\frac12 \left[ \sum_i X_i^2 - (n\sigma\bar{X})^2 \right] \right\} \\ &= \frac{\sigma}{b}(2 \pi)^{-n/2}\exp \left\{ -\frac12 \left[\sum_i X_i^2 - (n\sigma\bar{X})^2 \right] \right\} \cdot (2 \pi \sigma^2)^{-1/2} \exp \left\{ -\frac{\left(\mu - n\sigma^2\bar{X}\right)^2}{2\sigma^2} \right\}. \end{align*}

Integrating over $\mu$, the second factor (a Normal density in $\mu$) integrates to one, leaving:

$$\int f(X^n \mid \mu) f(\mu) \, d\mu = \frac{\sigma}{b}(2 \pi)^{-n/2}\exp \left\{ -\frac12 \left[\sum_i X_i^2 - (n\sigma\bar{X})^2 \right] \right\}$$

Meanwhile,

\begin{align*} f(X^n \mid \mu = 0) &= (2\pi)^{-n/2} \exp\left\{-\frac{1}{2} \sum_i X_i^2 \right\} \end{align*}

Thus,

$$ \gamma = \frac{\int f(X^n \mid \mu) f(\mu) \, d\mu}{f(X^n \mid \mu = 0)} = \frac{\sigma}{b} \exp \left\{ (n\sigma\bar{X})^2 / 2 \right\},$$

and

\begin{align*} P(H_0 \mid X^n) &= \frac{1}{1 + \frac{\sigma}{b} \exp \left\{(n\sigma\bar{X})^2 / 2 \right\}} \end{align*}
In [15]:
for n in [10, 100, 1000]:
    X = norm.rvs(loc=0.1, scale=1, size=n)
    X_bar = X.mean()
    p_value = 2 * norm.cdf(-np.sqrt(n) * np.abs(X_bar))
    
    print(f"n = {n}, p-value: {p_value:.3f}")
    b = np.logspace(-4, 5, num=50, base=10)
    
    sigma_2 = b ** 2 / (1 + n * (b ** 2))
    z = (np.sqrt(sigma_2) / b) * np.exp(( np.sqrt(sigma_2) * n * X_bar) ** 2 / 2)
    posterior = 1 / (1 + z)

    plt.plot(b, posterior, label=f"n={n}")
    plt.xscale('log')
    plt.xlabel('b')
    plt.ylabel("Posterior Probability of $H_0$")
plt.legend()
plt.grid()
plt.show()
n = 10, p-value: 0.008
n = 100, p-value: 0.193
n = 1000, p-value: 0.000