import math
from matplotlib import pyplot as plt
import numpy as np
import plotly.graph_objects as go
from scipy.stats import norm
np.random.seed(42)
Prove Theorem 14.1:
Let $a$ be a vector of length $k$ and let $X$ be a random vector of the same length with mean $\mu$ and variance $\Sigma$. Then $\mathbb{E}(a^TX) = a^T \mu$ and $\mathbb{V}(a^T X) = a^T \Sigma a$. If $A$ is a matrix with $k$ columns, then $\mathbb{E}(AX) = A\mu$ and $\mathbb{V}(AX) = A\Sigma A^T$.
Solution: By linearity of expectation for scalar random variables, \begin{align*} \mathbb{E}(a^TX) &= \mathbb{E}\left(\sum_{i=1}^k a_i X_i\right) = \sum_{i=1}^k a_i \mathbb{E}(X_i) = a^T \mu. \end{align*}
Meanwhile, since $\text{Cov}(aX, bY) = ab\,\text{Cov}(X, Y)$ for constants $a, b$, and the variance of a sum is the sum of all pairwise covariances, \begin{align*} \mathbb{V}(a^TX) &= \text{Cov}\left(\sum_{i=1}^k a_i X_i, \sum_{j=1}^k a_j X_j\right) \\ &= \sum_{i=1}^k \sum_{j=1}^k a_i a_j \text{Cov}(X_i, X_j) \\ &= \begin{bmatrix} a_1 & \dots & a_k \end{bmatrix} \begin{bmatrix} \mathbb{V}(X_1) & \text{Cov}(X_1, X_2) & \dots & \text{Cov}(X_1, X_k) \\ \text{Cov}(X_2, X_1) & \mathbb{V}(X_2) & \dots & \text{Cov}(X_2, X_k) \\ \vdots & \vdots & \ddots & \vdots \\ \text{Cov}(X_k, X_1) & \text{Cov}(X_k, X_2) & \dots & \mathbb{V}(X_k) \end{bmatrix} \begin{bmatrix} a_1 \\ \vdots \\ a_k \end{bmatrix} = a^T \Sigma a. \end{align*}
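As a quick Monte Carlo sanity check of the two scalar identities just derived (a sketch only; the particular $a$, $\mu$, $\Sigma$, and the `_check` variable names below are arbitrary illustrative choices, not part of the exercise):
a_check = np.array([1.0, -2.0, 0.5])
mu_check = np.array([0.5, -1.0, 2.0])
sigma_check = np.array([[2.0, 0.3, 0.1],
                        [0.3, 1.0, -0.2],
                        [0.1, -0.2, 0.5]])
X_check = np.random.multivariate_normal(mu_check, sigma_check, size=int(1e6))
# empirical mean and variance of a^T X vs. the theoretical values a^T mu and a^T Sigma a
print(np.mean(X_check @ a_check), a_check @ mu_check)
print(np.var(X_check @ a_check), a_check @ sigma_check @ a_check)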
Now suppose $A$ has $m$ rows, and let $a_{l\cdot}$ denote the vector of length $k$ corresponding to the $l^{\text{th}}$ row of $A$. Then
\begin{align*} \mathbb{E}(AX) &= \mathbb{E}\left( \begin{bmatrix} \sum_{i=1}^k a_{1i}X_i \\ \vdots \\ \sum_{i=1}^k a_{mi}X_i \end{bmatrix} \right) \\ &= \begin{bmatrix} \mathbb{E}(a_{1\cdot}^T X) \\ \vdots \\ \mathbb{E}(a_{m\cdot}^T X) \\ \end{bmatrix} \\ &= \begin{bmatrix} a_{1\cdot}^T \mu \\ \vdots \\ a_{m\cdot}^T \mu \\ \end{bmatrix} = A\mu. \end{align*}Note that we can similarly show that $\mathbb{E}(X^T A^T) = \mu^T A^T$.
Finally, using the definition $\mathbb{V}(X) = \mathbb{E}((X - \mu)(X - \mu)^T)$ and the previous facts,
\begin{align*} \mathbb{V}(AX) &= \mathbb{E}\left[(AX - \mathbb{E}(AX))(AX - \mathbb{E}(AX))^T\right] \\ &= \mathbb{E}\left[(AX - A\mu)(AX - A\mu)^T \right] \\ &= \mathbb{E}\left[A(X - \mu)(A(X - \mu))^T \right] \\ &= \mathbb{E}\left[A(X - \mu)(X - \mu)^T A^T\right] \\ &= A\mathbb{E}\left[(X - \mu)(X - \mu)^T\right] A^T = A \Sigma A^T.\\ \end{align*}Find the Fisher information matrix for the MLE of a Multinomial.
Solution: Let $X \sim \text{Multinomial}(n, p)$, where $p$ is a vector of length $k$ consisting of nonnegative entries that sum to 1. Theorem 14.5 establishes that the MLE of $p$ is $\hat{p} = X / n$.
The probability mass function of $X$ is given by:
$$\frac{n!}{\prod_{j=1}^k X_j!} \prod_{j=1}^k p_j^{X_j},$$whence the log-likelihood of any estimate of $p$ (up to an additive constant) is
$$\ell(p) = \sum_{j=1}^k X_j \log p_j.$$The gradient of the above with respect to $p$ is:
$$\begin{bmatrix} X_1 / p_1 \\ \vdots \\ X_k / p_k \end{bmatrix},$$
whence we get the entries of the Hessian:
$$H_{ij} = \begin{cases} -X_i / p_i^2 & i = j \\ 0 & i \ne j. \end{cases}$$Since $\mathbb{E}(X_i) = np_i$, negating the expectation of the Hessian yields the Fisher information, with diagonal entries $-\mathbb{E}(H_{ii}) = n p_i / p_i^2 = n / p_i$:
$$I_n = n \begin{bmatrix} \frac{1}{p_1} & 0 & \dots & 0 \\ 0 & \frac{1}{p_2} & \dots & 0 \\ \vdots & \vdots & \ddots & \vdots \\ 0 & 0 & \dots & \frac{1}{p_k} \\ \end{bmatrix}.$$(Computer Experiment.) Write a function to generate nsim
observations from a Multinomial($n, p$) distribution.
nsim = int(1e5)
n = int(1e2)
p = np.random.random(5)
p /= sum(p)
def get_obs(nsim, n, p):
    """
    Generates nsim observations from a Multinomial(n, p) distribution.
    For each observation, draws n Uniform(0, 1) variables and bins them
    using the cumulative sums of p; the bin counts form the observation.
    """
    data = np.random.random((nsim, n))
    bins = np.append([0], p.cumsum())
    obs = np.apply_along_axis(lambda x: np.histogram(x, bins)[0], 1, data)
    return obs
result = get_obs(nsim, n, p)
print(p)
result
[0.13319703 0.33810082 0.26031769 0.21289984 0.05548463]
array([[18, 32, 26, 19,  5],
       [12, 35, 26, 24,  3],
       [14, 26, 34, 21,  5],
       ...,
       [14, 33, 28, 18,  7],
       [18, 31, 14, 32,  5],
       [ 7, 35, 27, 24,  7]])
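As a rough sanity check of the sampler (not part of the exercise), the sample mean of the simulated observations should be close to $np$, and the sample covariance close to $n(\text{diag}(p) - pp^T)$, the mean and covariance of a Multinomial($n, p$):
print(result.mean(axis=0))
print(n * p)
print(np.cov(result, rowvar=False))
print(n * (np.diag(p) - np.outer(p, p)))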
(Computer Experiment.) Write a function to generate nsim
observations from a Multivariate normal with given mean $\mu$ and covariance matrix $\Sigma$.
Solution: We shall implement a generator rather than use a library, using the following steps:
1. Generate pairs of Uniform(0, 1) samples $U_1, U_2$ and apply the Box–Muller transform, $Z_1 = \sqrt{-2\log U_1}\cos(2\pi U_2)$ and $Z_2 = \sqrt{-2\log U_1}\sin(2\pi U_2)$, to obtain independent standard normal samples.
2. Using the above process, generate an $\text{nsim} \times k$ matrix of standard normally distributed samples, $Z$, where $k$ is the number of variables in the desired distribution.
3. Get the desired result via the transform $X = \mu + ZL^T$, where $L$ is the Cholesky factor of $\Sigma$, i.e., $L L^T = \Sigma$.
k = 2
mu = np.array([1, 1])
sigma = np.array([[1, 3 / 5],
[3 / 5, 2]])
nsim = int(1e5)
def get_samples(mu, sigma, nsim):
    k = mu.size
    # Box-Muller: two independent uniforms yield two independent standard normals
    u_1 = np.random.uniform(size=((nsim // 2) + 1, k))
    u_2 = np.random.uniform(size=((nsim // 2) + 1, k))
    z_1 = np.sqrt(-2 * np.log(u_1)) * np.cos(2 * np.pi * u_2)
    z_2 = np.sqrt(-2 * np.log(u_1)) * np.sin(2 * np.pi * u_2)
    z = np.concatenate((z_1, z_2), axis=0)[:nsim]
    L = np.linalg.cholesky(sigma)  # Cholesky factor of sigma, L @ L.T == sigma
    return mu + z @ L.T
z = get_samples(mu, sigma, nsim)
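A quick sanity check (not required by the exercise): the sample mean and covariance of the generated draws should be close to $\mu$ and $\Sigma$.
print(z.mean(axis=0))           # should be close to mu
print(np.cov(z, rowvar=False))  # should be close to sigma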
Plotting a 2D histogram:
import plotly.graph_objects as go
fig = go.Figure(
    go.Histogram2d(
        x=z[:, 0],
        y=z[:, 1]
    )
)
fig.update_layout(
    width=500,
    height=500,
    paper_bgcolor="LightSteelBlue",
    xaxis_range=[max(-5, min(z[:, 0])),
                 min(max(z[:, 0]), 5)],
    yaxis_range=[max(-5, min(z[:, 1])),
                 min(max(z[:, 1]), 5)]
)
fig.show()
(Computer Experiment.) Generate 100 random vectors from a $N(\mu, \Sigma)$ distribution where
$$ \mu = \begin{pmatrix} 3 \\ 8 \end{pmatrix}, \,\, \Sigma = \begin{pmatrix} 1 & 1 \\ 1 & 2 \\ \end{pmatrix}. $$Plot the simulation as a scatterplot. Estimate the mean and covariance matrix $\Sigma$. Find the correlation $\rho$ between $X_1$ and $X_2$. Compare this with the sample correlations from your simulation. Find a 95 percent confidence interval for $\rho$. Use two methods: the bootstrap and Fisher's method. Compare.
mu = np.array([3, 8])
sigma = np.array([
    [1, 1],
    [1, 2],
])
n = 100
X = get_samples(mu, sigma, nsim=n)
plt.grid()
plt.scatter(x=X[:, 0], y=X[:, 1])
plt.show()
mu_hat = np.mean(X, axis=0)
sigma_hat = (X - mu_hat).T @ (X - mu_hat) / (n - 1)
sigma_hat_numpy = np.cov(X, rowvar=False)
assert np.allclose(sigma_hat, sigma_hat_numpy)
print(f"mu_hat:")
print(f"{mu_hat}")
print(f"sigma_hat: ")
print(f"{sigma_hat}")
mu_hat:
[3.04491859 8.00680898]
sigma_hat: 
[[0.9948494  1.14602857]
 [1.14602857 2.32019661]]
The true correlation can be computed:
\begin{align*} \rho &= \frac{\mathbb{E}[(X_1 - \mu_1)(X_2 - \mu_2)]}{\sigma_1 \sigma_2} \\ &= \frac{\text{Cov}(X_1, X_2)}{\sigma_1 \sigma_2} \\ &= \frac{1}{\sqrt{1}\sqrt{2}} \\ &\approx 0.7071. \end{align*}Computing the sample correlation via the formula
$$\hat{\rho} = \frac{\sum_{i=1}^n (X_{1i} - \bar{X}_1)(X_{2i} - \bar{X}_2)} {\sqrt{\sum_{i}(X_{1i} - \bar{X}_1)^2} \sqrt{\sum_i(X_{2i} - \bar{X}_2)^2}}.$$Note the corresponding equation in the book (which keeps this numerator but uses the sample standard deviations $s_1 s_2$ in the denominator) is missing a factor of $\frac{1}{n-1}$.
s_1 = np.sqrt(np.power(np.linalg.norm(X[:, 0] - mu_hat[0]), 2) / (n - 1))
s_2 = np.sqrt(np.power(np.linalg.norm(X[:, 1] - mu_hat[1]), 2) / (n - 1))
rho_hat = np.sum((X[:, 0] - mu_hat[0]) * (X[:, 1] - mu_hat[1])) / ((n-1) * s_1 * s_2)
rho_hat_numpy = np.corrcoef(X[:,0], X[:, 1])[0,1]
assert np.isclose(rho_hat, rho_hat_numpy)
print(f"Sample correlation: {rho_hat:.3f}")
Sample correlation: 0.754
Computing a 95% confidence interval using the bootstrap (using B = 1000):
def get_rho_hat(X):
    n = X.shape[0]  # number of observations
    mu_hat = np.mean(X, axis=0)
    s_1 = np.sqrt(np.power(np.linalg.norm(X[:, 0] - mu_hat[0]), 2) / (n - 1))
    s_2 = np.sqrt(np.power(np.linalg.norm(X[:, 1] - mu_hat[1]), 2) / (n - 1))
    sample_cov = np.sum((X[:, 0] - mu_hat[0]) * (X[:, 1] - mu_hat[1]))
    return sample_cov / ((n - 1) * s_1 * s_2)
def get_bootstrap_ci(X):
    rho_hat = get_rho_hat(X)
    n = X.shape[0]
    B = 1000
    T = np.empty(B)
    for i in range(B):
        # resample rows with replacement and recompute the correlation
        X_star = X[np.random.choice(X.shape[0], size=n, replace=True)]
        T[i] = get_rho_hat(X_star)
    se_boot = np.sqrt(np.var(T))  # bootstrap estimate of the standard error
    lb = rho_hat - z_crit * se_boot
    ub = rho_hat + z_crit * se_boot
    return (lb, ub)
z_crit = norm.ppf(1 - 0.05 / 2)  # two-sided 95% critical value
ci_boot = get_bootstrap_ci(X)
print(f"Normal 95% CI: ({ci_boot[0]:.3f}, {ci_boot[1]:.3f})")
Normal 95% CI: (0.670, 0.839)
Now we use Fisher's method, which has the following steps:
1. Compute the Fisher transform $\hat{\theta} = \frac{1}{2}\log\left(\frac{1 + \hat{\rho}}{1 - \hat{\rho}}\right)$, whose approximate standard error is $\hat{\text{se}}(\hat{\theta}) = 1 / \sqrt{n - 3}$.
2. Form the interval $(a, b) = \left(\hat{\theta} - z_{\alpha/2} / \sqrt{n - 3},\; \hat{\theta} + z_{\alpha/2} / \sqrt{n - 3}\right)$.
3. Transform back to the $\rho$ scale: the confidence interval is $\left(\frac{e^{2a} - 1}{e^{2a} + 1}, \frac{e^{2b} - 1}{e^{2b} + 1}\right)$.
def get_fishers_method_ci(X):
    rho_hat = get_rho_hat(X)
    n = X.shape[0]
    theta_hat = (1 / 2) * (math.log(1 + rho_hat) - math.log(1 - rho_hat))
    se_theta_hat = 1 / math.sqrt(n - 3)
    a = theta_hat - z_crit * se_theta_hat
    b = theta_hat + z_crit * se_theta_hat
    # map the endpoints back to the rho scale
    lb = (math.exp(2 * a) - 1) / (math.exp(2 * a) + 1)
    ub = (math.exp(2 * b) - 1) / (math.exp(2 * b) + 1)
    return (lb, ub)
ci_fisher = get_fishers_method_ci(X)
print(f"Fisher's Method 95% CI: ({ci_fisher[0]:.3f}, {ci_fisher[1]:.3f})")
Fisher's Method 95% CI: (0.655, 0.828)
In this instance, both methods yielded a confidence interval containing the true value $\sqrt{2} / 2$, with Fisher's method giving a slightly wider interval.
boot_ci_width = ci_boot[1] - ci_boot[0]
fisher_ci_width = ci_fisher[1] - ci_fisher[0]
print(f"Bootstrap C.I. width: {boot_ci_width:.3f}")
print(f"Fisher's Method C.I. width: {fisher_ci_width:.3f}")
Bootstrap C.I. width: 0.169
Fisher's Method C.I. width: 0.173
(Computer Experiment.) Repeat the previous exercise 1000 times. Compare the coverage of the two confidence intervals for $\rho$.
rho = math.sqrt(2) / 2
K = 1000
coverage = {"Bootstrap": np.zeros(K),
"Fisher": np.zeros(K)}
width = {"Bootstrap": np.zeros(K),
"Fisher": np.zeros(K)}
for k in range(K):
    X = get_samples(mu=mu, sigma=sigma, nsim=100)
    ci_boot = get_bootstrap_ci(X)
    ci_fisher = get_fishers_method_ci(X)
    if (ci_boot[0] <= rho) and (rho <= ci_boot[1]):
        coverage["Bootstrap"][k] = 1
    if (ci_fisher[0] <= rho) and (rho <= ci_fisher[1]):
        coverage["Fisher"][k] = 1
    width["Bootstrap"][k] = ci_boot[1] - ci_boot[0]
    width["Fisher"][k] = ci_fisher[1] - ci_fisher[0]
Confidence intervals generated by Fisher's method tend to contain the true value more often, and tend to be wider, than those generated by the bootstrap.
print(f"Bootstrap Coverage: {100 * coverage['Bootstrap'].mean():.1f}%")
print(f"Fisher's Method Coverage: {100 * coverage['Fisher'].mean():.1f}%")
print(f"Bootstrap C.I. Avg. Width: {width['Bootstrap'].mean():.4f}%")
print(f"Fisher's Method C.I. Avg. Width: {width['Fisher'].mean():.4f}%")
Bootstrap Coverage: 95.4%
Fisher's Method Coverage: 96.0%
Bootstrap C.I. Avg. Width: 0.1967
Fisher's Method C.I. Avg. Width: 0.2001