import matplotlib.pyplot as plt
import numpy as np
from numpy.random import default_rng
import pandas as pd
from scipy.integrate import quad
from scipy.stats import gamma, norm, uniform
from scipy.optimize import minimize
np.set_printoptions(formatter={"float": "{: 0.3f}".format})
np.random.seed(0)
Let
$$I = \int_1^2 \frac{e^{-x^2 / 2}}{\sqrt{2 \pi}} dx .$$
(a) Estimate $I$ using the basic Monte Carlo method. Use $N = 100,000$. Also, find the estimated standard error.
Solution:
Letting $f(x) = e^{-x^2 / 2} / \sqrt{2 \pi}$ and $h(x) = \mathbb{1}_{x \in [1, 2]}$, we have
$$I = \int h(x) f(x)\, dx,$$and we can estimate $I$ via
$$\hat{I} = \frac{1}{N} \sum_{i=1}^N Y_i = \frac{1}{N} \sum_{i=1}^N h(X_i),$$where $X_1, \dots, X_N$ are drawn from a standard normal distribution. Note that $\hat{I}$ is the sample mean of $Y$. An estimated standard error of the sample mean is given by
$$\hat{\textsf{se}} = \frac{s}{\sqrt{N}}$$where
$$s^2 = \frac{\sum_{i=1}^N (Y_i - \hat{I})^2}{N - 1}$$is the sample variance.
def h(x):
    # indicator of the interval [a, b]
    return np.where((a <= x) & (x <= b), 1, 0)
N = 100_000
a = 1
b = 2
X = norm.rvs(loc=0, scale=1, size=N)
Y = h(X)
I_hat = np.mean(Y)
s2 = (np.linalg.norm(Y - I_hat) ** 2) / (N - 1) # sample variance
se = np.sqrt(s2) / np.sqrt(N) # estimated standard error
print(f"I_hat : {I_hat:.5f}, estimated standard error: {se:.3e}")
print(f"Approximate 95% C.I.: ({I_hat - 2 * se:.3e}, {I_hat + 2 * se:.3e})")
I_hat : 0.13457, estimated standard error: 1.079e-03
Approximate 95% C.I.: (1.324e-01, 1.367e-01)
(b) Find an (analytical) expression for the standard error of your estimate in (a). Compare to the estimated standard error.
Solution:
The true standard error (what $\hat{\textsf{se}}$ estimates) is the standard deviation of the sample mean of $Y$, namely, $\hat{I}$. Since $Y_i = \mathbb{1}_{X_i \in [1,2]}$ are independent, it is given by
$$\textsf{se} = \frac{\sigma_{Y}}{\sqrt{N}},$$where $\sigma_Y$ is the standard deviation of $Y$. Observe that $\mathbb{E}[Y_i] = I = \Phi(2) - \Phi(1)$, and that since $Y_i \in [0, 1]$, $Y_i^2 = Y_i$. Therefore,
$$ \begin{align*} \sigma_Y^2 = \mathbb{V}[Y_i] &= \mathbb{E}[Y_i^2] - \mathbb{E}[Y_i]^2 = I(1 - I), \\ \end{align*} $$and thus
$$\textsf{se} = \sqrt{\frac{I(1 - I)}{N}}.$$
I = norm.cdf(2) - norm.cdf(1)
se_true = np.sqrt(I * (1 - I) / N)
print(f"True standard error: {se_true:.3e}")
print(f"Estimated standard error: {se:.3e}")
print(f"Difference: {se_true - se:.3e}")
True standard error: 1.084e-03
Estimated standard error: 1.079e-03
Difference: 4.503e-06
(c) Estimate $I$ using importance sampling. Take $g$ to be $N(1.5, v^2)$ with $v = .1$, $v = 1$ and $v = 10$. Compute the (true) standard errors in each case. Also, plot a histogram of the values you are averaging to see if there are any extreme values.
def f(x):
    # target density: standard normal
    return norm.pdf(x, loc=0, scale=1)
def g(x):
    # importance sampling proposal N(1.5, v^2): the scale (standard deviation) is v
    return norm.pdf(x, loc=1.5, scale=v)
fig, ax = plt.subplots(figsize=(14, 4), nrows=1, ncols=3)
estimates = {}
for i, v in enumerate([0.1, 1, 10]):
    X = norm.rvs(loc=1.5, scale=v, size=N)  # draw from the proposal g
Y = (h(X) * f(X)) / g(X)
estimates[v] = np.mean(Y)
ax[i].hist(Y, bins=100)
ax[i].set_title(f"$v$ = {v}")
print(estimates)
fig.suptitle("Histograms for $Y_i$", fontsize=14)
plt.show()
{0.1: 0.013070056030923984, 1: 0.13611405986928374, 10: 0.131887993546821}
The true standard error can once again be computed via
$$\textsf{se} = \frac{\sigma_{Y}}{\sqrt{N}}.$$However, now
$$Y = \frac{h(X)f(X)}{g(X)}, \quad X \sim g = N(1.5, v^2).$$To keep the formula general, let $m = \mathbb{E}[X]$ denote the proposal mean (in the problem, $m = 1.5$). The variance of $Y$ is given by:
$$ \begin{align*} \sigma_{Y}^2 &= \mathbb{E}[Y^2] - \mathbb{E}[Y]^2 \\ &= \int \frac{h^2(x)f^2(x)}{g^2(x)} g(x) \, dx - \left( \int h(x) f(x) \, dx\right)^2 \\ &= \int_1^2 \frac{f^2(x)}{g(x)} \, dx - \left( \int_1^2 f(x) \, dx\right)^2 \\ &= \int_1^2 \frac{e^{-x^2}}{2 \pi} \frac{\sqrt{2 \pi v^2}}{e^{-(x - m)^2 / 2v^2}} \, dx - I^2 \\ &= \frac{v}{\sqrt{2 \pi}} \int_1^2 \exp\left\{-x^2 + \frac{1}{2} \left(\frac{x - m}{v}\right)^2\right\} \, dx - I^2 .\\ \end{align*} $$It is possible to simplify the integral term. However, we can compute it numerically, yielding the standard errors:
m = 1.5
def integrand(x):
    # integrand of the first term in the expression for sigma_Y^2
    return (v / np.sqrt(2 * np.pi)) * np.exp(
        -(x**2) + ((x - m) ** 2) / (2 * (v**2))
    )
vs = [0.1, 1, 10]
ses = {}
for v in vs:
    first_term, _ = quad(integrand, 1, 2)
    sigma_y_2 = first_term - I**2
    ses[v] = np.sqrt(sigma_y_2 / N)  # se = sigma_Y / sqrt(N)
pd.DataFrame.from_dict({"Standard Error": ses}, orient="index").T
|      | Standard Error |
|------|----------------|
| 0.1  | 0.028868       |
| 1.0  | 0.000619       |
| 10.0 | 0.002284       |
We can also search numerically over a grid of $v$ values for the one that minimizes the standard error:
vv = np.logspace(-1, 1)
sigma_y_2s = {}
ses = {}
for v in vv:
    def integrand(x):
        return (v / np.sqrt(2 * np.pi)) * np.exp(
            -(x**2) + ((x - m) ** 2) / (2 * (v**2))
        )
    first_term, _ = quad(integrand, 1, 2)
    sigma_y_2 = first_term - I**2
    sigma_y_2s[v] = sigma_y_2
    ses[v] = np.sqrt(sigma_y_2 / N)  # se = sigma_Y / sqrt(N)
plt.plot(ses.keys(), ses.values())
plt.xscale("log")
plt.yscale("log")
plt.xlabel("$v$")
plt.ylabel("Standard Error")
plt.grid()
plt.show()
optimal_v = list(ses.keys())[np.array(list(ses.values())).argmin()]
print(f"Optimal v: {optimal_v:.3e}, Standard Error: {ses[optimal_v]:.3e}")
Optimal v: 3.393e-01, Standard Error: 3.350e-04
(d) Find the optimal importance sampling function $g^*$. What is the standard error using $g^*$?
Solution:
By Theorem 24.5, the optimal choice of $g$ is
$$g^*(x) = \frac{|h(x)|f(x)}{\int |h(s)| f(s) \, ds}$$In our case, $f(x) = e^{-x^2 / 2} / \sqrt{2 \pi}$ and $h(x) = \mathbb{1}_{x \in [1, 2]}$, so
$$ \begin{align*} g^*(x) &= \begin{cases} \frac{e^{-x^2 / 2}}{\int_1^2 e^{-s^2 / 2} \, ds} = \frac{\phi(x)}{\Phi(2) - \Phi(1)} & 1 \le x \le 2 \\ 0 & \text{otherwise.} \end{cases} \end{align*} $$In this case, the variance vanishes:
$$ \begin{align*} \sigma_{Y}^2 &= \int_1^2 \frac{f^2(x)}{g^*(x)} \, dx - \left( \int_1^2 f(x) \, dx\right)^2 \\ &= \int_1^2 \frac{\phi^2(x) \left( \Phi(2) - \Phi(1)\right)}{\phi(x)} \, dx - (\Phi(2) - \Phi(1))^2 \\ &= \left( \Phi(2) - \Phi(1)\right) \int_1^2 \phi(x) \, dx - (\Phi(2) - \Phi(1))^2 = 0, \end{align*} $$so the standard error using $g^*$ is zero: a single draw from $g^*$ already yields $\hat{I} = \Phi(2) - \Phi(1) = I$ exactly.
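As a quick numerical check (a sketch, reusing $h$ and $N$ from part (a)): sampling from $g^*$, the standard normal truncated to $[1, 2]$, makes every importance weight equal to $\Phi(2) - \Phi(1)$, so the resulting estimate has zero variance.
from scipy.stats import truncnorm
g_star = truncnorm(a=1, b=2, loc=0, scale=1)  # standard normal truncated to [1, 2]
X_star = g_star.rvs(size=N)
Y_star = h(X_star) * norm.pdf(X_star) / g_star.pdf(X_star)  # importance weights
print(f"mean: {Y_star.mean():.5f}, variance: {Y_star.var():.2e}")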
Here is a way to use importance sampling to estimate a marginal density. Let $f_{X,Y}(x, y)$ be a bivariate density and let $(X_1, Y_1), \dots, (X_N, Y_N) \sim f_{X, Y}$.
(a) Let $w(x)$ be an arbitrary probability density function. Let
$$\hat{f}_X(x) = \frac{1}{N} \sum_{i=1}^N \frac{f_{X,Y}(x, Y_i) w(X_i)}{f_{X,Y}(X_i, Y_i)}$$Show that, for each $x$,
$$\hat{f}_X(x) \xrightarrow{P} f_X(x).$$Find an expression for the variance of this estimator.
Solution:
If $(X_i, Y_i)$ are i.i.d. according to a density $f_{X,Y}(x,y)$, the law of large numbers states that for any function $g$:
$$\frac{1}{N} \sum_{i=1}^N g(X_i, Y_i) \xrightarrow{P} \int g(x,y) f_{X,Y}(x,y) \, dx dy$$Thus,
$$ \begin{align*} \hat{f}_X(x) &= \frac{1}{N} \sum_{i=1}^N \frac{f_{X,Y}(x, Y_i) w(X_i)}{f_{X,Y}(X_i, Y_i)} \\ &\xrightarrow{P} \int \int \left[\frac{f_{X,Y}(x, y) w(x')}{f_{X,Y}(x', y)} \right] f_{X,Y}(x', y) \, dx' dy\\ &= \int f_{X,Y}(x, y) \left[ \int w(x') \, dx' \right] dy \\ &= f_{X}(x). \end{align*} $$To find the variance of the estimator, let $U_i = \frac{f_{X,Y}(x, Y_i) w(X_i)}{f_{X,Y}(X_i, Y_i)}$ denote the $i^{\text{th}}$ summand, so that $\hat{f}_X(x)$ is the sample mean of the i.i.d. terms $U_1, \dots, U_N$, and compute $\mathbb{V}(U_i) = \mathbb{E}(U_i^2) - \mathbb{E}(U_i)^2$.
We have
$$ \begin{align*} \mathbb{E}\left[U_i \right] &= \int \int \frac{f_{X, Y}(x, y) w(x')}{f_{X,Y}(x', y)} f_{X,Y}(x', y) \, dy \, dx' \\ &= f_{X}(x) \int w(x') \, dx' = f_X(x),\\ \end{align*} $$and
$$ \begin{align*} \mathbb{E}\left[U_i^2 \right] &= \int \int \frac{f_{X, Y}^2(x, y) w^2(x')}{f_{X,Y}^2(x', y)} f_{X,Y}(x', y) \, dy \, dx' \\ &= \int \int \frac{f_{X, Y}^2(x, y) w^2(x')}{f_{X,Y}(x', y)} \, dy \, dx'. \end{align*} $$Since the $U_i$ are i.i.d., the variance of the estimator is
$$\mathbb{V}\left[ \hat{f}_X(x) \right] = \frac{\mathbb{V}(U_i)}{N} = \frac{1}{N}\left[\int \int \frac{f_{X, Y}^2(x, y) w^2(x')}{f_{X,Y}(x', y)} \, dy \, dx' - f_X^2(x)\right].$$
(b) Let $Y \sim N(0, 1)$ and $X \mid Y = y \sim N(y, 1 + y^2)$. Use the method in (a) to estimate $f_X(x)$.
Solution:
The joint pdf is given by:
$$ \begin{align*} f_{X,Y}(x,y) &= f_{X \mid Y}(x, y) f_{Y}(y) \\ &= \frac{1}{\sqrt{2 \pi (1 + y^2)}} \exp \left\{- \frac{1}{2} \frac{(x - y) ^2}{(1 + y^2)} \right\} \frac{1}{\sqrt{2 \pi}} \exp \left\{ -y^2 / 2 \right\} \\ &= \frac{1}{2 \pi \sqrt{1 + y^2}} \exp \left\{-\frac{1}{2} \left[ \frac{(x - y) ^2}{(1 + y^2)} + y^2\right] \right\} \end{align*} $$We can generate samples of $Y$, and from those, $X$. We shall set $w$ to be a normal distribution with location 5 and scale 10.
N = 100_000
Y = norm.rvs(loc=0, scale=1, size=N)
X = norm.rvs(loc=Y, scale=np.sqrt(1 + Y**2))
def f(x, y):
coef = 1 / (2 * np.pi * np.sqrt(1 + y**2))
exp = -(1 / 2) * ((x - y) ** 2 / (1 + y**2) + y**2)
return coef * np.exp(exp)
def w(x):
return norm.pdf(x, loc=5, scale=10)
xx = np.arange(-10, 10, step=0.05)
f_X = np.zeros_like(xx)
for i, x in enumerate(xx):
f_X[i] = np.nanmean(f(x, Y) * w(X) / f(X, Y))
plt.figure(figsize=(8, 4))
plt.plot(xx, f_X)
plt.xlabel("x")
plt.ylabel("Estimate of f_X")
plt.grid()
plt.show()
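As a sanity check (a sketch), we can compare the importance sampling estimate at a few points against a direct numerical integration of the joint density over $y$:
for x in [-3.0, 0.0, 3.0]:
    f_x_quad, _ = quad(lambda y: f(x, y), -np.inf, np.inf)  # marginal f_X(x) via quadrature
    f_x_is = np.nanmean(f(x, Y) * w(X) / f(X, Y))  # importance sampling estimate
    print(f"x = {x:+.1f}: quadrature {f_x_quad:.4f}, importance sampling {f_x_is:.4f}")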
Here is a method called accept-reject sampling for drawing observations from a distribution.
(a) Suppose that $f$ is some probability density function. Let $g$ be any other density and suppose that $f(x) \le Mg(x)$ for all $x$, where $M$ is a known constant. Consider the following algorithm:
(step 1): Draw $X \sim g$ and $U \sim \text{Unif}(0, 1)$;
(step 2): If $U \le f(X) / (M g(X))$ set $Y = X$, otherwise go back to step 1. (Keep repeating until you finally get an observation.)
Show that the distribution of $Y$ is $f$.
Solution:
For any $y$, the accepted draw satisfies
$$ \begin{align*} \mathbb{P}(Y \le y) &= \mathbb{P}\left(X \le y \,\middle|\, U \le \frac{f(X)}{Mg(X)}\right) \\ &= \frac{\int_{-\infty}^y g(x) \, \frac{f(x)}{M g(x)} \, dx}{\int_{-\infty}^{\infty} g(x) \, \frac{f(x)}{M g(x)} \, dx} \\ &= \frac{\frac{1}{M}\int_{-\infty}^y f(x) \, dx}{\frac{1}{M}} = \int_{-\infty}^y f(x) \, dx, \end{align*} $$so $Y$ has density $f$.
(b) Let $f$ be a standard normal density and let $g(x) = 1 / (1 + x^2)$ be the Cauchy density. Apply the method in (a) to draw 1,000 observations from the normal distribution. Draw a histogram of the sample to verify that the sample appears to be normal.
from scipy.stats import cauchy, uniform
def f(x):
    # target density: standard normal
    return norm.pdf(x)
def g(x):
    # (unnormalized) Cauchy density; f(x) <= M * g(x) holds with M = 1 because
    # (1 + x^2) * norm.pdf(x) is maximized at x = +/- 1, where it equals 2 * norm.pdf(1) ~ 0.48
    return 1 / (1 + x**2)
np.random.seed(0)
M = 1
N = 1000
counter = 0
Ys = []
while counter < N:
    X = cauchy.rvs()  # draw a proposal from the Cauchy distribution
    U = uniform.rvs()
    if U <= f(X) / (M * g(X)):  # accept with probability f(X) / (M g(X))
        Ys.append(X)
        counter += 1
plt.hist(Ys, histtype="step", bins=20, label="samples", density=True)
xx = np.linspace(-3, 3)
plt.plot(xx, norm.pdf(xx), label="N(0,1)")
plt.legend()
plt.show()
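An aside (a sketch): since the code uses the unnormalized Cauchy density $1/(1 + x^2)$ rather than $1/(\pi(1 + x^2))$, the effective envelope constant relative to the true proposal density is $M\pi$, so we expect an acceptance rate of roughly $1/(M\pi) \approx 0.32$ per proposal. We can check this empirically by counting proposals:
n_accepted, n_proposals = 0, 0
while n_accepted < N:
    X_prop = cauchy.rvs()
    n_proposals += 1
    if uniform.rvs() <= f(X_prop) / (M * g(X_prop)):
        n_accepted += 1
print(f"acceptance rate: {n_accepted / n_proposals:.3f} (expected about {1 / (M * np.pi):.3f})")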
A random variable $Z$ has an inverse Gaussian distribution if it has density
$$f(z) \propto z^{-3/2} \exp \left\{-\theta_1 z - \frac{\theta_2}{z} + 2 \sqrt{\theta_1 \theta_2} + \log \left(\sqrt{2 \theta_2} \right) \right\}, \quad z > 0,$$where $\theta_1 > 0$ and $\theta_2 > 0$ are parameters. It can be shown that
$$\mathbb{E}(Z) = \sqrt{\frac{\theta_2}{\theta_1}} \text{ and } \mathbb{E}\left(\frac{1}{Z}\right) = \sqrt{\frac{\theta_1}{\theta_2}} + \frac{1}{2 \theta_2}.$$
(a) Let $\theta_1 = 1.5$ and $\theta_2 = 2$. Draw a sample of size 1,000 using the independence-Metropolis-Hastings method. Use a Gamma distribution as the proposal density. To assess the accuracy, compare the mean of $Z$ and $1 / Z$ from the sample to the theoretical means. Try different Gamma distributions to see if you can get an accurate sample.
Solution:
The independence-Metropolis-Hastings method takes the following steps. Fix a proposal density $g$ and draw an initial state $X_0 \sim g$; then, given the current state $X_i$:
(step 1): Draw a proposal $Y \sim g$ and $U \sim \text{Unif}(0, 1)$;
(step 2): Compute $r = \min\left\{ \frac{f(Y)}{f(X_i)} \frac{g(X_i)}{g(Y)},\ 1\right\}$ (any unknown normalization constant of $f$ cancels);
(step 3): If $U \le r$, set $X_{i+1} = Y$; otherwise set $X_{i+1} = X_i$.
Let $\mu = \mathbb{E}[Z]$ and $\nu = \mathbb{E}\left( \frac{1}{Z}\right)$.
from numpy.random import default_rng
from scipy.integrate import quad
def f(z):
    # unnormalized inverse Gaussian density
exponent = (
-theta_1 * z
- theta_2 / z
+ 2 * np.sqrt(theta_1 * theta_2)
+ np.log(np.sqrt(2 * theta_2))
)
return (z ** (-3 / 2)) * np.exp(exponent)
def g(z, a):
return gamma.pdf(z, a)
theta_1 = 1.5
theta_2 = 2
mu = np.sqrt(theta_2 / theta_1)
nu = np.sqrt(theta_1 / theta_2) + 1 / (2 * theta_2)
# get normalization constant numerically
C, _ = quad(f, a=1e-9, b=1e3)  # integrate over the support z > 0 (f vanishes as z -> 0+)
def independence_MCMC(func, n, a):
x = gamma.rvs(a=a, random_state=rng) # initialize x
cursor = 0
X = np.zeros(n)
while cursor < n:
y = gamma.rvs(a=a, random_state=rng) # get proposal
r = min(1, func(y) / func(x) * g(x, a=a) / g(y, a=a))
if uniform.rvs(random_state=rng) < r:
x = y
X[cursor] = x
cursor += 1
return X
def plot_results(values):
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
ax[0].plot(values, mu_hats, label="Estimated")
    ax[0].hlines(mu, min(values), max(values), color="orange", label="True")
ax[0].set_xlabel("a")
ax[0].legend()
ax[0].set_title("Mean")
ax[1].plot(values, nu_hats, label="Estimated")
ax[1].hlines(
nu,
min(values),
max(values),
color="orange",
label="True",
)
ax[1].set_xlabel("a")
ax[1].legend()
ax[1].set_title("Mean of Inverse")
plt.show()
n = 1000
a_values = np.linspace(0.3, 2, 21)
samples = np.zeros((len(a_values), n))
mu_hats = np.zeros_like(a_values)
nu_hats = np.zeros_like(a_values)
print(f"Actual mu: {mu:.3f} \t Actual nu: {nu:.3f}")
print("a\t\tmu_hat\t\tnu_hat\tmu_hat_error\tnu_hat_error\ttotal_abs_error")
for i, a in enumerate(a_values):
rng = default_rng(seed=0)
samples[i] = independence_MCMC(f, n, a)
mu_hats[i] = np.mean(samples[i])
nu_hats[i] = np.mean(np.power(np.array(samples[i]), -1))
print(
f"{a:.3e}\t{mu_hats[i]:.3f}\t\t{nu_hats[i]:.3f}\t{abs(mu_hats[i] - mu):.3f}\t\t{abs(nu_hats[i] - nu):.3f}"
f"\t\t{np.abs((mu_hats[i] - mu)) + np.abs((nu_hats[i] - nu)):.3f}"
)
plot_results(a_values)
Actual mu: 1.155    Actual nu: 1.116
a          mu_hat   nu_hat   mu_hat_error   nu_hat_error   total_abs_error
3.000e-01  1.085    1.261    0.069          0.145          0.215
3.850e-01  1.085    1.216    0.070          0.100          0.170
4.700e-01  1.084    1.204    0.070          0.088          0.158
5.550e-01  1.092    1.182    0.063          0.066          0.129
6.400e-01  1.119    1.191    0.036          0.075          0.111
7.250e-01  1.113    1.195    0.041          0.079          0.120
8.100e-01  1.119    1.200    0.036          0.084          0.120
8.950e-01  1.124    1.191    0.031          0.075          0.106
9.800e-01  1.155    1.158    0.000          0.042          0.043
1.065e+00  1.185    1.138    0.030          0.022          0.053
1.150e+00  1.205    1.136    0.050          0.020          0.070
1.235e+00  1.201    1.146    0.046          0.030          0.076
1.320e+00  1.210    1.133    0.055          0.017          0.072
1.405e+00  1.205    1.146    0.051          0.030          0.080
1.490e+00  1.187    1.134    0.033          0.018          0.051
1.575e+00  1.212    1.120    0.057          0.004          0.061
1.660e+00  1.220    1.114    0.065          0.002          0.068
1.745e+00  1.232    1.092    0.077          0.024          0.101
1.830e+00  1.244    1.084    0.089          0.032          0.121
1.915e+00  1.256    1.062    0.101          0.054          0.155
2.000e+00  1.261    1.067    0.106          0.049          0.155
The minimizer (for this run) of the total absolute error is $a=.98$. Plotting the resulting histogram of samples against $f$ (with the normalization constant computed numerically), we observe good agreement:
rng = default_rng(seed=0)
Z_ind = independence_MCMC(f, n, a=0.98)
mu_hat_ind = np.mean(Z_ind)
nu_hat_ind = np.mean(1 / Z_ind)
plt.hist(Z_ind, bins=50, density=True, label="Samples")
xx = np.linspace(1e-3, 5, 100)
plt.plot(xx, f(xx) / C, label="True density")
plt.legend()
plt.show()
(b) Draw a sample of size 1,000 using the random-walk-Metropolis-Hastings method. Since $z > 0$ we cannot just use a Normal density. One strategy is this. Let $W = \log Z$. Find the density of $W$. Use the random-walk-Metropolis-Hastings method to get a sample $W_1, \dots, W_N$ and let $Z_i = e ^ {W_i}$. Assess the accuracy of the simulation as in part (a).
Solution:
We first compute the distribution of $W = \log Z$. Let $h(x) = \log (x)$. Since $h$ is one-to-one, we may use the formula:
$$f_W(x) = f(h^{-1}(x)) \left| \frac{d}{dx} h^{-1}(x) \right|,$$yielding
$$f_W(x) = \exp(x) \cdot f(\exp(x)).$$The random-walk-Metropolis-Hastings method to sample from a density $f$ (known only up to a normalization constant) is the following: given the current state $X_i$,
(step 1): Draw $\epsilon \sim N(0, b^2)$ and $U \sim \text{Unif}(0, 1)$, and set the proposal $Y = X_i + \epsilon$;
(step 2): Compute $r = \min\left\{ \frac{f(Y)}{f(X_i)},\ 1 \right\}$;
(step 3): If $U \le r$, set $X_{i+1} = Y$; otherwise set $X_{i+1} = X_i$.
We shall perform random-walk-MCMC on $f_W$, yielding a sample of $W$, from which we can get a sample of $Z$.
def f_W(z):
return np.exp(z) * f(np.exp(z))
def random_walk_MCMC(func, n, b):
x = 0 # initialize x
cursor = 0
W = np.zeros(n)
while cursor < n:
eps = norm.rvs(scale=b, random_state=rng) # get proposal
y = x + eps
r = min(1, func(y) / func(x))
if uniform.rvs(random_state=rng) < r:
x = y
W[cursor] = x
cursor += 1
return W
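Before tuning $b$, a quick sanity check (a sketch): a change of variables preserves total mass, so integrating $f_W$ over the real line should reproduce the normalization constant $C$ computed earlier for $f$:
C_W, _ = quad(f_W, -20, 20)  # the integrand is negligible outside this range
print(f"C = {C:.6f}, integral of f_W = {C_W:.6f}")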
b_values = np.logspace(-3, 1, 20)
samples = np.zeros((len(b_values), n))
mu_hats = np.zeros_like(b_values)
nu_hats = np.zeros_like(b_values)
print(f"Actual mu: {mu:.3f} \t Actual nu: {nu:.3f}")
print("b\t\tmu_hat\t\tnu_hat\tmu_hat_error\tnu_hat_error\ttotal_abs_error")
for i, b in enumerate(b_values):
rng = default_rng(seed=0)
samples[i] = np.exp(random_walk_MCMC(f_W, n, b))
mu_hats[i] = np.mean(samples[i])
nu_hats[i] = np.mean(np.power(np.array(samples[i]), -1))
print(
f"{b:.3e}\t{mu_hats[i]:.3f}\t\t{nu_hats[i]:.3f}\t{abs(mu_hats[i] - mu):.3f}\t\t{abs(nu_hats[i] - nu):.3f}"
f"\t\t{np.abs((mu_hats[i] - mu)) + np.abs((nu_hats[i] - nu)):.3f}"
)
Actual mu: 1.155    Actual nu: 1.116
b          mu_hat   nu_hat   mu_hat_error   nu_hat_error   total_abs_error
1.000e-03  0.988    1.013    0.167          0.103          0.270
1.624e-03  0.980    1.021    0.175          0.095          0.270
2.637e-03  0.968    1.034    0.187          0.082          0.269
4.281e-03  0.949    1.056    0.206          0.060          0.266
6.952e-03  0.922    1.091    0.233          0.025          0.259
1.129e-02  0.884    1.146    0.270          0.030          0.300
1.833e-02  0.830    1.257    0.325          0.141          0.466
2.976e-02  0.964    1.175    0.191          0.059          0.249
4.833e-02  1.109    1.087    0.046          0.029          0.074
7.848e-02  1.217    1.042    0.062          0.074          0.137
1.274e-01  1.178    1.069    0.023          0.047          0.071
2.069e-01  1.136    1.094    0.019          0.022          0.041
3.360e-01  1.148    1.140    0.007          0.024          0.030
5.456e-01  1.147    1.116    0.008          0.000          0.008
8.859e-01  1.177    1.105    0.023          0.011          0.034
1.438e+00  1.174    1.130    0.020          0.014          0.034
2.336e+00  1.180    1.168    0.025          0.052          0.078
3.793e+00  1.157    1.175    0.002          0.059          0.060
6.158e+00  1.188    1.140    0.033          0.024          0.057
1.000e+01  1.247    1.144    0.093          0.028          0.120
Using the minimizing value of $b$:
rng = default_rng(seed=0)
Z_rw = np.exp(random_walk_MCMC(f_W, n, b=5.456e-01))
mu_hat_rw = np.mean(Z_rw)
nu_hat_rw = np.mean(1 / Z_rw)
plt.hist(Z_rw, bins=50, density=True, label="Samples")
xx = np.linspace(1e-3, 5, 100)
plt.plot(xx, f(xx) / C, label="True density")
plt.legend()
plt.show()
results = {
"mu": {
"actual": mu,
"Independence MCMC Estimate": mu_hat_ind,
"Random-Walk MCMC Estimate": mu_hat_rw,
},
"nu": {
"actual": nu,
"Independence MCMC Estimate": nu_hat_ind,
"Random-Walk MCMC Estimate": nu_hat_rw,
},
}
pd.DataFrame.from_dict(results, orient="index")
|    | actual   | Independence MCMC Estimate | Random-Walk MCMC Estimate |
|----|----------|----------------------------|---------------------------|
| mu | 1.154701 | 1.154551                   | 1.147062                  |
| nu | 1.116025 | 1.158405                   | 1.116451                  |
Using the best settings for each, Random-Walk MCMC appears to outperform Independence MCMC for this task.
Get the heart disease data from the book web site. Consider a Bayesian analysis of the logistic regression model
$$ \mathbb{P}(Y = 1 \mid X = x) = \frac{e ^ {\beta_0 + \sum_{j=1}^k \beta_j x_j}}{1 + e ^ {\beta_0 + \sum_{j=1}^k \beta_j x_j}}. $$Use the flat prior $f(\beta_0, \dots, \beta_k) \propto 1$. Use the Gibbs-Metropolis algorithm to draw a sample of size 10,000 from the posterior $f(\beta_0, \dots, \beta_k \mid \text{data})$. Plot histograms of the posteriors for the $\beta_j$'s. Get the posterior mean and a 95 percent posterior interval for each $\beta_j$.
Solution:
Letting $\beta = (\beta_0, \dots, \beta_k)^T$ and augmenting each covariate vector with a leading 1, so that $X_i = (1, X_{i1}, \dots, X_{ik})^T$, we may express the probability of a positive observation in vector notation:
$$\mathbb{P}(Y = 1 \mid X = X_i) = \frac{e ^ {\beta_0 + \sum_{j=1}^k \beta_j X_{ij}}}{1 + e ^ {\beta_0 + \sum_{j=1}^k \beta_j X_{ij}}} = \frac{e ^ {\beta^T X_i}}{1 + e ^{\beta^T X_i}}.$$Therefore, the likelihood of the logistic regression model can be expressed:
$$ \begin{align*} \mathcal{L}(\beta) &= \prod_{i=1}^n \mathbb{P}(Y = Y_i \mid X = X_i) \\ &= \prod_{i : Y_i = 1} \frac{e ^ {\beta^T X_i}}{1 + e ^{\beta^T X_i}} \prod_{i : Y_i = 0} \frac{1}{1 + e ^{\beta^T X_i}}\\ &= \prod_{i=1}^n \frac{e ^ {\beta^T X_i Y_i}}{1 + e ^{\beta^T X_i}}. \\ \end{align*} $$Since we are assuming a flat prior, the posterior is proportional to the likelihood:
$$ \begin{align*} f(\beta \mid \text{data}) &= \frac{f(\beta) \mathcal{L}(\beta)}{\int f(u) \mathcal{L}(u) \, du} \\ &\propto \mathcal{L}(\beta). \end{align*} $$Since we know how to evaluate a function proportional to $f(\beta \mid \text{data})$, we can use the Gibbs-Metropolis algorithm to draw $n$ samples from it. Let the matrix $B = \{B_i\}_{i=1}^n$ be the set of samples, where the row $B_i = \{B_{ij}\}_{j=0}^k$ is the $i^{\text{th}}$ sample from the posterior distribution of $\beta$.
To generate samples, we use the following procedure: starting from the zero vector, for each new sample we sweep over the coordinates $j = 0, \dots, k$. For coordinate $j$, we draw a proposal $Z \sim q_j(\cdot \mid B_{\text{current}, j})$ from a proposal density $q_j$, form the candidate vector $B_{\text{proposal}}$ by replacing the $j^{\text{th}}$ coordinate of the current vector $B_{\text{current}}$ with $Z$, and accept the candidate with probability $r$ (otherwise the coordinate is left unchanged), where in general
$$r = \min \left\{\frac{f(B_{\text{proposal}}) \, q_j(B_{\text{current}, j} \mid Z)}{f(B_{\text{current}}) \, q_j(Z \mid B_{\text{current}, j})} , 1\right\}.$$By using a symmetric proposal distribution, $q_j(y \mid x) = q_j(x \mid y)$, the expression for $r$ reduces to:
$$r = \min \left\{\frac{f(B_{\text{proposal}})}{f(B_{\text{current}})} , 1\right\}.$$We shall take $q_j(y \mid x)$ to be a $N(x, b^2_j)$ density, and tune $b^2_j$ so that proposals are accepted roughly 50% of the time (this is the rule of thumb suggested by the book).
Note: care must be taken to avoid overflow / underflow when computing:
$$ \begin{align*} \mathcal{L}(\beta) &= \prod_{i=1}^n \frac{e ^ {\beta^T X_i Y_i}}{1 + e ^{\beta^T X_i}}. \\ \end{align*} $$This can be done by first computing the log-likelihood:
$$ \begin{align*} \ell(\beta) &= \sum_{i=1}^n \left[ \beta^T X_i Y_i - \log \left(1 + \exp\{ \beta^T X_i\}\right) \right] \\ &= \sum_{i=1}^n \left[ \beta^T X_i Y_i - \max\{0, \beta^T X_i\} - \log \left(1 + \exp\{ - |\beta^T X_i| \}\right) \right], \\ \end{align*} $$where the second line uses the exact identity $\log(1 + e^a) = \max\{0, a\} + \log(1 + e^{-|a|})$, which avoids overflow; the likelihood itself is then recovered by exponentiating the result.
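For reference, here is a minimal sketch of this stable evaluation using np.logaddexp, which computes $\log(e^0 + e^a) = \log(1 + e^a)$ without overflow (it assumes the design matrix X and response vector Y constructed below; the implementation actually used below instead clips the predicted probabilities after applying expit):
def log_likelihood_stable(beta):
    # log L(beta) = sum_i [ (beta^T X_i) Y_i - log(1 + exp(beta^T X_i)) ]
    logits = X @ beta
    return np.sum(Y * logits - np.logaddexp(0, logits))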
data = pd.read_csv(
"data/coris.dat",
skiprows=3,
names="sbp tobacco ldl adiposity famhist typea obesity alcohol age chd".split(),
)
target = "chd"
features = data.columns.drop(target)
X, Y = data[features].values, data[target].values
# add a constant to design matrix
features = features.insert(0, "constant")
X = np.insert(X, 0, values=1, axis=1)  # prepend a column of ones for the intercept
n = int(1e4) # desired sample size
from scipy.special import expit
def log_likelihood(beta):
logits = X @ beta
y_pred = expit(logits)
y_true = Y
epsilon = 1e-15 # small value to avoid taking the logarithm of zero
y_pred = np.clip(
y_pred, epsilon, 1 - epsilon
) # clip values to avoid extreme probabilities
likelihood = np.sum(
y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
)
return likelihood
res = minimize(fun=lambda beta: -log_likelihood(beta), x0=np.zeros(10))
my_beta_star = res.x
import statsmodels.api as sm
model = sm.Logit(Y, X)
result = model.fit()
stats_models_beta_star = result.params
assert np.isclose(my_beta_star, stats_models_beta_star, atol=1e-4).all()
Optimization terminated successfully.
         Current function value: 0.510974
         Iterations 6
def gibbs_metropolis(n, sigmas):
k = len(features)
acceptance_rates = np.zeros(k)
B = np.zeros((n, k))
for i in range(n):
if i > 0:
B[i] = B[i - 1]
for j in range(k):
B_current = B[i].copy()
B_ij_current = B[i, j]
Z = norm.rvs(loc=B_ij_current, scale=sigmas[j])
B_proposal = B_current.copy()
B_proposal[j] = Z
            # compute the acceptance ratio in log space to avoid overflow / underflow
            log_r = log_likelihood(B_proposal) - log_likelihood(B_current)
            r = np.exp(min(log_r, 0.0))  # equals min(L(proposal) / L(current), 1)
if uniform.rvs() < r:
B[i, j] = Z
acceptance_rates[j] += 1
return B, acceptance_rates / n
# sigmas selected heuristically so that acceptance rates are ~50%.
sigmas = np.array(
    [0.23, 0.0015, 0.04, 0.05, 0.01, 0.3, 0.005, 0.01, 0.005, 0.005]
)
samples, acceptance_rates = gibbs_metropolis(n, sigmas)
pd.DataFrame.from_dict(
dict(zip(features, acceptance_rates)),
columns=["Acceptance Rates"],
orient="index",
)
|           | Acceptance Rates |
|-----------|------------------|
| constant  | 0.5036           |
| sbp       | 0.5256           |
| tobacco   | 0.4716           |
| ldl       | 0.4446           |
| adiposity | 0.4400           |
| famhist   | 0.5273           |
| typea     | 0.4536           |
| obesity   | 0.4500           |
| alcohol   | 0.6196           |
| age       | 0.4850           |
fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(20, 8))
for index, feature_name in enumerate(features):
ax[index // 5][index % 5].hist(samples[:, index], histtype="step", bins=30)
ax[index // 5][index % 5].set_title(feature_name)
plt.show()
posterior_mean, lb, ub = (
samples.mean(axis=0),
np.quantile(samples, 0.025, axis=0),
np.quantile(samples, 0.975, axis=0),
)
posterior = pd.DataFrame(
{"mean": posterior_mean, "95% LB": lb, "95% UB": ub},
index=features,
)
plt.figure(figsize=(10, 5))
plt.bar(posterior.index, posterior["mean"], label="Posterior Mean")
plt.vlines(
posterior.index,
ymin=posterior["95% LB"],
ymax=posterior["95% UB"],
color="black",
label="95% Posterior Interval",
)
plt.grid()
plt.legend()
plt.show()
(b) Compare your analysis to a frequentist approach using maximum likelihood.
Solution: The coefficients that maximize the likelihood are similar to the posterior means.
res = minimize(fun=lambda beta: -log_likelihood(beta), x0=np.zeros(10))
frequentist_beta = res.x
plt.figure(figsize=(10, 5))
plt.bar(posterior.index, frequentist_beta)
plt.title("Maximum Likelihood Estimates")
plt.grid()
plt.show()
bayesian_beta = posterior_mean
k = len(features)
for j in range(0, k):
plt.plot(
[frequentist_beta[j], bayesian_beta[j]],
[0, 1],
color="black",
marker="o",
)
plt.yticks([0, 1], ["Frequentist", "Bayesian"])
plt.grid()
plt.show()
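For a more direct numeric comparison (a sketch), we can also tabulate the two sets of coefficients side by side:
comparison = pd.DataFrame(
    {"Maximum Likelihood": frequentist_beta, "Posterior Mean": bayesian_beta},
    index=features,
)
print(comparison.round(4))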