In [1]:
import matplotlib.pyplot as plt
import numpy as np
from numpy.random import default_rng
import pandas as pd
from scipy.integrate import quad
from scipy.stats import gamma, norm, uniform
from scipy.optimize import minimize

np.set_printoptions(formatter={"float": "{: 0.3f}".format})
np.random.seed(0)

1¶

Let

$$I = \int_1^2 \frac{e^{-x^2 / 2}}{\sqrt{2 \pi}} dx .$$

(a) Estimate $I$ using the basic Monte Carlo method. Use $N = 100,000$. Also, find the estimated standard error.

Solution:

Letting $f(x) = e^{-x^2 / 2} / \sqrt{2 \pi}$ and $h(x) = \mathbb{1}_{x \in [1, 2]}$, we have

$$I = \int h(x) f(x)\, dx,$$

and we can estimate $I$ via

$$\hat{I} = \frac{1}{N} \sum_{i=1}^N Y_i = \frac{1}{N} \sum_{i=1}^N h(X_i),$$

where $X_1, \dots X_n$ are drawn from a standard normal distribution. Note that $\hat{I}$ is the sample mean of $Y$. An estimated standard error of the sample mean is given by

$$\hat{\textsf{se}} = \frac{s}{\sqrt{N}}$$

where

$$s^2 = \frac{\sum_{i=1}^N (Y_i - \hat{I})^2}{N - 1}$$

is the sample variance.

In [2]:
def h(x):
    return np.where(a <= x, np.where(x <= b, 1, 0), 0)


N = 100_000
a = 1
b = 2
X = norm.rvs(loc=0, scale=1, size=N)
Y = h(X)
I_hat = np.mean(Y)
s2 = (np.linalg.norm(Y - I_hat) ** 2) / (N - 1)  # sample variance
se = np.sqrt(s2) / np.sqrt(N)  # estimated standard error
print(f"I_hat : {I_hat:.5f}, estimated standard error: {se:.3e}")
print(f"Approximate 95% C.I.: ({I_hat - 2 * se:.3e}, {I_hat + 2 * se:.3e})")
I_hat : 0.13457, estimated standard error: 1.079e-03
Approximate 95% C.I.: (1.324e-01, 1.367e-01)

(b) Find an (analytical) expression for the standard error of your estimate in (a). Compare to the estimated standard error.

Solution:

The true standard error (what $\hat{\textsf{se}}$ estimates) is the standard deviation of the sample mean of $Y$, namely, $\hat{I}$. Since $Y_i = \mathbb{1}_{X_i \in [1,2]}$ are independent, it is given by

$$\textsf{se} = \frac{\sigma_{Y}}{\sqrt{N}},$$

where $\sigma_Y$ is the standard deviation of $Y$. Observe that $\mathbb{E}[Y_i] = I = \Phi(2) - \Phi(1)$, and that since $Y_i \in [0, 1]$, $Y_i^2 = Y_i$. Therefore,

$$ \begin{align*} \sigma_Y^2 = \mathbb{V}[Y_i] &= \mathbb{E}[Y_i^2] - \mathbb{E}[Y_i]^2 = I(1 - I), \\ \end{align*} $$

and thus

$$\textsf{se} = \sqrt{\frac{I(1 - I)}{N}}.$$
In [3]:
I = norm.cdf(2) - norm.cdf(1)
se_true = np.sqrt(I * (1 - I) / (N - 1))
print(f"True standard error: {se_true:.3e}")
print(f"Estimated standard error: {se:.3e}")
print(f"Difference: {se_true - se:.3e}")
True standard error: 1.084e-03
Estimated standard error: 1.079e-03
Difference: 4.503e-06

(c) Estimate $I$ using importance sampling. Take $g$ to be $N(1.5, v^2)$ with $v = .1$, $v = 1$ and $v = 10$. Compute the (true) standard errors in each case. Also, plot a histogram of the values you are averaging to see if there are any extreme values.

In [4]:
def f(x):
    return norm.pdf(x, loc=0, scale=1)


def g(x):
    return norm.pdf(x, loc=1.5, scale=v**2)


fig, ax = plt.subplots(figsize=(14, 4), nrows=1, ncols=3)

estimates = {}
for i, v in enumerate([0.1, 1, 10]):
    X = norm.rvs(loc=1.5, scale=v**2, size=N)
    Y = (h(X) * f(X)) / g(X)
    estimates[v] = np.mean(Y)
    ax[i].hist(Y, bins=100)
    ax[i].set_title(f"$v$ = {v}")
print(estimates)
fig.suptitle("Histograms for $Y_i$", fontsize=14)
plt.show()
{0.1: 0.013070056030923984, 1: 0.13611405986928374, 10: 0.131887993546821}

The true standard error can once again be computed via

$$\textsf{se} = \frac{\sigma_{Y}}{\sqrt{N}}.$$

However, now

$$Y = \frac{h(X)f(X)}{g(X)}, X \sim g = N(1.5, v^2)$$

To recover the formula for an arbitrary mean, let $m = \mathbb{E}[X]$ (in the problem, $m = 1.5$). The variance of $Y$ is given:

$$ \begin{align*} \sigma_{Y}^2 &= \mathbb{E}[Y^2] - \mathbb{E}[Y]^2 \\ &= \int \frac{h^2(x)f^2(x)}{g^2(x)} g(x) \, dx - \left( \int h(x) f(x) \, dx\right)^2 \\ &= \int_1^2 \frac{f^2(x)}{g(x)} \, dx - \left( \int_1^2 f(x) \, dx\right)^2 \\ &= \int_1^2 \frac{e^{-x^2}}{2 \pi} \frac{\sqrt{2 \pi v^2}}{e^{-(x - m)^2 / 2v^2}} \, dx - I^2 \\ &= \frac{v}{\sqrt{2 \pi}} \int_1^2 \exp\left\{-x^2 + \frac{1}{2} \left(\frac{x - m}{v}\right)^2\right\} \, dx - I^2 .\\ \end{align*} $$

It is possible to simplify the integral term. However, we can compute it numerically, yielding the standard errors:

In [5]:
m = 1.5


def f(x):
    return (v / np.sqrt(2 * np.pi)) * np.exp(
        -(x**2) + ((x - m) ** 2) / (2 * (v**2))
    )


vs = [0.1, 1, 10]
ses = {}
for v in vs:
    true_value, _ = quad(f, 1, 2)
    sigma_y_2 = true_value - I**2
    sigma_y_2 = sigma_y_2
    ses[v] = sigma_y_2 / np.sqrt(N)
pd.DataFrame.from_dict({"Standard Error": ses}, orient="index").T
Out[5]:
Standard Error
0.1 0.263522
1.0 0.000121
10.0 0.001649

We can search for the minimizer:

In [6]:
vv = np.logspace(-1, 1)
sigma_y_2s = {}
ses = {}
for i, v in enumerate(vv):

    def f(x):
        return (v / np.sqrt(2 * np.pi)) * np.exp(
            -(x**2) + ((x - m) ** 2) / (2 * (v**2))
        )

    true_value, _ = quad(f, 1, 2)
    sigma_y_2 = true_value - I**2
    sigma_y_2s[v] = sigma_y_2
    ses[v] = sigma_y_2 / np.sqrt(N)

plt.plot(ses.keys(), ses.values())
plt.xscale("log")
plt.yscale("log")
plt.xlabel("$v$")
plt.ylabel("Standard Error")
plt.grid()
plt.show()
In [7]:
optimal_v = list(ses.keys())[np.array(list(ses.values())).argmin()]
print(f"Optimal v: {optimal_v:.3e}, Standard Error: {ses[optimal_v]:.3e}")
Optimal v: 3.393e-01, Standard Error: 3.548e-05

(d) Find the optimal importance sampling function $g^*$. What is the standard error using $g^*$?

Solution:

By Theorem 24.5, the optimal choice of $g$ is

$$g^*(x) = \frac{|h(x)|f(x)}{\int |h(s)| f(s) \, ds}$$

In our case, $f(x) = e^{-x^2 / 2} / \sqrt{2 \pi}$ and $h(x) = \mathbb{1}_{x \in [1, 2]}$, so

$$ \begin{align*} g^*(x) &= \begin{cases} \frac{e^{-x^2 / 2}}{\int_1^2 e^{-s^2 / 2} \, ds} = \frac{\phi(x)}{\Phi(2) - \Phi(1)} & 1 \le x \le 2 \\ 0 & \text{otherwise.} \end{cases} \end{align*} $$

In this case, the variance vanishes:

$$ \begin{align*} \sigma_{Y}^2 &= \int_1^2 \frac{f^2(x)}{g^*(x)} \, dx - \left( \int_1^2 f(x) \, dx\right)^2 \\ &= \int_1^2 \frac{\phi^2(x) \left( \Phi(2) - \Phi(1)\right)}{\phi(x)} - (\Phi(2) - \Phi(1))^2 \\ &= \left( \Phi(2) - \Phi(1)\right) \int_1^2 \phi(x) \, dx - (\Phi(2) - \Phi(1))^2 = 0 \end{align*} $$

2¶

Here is a way to use importance sampling to estimate a marginal density. Let $f_{X,Y}(x, y)$ be a bivariate density and let $(X_1, X_2), \dots, (X_N, Y_N) \sim f_{X, Y}$.

(a) Let $w(x)$ be an arbitrary probability density function. Let

$$\hat{f}_X(x) = \frac{1}{N} \sum_{i=1}^N \frac{f_{X,Y}(x, Y_i) w(X_i)}{f_{X,Y}(X_i, Y_i)}$$

Show that, for each $x$,

$$\hat{f}_X(x) \xrightarrow{P} f_X(x).$$

Find an expression for the variance of this estimator.

Solution:

If $(X_i, Y_i)$ are i.i.d. according to a density $f_{X,Y}(x,y)$, the law of large numbers states that for any function $g$:

$$\frac{1}{N} \sum_{i=1}^N g(X_i, Y_i) \xrightarrow{P} \int g(x,y) f_{X,Y}(x,y) \, dx dy$$

Thus,

$$ \begin{align*} \hat{f}_X(x) &= \frac{1}{N} \sum_{i=1}^N \frac{f_{X,Y}(x, Y_i) w(X_i)}{f_{X,Y}(X_i, Y_i)} \\ &\xrightarrow{P} \int \int \left[\frac{f_{X,Y}(x, y) w(x')}{f_{X,Y}(x', y)} \right] f_{X,Y}(x', y) \, dx' dy\\ & \int f_{X,Y}(x, y) \left[ \int w(x') \, dx' \right] dy \\ &= f_{X}(x). \end{align*} $$

We now compute the variance of the summand via $\mathbb{V}(u) = \mathbb{E}(u^2) - \mathbb{E}(u)^2$:

We have

$$ \begin{align*} \mathbb{E}\left[\hat{f}_X(x) \right] &= \frac{1}{N} \sum_{i=1}^N \int \int \frac{f_{X, Y}(x, y) w(x')}{f_{X,Y}(x', y)} f_{X,Y}(x', y) dy dx' \\ &= \frac{1}{N} \sum_{i=1}^N f_{X}(x) \int w(x') dx' = f_X(x).\\ \end{align*} $$

and

$$ \begin{align*} \mathbb{E}\left[\hat{f}_X^2(x) \right] &= \frac{1}{N} \sum_{i=1}^N \int \int \frac{f_{X, Y}^2(x, y) w^2(x')}{f_{X,Y}^2(x', y)} f_{X,Y}(x', y) dy dx' \\ &= \frac{1}{N} \sum_{i=1}^N \int \int \frac{f_{X, Y}^2(x, y) w^2(x')}{f_{X,Y}(x', y)} dy dx' \\ &= \int \int \frac{f_{X, Y}^2(x, y) w^2(x')}{f_{X,Y}(x', y)} dy dx' \end{align*} $$

So the variance is:

$$\mathbb{V}\left[ \hat{f}_X(x) \right] = \int \int \frac{f_{X, Y}^2(x, y) w^2(x')}{f_{X,Y}(x', y)} dy dx' - f_X^2(x).$$

(b)

Let $Y \sim N(0, 1)$ and $X \mid Y = y \sim N(y, 1 + y^2)$. Use the method in (a) to estimate $f_X(x)$.

Solution:

The joint pdf is given:

$$ \begin{align*} f_{X,Y}(x,y) &= f_{X \mid Y}(x, y) f_{Y}(y) \\ &= \frac{1}{\sqrt{2 \pi (1 + y^2)}} \exp \left\{- \frac{1}{2} \frac{(x - y) ^2}{(1 + y^2)} \right\} \frac{1}{\sqrt{2 \pi}} \exp \left\{ -y^2 / 2 \right\} \\ &= \frac{1}{2 \pi \sqrt{1 + y^2}} \exp \left\{-\frac{1}{2} \left[ \frac{(x - y) ^2}{(1 + y^2)} + y^2\right] \right\} \end{align*} $$

We can generate samples of $Y$, and from those, $X$. We shall set $w$ to be a normal distribution with location 5 and scale 10.

In [8]:
N = 100_000

Y = norm.rvs(loc=0, scale=1, size=N)
X = norm.rvs(loc=Y, scale=np.sqrt(1 + Y**2))


def f(x, y):
    coef = 1 / (2 * np.pi * np.sqrt(1 + y**2))
    exp = -(1 / 2) * ((x - y) ** 2 / (1 + y**2) + y**2)
    return coef * np.exp(exp)


def w(x):
    return norm.pdf(x, loc=5, scale=10)


xx = np.arange(-10, 10, step=0.05)
f_X = np.zeros_like(xx)
for i, x in enumerate(xx):
    f_X[i] = np.nanmean(f(x, Y) * w(X) / f(X, Y))
plt.figure(figsize=(8, 4))
plt.plot(xx, f_X)
plt.xlabel("x")
plt.ylabel("Estimate of f_X")
plt.grid()
plt.show()

3¶

Here is a method called accept-reject sampling for drawing observations from a distribution.

(a) Suppose that $f$ is some probability density function. Let $g$ be any other density and suppose that $f(x) \le Mg(x)$ for all $x$, where $M$ is a known constant. Consider the following algorithm:

(step 1): Draw $X \sim g$ and $U \sim \text{Unif}(0, 1)$;

(step 2): If $U \le f(X) / (M g(X))$ set $Y = X$, otherwise go back to step 1. (Keep repeating until you finally get an observation.)

Show that the distribution of $Y$ is $f$.

Solution:

For any $y$,

$$ \begin{align*} \mathbb{P}(Y = y) &= \mathbb{P}\left(X = y, U \le \frac{f(X)}{Mg(X)}\right) \\ &= g(y) \frac{f(y)}{M g(y)} \\ &= \frac{f(y)}{M}, \end{align*} $$

and thus $f_Y \propto f$. But since $f_Y$ is a PDF, $f_Y = f$.

(b) Let $f$ be a standard normal density and let $g(x) = 1 / (1 + x^2)$ be the Cauchy density. Apply the method in (a) to draw 1,000 observations from the normal distribution. Draw a histogram of the sample to verify that the sample appears to be normal.

In [9]:
from scipy.stats import cauchy, uniform


def f(x):
    return norm.pdf(x)


def g(x):
    return 1 / (1 + x**2)


np.random.seed(0)
M = 1
N = 1000
counter = 0
Ys = []
while counter < N:
    X = cauchy.rvs()
    U = uniform.rvs()
    if U <= f(X) / (M * g(X)):
        Ys.append(X)
    counter += 1
plt.hist(Ys, histtype="step", bins=20, label="samples", density=True)
xx = np.linspace(-3, 3)
plt.plot(xx, norm.pdf(xx), label="N(0,1)")
plt.legend()
plt.show()

4¶

A random variable $Z$ has a inverse Gaussian distribution if it has density

$$f(z) \propto z^{-3/2} \exp \left\{-\theta_1 z - \frac{\theta_2}{2} + 2 \sqrt{\theta_1 \theta_2} + \log \left(\sqrt{2 \theta_2} \right) \right\}, \quad z > 0$$

where $\theta_1 > 0$ and $\theta_2 > 0$ are parameters. It can be shown that

$$\mathbb{E}(Z) = \sqrt{\frac{\theta_2}{\theta_1}} \text{ and } \mathbb{E}\left(\frac{1}{Z}\right) = \sqrt{\frac{\theta_1}{\theta_2}} + \frac{1}{2 \theta_2}.$$

(a) Let $\theta_1 = 1.5$ and $\theta_2 = 2$. Draw a sample of size 1,000 using the independence-Metropolis-Hastings method. Use a Gamma distribution as the proposal density. To assess the accuracy, compare the mean of $Z$ and $1 / Z$ from the sample to the theoretical means. Try different Gamma distributions to see if you can get an accurate sample.

Solution:

The independence-Metropolis-Hastings method takes the following steps:

  • Draw a proposal from a fixed distribution $g$. $g$ is an approximation to $f$.
  • Accept with probability
$$r(x, y) = \min \left\{1, \frac{f(y)}{f(x)} \frac{g(x)}{g(y)}\right\}.$$

Let $\mu = \mathbb{E}[Z]$ and $\nu = \mathbb{E}\left( \frac{1}{Z}\right)$.

In [10]:
from numpy.random import default_rng
from scipy.integrate import quad


def f(z):
    exponent = (
        -theta_1 * z
        - theta_2 / z
        + 2 * np.sqrt(theta_1 * theta_2)
        + np.log(np.sqrt(2 * theta_2))
    )
    return (z ** (-3 / 2)) * np.exp(exponent)


def g(z, a):
    return gamma.pdf(z, a)


theta_1 = 1.5
theta_2 = 2
mu = np.sqrt(theta_2 / theta_1)
nu = np.sqrt(theta_1 / theta_2) + 1 / (2 * theta_2)
# get normalization constant numerically
C, _ = quad(f, a=-1e-6, b=1e3)
In [11]:
def independence_MCMC(func, n, a):
    x = gamma.rvs(a=a, random_state=rng)  # initialize x
    cursor = 0
    X = np.zeros(n)
    while cursor < n:
        y = gamma.rvs(a=a, random_state=rng)  # get proposal
        r = min(1, func(y) / func(x) * g(x, a=a) / g(y, a=a))
        if uniform.rvs(random_state=rng) < r:
            x = y
            X[cursor] = x
            cursor += 1
    return X


def plot_results(values):
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
    ax[0].plot(values, mu_hats, label="Estimated")
    ax[0].hlines(mu, min(values), max(a_values), color="orange", label="True")
    ax[0].set_xlabel("a")
    ax[0].legend()
    ax[0].set_title("Mean")

    ax[1].plot(values, nu_hats, label="Estimated")
    ax[1].hlines(
        nu,
        min(values),
        max(values),
        color="orange",
        label="True",
    )
    ax[1].set_xlabel("a")
    ax[1].legend()
    ax[1].set_title("Mean of Inverse")
    plt.show()


n = 1000
a_values = np.linspace(0.3, 2, 21)
samples = np.zeros((len(a_values), n))
mu_hats = np.zeros_like(a_values)
nu_hats = np.zeros_like(a_values)
print(f"Actual mu: {mu:.3f} \t Actual nu: {nu:.3f}")
print("a\t\tmu_hat\t\tnu_hat\tmu_hat_error\tnu_hat_error\ttotal_abs_error")
for i, a in enumerate(a_values):
    rng = default_rng(seed=0)
    samples[i] = independence_MCMC(f, n, a)

    mu_hats[i] = np.mean(samples[i])
    nu_hats[i] = np.mean(np.power(np.array(samples[i]), -1))
    print(
        f"{a:.3e}\t{mu_hats[i]:.3f}\t\t{nu_hats[i]:.3f}\t{abs(mu_hats[i] - mu):.3f}\t\t{abs(nu_hats[i] - nu):.3f}"
        f"\t\t{np.abs((mu_hats[i] - mu)) + np.abs((nu_hats[i] - nu)):.3f}"
    )
plot_results(a_values)
Actual mu: 1.155 	 Actual nu: 1.116
a		mu_hat		nu_hat	mu_hat_error	nu_hat_error	total_abs_error
3.000e-01	1.085		1.261	0.069		0.145		0.215
3.850e-01	1.085		1.216	0.070		0.100		0.170
4.700e-01	1.084		1.204	0.070		0.088		0.158
5.550e-01	1.092		1.182	0.063		0.066		0.129
6.400e-01	1.119		1.191	0.036		0.075		0.111
7.250e-01	1.113		1.195	0.041		0.079		0.120
8.100e-01	1.119		1.200	0.036		0.084		0.120
8.950e-01	1.124		1.191	0.031		0.075		0.106
9.800e-01	1.155		1.158	0.000		0.042		0.043
1.065e+00	1.185		1.138	0.030		0.022		0.053
1.150e+00	1.205		1.136	0.050		0.020		0.070
1.235e+00	1.201		1.146	0.046		0.030		0.076
1.320e+00	1.210		1.133	0.055		0.017		0.072
1.405e+00	1.205		1.146	0.051		0.030		0.080
1.490e+00	1.187		1.134	0.033		0.018		0.051
1.575e+00	1.212		1.120	0.057		0.004		0.061
1.660e+00	1.220		1.114	0.065		0.002		0.068
1.745e+00	1.232		1.092	0.077		0.024		0.101
1.830e+00	1.244		1.084	0.089		0.032		0.121
1.915e+00	1.256		1.062	0.101		0.054		0.155
2.000e+00	1.261		1.067	0.106		0.049		0.155

The minimizer (for this run) of the total absolute error is $a=.98$. Plotting the resulting histogram of samples against $f$ (with the normalization constant computed numerically), we observe good agreement:

In [12]:
rng = default_rng(seed=0)
Z_ind = independence_MCMC(f, n, a=0.98)
mu_hat_ind = np.mean(Z_ind)
nu_hat_ind = np.mean(1 / Z_ind)
plt.hist(Z_ind, bins=50, density=True, label="Samples")
xx = np.linspace(1e-3, 5, 100)
plt.plot(xx, f(xx) / C, label="True density")
plt.legend()
plt.show()

(b) Draw a sample of size 1,000 using the random-walk-Metropolis-Hastings method. Since $z > 0$ we cannot just use a Normal density. One strategy is this. Let $W = \log Z$. Find the density of $W$. Use this random-walk-Metropolis-Hastings method to get a sample $W_1, \dots, W_N$ and let $Z_i = e ^ {W_i}$. Assess the accuracy of the simulation in part (a).

Solution:

We first compute the distribution of $W = \log Z$. Let $h(x) = \log (x)$. Since $h$ is one-to-one, we may use the formula:

$$f_W(x) = f(h^{-1}(x)) \left| \frac{d}{dx} h^{-1}(x) \right|,$$

yielding

$$f_W(x) = \exp(x) \cdot f(\exp(x)).$$

The random-walk-Metropolis-Hasting method to sample from a distribution $f$ (with unknown normalization constant) is the following:

  1. Initialize an empty collection of samples. Select $x_0$ arbitrarily. Set $x = x_0$.
  2. Sample $\epsilon \sim N(0, b^2)$, and let $y = x + \epsilon$.
  3. Let $r = \min \{1, f(y) / f(x) \}$.
  4. With probability $r$, set $x = y$, and add $x$ to the collection of samples.
  5. Repeat steps (2) to (4) until a desired number of samples is collected.

We shall perform random-walk-MCMC on $f_W$, yielding a sample of $W$, from which we can get a sample of $Z$.

In [13]:
def f_W(z):
    return np.exp(z) * f(np.exp(z))


def random_walk_MCMC(func, n, b):
    x = 0  # initialize x
    cursor = 0
    W = np.zeros(n)
    while cursor < n:
        eps = norm.rvs(scale=b, random_state=rng)  # get proposal
        y = x + eps
        r = min(1, func(y) / func(x))
        if uniform.rvs(random_state=rng) < r:
            x = y
            W[cursor] = x
            cursor += 1
    return W


b_values = np.logspace(-3, 1, 20)
samples = np.zeros((len(b_values), n))
mu_hats = np.zeros_like(b_values)
nu_hats = np.zeros_like(b_values)
print(f"Actual mu: {mu:.3f} \t Actual nu: {nu:.3f}")

print("b\t\tmu_hat\t\tnu_hat\tmu_hat_error\tnu_hat_error\ttotal_abs_error")
for i, b in enumerate(b_values):
    rng = default_rng(seed=0)
    samples[i] = np.exp(random_walk_MCMC(f_W, n, b))
    mu_hats[i] = np.mean(samples[i])
    nu_hats[i] = np.mean(np.power(np.array(samples[i]), -1))
    print(
        f"{b:.3e}\t{mu_hats[i]:.3f}\t\t{nu_hats[i]:.3f}\t{abs(mu_hats[i] - mu):.3f}\t\t{abs(nu_hats[i] - nu):.3f}"
        f"\t\t{np.abs((mu_hats[i] - mu)) + np.abs((nu_hats[i] - nu)):.3f}"
    )
Actual mu: 1.155 	 Actual nu: 1.116
b		mu_hat		nu_hat	mu_hat_error	nu_hat_error	total_abs_error
1.000e-03	0.988		1.013	0.167		0.103		0.270
1.624e-03	0.980		1.021	0.175		0.095		0.270
2.637e-03	0.968		1.034	0.187		0.082		0.269
4.281e-03	0.949		1.056	0.206		0.060		0.266
6.952e-03	0.922		1.091	0.233		0.025		0.259
1.129e-02	0.884		1.146	0.270		0.030		0.300
1.833e-02	0.830		1.257	0.325		0.141		0.466
2.976e-02	0.964		1.175	0.191		0.059		0.249
4.833e-02	1.109		1.087	0.046		0.029		0.074
7.848e-02	1.217		1.042	0.062		0.074		0.137
1.274e-01	1.178		1.069	0.023		0.047		0.071
2.069e-01	1.136		1.094	0.019		0.022		0.041
3.360e-01	1.148		1.140	0.007		0.024		0.030
5.456e-01	1.147		1.116	0.008		0.000		0.008
8.859e-01	1.177		1.105	0.023		0.011		0.034
1.438e+00	1.174		1.130	0.020		0.014		0.034
2.336e+00	1.180		1.168	0.025		0.052		0.078
3.793e+00	1.157		1.175	0.002		0.059		0.060
6.158e+00	1.188		1.140	0.033		0.024		0.057
1.000e+01	1.247		1.144	0.093		0.028		0.120

Using the minimizing value of $b$:

In [14]:
rng = default_rng(seed=0)
Z_rw = np.exp(random_walk_MCMC(f_W, n, b=5.456e-01))
mu_hat_rw = np.mean(Z_rw)
nu_hat_rw = np.mean(1 / Z_rw)

plt.hist(Z_rw, bins=50, density=True, label="Samples")
xx = np.linspace(1e-3, 5, 100)
plt.plot(xx, f(xx) / C, label="True density")
plt.legend()
plt.show()
In [15]:
results = {
    "mu": {
        "actual": mu,
        "Independence MCMC Estimate": mu_hat_ind,
        "Random-Walk MCMC Estimate": mu_hat_rw,
    },
    "nu": {
        "actual": nu,
        "Independence MCMC Estimate": nu_hat_ind,
        "Random-Walk MCMC Estimate": nu_hat_rw,
    },
}
pd.DataFrame.from_dict(results, orient="index")
Out[15]:
actual Independence MCMC Estimate Random-Walk MCMC Estimate
mu 1.154701 1.154551 1.147062
nu 1.116025 1.158405 1.116451

Using the best settings for each, Random-Walk MCMC appears to outperform Independence MCMC for this task.

5¶

Get the heart disease data from the book web site. Consider a Bayesian analysis of the logistic regression model

$$ \mathbb{P}(Y = 1 \mid X = x) = \frac{e ^ {\beta_0 + \sum_{j=1}^k \beta_j x_j}}{1 + e ^ {\beta_0 + \sum_{j=1}^k \beta_j x_j}}. $$

Use the flat prior $f(\beta_0, \dots, \beta_k) \propto 1$. Use the Gibbs-Metropolis algorithm to draw a sample of size 10,000 from the posterior $f(\beta, \beta_1 \mid \text{data})$. Plot histograms for the posterior for the $\beta_j$'s. Get the posterior mean and a 95 percent posterior interval for each $\beta_j$.

Solution:

Letting $\beta = (\beta_0, \dots, \beta_k)^T$, we may express the probability of a positive observation in vector notation:

$$\mathbb{P}(Y = 1 \mid X = X_i) = \frac{e ^ {\beta_0 + \sum_{j=1}^k \beta_j X_{ij}}}{1 + e ^ {\beta_0 + \sum_{j=1}^k \beta_j X_{ij}}} = \frac{e ^ {\beta^T X_i}}{1 + e ^{\beta^T X_i}}.$$

Therefore, the likelihood of the logistic regression model can be expressed:

$$ \begin{align*} \mathcal{L}(\beta) &= \prod_{i=1}^n \mathbb{P}(Y = Y_i \mid X = X_i) \\ &= \prod_{i=1 : Y_i = 1} \frac{e ^ {\beta^T X_i}}{1 + e ^{\beta^T X_i}} \prod_{i=1 : Y_i = 0} \frac{1}{1 + e ^{\beta^T X_i}}\\ &= \prod_{i=1}^n \frac{e ^ {\beta^T X_i Y_i}}{1 + e ^{\beta^T X_i}}. \\ \end{align*} $$

Since we are assuming a flat prior, the posterior is proportional to the likelihood:

$$ \begin{align*} f(\beta \mid \text{data}) &= \frac{f(\beta) \mathcal{L}(\beta)}{\int f(u) \mathcal{L}(u)} \\ &\propto \mathcal{L}(\beta). \end{align*} $$

Since we know how to evaluate a function proportional to $f(\beta \mid \text{data})$, we can use the Gibbs-Metropolis algorithm to draw $n$ samples from it. Let the matrix $B = \{B_i\}_{i=1}^n$ be the set of samples, where the row $B_i = \{B_{ij}\}_{j=0}^k$ is the $i^{\text{th}}$ sample from the posterior distribution of $\beta$.

To generate samples, we use the following procedure:

  • Choose an arbitrary initial sample $B_0$ (we shall use $B_0 = \mathbf{0}$).
  • For each $i \in \{1, \dots, n\}$, we construct the sample $B_i$:
    • For each $j \in \{0, \dots, k\}$, we set $B_{ij}$ using the following procedure:
      • Draw a proposal value $Z$ from the proposal distribution $q_j(z \mid B_{ij})$.
      • Letting: $$B_{\text{current}} = (B_{i,0}, \dots, B_{i,(j-1)}, B_{ij}, B_{(i-1),(j+1)}, \dots, B_{(i-1), k})$$ $$B_{\text{proposal}} = (B_{i,0}, \dots, B_{i,(j-1)}, Z, B_{(i-1),(j+1)}, \dots, B_{(i-1), k})$$ evaluate $$r = \min \left\{\frac{f(B_{\text{proposal}})}{f(B_{\text{current}})} \frac{q_j(Z \mid B_{ij})}{q_j(B_{ij} \mid Z)}, 1\right\},$$ and set $$ B_{i,j} = \begin{cases} Z & \text{with probability $r$} \\ B_{(i-1), j} & \text{with probability $1 - r$}. \end{cases} $$

By using a symmetric proposal distribution, the expression for $r$ reduces to:

$$r = \min \left\{\frac{f(B_{\text{proposal}})}{f(B_{\text{current}})} , 1\right\}.$$

We shall take $q_j(y \mid x)$ to be a $N(x, b^2_j)$, and tune $b^2_j$ so that proposals are accepted ~50% of the time (this is the rule of thumb suggested by the book).

Note: care must be taken to avoid overflow / underflow when computing:

$$ \begin{align*} \mathcal{L}(\beta) &= \prod_{i=1}^n \frac{e ^ {\beta^T X_i Y_i}}{1 + e ^{\beta^T X_i}}. \\ \end{align*} $$

This can be done by first computing the log-likelihood:

$$ \begin{align*} \mathcal{l}(\beta) &= \sum_{i=1}^n \beta^T X_i Y_i - \log \left(1 + \exp\{ \beta^T X_i\}\right) \\ &\approx \beta^T X_i Y_i - \left[ \max\{0, \beta^T X_i\} + \log \left(1 + \exp\{ - \text{abs}(\beta^T X_i) \}\right) \right], \\ \end{align*} $$

where the absolute value function is applied element-wise, and then exponentiating the result.

In [16]:
data = pd.read_csv(
    "data/coris.dat",
    skiprows=3,
    names="sbp tobacco ldl adiposity famhist typea obesity alcohol age chd".split(),
)
target = "chd"
features = data.columns.drop(target)
X, Y = data[features].values, data[target].values

# add a constant to design matrix
features = features.insert(0, "constant")
X = np.insert(X, 0, values=np.ones_like(X.shape[0]), axis=1)

n = int(1e4)  # desired sample size
In [17]:
from scipy.special import expit


def log_likelihood(beta):
    logits = X @ beta
    y_pred = expit(logits)
    y_true = Y
    epsilon = 1e-15  # small value to avoid taking the logarithm of zero

    y_pred = np.clip(
        y_pred, epsilon, 1 - epsilon
    )  # clip values to avoid extreme probabilities

    likelihood = np.sum(
        y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
    )

    return likelihood


res = minimize(fun=lambda beta: -log_likelihood(beta), x0=np.zeros(10))
my_beta_star = res.x
import statsmodels.api as sm

model = sm.Logit(Y, X)
result = model.fit()
stats_models_beta_star = result.params
assert np.isclose(my_beta_star, stats_models_beta_star, atol=1e-4).all()
Optimization terminated successfully.
         Current function value: 0.510974
         Iterations 6
In [18]:
def gibbs_metropolis(n, sigmas):
    k = len(features)
    acceptance_rates = np.zeros(k)
    B = np.zeros((n, k))
    for i in range(n):
        if i > 0:
            B[i] = B[i - 1]
        for j in range(k):
            B_current = B[i].copy()
            B_ij_current = B[i, j]
            Z = norm.rvs(loc=B_ij_current, scale=sigmas[j])
            B_proposal = B_current.copy()
            B_proposal[j] = Z

            r = min(
                np.exp(log_likelihood(B_proposal))
                / np.exp(log_likelihood(B_current)),
                1,
            )
            if uniform.rvs() < r:
                B[i, j] = Z
                acceptance_rates[j] += 1
    return B, acceptance_rates / n


# sigmas selected heuristically so that acceptance rates are ~50%.
sigmas = np.zeros(10)
sigmas[0] = 0.23
sigmas[1] = 0.0015
sigmas[2] = 0.04
sigmas[3] = 0.05
sigmas[4] = 0.01
sigmas[5] = 0.3
sigmas[6] = 0.005
sigmas[7] = 0.01
sigmas[8] = 0.005
sigmas[9] = 0.005

samples, acceptance_rates = gibbs_metropolis(n, sigmas)
In [19]:
pd.DataFrame.from_dict(
    dict(zip(features, acceptance_rates)),
    columns=["Acceptance Rates"],
    orient="index",
)
Out[19]:
Acceptance Rates
constant 0.5036
sbp 0.5256
tobacco 0.4716
ldl 0.4446
adiposity 0.4400
famhist 0.5273
typea 0.4536
obesity 0.4500
alcohol 0.6196
age 0.4850
In [20]:
fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(20, 8))
for index, feature_name in enumerate(features):
    ax[index // 5][index % 5].hist(samples[:, index], histtype="step", bins=30)
    ax[index // 5][index % 5].set_title(feature_name)
plt.show()
In [21]:
posterior_mean, lb, ub = (
    samples.mean(axis=0),
    np.quantile(samples, 0.025, axis=0),
    np.quantile(samples, 0.975, axis=0),
)
posterior = pd.DataFrame(
    {"mean": posterior_mean, "95% LB": lb, "95% UB": ub},
    index=features,
)
plt.figure(figsize=(10, 5))
plt.bar(posterior.index, posterior["mean"], label="Posterior Mean")
plt.vlines(
    posterior.index,
    ymin=posterior["95% LB"],
    ymax=posterior["95% UB"],
    color="black",
    label="95% Posterior Interval",
)
plt.grid()
plt.legend()
plt.show()

(b) Compare your analysis to a frequentist approach using maximum likelihood.

Solution: The coefficients that maximize the likelihood are similar to the posterior means.

In [22]:
res = minimize(fun=lambda beta: -log_likelihood(beta), x0=np.zeros(10))
frequentist_beta = res.x

plt.figure(figsize=(10, 5))
plt.bar(posterior.index, frequentist_beta)
plt.title("Maximum Likelihood Estimates")
plt.grid()
plt.show()
In [23]:
bayesian_beta = posterior_mean
k = len(features)
for j in range(0, k):
    plt.plot(
        [frequentist_beta[j], bayesian_beta[j]],
        [0, 1],
        color="black",
        marker="o",
    )
plt.yticks([0, 1], ["Frequentist", "Bayesian"])
plt.grid()
plt.show()