In [1]:
import math

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import norm, chi2, beta as Beta

plt.style.use("default.mplstyle")

1¶

Prove Theorem 21.5:

The risk of $\hat{f}$ is

$$R(J) = \sum_{j=1}^J \frac{\sigma_j^2}{n} + \sum_{j=J+1}^{\infty} \beta_j^2.$$

Solution:

Recall the bias-variance decomposition of the mean integrated squared error (MISE), a.k.a. the risk, into integrated variance plus integrated squared bias:

\begin{align*} R(J) &= \int \mathbb{V}(\hat{f}(x)) + \left(b(x)\right)^2 \, dx. \end{align*}

After computing the bias:

\begin{align*} b(x) &= f(x) - \mathbb{E}[\hat{f}(x)] \\ &= \sum_{j=1}^{\infty}\beta_j \phi_j(x) - \sum_{j=1}^J\mathbb{E}[\hat{\beta}_j]\phi_j(x) \\ &= \sum_{j=J+1}^{\infty}\beta_j \phi_j(x) \tag{$\hat{\beta}_j$ is unbiased}\\ \end{align*}

then the variance:

\begin{align*} \mathbb{V}[\hat{f}(x)] &= \mathbb{V}\left[ \sum_{j=1}^J \hat{\beta}_j \phi_j(x)\right] \\ &= \sum_{j=1}^J \phi_j^2(x) \mathbb{V}[\hat{\beta}_j] + \sum_{j=1}^J \sum_{k=1, k \ne j}^J \text{Cov}[\hat{\beta}_j, \hat{\beta}_k] \phi_j(x) \phi_k(x) \\ &= \sum_{j=1}^J \phi_j^2(x) \frac{\sigma_j^2}{n} + \sum_{j=1}^J \sum_{k=1, k \ne j}^J \text{Cov}[\hat{\beta}_j, \hat{\beta}_k] \phi_j(x) \phi_k(x) \\ \end{align*}

we may express the risk

\begin{align*} R(J) &= \int \left[\sum_{j=1}^J \phi_j^2(x) \frac{\sigma_j^2}{n} + \sum_{j=1}^J \sum_{k=1, k \ne j}^J \text{Cov}[\hat{\beta}_j, \hat{\beta}_k] \phi_j(x) \phi_k(x) + \sum_{j=J+1}^{\infty}\beta_j^2 \phi_j^2(x) + \sum_{j=J+1}^{\infty} \sum_{k=J+1, k\ne j}^{\infty} \beta_j \beta_{k} \phi_j(x) \phi_k(x) \right] \, dx \\ &= \sum_{j=1}^J \frac{\sigma_j^2}{n} \int \phi_j^2(x) \, dx + \sum_{j=1}^J \sum_{k=1, k \ne j}^J \text{Cov}[\hat{\beta}_j, \hat{\beta}_k] \int \phi_j(x) \phi_k(x) \, dx + \sum_{j=J+1}^{\infty} \beta_j^2 \int \phi_j^2(x) \, dx + \sum_{j=J+1}^{\infty} \sum_{k=J+1, k\ne j}^{\infty} \beta_j \beta_{k} \int \phi_j(x) \phi_k(x) \, dx \\ &= \sum_{j=1}^J \frac{\sigma_j^2}{n} + \sum_{j=J+1}^{\infty} \beta_j^2. \tag{orthonormality of basis functions} \end{align*}
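
As a quick numerical sanity check of this identity (a minimal sketch, not part of the exercise: the Beta(2, 2) density, the cosine basis, and the values of $n$, $J$, and the number of replications are arbitrary choices), we can compare a Monte Carlo estimate of the MISE against $R(J)$, using the `Beta` distribution imported above:

In [ ]:
rng = np.random.default_rng(0)

def phi(x, j):
    # cosine basis on [0, 1]
    return np.ones_like(x) if j == 0 else np.sqrt(2) * np.cos(j * np.pi * x)

grid = (np.arange(1, 200_001) - 0.5) / 200_000   # midpoint grid for the integrals
f_true = Beta.pdf(grid, 2, 2)                    # f(x) = 6x(1 - x)

n, J, n_reps = 200, 5, 500
beta_true = np.array([np.mean(f_true * phi(grid, j)) for j in range(400)])
sigma2 = np.array([np.mean(f_true * phi(grid, j) ** 2) - beta_true[j] ** 2 for j in range(J)])
theory = np.sum(sigma2) / n + np.sum(beta_true[J:] ** 2)   # R(J), tail truncated at 400 terms

mise = 0.0
for _ in range(n_reps):
    X_sim = rng.beta(2, 2, size=n)
    f_hat = sum(np.mean(phi(X_sim, j)) * phi(grid, j) for j in range(J))
    mise += np.mean((f_hat - f_true) ** 2) / n_reps
print(f"empirical MISE ~ {mise:.4f}, theoretical R(J) ~ {theory:.4f}")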

2¶

Prove Theorem 21.9:

The risk $R(J)$ of the estimator $\hat{r}_n(x) = \sum_{j=1}^J \hat{\beta}_j \phi_j(x)$ is

$$R(J) = \frac{J\sigma^2}{n} + \sum_{j=J+1}^{\infty} \beta_j^2$$

Solution: This follows along the same lines as the previous question. From the bias-variance decomposition:

\begin{align*} R(J) &= \int \mathbb{V}[\hat{r}(x)] + b(x)^2 \, dx \end{align*}

where the bias is

\begin{align*} b(x) &= r(x) - \mathbb{E}[\hat{r}(x)] \\ &= \sum_{j=1}^{\infty} \beta_j \phi_j(x) - \sum_{j=1}^J \mathbb{E}[\hat{\beta}_j]\phi_j(x) \\ &= \sum_{j=J+1}^{\infty} \beta_j \phi_j(x) \tag{$\hat{\beta}_j$ is unbiased} \end{align*}

and the variance is

\begin{align*} \mathbb{V}[\hat{r}(x)] &= \mathbb{V}\left[\sum_{j=1}^J \hat{\beta}_j \phi_j(x) \right] \\ &= \sum_{j=1}^J \mathbb{V}[\hat{\beta}_j]\phi_j^2(x) + \sum_{j=1}^J \sum_{k=1, k \ne j}^J \phi_j(x) \phi_k(x) \text{Cov}[\hat{\beta}_j, \hat{\beta}_k]. \end{align*}

Thus,

\begin{align*} R(J) &= \int \left[ \sum_{j=1}^J \mathbb{V}[\hat{\beta}_j]\phi_j^2(x) + \sum_{j=1}^J \sum_{k=1, k \ne j}^J \phi_j(x) \phi_k(x) \text{Cov}[\hat{\beta}_j, \hat{\beta}_k] + \sum_{j=J+1}^{\infty} \beta_j^2 \phi_j^2(x) + \sum_{j=J+1}^{\infty} \sum_{k=J+1, k \ne j}^{\infty} \beta_j \beta_k \phi_j(x) \phi_k(x) \right] \, dx \\ &= \sum_{j=1}^J \mathbb{V}[\hat{\beta}_j]\int \phi_j^2(x) \, dx + \sum_{j=1}^J \sum_{k=1, k \ne j}^J \text{Cov}[\hat{\beta}_j, \hat{\beta}_k] \int \phi_j(x) \phi_k(x) \, dx + \sum_{j=J+1}^{\infty} \beta_j^2 \int \phi_j^2(x) \, dx + \sum_{j=J+1}^{\infty} \sum_{k=J+1, k \ne j}^{\infty} \beta_j \beta_k \int \phi_j(x) \phi_k(x) \, dx \\ &= \frac{J\sigma^2}{n} + \sum_{j=J+1}^{\infty} \beta_j^2 \tag{orthonormality; $\mathbb{V}[\hat{\beta}_j] = \sigma^2 / n$} \end{align*}
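
The same kind of Monte Carlo check works here (a sketch; the regression function, design, noise level $\sigma$, and cutoff $J$ below are arbitrary choices, not from the text). On a regular midpoint grid the cosine basis is essentially orthonormal with respect to the empirical measure, so the empirical MISE should match $R(J)$ closely:

In [ ]:
rng = np.random.default_rng(1)

def phi(x, j):
    # cosine basis on [0, 1]
    return np.ones_like(x) if j == 0 else np.sqrt(2) * np.cos(j * np.pi * x)

n, sigma, n_reps = 1024, 0.3, 200
x_grid = (np.arange(1, n + 1) - 0.5) / n        # regularly spaced design points
r_true = phi(x_grid, 1) + 0.5 * phi(x_grid, 4)  # beta_1 = 1, beta_4 = 0.5, all others 0

J = 3                                           # keep basis functions j = 0, 1, 2
mise = 0.0
for _ in range(n_reps):
    Y_sim = r_true + sigma * rng.standard_normal(n)
    r_hat = sum(np.mean(Y_sim * phi(x_grid, j)) * phi(x_grid, j) for j in range(J))
    mise += np.mean((r_hat - r_true) ** 2) / n_reps

theory = J * sigma ** 2 / n + 0.5 ** 2          # the only excluded nonzero coefficient is beta_4
print(f"empirical MISE ~ {mise:.4f}, theoretical R(J) ~ {theory:.4f}")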

3¶

Let

$$\psi_1 = \left(\frac{1}{\sqrt{3}}, \frac{1}{\sqrt{3}}, \frac{1}{\sqrt{3}} \right), \psi_2 = \left(\frac{1}{\sqrt{2}}, -\frac{1}{\sqrt{2}}, 0 \right), \psi_3 = \left(\frac{1}{\sqrt{6}}, \frac{1}{\sqrt{6}}, -\frac{2}{\sqrt{6}} \right).$$

Show that these vectors have norm 1 and are orthogonal.

Solution:

Computing norms:

\begin{align*} ||\psi_1|| &= \left( \left(\frac{1}{\sqrt{3}} \right)^2 + \left(\frac{1}{\sqrt{3}} \right)^2 + \left(\frac{1}{\sqrt{3}} \right)^2 \right)^{1/2} = \left( \frac13 + \frac13 + \frac13 \right)^{1/2} = 1 \\ ||\psi_2|| &= \left( \left(\frac{1}{\sqrt{2}} \right)^2 + \left(-\frac{1}{\sqrt{2}} \right)^2 + 0^2 \right)^{1/2} = \left( \frac12 + \frac12 + 0\right)^{1/2} = 1 \\ ||\psi_3|| &= \left( \left(\frac{1}{\sqrt{6}} \right)^2 + \left(\frac{1}{\sqrt{6}} \right)^2 + \left(-\frac{2}{\sqrt{6}} \right)^2 \right)^{1/2} = \left( \frac16 + \frac16 + \frac23 \right)^{1/2} = 1 \\ \end{align*}

Taking inner products (noting that, by symmetry of the inner product, we need only check three such products):

\begin{align*} \langle \psi_1, \psi_2 \rangle = \frac{1}{\sqrt{3}}\frac{1}{\sqrt{2}} + \frac{1}{\sqrt{3}}\left(-\frac{1}{\sqrt{2}}\right) + \frac{1}{\sqrt{3}}(0) = 0 \\ \langle \psi_1, \psi_3 \rangle = \frac{1}{\sqrt{3}}\frac{1}{\sqrt{6}} + \frac{1}{\sqrt{3}}\frac{1}{\sqrt{6}} + \frac{1}{\sqrt{3}}\left(-\frac{2}{\sqrt{6}} \right) = 0 \\ \langle \psi_2, \psi_3 \rangle = \frac{1}{\sqrt{2}}\frac{1}{\sqrt{6}} + \left(-\frac{1}{\sqrt{2}}\right)\frac{1}{\sqrt{6}} + 0\left(-\frac{2}{\sqrt{6}} \right) = 0 \\ \end{align*}
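
A quick numerical confirmation (a small sketch using numpy): the Gram matrix of the three vectors should be the $3 \times 3$ identity.

In [ ]:
psi = np.array([np.array([1, 1, 1]) / np.sqrt(3),
                np.array([1, -1, 0]) / np.sqrt(2),
                np.array([1, 1, -2]) / np.sqrt(6)])
print(np.round(psi @ psi.T, 12))  # entry (i, j) is the inner product of psi_i and psi_j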

4¶

Prove Parseval's relation equation (21.6):

$$||f||^2 \equiv \int f^2(x) \, dx = \sum_{j=1}^{\infty} \beta_j^2 \equiv ||\beta||^2$$

Solution:

\begin{align*} \int f^2(x) \, dx &= \int \left[ \sum_{j=1}^{\infty} \beta_j^2 \phi_j^2(x) + \sum_{j=1}^{\infty}\sum_{k=1, k \ne j}^{\infty} \beta_j \beta_k \phi_j(x) \phi_k(x) \right] \, dx \\ &= \sum_{j=1}^{\infty} \beta_j^2 \int \phi_j^2(x) \, dx + \sum_{j=1}^{\infty} \sum_{k=1, k \ne j}^{\infty} \beta_j \beta_k \int \phi_j(x) \phi_k(x) \, dx \tag{$f \in L_2$, plus Fubini's theorem} \\ &= \sum_{j=1}^{\infty} \beta_j^2 \tag{orthonormality of basis functions} \end{align*}
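
As a numerical illustration (a sketch; the choice $f(x) = \sin(\pi x)$, the cosine basis on $[0, 1]$, and the truncation of the sum at 200 terms are arbitrary), the Riemann-sum value of $\int_0^1 f^2(x) \, dx = 1/2$ should match $\sum_j \beta_j^2$:

In [ ]:
N_check = 200_000
r = (np.arange(1, N_check + 1) - 0.5) / N_check      # midpoint grid on [0, 1]
f_vals = np.sin(np.pi * r)
total = 0.0
for j in range(200):
    phi_j = np.ones(N_check) if j == 0 else np.sqrt(2) * np.cos(j * np.pi * r)
    total += np.mean(f_vals * phi_j) ** 2            # beta_j^2 via a Riemann sum
print(f"integral of f^2 ~ {np.mean(f_vals ** 2):.6f}, sum of beta_j^2 ~ {total:.6f}")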

5¶

Plot the first five Legendre polynomials. Verify, numerically, that they are orthonormal.

Solution:

The first few are given in the text:

  • $P_0(x) = 1$
  • $P_1(x) = x$
  • $P_2(x) = \frac{1}{2} \left(3x^2 - 1\right)$
  • $P_3(x) = \frac{1}{2} \left(5x^3 - 3x\right)$

We compute the next polynomial, $P_4$, using the recursion relation $P_{j+1}(x) = \frac{(2j+1)xP_j(x) - jP_{j-1}(x)}{j+1}$ with $j=3$:

\begin{align*} P_4(x) &= \frac{(2 \cdot 3 + 1)xP_3(x) - 3P_2(x)}{4} \\ &= \frac{7x \left[\frac{1}{2}\left(5x^3 - 3x\right)\right] - 3\left[\frac{1}{2}\left(3x^2 - 1\right)\right]}{4} \\ &=\frac{1}{8}\left(35x^4 - 30x^2 + 3\right) \end{align*}
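
As a cross-check (a small sketch, assuming numpy's `numpy.polynomial.legendre` module is available), `leg2poly` converts Legendre-series coefficients to power-series coefficients and should reproduce the expression for $P_4$ above:

In [ ]:
from numpy.polynomial import legendre as npleg
# The Legendre series with a single unit coefficient in position 4 is P_4;
# expect power-series coefficients [3/8, 0, -30/8, 0, 35/8].
print(npleg.leg2poly([0, 0, 0, 0, 1]))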

Plotting the functions (after normalization):

In [2]:
def get_polynomial(x, coefficients):
    yy = np.zeros(len(x))
    for order, value in enumerate(coefficients):
        yy += value * np.power(x, order)
    return yy

legendre_coefficients = {"P_0": [1],
                         "P_1": [0, 1],
                         "P_2": [-1/2, 0, 3/2],
                         "P_3": [0, -3/2, 0, 5/2],
                         "P_4": [3/8, 0, -30/8, 0, 35/8]}

fig = plt.figure(figsize=(10, 6))
x = np.linspace(-1, 1, 100)
for j, (key, coefs) in enumerate(legendre_coefficients.items()):
    normalized_coefs = np.array(coefs) * math.sqrt((2 * j + 1) / 2) # normalize
    y = get_polynomial(x, normalized_coefs) 
    plt.plot(x, y, label=f"${key}(x)$")
plt.legend()
plt.show()

Verifying orthogonality numerically (the norms equal 1 by construction, thanks to the normalization applied above, so it remains to check that distinct polynomials are orthogonal):

In [3]:
N = int(1e6)
x = np.linspace(-1, 1, N + 1)
for j_a, (key_a, value_a) in enumerate(legendre_coefficients.items()):
    value_a = np.array(value_a) * math.sqrt((2 * j_a + 1) / 2) # normalization
    for j_b, (key_b, value_b) in enumerate(legendre_coefficients.items()):
        if key_b > key_a:
            value_b = np.array(value_b) * math.sqrt((2 * j_b + 1) / 2) # normalization
            y_a = get_polynomial(x, value_a)
            y_b = get_polynomial(x, value_b)
            print(f"<{key_a},{key_b}> =  {(2 / (N)) * np.sum(y_a * y_b):.3f}")
<P_0,P_1> =  -0.000
<P_0,P_2> =  0.000
<P_0,P_3> =  -0.000
<P_0,P_4> =  0.000
<P_1,P_2> =  -0.000
<P_1,P_3> =  0.000
<P_1,P_4> =  -0.000
<P_2,P_3> =  -0.000
<P_2,P_4> =  0.000
<P_3,P_4> =  -0.000

6¶

Expand the following functions in the cosine basis on [0, 1]. For (a) and (b), find the coefficients $\beta_j$ analytically. For (c) and (d), find the coefficients $\beta_j$ numerically, i.e.

$$\beta_j = \int_0^1 f(x) \phi_j(x) \, dx \approx \frac{1}{N} \sum_{r=1}^N f \left( \frac{r}{N} \right) \phi_j \left(\frac{r}{N}\right)$$

for some large integer $N$. Then plot the partial sum $\sum_{j=1}^n \beta_j \phi_j(x)$ for increasing values of $n$.

(a) $f(x) = \sqrt{2} \cos(3\pi x)$
(b) $f(x) = \sin(\pi x)$
(c) $f(x) = \sum_{j=1}^{11} h_j K(x-t_j)$ where $K(t) = (1 + \text{sign}(t))/2$, $\text{sign}(x) = -1$ if $x < 0$, $\text{sign}(x) = 0$ if $x = 0$, and $\text{sign}(x) = 1$ if $x > 0$, $$(t_j) = (.1, .13, .15, .23, .25, .40, .44, .65, .76, .78, .81),$$ $$(h_j) = (4, -5, 3, -4, 5, -4.2, 2.1, 4.3, -3.1, 2.1, -4.2).$$
(d) $f(x) = \sqrt{x(1-x)}\sin\left(\frac{2.1 \pi}{(x + 0.05)}\right)$.

Solution:

(a) Observe $f(x) = \phi_3(x)$. Thus, the coefficients are

$$ \beta_j = \begin{cases} 1 & j = 3 \\ 0 & \text{otherwise.} \\ \end{cases} $$

(b)

\begin{align*} \beta_j &= \int_a^b f(x) \phi_j(x) \, dx \\ &= \sqrt{2} \int_0^1 \sin(\pi x) \cos(j \pi x) \, dx \\ \end{align*}

Integrating by parts: \begin{align*} I = \int_0^1 \cos(j \pi x) \sin(\pi x) \, dx &= \left.\left[-\frac{\cos(j \pi x) \cos(\pi x)}{\pi}\right]\right\rvert_0^1 - j \int_0^1 \cos(\pi x) \sin(j \pi x) \, dx \\ &= \frac{\cos(j \pi) + 1}{\pi} - jI_2, \end{align*}

where $I_2 = \int_0^1 \cos(\pi x) \sin(j \pi x) \, dx$. Applying integration by parts to $I_2$, we get:

\begin{align*} I_2 = \int_0^1 \cos(\pi x) \sin(j \pi x) \, dx &= \left.\left[\frac{\sin(j \pi x) \sin(\pi x)}{\pi} \right]\right\rvert_0^1 - j \int_0^1 \cos(j \pi x) \sin(\pi x) \, dx \\ &= 0 - jI. \end{align*}

Therefore, $I = \frac{\cos(j\pi) + 1}{\pi} + j^2 I$, so $(1 - j^2)I = \frac{\cos(j\pi) + 1}{\pi}$ and, for $j \ge 2$,

$$\beta_j = \sqrt{2} \int_0^1 \cos(j \pi x) \sin(\pi x) \, dx = \frac{\sqrt{2}(\cos(j \pi) + 1)}{\pi(1-j^2)}.$$

For the remaining indices, $\beta_0 = \int_0^1 \sin(\pi x) \, dx = 2/\pi$ (recall $\phi_0 \equiv 1$) and $\beta_1 = \sqrt{2}\int_0^1 \cos(\pi x)\sin(\pi x) \, dx = 0$; these special cases are handled explicitly in `beta_function_b` below.
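
Before coding the estimators, a quick check of this closed form against the Riemann-sum approximation of $\int_0^1 \sin(\pi x) \phi_j(x) \, dx$ (a sketch; the grid size is an arbitrary choice):

In [ ]:
N_check = 100_000
r = np.arange(1, N_check + 1) / N_check
for j in range(2, 7):
    numeric = np.mean(np.sin(np.pi * r) * np.sqrt(2) * np.cos(j * np.pi * r))
    closed_form = np.sqrt(2) * (np.cos(np.pi * j) + 1) / (np.pi * (1 - j ** 2))
    print(f"j={j}: numeric = {numeric:+.6f}, closed form = {closed_form:+.6f}")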
In [4]:
def f_a(x):
    return np.sqrt(2) * np.cos(3 * np.pi * x)

def f_b(x):
    return np.sin(np.pi * x)
    
t = np.array([.1, .13, .15, .23, .25, .40, .44, .65, .76, .78, .81])
h = np.array([4, -5, 3, -4, 5, -4.2, 2.1, 4.3, -3.1, 2.1, -4.2])

def f_c(x):
    # Since sum_j h_j = 0, the constant 1/2 in K(t) = (1 + sign(t))/2 contributes nothing,
    # so only the sign(x - t_j)/2 terms are needed.
    return np.sum((np.sign(x - np.repeat(np.reshape(t, [-1, 1]), len(x), axis=1)) / 2).T * h, axis=1)

def f_d(x):
    return np.sqrt(x * (1 - x)) * np.sin(2.1 * np.pi / (x + 0.05))

def beta_function_a(j):
    return 1 if j == 3 else 0

def beta_function_b(j):
    if j == 0:
        return 2 / np.pi
    elif j == 1:
        return 0
    else:
        return (math.sqrt(2) * (np.cos(np.pi * j) + 1)) / (np.pi * (1 - j ** 2))
In [5]:
def cosine_phi(x, j):
    """cosine basis function on [0,1]"""
    if j == 0:
        return np.ones(len(x))
    else:
        return math.sqrt(2) * np.cos(j * np.pi * x)
In [6]:
class orthogonal_function_estimator():
    def __init__(self, f, phi, beta_function=None):
        self.f = f
        self.phi = phi
        self.N = 10000 # size of mesh (also reused by the plotting helpers below)
        n = 1028 # number of coefficients to compute

        self.beta = {}
        if beta_function is None:
            inputs = np.linspace(0, 1, self.N)
            for j in range(n):
                self.beta[j] = (1 / self.N) * np.sum(f(inputs) * self.phi(inputs, j))
        else:
            for j in range(n):
                self.beta[j] = beta_function(j)


    def plot_f(self):
        xx = np.linspace(0, 1, self.N)
        plt.plot(xx, self.f(xx), label="$f(x)$")

    def plot_phi(self, k):
        xx = np.linspace(0, 1, self.N)
        plt.plot(xx, self.phi(xx, k), label="$\phi_k(x)$")

    def partial_sum(self, x, J):
        y = np.zeros(len(x))
        for j in range(0, J):
            y = y + (self.beta[j] * self.phi(x, j))
        return y
    
    def plot(self):
        _, ax = plt.subplots(nrows=1, ncols=4, figsize=(20,4))
        xx = np.linspace(0, 1, self.N)
        ax[0].plot(xx, self.f(xx))
        ax[0].set_title("$f(x)$")
        y_min = min(np.array(self.f(xx))) * 1.5
        y_max = max(np.array(self.f(xx))) * 1.5
        ax[0].set_ylim(y_min, y_max)
        for index, J in enumerate([16, 64, 512]):
            ax[index + 1].plot(xx, self.partial_sum(xx, J))
            ax[index + 1].set_title("$\hat{f}$" + f"$[{J}](x)$")
            ax[index + 1].set_ylim(y_min, y_max)
        plt.show()
In [7]:
ofe = orthogonal_function_estimator(f_a, cosine_phi, beta_function_a)
ofe.plot()
In [8]:
ofe = orthogonal_function_estimator(f_b, cosine_phi, beta_function_b)
ofe.plot()
In [9]:
ofe = orthogonal_function_estimator(f_c, cosine_phi)
ofe.plot()
In [10]:
ofe = orthogonal_function_estimator(f_d, cosine_phi)
ofe.plot()

7¶

Consider the glass fragments data from the book's website. Let $Y$ be refractive index and let $X$ be aluminum content (the fourth variable).

(a) Do a nonparametric regression to fit the model $Y = f(x) + \epsilon$ using the cosine basis method. The data are not on a regular grid. Ignore this when estimating the function. (But do sort the data first according to $x$.) Provide a function estimate, an estimate of the risk, and a confidence band.

(b) Use the wavelet method to estimate $f$.

In [11]:
data = pd.read_csv("data/glass.dat", sep=r"\s+", usecols=["RI", "Al"])
Y = data["RI"].values
X = data["Al"].values
sorted_indices = X.argsort()
X = X[sorted_indices]
Y = Y[sorted_indices]
X = (X - min(X)) / (max(X) - min(X))
In [12]:
class orthogonal_function_regressor():
    
    def __init__(self, x, y, phi):
        self.x = x
        self.y = y
        self.phi = phi
        self.n = 1000
        self.beta = {}
        for j in range(self.n):
            self.beta[j] = np.mean(y * self.phi(x, j))

        self.k = int(self.n / 4)
        self.sigma_2_hat = (self.n / self.k) * np.sum(np.power(np.array([self.beta[j] for j in range(self.n-self.k+1, self.n)]), 2))

    def risk_estimate(self, J):
        R = (J * self.sigma_2_hat / self.n)
        betas = np.array([self.beta[j] for j in range(J+1, self.n)])
        terms =  np.power(betas, 2) - self.sigma_2_hat / self.n
        positive_terms = np.where(terms > 0, terms, 0)
        R += np.sum(positive_terms)
        return R

    def partial_sum(self, x, J, confidence_band=False):
        
        y = np.zeros(len(x))
        for j in range(0, J):
            y = y + (self.beta[j] * self.phi(x, j))

        if confidence_band:
            a = np.sqrt(np.sum([np.power(self.phi(x, j), 2) for j in range(J)], axis=0))
            c = (a * np.sqrt(self.sigma_2_hat) * chi2.ppf(0.05, df=J)) / np.sqrt(self.n)
            l = y - c
            u = y + c
            return y, l, u
        else:
            return y
        
ofr = orthogonal_function_regressor(X, Y, cosine_phi)

Js = np.arange(20)
plt.plot(Js, [ofr.risk_estimate(j) for j in Js], linewidth=1)
plt.xticks(Js)
plt.xlabel("$J$")
plt.ylabel("Estimated Risk")
plt.show()

xx = np.linspace(min(X), max(X), 1000)
plt.scatter(X, Y, alpha=0.25)
y, l, u = ofr.partial_sum(xx, 6, confidence_band=True)
plt.plot(xx, y, label="Orthogonal Function Regressor (J=6)")
plt.plot(xx, l, color='orange')
plt.plot(xx, u, color='orange', label="95% Confidence Band")
plt.xlabel("Aluminum Content")
plt.ylabel("Refractive Index")
plt.legend()
plt.show()

print(f"Estimated Risk: {ofr.risk_estimate(6):.3f}")
Estimated Risk: 25.075

Using the wavelet method:

In [13]:
def haar_phi(x):
    return (np.where(((0 <= x) & (x < 1)), 1, 0)).astype(np.float64)

def haar_psi(x):
    return np.where((x >= 0) & (x < 1), np.where((x < 1/2), -1, 1), 0)

def haar_psi_jk(x, j, k):
    return (2 ** (j / 2)) * haar_psi(((2 ** j) * x) - k)

n = len(X)
alpha_hat = (haar_phi(X) @ Y) / n
J = int(np.ceil(np.log2(n)))
D = {}
for j in range(J):
    for k in range(0, int(2 ** j)):
        D[j,k] = (haar_psi_jk(X, j, k) @ Y) / n

# universal thresholding
D_values = np.array([D[J - 1, k] for k in range(0, int(2 ** (J - 1)))])
sigma_hat = np.median(np.abs(D_values)) / 0.6745
for j in range(J):
    for k in range(0, int(2 ** j)):
        if np.abs(D[j,k]) <= sigma_hat * np.sqrt(2 * np.log(n) / n):
            D[j,k] = 0

x = np.linspace(0, 1, 1000)
partial_sum = alpha_hat * haar_phi(x)
for j in range(J):
    for k in range(0, int(2 ** j)):
        partial_sum += D[j,k] * haar_psi_jk(x, j, k)

plt.scatter(X,Y, alpha=0.5)
plt.plot(x, partial_sum)
plt.show()

8¶

Show that the Haar wavelets are orthonormal.

Solution:

The Haar scaling function is defined:

$$ \phi(x) = \begin{cases} 1 & \text{if} \,\, 0 \le x < 1 \\ 0 & \text{otherwise,} \\ \end{cases} $$

the mother Haar wavelet is defined:

$$ \psi(x) = \begin{cases} -1 & \text{if} \,\, 0 \le x \le \frac12 \\ 1 & \text{if} \,\, \frac12 < x \le 1 \\ 0 & \text{otherwise,} \end{cases} $$

and finally, for any integers $j$ and $k$, we define the daughter wavelet:

$$ \psi_{jk}(x) = 2 ^ {j / 2} \psi(2^j x - k).$$

We first establish normality:

\begin{align*} ||\phi(x)||^2 &= \int \phi^2(x) \, dx = \int_0^1 \, dx = 1 \end{align*}

\begin{align*} ||\psi(x)||^2 &= \int_{0}^{1/2} (-1)^2 \, dx + \int_{1/2}^1 1^2 \, dx = 1 \end{align*}

\begin{align*} ||\psi_{jk}(x)||^2 &= \int_{2^{-j}k}^{(k + 1/2)2^{-j}} \left(-2^{j/2}\right)^2 \, dx + \int_{(k + 1/2)2^{-j}}^{(k + 1)2^{-j}} \left(2^{j/2}\right)^2 \, dx = 2^j \cdot 2^{-j-1} + 2^j \cdot 2^{-j-1} = 1. \end{align*}

Orthogonality can be established by the following observations. If $(j_1, k_1) \ne (j_2, k_2)$, then either the supports of $\psi_{j_1k_1}$ and $\psi_{j_2k_2}$ are disjoint, or the support of one is contained in one of the two halves of the support of the other. In the first case, the pointwise product of the functions is zero, and hence so is the inner product. Suppose the second case holds and, WLOG, that $\text{supp}(\psi_{j_1k_1})$ is contained in one half of $\text{supp}(\psi_{j_2k_2})$. Then $\psi_{j_2k_2}$ is constant on $\text{supp}(\psi_{j_1k_1})$ (equal to $-2^{j_2/2}$ or $2^{j_2/2}$, depending on whether that half is the left or the right one), so the pointwise product is a constant multiple of $\psi_{j_1k_1}$, which integrates to 0. The same argument gives $\langle \phi, \psi_{jk} \rangle = 0$ for every $j \ge 0$ and $0 \le k \le 2^j - 1$, since $\phi \equiv 1$ on the support of each such $\psi_{jk}$.
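
The argument can also be spot-checked numerically with the Haar functions defined in the solution to question 7 above (a sketch; it approximates the inner products of a few wavelets on $[0, 1]$ with a midpoint Riemann sum, so the off-diagonal entries should be 0 and the diagonal entries 1):

In [ ]:
z = (np.arange(1, 100_001) - 0.5) / 100_000
wavelets = {"phi": haar_phi(z), "psi_00": haar_psi_jk(z, 0, 0),
            "psi_10": haar_psi_jk(z, 1, 0), "psi_11": haar_psi_jk(z, 1, 1)}
for name_a, w_a in wavelets.items():
    print("  ".join(f"<{name_a},{name_b}> = {np.mean(w_a * w_b):+.3f}"
                    for name_b, w_b in wavelets.items()))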

9¶

Consider again the doppler signal:

$$f(x) = \sqrt{x(1 - x)} \sin \left( \frac{2.1\pi}{x + 0.05}\right).$$

Let $n = 1,024$, $\sigma = 0.1$, and let $(x_1, \dots, x_n) = (1 / n, \dots, 1)$. Generate data

$$Y_i = f(x_i) + \sigma \epsilon_i$$

where $\epsilon_i \sim N(0, 1)$.

(a) Fit the curve using the cosine basis method. Plot the function estimate and confidence band for $J = 10, 20, \dots, 100$.

(b) Use Haar wavelets to fit the curve.

In [14]:
N = 1_024
sigma = 0.1
x = np.arange(1, N + 1) / N  # the grid (1/n, 2/n, ..., 1), built to avoid floating-point end-point issues
eps = norm.rvs(size=N)
Y = f_d(x) + sigma * eps
In [15]:
# fitting using cosine basis method

def cosine_phi(x, j):
    if j == 0:
        return np.ones(len(x))
    else:
        return math.sqrt(2) * np.cos(j * np.pi * x)
    
ofr = orthogonal_function_regressor(x, Y, cosine_phi)
Js = np.arange(10, 110, 10)
fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(22,8))
ax = ax.flatten()
for index, J in enumerate(Js):
    xx = np.linspace(0, 1, 1000)
    y_est, lb, ub = ofr.partial_sum(xx, J=J, confidence_band=True)
    ax[index].plot(xx, y_est, color='blue')
    ax[index].plot(xx, lb, color='orange')
    ax[index].plot(xx, ub, color='orange')
    ax[index].set_ylim(-1, 1)
    ax[index].set_title(f"J={J}")
    ax[index].grid(False)
    ax[index].set_yticks([])
In [16]:
# computing the coefficients using the DWT for Haar Wavelets
n = len(Y)
J = int(np.ceil(np.log2(n)))
D = {x: None for x in range(J)}
temp = Y / np.sqrt(n)
for j in range(J - 1, -1, -1):
    m = 2 ** j
    I = np.array(list(range(1, m + 1))).astype(int)
    D[j] = ((temp[2 * I - 1] - temp[np.array(2 * I) - 2]) / np.sqrt(2))
    temp = (temp[2 * I - 1] + temp[np.array(2 * I) - 2]) / np.sqrt(2)

# applying universal thresholding
D_values = np.array([D[J-1][k] for k in range(0, int(2 ** (J-1)))])
sigma_hat = np.sqrt(n) * np.median(np.abs(D_values)) / 0.6745
for j in range(J):
    for k in range(0, int(((2 ** (j))))):
        if np.abs(D[j][k]) <= sigma_hat * np.sqrt(2 * np.log(n) / n):
            D[j][k] = 0

# computing the approximation
xx = np.linspace(0, 1, 1_000)
alpha_hat = (haar_phi(x) @ Y) / n
partial_sum = alpha_hat * haar_phi(xx)
for j in range(J):
    for k in range(0, int(2 ** j)):
        partial_sum = partial_sum + D[j][k] * haar_psi_jk(xx, j, k)

plt.scatter(x, Y, alpha=0.03)
plt.plot(xx, partial_sum)
Out[16]:
[<matplotlib.lines.Line2D at 0x7f0bc7e433d0>]

10¶

(Haar density Estimation.) Let $X_1, \dots, X_n \sim f$ for some density $f$ on $[0, 1]$. Let's consider constructing a wavelet histogram. Let $\phi$ and $\psi$ be the Haar father and mother wavelet. Write

$$f(x) \approx \phi(x) + \sum_{j=0}^{J-1}\sum_{k=0}^{2^j - 1} \beta_{j,k} \psi_{j,k}(x)$$

where $J \approx \log_2(n)$. Let

$$\hat{\beta}_{jk} = \frac{1}{n} \sum_{i=1}^n \psi_{j,k}(X_i).$$

(a) Show that $\hat{\beta}_{j,k}$ is an unbiased estimate of $\beta_{j,k}$.

(b) Define the Haar histogram

$$\hat{f}(x) = \phi(x) + \sum_{j=1}^B \sum_{k=0}^{2^j - 1} \hat{\beta}_{j,k} \psi_{j,k}(x)$$

for $0 \le B \le J-1$.

(c) Find an approximate expression for the MSE as a function of $B$.

(d) Generate $n=1,000$ observations from a $\text{Beta}(15,4)$ density. Estimate the density using the Haar histogram. Use leave-one-out cross validation to choose $B$.

Solution:

Establishing unbiasedness. Note: we must assume $f \in L^2[0,1]$, so that the Haar expansion of $f$ is valid. This holds if, e.g., $f$ is bounded on $[0, 1]$; in particular, it holds if $f$ is continuous on $[0, 1]$.

\begin{align*} \mathbb{E}[\hat{\beta}_{jk}] &= \frac{1}{n} \sum_{i=1}^n \mathbb{E}[\psi_{jk}(X_i)] \\ &= \mathbb{E}[\psi_{jk}(X_1)] \\ &= \int_0^1 \psi_{jk}(x) f(x) \, dx = \beta_{jk} \tag{Requires $f \in L^2[0,1]$} \end{align*}
\begin{align*} \mathbb{V}[\hat{\beta}_{jk}] &= \frac{1}{n^2} \sum_{i=1}^n \mathbb{V}[\psi_{jk}(X_i)] \\ &= \frac{1}{n} \mathbb{V}[\psi_{jk}(X_1)] \\ &= \frac{1}{n} \int_0^1 (\psi_{jk}(x) - \beta_{jk})^2 f(x) \, dx \\ &= \frac{\sigma^2_{jk}}{n} \end{align*}

Finding an expression for the Mean Squared Error:

\begin{align*} \mathbb{E}[(\hat{f}(x) - f(x))^2] &= \mathbb{V}[\hat{f}(x) - f(x)] + \left(\mathbb{E}[\hat{f}(x) - f(x)]\right)^2 \\ &= \mathbb{V}[\hat{f}(x)] + \left(f(x) - \mathbb{E}[\hat{f}(x)]\right)^2 \end{align*}

The bias term is

\begin{align*} f(x) - \mathbb{E}[\hat{f}(x)] &= \left[\phi(x) + \sum_{j=0}^{J-1}\sum_{k=0}^{2^j - 1} \beta_{j,k} \psi_{j,k}(x) \right] - \left[\phi(x) + \sum_{j=0}^B \sum_{k=0}^{2^j - 1} \mathbb{E}[\hat{\beta}_{j,k}] \psi_{j,k}(x)\right] \\ &= \sum_{j=B+1}^{J-1}\sum_{k=0}^{2^j - 1} \beta_{j,k} \psi_{j,k}(x) \tag{$\hat{\beta}_{j,k}$ is unbiased} \end{align*}

and the variance is

\begin{align*} \mathbb{V}[\hat{f}(x)] &= \mathbb{V} \left[\phi(x) + \sum_{j=0}^{B}\sum_{k=0}^{2^j - 1} \hat{\beta}_{j,k} \psi_{j,k}(x) \right] \\ &= \sum_{j=0}^{B}\sum_{k=0}^{2^j - 1} \frac{\sigma^2_{jk}}{n}\psi_{j,k}^2(x) + \sum_{j=0}^{B}\sum_{k=0}^{2^j - 1} \sum_{(j',k') \ne (j,k)} \text{Cov}[\hat{\beta}_{j,k}, \hat{\beta}_{j', k'}] \psi_{jk}(x) \psi_{j'k'}(x), \end{align*}

where the inner sum runs over $0 \le j' \le B$ and $0 \le k' \le 2^{j'} - 1$ with $(j', k') \ne (j, k)$.

When integrating the variance plus the squared bias over $[0, 1]$, all of the cross-terms vanish by orthonormality of the Haar basis, and we get:

\begin{align*} \text{MISE} = \sum_{j=B+1}^{J-1}\sum_{k=0}^{2^j - 1} \beta^2_{j,k} + \sum_{j=0}^{B}\sum_{k=0}^{2^j - 1} \frac{\sigma^2_{jk}}{n} \end{align*}
In [17]:
def haar_histogram(X, B):

    def compute_beta(x):
        beta = {}
        for j in range(B + 1):
            for k in range(2 ** j):
                beta[j,k] = np.sum(haar_psi_jk(x, j, k)) / n
        return beta
    
    beta = compute_beta(X)

    def f_hat(z):
        f_hat = haar_phi(z).astype(np.float64)
        for j in range(B + 1):
            for k in range(2 ** j):
                f_hat += beta[j,k] * haar_psi_jk(z, j, k)
        return f_hat
        
    return f_hat

n = 1_000
J = np.round(np.log2(n)).astype(int)
X = Beta.rvs(a=15, b=4, size=n)
f_hat = haar_histogram(X, 4)

step = 1e-4
xx = np.arange(0, 1 + step, step)
plt.plot(xx, f_hat(xx))
plt.show()
In [18]:
def risk_of_haar_histogram(X, B):
    f_hat = haar_histogram(X, B)
    xx = np.arange(0, 1 + step, step)
    second_moment = np.sum((f_hat(xx) ** 2) * step)
    return second_moment - (2 / n) * np.sum([haar_histogram(np.delete(X, i), B)(X[i]) for i in range(n)])

risk = {}
for B in range(J+1):
    risk[B] = risk_of_haar_histogram(X, B)
In [19]:
plt.plot(list(risk.keys()), list(risk.values()))
plt.xlabel("$B$")
plt.ylabel("$\hat{J}$", rotation=0)
plt.title("Risk of Haar Histogram by $B$")

print(f"Risk minimizer: B={np.argmin(list(risk.values()))}")
Risk minimizer: B=4
In [20]:
plt.figure(figsize=(20, 8))
for index, B in enumerate(range(J)):
    f_hat = haar_histogram(X, B)
    ax = plt.subplot(2, 5, index + 1)
    ax.plot(xx, f_hat(xx))
    ax.grid(False)
    ax.set_title(f"$B = {B}$, estimated risk $= {risk[B]:.3f}$")
plt.show()

11¶

In this question, we will explore the motivation for equation (21.37). Let $X_1, \dots, X_n \sim N(0, \sigma^2)$. Let

$$\hat{\sigma} = \sqrt{n} \times \frac{\text{median}(|X_1|, \dots, |X_n|)}{0.6745}.$$

(a) Show that $\mathbb{E}[\hat{\sigma}] = \sigma$.

(b) Simulate $n = 100$ observations from a $N(0,1)$ distribution. Compute $\hat{\sigma}$ as well as the usual estimate of $\sigma$. Repeat 1,000 times and compare the MSE.

(c) Repeat (b) but add some outliers to the data. To do this, simulate each observation from a $N(0, 1)$ with probability 0.95 and simulate each observation from a $N(0, 10)$ with probability 0.05.

Solution:

Note: the equation as stated appears to contain a typo; for i.i.d. observations we should not scale by $\sqrt{n}$ (presumably the $\sqrt{n}$ in equation (21.37) is there because the empirical wavelet coefficients it is applied to have standard deviation of order $\sigma/\sqrt{n}$). The estimate should be

$$\hat{\sigma} = \frac{\text{median}(|X_1|, \dots, |X_n|)}{0.6745}.$$

Fact: the sample median is a consistent, asymptotically unbiased estimator of the population median. That is, if $Y_1, \dots, Y_n \sim f$, then $\mathbb{E}[\text{median}(Y_1, \dots, Y_n)] \to F_{Y_1}^{-1}(1/2)$ as $n \to \infty$ (exact unbiasedness holds only in special cases, e.g. when $f$ is symmetric about its median). Therefore, at least approximately,

$$\mathbb{E}[\hat{\sigma}] \approx \frac{F_{|X|}^{-1}(1/2)}{0.6745}$$

where $X \sim N(0, \sigma^2)$. The result then follows by observing that $|X|$ follows a half-normal distribution with parameter $\sigma$, which has median

$$F_{|X|}^{-1}(1/2) = \sigma \sqrt{2} \text{erf}^{-1}(1/2) \approx 0.6745 \sigma.$$
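
Both the constant and the (approximate) unbiasedness can be checked numerically (a sketch; the values $\sigma = 2$, $n = 100$, and the number of replications are arbitrary choices):

In [ ]:
print(f"Phi^(-1)(3/4) = {norm.ppf(0.75):.6f}")   # the constant 0.6745...

sigma_true = 2.0
samples = norm.rvs(scale=sigma_true, size=(100, 50_000))   # n = 100 per replication
sigma_hat_sim = np.median(np.abs(samples), axis=0) / 0.6745
print(f"mean of sigma_hat over replications: {np.mean(sigma_hat_sim):.4f} (sigma = {sigma_true})")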

The median-based estimate improves on the MSE of the usual sample-variance estimate

$$\hat{\sigma}^2_{\text{old}} = \frac{1}{n-1}\sum_{i=1}^n (X_i - \bar{X})^2.$$

(Since $\sigma = 1$ in the simulation below, $\sigma^2 = \sigma$, so both estimators target the same value and their MSEs are directly comparable.)
In [21]:
n = 100
k = 1000
X = norm.rvs(size=(n, k))

sigma_hat_old = np.sum(np.power(X - np.mean(X, axis=0), 2), axis=0) / (n - 1)
sigma_hat_new = np.median(np.abs(X), axis=0) / 0.6745

mse_old = np.mean(np.power(sigma_hat_old - 1, 2))
mse_new = np.mean(np.power(sigma_hat_new - 1, 2))

print(f"MSE of old estimate: {mse_old:.4f}")
print(f"MSE of new estimate: {mse_new:.4f}")
MSE of old estimate: 0.0195
MSE of new estimate: 0.0132

The second experiment demonstrates the new estimate is much more robust to outliers:

In [22]:
rand_ints = np.random.randint(0, 100, size=(n, k))
mask_5_percent = rand_ints < 5
normal_1 = norm.rvs(0, 1, size=(n, k))
normal_10 = norm.rvs(0, 10, size=(n, k))
X = np.where(mask_5_percent, normal_10, normal_1)

sigma_hat_old = np.sum(np.power(X - np.mean(X, axis=0), 2), axis=0) / (n - 1)
sigma_hat_new = np.median(np.abs(X), axis=0) / 0.6745

mse_old = np.mean(np.power(sigma_hat_old - 1, 2))
mse_new = np.mean(np.power(sigma_hat_new - 1, 2))

print(f"MSE of old estimate: {mse_old:.4f}")
print(f"MSE of new estimate: {mse_new:.4f}")
MSE of old estimate: 41.2768
MSE of new estimate: 0.0183

12¶

Repeat question 6 using the Haar basis.

Solution:

(a)

The coefficient of a basis function is computed by taking the inner product of the basis function and the function of interest:

\begin{align*} \alpha = \int_0^1 f(x) \phi(x) \, dx, \qquad \beta_{jk} = \int_0^1 f(x) \psi_{jk}(x) \, dx. \end{align*}

For $f(x) = \sqrt{2}\cos(3\pi x)$:

\begin{align*} \alpha &= \sqrt{2} \int_0^1 \cos(3 \pi x) \, dx = \frac{\sqrt{2}}{3 \pi} \sin(3 \pi) = 0 \end{align*}

\begin{align*} \beta_{jk} &= \sqrt{2} \cdot 2^{j/2} \left[ -\int_{2^{-j}k}^{2^{-j}(k + 1/2)} \cos(3 \pi x) \, dx + \int_{2^{-j}(k + 1/2)}^{2^{-j}(k + 1)} \cos(3 \pi x) \, dx \right]\\ &= \frac{2^{(j + 1)/2}}{3 \pi}\left[\sin(3 \pi (k + 1) 2^{-j}) - 2\sin(3 \pi (k + 1/2) 2^{-j}) + \sin(3 \pi k 2^{-j}) \right] \end{align*}
In [23]:
def beta_function_a(j, k):
    first_term = np.sin(3 * np.pi * (k + 1) * 2 ** (-j))
    second_term = - 2 * np.sin(3 * np.pi * (k + 0.5) * 2 ** (-j))
    third_term = np.sin(3 * np.pi * (k) * 2 ** (-j))
    return (2 ** ((j + 1) / 2)/ (3 * np.pi)) * (first_term + second_term + third_term)
    
def wavelet_regressor(f, father_wavelet, daughter_wavelet, alpha=None, beta_function=None):
    beta = {}
    J = 6
    z = np.arange(0, 1, 1e-4)
    Y = f(z)
    if beta_function is None:
        alpha = (Y @ father_wavelet(z)) / len(Y)

    for j in range(J + 1):
        for k in range(2 ** j):
            if beta_function is None:
                beta[j,k] = (Y @ daughter_wavelet(z, j, k)) / len(Y)
            else:
                beta[j, k] = beta_function(j, k)
    
    def f_hat(x):
        result = alpha * father_wavelet(x)
        for j in range(J + 1):
            for k in range(2 ** j):
                result += beta[j, k] * daughter_wavelet(x, j, k)
        return result
    return f_hat

f_hat = wavelet_regressor(f_a, haar_phi, haar_psi_jk, alpha=0, beta_function=beta_function_a)
xx = np.arange(0, 1, 1e-4)
plt.plot(xx, f_hat(xx))
Out[23]:
[<matplotlib.lines.Line2D at 0x7f0bc7e5ad60>]

(b)

\begin{align*} \alpha = \int_0^1 \sin(\pi x) \, dx = \left.-\frac{\cos(\pi x)}{\pi}\right\rvert_0^1 = \frac{2}{\pi} \end{align*}

\begin{align*} \beta_{jk} &= 2^{j/2} \left[-\int_{2^{-j}k}^{2^{-j}(k + 1/2)} \sin(\pi x) \, dx + \int_{2^{-j}(k + 1/2)}^{2^{-j}(k + 1)} \sin(\pi x) \, dx \right]\\ &= 2^{j/2} \left[ \left. \frac{\cos(\pi x)}{\pi} \right\rvert_{2^{-j}k}^{2^{-j}(k + 1/2)} - \left. \frac{\cos(\pi x)}{\pi} \right\rvert_{2^{-j}(k + 1/2)}^{2^{-j}(k + 1)} \right] \\ &= \frac{2^{j/2}}{\pi}\left[ 2\cos \left(\frac{(k + 1/2) \pi}{2^{j}}\right) - \cos \left(\frac{k \pi}{2^{j}}\right) - \cos \left(\frac{(k + 1) \pi}{2^{j}}\right)\right] \end{align*}
In [24]:
def beta_function_b(j,k):
    return ((2 ** (j / 2)) / np.pi) \
        * (2 * np.cos(((k + (1/2)) * np.pi)/(2 ** j)) \
        - np.cos(k * np.pi / (2 ** j)) - np.cos(((k + 1) * np.pi)/(2 ** j)))

f_hat = wavelet_regressor(f_b, haar_phi, haar_psi_jk, alpha = (2 / np.pi), beta_function=beta_function_b)

xx = np.arange(0, 1, 1e-4)
plt.plot(xx, f_hat(xx))
Out[24]:
[<matplotlib.lines.Line2D at 0x7f0bcc24ab50>]
In [25]:
f_hat = wavelet_regressor(f_c, haar_phi, haar_psi_jk)
xx = np.arange(0, 1, 1e-4)
plt.plot(xx, f_hat(xx))
Out[25]:
[<matplotlib.lines.Line2D at 0x7f0bcdf23610>]
In [26]:
f_hat = wavelet_regressor(f_d, haar_phi, haar_psi_jk)
xx = np.arange(0, 1, 1e-4)
plt.plot(xx, f_hat(xx))
Out[26]:
[<matplotlib.lines.Line2D at 0x7f0bd05ff970>]