from functools import partial, lru_cache
from itertools import product
import math
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import norm
plt.style.use("default.mplstyle")
plt.rcParams["lines.linewidth"] = 1
Let $X_1, \dots X_n \sim f$ and let $\hat{f}_n$ be the kernel density estimator using the boxcar kernel:
\begin{align*} K(x) = \begin{cases} 1 & -\frac12 < x < \frac12 \\ 0 & \text{otherwise.} \\ \end{cases} \\ \end{align*}(a) Show that
$$\mathbb{E}(\hat{f}(x)) = \frac1h \int_{x - (h/2)}^{x + (h/2)}f(y) \, dy$$and
$$\mathbb{V}(\hat{f}(x)) = \frac{1}{nh^2} \left[ \int_{x - (h/2)}^{x + (h/2)}f(y) \, dy - \left( \int_{x - (h/2)}^{x + (h/2)}f(y) \, dy\right)^2\right]$$Solution:
\begin{align*} \hat{f}_n(x) &= \frac1n \sum_{i=1}^n \frac{1}{h} K \left( \frac{x - X_i}{h} \right) \\ &= \frac1n \sum_{i=1}^n \frac{1}{h} \mathbb{1}_{|(x-X_i)/h| \le 1/2} \\ &= \frac{1}{nh} C_x, \end{align*}where $C_x$ is the number of $i$ such that $|x - X_i| \le h/2$. Observe:
\begin{align*} \mathbb{E}(C_x) &= \sum_{i=1}^n \mathbb{P}(|x - X_i| \le h/2) \\ &= \sum_{i=1}^n \int_{x - h/2}^{x + h/2} f(y)\, dy \\ &= n \int_{x - h/2}^{x + h/2} f(y)\, dy \end{align*}Therefore
$$\mathbb{E}\left[\hat{f}_n(x)\right] = \frac{1}{h} \int_{x - h/2}^{x + h/2} f(y) \, dy.$$For the variance, observe that $C_x \sim \text{Binom}(n, p)$, where $p = \int_{x - h/2}^{x + h/2} f(y)\, dy$. Then,
\begin{align*} \mathbb{V} \left( \hat{f}_n(x) \right) &= \frac{1}{n^2h^2} \mathbb{V}(C_x) \\ &= \frac{1}{nh^2} (p(1-p)) \\ &= \frac{1}{nh^2} \left[ \int_{x - (h/2)}^{x + (h/2)}f(y) \, dy - \left( \int_{x - (h/2)}^{x + (h/2)}f(y) \, dy\right)^2\right]. \end{align*}(b) Show that if $h \to 0$ and $nh \to \infty$ as $n \to \infty$, then $\hat{f}_n(x) \to f(x)$ in probability.
Solution:
Observe that the bias tends to 0:
\begin{align*} \lim_{n\to\infty} \mathbb{E}[\hat{f}_n(x)] &= \lim_{n\to\infty} \frac{1}{h} \int_{x-(h/2)}^{x+(h/2)} f(y) \, dy \\ &= \lim_{h\to 0} \frac{1}{h} \int_{x-(h/2)}^{x+(h/2)} f(y) \, dy \tag{$h \to 0$ as $n \to \infty$} \\ &= \lim_{h\to 0} \frac{F(x + h/2) - F(x - h/2)}{h} \\ &= f(x). \tag{limit definition of the derivative; $F' = f$ where $f$ is continuous} \end{align*}Thus, by the bias-variance decomposition: \begin{align*} 0 \le \lim_{n\to\infty} \mathbb{E}\left[ (\hat{f}_n(x) - f(x))^2 \right] &= 0 + \lim_{n\to\infty} \mathbb{V}\left[\hat{f}_n(x)\right] \\ &= \lim_{n\to\infty} \frac{1}{nh} \left[ \frac{1}{h}\int_{x - (h/2)}^{x + (h/2)}f(y) \, dy - \frac{1}{h}\left( \int_{x - (h/2)}^{x + (h/2)}f(y) \, dy\right)^2\right] \\ &\le \lim_{n\to\infty} \frac{1}{nh} \left[ \frac{1}{h}\int_{x - (h/2)}^{x + (h/2)}f(y) \, dy \right] = 0, \end{align*}
where the last limit holds because \begin{align*} \lim_{n\to\infty}\frac{1}{h}\int_{x - (h/2)}^{x + (h/2)}f(y) \, dy = f(x) < \infty. \end{align*}
Thus $\hat{f}_n(x) \to f(x)$ in quadratic mean, and therefore also in probability.
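As a quick numerical sanity check of part (a) (not part of the exercise), here is a minimal Monte Carlo sketch comparing the simulated mean and variance of the boxcar estimator at a point against the closed-form expressions, assuming for concreteness that $f$ is the standard normal density:
rng = np.random.default_rng(0)
n, h, x0 = 200, 0.5, 0.3              # assumption: arbitrary illustrative values
n_sims = 20_000
# Assumption (not from the exercise): X_i ~ N(0, 1), so f is the standard normal density.
samples = rng.normal(size=(n_sims, n))
# Boxcar KDE at x0: (1 / (n h)) * #{i : |x0 - X_i| <= h / 2}
f_hat = (np.abs(x0 - samples) <= h / 2).sum(axis=1) / (n * h)
p = norm.cdf(x0 + h / 2) - norm.cdf(x0 - h / 2)   # integral of f over (x0 - h/2, x0 + h/2)
print(f_hat.mean(), p / h)                        # E[f_hat(x0)] vs. p / h
print(f_hat.var(), p * (1 - p) / (n * h ** 2))    # V[f_hat(x0)] vs. p(1 - p) / (n h^2)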
Get the data on fragments of glass collected in forensic work from the book website. Estimate the density of the first variable (refractive index) using a histogram and use a kernel density estimator. Use cross-validation to choose the amount of smoothing. Experiment with different binwidths and bandwidths. Comment on the similarities and differences. Construct 95 percent confidence bands for your estimators.
Solution:
Getting the data:
! curl -s --insecure https://www.stat.cmu.edu/~larry/all-of-statistics/=data/glass.dat --output data/glass.dat
glass_data = pd.read_csv("data/glass.dat", sep=r"\s+", usecols=["RI"])
refractive_indices = np.squeeze(glass_data.values)
Implementing a histogram estimator, including functions to compute estimated risk:
class histogram_estimator:
"""
Class for defining a histogram estimator, a nonparametric curve estimator.
Attributes:
X (list): List of values.
m (int): Number of histogram bins.
bounds (tuple): Lower and upper bounds (optional).
Methods:
cv_risk_estimate() -> Float:
Computes the estimated risk using cross validation (up to a constant).
simple_cv_risk_estimate() -> Float:
Computes the estimated risk using cross validation (up to a constant) using simpler formula.
"""
def __init__(self, X, m, bounds = None):
self.X = X
self.m = m
if bounds is None:
bounds = (X.min(), X.max())
self.bounds = bounds
self.n = len(self.X)
self.counts, self.bin_edges = np.histogram(self.X,
bins=self.m,
range = self.bounds)
self.h = self.bin_edges[1] - self.bin_edges[0]
self.p_hat = self.counts / self.n
def __call__(self, x):
        # Index of the bin containing x, clamped so the right-most edge lands in the last bin.
        bin_index = min(int((x - self.bin_edges[0]) // self.h), self.m - 1)
density = self.counts[bin_index] / (self.h * self.n)
return density
def count_at(self, x):
bin_index = min(math.floor((x - self.bin_edges[0]) // self.h), self.m - 1)
return self.counts[bin_index]
def plot(self, confidence_band=False, alpha=0.05, **kwargs):
if confidence_band:
xx = self.bin_edges[:-1]
lower_bounds = [self.confidence_interval(x, alpha)[0] for x in xx]
upper_bounds = [self.confidence_interval(x, alpha)[1] for x in xx]
            plt.step(xx, lower_bounds, "-", color='orange', where='post')
            plt.step(xx, upper_bounds, "-", color='orange', where='post')
        # Plot each bin height at its left edge; the value extends to the next edge.
        plt.step(self.bin_edges[:-1], [self(x) for x in self.bin_edges[:-1]],
                 where='post', label=f"hist (m={self.m})", **kwargs)
def cv_risk_estimate(self):
second_moment = np.sum((self.p_hat / self.h) ** 2 * self.h)
sum = 0
for index, value in enumerate(self.X):
X_1 = np.delete(self.X, index)
f = histogram_estimator(X_1, self.m, self.bounds)
sum += f(value)
return second_moment - (2 / self.n) * sum
def simple_cv_risk_estimate(self):
first_term = 2 / ((self.n - 1) * self.h)
# note that the equation in the text has a typo
second_term = ((self.n + 1) / (self.h * (self.n - 1))) * np.sum(self.p_hat ** 2)
return first_term - second_term
def confidence_interval(self, x, alpha):
z = norm.ppf( 1 - (alpha / (2 * self.m)))
c = (z / 2) * np.sqrt(self.m / self.n)
l = (max(np.sqrt(self(x)) - c, 0)) ** 2
u = (np.sqrt(self(x)) + c) ** 2
return l, u
Computing the risk for various numbers of bins ($m$) from 1 to 1000. The minimizer is $m=106$, but the risk curve is quite rough, and the risk is similar for values from ~50 to ~200.
mm = np.arange(1, 1001, 1)
risk = np.empty(shape=mm.size)
for index, m in enumerate(mm):
est = histogram_estimator(refractive_indices, m)
risk[index] = est.cv_risk_estimate()
m_star = mm[np.argmin(risk)]
plt.plot(mm, risk)
plt.scatter(x=m_star, y=risk.min(), color='red', label=f"$m^*$ = {m_star}")
plt.xlabel("Number of bins (m)")
plt.ylabel("Estimated Risk (up to a constant)")
plt.legend()
plt.show()
Plotting the optimal histogram along with the 95% confidence envelope. Since there are only 214 observations, and a range of values larger than 20, the envelope is large. This is unsurprising in light of Figure 20.4 in the text. In that plot of astronomy data, a similar number of bins is employed, but there are around five times more observations. Even so, there is substantial uncertainty.
optimal_hist = histogram_estimator(refractive_indices, m_star)
optimal_hist.plot(confidence_band=True, color='red', alpha=0.05)
plt.legend()
plt.show()
def epanechnikov_kernel(x):
cond = np.abs(x) < math.sqrt(5)
return np.where(cond, (3 / 4) * (1 - (x ** 2) / 5) / math.sqrt(5), 0)
def gaussian_kernel(x):
return norm.pdf(x)
class kde:
def __init__(self, X, kernel, h):
self.X = X
self.K = kernel
self.n = len(X)
self.h = h
def __call__(self, x):
return (1 / (self.n * self.h) * np.sum(self.K((x - self.X) / self.h)))
def plot(self, confidence_interval=False, **kwargs):
X_min, X_max = (self.X.min(), self.X.max())
xx = np.linspace(X_min, X_max, 1000)
plt.plot(xx, [self(x) for x in xx],
label=f"K = {self.K.__name__.split('_')[0]}, h={self.h:.3f}",
**kwargs)
if confidence_interval:
bounds = [self.confidence_interval(x) for x in xx]
lower_bounds = [bound[0] for bound in bounds]
upper_bounds = [bound[1] for bound in bounds]
plt.plot(xx, lower_bounds, color='orange')
plt.plot(xx, upper_bounds, color='orange')
def est_risk(self):
if self.K.__name__ == "gaussian_kernel":
K_2 = partial(norm.pdf, scale=np.sqrt(2))
K_star = lambda x: K_2(x) - 2 * self.K(x)
K_star_inputs = [(self.X[i] - self.X[j]) / self.h for i, j in product(range(self.n), range(self.n))]
sum = np.sum(K_star(K_star_inputs))
return (1 / (self.h * (self.n ** 2))) * sum + (2 / (self.n * self.h)) * self.K(0)
else:
raise ValueError(f"Kernel must be gaussian.")
def confidence_interval(self, x, alpha = 0.05):
if self.K.__name__ == "gaussian_kernel":
a = self.X.min()
b = self.X.max()
omega = 3 * self.h
m = (b - a) / omega
q = norm.ppf((1 + (1 - alpha) ** (1 / m)) / 2)
Y = (1 / self.h) * self.K((x - self.X) / self.h)
Y_bar = np.mean(Y)
s_2 = (1 / (self.n - 1)) * np.sum(np.power(Y - Y_bar, 2))
se = np.sqrt(s_2) / math.sqrt(self.n)
lower_bound = self(x) - q * se
upper_bound = self(x) + q * se
return lower_bound, upper_bound
else:
raise ValueError(f"Kernel must be gaussian.")
hist_est = histogram_estimator(refractive_indices, m=m_star)
hist_est.plot(color='black')
est = kde(refractive_indices, epanechnikov_kernel, h=1)
est.plot(color='blue')
est = kde(refractive_indices, gaussian_kernel, h=1)
est.plot(color='red')
plt.legend()
plt.xlabel("Refractive Index")
plt.show()
hh = np.logspace(-2, 1, 100)
est_risks = np.empty(shape=len(hh))
for index, h in enumerate(hh):
est = kde(refractive_indices, gaussian_kernel, h=h)
est_risks[index] = est.est_risk()
plt.plot(hh, est_risks)
plt.xscale("log")
plt.xlabel("Bandwidth ($h$)")
plt.ylabel("Estimated Risk (up to a constant)")
h_star = hh[np.argmin(est_risks)]
plt.scatter(h_star, est_risks.min(), s=20, color='red', label=f"$h^*$ = {h_star:.5f}")
plt.legend()
plt.show()
hist_est = histogram_estimator(refractive_indices, m=m_star)
hist_est.plot(color='blue')
est = kde(refractive_indices, gaussian_kernel, h=h_star)
est.plot(color='red')
plt.legend()
plt.xlabel("Refractive Index")
plt.show()
est = kde(refractive_indices, gaussian_kernel, h=h_star)
est.plot(color='red', confidence_interval=True)
plt.xlabel("Refractive Index")
Consider the data from question 2. Let $Y$ be refractive index and let $x$ be aluminum content (the fourth variable). Perform a nonparametric regression to fit the model $Y = f(x) + \epsilon$. Use cross-validation to estimate the bandwidth. Construct 95 percent confidence bands for your estimate.
data = pd.read_csv("data/glass.dat", sep=r"\s+")
refractive_indices = data["RI"].values
aluminum_content = data["Al"].values
class NWRegressor:
def __init__(self, x, Y, h):
self.x = x
self.Y = Y
self.K = gaussian_kernel
self.h = h
self.n = len(self.x)
def __call__(self, z):
xx = np.repeat(self.x.reshape(-1,1), len(z), axis=1)
K_values = self.K((z - xx) / self.h)
W = K_values / np.sum(K_values, axis=0)
return W.T @ self.Y
def est_risk(self):
xx = np.repeat(self.x.reshape(-1, 1), len(self.x), axis=1)
        K_values = self.K((xx - self.x) / self.h)
K_values_sum = K_values.sum(axis=1)
W = K_values / K_values_sum
R = W.T @ self.Y
terms = ((self.Y - R) / (1 - self.K(0) / K_values_sum)) ** 2
if np.isnan(terms).any():
return np.inf
return np.sum(terms)
def plot(self, confidence_bands=False, **kwargs):
a = min(self.x)
b = max(self.x)
t = np.linspace(a, b, 1000)
plt.plot(t, self(t), **kwargs)
if confidence_bands:
omega = 3 * self.h # effective width of kernel
m = (b - a) / omega
alpha = 0.05
            q = norm.ppf((1 + (1 - alpha) ** (1 / m)) / 2)
Y_sorted_by_x = np.array([z[1] for z in sorted(zip(self.x, self.Y), key=lambda x: x[0])])
sigma_2_hat = (1 / (2 * (self.n - 1))) * np.sum(np.power(np.diff(Y_sorted_by_x), 2))
sigma_hat = np.sqrt(sigma_2_hat)
xx = np.repeat(self.x.reshape(-1, 1), len(t), axis=1)
K_values = self.K((t - xx) / self.h)
K_values_sum = K_values.sum(axis=0)
W = K_values / K_values_sum
se = sigma_hat * np.sqrt(np.sum(np.power(W, 2), axis=0))
plt.plot(t, self(t) + q * se, color='orange')
plt.plot(t, self(t) - q * se, color='orange')
h_vals = np.logspace(-4, 0, 100)
r_hats = []
for h in h_vals:
r_hat = NWRegressor(aluminum_content, refractive_indices, h=h)
r_hats.append(r_hat.est_risk())
h_star = h_vals[np.argmin(r_hats)]
fig = plt.figure()
plt.scatter(h_star, r_hats[np.argmin(r_hats)], s=50, color='red', label=f"$h^* =${h_star:.3f}")
plt.plot(h_vals, r_hats, linewidth=2)
plt.xlabel("h")
plt.ylabel("Estimated Risk (up to a constant)")
plt.legend()
plt.show()
r_hat = NWRegressor(aluminum_content, refractive_indices, h=h_star)
fig = plt.figure()
plt.scatter(aluminum_content, refractive_indices, alpha=0.5, label="data")
r_hat.plot(confidence_bands=True, color='red', linewidth=2, label=f"Optimal Nadaraya-Watson Estimator (h={r_hat.h:.3f})")
plt.xlabel("Aluminum Content")
plt.ylabel("Refractive Index")
plt.legend()
plt.show()
Prove Lemma 20.1:
The risk can be written as
$$R(g, \hat{g}) = \mathbb{E}\left(L(g, \hat{g})\right) = \int b^2(x) \, dx + \int v(x) \, dx$$where
$$b(x) = \mathbb{E}(\hat{g}_n(x)) - g(x)$$is the bias of $\hat{g}_n(x)$ at a fixed $x$ and
$$v(x) = \mathbb{V}(\hat{g}_n(x)) = \mathbb{E}\left[ \left(\hat{g}_n(x) - \mathbb{E}[\hat{g}_n(x)]\right)^2 \right]$$is the variance of $\hat{g}_n(x)$ at a fixed $x$.
Solution:
\begin{align*} R(g, \hat{g}) &= \mathbb{E} \left[\int \left( g(x) - \hat{g}(x) \right)^2 \, dx \right] \\ &= \int \mathbb{E}\left[ \left( g(x) - \hat{g}(x) \right)^2 \right] \, dx \\ &= \int \left(\mathbb{E}\left[g(x) - \hat{g}(x) \right]\right)^2 + \mathbb{V}[g(x) - \hat{g}(x)] \, dx \tag{$E(X^2) = V(X) + E(X)^2$} \\ &= \int \left(g(x) - \mathbb{E}[\hat{g}(x)] \right)^2 + \mathbb{V}[\hat{g}(x)] \, dx \tag{$\mathbb{V}(-X + c) = \mathbb{V}(X)$} \\ &= \int b^2(x) \, dx + \int v(x) \, dx \end{align*}Prove Theorem 20.3:
Consider fixed $x$ and fixed $m$, and let $B_j$ be the bin containing $x$. Then,
$$\mathbb{E}(\hat{f}_n(x)) = \frac{p_j}{h} \text{ and } \mathbb{V}(\hat{f}_n(x)) = \frac{p_j(1-p_j)}{nh^2}$$Solution:
Note that $\nu_j$, the count of observations in $B_j$, is distributed as $\text{Binom}(n, p_j)$, and thus, $\mathbb{E}[\nu_j] = np_j$, and $\mathbb{V}[\nu_j] = np_j(1-p_j)$. Therefore,
\begin{align*} \mathbb{E}[\hat{f}_n(x)] &= \mathbb{E}[\hat{p_j}/h] \\ &= \frac{\mathbb{E}[\nu_j]}{nh} \\ &= p_j / h \end{align*}and
\begin{align*} \mathbb{V}[\hat{f}_n(x)] &= \mathbb{V}[\hat{p_j}/h] \\ &= \frac{\mathbb{V}[\nu_j]}{(nh)^2} \\ &= \frac{p_j(1-p_j)}{nh^2}. \end{align*}Prove Theorem 20.7:
The following identity holds:
$$\hat{J}(h) = \frac{2}{(n-1)h} - \frac{n+1}{(n - 1)h} \sum_{j=1}^m \hat{p}_j^2.$$Solution: Let $\nu_{j(i)} = \nu_j$ when $X_i \in B_j$. Observe that $\sum_{i=1}^n \nu_{j(i)} = \sum_{j=1}^m \nu_j^2$. \begin{align*} \hat{J}(h) &= \int \left(\hat{f}_n(x)\right)^2 \, dx - \frac{2}{n} \sum_{i=1}^n \hat{f}_{(-i)}(X_i) \\ &= \sum_{j=1}^m \int_{B_j} (\hat{p}_j / h)^2 \, dx - \frac{2}{n} \sum_{i=1}^n \frac{\nu_{j(i)} - 1}{(n-1)h} \\ &= \sum_{j=1}^m \frac{\nu_j^2}{n^2 h} + \frac{2}{(n-1)h} - \frac{2}{n(n-1)h}\sum_{i=1}^n \nu_{j(i)} \\ &= \frac{2}{(n-1)h} + \left[\frac{1}{h} - \frac{2n^2}{n(n-1)h} \right]\sum_{j=1}^m \frac{\nu_{j}^2}{n^2} \\ &= \frac{2}{(n-1)h} - \frac{n+1}{(n-1)h}\sum_{j=1}^m \hat{p}_j^2. \end{align*}
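As a sanity check of this identity (not part of the exercise), here is a minimal sketch on synthetic data comparing a direct leave-one-out computation of $\hat{J}(h)$ with the closed-form expression:
rng = np.random.default_rng(1)
X = rng.normal(size=200)            # assumption: synthetic data, not the glass data
m = 20
counts, edges = np.histogram(X, bins=m)
n, h = len(X), edges[1] - edges[0]
p_hat = counts / n
# Direct computation: integral of f_hat^2 minus (2/n) * sum_i f_hat_{-i}(X_i).
bin_idx = np.minimum(np.searchsorted(edges, X, side="right") - 1, m - 1)
loo = (counts[bin_idx] - 1) / ((n - 1) * h)       # f_hat_{-i}(X_i)
J_direct = np.sum(p_hat ** 2) / h - (2 / n) * loo.sum()
# Closed-form expression from Theorem 20.7.
J_formula = 2 / ((n - 1) * h) - ((n + 1) / ((n - 1) * h)) * np.sum(p_hat ** 2)
print(J_direct, J_formula)          # should agree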
Prove Theorem 20.15:
For any $h > 0$,
$$\mathbb{E}\left[\hat{J}(h)\right] = \mathbb{E}[J(h)].$$Also,
$$\hat{J}(h) \approx \frac{1}{hn^2}\sum_i \sum_j K^* \left( \frac{X_i - X_j}{h}\right) + \frac{2}{nh}K(0)$$where $K^*(x) = K^{(2)}(x) - 2K(x)$ and $K^{(2)}(z) = \int K(z-y)K(y)\,dy$. In particular, if $K$ is a $N(0,1)$ Gaussian kernel then $K^{(2)}(z)$ is the $N(0,2)$ density.
Solution:
First, we demonstrate $\mathbb{E}\left[\hat{J}(h)\right] = \mathbb{E}[J(h)]$. Recalling:
$$J(h) = \int \hat{f}_n^2(x) \, dx - 2 \int \hat{f}_n(x) f(x) \, dx$$and
$$\hat{J}(h) = \int \hat{f}^2(x) \, dx - \frac{2}{n} \sum_{i=1}^n \hat{f}_{-i}(X_i),$$we may observe that it suffices to show
$$\mathbb{E}\left[\int \hat{f}_n(x) f(x) \, dx \right] = \mathbb{E}\left[ \frac{1}{n} \sum_{i=1}^n \hat{f}_{-i}(X_i) \right].$$Observing:
$$\mathbb{E}\left[\hat{f}_{-i}(X_i) \mid X_i\right] = \int \frac{1}{h} K\left( \frac{X_i - u}{h}\right) f(u)\, du,$$we have
$$ \begin{align*} \mathbb{E}\left[\frac{1}{n}\sum_{i=1}^n\hat{f}_{-i}(X_i)\right] &= \mathbb{E}\left[\int \frac{1}{nh}\sum_{i=1}^n K\left( \frac{X_i - u}{h}\right) f(u)\, du \right] \\ &= \mathbb{E}\left[\int \hat{f}_n(u) f(u) \, du\right]. \end{align*} $$We now derive the formula. Assume the kernel is even.
For the first term, substituting $u = (x - X_j)/h$ and using the evenness of $K$,
\begin{align*} \int \hat{f}_n^2(x) \, dx &= \frac{1}{n^2h^2} \sum_{i}\sum_{j} \int K\left(\frac{x - X_i}{h}\right) K\left(\frac{x - X_j}{h}\right) dx \\ &= \frac{1}{n^2h} \sum_{i}\sum_{j} \int K\left(u - \frac{X_i - X_j}{h}\right) K(u) \, du \\ &= \frac{1}{n^2h} \sum_{i}\sum_{j} K^{(2)}\left(\frac{X_i - X_j}{h}\right). \tag{1} \end{align*}For the second term, approximating $n - 1 \approx n$ and then adding and subtracting the diagonal ($i = j$) terms,
\begin{align*} -\frac{2}{n}\sum_{i=1}^n \hat{f}_{-i}(X_i) &= -\frac{2}{n(n-1)h} \sum_{i}\sum_{j \ne i} K\left(\frac{X_i - X_j}{h}\right) \\ &\approx -\frac{2}{n^2 h} \sum_{i}\sum_{j} K\left(\frac{X_i - X_j}{h}\right) + \frac{2}{nh}K(0). \tag{2} \end{align*}The formula follows from summing (1) and (2). Suppose $K$ is a $N(0,1)$ Gaussian kernel. Then,
\begin{align*} K^{(2)}(z) &= \int K(z-y)K(y) \, dy \\ &= \frac{1}{2\pi} \int e^{-(z-y)^2 / 2} e^{-y^2 / 2} \, dy \\ &= \frac{1}{2\pi} \int e^{-z^2 / 2 + zy - y^2 + (z^2 / 4 - z^2 / 4)} \, dy \tag{completing the square} \\ &= \frac{1}{2\pi} e^{-z^2 / 4} \int e^{-(y - z/2)^2} \, dy \\ &= \frac{1}{2\pi} e^{-z^2 / 4} (\sqrt{\pi}) \\ &= \frac{1}{\sqrt{2\pi\cdot 2}} e^{-\frac{1}{2}(z^2/2)}, \end{align*}so $K^{(2)}(z)$ is the $N(0,2)$ Gaussian kernel.
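A quick numerical check of this last claim (not part of the exercise), comparing the convolution integral with the $N(0,2)$ density at a few points:
from scipy.integrate import quad
for z in (0.0, 0.7, 1.5, 3.0):
    # K^(2)(z) = int K(z - y) K(y) dy, computed by quadrature
    K2_z, _ = quad(lambda y: norm.pdf(z - y) * norm.pdf(y), -np.inf, np.inf)
    print(z, K2_z, norm.pdf(z, scale=np.sqrt(2)))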
Consider regression data $(x_1, Y_1), \dots (x_n, Y_n)$. Suppose that $0 \le x_i \le 1$ for all $i$. Define bins $B_j$ as in equation (20.7). For $x \in B_j$ define
$$\hat{r}_n(x) = \bar{Y}_j$$where $\bar{Y}_j$ is the mean of the $Y_i$ corresponding to those $x_i$'s in $B_j$. Find the approximate risk of this estimator. From this expression for the risk, find the optimal bandwidth. At what rate does the risk go to zero?
Solution:
The following will be similar to the derivation of Theorem 20.4, which presents the risk as a function of $f$, $n$, and $h$ for a histogram estimator.
We have the bias-variance tradeoff:
\begin{align*} R(r, \hat{r}_n) &= \int \mathbb{V}\left[\hat{r}_n(x)\right] + \left(b(x)\right)^2 \, dx, \end{align*}where $b(x) = \mathbb{E}[\hat{r}_n(x)] - r(x)$. We first compute the bias. Let $x \in B_j$. Observe that $\mathbb{E}[\hat{r}(x)] \approx \frac{1}{h} \int_{B_j} r(u) \, du$ (assuming the $x_i$ are roughly evenly spread within the bin), and that for $u \in B_j$
$$r(u) \approx r(x) + (u - x)r'(x).$$Therefore,
\begin{align*} \mathbb{E}[\hat{r}(x)] &\approx \frac{1}{h} \int_{B_j} r(x) + (u - x)r'(x) \, du \\ &= r(x) + r'(x)\left(h\left(j - \frac{1}{2}\right) - x \right) \end{align*}Thus,
$$b(x) \approx r'(x)\left(h\left(j - \frac{1}{2}\right) - x \right)$$If $\tilde{x}_j$ is the center of the bin,
$$\int_{B_j} b^2(x) \, dx \approx (r'(\tilde{x}_j))^2 \frac{h^3}{12},$$meaning
\begin{align*} \int_0^1 b^2(x)\, dx &= \sum_{j=1}^m \int_{B_j} b^2(x)\,dx \\ &\approx \frac{h^2}{12} \sum_{j=1}^m h(r'(\tilde{x}_j))^2 \\ &\approx \frac{h^2}{12} \int_0^1 (r'(x))^2 \, dx. \end{align*}Assume that observations are evenly distributed in $[0,1]$. Observe that $\mathbb{V}[\hat{r}(x)] = \mathbb{V}[\bar{Y}_j] = \frac{1}{nh}\mathbb{V}[Y_j] = \frac{\sigma^2}{nh}$, and therefore
$$\int_0^1 \mathbb{V}[\hat{r}(x)] \, dx = \frac{\sigma^2}{nh}.$$Altogether,
$$R(r, \hat{r}_n) = \frac{h^2}{12} \int_0^1 (r'(x))^2 \, dx + \frac{\sigma^2}{nh}, \tag{1}$$which is minimized by
$$h^* = \left( \frac{6\sigma^2}{n \int_0^1 (r'(x))^2 \, dx} \right)^{1/3}.$$Plugging $h^*$ into (1), we see that the risk goes to zero at rate $n^{-2/3}$.
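For completeness, the minimizer follows by differentiating (1) with respect to $h$ and setting the derivative to zero:
$$\frac{\partial}{\partial h}\left[\frac{h^2}{12} \int_0^1 (r'(x))^2 \, dx + \frac{\sigma^2}{nh}\right] = \frac{h}{6} \int_0^1 (r'(x))^2 \, dx - \frac{\sigma^2}{nh^2} = 0 \iff h^3 = \frac{6\sigma^2}{n\int_0^1 (r'(x))^2 \, dx},$$and since $h^* \propto n^{-1/3}$, both terms of (1) are of order $n^{-2/3}$.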
Show that with suitable smoothness assumptions on $r(x)$, $\hat{\sigma}^2$ in equation (20.36) is a consistent estimator of $\sigma^2$.
Solution:
Recall
$$\hat{\sigma}^2 = \frac{1}{2(n-1)}\sum_{i=1}^{n-1}(Y_{i+1} - Y_i)^2.$$Since $X_n \xrightarrow{qm} X \Rightarrow X_n \xrightarrow{p} X$, it suffices to show convergence in quadratic mean. Thus, by Chapter 5, #2, it suffices to show $\mathbb{V}[\hat{\sigma}^2] \to 0$ and $\mathbb{E}[\hat{\sigma}^2] \to \sigma^2$.
First showing $\mathbb{E}[\hat{\sigma}^2] \to \sigma^2$. Observe:
\begin{align*} Y_{i+1} - Y_i &= \left[r(x_{i+1}) + \epsilon_{i+1}\right] - \left[r(x_i) + \epsilon_i \right] \\ &= \left[r(x_{i+1}) - r(x_i)\right] + \left[\epsilon_{i+1} - \epsilon_i \right], \end{align*}so $\mathbb{V}[Y_{i+1} - Y_i] = \mathbb{V}[\epsilon_{i+1} - \epsilon_i] = 2\sigma^2$, since the $x_j$ are fixed. Suppose $r$ is Lipschitz continuous with constant $L$; that is, $|r(x_{i+1}) - r(x_i)| \le L|x_{i+1} - x_i|$. Observe:
$$0 \le [r(x_{i+1}) - r(x_i)]^2 \le L^2(x_{i+1} - x_i)^2$$Therefore, \begin{align*} \mathbb{E}[(Y_{i+1} - Y_i)^2] &= \mathbb{V}[Y_{i+1} - Y_i] + \left(\mathbb{E}[Y_{i+1} - Y_i]\right)^2 \\ &= 2\sigma^2 +[r(x_{i+1}) - r(x_i)]^2 \in [2\sigma^2, 2\sigma^2 + L^2(x_{i+1} - x_i)^2] \end{align*}
and thus
\begin{align*} \mathbb{E}[\hat{\sigma}^2] &= \frac{1}{2(n-1)}\sum_{i=1}^{n-1}\mathbb{E}\left[(Y_{i+1} - Y_i)^2\right] \\ &\in \left[\sigma^2, \; \sigma^2 + \frac{L^2}{2(n-1)}\sum_{i=1}^{n-1}(x_{i+1} - x_i)^2\right]. \end{align*}Assume the domain of the $x$ values is a finite interval of width $M$ and that the design points are evenly spaced, so that $x_{i+1} - x_i = M/(n-1)$. Then $\sum_{i=1}^{n-1}(x_{i+1} - x_i)^2 = \sum_{i=1}^{n-1}(M/(n-1))^2 = M^2 / (n-1)$. Therefore,
$$0 \le \frac{L^2}{2(n-1)}\sum_{i=1}^{n-1}(x_{i+1} - x_i)^2 \le \frac{(LM)^2}{2(n-1)^2} \to 0$$and thus
$$\mathbb{E}[\hat{\sigma}^2] \to \sigma^2.$$We now show $\mathbb{V}[\hat{\sigma}^2] \to 0$. Observe
\begin{align*} \mathbb{V}[(Y_{i+1} - Y_i)^2] &= \mathbb{E}[(Y_{i+1} - Y_i)^4] - \mathbb{E}[(Y_{i+1} - Y_i)^2]^2. \end{align*}Denoting $r_i = r(x_i)$, we can expand the quantity inside the first expectation:
\begin{align*} (Y_{i+1} - Y_i)^4 &= (r_{i+1} + \epsilon_{i+1} - r_i - \epsilon_i)^4 \\ &= (r_{i+1} - r_i)^4 + 4(r_{i+1} - r_i)^3(\epsilon_{i+1} - \epsilon_i) + 6(r_{i+1} - r_i)^2(\epsilon_{i+1} - \epsilon_i)^2 + 4(r_{i+1} - r_i)(\epsilon_{i+1} - \epsilon_i)^3 + (\epsilon_{i+1} - \epsilon_i)^4 \end{align*}Note that, assuming Gaussian errors (so that $\epsilon_{i+1} - \epsilon_i \sim N(0, 2\sigma^2)$),
$$\mathbb{E}[\epsilon_{i+1} - \epsilon_i] = \mathbb{E}[(\epsilon_{i+1} - \epsilon_i)^3] = 0, \quad \mathbb{E}[(\epsilon_{i+1} - \epsilon_i)^2] = 2\sigma^2, \quad \mathbb{E}[(\epsilon_{i+1} - \epsilon_i)^4] = 3(2\sigma^2)^2 = 12\sigma^4.$$Hence,
\begin{align*} \mathbb{E}[(Y_{i+1} - Y_i)^4] = (r_{i+1} - r_i)^4 + 12\sigma^2(r_{i+1} - r_i)^2 + 12\sigma^4. \end{align*}Combining this with $\mathbb{E}[(Y_{i+1} - Y_i)^2] = (r_{i+1} - r_i)^2 + 2\sigma^2$,
\begin{align*} \mathbb{V}[(Y_{i+1} - Y_i)^2] &= \left[(r_{i+1} - r_i)^4 + 12\sigma^2(r_{i+1} - r_i)^2 + 12\sigma^4\right] - \left[(r_{i+1} - r_i)^2 + 2\sigma^2\right]^2 \\ &= 8\sigma^4 + 8\sigma^2(r_{i+1} - r_i)^2. \end{align*}Recalling the Lipschitz continuity of $r$, we have $0 \le (r_{i+1} - r_i)^2 \le L^2(x_{i+1} - x_i)^2$. So, we may bound the variance of $(Y_{i+1} - Y_i)^2$:
\begin{align*} 8\sigma^4 \le \mathbb{V}[(Y_{i+1} - Y_i)^2] \le 8\sigma^4 + 8\sigma^2L^2(x_{i+1} - x_i)^2. \end{align*}Writing $Z_i = (Y_{i+1} - Y_i)^2$, the $Z_i$ are not independent, but $Z_i$ shares an $\epsilon$ term only with its immediate neighbors, and $|\mathrm{Cov}(Z_i, Z_{i+1})| \le \sqrt{\mathbb{V}[Z_i]\mathbb{V}[Z_{i+1}]} \le \frac12\left(\mathbb{V}[Z_i] + \mathbb{V}[Z_{i+1}]\right)$, so
\begin{align*} 0 \le \mathbb{V}[\hat{\sigma}^2] &= \frac{1}{4(n-1)^2}\mathbb{V}\left[\sum_{i=1}^{n-1}Z_i\right] \\ &\le \frac{3}{4(n-1)^2}\sum_{i=1}^{n-1}\mathbb{V}[Z_i] \\ &\le \frac{3}{4(n-1)^2}\left[8(n-1)\sigma^4 + 8\sigma^2L^2\sum_{i=1}^{n-1}(x_{i+1} - x_i)^2\right] \\ &= \frac{6\sigma^4}{n-1} + \frac{6\sigma^2(LM)^2}{(n-1)^3} \to 0. \end{align*}In summary, we have shown that with a finite domain, evenly spaced design points, Gaussian errors, and Lipschitz continuity of the true regression function, $\hat{\sigma}^2$ is a consistent estimator.
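A minimal simulation (not part of the exercise) illustrating the consistency, assuming $r(x) = \sin(2\pi x)$ on $[0, 1]$, evenly spaced design points, and Gaussian errors:
rng = np.random.default_rng(2)
sigma = 0.3                           # assumption: true error standard deviation
for n in (50, 500, 5000):
    x = np.linspace(0, 1, n)
    Y = np.sin(2 * np.pi * x) + rng.normal(scale=sigma, size=n)
    sigma2_hat = np.sum(np.diff(Y) ** 2) / (2 * (n - 1))
    print(n, sigma2_hat, sigma ** 2)  # estimate vs. true sigma^2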
Prove Theorem 20.22:
$\hat{J}$ can be written as
$$\hat{J}(h) = \sum_{i=1}^n (Y_i - \hat{r}(x_i))^2 \frac{1}{\left(1 - \frac{K(0)}{\sum_{j=1}^n K\left(\frac{x_i - x_j}{h}\right)} \right)^2}$$Solution:
Recall:
\begin{align*} \hat{J}(h) = \sum_{i=1}^n (Y_i - \hat{r}_{-i}(x_i))^2 \end{align*}We can express the evaluation at $x_i$ of the estimator trained without observation $i$ in the following way:
\begin{align*} \hat{r}_{-i}(x_i) &= \sum_{j=1, j\ne i}^n \frac{K((x_i - x_j)/h)}{\sum_{k=1, k\ne i}^n K((x_i - x_k)/h)}Y_j \\ &= \sum_{j=1, j\ne i}^n \frac{K((x_i - x_j)/h)}{\sum_{k=1}^n K((x_i - x_k)/h) - K(0)}Y_j \\ &= \frac{1}{\sum_{k=1}^n K((x_i - x_k)/h) - K(0)} \left[ \sum_{j=1}^n K((x_i - x_j)/h)Y_j - K(0)Y_i \right] \\ &= \frac{1}{1 - \frac{K(0)}{\sum_{k=1}^n K((x_i - x_k)/h)}}\left[\sum_{j=1}^n \frac{K((x_i - x_j)/h)Y_j}{\sum_{k=1}^n K((x_i - x_k)/h)} - \frac{K(0)Y_i}{\sum_{k=1}^n K((x_i - x_k)/h)} \right] \\ &= \frac{1}{1 - \alpha}\left[\hat{r}(x_i) - \alpha Y_i \right] \tag{$\alpha = \frac{K(0)}{\sum_{k=1}^n K((x_i - x_k)/h)}$} \\\ \end{align*}Therefore,
\begin{align*} Y_i - \hat{r}_{-i}(x_i) &= Y_i - \frac{1}{1-\alpha}[\hat{r}(x_i) - \alpha Y_i] \\ &= \frac{(1-\alpha)Y_i - \hat{r}(x_i) + \alpha Y_i}{1-\alpha} \\ &= (Y_i - \hat{r}(x_i))\frac{1}{1-\alpha}, \end{align*}and, finally:
\begin{align*} \sum_{i=1}^n (Y_i - \hat{r}_{-i}(x_i))^2 &= \sum_{i=1}^n (Y_i - \hat{r}(x_i))^2\frac{1}{(1-\alpha)^2} \\ &= \sum_{i=1}^n (Y_i - \hat{r}(x_i))^2 \frac{1}{\left(1 - \frac{K(0)}{\sum_{j=1}^n K\left(\frac{x_i - x_j}{h}\right)} \right)^2}. \end{align*}
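As a numerical check of this identity (not part of the exercise), here is a minimal sketch on synthetic data comparing a direct leave-one-out computation with the shortcut formula:
rng = np.random.default_rng(3)
n, h = 100, 0.2                      # assumption: arbitrary illustrative values
x = rng.uniform(0, 1, size=n)
Y = np.sin(2 * np.pi * x) + rng.normal(scale=0.1, size=n)
K = norm.pdf
K_mat = K((x[:, None] - x[None, :]) / h)     # K((x_i - x_j) / h)
row_sums = K_mat.sum(axis=1)
r_hat = (K_mat @ Y) / row_sums               # Nadaraya-Watson fit at each x_i
# Direct leave-one-out residual sum of squares.
J_direct = 0.0
for i in range(n):
    w = np.delete(K_mat[i], i)
    J_direct += (Y[i] - w @ np.delete(Y, i) / w.sum()) ** 2
# Shortcut formula from Theorem 20.22.
J_formula = np.sum((Y - r_hat) ** 2 / (1 - K(0) / row_sums) ** 2)
print(J_direct, J_formula)                   # should agree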