from functools import partial, lru_cache
from itertools import product
import math
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import norm
plt.style.use("default.mplstyle")
plt.rcParams["lines.linewidth"] = 1
Let $X_1, \dots X_n \sim f$ and let $\hat{f}_n$ be the kernel density estimator using the boxcar kernel:
\begin{align*} K(x) = \begin{cases} 1 & -\frac12 < x < \frac12 \\ 0 & \text{otherwise.} \\ \end{cases} \\ \end{align*}(a) Show that
$$\mathbb{E}(\hat{f}(x)) = \frac1h \int_{x - (h/2)}^{x + (h/2)}f(y) \, dy$$and
$$\mathbb{V}(\hat{f}(x)) = \frac{1}{nh^2} \left[ \int_{x - (h/2)}^{x + (h/2)}f(y) \, dy - \left( \int_{x - (h/2)}^{x + (h/2)}f(y) \, dy\right)^2\right]$$Solution:
\begin{align*} \hat{f}_n(x) &= \frac1n \sum_{i=1}^n \frac{1}{h} K \left( \frac{x - X_i}{h} \right) \\ &= \frac1n \sum_{i=1}^n \frac{1}{h} \mathbb{1}_{|(x-X_i)/h| \le 1/2} \\ &= \frac{1}{nh} C_x, \end{align*}where $C_x$ is the number of $i$ such that $|x - X_i| \le h/2$. Observe:
\begin{align*} \mathbb{E}(C_x) &= \sum_{i=1}^n \mathbb{P}(|x - X_i| \le h/2) \\ &= \sum_{i=1}^n \int_{x - h/2}^{x + h/2} f(y)\, dy \\ &= n \int_{x - h/2}^{x + h/2} f(y)\, dy \end{align*}Therefore
$$\mathbb{E}\left[\hat{f}_n(x)\right] = \frac{1}{h} \int_{x - h/2}^{x + h/2} f(y) \, dy.$$For the variance, observe that $C_x \sim \text{Binom}(n, p)$, where $p = \int_{x - h/2}^{x + h/2} f(y)\, dy$. Then,
\begin{align*} \mathbb{V} \left( \hat{f}_n(x) \right) &= \frac{1}{n^2h^2} \mathbb{V}(C_x) \\ &= \frac{1}{nh^2} (p(1-p)) \\ &= \frac{1}{nh^2} \left[ \int_{x - (h/2)}^{x + (h/2)}f(y) \, dy - \left( \int_{x - (h/2)}^{x + (h/2)}f(y) \, dy\right)^2\right]. \end{align*}(b) Show that if $h \to 0$ and $nh \to \infty$ as $n \to \infty$, then $\hat{f}_n(x) \to f(x)$ in probability.
Solution:
Observe that the bias tends to 0:
\begin{align*} \lim_{n\to\infty} \mathbb{E}[\hat{f}_n(x)] &= \lim_{n\to\infty} \frac{1}{h} \int_{x-(h/2)}^{x+(h/2)} f(y) \, dy \\ &= \lim_{h\to 0} \frac{1}{h} \int_{x-(h/2)}^{x+(h/2)} f(y) \, dy \tag{$h \to 0$ as $n \to \infty$} \\ &= \lim_{h\to 0} \frac{F(x + h/2) - F(x - h/2)}{h} \\ &= f(x). \tag{limit definition of the derivative; $F' = f$ where $f$ is continuous} \end{align*}Thus, by the bias-variance decomposition: \begin{align*} 0 \le \lim_{n\to\infty} \mathbb{E}\left[ (\hat{f}_n(x) - f(x))^2 \right] &= 0 + \lim_{n\to\infty} \mathbb{V}\left[\hat{f}_n(x)\right] \\ &= \lim_{n\to\infty} \frac{1}{nh} \left[ \frac{1}{h}\int_{x - (h/2)}^{x + (h/2)}f(y) \, dy - \frac{1}{h}\left( \int_{x - (h/2)}^{x + (h/2)}f(y) \, dy\right)^2\right] \\ &\le \lim_{n\to\infty} \frac{1}{nh} \left[ \frac{1}{h}\int_{x - (h/2)}^{x + (h/2)}f(y) \, dy \right] = 0, \end{align*}
where the last limit holds because \begin{align*} \lim_{n\to\infty}\frac{1}{h}\int_{x - (h/2)}^{x + (h/2)}f(y) \, dy = f(x) < \infty. \end{align*}
Thus $\hat{f}_n(x) \to f(x)$ in quadratic mean, and therefore also in probability.
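As a quick numerical sanity check of part (a) (not part of the exercise), here is a minimal Monte Carlo sketch comparing the simulated mean and variance of the boxcar estimator at a point against the closed-form expressions, assuming for concreteness that $f$ is the standard normal density:
rng = np.random.default_rng(0)
n, h, x0 = 200, 0.5, 0.3              # assumption: arbitrary illustrative values
n_sims = 20_000
# Assumption (not from the exercise): X_i ~ N(0, 1), so f is the standard normal density.
samples = rng.normal(size=(n_sims, n))
# Boxcar KDE at x0: (1 / (n h)) * #{i : |x0 - X_i| <= h / 2}
f_hat = (np.abs(x0 - samples) <= h / 2).sum(axis=1) / (n * h)
p = norm.cdf(x0 + h / 2) - norm.cdf(x0 - h / 2)   # integral of f over (x0 - h/2, x0 + h/2)
print(f_hat.mean(), p / h)                        # E[f_hat(x0)] vs. p / h
print(f_hat.var(), p * (1 - p) / (n * h ** 2))    # V[f_hat(x0)] vs. p(1 - p) / (n h^2)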
Get the data on fragments of glass collected in forensic work from the book website. Estimate the density of the first variable (refractive index) using a histogram and use a kernel density estimator. Use cross-validation to choose the amount of smoothing. Experiment with different binwidths and bandwidths. Comment on the similarities and differences. Construct 95 percent confidence bands for your estimators.
Solution:
Getting the data:
! curl -s --insecure https://www.stat.cmu.edu/~larry/all-of-statistics/=data/glass.dat --output data/glass.dat
glass_data = pd.read_csv("data/glass.dat", sep=r"\s+", usecols=["RI"])
refractive_indices = np.squeeze(glass_data.values)
Implementing a histogram estimator, including functions to compute estimated risk:
class histogram_estimator:
"""
Class for defining a histogram estimator, a nonparametric curve estimator.
Attributes:
X (list): List of values.
m (int): Number of histogram bins.
bounds (tuple): Lower and upper bounds (optional).
Methods:
cv_risk_estimate() -> Float:
Computes the estimated risk using cross validation (up to a constant).
simple_cv_risk_estimate() -> Float:
Computes the estimated risk using cross validation (up to a constant) using simpler formula.
"""
def __init__(self, X, m, bounds = None):
self.X = X
self.m = m
if bounds is None:
bounds = (X.min(), X.max())
self.bounds = bounds
self.n = len(self.X)
self.counts, self.bin_edges = np.histogram(self.X,
bins=self.m,
range = self.bounds)
self.h = self.bin_edges[1] - self.bin_edges[0]
self.p_hat = self.counts / self.n
def __call__(self, x):
        # Index of the bin containing x, clamped so the right-most edge lands in the last bin.
        bin_index = min(int((x - self.bin_edges[0]) // self.h), self.m - 1)
density = self.counts[bin_index] / (self.h * self.n)
return density
def count_at(self, x):
bin_index = min(math.floor((x - self.bin_edges[0]) // self.h), self.m - 1)
return self.counts[bin_index]
def plot(self, confidence_band=False, alpha=0.05, **kwargs):
if confidence_band:
xx = self.bin_edges[:-1]
lower_bounds = [self.confidence_interval(x, alpha)[0] for x in xx]
upper_bounds = [self.confidence_interval(x, alpha)[1] for x in xx]
            plt.step(xx, lower_bounds, "-", color='orange', where='post')
            plt.step(xx, upper_bounds, "-", color='orange', where='post')
        # Plot each bin height at its left edge; the value extends to the next edge.
        plt.step(self.bin_edges[:-1], [self(x) for x in self.bin_edges[:-1]],
                 where='post', label=f"hist (m={self.m})", **kwargs)
def cv_risk_estimate(self):
second_moment = np.sum((self.p_hat / self.h) ** 2 * self.h)
sum = 0
for index, value in enumerate(self.X):
X_1 = np.delete(self.X, index)
f = histogram_estimator(X_1, self.m, self.bounds)
sum += f(value)
return second_moment - (2 / self.n) * sum
def simple_cv_risk_estimate(self):
first_term = 2 / ((self.n - 1) * self.h)
# note that the equation in the text has a typo
second_term = ((self.n + 1) / (self.h * (self.n - 1))) * np.sum(self.p_hat ** 2)
return first_term - second_term
def confidence_interval(self, x, alpha):
z = norm.ppf( 1 - (alpha / (2 * self.m)))
c = (z / 2) * np.sqrt(self.m / self.n)
l = (max(np.sqrt(self(x)) - c, 0)) ** 2
u = (np.sqrt(self(x)) + c) ** 2
return l, u
Computing the risk for various numbers of bins ($m$) from 1 to 1000. The minimizer is $m=106$, but the risk curve is quite rough, and the risk is similar for values from ~50 to ~200.
mm = np.arange(1, 1001, 1)
risk = np.empty(shape=mm.size)
for index, m in enumerate(mm):
est = histogram_estimator(refractive_indices, m)
risk[index] = est.cv_risk_estimate()
m_star = mm[np.argmin(risk)]
plt.plot(mm, risk)
plt.scatter(x=m_star, y=risk.min(), color='red', label=f"$m^*$ = {m_star}")
plt.xlabel("Number of bins (m)")
plt.ylabel("Estimated Risk (up to a constant)")
plt.legend()
plt.show()
Plotting the optimal histogram along with the 95% confidence envelope. Since there are only 214 observations, and a range of values larger than 20, the envelope is large. This is unsurprising in light of Figure 20.4 in the text. In that plot of astronomy data, a similar number of bins is employed, but there are around five times more observations. Even so, there is substantial uncertainty.
optimal_hist = histogram_estimator(refractive_indices, m_star)
optimal_hist.plot(confidence_band=True, color='red', alpha=0.05)
plt.legend()
plt.show()
def epanechnikov_kernel(x):
cond = np.abs(x) < math.sqrt(5)
return np.where(cond, (3 / 4) * (1 - (x ** 2) / 5) / math.sqrt(5), 0)
def gaussian_kernel(x):
return norm.pdf(x)
class kde:
def __init__(self, X, kernel, h):
self.X = X
self.K = kernel
self.n = len(X)
self.h = h
def __call__(self, x):
return (1 / (self.n * self.h) * np.sum(self.K((x - self.X) / self.h)))
def plot(self, confidence_interval=False, **kwargs):
X_min, X_max = (self.X.min(), self.X.max())
xx = np.linspace(X_min, X_max, 1000)
plt.plot(xx, [self(x) for x in xx],
label=f"K = {self.K.__name__.split('_')[0]}, h={self.h:.3f}",
**kwargs)
if confidence_interval:
bounds = [self.confidence_interval(x) for x in xx]
lower_bounds = [bound[0] for bound in bounds]
upper_bounds = [bound[1] for bound in bounds]
plt.plot(xx, lower_bounds, color='orange')
plt.plot(xx, upper_bounds, color='orange')
def est_risk(self):
if self.K.__name__ == "gaussian_kernel":
K_2 = partial(norm.pdf, scale=np.sqrt(2))
K_star = lambda x: K_2(x) - 2 * self.K(x)
K_star_inputs = [(self.X[i] - self.X[j]) / self.h for i, j in product(range(self.n), range(self.n))]
sum = np.sum(K_star(K_star_inputs))
return (1 / (self.h * (self.n ** 2))) * sum + (2 / (self.n * self.h)) * self.K(0)
else:
raise ValueError(f"Kernel must be gaussian.")
def confidence_interval(self, x, alpha = 0.05):
if self.K.__name__ == "gaussian_kernel":
a = self.X.min()
b = self.X.max()
omega = 3 * self.h
m = (b - a) / omega
q = norm.ppf((1 + (1 - alpha) ** (1 / m)) / 2)
Y = (1 / self.h) * self.K((x - self.X) / self.h)
Y_bar = np.mean(Y)
s_2 = (1 / (self.n - 1)) * np.sum(np.power(Y - Y_bar, 2))
se = np.sqrt(s_2) / math.sqrt(self.n)
lower_bound = self(x) - q * se
upper_bound = self(x) + q * se
return lower_bound, upper_bound
else:
raise ValueError(f"Kernel must be gaussian.")
hist_est = histogram_estimator(refractive_indices, m=m_star)
hist_est.plot(color='black')
est = kde(refractive_indices, epanechnikov_kernel, h=1)
est.plot(color='blue')
est = kde(refractive_indices, gaussian_kernel, h=1)
est.plot(color='red')
plt.legend()
plt.xlabel("Refractive Index")
plt.show()
hh = np.logspace(-2, 1, 100)
est_risks = np.empty(shape=len(hh))
for index, h in enumerate(hh):
est = kde(refractive_indices, gaussian_kernel, h=h)
est_risks[index] = est.est_risk()
plt.plot(hh, est_risks)
plt.xscale("log")
plt.xlabel("Bandwidth ($h$)")
plt.ylabel("Estimated Risk (up to a constant)")
h_star = hh[np.argmin(est_risks)]
plt.scatter(h_star, est_risks.min(), s=20, color='red', label=f"$h^*$ = {h_star:.5f}")
plt.legend()
plt.show()
hist_est = histogram_estimator(refractive_indices, m=m_star)
hist_est.plot(color='blue')
est = kde(refractive_indices, gaussian_kernel, h=h_star)
est.plot(color='red')
plt.legend()
plt.xlabel("Refractive Index")
plt.show()
est = kde(refractive_indices, gaussian_kernel, h=h_star)
est.plot(color='red', confidence_interval=True)
plt.xlabel("Refractive Index")
Consider the data from question 2. Let $Y$ be refractive index and let $x$ be aluminum content (the fourth variable). Perform a nonparametric regression to fit the model $Y = f(x) + \epsilon$. Use cross-validation to estimate the bandwidth. Construct 95 percent confidence bands for your estimate.
data = pd.read_csv("data/glass.dat", sep=r"\s+")
refractive_indices = data["RI"].values
aluminum_content = data["Al"].values
class NWRegressor:
def __init__(self, x, Y, h):
self.x = x
self.Y = Y
self.K = gaussian_kernel
self.h = h
self.n = len(self.x)
def __call__(self, z):
xx = np.repeat(self.x.reshape(-1,1), len(z), axis=1)
K_values = self.K((z - xx) / self.h)
W = K_values / np.sum(K_values, axis=0)
return W.T @ self.Y
def est_risk(self):
xx = np.repeat(self.x.reshape(-1, 1), len(self.x), axis=1)
        K_values = self.K((xx - self.x) / self.h)
K_values_sum = K_values.sum(axis=1)
W = K_values / K_values_sum
R = W.T @ self.Y
terms = ((self.Y - R) / (1 - self.K(0) / K_values_sum)) ** 2
if np.isnan(terms).any():
return np.inf
return np.sum(terms)
def plot(self, confidence_bands=False, **kwargs):
a = min(self.x)
b = max(self.x)
t = np.linspace(a, b, 1000)
plt.plot(t, self(t), **kwargs)
if confidence_bands:
omega = 3 * self.h # effective width of kernel
m = (b - a) / omega
alpha = 0.05
            q = norm.ppf((1 + (1 - alpha) ** (1 / m)) / 2)
Y_sorted_by_x = np.array([z[1] for z in sorted(zip(self.x, self.Y), key=lambda x: x[0])])
sigma_2_hat = (1 / (2 * (self.n - 1))) * np.sum(np.power(np.diff(Y_sorted_by_x), 2))
sigma_hat = np.sqrt(sigma_2_hat)
xx = np.repeat(self.x.reshape(-1, 1), len(t), axis=1)
K_values = self.K((t - xx) / self.h)
K_values_sum = K_values.sum(axis=0)
W = K_values / K_values_sum
se = sigma_hat * np.sqrt(np.sum(np.power(W, 2), axis=0))
plt.plot(t, self(t) + q * se, color='orange')
plt.plot(t, self(t) - q * se, color='orange')
h_vals = np.logspace(-4, 0, 100)
r_hats = []
for h in h_vals:
r_hat = NWRegressor(aluminum_content, refractive_indices, h=h)
r_hats.append(r_hat.est_risk())
h_star = h_vals[np.argmin(r_hats)]
fig = plt.figure()
plt.scatter(h_star, r_hats[np.argmin(r_hats)], s=50, color='red', label=f"$h^* =${h_star:.3f}")
plt.plot(h_vals, r_hats, linewidth=2)
plt.xlabel("h")
plt.ylabel("Estimated Risk (up to a constant)")
plt.legend()
plt.show()
r_hat = NWRegressor(aluminum_content, refractive_indices, h=h_star)
fig = plt.figure()
plt.scatter(aluminum_content, refractive_indices, alpha=0.5, label="data")
r_hat.plot(confidence_bands=True, color='red', linewidth=2, label=f"Optimal Nadaraya-Watson Estimator (h={r_hat.h:.3f})")
plt.xlabel("Aluminum Content")
plt.ylabel("Refractive Index")
plt.legend()
plt.show()
Prove Lemma 20.1:
The risk can be written as
$$R(g, \hat{g}) = \mathbb{E}\left(L(g, \hat{g})\right) = \int b^2(x) \, dx + \int v(x) \, dx$$where
$$b(x) = \mathbb{E}(\hat{g}_n(x)) - g(x)$$is the bias of $\hat{g}_n(x)$ at a fixed $x$ and
$$v(x) = \mathbb{V}(\hat{g}_n(x)) = \mathbb{E}\left[ \left(\hat{g}_n(x) - \mathbb{E}[\hat{g}_n(x)]\right)^2 \right]$$is the variance of $\hat{g}_n(x)$ at a fixed $x$.
Solution:
\begin{align*} R(g, \hat{g}) &= \mathbb{E} \left[\int \left( g(x) - \hat{g}(x) \right)^2 \, dx \right] \\ &= \int \mathbb{E}\left[ \left( g(x) - \hat{g}(x) \right)^2 \right] \, dx \\ &= \int \left(\mathbb{E}\left[g(x) - \hat{g}(x) \right]\right)^2 + \mathbb{V}[g(x) - \hat{g}(x)] \, dx \tag{$E(X^2) = V(X) + E(X)^2$} \\ &= \int \left(g(x) - \mathbb{E}[\hat{g}(x)] \right)^2 + \mathbb{V}[\hat{g}(x)] \, dx \tag{$\mathbb{V}(-X + c) = \mathbb{V}(X)$} \\ &= \int b^2(x) \, dx + \int v(x) \, dx \end{align*}Prove Theorem 20.3:
Consider fixed $x$ and fixed $m$, and let $B_j$ be the bin containing $x$. Then,
$$\mathbb{E}(\hat{f}_n(x)) = \frac{p_j}{h} \text{ and } \mathbb{V}(\hat{f}_n(x)) = \frac{p_j(1-p_j)}{nh^2}$$Solution:
Note that $\nu_j$, the count of observations in $B_j$, is distributed as $\text{Binom}(n, p_j)$, and thus, $\mathbb{E}[\nu_j] = np_j$, and $\mathbb{V}[\nu_j] = np_j(1-p_j)$. Therefore,
\begin{align*} \mathbb{E}[\hat{f}_n(x)] &= \mathbb{E}[\hat{p_j}/h] \\ &= \frac{\mathbb{E}[\nu_j]}{nh} \\ &= p_j / h \end{align*}and
\begin{align*} \mathbb{V}[\hat{f}_n(x)] &= \mathbb{V}[\hat{p_j}/h] \\ &= \frac{\mathbb{V}[\nu_j]}{(nh)^2} \\ &= \frac{p_j(1-p_j)}{nh^2}. \end{align*}Prove Theorem 20.7:
The following identity holds:
$$\hat{J}(h) = \frac{2}{(n-1)h} - \frac{n+1}{(n - 1)h} \sum_{j=1}^m \hat{p}_j^2.$$Solution: Let $\nu_{j(i)} = \nu_j$ when $X_i \in B_j$. Observe that $\sum_{i=1}^n \nu_{j(i)} = \sum_{j=1}^m \nu_j^2$. \begin{align*} \hat{J}(h) &= \int \left(\hat{f}_n(x)\right)^2 \, dx - \frac{2}{n} \sum_{i=1}^n \hat{f}_{(-i)}(X_i) \\ &= \sum_{j=1}^m \int_{B_j} (\hat{p}_j / h)^2 \, dx - \frac{2}{n} \sum_{i=1}^n \frac{\nu_{j(i)} - 1}{(n-1)h} \\ &= \sum_{j=1}^m \frac{\nu_j^2}{n^2 h} + \frac{2}{(n-1)h} - \frac{2}{n(n-1)h}\sum_{i=1}^n \nu_{j(i)} \\ &= \frac{2}{(n-1)h} + \left[\frac{1}{h} - \frac{2n^2}{n(n-1)h} \right]\sum_{j=1}^m \frac{\nu_{j}^2}{n^2} \\ &= \frac{2}{(n-1)h} - \frac{n+1}{(n-1)h}\sum_{j=1}^m \hat{p}_j^2. \end{align*}
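As a sanity check of this identity (not part of the exercise), here is a minimal sketch on synthetic data comparing a direct leave-one-out computation of $\hat{J}(h)$ with the closed-form expression:
rng = np.random.default_rng(1)
X = rng.normal(size=200)            # assumption: synthetic data, not the glass data
m = 20
counts, edges = np.histogram(X, bins=m)
n, h = len(X), edges[1] - edges[0]
p_hat = counts / n
# Direct computation: integral of f_hat^2 minus (2/n) * sum_i f_hat_{-i}(X_i).
bin_idx = np.minimum(np.searchsorted(edges, X, side="right") - 1, m - 1)
loo = (counts[bin_idx] - 1) / ((n - 1) * h)       # f_hat_{-i}(X_i)
J_direct = np.sum(p_hat ** 2) / h - (2 / n) * loo.sum()
# Closed-form expression from Theorem 20.7.
J_formula = 2 / ((n - 1) * h) - ((n + 1) / ((n - 1) * h)) * np.sum(p_hat ** 2)
print(J_direct, J_formula)          # should agree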
Prove Theorem 20.15:
For any $h > 0$,
$$\mathbb{E}\left[\hat{J}(h)\right] = \mathbb{E}[J(h)].$$Also,
$$\hat{J}(h) \approx \frac{1}{hn^2}\sum_i \sum_j K^* \left( \frac{X_i - X_j}{h}\right) + \frac{2}{nh}K(0)$$where $K^*(x) = K^{(2)}(x) - 2K(x)$ and $K^{(2)}(z) = \int K(z-y)K(y)\,dy$. In particular, if $K$ is a $N(0,1)$ Gaussian kernel then $K^{(2)}(z)$ is the $N(0,2)$ density.
Solution:
First, we demonstrate $\mathbb{E}\left[\hat{J}(h)\right] = \mathbb{E}[J(h)]$. Recalling:
$$J(h) = \int \hat{f}_n^2(x) \, dx - 2 \int \hat{f}_n(x) f(x) \, dx$$and
$$\hat{J}(h) = \int \hat{f}^2(x) \, dx - \frac{2}{n} \sum_{i=1}^n \hat{f}_{-i}(X_i),$$we may observe that it suffices to show
$$\mathbb{E}\left[\int \hat{f}_n(x) f(x) \, dx \right] = \mathbb{E}\left[ \frac{1}{n} \sum_{i=1}^n \hat{f}_{-i}(X_i) \right].$$Observing:
$$\mathbb{E}\left[\hat{f}_{-i}(X_i) \mid X_i\right] = \int \frac{1}{h} K\left( \frac{X_i - u}{h}\right) f(u)\, du,$$we have
$$ \begin{align*} \mathbb{E}\left[\frac{1}{n}\sum_{i=1}^n\hat{f}_{-i}(X_i)\right] &= \mathbb{E}\left[\int \frac{1}{nh}\sum_{i=1}^n K\left( \frac{X_i - u}{h}\right) f(u)\, du \right] \\ &= \mathbb{E}\left[\int \hat{f}_n(u) f(u) \, du\right]. \end{align*} $$We now derive the formula. Assume the kernel is even.
For the first term, substituting $u = (x - X_j)/h$ and using the evenness of $K$,
\begin{align*} \int \hat{f}_n^2(x) \, dx &= \frac{1}{n^2h^2} \sum_{i}\sum_{j} \int K\left(\frac{x - X_i}{h}\right) K\left(\frac{x - X_j}{h}\right) dx \\ &= \frac{1}{n^2h} \sum_{i}\sum_{j} \int K\left(u - \frac{X_i - X_j}{h}\right) K(u) \, du \\ &= \frac{1}{n^2h} \sum_{i}\sum_{j} K^{(2)}\left(\frac{X_i - X_j}{h}\right). \tag{1} \end{align*}For the second term, approximating $n - 1 \approx n$ and then adding and subtracting the diagonal ($i = j$) terms,
\begin{align*} -\frac{2}{n}\sum_{i=1}^n \hat{f}_{-i}(X_i) &= -\frac{2}{n(n-1)h} \sum_{i}\sum_{j \ne i} K\left(\frac{X_i - X_j}{h}\right) \\ &\approx -\frac{2}{n^2 h} \sum_{i}\sum_{j} K\left(\frac{X_i - X_j}{h}\right) + \frac{2}{nh}K(0). \tag{2} \end{align*}The formula follows from summing (1) and (2). Suppose $K$ is a $N(0,1)$ Gaussian kernel. Then,
\begin{align*} K^{(2)}(z) &= \int K(z-y)K(y) \, dy \\ &= \frac{1}{2\pi} \int e^{-(z-y)^2 / 2} e^{-y^2 / 2} \, dy \\ &= \frac{1}{2\pi} \int e^{-z^2 / 2 + zy - y^2 + (z^2 / 4 - z^2 / 4)} \, dy \tag{completing the square} \\ &= \frac{1}{2\pi} e^{-z^2 / 4} \int e^{-(y - z/2)^2} \, dy \\ &= \frac{1}{2\pi} e^{-z^2 / 4} (\sqrt{\pi}) \\ &= \frac{1}{\sqrt{2\pi\cdot 2}} e^{-\frac{1}{2}(z^2/2)}, \end{align*}so $K^{(2)}(z)$ is the $N(0,2)$ Gaussian kernel.
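A quick numerical check of this last claim (not part of the exercise), comparing the convolution integral with the $N(0,2)$ density at a few points:
from scipy.integrate import quad
for z in (0.0, 0.7, 1.5, 3.0):
    # K^(2)(z) = int K(z - y) K(y) dy, computed by quadrature
    K2_z, _ = quad(lambda y: norm.pdf(z - y) * norm.pdf(y), -np.inf, np.inf)
    print(z, K2_z, norm.pdf(z, scale=np.sqrt(2)))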
Consider regression data $(x_1, Y_1), \dots (x_n, Y_n)$. Suppose that $0 \le x_i \le 1$ for all $i$. Define bins $B_j$ as in equation (20.7). For $x \in B_j$ define
$$\hat{r}_n(x) = \bar{Y}_j$$where $\bar{Y}_j$ is the mean of the $Y_i$ corresponding to those $x_i$'s in $B_j$. Find the approximate risk of this estimator. From this expression for the risk, find the optimal bandwidth. At what rate does the risk go to zero?
Solution:
The following will be similar to the derivation of Theorem 20.4, which presents the risk as a function of $f$, $n$, and $h$ for a histogram estimator.
We have the bias-variance tradeoff:
\begin{align*} R(r, \hat{r}_n) &= \int \mathbb{V}\left[\hat{r}_n(x)\right] + \left(b(x)\right)^2 \, dx, \end{align*}where $b(x) = \mathbb{E}[\hat{r}_n(x)] - r(x)$. We first compute the bias. Let $x \in B_j$. Observe that $\mathbb{E}[\hat{r}(x)] \approx \frac{1}{h} \int_{B_j} r(u) \, du$ (assuming the $x_i$ are roughly evenly spread within the bin), and that for $u \in B_j$
$$r(u) \approx r(x) + (u - x)r'(x).$$Therefore,
\begin{align*} \mathbb{E}[\hat{r}(x)] &\approx \frac{1}{h} \int_{B_j} r(x) + (u - x)r'(x) \, du \\ &= r(x) + r'(x)\left(h\left(j - \frac{1}{2}\right) - x \right) \end{align*}Thus,
$$b(x) \approx r'(x)\left(h\left(j - \frac{1}{2}\right) - x \right)$$If $\tilde{x}_j$ is the center of the bin,
$$\int_{B_j} b^2(x) \, dx \approx (r'(\tilde{x}_j))^2 \frac{h^3}{12},$$meaning
\begin{align*} \int_0^1 b^2(x)\, dx &= \sum_{j=1}^m \int_{B_j} b^2(x)\,dx \\ &\approx \frac{h^2}{12} \sum_{j=1}^m h(r'(\tilde{x}_j))^2 \\ &\approx \frac{h^2}{12} \int_0^1 (r'(x))^2 \, dx. \end{align*}Assume that observations are evenly distributed in $[0,1]$. Observe that $\mathbb{V}[\hat{r}(x)] = \mathbb{V}[\bar{Y}_j] = \frac{1}{nh}\mathbb{V}[Y_j] = \frac{\sigma^2}{nh}$, and therefore
$$\int_0^1 \mathbb{V}[\hat{r}(x)] \, dx = \frac{\sigma^2}{nh}.$$Altogether,
$$R(r, \hat{r}_n) = \frac{h^2}{12} \int_0^1 (r'(x))^2 \, dx + \frac{\sigma^2}{nh}, \tag{1}$$which is minimized by
$$h^* = \left( \frac{6\sigma^2}{n \int_0^1 (r'(x))^2 \, dx} \right)^{1/3}.$$Plugging $h^*$ into (1), we see that the risk goes to zero at rate $n^{-2/3}$.
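For completeness, the minimizer follows by differentiating (1) with respect to $h$ and setting the derivative to zero:
$$\frac{\partial}{\partial h}\left[\frac{h^2}{12} \int_0^1 (r'(x))^2 \, dx + \frac{\sigma^2}{nh}\right] = \frac{h}{6} \int_0^1 (r'(x))^2 \, dx - \frac{\sigma^2}{nh^2} = 0 \iff h^3 = \frac{6\sigma^2}{n\int_0^1 (r'(x))^2 \, dx},$$and since $h^* \propto n^{-1/3}$, both terms of (1) are of order $n^{-2/3}$.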
Show that with suitable smoothness assumptions on $r(x)$, $\hat{\sigma}^2$ in equation (20.36) is a consistent estimator of $\sigma^2$.
Solution:
Recall
$$\hat{\sigma}^2 = \frac{1}{2(n-1)}\sum_{i=1}^{n-1}(Y_{i+1} - Y_i)^2.$$Since $X_n \xrightarrow{qm} X \Rightarrow X_n \xrightarrow{p} X$, it suffices to show convergence in quadratic mean. Thus, by Chapter 5, #2, it suffices to show $\mathbb{V}[\hat{\sigma}^2] \to 0$ and $\mathbb{E}[\hat{\sigma}^2] \to \sigma^2$.
First showing $\mathbb{E}[\hat{\sigma}^2] \to \sigma^2$. Observe:
\begin{align*} Y_{i+1} - Y_i &= \left[r(x_{i+1}) + \epsilon_{i+1}\right] - \left[r(x_i) + \epsilon_i \right] \\ &= \left[r(x_{i+1}) - r(x_i)\right] + \left[\epsilon_{i+1} - \epsilon_i \right], \end{align*}so $\mathbb{V}[Y_{i+1} - Y_i] = \mathbb{V}[\epsilon_{i+1} - \epsilon_i] = 2\sigma^2$, since the $x_j$ are fixed. Suppose $r$ is Lipschitz continuous with constant $L$; that is, $|r(x_{i+1}) - r(x_i)| \le L|x_{i+1} - x_i|$. Observe:
$$0 \le [r(x_{i+1}) - r(x_i)]^2 \le L^2(x_{i+1} - x_i)^2$$Therefore, \begin{align*} \mathbb{E}[(Y_{i+1} - Y_i)^2] &= \mathbb{V}[Y_{i+1} - Y_i] + \left(\mathbb{E}[Y_{i+1} - Y_i]\right)^2 \\ &= 2\sigma^2 +[r(x_{i+1}) - r(x_i)]^2 \in [2\sigma^2, 2\sigma^2 + L^2(x_{i+1} - x_i)^2] \end{align*}
and thus
\begin{align*} \mathbb{E}[\hat{\sigma}^2] &= \frac{1}{2(n-1)}\sum_{i=1}^{n-1}\mathbb{E}\left[(Y_{i+1} - Y_i)^2\right] \\ &\in \left[\sigma^2, \; \sigma^2 + \frac{L^2}{2(n-1)}\sum_{i=1}^{n-1}(x_{i+1} - x_i)^2\right]. \end{align*}Assume the domain of the $x$ values is a finite interval of width $M$ and that the design points are evenly spaced, so that $x_{i+1} - x_i = M/(n-1)$. Then $\sum_{i=1}^{n-1}(x_{i+1} - x_i)^2 = \sum_{i=1}^{n-1}(M/(n-1))^2 = M^2 / (n-1)$. Therefore,
$$0 \le \frac{L^2}{2(n-1)}\sum_{i=1}^{n-1}(x_{i+1} - x_i)^2 \le \frac{(LM)^2}{2(n-1)^2} \to 0$$and thus
$$\mathbb{E}[\hat{\sigma}^2] \to \sigma^2.$$We now show $\mathbb{V}[\hat{\sigma}^2] \to 0$. Observe
\begin{align*} \mathbb{V}[(Y_{i+1} - Y_i)^2] &= \mathbb{E}[(Y_{i+1} - Y_i)^4] - \mathbb{E}[(Y_{i+1} - Y_i)^2]^2. \end{align*}Denoting $r_i = r(x_i)$, we can expand the quantity inside the first expectation:
\begin{align*} (Y_{i+1} - Y_i)^4 &= (r_{i+1} + \epsilon_{i+1} - r_i - \epsilon_i)^4 \\ &= (r_{i+1} - r_i)^4 + 4(r_{i+1} - r_i)^3(\epsilon_{i+1} - \epsilon_i) + 6(r_{i+1} - r_i)^2(\epsilon_{i+1} - \epsilon_i)^2 + 4(r_{i+1} - r_i)(\epsilon_{i+1} - \epsilon_i)^3 + (\epsilon_{i+1} - \epsilon_i)^4 \end{align*}Note that, assuming Gaussian errors (so that $\epsilon_{i+1} - \epsilon_i \sim N(0, 2\sigma^2)$),
$$\mathbb{E}[\epsilon_{i+1} - \epsilon_i] = \mathbb{E}[(\epsilon_{i+1} - \epsilon_i)^3] = 0, \quad \mathbb{E}[(\epsilon_{i+1} - \epsilon_i)^2] = 2\sigma^2, \quad \mathbb{E}[(\epsilon_{i+1} - \epsilon_i)^4] = 3(2\sigma^2)^2 = 12\sigma^4.$$Hence,
\begin{align*} \mathbb{E}[(Y_{i+1} - Y_i)^4] = (r_{i+1} - r_i)^4 + 12\sigma^2(r_{i+1} - r_i)^2 + 12\sigma^4. \end{align*}Combining this with $\mathbb{E}[(Y_{i+1} - Y_i)^2] = (r_{i+1} - r_i)^2 + 2\sigma^2$,
\begin{align*} \mathbb{V}[(Y_{i+1} - Y_i)^2] &= \left[(r_{i+1} - r_i)^4 + 12\sigma^2(r_{i+1} - r_i)^2 + 12\sigma^4\right] - \left[(r_{i+1} - r_i)^2 + 2\sigma^2\right]^2 \\ &= 8\sigma^4 + 8\sigma^2(r_{i+1} - r_i)^2. \end{align*}Recalling the Lipschitz continuity of $r$, we have $0 \le (r_{i+1} - r_i)^2 \le L^2(x_{i+1} - x_i)^2$. So, we may bound the variance of $(Y_{i+1} - Y_i)^2$:
\begin{align*} 8\sigma^4 \le \mathbb{V}[(Y_{i+1} - Y_i)^2] \le 8\sigma^4 + 8\sigma^2L^2(x_{i+1} - x_i)^2. \end{align*}Writing $Z_i = (Y_{i+1} - Y_i)^2$, the $Z_i$ are not independent, but $Z_i$ shares an $\epsilon$ term only with its immediate neighbors, and $|\mathrm{Cov}(Z_i, Z_{i+1})| \le \sqrt{\mathbb{V}[Z_i]\mathbb{V}[Z_{i+1}]} \le \frac12\left(\mathbb{V}[Z_i] + \mathbb{V}[Z_{i+1}]\right)$, so
\begin{align*} 0 \le \mathbb{V}[\hat{\sigma}^2] &= \frac{1}{4(n-1)^2}\mathbb{V}\left[\sum_{i=1}^{n-1}Z_i\right] \\ &\le \frac{3}{4(n-1)^2}\sum_{i=1}^{n-1}\mathbb{V}[Z_i] \\ &\le \frac{3}{4(n-1)^2}\left[8(n-1)\sigma^4 + 8\sigma^2L^2\sum_{i=1}^{n-1}(x_{i+1} - x_i)^2\right] \\ &= \frac{6\sigma^4}{n-1} + \frac{6\sigma^2(LM)^2}{(n-1)^3} \to 0. \end{align*}In summary, we have shown that with a finite domain, evenly spaced design points, Gaussian errors, and Lipschitz continuity of the true regression function, $\hat{\sigma}^2$ is a consistent estimator.
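A minimal simulation (not part of the exercise) illustrating the consistency, assuming $r(x) = \sin(2\pi x)$ on $[0, 1]$, evenly spaced design points, and Gaussian errors:
rng = np.random.default_rng(2)
sigma = 0.3                           # assumption: true error standard deviation
for n in (50, 500, 5000):
    x = np.linspace(0, 1, n)
    Y = np.sin(2 * np.pi * x) + rng.normal(scale=sigma, size=n)
    sigma2_hat = np.sum(np.diff(Y) ** 2) / (2 * (n - 1))
    print(n, sigma2_hat, sigma ** 2)  # estimate vs. true sigma^2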
Prove Theorem 20.22:
$\hat{J}$ can be written as
$$\hat{J}(h) = \sum_{i=1}^n (Y_i - \hat{r}(x_i))^2 \frac{1}{\left(1 - \frac{K(0)}{\sum_{j=1}^n K\left(\frac{x_i - x_j}{h}\right)} \right)^2}$$Solution:
Recall:
\begin{align*} \hat{J}(h) = \sum_{i=1}^n (Y_i - \hat{r}_{-i}(x_i))^2 \end{align*}We can express the evaluation at $x_i$ of the estimator trained without observation $i$ in the following way:
\begin{align*} \hat{r}_{-i}(x_i) &= \sum_{j=1, j\ne i}^n \frac{K((x_i - x_j)/h)}{\sum_{k=1, k\ne i}^n K((x_i - x_k)/h)}Y_j \\ &= \sum_{j=1, j\ne i}^n \frac{K((x_i - x_j)/h)}{\sum_{k=1}^n K((x_i - x_k)/h) - K(0)}Y_j \\ &= \frac{1}{\sum_{k=1}^n K((x_i - x_k)/h) - K(0)} \left[ \sum_{j=1}^n K((x_i - x_j)/h)Y_j - K(0)Y_i \right] \\ &= \frac{1}{1 - \frac{K(0)}{\sum_{k=1}^n K((x_i - x_k)/h)}}\left[\sum_{j=1}^n \frac{K((x_i - x_j)/h)Y_j}{\sum_{k=1}^n K((x_i - x_k)/h)} - \frac{K(0)Y_i}{\sum_{k=1}^n K((x_i - x_k)/h)} \right] \\ &= \frac{1}{1 - \alpha}\left[\hat{r}(x_i) - \alpha Y_i \right] \tag{$\alpha = \frac{K(0)}{\sum_{k=1}^n K((x_i - x_k)/h)}$} \\\ \end{align*}Therefore,
\begin{align*} Y_i - \hat{r}_{-i}(x_i) &= Y_i - \frac{1}{1-\alpha}[\hat{r}(x_i) - \alpha Y_i] \\ &= \frac{(1-\alpha)Y_i - \hat{r}(x_i) + \alpha Y_i}{1-\alpha} \\ &= (Y_i - \hat{r}(x_i))\frac{1}{1-\alpha}, \end{align*}and, finally:
\begin{align*} \sum_{i=1}^n (Y_i - \hat{r}_{-i}(x_i))^2 &= \sum_{i=1}^n (Y_i - \hat{r}(x_i))^2\frac{1}{(1-\alpha)^2} \\ &= \sum_{i=1}^n (Y_i - \hat{r}(x_i))^2 \frac{1}{\left(1 - \frac{K(0)}{\sum_{j=1}^n K\left(\frac{x_i - x_j}{h}\right)} \right)^2}. \end{align*}
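As a numerical check of this identity (not part of the exercise), here is a minimal sketch on synthetic data comparing a direct leave-one-out computation with the shortcut formula:
rng = np.random.default_rng(3)
n, h = 100, 0.2                      # assumption: arbitrary illustrative values
x = rng.uniform(0, 1, size=n)
Y = np.sin(2 * np.pi * x) + rng.normal(scale=0.1, size=n)
K = norm.pdf
K_mat = K((x[:, None] - x[None, :]) / h)     # K((x_i - x_j) / h)
row_sums = K_mat.sum(axis=1)
r_hat = (K_mat @ Y) / row_sums               # Nadaraya-Watson fit at each x_i
# Direct leave-one-out residual sum of squares.
J_direct = 0.0
for i in range(n):
    w = np.delete(K_mat[i], i)
    J_direct += (Y[i] - w @ np.delete(Y, i) / w.sum()) ** 2
# Shortcut formula from Theorem 20.22.
J_formula = np.sum((Y - r_hat) ** 2 / (1 - K(0) / row_sums) ** 2)
print(J_direct, J_formula)                   # should agree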