import math
from functools import lru_cache

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import chi2, norm
from scipy.optimize import fsolve

plt.style.use("default.mplstyle")


data = {"Black Victim": {"Death Sentence": 14, "No Death Sentence": 641}, 
        "White Victim": {"Death Sentence": 62, "No Death Sentence": 594}}
data = pd.DataFrame(data).transpose()
X = data.values


T = 0
for i in [0, 1]:
    for j in [0, 1]:
        T += X[i,j] * math.log((X[i, j] * sum(sum(X))) / (sum(X[i,:]) * sum(X[:, j])))
T *= 2

z = chi2.ppf(1 - 0.05, df=1)
print(f"95% quantile for chi-squared distribution with df=1: {z:3f}")
print(f"Likelihood ratio test statistic: {T:.3f}")
print(f"p-value: {1 - chi2.cdf(T, df=1):.12f}")

95% quantile for chi-squared distribution with df=1: 3.841459
Likelihood ratio test statistic: 34.534
p-value: 0.000000004190


n = sum(sum(X))
U = 0

for i in [0, 1]:
    for j in [0, 1]:
        E = sum(X[i, :]) * sum(X[:, j]) / n
        U += ((X[i,j] - E) ** 2) / E

print(f"Pearson's chi^2 test statistic: {U:.3f}")
print(f"p-value: {1 - chi2.cdf(U, df=1):.12f}")

Pearson's chi^2 test statistic: 32.104
p-value: 0.000000014616


psi_hat = (X[0, 0] * X[1, 1]) / (X[0, 1] * X[1, 0])
gamma_hat = math.log(psi_hat)
print(f"psi_hat: {psi_hat:.3f}")
print(f"gamma_hat: {gamma_hat:.3f}")

# estimated standard error of gamma_hat

se_gamma_hat = math.sqrt((1 / X[0, 0]) + (1 / X[0, 1]) + (1 / X[1, 0]) + (1 / X[1, 1]))
print(f"se_gamma_hat: {se_gamma_hat:.3f}")
W = gamma_hat / se_gamma_hat
p_value = 1- norm.cdf(np.abs(W))
print(f"p-value: {p_value:.9f}")

z = norm.ppf(1 - 0.05 / 2)

print(f"95% C.I. for gamma: ({gamma_hat - z * se_gamma_hat:.3f}, {gamma_hat + z * se_gamma_hat:.3f})")
print(f"95% C.I. for psi: ({np.exp(gamma_hat - z * se_gamma_hat):.3f}, {np.exp(gamma_hat + z * se_gamma_hat):.3f})")

psi_hat: 0.209
gamma_hat: -1.564
se_gamma_hat: 0.301
p-value: 0.000000105
95% C.I. for gamma: (-2.155, -0.974)
95% C.I. for psi: (0.116, 0.378)


!curl -s --insecure https://www.stat.cmu.edu/~larry/all-of-statistics/=data/montana.dat --output data/montana.dat
data = pd.read_csv(
    "data/montana.dat",
    skiprows=33,
    sep="\s+",
    names="AGE SEX INC POL AREA FIN STAT".split(),
    usecols=["AGE", "FIN"],
).query("AGE != '*' & FIN != '*'")
data.head()


X = data.groupby(["AGE", "FIN"]).size().values.reshape((3,3))
I, J = X.shape

# computing the likelihood ratio test statistic

T = 0
for i in range(I):
    for j in range(J):
        ratio = (X[i, j] * np.sum(X)) / (np.sum(X[i, :]) * np.sum(X[:, j]))
        T += X[i, j] * math.log(ratio)
T *= 2
print(f"T: {T:.3f}")
nu = (I - 1) * (J - 1)
print(f"p-value: {1 - chi2.cdf(T, df=nu):.6f}")

T: 22.064
p-value: 0.000195


# Computing Pearson's chi^2 test statistic

U = 0
for i in range(I):
    for j in range(J):
        E = np.sum(X[i, :]) * np.sum(X[:, j]) / n
        U += ((X[i,j] - E) ** 2) / E

print(f"U: {U:.3f}")
nu = (3 - 1) * (3 - 1)
print(f"p-value: {1 - chi2.cdf(U, df=nu):.6f}")

U: 1060.653
p-value: 0.000000


!curl -s --insecure https://www.stat.cmu.edu/~larry/all-of-statistics/=data/temp.dat --output data/temp.dat
data = pd.read_csv("data/temp.dat", 
                   skiprows=31,
                   sep="\t")
data.head()


# computing the sample correlation using the nonparametric plug-in estimator
Y = data["JanTemp"].values
Z = data["Lat"].values 
X = np.stack([Y, Z], axis=1)
Y_hat = np.mean(Y)
Z_hat = np.mean(Z)
n = len(Y)
s_Y = np.sqrt(np.power(np.linalg.norm(Y - Y_hat), 2) / (n - 1))
s_Z = np.sqrt(np.power(np.linalg.norm(Z - Z_hat), 2) / (n - 1))
rho_hat = np.sum((Y - Y_hat) * (Z - Z_hat)) / ((n-1) * s_Y * s_Z)
print(f"Sample correlation: {rho_hat:.3f}")

Sample correlation: -0.848


def get_rho_hat(X):
    mu_hat = np.mean(X, axis=0)
    s_1 = np.sqrt(np.power(np.linalg.norm(X[:, 0] - mu_hat[0]), 2) / (n - 1))
    s_2 = np.sqrt(np.power(np.linalg.norm(X[:, 1] - mu_hat[1]), 2) / (n - 1))
    sample_cov = np.sum((X[:, 0] - mu_hat[0]) * (X[:, 1] - mu_hat[1]))
    return sample_cov / ((n - 1) * s_1 * s_2)

def get_bootstrap_ci(X):
    rho_hat = get_rho_hat(X)
    B = 1000
    T = np.empty(B)
    for i in range(B):
        X_star = X[np.random.choice(X.shape[0], size=n, replace=True)]
        T[i] = get_rho_hat(X_star)
    se_boot = np.sqrt(np.var(T))
    lb = rho_hat - z * se_boot
    ub = rho_hat + z * se_boot
    return se_boot, (lb, ub)

z = norm.ppf(1 - 0.05 / 2)
se_boot, ci_boot = get_bootstrap_ci(X)
print(f"Bootstrap 95% CI: ({ci_boot[0]:.3f}, {ci_boot[1]:.3f})")

def get_fishers_method_ci(X):
    rho_hat = get_rho_hat(X)
    theta_hat = (1 / 2) * (math.log(1 + rho_hat) - math.log(1 - rho_hat))
    se_theta_hat = 1 / math.sqrt(n - 3)
    a = theta_hat - z * se_theta_hat
    b = theta_hat + z * se_theta_hat
    lb = (math.exp(2 * a) - 1) / (math.exp(2 * a) + 1)
    ub = (math.exp(2 * b) - 1) / (math.exp(2 * b) + 1)
    return (lb, ub)

ci_fisher = get_fishers_method_ci(X)
print(f"Fisher's Method 95% CI: ({ci_fisher[0]:.3f}, {ci_fisher[1]:.3f})")

Bootstrap 95% CI: (-0.965, -0.732)
Fisher's Method 95% CI: (-0.908, -0.753)


# Wald test using the standard error from the bootstrap
W = rho_hat / se_boot
print("p-value: ", norm.cdf(-2 * abs(W)))

p-value:  2.2416470099624988e-179


!curl -s --insecure https://www.stat.cmu.edu/~larry/all-of-statistics/=data/calcium.dat --output data/calcium.dat
data = pd.read_csv("data/calcium.dat",
                   skiprows=32,
                   sep="\t",
                   usecols=["Treatment", "Decrease"])
data


Y = data['Treatment'].map({'Placebo': 1, 'Calcium': 2}).values
Z = data['Decrease'].values

n_1 = sum(Y == 1)
n_2 = sum(Y == 2)

def F_1(z):
    return (1 / n_1) * np.sum((Z < z) * (Y == 1))

def F_2(z):
    return (1 / n_2) * np.sum((Z < z) * (Y == 2))

zs = np.arange(-10, 20, .1)
plt.plot(zs, [F_1(z) for z in zs], label='F_1(z)')
plt.plot(zs, [F_2(z) for z in zs], label='F_2(z)')
plt.legend()
plt.title("CDFs")
plt.show()


D = max([abs(F_1(z) - F_2(z)) for z in zs])
test_statistic = math.sqrt((n_1 * n_2) / (n_1 + n_2)) * D
print(f"D: {D:.3f}")
print(f"Test statistic: {test_statistic:.3f}")


# approximating H(t) to a specified tolerance
def H(t, xtol=1e-8):
    j_max = int(np.ceil(np.sqrt(- np.log(xtol / 2) / (2 * t * t) )))
    jj = np.arange(1, j_max+1)
    return 1 + 2 * sum((-1) ** (jj) * np.exp(-2 * (jj ** 2) * t * t))

# computing H^{-1}(x)
@lru_cache(maxsize=None)
def H_inv(x):
    return fsolve(lambda t: H(t) - x, x0=1)[0]

D: 0.409
Test statistic: 0.936


T = np.logspace(-3, 0.25, 100)
plt.plot(T, list(map(H, T)))
plt.xlabel("t")
plt.ylabel("H(t)")

Text(0, 0.5, 'H(t)')


X = np.linspace(0.01, 0.99, 99)
plt.plot(X, list(map(H_inv, X)))
plt.xlabel("$1-a$")
plt.ylabel("$H^{-1}(1-a)$")
plt.hlines(y=test_statistic, xmin=min(X), xmax=max(X), color='orange', label="Test statistic")
plt.title("$H^{-1}(1-a)$")
plt.legend()
plt.show()

	City	JanTemp	Lat	Long
0	Mobile, AL	44	31.2	88.5
1	Montgomery, AL	38	32.9	86.8
2	Phoenix, AZ	35	33.6	112.5
3	Little Rock, AR	31	35.4	92.8
4	Los Angeles, CA	47	34.3	118.7

1¶

2¶

3¶

4¶

5¶

6¶

7¶

	Treatment	Decrease
0	Calcium	7
1	Calcium	-4
2	Calcium	18
3	Calcium	17
4	Calcium	-3
5	Calcium	-5
6	Calcium	1
7	Calcium	10
8	Calcium	11
9	Calcium	-2
10	Placebo	-1
11	Placebo	12
12	Placebo	-1
13	Placebo	-3
14	Placebo	3
15	Placebo	-5
16	Placebo	5
17	Placebo	2
18	Placebo	-11
19	Placebo	-1
20	Placebo	-3

	Death Sentence	No Death Sentence
Black Victim	14	641
White Victim	62	594

	AGE	FIN
0	3	2
1	2	3
2	1	2
3	3	1
4	3	2