import numpy as np
import pandas as pd
import math
from numpy.random import default_rng
rng = default_rng(42)
from scipy.stats import norm
import matplotlib as mpl
from matplotlib import pyplot as plt
import plotly.graph_objects as go
from tqdm import tqdm

# Use an 18pt default font for every matplotlib figure drawn below.
mpl.rcParams.update({'font.size': 18})


# Plot the empirical CDF of a standard-normal sample together with the true
# CDF and a 1 - alpha confidence band.
n = 100 # number of observations
m = 1 # number of trials

# Sorted sample: the i-th entry is the i-th order statistic.
X = np.sort(rng.normal(size=n))

# Half-width of the Dvoretzky-Kiefer-Wolfowitz band,
# eps = sqrt(log(2 / alpha) / (2 n)).
alpha = 0.05
eps = math.sqrt((1 / (2 * n)) * math.log(2 / alpha))

# The empirical CDF is right-continuous, so at the i-th order statistic it
# equals i / n (i = 1..n). Building it from integers also avoids the
# length ambiguity of np.arange with a floating-point step.
ecdf = np.arange(1, n + 1) / n

fig = go.Figure()
fig.add_trace(go.Scatter(x=X, y=norm.cdf(X), name='True CDF', marker={'color': 'black'}))
fig.add_trace(go.Scatter(x=X, y=ecdf, name='Empirical Distribution Function'))
# The band is clipped to [0, 1] because a CDF cannot leave that range.
fig.add_trace(go.Scatter(x=X, y=np.minimum(ecdf + eps, 1), name=f'{100*(1-alpha)}% UB'))
fig.add_trace(go.Scatter(x=X, y=np.maximum(ecdf - eps, 0), name=f'{100*(1-alpha)}% LB'))
fig.update_layout(
    title="Empirical CDF with Confidence Bands",
    xaxis_title=r"x")
fig.show()


# Monte Carlo estimate of how often the confidence band (half-width eps)
# contains the true standard-normal CDF everywhere.
contained_count = 0
m = 1000
ranks = np.arange(1, n + 1)
for _ in range(m):
    X = rng.normal(size=n)
    # True CDF evaluated at the order statistics of this sample.
    F_true = norm.cdf(np.sort(X))
    # The empirical CDF is a step function and the true CDF is continuous and
    # non-decreasing, so it suffices to check the band at the jumps:
    #  - the upper band is tightest just before the i-th jump, where the
    #    empirical CDF is still (i - 1) / n;
    #  - the lower band is tightest right at the i-th jump, where the
    #    empirical CDF has already reached i / n.
    above = np.all(np.minimum((ranks - 1) / n + eps, 1) >= F_true)
    below = np.all(np.maximum(ranks / n - eps, 0) <= F_true)
    contained_count += int(above and below)
print(f'The confidence band contains the true distribution in {100 * contained_count / m}% of the iterations')

The confidence band contains the true distribution in 96.6% of the iterations


# Plot the empirical CDF of a standard-Cauchy sample together with the true
# CDF and a 1 - alpha confidence band.
from scipy.stats import cauchy  # the sample is Cauchy, so its CDF is the reference

n = 100 # number of observations
m = 1 # number of trials

# Sorted sample: the i-th entry is the i-th order statistic.
X = np.sort(rng.standard_cauchy(size=n))

# Half-width of the Dvoretzky-Kiefer-Wolfowitz band,
# eps = sqrt(log(2 / alpha) / (2 n)).
alpha = 0.05
eps = math.sqrt((1 / (2 * n)) * math.log(2 / alpha))

# The empirical CDF is right-continuous, so at the i-th order statistic it
# equals i / n (i = 1..n).
ecdf = np.arange(1, n + 1) / n

fig = go.Figure()
# The data are drawn from a standard Cauchy, so the true CDF is cauchy.cdf.
fig.add_trace(go.Scatter(x=X, y=cauchy.cdf(X), name='True CDF', marker={'color': 'black'}))
fig.add_trace(go.Scatter(x=X, y=ecdf, name='Empirical Distribution Function'))
# The band is clipped to [0, 1] because a CDF cannot leave that range.
fig.add_trace(go.Scatter(x=X, y=np.minimum(ecdf + eps, 1), name=f'{100*(1-alpha)}% UB'))
fig.add_trace(go.Scatter(x=X, y=np.maximum(ecdf - eps, 0), name=f'{100*(1-alpha)}% LB'))
fig.update_layout(
    title="Empirical CDF with Confidence Bands",
    xaxis_title=r"x")
fig.show()


# Monte Carlo estimate of how often the confidence band (half-width eps)
# contains the true standard-Cauchy CDF everywhere.
from scipy.stats import cauchy  # the samples are Cauchy, so compare against its CDF

contained_count = 0
m = 1000
ranks = np.arange(1, n + 1)
for _ in range(m):
    X = rng.standard_cauchy(size=n)
    # True CDF of the sampling distribution at the order statistics.
    F_true = cauchy.cdf(np.sort(X))
    # The empirical CDF is a step function and the true CDF is continuous and
    # non-decreasing, so it suffices to check the band at the jumps:
    #  - the upper band is tightest just before the i-th jump, where the
    #    empirical CDF is still (i - 1) / n;
    #  - the lower band is tightest right at the i-th jump, where the
    #    empirical CDF has already reached i / n.
    above = np.all(np.minimum((ranks - 1) / n + eps, 1) >= F_true)
    below = np.all(np.maximum(ranks / n - eps, 0) <= F_true)
    contained_count += int(above and below)
print(f'The confidence band contains the true distribution in {100 * contained_count / m}% of the iterations')

The confidence band contains the true distribution in 21.8% of the iterations


# Empirical CDF of the Fiji earthquake magnitudes with a 95% confidence band.
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
df = pd.read_csv('data/fijiquakes.txt', sep=r'\s+')
magnitudes = np.sort(df['mag'].to_numpy())
n = magnitudes.size
alpha = 0.05
# Half-width of the band: eps = sqrt(log(2 / alpha) / (2 n)).
eps = math.sqrt((1 / (2 * n)) * math.log(2 / alpha))


def F_hat(x):
    """Return the empirical CDF at x: the fraction of magnitudes <= x.

    The comparison is non-strict, so observations tied with x are counted.
    The sample size is read from the array itself, so the function keeps
    working if the global n is later reused for another data set.
    """
    return np.count_nonzero(magnitudes <= x) / magnitudes.size


def U(x):
    """Return the upper confidence band at x, clipped to at most 1."""
    return np.minimum(F_hat(x) + eps, 1)


def L(x):
    """Return the lower confidence band at x, clipped to at least 0."""
    return np.maximum(F_hat(x) - eps, 0)


# magnitudes is already sorted, so it can be used directly as the x-axis.
plt.plot(magnitudes, np.array([F_hat(x) for x in magnitudes]), label='Empirical CDF')
plt.plot(magnitudes, np.array([U(x) for x in magnitudes]), label='UB')
plt.plot(magnitudes, np.array([L(x) for x in magnitudes]), label='LB')
plt.legend()
plt.grid()
plt.show()


# Normal-approximation confidence interval for the difference of the
# empirical CDF F_hat evaluated at 4.9 and at 4.3.
z = norm.ppf(1 - alpha / 2)
theta_hat = F_hat(4.9) - F_hat(4.3)
# Standard error of a proportion estimated from n observations.
se_hat = math.sqrt(theta_hat * (1 - theta_hat) / n)
half_width = z * se_hat

print(f"95% confidence interval: ({theta_hat - half_width:.3f}, {theta_hat + half_width:.3f})")

95% confidence interval: (0.526, 0.588)


# Mean, standard error, 90% confidence interval and median of the Old Faithful
# waiting times.
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
df = pd.read_csv('data/faithful.txt', skiprows=25, sep=r'\s+')
waiting_times = df['waiting'].to_numpy()
n = waiting_times.size
mu_hat = np.mean(waiting_times)
# Standard error of the mean, using the sample standard deviation (ddof=1).
se_hat = np.std(waiting_times, ddof=1) / np.sqrt(n)
print(f"Estimated mean: {mu_hat:.3f}")
print(f"Estimated standard error: {se_hat:.3f}")
# Two-sided normal quantile for a 1 - alpha = 90% interval.
alpha = 0.1
z = norm.ppf(1-alpha/2)
print(f"90% confidence interval for mean: ({mu_hat - z * se_hat:.3f},{mu_hat + z * se_hat:.3f})")
print(f"Estimated median: {np.median(waiting_times):.3f}")

Estimated mean: 70.897
Estimated standard error: 0.824
90% confidence interval for mean: (69.541,72.253)
Estimated median: 76.000


# Difference of two estimated proportions with 80% and 95% normal-approximation
# confidence intervals. The factor 1 / 100 is the per-group sample size.
p_1_hat = 0.9
p_2_hat = 0.85
theta_hat = p_1_hat - p_2_hat

# Bernoulli variance p (1 - p) of each group.
var_1 = p_1_hat * (1 - p_1_hat)
var_2 = p_2_hat * (1 - p_2_hat)
se_hat = math.sqrt((1 / 100) * (var_1 + var_2))

print(f"Estimate: {theta_hat:.3f}")
print(f"Estimated standard error: {se_hat:.3f}")

# 80% interval.
alpha = 0.2
z = norm.ppf(1 - alpha / 2)
margin = z * se_hat
print(f"80% confidence interval: ({theta_hat - margin:.3f},{theta_hat + margin:.3f})")

# 95% interval.
alpha = 0.05
z = norm.ppf(1 - alpha / 2)
margin = z * se_hat
print(f"95% confidence interval: ({theta_hat - margin:.3f},{theta_hat + margin:.3f})")

Estimate: 0.050
Estimated standard error: 0.047
80% confidence interval: (-0.010,0.110)
95% confidence interval: (-0.041,0.141)


# Difference in mean precipitation between seeded and unseeded clouds, with a
# 95% normal-approximation confidence interval.
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
df = pd.read_csv('data/clouds.txt', skiprows=29, sep=r'\s+')
X_1 = df['Unseeded_Clouds'].to_numpy()
X_2 = df['Seeded_Clouds'].to_numpy()

mu_1_hat = np.mean(X_1)
mu_2_hat = np.mean(X_2)
theta_hat = mu_2_hat - mu_1_hat
print(f"The estimated difference in mean precipiation: {theta_hat:.3f}")

n = X_1.size
sigma_1 = np.std(X_1, ddof=1)
sigma_2 = np.std(X_2, ddof=1)
# Standard error of a difference of independent sample means. Each variance
# is divided by its own sample size, so the formula stays valid if the two
# groups have different sizes.
se_hat = math.sqrt(sigma_1 ** 2 / X_1.size + sigma_2 ** 2 / X_2.size)
print(f"The estimated standard error: {se_hat:.3f}")

alpha = 0.05
z = norm.ppf(1 - alpha / 2)
print(f"95% confidence interval for mean: ({theta_hat - z * se_hat:.3f},{theta_hat + z * se_hat:.3f})")

The estimated difference in mean precipiation: 277.396
The estimated standard error: 138.820
95% confidence interval for mean: (5.314,549.478)

1¶

2¶

3¶

4¶

5¶

6¶

7¶

8¶

9¶

10¶