In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
from numpy import log
from numpy.linalg import det, eigh, inv
import pandas as pd
from scipy.optimize._linesearch import LineSearchWarning
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis as sk_LDA,
    QuadraticDiscriminantAnalysis as sk_QDA,
)
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression as sk_LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


pd.set_option("display.precision", 3)
plt.style.use("default.mplstyle")

1¶

Prove Theorem 22.5:

The Bayes rule is optimal, that is, if $h$ is any other classification rule then $L(h^*) \le L(h)$.

Solution:

We may express the conditional misclassification probability of a classifier in terms of the regression function:

$$ \begin{align*} P(h(x) \ne Y \mid X = x) &= P(h(x) = 1, Y = 0 \mid X = x) + P(h(x) = 0, Y = 1 \mid X = x) \\ &= \mathbb{1}_{h(x) = 1} P(Y = 0 \mid X = x) + \mathbb{1}_{h(x) = 0} P(Y = 1 \mid X = x) \\ &= (1 - \mathbb{1}_{h(x) = 0}) (1 - r(x)) + \mathbb{1}_{h(x) = 0}r(x) \\ &= 1 - \mathbb{1}_{h(x) = 0} + (2 \cdot \mathbb{1}_{h(x) = 0} - 1)r(x). \end{align*} $$

Thus, the difference in conditional misclassification probabilities between a generic classifier $h$ and the Bayes classifier $h^*$ can be expressed as:

$$ \begin{align*} P(h(x) \ne Y \mid X = x) - P(h^*(X) \ne Y \mid X = x) &= (1 - 2r(x))( \mathbb{1}_{h^*(x) = 0} - \mathbb{1}_{h(x) = 0}). \end{align*} $$

Since $h^*(x) = 1 \iff r(x) > 1/2$, the RHS is nonnegative. To see this, consider the cases:

  • If $h(x) = h^*(x)$, then the second factor is 0.
  • If $h(x) \ne h^*(x)$ and $h^*(x) = 0$, then $r(x) \le 1/2$ and $h(x) = 1$, and thus both factors are nonnegative.
  • If $h(x) \ne h^*(x)$ and $h^*(x) = 1$, then $r(x) > 1/2$ and $h(x) = 0$, and thus both factors are strictly negative and their product is positive.

Therefore,

$$ \begin{align*} L(h) - L(h^*) &= P(h(X) \ne Y) - P(h^*(X) \ne Y) \\ &= \int P(h(x) \ne Y \mid X = x) - P(h^*(X) \ne Y \mid X = x) \, dP(x) \\ &\ge 0 \tag{integrand is nonnegative} \end{align*} $$
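
As a quick numerical sanity check (not part of the proof; the regression function below is made up), simulate data with a known $r(x)$ and compare the empirical error of the Bayes rule with that of an arbitrary competing threshold rule:

# Sketch: with r(x) known, the Bayes rule 1{r(x) > 1/2} should have no higher
# empirical error than any other rule (here, an arbitrary shifted threshold).
rng = np.random.default_rng(0)
x_sim = rng.uniform(-3, 3, size=200_000)
r_sim = 1 / (1 + np.exp(-2 * x_sim))  # assumed regression function P(Y=1 | X=x)
y_sim = rng.binomial(1, r_sim)
bayes_rule = (r_sim > 0.5).astype(int)  # equivalently x_sim > 0
other_rule = (x_sim > 1.0).astype(int)  # a competing (suboptimal) threshold
print(np.mean(bayes_rule != y_sim), np.mean(other_rule != y_sim))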

2¶

Prove Theorem 22.7:

If $X \mid Y = 0 \sim N(\mu_0, \Sigma_0)$ and $X \mid Y = 1 \sim N(\mu_1, \Sigma_1)$, then the Bayes rule is

$$ h^*(x) = \begin{cases} 1 & \text{if } r_1^2 < r_0^2 + 2 \log\left( \frac{\pi_1}{\pi_0} \right) + \log \left( \frac{|\Sigma_0|}{|\Sigma_1|}\right) \\ 0 & \text{otherwise} \end{cases} $$

where

$$r_i^2 = (x - \mu_i)^T \Sigma_i^{-1}(x - \mu_i), \,\,i = 0, 1$$

is the (squared) Mahalanobis distance. An equivalent way of expressing the Bayes rule is

$$h^*(x) = \argmax_k \delta_k(x)$$

where

$$\delta_k(x) = -\frac{1}{2} \log |\Sigma_k| - \frac{1}{2} (x - \mu_k)^T \Sigma_k^{-1}(x - \mu_k) + \log \pi_k$$

and $|A|$ denotes the determinant of a matrix $A$.

Solution:

From Theorem 22.6, the Bayes rule is

\begin{align*} h(x) &= \argmax_k P(Y = k \mid X = x) \\ &= \argmax_k \pi_k f_k(x) \end{align*}

where

$$P(Y = k \mid X = x) = \frac{f_k(x) \pi_k}{\sum_r f_r(x) \pi_r},$$

$\pi_r = P(Y = r)$ and $f_r(x) = f(x \mid Y = r)$.

In our case,

\begin{align*} \pi_k f_k(x) &= \frac{\pi_k}{(2 \pi)^{d/2} |\Sigma_k|^{1/2}}\exp \left\{ -\frac{1}{2} (x - \mu_k)^T \Sigma_k^{-1} (x - \mu_k) \right\} \\ &= \frac{\pi_k}{(2 \pi)^{d/2} |\Sigma_k|^{1/2}}\exp \left\{ -\frac{1}{2} r_k^2 \right\} \\ \end{align*}

Note that the maximizer (over $k$) of $\pi_k f_k(x)$ also maximizes $\delta_k(x) \equiv \log(\pi_k f_k(x)) + (d/2) \log(2 \pi)$, which is:

\begin{align*} \delta_k(x) &= \log(\pi_k) - (1/2)\log(|\Sigma_k|) - (1/2)r^2_k \end{align*}

If $h^*(x) = 1$, it must be the case that $\argmax_k \delta_k(x) = 1$ (taking the argmax to break ties in favor of class 0), and thus

\begin{align*} \delta_0(x) &< \delta_1(x) \\ \Rightarrow \log(\pi_0) - (1/2)\log(|\Sigma_0|) - (1/2)r^2_0 &< \log(\pi_1) - (1/2)\log(|\Sigma_1|) - (1/2)r^2_1 \\ \Rightarrow r_1^2 &< r_0^2 + 2 \log\left( \frac{\pi_1}{\pi_0} \right) + \log \left( \frac{|\Sigma_0|}{|\Sigma_1|}\right). \end{align*}
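
As a small numerical check (with made-up Gaussian parameters, not from the text), the $\delta_k$ comparison and the Mahalanobis-distance inequality give identical decisions:

# Sketch: verify on synthetic 2-D Gaussians that argmax_k delta_k(x) agrees with
# the threshold form of the Bayes rule derived above.
rng = np.random.default_rng(1)
mu = {0: np.array([0.0, 0.0]), 1: np.array([1.0, 1.0])}
Sigma = {0: np.array([[1.0, 0.3], [0.3, 1.0]]), 1: np.array([[2.0, -0.4], [-0.4, 1.0]])}
prior = {0: 0.6, 1: 0.4}
x_test = rng.normal(size=(5, 2))

def r_squared(x, k):
    # squared Mahalanobis distance to class k
    d = x - mu[k]
    return np.einsum("ij,jk,ik->i", d, inv(Sigma[k]), d)

def delta(x, k):
    return -0.5 * log(det(Sigma[k])) - 0.5 * r_squared(x, k) + log(prior[k])

rule_delta = (delta(x_test, 1) > delta(x_test, 0)).astype(int)
rule_threshold = (
    r_squared(x_test, 1)
    < r_squared(x_test, 0)
    + 2 * log(prior[1] / prior[0])
    + log(det(Sigma[0]) / det(Sigma[1]))
).astype(int)
print(np.all(rule_delta == rule_threshold))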

3¶

Download the spam data from:

http://www-stat.stanford.edu/~tibs/ElemStatLearn/index.html

The data file can also be found on the course web page. The data contain 57 covariates relating to email messages. Each email message was classified as spam ($Y = 1$) or not spam ($Y = 0$). The outcome $Y$ is the last column in the file. The goal is to predict whether an email is spam or not.

(a) Construct classification rules using (i) LDA, (ii) QDA, (iii) logistic regression, and (iv) a classification tree. For each, report the observed misclassification error rate and construct a 2-by-2 table of the form

          $\hat{h}(x) = 0$    $\hat{h}(x) = 1$
$Y = 0$          ??                  ??
$Y = 1$          ??                  ??

(b) Use 5-fold cross validation to estimate the prediction accuracy of LDA and logistic regression.

(c) Sometimes it helps to reduce the number of covariates. One strategy is to compare $X_i$ for the spam and email group. For each of the 57 covariates, test whether the mean of the covariate is the same or different between the two groups. Keep the 10 covariates with the smallest $p$-values. Try LDA and logistic regression using only these 10 variables.

In [2]:
! curl -s --insecure https://www.stat.cmu.edu/~larry/all-of-statistics/=data/spam.dat --output data/spam.dat

def get_spam_data():
    # 57 covariates in the first 57 columns; the last column is the spam label.
    spam_df = pd.read_csv("data/spam.dat", sep=r"\s+", header=None)
    X = spam_df[[x for x in range(57)]].values  # 4601x57
    y = spam_df[57].values  # 4601x1
    return X, y

def display_results(y_pred, y_true):
    # confusion_matrix(y_pred, y_true) places predicted labels on the rows
    # and true labels on the columns.
    print(
        pd.DataFrame(
            confusion_matrix(y_pred, y_true),
            index=["Predicted Negative", "Predicted Positive"],
            columns=["True Negative", "True Positive"],
        )
    )
    print(
        f"\nMisclassification rate: {100 * (1 - accuracy_score(y_true, y_pred)):.2f}%"
    )
In [3]:
class DiscriminantAnalysis:
    """LDA/QDA via the discriminant functions delta_k derived in problem 2."""

    def __init__(self, kind="linear"):
        self.type = kind

    def fit(self, X, y):
        N = len(X)
        class_indices = {k: np.flatnonzero(y == k) for k in [0, 1]}
        self.X_ = {k: X[class_indices[k]] for k in [0, 1]}
        self.n = {k: len(class_indices[k]) for k in [0, 1]}
        self.pi = {k: self.n[k] / N for k in [0, 1]}  # class priors
        self.mu = {k: np.mean(self.X_[k], axis=0) for k in [0, 1]}  # class means
        self.S = {k: np.cov(self.X_[k], rowvar=False) for k in [0, 1]}  # class covariances
        # Eigendecomposition S_k = U_k D_k U_k^T, used for the Mahalanobis distances.
        self.U = {}
        self.D = {}
        for k in [0, 1]:
            d, self.U[k] = eigh(self.S[k])
            self.D[k] = np.diag(d)
        if self.type == "linear":
            # Pooled (common) covariance estimate for LDA.
            self.S_common = (self.n[0] * self.S[0] + self.n[1] * self.S[1]) / (
                self.n[0] + self.n[1]
            )

    def _mahalanobis_distance_squared(self, x, k):
        diff = (x - self.mu[k]) @ self.U[k]
        return np.sum(np.dot(diff, inv(self.D[k])) * diff, axis=1)

    def _quadratic_delta(self, k, x):
        result = (-1 / 2) * log(det(self.D[k]))
        result += (-1 / 2) * self._mahalanobis_distance_squared(x, k)
        result += log(self.pi[k])
        return result

    def _linear_delta(self, k, x):
        # Terms that do not depend on k are dropped, since only differences matter.
        S_common_inv = inv(self.S_common)
        result = np.sum(np.dot(x, S_common_inv) * self.mu[k], axis=1)
        result += -(1 / 2) * self.mu[k].T @ S_common_inv @ self.mu[k]
        result += log(self.pi[k])
        return result

    def predict(self, x):
        if self.type == "linear":
            return np.where(
                self._linear_delta(0, x) < self._linear_delta(1, x), 1, 0
            )
        else:
            return np.where(
                self._quadratic_delta(0, x) < self._quadratic_delta(1, x), 1, 0
            )

X, y = get_spam_data()
results = {}
lda = DiscriminantAnalysis()
lda.fit(X, y)
y_pred = lda.predict(X)
results["LDA"] = accuracy_score(y, y_pred)
print("LDA")
display_results(y_pred, y)

qda = DiscriminantAnalysis(kind="quadratic")
qda.fit(X, y)
y_pred = qda.predict(X)
results["QDA"] = accuracy_score(y, y_pred)
print("\n\nQDA")
display_results(y_pred, y)
LDA
                    True Negative  True Positive
Predicted Negative           2663            387
Predicted Positive            125           1426

Misclassification rate: 11.13%


QDA
                    True Negative  True Positive
Predicted Negative           2101             82
Predicted Positive            687           1731

Misclassification rate: 16.71%
In [4]:
clf = sk_LDA()
clf.fit(X, y)
y_pred = clf.predict(X)
print("\n\nLDA (scikit learn)")
display_results(y_pred, y)

clf = sk_QDA()
clf.fit(X, y)
y_pred = clf.predict(X)
print("\n\nQDA (scikit learn)")
display_results(y_pred, y)

LDA (scikit learn)
                    True Negative  True Positive
Predicted Negative           2663            387
Predicted Positive            125           1426

Misclassification rate: 11.13%


QDA (scikit learn)
                    True Negative  True Positive
Predicted Negative           2101             82
Predicted Positive            687           1731

Misclassification rate: 16.71%
In [5]:
class LogisticRegression(BaseEstimator, ClassifierMixin):
    """Logistic regression fit by iteratively reweighted least squares (Newton's method)."""

    def __init__(self, tol=1e-4):
        self.tol = tol

    def fit(self, X, y):
        beta_hat = {}
        p = {}
        s = 0
        X_aug = self._add_intercept(X)
        beta_hat[0] = np.zeros(X_aug.shape[1])
        k = 0
        # At most 5 Newton steps; stop early once the update is smaller than tol.
        while k < 5:
            p[s] = self._sigmoid(X_aug @ beta_hat[s])
            # Adjusted response and weights for the weighted least squares step.
            Z = self._logit(p[s]) + (y - p[s]) / (p[s] * (1 - p[s]))
            W = np.diag(p[s] * (1 - p[s]))
            s += 1
            beta_hat[s] = (
                np.linalg.inv(X_aug.T @ W @ X_aug) @ X_aug.T @ W @ Z
            )
            if np.linalg.norm(beta_hat[s] - beta_hat[s - 1]) < self.tol:
                break
            k = k + 1
        self.beta_hat_final = beta_hat[s]
        return self

    def predict(self, X):
        return np.round(self.predict_proba(X))

    def predict_proba(self, X):
        return self._sigmoid(self._add_intercept(X) @ self.beta_hat_final)

    def _add_intercept(self, X):
        return np.concatenate([X, np.ones((X.shape[0], 1))], axis=1)

    def _sigmoid(self, x):
        # The 1e-6 keeps p strictly below 1 so that logit(p) and 1/(p(1-p)) stay finite.
        return 1 / (1 + np.exp(-x) + 1e-6)

    def _logit(self, x):
        return np.log(x / (1 - x))

log_reg = LogisticRegression()
log_reg.fit(X, y)
y_pred = log_reg.predict(X)
print("Logistic Regression")
display_results(y_pred, y)
Logistic Regression
                    True Negative  True Positive
Predicted Negative           2660            201
Predicted Positive            128           1612

Misclassification rate: 7.15%
In [6]:
log_reg = sk_LogisticRegression(
    penalty="none",
    max_iter=1000,
    tol=1e-4,
    fit_intercept=True,
    solver="lbfgs",
)
log_reg.fit(X, y)
y_pred = log_reg.predict(X)
print("Logistic Regression (scikit-learn)")
display_results(y_pred, y)
Logistic Regression (scikit-learn)
                    True Negative  True Positive
Predicted Negative           2664            197
Predicted Positive            124           1616

Misclassification rate: 6.98%
/home/eric-maria/eric/self_study/homl/.venv/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:444: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [7]:
tree = DecisionTreeClassifier()
tree.fit(X, y)
y_pred = tree.predict(X)
print("Decision Tree")
display_results(y_pred, y)
Decision Tree
                    True Negative  True Positive
Predicted Negative           2788              3
Predicted Positive              0           1810

Misclassification rate: 0.07%

The decision tree is overfitting. The average out-of-sample misclassification rate is much higher:

In [8]:
misclassification_rates = cross_val_score(
    tree,
    X,
    y,
    scoring=make_scorer(
        lambda y_true, y_pred: 1 - accuracy_score(y_true, y_pred)
    ),
)
print(
    f"Cross-validated misclassification rate: {100 * np.mean(misclassification_rates):.2f}%"
)
Cross-validated misclassification rate: 11.91%
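
A common remedy (a side sketch, not in the original; the max_depth value is arbitrary) is to limit the complexity of the tree and cross-validate the result:

# Sketch: a depth-limited tree overfits less than a fully grown one.
pruned_tree = DecisionTreeClassifier(max_depth=6)
print(
    f"Cross-validated accuracy (max_depth=6): {np.mean(cross_val_score(pruned_tree, X, y)):.3f}"
)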

Obtaining cross validation results for Logistic Regression and LDA, we observe logistic regression performs slightly better:

In [9]:
_original_showwarning = warnings.showwarning


def custom_warning_handler(
    message, category, filename, lineno, file=None, line=None
):
    # Suppress line-search and convergence warnings; pass everything else on to
    # the original handler (saved above to avoid infinite recursion).
    if isinstance(message, (LineSearchWarning, ConvergenceWarning)):
        pass
    else:
        _original_showwarning(message, category, filename, lineno, file, line)


warnings.showwarning = custom_warning_handler
lda = sk_LDA()
log_reg = sk_LogisticRegression()
print("5-Fold CV Accuracy Metrics:\n")
for name, estimator in zip(["LDA", "Logistic Regression"], [lda, log_reg]):
    score = cross_val_score(estimator, X, y, scoring="accuracy")
    print(f"{name} : {np.mean(score):.3f}")
5-Fold CV Accuracy Metrics:

LDA : 0.880
Logistic Regression : 0.901

Running CV again, restricting covariates to the top 10 (as measured by significance of class mean differences):

In [10]:
from scipy.stats import ttest_ind

# Two-sample t-test of equal means for each covariate; keep the 10 columns
# with the smallest p-values.
_, p_values = ttest_ind(X[np.flatnonzero(y == 0)], X[np.flatnonzero(y == 1)])
selected_cols = np.argpartition(p_values, 10)[:10]
X_sub = X[:, selected_cols]

print("5-Fold CV Accuracy Metrics:\n")
for name, estimator in zip(["LDA", "Logistic Regression"], [lda, log_reg]):
    score = cross_val_score(estimator, X_sub, y, scoring="accuracy")
    print(f"{name} : {np.mean(score):.3f}")
5-Fold CV Accuracy Metrics:

LDA : 0.832
Logistic Regression : 0.865

In this case, dropping variables does not improve performance.

4¶

Let $\mathcal{A}$ be the set of two-dimensional spheres. That is, $A \in \mathcal{A}$ if $A = \{(x,y) : (x - a)^2 + (y - b)^2 \le c^2\}$ for some $a, b, c$. Find the VC-dimension of $\mathcal{A}$.

Solution:

The VC dimension is 3. Three points in general position (say, the vertices of a triangle) can be shattered, since each of the eight subsets can be picked out by a suitable disk (verified numerically at the end of this solution), so the VC dimension is at least 3. To see that no set of 4 points can be shattered, we show that for any configuration of 4 points there is a subset that is impossible to pick out using a member of $\mathcal{A}$. Denote the 4 points $A$, $B$, $C$ and $D$.

Suppose the 4 points are not in convex position. Then one of them lies in the triangle formed by the remaining three (or on a segment joining two of them); WLOG let this point be $A$. Since disks are convex, any member of $\mathcal{A}$ which contains $B$, $C$, and $D$ necessarily contains $A$. Therefore, it is impossible to pick out the set $\{B, C, D\}$.

Suppose now that the 4 points are in convex position, and suppose for contradiction that the set can be shattered. Since the four interior angles of the quadrilateral sum to $360°$, we may label the points so that $A$ and $B$ are opposite (diagonal) vertices whose interior angles sum to at most $180°$; the angles at $C$ and $D$ then sum to at least $180°$. Observe that no disk can pick out exactly $\{A, B\}$: if a circle contained $A$ and $B$ while excluding $C$ and $D$, then $C$ and $D$, lying outside the circle on opposite sides of the segment $AB$, would have interior angles $\angle ACB$ and $\angle ADB$ summing to strictly less than $180°$, a contradiction.

Alternative proof: Suppose again that the 4 points are in convex position and that the set can be shattered, with $A$ and $B$ opposite vertices of the quadrilateral. Take disks $D_1$ picking out $\{A, B\}$ and $D_2$ picking out $\{C, D\}$. A point $P$ lies in a disk with center $O$ and radius $c$ exactly when $\|P - O\|^2 - c^2 \le 0$; the difference of these two quadratic functions for $D_1$ and $D_2$ is affine in $P$, so the set where they are equal is a line. Since $A, B \in D_1 \setminus D_2$ and $C, D \in D_2 \setminus D_1$, this line strictly separates $\{A, B\}$ from $\{C, D\}$. But the diagonals $AB$ and $CD$ of a convex quadrilateral intersect, so no line can separate these pairs, a contradiction.
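
To back up the lower bound (that three points can be shattered), here is a small check on the vertices of an equilateral triangle, with disks chosen by hand for each of the eight subsets:

# Sketch: exhibit a disk that picks out each subset of three triangle vertices.
pts = np.array([[0.0, 0.0], [1.0, 0.0], [0.5, np.sqrt(3) / 2]])

def in_disk(p, center, radius):
    return np.linalg.norm(p - center) <= radius

disks = {  # subset (as sorted tuple of indices) -> (center, radius)
    (): (np.array([10.0, 10.0]), 0.1),
    (0,): (pts[0], 0.3),
    (1,): (pts[1], 0.3),
    (2,): (pts[2], 0.3),
    (0, 1): ((pts[0] + pts[1]) / 2, 0.6),
    (0, 2): ((pts[0] + pts[2]) / 2, 0.6),
    (1, 2): ((pts[1] + pts[2]) / 2, 0.6),
    (0, 1, 2): (np.array([0.5, np.sqrt(3) / 6]), 1.0),
}
for subset, (center, radius) in disks.items():
    picked = tuple(i for i, p in enumerate(pts) if in_disk(p, center, radius))
    assert picked == subset, (subset, picked)
print("all 8 subsets picked out")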

5¶

Classify the spam data using support vector machines. Free software for the support vector machine is at http://svmlight.joachims.org/

In [11]:
svc = SVC()
print(
    f"Cross validation accuracy score : {np.mean(cross_val_score(svc, X, y)):.3f}"
)
print(
    f"Accuracy (in-sample): {accuracy_score(y, svc.fit(X,y).predict(X)):.3f}"
)
Cross validation accuracy score : 0.705
Accuracy (in-sample): 0.727

SVM with the default RBF kernel doesn't perform as well as Logistic Regression or LDA, likely because the covariates are not standardized and SVC is sensitive to feature scales (see the sketch after the next cell). Restricting covariates doesn't help:

In [12]:
svc = SVC()
print(
    f"Cross validation accuracy score : {np.mean(cross_val_score(svc, X_sub, y)):.3f}"
)
print(
    f"Accuracy (in-sample): {accuracy_score(y, svc.fit(X_sub,y).predict(X_sub)):.3f}"
)
Cross validation accuracy score : 0.690
Accuracy (in-sample): 0.693
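
Because SVC is scale-sensitive, a natural follow-up (not part of the original run, so no score is reported) is to standardize the covariates in a pipeline before fitting the SVM:

# Sketch: standardize features before the RBF SVM; StandardScaler and
# make_pipeline are standard scikit-learn utilities.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scaled_svc = make_pipeline(StandardScaler(), SVC())
print(
    f"Cross validation accuracy score : {np.mean(cross_val_score(scaled_svc, X, y)):.3f}"
)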

6¶

Use VC theory to get a confidence interval on the true error rate of the LDA classifier for the iris data (from the book web site).

Solution: Note that LDA is a linear classifier. Thus, by Theorem 22.26, the $1 - \alpha$ confidence interval for the true error rate is $\hat{L}(\hat{h}) \pm \epsilon$ where

$$\epsilon_n^2 = \frac{32}{n} \log \left( \frac{8(n^{d+1} + 1)}{\alpha}\right).$$

In the case of the iris data, $n=150$ and $d=4$. Therefore,

$$ \begin{align*} \epsilon_n^2 &= \frac{32}{150} \log \left( \frac{8\left((150)^{4 + 1} + 1\right)}{0.05}\right) \\ &\approx 6.427 \Rightarrow \epsilon_n \approx 2.535. \end{align*} $$

Meanwhile, $\hat{L} \approx 0.02$. Since $\epsilon_n \approx 2.54$ is larger than 1, the interval $\hat{L} \pm \epsilon_n$ covers all of $[0, 1]$: with only $n = 150$ observations the VC bound is vacuous.

In [13]:
np.round(np.sqrt((32 / 150) * np.log((8 * ((150**5) + 1)) / 0.05)), 4)
Out[13]:
2.5352
In [14]:
!curl -s --insecure https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data --output data/iris.data

def get_iris_data():
    iris_df = pd.read_csv("data/iris.data", header=None, names=["sepal_length", "sepal_width", "petal_length", "petal_width", "class"])
    y = iris_df["class"].values
    X = iris_df[["sepal_length", "sepal_width", "petal_length", "petal_width"]].values
    return X, y
X, y = get_iris_data()
lda = sk_LDA()
lda.fit(X, y)
print(f"Error rate (in sample): {(1 - accuracy_score(y, lda.predict(X))):.3f}")
Error rate (in sample): 0.020

7¶

Suppose that $X_i \in \mathbb{R}$ and that $Y_i = 1$ whenever $|X_i| \le 1$ and $Y_i = 0$ whenever $|X_i| > 1$. Show that no linear classifier can perfectly classify these data. Show that the kernelized data $Z_i = (X_i, X_i^2)$ can be linearly separated.

Solution:

A linear classifier in one dimension is a threshold rule: it predicts one class on one side of a cut-off and the other class on the other side. Since $Y = 1$ exactly on the interval $[-1, 1]$ and $Y = 0$ on both sides of it, no single threshold can separate the classes.

Now consider the kernelized data. Let $u$ and $v$ denote the coordinates of the two-dimensional feature space. Every $Z_i$ lies on the parabola $v = u^2$. Moreover, if $Y_i = 1$ (i.e. $|X_i| \le 1$), then $Z_i$ lies at or below $v = 1$, and if $Y_i = 0$ (i.e. $|X_i| > 1$), $Z_i$ lies strictly above $v = 1$. Thus, the line $v = 1$ linearly separates the two classes.

In [15]:
u = np.random.random(size=2**6) * 4 - 2
# By the problem statement, |u| <= 1  =>  Y = 1 and |u| > 1  =>  Y = 0.
index_lists = [np.flatnonzero(u**2 <= 1), np.flatnonzero(u**2 > 1)]
colors = ["blue", "orange"]
labels = ["$Y=1$", "$Y=0$"]
for i, index_list in enumerate(index_lists):
    plt.scatter(
        u[index_list],
        np.power(u, 2)[index_list],
        color=colors[i],
        label=labels[i],
    )
plt.plot([-2, 2], [1, 1], color="green", linewidth=3, label="v = 1")
plt.xlabel("$u$")
plt.ylabel("$v$")
plt.legend()
plt.show()

8¶

Repeat question 5 using the kernel $K(x, \tilde{x}) = (1 + x^T\tilde{x})^p$. Choose $p$ by cross-validation.

Solution: This is facilitated by scikit-learn's implementation of the support vector machine, which allows the polynomial kernel to be specified via kernel="poly" and degree (see also the note after the results below).

In [16]:
from sklearn.svm import SVC

X, y = get_spam_data()
p_values = np.arange(0, 4, 1, dtype=int)
cv_results = {}
for p in p_values:
    svc = SVC(kernel="poly", degree=p)
    score = np.mean(cross_val_score(svc, X, y))
    cv_results[p] = score
    print(f"p = {p}:\tscore = {score:.3f}")
p = 0:	score = 0.606
p = 1:	score = 0.701
p = 2:	score = 0.669
p = 3:	score = 0.649

The best choice for $p$ appears to be $1$.
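
Note that scikit-learn's polynomial kernel has the form $(\gamma x^T\tilde{x} + c_0)^p$ with defaults $\gamma =$ "scale" and $c_0 = 0$, so the runs above do not match the exercise's kernel $(1 + x^T\tilde{x})^p$ exactly. A hedged sketch (parameter grid chosen arbitrarily, not run here) that matches the kernel and tunes $p$ and $C$ jointly:

# Sketch: gamma=1, coef0=1 makes the poly kernel equal to (1 + x^T x~)^p.
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(
    SVC(kernel="poly", gamma=1, coef0=1),
    param_grid={"degree": [1, 2, 3], "C": [0.1, 1, 10]},
    cv=5,
)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)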

9¶

Apply the $k$ nearest neighbors classifier to the "iris data". Choose $k$ by cross-validation.

In [17]:
X, y = get_iris_data()
k_values = np.arange(1, 20, 1, dtype=np.int32)
cv_result = {}
for k in k_values:
    clf = KNeighborsClassifier(n_neighbors=k)
    cv_result[k] = np.mean(cross_val_score(clf, X, y))

plt.plot(cv_result.keys(), cv_result.values())
plt.xticks(k_values)
plt.grid()
plt.xlabel("$k$")
plt.ylabel("Accuracy (5-fold CV)")
plt.title("KNN on Iris Data")
plt.show()

Based on this I would choose $k = 9$.
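
The same choice can also be read off programmatically from the cv_result dictionary above (a small addition):

# k with the highest mean CV accuracy (ties resolved by iteration order).
best_k = max(cv_result, key=cv_result.get)
print(f"best k = {best_k}, CV accuracy = {cv_result[best_k]:.3f}")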

10¶

(Curse of Dimensionality.) Suppose that $X$ has a uniform distribution on the $d$-dimensional cube $[-1/2, 1/2]^d$. Let $R$ be the distance from the origin to the closest neighbor. Show that the median of $R$ is

$$\left( \frac{1 - \left(\frac12\right)^{1/n}}{v_d(1)} \right)^{1/d}$$

where

$$v_d(r) = r^d \frac{\pi^{d/2}}{\Gamma ((d/2) + 1)}$$

is the volume of a sphere of radius $r$. For what dimension $d$ does the median of $R$ exceed the edge of the cube when $n=100$, $n=1,000$, $n=10,000$? (Hastie et al. (2001), p. 22-27.)

Solution:

Let $Z$ be the random variable denoting the distance from the origin to the closest data point. We seek $m$ such that $P(Z \le m) = 1 / 2$. Let $X_k$ be the distance from the origin to the $k^{\text{th}}$ data point. We have:

$$ \begin{align*} 1 / 2 = P(Z \le m) &= 1 - P(Z > m)\\ &= 1 - P(X_k > m ,\ \forall k)\\ &= 1 - P(X_k > m)^n\\ &= 1 - (1 - P(X_k \le m))^n. \end{align*} $$

Note that $$P(X_k \le m) = \frac{\text{Volume of the $d$-dimensional ball of radius $m$}}{\text{Volume of the $d$-dimensional cube}} = v_d(m) = m^d v_d(1),$$ and therefore $$1 / 2 = 1 - (1 - m^d v_d(1))^n \Rightarrow m = \left( \frac{1 - \left(\frac{1}{2}\right)^{1 / n}}{v_d(1)} \right) ^{1 / d}$$

I assume the "edge of the cube" refers to the distance from the origin to the nearest face, which is always $1/2$.
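
Since the ball of radius $1/2$ is inscribed in the cube, the expression for the median is exact up to $m = 1/2$, and the condition "median $> 1/2$" can be rearranged into a form that is easy to check numerically:

$$\left( \frac{1 - \left(\frac{1}{2}\right)^{1/n}}{v_d(1)} \right)^{1/d} > \frac{1}{2} \iff 1 - \left(\frac{1}{2}\right)^{1/n} > v_d(1/2) = 2^{-d}\, v_d(1).$$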

In [18]:
from scipy.special import gamma
In [19]:
def get_median(n, d):
    # v_d(1): volume of the unit ball in R^d (uses scipy.special.gamma imported above)
    v_d_1 = np.pi ** (d / 2) / gamma(d / 2 + 1)
    return ((1 - (1 / 2) ** (1 / n)) / v_d_1) ** (1 / d)


print("dimension required for median to exceed distance to face:")
for n in [100, 1_000, 10_000]:
    d = 1
    while get_median(n, d) <= 0.5:
        d += 1
    print(f"n : {n}, d = {d}")
dimension required for median to exceed distance to face:
n : 100, d = 9
n : 1000, d = 12
n : 10000, d = 14

11¶

Fit a tree to the data in question 3. Now apply bagging and report your results.

In [20]:
X, y = get_spam_data()
tree = DecisionTreeClassifier()
np.mean(cross_val_score(tree, X, y))
Out[20]:
0.8811093801633385
In [21]:
from sklearn.ensemble import BaggingClassifier

clf = BaggingClassifier(DecisionTreeClassifier())
np.mean(cross_val_score(clf, X, y))
Out[21]:
0.9047986593022707
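
Bagging's variance reduction generally improves with more bootstrap replicates (BaggingClassifier defaults to a small number of estimators). A quick variant to try (not run here, so no score is reported):

# Sketch: more bootstrapped trees, at extra computational cost.
clf_100 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=100)
print(np.mean(cross_val_score(clf_100, X, y)))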

12¶

Fit a tree that uses only one split on one variable to the data in question 3. Now apply boosting.

In [22]:
stump = DecisionTreeClassifier(max_depth=1)
stump.fit(X, y)
pred = stump.predict(X)
1 - accuracy_score(y, pred)
Out[22]:
0.20625950880243427
In [23]:
X, y = get_spam_data()
n = len(X)
w = np.repeat(1 / n, n)
J = 3
stumps = [DecisionTreeClassifier(max_depth=1) for _ in range(J)]
alphas = []
for j in range(J):
    stumps[j].fit(X, y, sample_weight=w)
    pred = stumps[j].predict(X)
    L_hat = np.dot(w, pred != y) / np.sum(w)
    alpha = np.log((1 - L_hat) / L_hat)
    w = w * np.exp(alpha * (pred != y))
    alphas.append(alpha)

all_preds = np.array([stump.predict(X) for stump in stumps])
final_preds = np.sign(alphas @ all_preds)
1 - accuracy_score(y, final_preds)
Out[23]:
0.43077591827863504
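
The final vote above applies np.sign to stump predictions coded as $\{0, 1\}$, whereas AdaBoost's weighted-majority vote assumes $\pm 1$ votes; this likely explains why the boosted error comes out worse than the single stump. A hedged sketch of the standard combination, reusing alphas and all_preds from the cell above (not re-run here):

# Standard AdaBoost vote with {0,1}-coded predictions: predict 1 when the
# weighted vote for class 1 exceeds half of the total alpha weight.
weighted_vote = np.array(alphas) @ all_preds
final_preds = (weighted_vote >= 0.5 * np.sum(alphas)).astype(int)
print(f"Boosted misclassification rate: {1 - accuracy_score(y, final_preds):.4f}")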

13¶

Let $r(x) = \mathbb{P}(Y = 1 \mid X = x)$ and let $\hat{r}(x)$ be an estimate of $r(x)$. Consider the classifier

$$ h(x) = \begin{cases} 1 & \text{if} \,\, \hat{r}(x) \ge 1 / 2 \\ 0 & \text{otherwise.} \end{cases} $$

Assume that $\hat{r}(x) \approx N(\bar{r}(x), \sigma^2(x))$ for some functions $\bar{r}(x)$ and $\sigma^2(x)$. Show that, for fixed $x$,

$$ \begin{align*} \mathbb{P}(Y \ne h(x)) &\approx \mathbb{P}(Y \ne h^*(x)) + \left|2r(x) - 1 \right| \times \left[1 - \Phi\left( \frac{\text{sign}(r(x) - (1/2))(\bar{r}(x) - (1/2))}{\sigma(x)}\right) \right] \end{align*} $$

where $\Phi$ is the standard Normal CDF and $h^*$ is the Bayes rule. Regard $\text{sign}\left((r(x) - (1/2))(\bar{r}(x) - (1/2))\right)$ as a type of bias term. Explain the implications for the bias-variance tradeoff in classification (Friedman (1997)).

Hint: first show that

$$\mathbb{P}(Y \ne h(x)) = |2r(x) - 1|\mathbb{P}(h(x) \ne h^*(x)) + \mathbb{P}(Y \ne h^*(x)).$$

Solution:

Observe that for any classifier $h(x)$,

$$ \begin{align*} \mathbb{P}(Y \ne h(x)) &= \mathbb{P}(Y = 1, h(x) = 0) + \mathbb{P}(Y = 0, h(x) = 1) \\ &= \mathbb{1}_{h(x) = 0} \mathbb{P}(Y = 1 \mid X) + \mathbb{1}_{h(x) = 1} \mathbb{P}(Y = 0 \mid X) \\ &= (1 - \mathbb{1}_{h(x) = 1}) r(x) + \mathbb{1}_{h(x) = 1} (1 - r(x)) \\ &= r(x) - \mathbb{1}_{h(x) = 1}(2r(x) - 1). \end{align*} $$

Therefore,

$$ \begin{align*} \mathbb{P}(Y \ne h(x)) - \mathbb{P}(Y \ne h^*(x)) &= (\mathbb{1}_{h^*(x) = 1} - \mathbb{1}_{h(x) = 1})(2r(x) - 1) \end{align*} $$

Considering cases:

  • $h(x) = h^*(x)$ : The RHS $= 0$
  • $h(x) = 0$, $h^*(x) = 1$ : The RHS $=2r(x) - 1 = |2r(x) - 1|$
  • $h(x) = 1$, $h^*(x) = 0$ : The RHS $=1 - 2r(x) = |2r(x) - 1|$

Thus, conditional on $\hat{r}(x)$ (and hence on $h(x)$),

$$(\mathbb{1}_{h^*(x) = 1} - \mathbb{1}_{h(x) = 1})(2r(x) - 1) = \mathbb{1}_{h(x) \ne h^*(x)}|2r(x) - 1|,$$

and averaging over the randomness in $\hat{r}(x)$ gives $\mathbb{P}(Y \ne h(x)) = |2r(x) - 1|\mathbb{P}(h(x) \ne h^*(x)) + \mathbb{P}(Y \ne h^*(x))$, so the hint statement holds. Now,

$$ \begin{align*} \mathbb{P}(h(x) \ne h^*(x)) &= \mathbb{P}(h(x) = 1, h^*(x) = 0) + \mathbb{P}(h(x) = 0, h^*(x) = 1) \\ &= \mathbb{P}(\hat{r}(x) \ge 1/2, r(x) < 1/2) + \mathbb{P}(\hat{r}(x) < 1/2, r(x) \ge 1/2) \\ &= \begin{cases} \mathbb{P}(\hat{r}(x) \ge 1/2) & r(x) < 1/2 \\ \mathbb{P}(\hat{r}(x) < 1/2) & r(x) \ge 1/2 \\ \end{cases} \\ &=\begin{cases} \mathbb{P}(Z \ge (1/2 - \bar{r}(x)) / \sigma(x)) & r(x) < 1/2 \\ \mathbb{P}(Z < (1/2 - \bar{r}(x)) / \sigma(x)) & r(x) \ge 1/2 \\ \end{cases} \tag{$Z \sim N(0,1)$}\\ &= \begin{cases} 1 - \Phi((1/2 - \bar{r}(x)) / \sigma(x)) & r(x) - (1/2) < 0 \\ 1 - \Phi((\bar{r}(x) - 1/2) / \sigma(x)) & r(x) - (1/2) \ge 0 \\ \end{cases} \tag{$\Phi(a) = 1 - \Phi(-a)$}\\ &= 1 - \Phi\left(\frac{\text{sign}(r(x) - (1/2))(\bar{r}(x) - 1/2)}{\sigma(x)}\right)\\ \end{align*} $$

Recall the bias-variance tradeoff. As $\sigma(x) \to \infty$, $\Phi\left(\frac{\text{sign}(r(x) - (1/2))(\bar{r}(x) - 1/2)}{\sigma(x)}\right) \to \Phi(0) = 1/2$, meaning

$$\mathbb{P}(Y \ne h(x)) \to |r(x) - 1/2| + \mathbb{P}(Y \ne h^*(x)).$$

Meanwhile, as $\sigma(x) \to 0$, $\Phi\left(\frac{\text{sign}(r(x) - (1/2))(\bar{r}(x) - 1/2)}{\sigma(x)}\right) \to 1$ if the bias term $\text{sign}\left((r(x) - (1/2))(\bar{r}(x) - (1/2))\right)$ is positive, i.e. if $\bar{r}(x)$ lies on the same side of $1/2$ as $r(x)$, and $\to 0$ otherwise. Thus,

$$ \mathbb{P}(Y \ne h(x)) \to \begin{cases} |2r(x) - 1| + \mathbb{P}(Y \ne h^*(x)) & \bar{r}(x) \text{ and } r(x) \text{ on opposite sides of } 1/2 \\ \mathbb{P}(Y \ne h^*(x)) & \bar{r}(x) \text{ and } r(x) \text{ on the same side of } 1/2. \\ \end{cases} $$

The implication is that classification error depends on $\hat{r}(x)$ only through which side of $1/2$ it falls on: bias in $\hat{r}(x)$ is harmless so long as $\bar{r}(x)$ stays on the correct side of $1/2$, while variance hurts only insofar as it pushes $\hat{r}(x)$ across the boundary, which is why a biased but low-variance estimate can still classify well (Friedman (1997)).