Logistic regression analysis belongs to the class of [generalized linear models](https://en.wikipedia.org/wiki/Generalized_linear_model). We apply an $L_2$-regularized logistic regression model, which we have explained in the introductory sections on logistic regression and regularization methods.
Suppose the response variable $G$ takes values in $\{0,1\}$. Denote $y_i=I(g_i=1)$. The model

$$P(G=1|X=x)=\frac{e^{\beta_0+\beta^{T}x}}{1+e^{\beta_0+\beta^{T}x}}$$

can be written in the following form:

$$\log\frac{P(G=1|X=x)}{P(G=0|X=x)}=\beta_0+\beta^{T}x,$$

the so-called logistic or log-odds transformation.
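(Indeed, $P(G=0|X=x)=1-P(G=1|X=x)=\frac{1}{1+e^{\beta_0+\beta^{T}x}}$, so the ratio of the two probabilities is $e^{\beta_0+\beta^{T}x}$, and taking logarithms gives the expression above.)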
The model is fitted by minimizing the penalized negative log-likelihood:

$$\min_{(\beta_0,\beta)\in\mathbb{R}^{p+1}} -\left[\frac{1}{N}\sum_{i=1}^{N}y_i(\beta_0+x_i^{T}\beta)-\log\left(1+e^{\beta_0+x_i^{T}\beta}\right)\right]+\lambda\left[(1-\alpha)\|\beta\|_2^2/2+\alpha\|\beta\|_1\right].$$

For $\alpha=0$ the penalty term reduces to the ridge penalty $\lambda\|\beta\|_2^2/2$, i.e. pure $L_2$-regularization, which is what we use here. We use the functionality of the sklearn package to apply the regularized logistic regression model.
The LogisticRegressionCV() class provides the $L_2$-regularized logistic regression model with built-in cross-validation; its most important arguments are shown in the baseline model below.
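Note that sklearn parameterizes the penalty strength by $C$, the inverse of the regularization strength. According to the sklearn documentation, for the $L_2$ penalty the solver minimizes (writing the labels as $\tilde{y}_i=2y_i-1\in\{-1,1\}$)

$$\min_{\beta_0,\beta}\; C\sum_{i=1}^{N}\log\left(1+e^{-\tilde{y}_i(\beta_0+x_i^{T}\beta)}\right)+\frac{1}{2}\|\beta\|_2^2,$$

so a large $C$ corresponds to a small $λ$ (weak regularization) and vice versa. We will use the correspondence $λ = 1/C$ when interpreting the cross-validation results below.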
In previous sections we prepared the response vectors and the model matrices for the training and test sets (y_train, y_test, X_train, X_test, X_train_PC3, X_test_PC3, and pca_PC3).
Let us load all the data we need into our workspace.
# import packages
import folium
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, cohen_kappa_score
# load model input data from previous section
y_train = pd.read_feather("https://userpage.fu-berlin.de/soga/data/py-data/y_train.feather").set_index("index")
y_test = pd.read_feather("https://userpage.fu-berlin.de/soga/data/py-data/y_test.feather").set_index("index")
X_train = pd.read_feather("https://userpage.fu-berlin.de/soga/data/py-data/X_train.feather").set_index("index")
X_test = pd.read_feather("https://userpage.fu-berlin.de/soga/data/py-data/X_test.feather").set_index("index")
X_train_PC3 = pd.read_feather("https://userpage.fu-berlin.de/soga/data/py-data/train_set_PC3.feather").set_index("index")
X_test_PC3 = pd.read_feather("https://userpage.fu-berlin.de/soga/data/py-data/test_set_PC3.feather").set_index("index")
HOT_DAY_train = pd.read_feather("https://userpage.fu-berlin.de/soga/data/py-data/HOT_DAY_train.feather").set_index("index")
HOT_DAY_test = pd.read_feather("https://userpage.fu-berlin.de/soga/data/py-data/HOT_DAY_test.feather").set_index("index")
pca_PC3 = pd.read_feather("https://userpage.fu-berlin.de/soga/data/py-data/pca_PC3.feather")
dwd_data = pd.read_feather("https://userpage.fu-berlin.de/soga/data/py-data/dwd_data.feather").set_index("index")
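As a quick optional sanity check (not part of the original workflow), we can verify that the feature matrices and response vectors have matching row counts:

# optional sanity check: features and responses should have matching row counts
print(X_train.shape, y_train.shape)            # training set
print(X_test.shape, y_test.shape)              # test set
print(X_train_PC3.shape, X_test_PC3.shape)     # PCA-reduced training and test sets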
The general procedure is as follows: we fit the model on the training set, using cross-validation to select the regularization parameter, then predict the response for the test set, and finally evaluate the predictions.
We evaluate the model performance by calculating the model accuracy, the confusion matrix and Cohen’s kappa. The confusion matrix is a specific table that visualizes the performance of a predictive model. Each column of the matrix represents the instances in an actual class, while each row represents the instances in a predicted class (or sometimes vice versa).
Cohen’s kappa coefficient, $κ$, is a statistic which measures inter-rater agreement for categorical items. It takes a value of $κ=0$ if the agreement is no better than expected by chance, whereas a value of $κ=1$ indicates perfect agreement.
The definition of $κ$ is:
$$\kappa=\frac{p_0-p_e}{1-p_e},$$where $p_0$ is the relative observed agreement (identical to accuracy), and $p_e$ is the hypothetical probability of chance agreement, using the observed data to calculate the probabilities of each observer randomly saying each category.
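To make the definition concrete, here is a minimal sketch (using made-up labels, not data from this section) that computes $p_0$ and $p_e$ by hand and checks the result against sklearn's cohen_kappa_score:

# illustration of Cohen's kappa with made-up labels
import numpy as np
from sklearn.metrics import cohen_kappa_score

actual    = np.array([0, 0, 1, 1, 1, 1, 0, 1, 1, 1])
predicted = np.array([0, 1, 1, 1, 1, 0, 0, 1, 1, 1])

p0 = np.mean(actual == predicted)  # observed agreement (= accuracy), here 0.8

# chance agreement: probability that both say 0 plus probability that both say 1
pe = (np.mean(actual == 0) * np.mean(predicted == 0)
      + np.mean(actual == 1) * np.mean(predicted == 1))  # here 0.58

kappa = (p0 - pe) / (1 - pe)  # here approximately 0.524
print(kappa, cohen_kappa_score(actual, predicted))  # both give the same value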
In the baseline model we predict the response variable HOT_DAY by including all 15 features in the data set. We use the $L_2$-regularized logistic regression model (ridge regression, penalty='l2') provided by the sklearn package. In particular we apply the LogisticRegressionCV class, which by default performs stratified k-fold cross-validation on the data set and evaluates the model performance.
# baseline model: binomial logistic ridge regression
m_log_baseline = LogisticRegressionCV(
    penalty='l2',            # L2 regularization
    solver='lbfgs',          # optimization algorithm
    Cs=100,                  # number of regularization parameter values C to try
    cv=10,                   # number of folds in cross-validation
    scoring='accuracy',      # scoring metric
    random_state=0,          # seed
    max_iter=10000,          # maximum number of iterations
    n_jobs=-1,               # number of parallel processes
    refit=True,              # refit the best model on the entire training set
    multi_class='ovr',       # one-vs-rest
    class_weight='balanced'  # reweight classes to counter the imbalance
).fit(X_train, y_train.values.ravel())
Now we plot the relation between the misclassification error and the regularization parameter $λ$. Since sklearn stores the inverse regularization strengths $C$, we convert via $λ = 1/C$ and compute the error as one minus the mean cross-validated accuracy.
# plot misclassification error vs. lambda (sklearn's C is the inverse of lambda)
plt.plot(1 / m_log_baseline.Cs_, 1 - np.mean(m_log_baseline.scores_[1], axis=0))
plt.axvline(1 / m_log_baseline.C_[0], c='k', ls='--')
plt.axvline(1 / (m_log_baseline.C_[0] * 0.1), c='k', ls=':')
plt.xscale('log')
plt.xlabel('lambda')
plt.ylabel('misclassification error')
plt.grid()
plt.show()
The plot shows that the misclassification error increases with $λ$. The dashed vertical line marks the value of $λ$ selected by cross-validation; the dotted line marks a ten times larger $λ$ (i.e. $C/10$), a rough stand-in for the largest $λ$ whose error is within one standard error of the minimum.
# C selected by cross-validation (corresponding to the optimal lambda = 1 / C)
m_log_baseline.C_
array([1.09749877])
# C reduced by a factor of ten (i.e. a ten times larger lambda)
m_log_baseline.C_[0] * 0.1
0.10974987654930568
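The factor of $0.1$ above is only a crude stand-in. If we wanted to apply the one-standard-error rule properly, we could compute it from the stored cross-validation scores. A minimal sketch (assuming the default ascending grid in Cs_, so that the smallest admissible $C$ corresponds to the largest $λ$):

# one-standard-error rule computed from the cross-validation scores
errors = 1 - m_log_baseline.scores_[1]                           # shape: (n_folds, n_Cs)
mean_err = errors.mean(axis=0)                                   # mean CV error per C
se_err = errors.std(axis=0, ddof=1) / np.sqrt(errors.shape[0])   # standard error per C
best = np.argmin(mean_err)                                       # index of the minimum error
threshold = mean_err[best] + se_err[best]                        # 1-SE threshold
# smallest C (largest lambda) whose mean error is within one SE of the minimum
C_1se = m_log_baseline.Cs_[np.where(mean_err <= threshold)[0][0]]
print(C_1se, 1 / C_1se)                                          # C and the corresponding lambda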
Finally, we use the test data set, X_test, and the model refitted with the best $λ$ to predict the response vector y_test.
# prediction for the test set
m_log_baseline_pred = m_log_baseline.predict(X_test)
# prediction results
m_log_baseline_pred
array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0])
Now we have a look at the confusion matrix and the model performance. Specifically, we look at the accuracy and Cohen’s kappa.
# confusion matrix
cm = pd.crosstab(
    pd.Series(y_test.values.ravel(), name="Actual"),
    pd.Series(m_log_baseline_pred, name="Predicted"),
)
cm
| Actual \ Predicted | 0 | 1 |
| --- | --- | --- |
| 0 | 20 | 0 |
| 1 | 4 | 133 |
# accuracy
accuracy_score(y_test.values.ravel(), m_log_baseline_pred)
0.9745222929936306
# Cohen's kappa
cohen_kappa_score(y_test.values.ravel(), m_log_baseline_pred)
0.8944182918628111
# calculate the accuracy benchmark: the share of hot-day stations,
# i.e. the accuracy of always predicting the majority class "hot day"
dwd_data["HOT_DAY"].sum()/len(dwd_data["HOT_DAY"])
0.8639798488664987
The model performs very well and beats our accuracy benchmark of $0.86$ (the accuracy of always predicting the majority class). The two model metrics, accuracy and Cohen's kappa, are $0.97$ and $0.89$, respectively. In terms of classification accuracy we count $20$ true negative and $133$ true positive classifications; we misclassified only $4$ observations ($4$ false negatives and $0$ false positives).
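Because the classes are imbalanced (far more hot-day stations than stations without hot days), accuracy alone can be misleading. As an optional extra check, not part of the original analysis, sklearn's classification_report breaks precision and recall down per class:

# optional: per-class precision and recall for the baseline model
from sklearn.metrics import classification_report
print(classification_report(
    y_test.values.ravel(), m_log_baseline_pred,
    target_names=["no hot day", "hot day"]  # class 0 and class 1
))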
Now we build an $L_2$-regularized logistic regression model to predict the response variable HOT_DAY from just the first three principal component scores. As in the previous section, we compute the confusion matrix, the model accuracy and Cohen’s kappa for the test set to evaluate the goodness of fit of the model.
# build the model
m_log_PC3 = LogisticRegressionCV(
    penalty="l2", solver="lbfgs", max_iter=10000, random_state=0
).fit(X_train_PC3, y_train.values.ravel())
# prediction for the test set:
# we apply the learned model (refitted with the best C) to the new data
m_log_PC3_pred = m_log_PC3.predict(X_test_PC3)
# prediction results
m_log_PC3_pred
array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1])
# confusion matrix
cm = pd.crosstab(
    pd.Series(y_test.values.ravel(), name="Actual"),
    pd.Series(m_log_PC3_pred, name="Predicted"),
)
cm
| Actual \ Predicted | 0 | 1 |
| --- | --- | --- |
| 0 | 14 | 6 |
| 1 | 3 | 134 |
# accuracy
accuracy_score(y_test.values.ravel(), m_log_PC3_pred)
0.9426751592356688
# Cohen's kappa
cohen_kappa_score(y_test.values.ravel(), m_log_PC3_pred)
0.7245077013062975
The PC3 model also performs well, though somewhat worse than the baseline model. Its accuracy is still higher than our accuracy benchmark of $0.86$. The two model metrics, accuracy and Cohen's kappa, are $0.94$ and $0.72$, respectively. We count $14$ true negative and $134$ true positive classifications; we misclassified $9$ observations ($3$ false negatives and $6$ false positives).
Please note that we projected a 15-dimensional data set onto a 3-dimensional feature space, yet the model performance is still remarkable.
Again, we use the folium package to visualize the results on a map. First we use our model m_log_PC3 to predict the occurrence of hot days at each weather station in the data set; then we visualize the classification results of the PC3 model on a map.
# predict the occurrence of hot days at each weather station
m_log_PC3_full = m_log_PC3.predict(pca_PC3)
# add a new column called "Classification" to the data set:
# "FN" for false negative,
# "TN" for true negative,
# "FP" for false positive and
# "TP" for true positive

# assign prediction vector to data set
dwd_data["Classification"] = m_log_PC3_full

# assign "FN" to all observations that are false negative
dwd_data.loc[
    (dwd_data["Classification"] == 0) & (dwd_data["HOT_DAY"] == 1), "Classification"
] = "FN"

# assign "TN" to all observations that are true negative
dwd_data.loc[
    (dwd_data["Classification"] == 0) & (dwd_data["HOT_DAY"] == 0), "Classification"
] = "TN"

# assign "FP" to all observations that are false positive
dwd_data.loc[
    (dwd_data["Classification"] == 1) & (dwd_data["HOT_DAY"] == 0), "Classification"
] = "FP"

# assign "TP" to all observations that are true positive
dwd_data.loc[
    (dwd_data["Classification"] == 1) & (dwd_data["HOT_DAY"] == 1), "Classification"
] = "TP"
# create a map
m = folium.Map(
    location=[51.3, 10.5],
    zoom_start=7,
    tiles="cartodbpositron",
    width="100%",
    height="100%",
)

# add a tile layer to the map
folium.TileLayer("cartodbpositron").add_to(m)

# add a layer control panel to the map
folium.LayerControl().add_to(m)
# add a marker for each station
for i in range(len(dwd_data)):
    if dwd_data.iloc[i]["Classification"] == "TP":
        color = "green"
        tooltip = "True Positive"
    elif dwd_data.iloc[i]["Classification"] == "TN":
        color = "blue"
        tooltip = "True Negative"
    elif dwd_data.iloc[i]["Classification"] == "FP":
        color = "black"
        tooltip = "False Positive"
    elif dwd_data.iloc[i]["Classification"] == "FN":
        color = "red"
        tooltip = "False Negative"
    else:
        raise ValueError("Unexpected Classification value")
    folium.Circle(
        location=[dwd_data.iloc[i]["LAT"], dwd_data.iloc[i]["LON"]],
        color=color,
        tooltip=tooltip,
        fill=True,
        radius=1000,
    ).add_to(m)
# show the map
m
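If the code is run as a plain script rather than in a notebook, the interactive map will not render inline; in that case folium's save method can write it to a standalone HTML file (a small usage note, not part of the original analysis):

# save the interactive map as a standalone HTML file
m.save("hot_day_classification_map.html")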
The map shows an interesting pattern. First of all we realize that the data set is quite imbalanced, as there are many more positives than negatives. We see that the negatives (weather stations that have not recorded a hot day so far) are clustered. One cluster is located in the Alpine region and the Alpine foothills, another in the Central German Uplands, and another along the coastal regions of Germany. It is very interesting that the classification algorithm correctly predicted all observations in the southern part of Germany. In the central parts of Germany the algorithm sometimes failed to correctly predict positives as well as negatives. However, one pattern is very striking: in the coastal regions of Northern Germany the logistic regression model performed, in general, quite well, except for those weather stations located in the federal state of Schleswig-Holstein. In this particular coastal region nearly all class predictions turned out to be wrong. This is an interesting result which needs more thorough investigation.
Citation
The E-Learning project SOGA-Py was developed at the Department of Earth Sciences by Annette Rudolph, Joachim Krois and Kai Hartmann. You can reach us via mail by soga[at]zedat.fu-berlin.de.
Please cite as follows: Rudolph, A., Krois, J., Hartmann, K. (2023): Statistics and Geodata Analysis using Python (SOGA-Py). Department of Earth Sciences, Freie Universitaet Berlin.