Before we start, we clean up the data set and exclude non-meaningful features and missing values. Then we create our binary response variable HOT_DAY, based on the variable MAX_AIR_TEMP. In a next step we split our data set at random into two distinct parts: one part is our training set, while the other serves as our test set. The basic idea behind this data grouping strategy is that we learn the model parameters on the training set and then use the test set, a data set that was not involved at all in parameter estimation, to determine the goodness of fit of our model. In other words, by using two distinct data sets we may evaluate the generalization properties of our model.
For the sub-setting of the data set we draw a random Boolean mask with NumPy, which splits the data set into two sets (training and test set) in a predefined ratio. Finally, we assign the data matrices and the response vectors proper variable names for further processing.
In order to plot the data on a map, we use the Folium library.
# import packages
import folium
import numpy as np
import pandas as pd
# load data set
dwd = pd.read_csv("https://userpage.fu-berlin.de/soga/data/raw-data/DWD.csv", sep=",")
dwd.head()
# Exclude variables
cols_to_drop = [
"DWD_ID",
"STATION_NAME",
"FEDERAL_STATE",
"PERIOD",
"MAX_WIND_SPEED", # shows many missing values
]
dwd = dwd.drop(cols_to_drop, axis=1) # drop columns
# Exclude missing values
dwd = dwd.dropna() # drop rows with missing values
# Create binary response variable HOT_DAY
dwd["HOT_DAY"] = np.where(dwd["MAX_AIR_TEMP"] >= 30, 1, 0)
# set seed for reproducibility
np.random.seed(30300)
# Split data into training and test set, roughly 60% training and 40% test set
split = np.random.rand(len(dwd)) < 0.6
# define split vector for HOT_DAY
HOT_DAY_train = dwd["HOT_DAY"][split]
HOT_DAY_test = dwd["HOT_DAY"][~split]
#########################
### Original data set ###
#########################
# training set
train_set = dwd[split]
# test set
test_set = dwd[~split]
# response vectors
y_train = HOT_DAY_train
y_test = HOT_DAY_test
# exclude response variable(s) from the data matrix
X_train = train_set.drop(["HOT_DAY", "MAX_AIR_TEMP"], axis=1)
X_test = test_set.drop(["HOT_DAY", "MAX_AIR_TEMP"], axis=1)
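As an aside, scikit-learn's train_test_split offers an alternative to the random mask used above; its stratify argument keeps the class ratio of the response (roughly) equal in both subsets. A minimal sketch, assuming scikit-learn is available and using a small synthetic stand-in for the dwd data set (the column values below are illustrative, not from the original data):

```python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# small synthetic stand-in for the dwd data set
rng = np.random.default_rng(30300)
demo = pd.DataFrame(
    {
        "MEAN_ANNUAL_AIR_TEMP": rng.normal(9, 2, 200),
        "HOT_DAY": rng.choice([0, 1], size=200, p=[0.15, 0.85]),
    }
)

X = demo.drop("HOT_DAY", axis=1)
y = demo["HOT_DAY"]

# stratify=y keeps the 0/1 proportions nearly identical in both subsets
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.4, random_state=30300, stratify=y
)
print(len(X_tr), len(X_te))
print(y_tr.mean(), y_te.mean())  # class proportions are nearly identical
```

With an imbalanced response, stratification guards against a split in which the rare class ends up mostly in one subset.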
####################
### PCA data set ###
####################
# load PC3 model from previous sections
pca_PC3 = pd.read_feather("https://userpage.fu-berlin.de/soga/data/py-data/pca_PC3.feather")
pca_PC3.head()
        PC1       PC2       PC3
0 -2.250742  1.188693  2.046782
1 -1.987427  0.603907  0.575597
2  2.224010 -1.298316  0.968473
3  1.173059 -2.923561 -1.130196
4 -1.992753 -1.187762  0.404263
# Split the data set:
# training set PCA
train_set_PC3 = pca_PC3[split]
# test set PCA
test_set_PC3 = pca_PC3[~split]
# save defined training and test data as well as response vectors into one feather file
# we have to reset the index as feather does not support non-standard indices
train_set_PC3.reset_index(drop=False).to_feather("train_set_PC3.feather")
test_set_PC3.reset_index(drop=False).to_feather("test_set_PC3.feather")
y_train.reset_index(drop=False).to_feather("y_train.feather")
y_test.reset_index(drop=False).to_feather("y_test.feather")
HOT_DAY_train.reset_index(drop=False).to_feather("HOT_DAY_train.feather")
HOT_DAY_test.reset_index(drop=False).to_feather("HOT_DAY_test.feather")
X_train.reset_index(drop=False).to_feather("X_train.feather")
X_test.reset_index(drop=False).to_feather("X_test.feather")
dwd.reset_index(drop=False).to_feather("dwd.feather")
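Since feather stores the original index only as the extra "index" column created by reset_index(drop=False), it can be restored with set_index after loading. A minimal sketch of the round trip, leaving out the actual file I/O and using a tiny hypothetical frame:

```python
import pandas as pd

# tiny stand-in frame with a non-default index (hypothetical values)
df = pd.DataFrame({"PC1": [0.1, -0.2, 0.3]}, index=[5, 8, 9])

# reset_index(drop=False) turns the index into a regular column "index",
# which is what the feather files above contain
flat = df.reset_index(drop=False)

# after pd.read_feather(...) one would restore the index like this
restored = flat.set_index("index")
restored.index.name = None  # remove the leftover index name

print(restored.index.tolist())  # [5, 8, 9]
```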
Finally, there remain 238 observations with 16 features in the data set to train our model, and 159 observations with 16 features to evaluate our model.
We notice that the classes of our response variable HOT_DAY are not balanced.
dwd["HOT_DAY"].describe()
count    397.000000
mean       0.863980
std        0.343243
min        0.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: HOT_DAY, dtype: float64
dwd["HOT_DAY"].info()
<class 'pandas.core.series.Series'>
Int64Index: 397 entries, 1 to 597
Series name: HOT_DAY
Non-Null Count  Dtype
--------------  -----
397 non-null    int64
dtypes: int64(1)
memory usage: 6.2 KB
dwd["HOT_DAY"].value_counts()
1    343
0     54
Name: HOT_DAY, dtype: int64
The ratio of weather stations that recorded at least one hot day to weather stations that have not yet recorded a hot day is roughly 6 to 1. When working with an imbalanced data set we have to be cautious with respect to model metrics such as accuracy. Consider a model that always predicts the more common class; in our example its accuracy would be $343/(54+343)=0.86$, which is quite a high number for a non-predictive model. Hence, we will also consider other model metrics, such as the confusion matrix and Cohen's kappa. The value of $0.86$ will serve as our benchmark: whatever model we build, its accuracy must be better than $0.86$.
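To make this concrete, here is a small sketch, using the class counts from the output above, that computes the majority-class baseline's accuracy and its Cohen's kappa by hand:

```python
import numpy as np

# observed class counts from value_counts(): 343 ones, 54 zeros
y_true = np.array([1] * 343 + [0] * 54)
y_pred = np.ones_like(y_true)  # "model" that always predicts the majority class

accuracy = (y_true == y_pred).mean()
print(round(accuracy, 2))  # 0.86

# Cohen's kappa: (p_o - p_e) / (1 - p_e), where p_o is observed agreement
# and p_e is the agreement expected by chance given the marginal proportions
p_o = accuracy
p_e = (y_true.mean() * y_pred.mean()
       + (1 - y_true.mean()) * (1 - y_pred.mean()))
kappa = (p_o - p_e) / (1 - p_e)
print(round(kappa, 2))  # 0.0
```

The kappa of zero exposes that the 0.86 accuracy reflects the class imbalance only, not any real predictive skill.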
However, before we continue we should remind ourselves that the data set we are working with has a
spatial component. Basically, our observations are point measurements of meteorological parameters
spread across Germany. Let us plot a simple map to get an impression of the spatial distribution of
our observations. For plotting we rely on the folium
package.
# plot the map
m = folium.Map(
location=[51.1657, 10.4515],
zoom_start=7,
tiles="cartodbpositron",
width="100%",
height="100%",
)
# add the observations as points to the map
for i in range(len(dwd)):
folium.Circle(
location=[dwd.iloc[i]["LAT"], dwd.iloc[i]["LON"]],
color="crimson",
fill=True,
radius=100,
).add_to(m)
# show the map
m
Citation
The E-Learning project SOGA-Py was developed at the Department of Earth Sciences by Annette Rudolph, Joachim Krois and Kai Hartmann. You can reach us via mail by soga[at]zedat.fu-berlin.de.
Please cite as follows: Rudolph, A., Krois, J., Hartmann, K. (2023): Statistics and Geodata Analysis using Python (SOGA-Py). Department of Earth Sciences, Freie Universitaet Berlin.