A typical machine learning approach consists of several steps that can be arranged in a kind of standard workflow. Training of the model, which we have done in the previous examples of different algorithms, is only a small part of this workflow. Data preprocessing and model evaluation and selection are by far the more extensive tasks.
In the following, we will walk through the entire workflow using a simple but more realistic example than the toy data sets used so far.
Our goal is to use weather data and predict the daily mean temperature at a measurement station from several other measurements, such as pressure, vapor content, and so on.
For this example, we will use a daily time series from the DWD weather station Berlin-Dahlem (FU), downloaded from the DWD Climate Data Center on 2022-07-22. A detailed description of the variables is available here. For the purpose of this tutorial, the dataset is provided here.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
After importing some basic libraries, we start by downloading the data, extracting the zip file, and reading it into a pandas data frame.
import requests, zipfile, io
url = "http://userpage.fu-berlin.de/soga/soga-py/300/307000_time_series/tageswerte_KL_00403_19500101_20211231_hist.zip"
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extract("produkt_klima_tag_19500101_20211231_00403.txt", "../data")
data_raw = pd.read_csv(
"../data/produkt_klima_tag_19500101_20211231_00403.txt",
sep=";",
na_values=["-999"],
skipinitialspace=True
)
data_raw
 | STATIONS_ID | MESS_DATUM | QN_3 | FX | FM | QN_4 | RSK | RSKF | SDK | SHK_TAG | NM | VPM | PM | TMK | UPM | TXK | TNK | TGK | eor |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 403 | 19500101 | NaN | NaN | NaN | 5 | 2.2 | 7 | NaN | 0.0 | 5.0 | 4.0 | 1025.60 | -3.2 | 83.00 | -1.1 | -4.9 | -6.3 | eor |
1 | 403 | 19500102 | NaN | NaN | NaN | 5 | 12.6 | 8 | NaN | 0.0 | 8.0 | 6.1 | 1005.60 | 1.0 | 95.00 | 2.2 | -3.7 | -5.3 | eor |
2 | 403 | 19500103 | NaN | NaN | NaN | 5 | 0.5 | 1 | NaN | 0.0 | 5.0 | 6.5 | 996.60 | 2.8 | 86.00 | 3.9 | 1.7 | -1.4 | eor |
3 | 403 | 19500104 | NaN | NaN | NaN | 5 | 0.5 | 7 | NaN | 0.0 | 7.7 | 5.2 | 999.50 | -0.1 | 85.00 | 2.1 | -0.9 | -2.3 | eor |
4 | 403 | 19500105 | NaN | NaN | NaN | 5 | 10.3 | 7 | NaN | 0.0 | 8.0 | 4.0 | 1001.10 | -2.8 | 79.00 | -0.9 | -3.3 | -5.2 | eor |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
26293 | 403 | 20211227 | NaN | NaN | NaN | 3 | 0.0 | 8 | 0.183 | 0.0 | 5.9 | 3.8 | 998.13 | -3.7 | 79.67 | -0.7 | -7.9 | -9.9 | eor |
26294 | 403 | 20211228 | NaN | NaN | NaN | 3 | 1.5 | 6 | 0.000 | 0.0 | 6.4 | 5.3 | 990.17 | -0.5 | 88.46 | 2.7 | -3.9 | -5.1 | eor |
26295 | 403 | 20211229 | NaN | NaN | NaN | 3 | 0.3 | 6 | 0.000 | 0.0 | 7.5 | 8.2 | 994.40 | 4.0 | 100.00 | 5.6 | 1.8 | 0.0 | eor |
26296 | 403 | 20211230 | NaN | NaN | NaN | 3 | 3.2 | 6 | 0.000 | 0.0 | 7.9 | 11.5 | 1001.70 | 9.0 | 98.54 | 12.7 | 4.6 | 2.3 | eor |
26297 | 403 | 20211231 | NaN | NaN | NaN | 3 | 5.5 | 6 | 0.000 | 0.0 | 7.7 | 12.5 | 1004.72 | 12.8 | 84.96 | 14.0 | 11.5 | 10.7 | eor |
26298 rows × 19 columns
By looking at the data and consulting the variable description, we can immediately find variables that are not useful for our purpose and drop them.
data = data_raw.drop(columns=["STATIONS_ID", "MESS_DATUM", "QN_3", "FX", "FM", "NM", "eor", "QN_4"])
data
 | RSK | RSKF | SDK | SHK_TAG | VPM | PM | TMK | UPM | TXK | TNK | TGK |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.2 | 7 | NaN | 0.0 | 4.0 | 1025.60 | -3.2 | 83.00 | -1.1 | -4.9 | -6.3 |
1 | 12.6 | 8 | NaN | 0.0 | 6.1 | 1005.60 | 1.0 | 95.00 | 2.2 | -3.7 | -5.3 |
2 | 0.5 | 1 | NaN | 0.0 | 6.5 | 996.60 | 2.8 | 86.00 | 3.9 | 1.7 | -1.4 |
3 | 0.5 | 7 | NaN | 0.0 | 5.2 | 999.50 | -0.1 | 85.00 | 2.1 | -0.9 | -2.3 |
4 | 10.3 | 7 | NaN | 0.0 | 4.0 | 1001.10 | -2.8 | 79.00 | -0.9 | -3.3 | -5.2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
26293 | 0.0 | 8 | 0.183 | 0.0 | 3.8 | 998.13 | -3.7 | 79.67 | -0.7 | -7.9 | -9.9 |
26294 | 1.5 | 6 | 0.000 | 0.0 | 5.3 | 990.17 | -0.5 | 88.46 | 2.7 | -3.9 | -5.1 |
26295 | 0.3 | 6 | 0.000 | 0.0 | 8.2 | 994.40 | 4.0 | 100.00 | 5.6 | 1.8 | 0.0 |
26296 | 3.2 | 6 | 0.000 | 0.0 | 11.5 | 1001.70 | 9.0 | 98.54 | 12.7 | 4.6 | 2.3 |
26297 | 5.5 | 6 | 0.000 | 0.0 | 12.5 | 1004.72 | 12.8 | 84.96 | 14.0 | 11.5 | 10.7 |
26298 rows × 11 columns
To improve the readability, we rename the remaining columns.
data = data.rename(columns={"RSK": "prec",
"RSKF": "prec_type",
"SDK": "sun_dur",
"SHK_TAG": "snow_depth",
"VPM": "vapor_pres",
"PM": "pres",
"TMK": "temp",
"UPM": "rel_humid",
"TXK": "temp_max",
"TNK": "temp_min",
"TGK": "temp_sfc"})
data
 | prec | prec_type | sun_dur | snow_depth | vapor_pres | pres | temp | rel_humid | temp_max | temp_min | temp_sfc |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.2 | 7 | NaN | 0.0 | 4.0 | 1025.60 | -3.2 | 83.00 | -1.1 | -4.9 | -6.3 |
1 | 12.6 | 8 | NaN | 0.0 | 6.1 | 1005.60 | 1.0 | 95.00 | 2.2 | -3.7 | -5.3 |
2 | 0.5 | 1 | NaN | 0.0 | 6.5 | 996.60 | 2.8 | 86.00 | 3.9 | 1.7 | -1.4 |
3 | 0.5 | 7 | NaN | 0.0 | 5.2 | 999.50 | -0.1 | 85.00 | 2.1 | -0.9 | -2.3 |
4 | 10.3 | 7 | NaN | 0.0 | 4.0 | 1001.10 | -2.8 | 79.00 | -0.9 | -3.3 | -5.2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
26293 | 0.0 | 8 | 0.183 | 0.0 | 3.8 | 998.13 | -3.7 | 79.67 | -0.7 | -7.9 | -9.9 |
26294 | 1.5 | 6 | 0.000 | 0.0 | 5.3 | 990.17 | -0.5 | 88.46 | 2.7 | -3.9 | -5.1 |
26295 | 0.3 | 6 | 0.000 | 0.0 | 8.2 | 994.40 | 4.0 | 100.00 | 5.6 | 1.8 | 0.0 |
26296 | 3.2 | 6 | 0.000 | 0.0 | 11.5 | 1001.70 | 9.0 | 98.54 | 12.7 | 4.6 | 2.3 |
26297 | 5.5 | 6 | 0.000 | 0.0 | 12.5 | 1004.72 | 12.8 | 84.96 | 14.0 | 11.5 | 10.7 |
26298 rows × 11 columns
In the next step, we see that there are four columns with temperature-related data: temp, temp_max, temp_min, and temp_sfc.
The following pair plot shows that these values are highly correlated, as expected.
sns.pairplot(data, vars=["temp", "temp_max", "temp_min", "temp_sfc"], corner=True)
Our goal is to predict the mean temperature (temp). If we leave the other temperature variables in the data set, our model will basically use these correlations and thus ignore all other variables. So we will drop temp_max, temp_min, and temp_sfc.
Note: If our target variable were something other than temperature, we would effectively have four highly correlated predictor variables. This redundancy could give them an outsized influence on the model. It might then be useful to reduce the dimensionality of the feature space: one way is to drop certain features, as we do here; another is a principal component analysis (PCA), where only the main components are kept in the analysis (a sketch follows below).
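As an illustration only (not part of our workflow), such a PCA of the three extra temperature columns could look like the following sketch; the column names follow the renaming above, and temp_pc1 is a hypothetical new variable.
from sklearn.decomposition import PCA
# Illustrative sketch: compress the three correlated extra temperature columns
# into a single principal component instead of dropping them (not used below).
temp_cols = ["temp_max", "temp_min", "temp_sfc"]
pca = PCA(n_components=1)
temp_pc1 = pca.fit_transform(data[temp_cols].dropna())
print(pca.explained_variance_ratio_)  # share of variance captured by the first component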
data = data.drop(columns=["temp_max", "temp_min", "temp_sfc"])
data
 | prec | prec_type | sun_dur | snow_depth | vapor_pres | pres | temp | rel_humid |
---|---|---|---|---|---|---|---|---|
0 | 2.2 | 7 | NaN | 0.0 | 4.0 | 1025.60 | -3.2 | 83.00 |
1 | 12.6 | 8 | NaN | 0.0 | 6.1 | 1005.60 | 1.0 | 95.00 |
2 | 0.5 | 1 | NaN | 0.0 | 6.5 | 996.60 | 2.8 | 86.00 |
3 | 0.5 | 7 | NaN | 0.0 | 5.2 | 999.50 | -0.1 | 85.00 |
4 | 10.3 | 7 | NaN | 0.0 | 4.0 | 1001.10 | -2.8 | 79.00 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
26293 | 0.0 | 8 | 0.183 | 0.0 | 3.8 | 998.13 | -3.7 | 79.67 |
26294 | 1.5 | 6 | 0.000 | 0.0 | 5.3 | 990.17 | -0.5 | 88.46 |
26295 | 0.3 | 6 | 0.000 | 0.0 | 8.2 | 994.40 | 4.0 | 100.00 |
26296 | 3.2 | 6 | 0.000 | 0.0 | 11.5 | 1001.70 | 9.0 | 98.54 |
26297 | 5.5 | 6 | 0.000 | 0.0 | 12.5 | 1004.72 | 12.8 | 84.96 |
26298 rows × 8 columns
As a next step, we check the data for missing values.
data.isnull().sum()
prec            0
prec_type       0
sun_dur       145
snow_depth      1
vapor_pres      1
pres            0
temp            0
rel_humid       1
dtype: int64
As we can see, the fraction of missing values in our data is very small (148 missing values in 26298 rows). Therefore, the simplest solution is to drop all rows with missing data. If the fraction were higher, other solutions, such as imputing the missing values, might be more appropriate.
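For illustration, such an imputation (not used in this tutorial) could look like the following sketch, which fills each gap with the column median; data_imputed is a hypothetical new variable.
from sklearn.impute import SimpleImputer
# Sketch of the imputation alternative: fill missing values with the column median.
imputer = SimpleImputer(strategy="median")
data_imputed = pd.DataFrame(imputer.fit_transform(data),
                            columns=data.columns, index=data.index)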
data.dropna(axis=0, inplace=True)
data
 | prec | prec_type | sun_dur | snow_depth | vapor_pres | pres | temp | rel_humid |
---|---|---|---|---|---|---|---|---|
90 | 1.0 | 1 | 0.000 | 0.0 | 7.1 | 1001.50 | 7.1 | 70.00 |
91 | 2.8 | 1 | 0.000 | 0.0 | 8.7 | 984.70 | 7.6 | 81.00 |
92 | 0.0 | 0 | 6.400 | 0.0 | 6.7 | 987.80 | 6.9 | 68.00 |
93 | 0.5 | 1 | 5.800 | 0.0 | 6.9 | 997.20 | 6.2 | 73.00 |
94 | 11.7 | 1 | 2.900 | 0.0 | 7.1 | 1000.70 | 6.4 | 74.00 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
26293 | 0.0 | 8 | 0.183 | 0.0 | 3.8 | 998.13 | -3.7 | 79.67 |
26294 | 1.5 | 6 | 0.000 | 0.0 | 5.3 | 990.17 | -0.5 | 88.46 |
26295 | 0.3 | 6 | 0.000 | 0.0 | 8.2 | 994.40 | 4.0 | 100.00 |
26296 | 3.2 | 6 | 0.000 | 0.0 | 11.5 | 1001.70 | 9.0 | 98.54 |
26297 | 5.5 | 6 | 0.000 | 0.0 | 12.5 | 1004.72 | 12.8 | 84.96 |
26152 rows × 8 columns
All columns in our data set contain numerical values. However, while most of them are continuous variables, prec_type is a categorical variable. Furthermore, it is only nominal, not ordinal. If we leave the values as they are, the model will imply an order on prec_type, assuming for example that type 8 is "greater than" type 6. This could lead to suboptimal results.
We will use a solution called one-hot encoding. This means that we create a new feature for each of the types and assign binary values to it.
data["prec_type"] = data["prec_type"].apply(str)
data = pd.get_dummies(data, drop_first=True)
data
 | prec | sun_dur | snow_depth | vapor_pres | pres | temp | rel_humid | prec_type_1 | prec_type_4 | prec_type_6 | prec_type_7 | prec_type_8 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
90 | 1.0 | 0.000 | 0.0 | 7.1 | 1001.50 | 7.1 | 70.00 | 1 | 0 | 0 | 0 | 0 |
91 | 2.8 | 0.000 | 0.0 | 8.7 | 984.70 | 7.6 | 81.00 | 1 | 0 | 0 | 0 | 0 |
92 | 0.0 | 6.400 | 0.0 | 6.7 | 987.80 | 6.9 | 68.00 | 0 | 0 | 0 | 0 | 0 |
93 | 0.5 | 5.800 | 0.0 | 6.9 | 997.20 | 6.2 | 73.00 | 1 | 0 | 0 | 0 | 0 |
94 | 11.7 | 2.900 | 0.0 | 7.1 | 1000.70 | 6.4 | 74.00 | 1 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
26293 | 0.0 | 0.183 | 0.0 | 3.8 | 998.13 | -3.7 | 79.67 | 0 | 0 | 0 | 0 | 1 |
26294 | 1.5 | 0.000 | 0.0 | 5.3 | 990.17 | -0.5 | 88.46 | 0 | 0 | 1 | 0 | 0 |
26295 | 0.3 | 0.000 | 0.0 | 8.2 | 994.40 | 4.0 | 100.00 | 0 | 0 | 1 | 0 | 0 |
26296 | 3.2 | 0.000 | 0.0 | 11.5 | 1001.70 | 9.0 | 98.54 | 0 | 0 | 1 | 0 | 0 |
26297 | 5.5 | 0.000 | 0.0 | 12.5 | 1004.72 | 12.8 | 84.96 | 0 | 0 | 1 | 0 | 0 |
26152 rows × 12 columns
Note: We used the option drop_first=True, which removes the dummy column for the first type (type 0). This does not lose any information, because the precipitation is of type 0 exactly when it is of none of the other types, but it reduces the number of features we introduce.
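We can verify this on the encoded data: a day has precipitation type 0 exactly when none of the dummy columns is set. A quick check (added here for illustration):
# Days of precipitation type 0 are exactly those where no dummy column is set.
is_type_0 = data.filter(like="prec_type_").sum(axis=1) == 0
print(is_type_0.sum(), "days with precipitation type 0")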
Now that all the data is preprocessed, we want to take a closer look at the different features. We will start by plotting the distributions.
data[["prec","sun_dur", "snow_depth", "vapor_pres", "pres", "rel_humid", "temp"]].hist(bins=35, figsize=(9,6))
data[["prec","sun_dur", "snow_depth", "vapor_pres", "pres", "rel_humid", "temp"]].describe()
 | prec | sun_dur | snow_depth | vapor_pres | pres | rel_humid | temp |
---|---|---|---|---|---|---|---|
count | 26152.000000 | 26152.000000 | 26152.000000 | 26152.000000 | 26152.000000 | 26152.000000 | 26152.000000 |
mean | 1.605422 | 4.741813 | 0.754742 | 9.599373 | 1007.798212 | 76.528964 | 9.374013 |
std | 3.854016 | 4.429256 | 3.293229 | 4.084277 | 9.128905 | 12.914631 | 7.619309 |
min | 0.000000 | 0.000000 | 0.000000 | 1.100000 | 961.000000 | 29.000000 | -17.900000 |
25% | 0.000000 | 0.400000 | 0.000000 | 6.400000 | 1002.297500 | 68.000000 | 3.600000 |
50% | 0.000000 | 3.800000 | 0.000000 | 8.900000 | 1008.200000 | 78.000000 | 9.500000 |
75% | 1.500000 | 8.000000 | 0.000000 | 12.500000 | 1013.600000 | 87.000000 | 15.400000 |
max | 106.000000 | 16.400000 | 49.000000 | 23.900000 | 1040.000000 | 100.000000 | 29.500000 |
We can see several things here. Let us start with precipitation, sunshine duration, and snow depth: all three show a large spike at 0. This hints at a problem with the data. A value of 0 usually means that there was no precipitation, but it could also mean that the precipitation was too low to be measured or that there was a problem with the instrument; the same is true for sunshine duration. We cannot easily compensate for these ambiguities, so we will drop these features.
We do the same for snow depth, which is 0 most of the time.
data = data.drop(columns=["prec", "sun_dur", "snow_depth"])
data
 | vapor_pres | pres | temp | rel_humid | prec_type_1 | prec_type_4 | prec_type_6 | prec_type_7 | prec_type_8 |
---|---|---|---|---|---|---|---|---|---|
90 | 7.1 | 1001.50 | 7.1 | 70.00 | 1 | 0 | 0 | 0 | 0 |
91 | 8.7 | 984.70 | 7.6 | 81.00 | 1 | 0 | 0 | 0 | 0 |
92 | 6.7 | 987.80 | 6.9 | 68.00 | 0 | 0 | 0 | 0 | 0 |
93 | 6.9 | 997.20 | 6.2 | 73.00 | 1 | 0 | 0 | 0 | 0 |
94 | 7.1 | 1000.70 | 6.4 | 74.00 | 1 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
26293 | 3.8 | 998.13 | -3.7 | 79.67 | 0 | 0 | 0 | 0 | 1 |
26294 | 5.3 | 990.17 | -0.5 | 88.46 | 0 | 0 | 1 | 0 | 0 |
26295 | 8.2 | 994.40 | 4.0 | 100.00 | 0 | 0 | 1 | 0 | 0 |
26296 | 11.5 | 1001.70 | 9.0 | 98.54 | 0 | 0 | 1 | 0 | 0 |
26297 | 12.5 | 1004.72 | 12.8 | 84.96 | 0 | 0 | 1 | 0 | 0 |
26152 rows × 9 columns
Finally, we see that the distributions for vapor pressure, pressure, relative humidity, and temperature are on different scales and are not normally distributed. Many learning algorithms, however, rely on gradient descent or on distance measures, which becomes problematic when the features live on very different scales. Furthermore, most features are bounded on one or even both sides, so that values outside these bounds are physically meaningless. In the following, we will therefore apply some transformations to the features.
But before we do that, we need to take another important step first.
When we train a model, we need to test its performance. To do this, we need data that is similar to the training data, but independent of it. We achieve this by splitting our dataset into two parts, a training part, which we use to build the model, and a test part, which we use only for the final evaluation.
Here we use 80% of our data for training and 20% for testing. Additionally, we split the data into features and target variables.
from sklearn.model_selection import train_test_split
# split features and target first
X, y = data.drop(columns=["temp"]), data["temp"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(len(X_train), len(X_test), len(y_train), len(y_test))
20921 5231 20921 5231
As we can see, we have 20921 records in the training set and 5231 records in the test set.
Note: The records are randomly shuffled and split to avoid any effects from the original order of the records.
Now we are ready for the promised feature scaling.
The reason we did the train-test split first is that we want a pure training data set that is not influenced by the test data. If we were to determine the scaling parameters from the whole data set, information from the test data would be included in the parameters. Therefore, we perform the scaling on the training data only. Then, of course, we have to transform the test set with the same parameters.
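This fit-on-the-training-set, transform-both-sets pattern is exactly what the scikit-learn scaler classes implement. The following sketch is shown for illustration only; it is not a drop-in replacement for the custom scaling we use below.
from sklearn.preprocessing import MinMaxScaler
# Illustration of the pattern: fit the scaler on the training data only,
# then apply the same parameters to both sets (not used below, where we scale manually).
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train[["vapor_pres", "pres", "rel_humid"]])
X_test_scaled = scaler.transform(X_test[["vapor_pres", "pres", "rel_humid"]])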
The features vapor pressure, pressure, and relative humidity, as well as the target temperature, are all bounded on both sides, and we will use a logistic transformation with a prior min-max scaling.
The min-max scaling
$$x'=\frac{x-x_{min}}{x_{max}-x_{min}}$$
transforms the data into the range $[0,1]$. We want to avoid values of exactly 0 (or 1), because in the second step we take the logarithm of $x'/(1-x')$. Therefore, we use the slightly wider bounds $x_{min}-1$ and $x_{max}+1$. Thus
$$x'=\frac{x-x_{min}+1}{x_{max}-x_{min}+2}\,.$$
Note that we transform the test set with the min/max values from the training set.
transform_cols = ["vapor_pres", "pres", "rel_humid"]
X_test[transform_cols] = (X_test[transform_cols] - X_train[transform_cols].min() + 1) / (X_train[transform_cols].max() - X_train[transform_cols].min() + 2)
X_train[transform_cols] = (X_train[transform_cols] - X_train[transform_cols].min() + 1) / (X_train[transform_cols].max() - X_train[transform_cols].min() + 2)
y_test = (y_test - y_train.min() + 1) / (y_train.max() - y_train.min() + 2)
y_train = (y_train - y_train.min() + 1) / (y_train.max() - y_train.min() + 2)
As a second step, we use a logistic transformation $$x''=\log\frac{x'}{1-x'}$$
However, before we can apply the transformation, we face a problem: because we transformed the test set with the parameters of the training set, the test set may contain values outside the open interval $(0,1)$, for which the logarithm in the transformation is not defined.
print(X_test[X_test[transform_cols].le(0).any(axis=1)])
print(X_test[X_test[transform_cols].ge(1).any(axis=1)])
print(y_test[y_test.le(0)])
print(y_test[y_test.ge(1)])
       vapor_pres      pres  rel_humid  prec_type_1  prec_type_4  prec_type_6  prec_type_7  prec_type_8
14301    0.270161 -0.032258   0.643836            0            0            1            0            0
Empty DataFrame
Columns: [vapor_pres, pres, rel_humid, prec_type_1, prec_type_4, prec_type_6, prec_type_7, prec_type_8]
Index: []
Series([], Name: temp, dtype: float64)
Series([], Name: temp, dtype: float64)
As we can see, there is one record with a scaled pressure value below 0 in the test set. We basically have two options: drop the record or impute a new value. As only a single record is affected, we will drop it. For imputation, we could, for example, clip the problematic pressure to the smallest valid scaled value.
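For completeness, the clipping alternative could look like this sketch, operating on a copy and using an arbitrary small margin eps (an assumption for this illustration); it is not used here.
# Alternative to dropping: clip out-of-range scaled values into the open interval (0, 1)
# so that the logistic transformation stays defined.
eps = 1e-3  # arbitrary margin, assumption for this sketch
X_test_clipped = X_test.copy()
X_test_clipped[transform_cols] = X_test_clipped[transform_cols].clip(lower=eps, upper=1 - eps)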
y_test.drop(X_test[X_test[transform_cols].le(0).any(axis=1) | X_test[transform_cols].ge(1).any(axis=1)].index, inplace=True)
X_test.drop(X_test[X_test[transform_cols].le(0).any(axis=1) | X_test[transform_cols].ge(1).any(axis=1)].index, inplace=True)
X_test.drop(y_test[y_test.le(0) | y_test.ge(1)].index, inplace=True)
y_test.drop(y_test[y_test.le(0) | y_test.ge(1)].index, inplace=True)
Now we are ready for the logistic transformation.
X_test[transform_cols] = np.log(X_test[transform_cols] / (1 - X_test[transform_cols]))
X_train[transform_cols] = np.log(X_train[transform_cols] / (1 - X_train[transform_cols]))
y_test = np.log(y_test / (1 - y_test))
y_train = np.log(y_train / (1 - y_train))
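If the same transformation has to be applied again later, for example to genuinely new data, it is convenient to collect both steps in a small helper. The function below is only a sketch; the training minima and maxima have to be passed in explicitly, and the names in the commented example call are hypothetical.
def minmax_logit(col, train_min, train_max):
    """Min-max scale with the widened bounds, then apply the logistic (logit) transform."""
    scaled = (col - train_min + 1) / (train_max - train_min + 2)
    return np.log(scaled / (1 - scaled))

# Example (hypothetical names): transform a new pressure series with the training parameters.
# new_pres_transformed = minmax_logit(new_pres, pres_train_min, pres_train_max)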
Let's take a look at the new distributions.
X_train[transform_cols].hist(bins=35, figsize=(9, 6))
plt.show()
X_test[transform_cols].hist(bins=35, figsize=(9, 6))
plt.show()
y_train.hist(bins=35, figsize=(4.5, 3))
plt.title('Scaled Temperature (y_train)')
plt.show()
y_test.hist(bins=35, figsize=(4.5, 3))
plt.title('Scaled Temperature (y_test)')
plt.show()
We can see that all the features are now on a similar scale and have an almost normal distribution. After the transformation, the data is no longer restricted to a certain interval and has values greater and less than zero. We are now allowed to use Euclidean metrics, i.e. distances and angles, and can apply statistical methods that are based on these metrics.
All the necessary pre-processing is now done and we can finally get to the actual model.
We will use a neural network for this regression task and start with a fully connected network with two hidden layers of 8 nodes each. We chose 8 simply because we have 8 features.
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(hidden_layer_sizes=(8,8,), activation="relu", alpha=0.0001, batch_size=200, max_iter=200, random_state=0)
Once we have defined the model, we need to train it on our train set.
mlp.fit(X_train, y_train)
MLPRegressor(batch_size=200, hidden_layer_sizes=(8, 8), random_state=0)
You may have thought "that's it", but we are only halfway through. Having a trained model is nice, but is it a good model, i.e. does it do what it is supposed to do? And how well is it doing it? And could we make the model even better? These questions are answered when we perform model evaluation and model selection.
First, let's look at the training of the model. Within the neural network, we update the weights at the nodes to minimise a loss function. The following graph shows the value of this loss function as a function of epochs, i.e. the number of times the training data set is used.
plt.plot(mlp.loss_curve_, marker='o')
plt.xlabel("Number of epochs")
plt.ylabel("Loss")
As expected, the value of the loss function decreases as the number of epochs increases. We also see that after 5 epochs, the gain due to further epochs is minimal. We can therefore conclude that the model has converged to a stable state after a few epochs.
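Instead of inspecting the loss curve by hand, MLPRegressor can also monitor convergence itself via its built-in early stopping. A possible configuration is sketched below; it is not used in this tutorial, and the chosen values are assumptions.
# Sketch: stop training once the score on an internal validation split stops improving.
mlp_early = MLPRegressor(hidden_layer_sizes=(8, 8), activation="relu", alpha=0.0001,
                         batch_size=200, max_iter=200, random_state=0,
                         early_stopping=True, validation_fraction=0.1, n_iter_no_change=10)
# mlp_early.fit(X_train, y_train)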
Let us start with a naive approach to get an idea of the performance of the model. We use our test data and predict a temperature $\hat{y}_i$ for each record in X_test. Then we compare the model output with the known temperatures $y_i$ in y_test. Calculating the root mean squared error
$$RMSE=\sqrt{\frac{1}{N}\sum_{i=1}^N\left(y_i-\hat{y}_i\right)^2}$$
gives us an estimate of the quality of the prediction.
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, mlp.predict(X_test), squared=False)
0.055329137035810244
So what does this figure tell us? The RMSE is the root of the mean of the squared deviations between the prediction and the "truth". A value of 0.055 refers to the scaled and logit-transformed temperature (so it is not 0.055 K), but it is small compared to the overall spread of the transformed target. So we could conclude that our model is very good.
However, we might just have been lucky with our particular choice of training and test data. To reduce the influence of this split, we perform a so-called k-fold cross-validation, here with $k=10$: we split the training data into 10 equal pieces, train the model on nine of them, and use the tenth piece to validate the model as above. This process is repeated 10 times, each time using a different piece for validation. In the end we get 10 error scores, which give us some insight into how representative the single value above is.
from sklearn.model_selection import cross_val_score
scores = - cross_val_score(estimator=mlp, X=X_train, y=y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=1)
print(scores)
print(np.mean(scores), np.std(scores))
[0.00506648 0.00289515 0.00274493 0.00277863 0.00348901 0.00423427
 0.00360007 0.00547693 0.00387099 0.00348136]
0.0037637816379283767 0.0008837386878676773
The cross-validation returns mean squared errors between roughly 0.003 and 0.005; taking the square root gives RMSE values of about 0.05 to 0.07, so our value of 0.055 from the single split lies well within this range.
Note: The function returns the negative MSE, which we have converted to positive values.
An important thing to watch out for in machine learning is overfitting. If a model is fitted too closely to the training data, it learns all the tiny idiosyncrasies of that data and thus describes the training data almost perfectly, but it will perform poorly on new, slightly different data such as our test set.
One way to detect overfitting is to compute a so-called learning curve: we train the model with training sets of different sizes, and each training is done with a 10-fold cross-validation.
from sklearn.model_selection import learning_curve
train_sizes, train_scores, test_scores = learning_curve(estimator=mlp,
X=X_train,
y=y_train,
train_sizes=np.linspace(0.1, 1.0, 10),
cv=10,
scoring='neg_mean_squared_error',
n_jobs=1)
train_scores = np.sqrt(-train_scores)  # convert negative MSE scores to positive RMSE values
test_scores = np.sqrt(-test_scores)
As before, we converted the negative scores to positive values; in addition, we took the square root so that the plotted quantity is the RMSE.
To get some insight into the results, we plot the RMSE value against the size of the training set.
# calculate mean and std
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes,
train_mean,
marker='o',
label='Training RMSE')
plt.fill_between(train_sizes,
train_mean + train_std,
train_mean - train_std,
alpha=0.25)
plt.plot(train_sizes,
test_mean,
linestyle='dashed',
marker='s',
label='Validation RMSE')
plt.fill_between(train_sizes,
test_mean + test_std,
test_mean - test_std,
alpha=0.25)
plt.grid()
plt.xlabel('Number of training examples')
plt.ylabel('RMSE')
plt.legend(loc='upper right')
If we assume overfitting, we would expect the training and validation curves to diverge, or at least to have an offset, because the training value would be better, i.e. smaller, than the validation value. Here, however, both lines agree for the large training set sizes. We can therefore conclude that there is no overfitting in our model.
In the previous part, we checked the model for problems such as poor performance and overfitting. In this part we will see if we can improve the model by changing so-called hyperparameters, i.e. parameters that are used for the model but are not related to the data.
In our model definition we set the parameter alpha to 0.0001. This parameter controls the so-called L2 regularisation, which adds a penalty term to the loss function to prevent the weights from diverging.
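Schematically, and up to scikit-learn's internal normalisation by the number of samples, the regularised loss that is minimised is
$$L_{reg}=L+\frac{\alpha}{2}\,\lVert W\rVert_2^2\,,$$
where $W$ collects all network weights; larger values of alpha therefore pull the weights more strongly towards zero.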
To get an idea of the influence of the alpha parameter, we perform several model fits with different values for alpha. Again, a 10-fold cross-validation is performed for each of the fits.
from sklearn.model_selection import validation_curve
alphas = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
train_scores, test_scores = validation_curve(estimator=mlp,
X=X_train,
y=y_train,
param_name='alpha',
param_range=alphas,
cv=10,
scoring='neg_mean_squared_error')
train_scores = np.sqrt(-train_scores)  # convert negative MSE scores to positive RMSE values
test_scores = np.sqrt(-test_scores)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(alphas,
train_mean,
marker='o',
label='Training RMSE')
plt.fill_between(alphas,
train_mean + train_std,
train_mean - train_std,
alpha=0.25)
plt.plot(alphas,
test_mean,
linestyle='dashed',
marker='s',
label='Validation RMSE')
plt.fill_between(alphas,
test_mean + test_std,
test_mean - test_std,
alpha=0.25)
plt.grid()
plt.xscale('log')
plt.legend(loc='upper right')
plt.xlabel('Parameter alpha')
plt.ylabel('RMSE')
plt.show()
The plot shows that over a wide range of alpha values the result for the model is almost the same. However, for larger values, the RMSE starts to "jump around". Since we are not sure whether the lower value at $\alpha=0.1$ is just lower by chance or really better, we stick with our predefined value of 0.0001.
We have defined the layout of our model as two hidden layers of 8 nodes each. This was a fairly arbitrary choice, so we will now test some different layouts and see whether they offer better performance. We define the layouts (8, 8), (8, 4), (8,), (100,), and (100, 50, 25), and test each with a 10-fold cross-validation.
from sklearn.model_selection import validation_curve
layouts = [(8, 8,), (8, 4,), (8,), (100,), (100, 50, 25,)]
train_scores, test_scores = validation_curve(estimator=mlp,
X=X_train,
y=y_train,
param_name='hidden_layer_sizes',
param_range=layouts,
cv=10,
scoring='neg_mean_squared_error')
train_scores = np.sqrt(-train_scores)  # convert negative MSE scores to positive RMSE values
test_scores = np.sqrt(-test_scores)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
layout = [1, 2, 3, 4, 5]
plt.plot(layout,
train_mean,
marker='o',
color='blue',
linestyle='None',
label='Training RMSE')
plt.errorbar(layout,
train_mean,
train_std,
train_std,
capsize=5,
color='blue',
linestyle='None')
plt.plot(layout,
test_mean,
marker='s',
color='red',
linestyle='None',
label='Validation RMSE')
plt.errorbar(layout,
test_mean,
test_std,
test_std,
capsize=5,
color='red',
linestyle='None')
plt.grid()
plt.legend(loc='upper right')
plt.xlabel('Layout number')
plt.ylabel('RMSE')
plt.xticks(layout)
plt.show()
There is an obvious difference between layouts. Layouts with more nodes perform better. Since the training and validation curves tend to diverge for the 5th layout, we will use layout 4, the default sklearn layout, for our final model.
As a final parameter, we will look at the activation function used inside the nodes. So we use the same approach as above.
from sklearn.model_selection import validation_curve
activations = ["identity", "logistic", "tanh", "relu"]
train_scores, test_scores = validation_curve(estimator=mlp,
X=X_train,
y=y_train,
param_name='activation',
param_range=activations,
cv=10,
scoring='neg_mean_squared_error')
train_scores = np.sqrt(-train_scores)  # convert negative MSE scores to positive RMSE values
test_scores = np.sqrt(-test_scores)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
activation = [1, 2, 3, 4]
plt.plot(activation,
train_mean,
marker='o',
color='blue',
linestyle='None',
label='Training RMSE')
plt.errorbar(activation,
train_mean,
train_std,
train_std,
capsize=5,
color='blue',
linestyle='None')
plt.plot(activation,
test_mean,
marker='s',
color='red',
linestyle='None',
label='Validation RMSE')
plt.errorbar(activation,
test_mean,
test_std,
test_std,
capsize=5,
color='red',
linestyle='None')
plt.grid()
plt.legend(loc='upper right')
plt.xlabel('Activation function')
plt.ylabel('RMSE')
plt.xticks(activation)
plt.show()
The activation functions with the best performance are tanh and relu. For our model we used relu, which means we have already made a good choice, and we will leave it that way.
Above, we tested each hyperparameter individually. This may not be optimal, because there can be dependencies between the different parameters. A better way would be to test all parameter combinations at the same time, which can be done with an approach called grid search. However, we will not run it here.
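For illustration only, a joint search over the hyperparameters considered above could be set up roughly as follows; the parameter grid is an arbitrary example, and the search is not executed in this tutorial.
from sklearn.model_selection import GridSearchCV
# Sketch of a joint hyperparameter search (not run here).
param_grid = {
    "alpha": [0.0001, 0.001, 0.01],
    "hidden_layer_sizes": [(8, 8), (100,), (100, 50, 25)],
    "activation": ["tanh", "relu"],
}
grid = GridSearchCV(MLPRegressor(batch_size=200, max_iter=200, random_state=0),
                    param_grid, cv=10, scoring="neg_mean_squared_error", n_jobs=1)
# grid.fit(X_train, y_train)
# print(grid.best_params_, grid.best_score_)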
When we are satisfied with our model evaluation and selection, we can define a final model and train it on the entire training data set. As mentioned before, the only thing we will change is the network layout.
mlp = MLPRegressor(hidden_layer_sizes=(100,), activation="relu", alpha=0.0001, batch_size=200, max_iter=200, random_state=0)
mlp.fit(X_train, y_train)
MLPRegressor(batch_size=200, random_state=0)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, mlp.predict(X_test), squared=False)
0.0439057881042131
Our final model evaluation shows an RMSE of about 0.044, which is somewhat better than the 0.055 from our first attempt.
Now we have our final model and could use it to predict temperature values for new data. However,...
In our model, we used vapor pressure and relative humidity, among other things, to predict temperature. Let us add some physical background here. Relative humidity is defined as the ratio of vapor pressure to saturation vapor pressure. From the relative humidity and the vapor pressure, we can calculate the saturation vapor pressure. The saturation vapor pressure in turn has a relationship to temperature called the Magnus formula. This is not a linear relationship, but as we know, neural networks can model non-linear relationships.
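For reference, one commonly used form of the Magnus formula (the exact coefficients vary slightly between sources) is
$$e_s(T)\approx 6.112\,\mathrm{hPa}\cdot\exp\!\left(\frac{17.62\,T}{243.12\,^{\circ}\mathrm{C}+T}\right),$$
with the temperature $T$ in °C. Since the vapor pressure is $e=\mathrm{RH}\cdot e_s(T)$, knowing both $e$ and $\mathrm{RH}$ fixes $e_s(T)$ and thus, by inverting the formula, the temperature.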
Thus, the model probably just learns this (inverted) Magnus relationship and largely ignores the other features.
We can test this by removing the vapor pressure (or the relative humidity) from the data and training the model again.
from sklearn.model_selection import train_test_split
# drop vapor pressure
X, y = data.drop(columns=["temp", "vapor_pres"]), data["temp"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
mlp.fit(X_train, y_train)
MLPRegressor(batch_size=200, random_state=0)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, mlp.predict(X_test), squared=False)
6.249823936687336
Obviously, the performance drops dramatically. Note that this quick test uses the untransformed data, so the RMSE of about 6.2 is now measured in °C and is not directly comparable to the scaled values above; still, it is not much better than always predicting the climatological mean (the standard deviation of the daily mean temperature is about 7.6 °C).
So we conclude that our model did a good job, but it might have been easier to just use our physical knowledge in the first place.
Citation
The E-Learning project SOGA-Py was developed at the Department of Earth Sciences by Annette Rudolph, Joachim Krois and Kai Hartmann. You can reach us via mail at soga[at]zedat.fu-berlin.de.
Please cite as follows: Rudolph, A., Krois, J., Hartmann, K. (2023): Statistics and Geodata Analysis using Python (SOGA-Py). Department of Earth Sciences, Freie Universitaet Berlin.