Let us recall our research question: we want to examine the relationship between the response variable MEAN_ANNUAL_RAINFALL and the other variables in the dwd data set.
A multiple regression model is written as:
$$ \begin{align} y_i & = \beta_0 + \sum_{j=1}^d x_{ij}\beta_j + \epsilon_i \\ & = \beta_0 + \beta_1x_{i1} + \beta_2x_{i2} + ... + \beta_dx_{id} + \epsilon_i \text{,} \quad i = 1,2,...,n\text{, } x_i\in \mathbb R^d, \end{align} $$where $\beta_0$ corresponds to the intercept, $\beta_1,...,\beta_d$ correspond to the model coefficients, $x_i$ to the observation/measurement data, and $\epsilon_i$ to the error (residual) term.
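For reference, the same model can also be written compactly in matrix notation (a standard reformulation, stated here for orientation only):
$$ \mathbf{y} = \mathbf{X}\boldsymbol{\beta} + \boldsymbol{\epsilon}, \quad \mathbf{y}, \boldsymbol{\epsilon} \in \mathbb{R}^{n}, \; \mathbf{X} \in \mathbb{R}^{n \times (d+1)}, \; \boldsymbol{\beta} \in \mathbb{R}^{d+1}, $$
where the first column of $\mathbf{X}$ consists of ones and accounts for the intercept $\beta_0$.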
One of the functions to build multiple linear regression models in Python is the ols() function of statsmodels. It can be used with (R-style) formula notation as input; thus, a linear model is specified as response ~ predictor(s). In order to increase the number of predictor variables we concatenate them using the + sign: response ~ predictor1 + predictor2 + predictor3 + ....
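As a minimal sketch of this notation, using hypothetical column names y, x1 and x2 (not variables from the dwd data set), a model with two predictors could be specified like this:

import pandas as pd
import statsmodels.formula.api as smf

# hypothetical toy data, only to illustrate the formula notation
toy = pd.DataFrame({
    "y": [1.0, 2.0, 3.0, 4.0],
    "x1": [0.5, 1.5, 2.0, 3.5],
    "x2": [2.0, 1.0, 4.0, 3.0],
})

# response ~ predictor1 + predictor2
toy_results = smf.ols(formula="y ~ x1 + x2", data=toy).fit()
print(toy_results.params)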
The general procedure we will follow is outlined below:

1. Build the model and learn the model parameters on the training data set.
2. Evaluate the goodness of fit by calculating the RMSE on the training data set.
3. Apply the model to the test data set and calculate the RMSE to assess how well the model generalizes.
4. Compare the result with the models built so far.
Consequently, we will showcase different options for feature selection, sometimes referred to as model selection or feature engineering. In order to evaluate a model we define a metric to assess the model performance. In regression modelling, which deals with continuous $d$-dimensional feature spaces ($x_i\in \mathbb R^d$), it is quite common to use the mean squared error (MSE) or the root mean squared error (RMSE) as metric.
The RMSE is the square root of the sum of squared differences between the observed and predicted values, normalized by the number of observations, $n$:
$$RMSE = \sqrt{\frac{\sum_{i=1}^n{(pred_i-obs_i)^2}}{n}}$$
The lower the RMSE, the better the model fits the data. Please note that the difference between the observed and predicted values is nothing else but the residual term of a linear model. In statsmodels, for a given RegressionResults object the mean squared error of the residuals is easily accessed via the mse_resid attribute. For convenience, we wrap the equation into our own function, which calculates the RMSE:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.graphics.api import abline_plot
from mpl_toolkits.mplot3d import axes3d


def model_rmse(model_results):
    # RMSE derived from the mse_resid attribute of a fitted RegressionResults object
    return np.sqrt(model_results.mse_resid)
In addition to the model_rmse() function from above we implement a diff_rmse() function, which returns the RMSE as well, but this time the input is not a RegressionResults object but two vectors: one containing the observations, the other containing the model predictions.
import numpy as np


def diff_rmse(obs, preds):
    # RMSE computed directly from observations and predictions
    return np.sqrt(np.sum((obs - preds)**2) / len(obs))
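A side note on the two helpers: in statsmodels the mse_resid attribute is the sum of squared residuals divided by the residual degrees of freedom, whereas diff_rmse() divides by the number of observations $n$; for reasonably large samples the difference is negligible. A minimal sketch illustrating the relation, assuming a fitted RegressionResults object named results:

import numpy as np


def compare_rmse_variants(results):
    # model_rmse() relies on mse_resid, i.e. ssr / df_resid
    rmse_dof = np.sqrt(results.ssr / results.df_resid)
    # diff_rmse() applied to the training data divides by n instead
    rmse_n = np.sqrt(results.ssr / results.nobs)
    return rmse_dof, rmse_n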
We start with learning the model parameters based on the training data set. Then we evaluate the goodness of fit by calculating the RMSE. After that we evaluate the model on data which was never seen before by the model, by calculating the RMSE of the model applied to the test data set. This out-of-sample measure provides a clue on how well our model generalizes. If the model performs well on the training data set but performs poorly on the test data set, this is an indication of model overfitting, which is in general a bad thing. That is because we do not want our model to learn, or rather memorize, the training data set; we want to find a model that accounts for the unknown data-generating process underlying our observations.
The process of feature and model selection is the first step in model development and sometimes the most time consuming and challenging one. This part of the model development pipeline is more art than pure science. Domain knowledge, experience and a solid background in mathematics and statistics help during this endeavor. In this section we build the baseline model. Thereafter we will build increasingly more sophisticated models. We store each of our models and the associated prediction error (RMSE), both for the training set as well as for the test set, in a dictionary object for further usage.
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as smf

# load the DWD data set
dwd = pd.read_table(
    "https://userpage.fu-berlin.de/soga/data/raw-data/DWD.csv",
    index_col=0,
    sep=',',
)

# drop non-numeric columns and rows with missing values
df = dwd.drop(['STATION_NAME', 'FEDERAL_STATE', 'PERIOD'], axis=1).dropna()

# split the data into a training set (80 %) and a test set (20 %)
train_set = df.sample(random_state=42, frac=.8)
test_set = df.drop(train_set.index)
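Before we start modelling it is worth taking a quick look at the prepared data; a small sketch (output omitted here):

# quick sanity checks on the prepared data
print(df.columns.tolist())                    # available variables
print(train_set.shape, test_set.shape)        # sizes of the 80/20 split
print(df["MEAN_ANNUAL_RAINFALL"].describe())  # distribution of the response variable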
The baseline model is the simplest model we may build. It forms the baseline in terms of RMSE. Whatever we do in the future, we should always try to perform better than the baseline model. In our example the baseline model is just the arithmetic mean of the response variable MEAN_ANNUAL_RAINFALL. This model corresponds to a model without any explanatory variable and is encoded in statsmodels formula notation as response ~ 1.
baseline_model = smf.ols(formula='MEAN_ANNUAL_RAINFALL ~ 1', data=train_set)
results = baseline_model.fit()
results.params
Intercept    756.907975
dtype: float64
The intercept term corresponds to a value of $\beta_0 = 756.91$. Let us compare this value to the arithmetic mean of the response variable MEAN_ANNUAL_RAINFALL.
np.round(np.mean(train_set["MEAN_ANNUAL_RAINFALL"]),2)
756.91
As expected, the arithmetic mean yields the same number as $β_0$, the intercept of the baseline model.
We calculate the RMSE of the baseline model on the training data set and store the model object, the RegressionResults object named results and the output of our model_rmse(model_results) function in a dictionary named model_outcome.
# calculate the RMSE on the training set
print("RMSE on training set:", model_rmse(results))

# store the model object, the fitted results and the RMSE
# in the dictionary named model_outcome
# Note that we build a nested dictionary
model_outcome = {}
model_outcome["baseline"] = {
    "model": baseline_model,
    "results": results,
    "rmse": pd.DataFrame.from_dict({
        "name": pd.Index(["baseline model"]),
        "train_RMSE": model_rmse(results),
        "test_RMSE": np.nan,
    }),
}
RMSE on training set: 243.88215150947042
It makes perfect sense that the baseline model is not extremely useful. After all, we did not use any explanatory variable to predict the mean annual rainfall for a given weather station in Germany. To strengthen our intuition, and as long as we are dealing with low-dimensional regression models, we plot the regression line of the baseline model together with the data points in our training set.
_, ax = plt.subplots(figsize=(12, 6))
ax.scatter(train_set.index, train_set["MEAN_ANNUAL_RAINFALL"])
ax.hlines(results.params, xmin=0, xmax=1000, colors="red", label="Intercept")
ax.grid()
ax.set_ylabel("mean annual precipitation in mm")
ax.set_xlabel("index")
plt.legend()
plt.show()
Well, this looks as expected, but we can do better!
Recall our general procedure outlined above: after evaluating the model on the training data we assess its generalization error on the test data.
In order to wrap up the baseline model scenario let us calculate the generalization error of the baseline model. We apply the predict() method to the test data set test_set. Then we calculate the RMSE using the diff_rmse(obs, preds) function defined in the section above. Finally, for the sake of completeness, we store the RMSE of the baseline model on the test data set in the dictionary object named model_outcome.
# predict the response for the test data set
# (the intercept-only model predicts the fitted mean for every station)
pred = results.predict(test_set)

# calculate the RMSE for the test data set
test_rmse = diff_rmse(test_set["MEAN_ANNUAL_RAINFALL"], pred)
print("RMSE on test set:", test_rmse)

model_outcome["baseline"]["rmse"]["test_RMSE"] = test_rmse
RMSE on test set: 180.87701106302217
Before we move on let us just take a quick summarizing look at the model performance metrics of the baseline model:
model_outcome["baseline"]["rmse"]
|   | name           | train_RMSE | test_RMSE  |
|---|----------------|------------|------------|
| 0 | baseline model | 243.882152 | 180.877011 |
The simple linear regression model
In simple linear regression analysis, the relationship between two variables is modeled as
$$y=f(x)=\beta_0+\beta_1x.$$
As for the baseline model scenario in the section above we follow the general procedure.
Immediately the question arises which feature to pick. A heuristic approach is to check for correlations between the response variable and the explanatory variables. We have already done that in a previous section. The analysis revealed a positive correlation of the explanatory variable ALTITUDE with the response variable MEAN_ANNUAL_RAINFALL.
# Pearson correlation between ALTITUDE and MEAN_ANNUAL_RAINFALL
p_cor = np.corrcoef(df["ALTITUDE"], df["MEAN_ANNUAL_RAINFALL"])
print(np.round(p_cor[0, 1], 2))
0.76
The Pearson correlation coefficient is 0.76. Thus, we expect that by extending the baseline model with the explanatory variable ALTITUDE we can significantly increase the predictive power of the model.
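Instead of checking each candidate variable one by one, we may also rank all explanatory variables at once by their correlation with the response. A minimal sketch of this heuristic (computed on the training data; the exact values depend on the train/test split):

# Pearson correlation of every column with the response variable,
# sorted by absolute value (the response itself appears first with r = 1)
cor_with_response = (
    train_set.corr()["MEAN_ANNUAL_RAINFALL"]
    .sort_values(key=np.abs, ascending=False)
)
print(cor_with_response)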
OK, let us build a simple linear regression model to predict the mean annual rainfall recorded at weather stations across Germany from the altitude of the weather stations.
simple_alt_model = smf.ols(formula='MEAN_ANNUAL_RAINFALL ~ ALTITUDE', data=train_set)
results = simple_alt_model.fit()
results.summary()
|                   |                      |                     |          |
|-------------------|----------------------|---------------------|----------|
| Dep. Variable:    | MEAN_ANNUAL_RAINFALL | R-squared:          | 0.599    |
| Model:            | OLS                  | Adj. R-squared:     | 0.596    |
| Method:           | Least Squares        | F-statistic:        | 240.1    |
| Date:             | Sat, 24 Jun 2023     | Prob (F-statistic): | 9.89e-34 |
| Time:             | 21:27:26             | Log-Likelihood:     | -1052.4  |
| No. Observations: | 163                  | AIC:                | 2109.    |
| Df Residuals:     | 161                  | BIC:                | 2115.    |
| Df Model:         | 1                    |                     |          |
| Covariance Type:  | nonrobust            |                     |          |

|           | coef     | std err | t      | P>\|t\| | [0.025  | 0.975]  |
|-----------|----------|---------|--------|---------|---------|---------|
| Intercept | 592.2684 | 16.133  | 36.712 | 0.000   | 560.409 | 624.128 |
| ALTITUDE  | 0.6126   | 0.040   | 15.495 | 0.000   | 0.535   | 0.691   |

|                |        |                   |          |
|----------------|--------|-------------------|----------|
| Omnibus:       | 22.111 | Durbin-Watson:    | 2.077    |
| Prob(Omnibus): | 0.000  | Jarque-Bera (JB): | 30.099   |
| Skew:          | 0.790  | Prob(JB):         | 2.91e-07 |
| Kurtosis:      | 4.392  | Cond. No.         | 542.     |
Well, by looking at the model summary we see that the explanatory variable ALTITUDE is statistically significant. Further, we see an $R^2=0.599$, which gives the proportion of variation in the observed values that is explained by the regression model. The numbers look good, so let's plot the data:
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_set["ALTITUDE"], train_set["MEAN_ANNUAL_RAINFALL"], "o", label="Data")
# add the fitted regression line to the scatter plot
abline_plot(model_results=results, ax=ax, color="red", label="Regression line")
ax.grid()
ax.legend(loc="best")
plt.show()
Well, apparently there is some truth in the regression model we built. Let us continue with our procedure and calculate the RMSE on the training data set and then on the test data. Afterwards we compare the results with the baseline model.
# predict on the test data set and calculate the RMSE
pred = results.predict(test_set)
test_rmse = diff_rmse(test_set["MEAN_ANNUAL_RAINFALL"], pred)
print("RMSE on test set:", test_rmse)

model_outcome["simple_alt"] = {
    "model": simple_alt_model,
    "results": results,
    "rmse": pd.DataFrame.from_dict({
        "name": pd.Index(["simple alt model"]),
        "train_RMSE": model_rmse(results),
        "test_RMSE": test_rmse,
    }),
}
RMSE on test set: 138.85454382261386
rmses = pd.concat([model_outcome["baseline"]["rmse"], model_outcome["simple_alt"]["rmse"]])
Great! We dramatically increased the predictive power of our model, both on the training data set and on the test data set.
Let us build one more simple regression model. This time we pick the variable MAX_RAINFALL
as our explanatory variable. It seems reasonable to consider the amount of the highest rainfall event at a particular weather station as a proxy for the mean annual rainfall at that weather station.
max_rainfall_model = smf.ols(formula='MEAN_ANNUAL_RAINFALL ~ MAX_RAINFALL', data=train_set)
results = max_rainfall_model.fit()
results.summary()

# predict on the test data set and calculate the RMSE
pred = results.predict(test_set)
test_rmse = diff_rmse(test_set["MEAN_ANNUAL_RAINFALL"], pred)
print("RMSE on test set:", test_rmse)

model_outcome["max_rainfall"] = {
    "model": max_rainfall_model,
    "results": results,
    "rmse": pd.DataFrame.from_dict({
        "name": pd.Index(["max rainfall model"]),
        "train_RMSE": model_rmse(results),
        "test_RMSE": test_rmse,
    }),
}
RMSE on test set: 117.43789683775168
Let us once again compare the results we obtained so far.
rmses = pd.concat([rmses, model_outcome["max_rainfall"]["rmse"]])
rmses
|   | name               | train_RMSE | test_RMSE  |
|---|--------------------|------------|------------|
| 0 | baseline model     | 243.882152 | 180.877011 |
| 0 | simple alt model   | 154.992815 | 138.854544 |
| 0 | max rainfall model | 119.953630 | 117.437897 |
Awesome! Once again we increased the predictive power of our model. Just by looking at the numbers in the table above, one question should immediately come to mind: what if we combine both simple linear regression models and try to predict the response variable MEAN_ANNUAL_RAINFALL by using both explanatory variables? … Welcome to the domain of multiple linear regression!
Recall the multiple regression model:
$$ y_i = \beta_0 + \sum_{j=1}^d x_{ij}\beta_j + \epsilon_i \text{,} \quad i = 1,2,...,n\text{, } x_i\in \mathbb R^d. $$For the purpose of multiple linear regression analysis the formula may be extended by concatenating additional explanatory variables using the + sign. In order to encode the model
$$\text{MEAN\_ANNUAL\_RAINFALL} = \beta_0 + \beta_1 \cdot \text{ALTITUDE} + \beta_2 \cdot \text{MAX\_RAINFALL} + \epsilon$$
we write the formula expression as MEAN_ANNUAL_RAINFALL ~ ALTITUDE + MAX_RAINFALL. Let's give it a try: first we build the regression model, then we calculate the RMSE on the training data set, then on the test data set, and then we compare the models.
multi_alt_rain_model = smf.ols(formula='MEAN_ANNUAL_RAINFALL ~ ALTITUDE + MAX_RAINFALL', data=train_set)
results = multi_alt_rain_model.fit()

# predict on the test data set and calculate the RMSE
pred = results.predict(test_set)
test_rmse = diff_rmse(test_set["MEAN_ANNUAL_RAINFALL"], pred)
print("RMSE on test set:", test_rmse)

model_outcome["multi_alt_rain"] = {
    "model": multi_alt_rain_model,
    "results": results,
    "rmse": pd.DataFrame.from_dict({
        "name": pd.Index(["multi alt rain model"]),
        "train_RMSE": model_rmse(results),
        "test_RMSE": test_rmse,
    }),
}
RMSE on test set: 113.7463628599613
results.summary()
|                   |                      |                     |          |
|-------------------|----------------------|---------------------|----------|
| Dep. Variable:    | MEAN_ANNUAL_RAINFALL | R-squared:          | 0.768    |
| Model:            | OLS                  | Adj. R-squared:     | 0.766    |
| Method:           | Least Squares        | F-statistic:        | 265.4    |
| Date:             | Wed, 21 Jun 2023     | Prob (F-statistic): | 1.50e-51 |
| Time:             | 16:07:28             | Log-Likelihood:     | -1007.5  |
| No. Observations: | 163                  | AIC:                | 2021.    |
| Df Residuals:     | 160                  | BIC:                | 2030.    |
| Df Model:         | 2                    |                     |          |
| Covariance Type:  | nonrobust            |                     |          |

|              | coef      | std err | t      | P>\|t\| | [0.025   | 0.975]  |
|--------------|-----------|---------|--------|---------|----------|---------|
| Intercept    | -198.2681 | 74.013  | -2.679 | 0.008   | -344.437 | -52.099 |
| ALTITUDE     | 0.1324    | 0.054   | 2.471  | 0.015   | 0.027    | 0.238   |
| MAX_RAINFALL | 24.2740   | 2.241   | 10.831 | 0.000   | 19.848   | 28.700  |

|                |       |                   |          |
|----------------|-------|-------------------|----------|
| Omnibus:       | 0.317 | Durbin-Watson:    | 2.085    |
| Prob(Omnibus): | 0.853 | Jarque-Bera (JB): | 0.236    |
| Skew:          | 0.093 | Prob(JB):         | 0.889    |
| Kurtosis:      | 2.995 | Cond. No.         | 3.28e+03 |
The model summary indicates that both explanatory variables are statistically significant. Together, the two explanatory variables account for approximately 76.8% of the total variance ($R^2=0.768$). The numbers look fine, so let's plot the data:
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection = '3d')
ax.scatter3D(
test_set["ALTITUDE"].values,
test_set["MAX_RAINFALL"].values,
test_set["MEAN_ANNUAL_RAINFALL"].values,
)
ax.set_xlabel("ALTITUDE")
ax.set_ylabel("MAX_RAINFALL")
ax.set_zlabel("MEAN_ANNUAL_RAINFALL")
plt.show()
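To relate the point cloud to the fitted model, we may additionally draw the regression plane implied by the two coefficients. A minimal sketch, reusing the results object of the multiple regression model fitted above:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection="3d")
ax.scatter3D(
    test_set["ALTITUDE"].values,
    test_set["MAX_RAINFALL"].values,
    test_set["MEAN_ANNUAL_RAINFALL"].values,
)

# evaluate the fitted plane on a regular grid spanning the test data
alt_grid, rain_grid = np.meshgrid(
    np.linspace(test_set["ALTITUDE"].min(), test_set["ALTITUDE"].max(), 20),
    np.linspace(test_set["MAX_RAINFALL"].min(), test_set["MAX_RAINFALL"].max(), 20),
)
grid = pd.DataFrame({"ALTITUDE": alt_grid.ravel(), "MAX_RAINFALL": rain_grid.ravel()})
z_grid = results.predict(grid).values.reshape(alt_grid.shape)

ax.plot_surface(alt_grid, rain_grid, z_grid, alpha=0.3, color="red")
ax.set_xlabel("ALTITUDE")
ax.set_ylabel("MAX_RAINFALL")
ax.set_zlabel("MEAN_ANNUAL_RAINFALL")
plt.show()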
Let us once again compare the results we obtained so far.
rmses = pd.concat([rmses, model_outcome["multi_alt_rain"]["rmse"]])
rmses
|   | name                 | train_RMSE | test_RMSE  |
|---|----------------------|------------|------------|
| 0 | baseline model       | 243.882152 | 180.877011 |
| 0 | simple alt model     | 154.992815 | 138.854544 |
| 0 | max rainfall model   | 119.953630 | 117.437897 |
| 0 | multi alt rain model | 118.095746 | 113.746363 |
# store the comparison table for later use
rmses.reset_index().to_feather('30221_rmses.feather')
We see that the multiple regression model reduced the RMSE on the training set, whereas the RMSE on the test set improved only marginally. That means that the added flexibility causes the model to fit the training data better than all models developed so far; however, the higher flexibility did not substantially improve the generalization properties of the model. Such a finding calls for caution, as we should avoid overfitting the model.
Finally, let us visualize the interim results and check which models are leading our “model selection comparison contest”.
_, ax = plt.subplots(figsize=(10,6))
rmses.plot(kind="bar", x="name", ax=ax)
ax.set_xlabel("")
ax.set_ylabel("RMSE in mm")
plt.show()
Citation
The E-Learning project SOGA-Py was developed at the Department of Earth Sciences by Annette Rudolph, Joachim Krois and Kai Hartmann. You can reach us via mail by soga[at]zedat.fu-berlin.de.
Please cite as follows: Rudolph, A., Krois, J., Hartmann, K. (2023): Statistics and Geodata Analysis using Python (SOGA-Py). Department of Earth Sciences, Freie Universitaet Berlin.