# First, let's import the needed libraries.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime


## Your code here


def RMSE(sim, obs):
    """Placeholder used to demonstrate how a function is defined.

    Both arguments are ignored; a fixed confirmation message is returned.
    A later definition of ``RMSE`` in this file replaces this stub.
    """
    return "Built a function!"


## ....


def RMSE(sim, obs):
    """Return the root-mean-square error between ``sim`` and ``obs``.

    The squared element-wise differences are summed over all elements,
    divided by ``len(sim)`` and the square root of that quotient is returned.

    Note: for NumPy arrays the subtraction follows broadcasting rules, so
    inputs of different shapes (e.g. ``(n,)`` against ``(n, 1)``) are NOT
    compared element by element. Pass arrays of identical shape.
    """
    residuals = sim - obs
    mean_squared_error = np.sum(residuals**2) / len(sim)
    return np.sqrt(mean_squared_error)


# Load the temperature series that contains missing values (temp_NA) and the
# series it is compared against in the RMSE computations (temp_sample).
# Both frames are sorted by their index after loading.
temp_NA = pd.read_json(
    "http://userpage.fu-berlin.de/soga/soga-py/300/307000_time_series/NA_datasets_temp_NA.json"
).sort_index()

temp_sample = pd.read_json(
    "http://userpage.fu-berlin.de/soga/soga-py/300/307000_time_series/NA_datasets_temp_sample.json"
).sort_index()


temp_sample


plt.figure(figsize=(18, 6))
plt.plot(temp_sample, color="black")

plt.title("Weather station Berlin-Dahlem", fontsize=16)
plt.ylabel("Temperature in °C", fontsize=16)
plt.xlabel("Days", fontsize=16)
plt.show()


# Percentage of missing values per column of temp_NA.
# The count is divided by the length of temp_NA itself (not of another frame),
# and rounding happens after scaling to percent so that no floating-point
# noise (e.g. 7.000000000000001) leaks into the value shown in plot titles.
na_perc = round(temp_NA.isnull().sum() / len(temp_NA) * 100, 1)
na_perc

temp.NA    36.5
dtype: float64


# Plot the series with gaps; the title reports the share of missing values.
plt.figure(figsize=(18, 6))
plt.plot(temp_NA, color="black")

plt.title(f"Missing values: {na_perc.values[0]} %", fontsize=16)
plt.ylabel("Temperature in °C", fontsize=16)
plt.xlabel("Days", fontsize=16)
plt.show()


# forward-fill: propagate the previous valid value into each gap.
# DataFrame.ffill() replaces fillna(method="ffill"), whose `method` keyword
# is deprecated in recent pandas releases.
temp_NA_ffill = temp_NA.ffill()
temp_NA_ffill.head(10)


# back-fill: propagate the next valid value backwards into each gap.
# DataFrame.bfill() replaces the deprecated fillna(method="bfill").
temp_NA_bfill = temp_NA.bfill()
temp_NA_bfill.head(10)


## ERROR ##
# RMSE of the back-filled series against temp_sample.
rmse_NA = RMSE(temp_NA_bfill.values, temp_sample.values)
rmse_NA

1.768216699529771


# Back-filled series in red; the series with gaps is drawn afterwards in
# black, so red stays visible where the black line has gaps.
plt.figure(figsize=(18, 6))
plt.plot(temp_NA_bfill, label="imputed values", color="red")
plt.plot(temp_NA, color="black")
plt.xlabel("Days", fontsize=16)
plt.ylabel("Temperature in °C", fontsize=16)
plt.title(".fillna(method='bfill')", fontsize=16)
plt.text(x=0, y=20, s=f"RMSE ={round(rmse_NA,4)}", fontsize=16)
plt.legend()
plt.show()


# Fill every gap with the mean of the column it belongs to.
temp_NA_meanfill = temp_NA.fillna(temp_NA.mean())
temp_NA_meanfill.head(10)


# RMSE of the mean-filled series against temp_sample.
rmse_NA = RMSE(temp_NA_meanfill.values, temp_sample.values)
rmse_NA

4.392792211787441


# Mean-filled series in red with the series containing gaps drawn over it in
# black.
plt.figure(figsize=(18, 6))
plt.plot(temp_NA_meanfill, label="imputed values", color="red")
plt.plot(temp_NA, color="black")
plt.xlabel("Days", fontsize=16)
plt.ylabel("Temperature in °C", fontsize=16)
plt.title(".fillna() using overall mean", fontsize=16)
plt.text(x=0, y=20, s=f"RMSE ={round(rmse_NA,4)}", fontsize=16)
plt.legend()
plt.show()


# Linear interpolation of the gaps.
# interpolate() is called without inplace=True and therefore hands back a new
# object, so no additional copy is taken.
temp_NA_inter = temp_NA.interpolate(method="linear")
temp_NA_inter.head(10)


# RMSE of the linearly interpolated series against temp_sample.
rmse_NA = RMSE(temp_NA_inter.values, temp_sample.values)
rmse_NA

1.1095012631275054


# Linearly interpolated series in red with the series containing gaps drawn
# over it in black; the RMSE computed before is printed inside the axes.
plt.figure(figsize=(18, 6))
plt.plot(temp_NA_inter, label="imputed values", color="red")
plt.plot(temp_NA, color="black")
plt.xlabel("Days", fontsize=16)
plt.ylabel("Temperature in °C", fontsize=16)
plt.title(".interpolate(method='linear')", fontsize=16)
plt.text(x=0, y=20, s=f"RMSE ={round(rmse_NA,4)}", fontsize=16)
plt.legend()
plt.show()


# Read the CO2 JSON files from their URLs: co2_NA contains missing values,
# co2_sample is the series it is compared against in the RMSE computations.
# Both frames are sorted by their index after loading.
co2_NA = pd.read_json(
    "http://userpage.fu-berlin.de/soga/soga-py/300/307000_time_series/NA_datasets_co2_NA.json"
).sort_index()

co2_sample = pd.read_json(
    "http://userpage.fu-berlin.de/soga/soga-py/300/307000_time_series/NA_datasets_co2_sample.json"
).sort_index()


co2_sample


plt.figure(figsize=(18, 6))
plt.plot(co2_NA, color="black")

plt.title("Atmospheric carbon dioxide, Mauna Loa, Hawaii", fontsize=16)
plt.ylabel("$CO_2$", fontsize=16)
plt.xlabel("Time", fontsize=16)
plt.show()


## Your code here...


# Percentage of missing values per column of co2_NA.
# Rounding happens after scaling to percent so that the result carries no
# floating-point noise from multiplying an already rounded fraction by 100.
na_perc_high = round(co2_NA.isnull().sum() / len(co2_NA) * 100, 1)
na_perc_high

co2.NA    39.1
dtype: float64


## Your code here...


# backward-fill: propagate the next valid value backwards into each gap.
# DataFrame.bfill() replaces fillna(method="bfill"), whose `method` keyword
# is deprecated in recent pandas releases.
co2_NA_bfill = co2_NA.bfill()


## ERROR ##
# Back-filling cannot fill a gap at the end of the series (there is no later
# value to copy), so those rows are still NaN. They are excluded from BOTH
# series with the same boolean mask, which keeps the two arrays aligned and of
# identical shape. Removing elements from only one of them would make the
# subtraction inside RMSE broadcast a (n-1,) array against a (n, 1) array into
# a (n, n-1) matrix and produce a meaningless, hugely inflated error.
filled = co2_NA_bfill.notna().values
co2_NA_bfill_clean = co2_NA_bfill.values[filled]

rmse_NA = RMSE(co2_NA_bfill_clean, co2_sample.values[filled])
rmse_NA

188.1500393901449


## PLOT ##
# Back-filled CO2 series in red with the series containing gaps drawn over it
# in black; the RMSE computed before is printed inside the axes.
plt.figure(figsize=(18, 6))
plt.plot(co2_NA_bfill, label="imputed values", color="red")
plt.plot(co2_NA, color="black")
plt.xlabel("Time", fontsize=16)
plt.ylabel("$CO_2$", fontsize=16)
plt.title(".fillna(method='bfill')", fontsize=16)
plt.text(x=0, y=380, s=f"RMSE ={round(rmse_NA,4)}", fontsize=16)
plt.legend()
plt.show()


## IMPUTING BY LINEAR INTERPOLATION ##

# interpolate() is called without inplace=True and therefore hands back a new
# object, so no additional copy is taken.
co2_NA_inter = co2_NA.interpolate(method="linear")
co2_NA_inter.head(10)

## ERROR ##
# RMSE of the linearly interpolated series against co2_sample.
rmse_NA = RMSE(co2_NA_inter.values, co2_sample.values)
rmse_NA

## PLOT ##
# Interpolated CO2 series in red with the series containing gaps drawn over it
# in black.
plt.figure(figsize=(18, 6))
plt.plot(co2_NA_inter, label="imputed values", color="red")
plt.plot(co2_NA, color="black")
plt.xlabel("Time", fontsize=16)
plt.ylabel("$CO_2$", fontsize=16)
plt.title(".interpolate(method='linear')", fontsize=16)
plt.text(x=0, y=380, s=f"RMSE ={round(rmse_NA,4)}", fontsize=16)
plt.legend()
plt.show()

	temp.sample
0	6.2
1	4.7
2	7.0
3	7.9
4	6.7
...	...
737	3.3
738	6.2
739	8.5
740	3.4
741	5.5

	temp.NA
0	6.2
1	6.2
2	7.0
3	7.0
4	7.0
5	6.2
6	3.7
7	5.1
8	6.4
9	2.6

	temp.NA
0	6.2
1	7.0
2	7.0
3	6.2
4	6.2
5	6.2
6	3.7
7	5.1
8	6.4
9	2.6

	temp.NA
0	6.200000
1	10.315924
2	7.000000
3	10.315924
4	10.315924
5	6.200000
6	3.700000
7	5.100000
8	6.400000
9	2.600000

	temp.NA
0	6.200000
1	6.600000
2	7.000000
3	6.733333
4	6.466667
5	6.200000
6	3.700000
7	5.100000
8	6.400000
9	2.600000

Imputing daily temperature data¶

forward-fill to propagate the previous value forward: `.fillna(method="ffill")`¶

back-fill to propagate the next values backward: `.fillna(method='bfill')`¶

replacing each NA with aggregated values `.fillna()`¶

replacing `NA` with interpolated values: `.interpolate()`¶

Imputing the carbon dioxide data set¶

back-fill to propagate the next values backward: `.fillna(method='bfill')`¶

replacing `NA` with interpolated values: `.interpolate()`¶

	co2.sample
0	369.29
1	369.54
2	370.60
3	371.82
4	371.58
...	...
187	398.93
188	397.63
189	398.29
190	400.16
191	401.85

Imputing daily temperature data¶

forward-fill to propagate the previous value forward: .fillna(method="ffill")¶

back-fill to propagate the next values backward: .fillna(method='bfill')¶

replacing each NA with aggregated values .fillna()¶

replacing NA with interpolated values: .interpolate()¶

Imputing the carbon dioxide data set¶

back-fill to propagate the next values backward: .fillna(method='bfill')¶

replacing NA with interpolated values: .interpolate()¶

forward-fill to propagate the previous value forward: `.fillna(method="ffill")`¶

back-fill to propagate the next values backward: `.fillna(method='bfill')`¶

replacing each NA with aggregated values `.fillna()`¶

replacing `NA` with interpolated values: `.interpolate()`¶

back-fill to propagate the next values backward: `.fillna(method='bfill')`¶

replacing `NA` with interpolated values: `.interpolate()`¶