We get started with our exercise by collecting data. The data is generated by a function unknown to you. This precondition makes the example more realistic, as in real applications we do not know the exact specification of the underlying data generation process either. At the end of this section we will uncover the secret of the data generation process. As always, we first import the required packages.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

rs = np.random.RandomState(7)  # fix the random seed for reproducibility
n = 25
x = rs.uniform(0, 1, n)  # predictor: n points drawn uniformly from [0, 1]
y = np.sin(2 * np.pi * x) + rs.normal(0, 0.35, n)  # response: the (for now secret) data generation process
poly_data = pd.DataFrame.from_dict({"x": x, "y": y})
This is the data, our observations, in tabular form. We have 25 data points; each data point is an $(x, y)$ pair.
poly_data.head()
|   | x        | y         |
|---|----------|-----------|
| 0 | 0.076308 | 0.486031  |
| 1 | 0.779919 | -1.369874 |
| 2 | 0.438409 | 0.655364  |
| 3 | 0.723465 | -0.574302 |
| 4 | 0.977990 | -0.640220 |
Here is the data in the form of a scatter plot:
fig = plt.figure(figsize=[12,4])
plt.scatter(x, y)
plt.grid()
plt.xlabel("x")
plt.ylabel("y")
plt.show()
The package ecosystem of Python provides powerful functionality for fitting a polynomial to data.
One helpful module is statsmodels. It provides the ordinary least squares model through the OLS()
class, which we already know from simple linear regression.
However, in order to fit a $k^{th}$-order polynomial we need to add additional arguments to the function call.
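As a reminder, the simple linear regression we already know looks like this in that notation (a minimal sketch using poly_data from above):

# baseline: plain simple linear regression; params holds intercept and slope
baseline = sm.OLS.from_formula("y ~ x", data=poly_data).fit()
print(baseline.params)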
In addition, there are two different options for coding a polynomial regression. The first is the Python-style notation, in which the polynomial terms are generated by a preprocessing step. For a 3rd-order polynomial this option looks like this in pseudocode:

OLS.from_formula(response ~ polynomial(predictor, 3))

The second is the R-style formula notation:

OLS.from_formula(response ~ predictor + I(predictor^2) + I(predictor^3))

We need the I(...) notation to prevent any special interpretation of operators in the model formula.
The difference between these two options is that in the first case the polynomial terms are constructed for us by a preprocessing step; we can use the PolynomialFeatures preprocessor from scikit-learn for this. Note, however, that PolynomialFeatures already includes a constant (bias) column of its own, so combining it with sm.add_constant() duplicates columns in the design matrix. The predictions are unaffected, but each coefficient estimate is split across the duplicated columns, so the numerical values for $\beta_i$ may no longer be plugged into the polynomial regression equation from above. In contrast, if we apply the second option, the resulting regression coefficients may be plugged into the regression equation and yield the correct numerical result. However, as the powers of the predictor are correlated (the problem of multicollinearity), it is often difficult to identify which polynomial term significantly improves the regression.
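To see this multicollinearity directly, here is a quick check (an illustrative sketch using the x generated above):

# raw powers of x are strongly correlated with each other
powers = pd.DataFrame({"x": x, "x2": x**2, "x3": x**3})
print(powers.corr().round(3))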
Note: The prediction itself will not be affected at all by the choice of notation.
To alleviate the confusion we calculate an example. For this we construct two 2nd-order polynomial models for poly_data:
polynomial_features = PolynomialFeatures(degree=2)
# caution: PolynomialFeatures already includes a bias column, so together with
# sm.add_constant() the design matrix contains duplicated (collinear) columns
xp = polynomial_features.fit_transform(sm.add_constant(x))

m1 = sm.OLS(y, xp).fit()  # Python-style
m2 = sm.OLS.from_formula('y ~ x + I(x**2)', data=poly_data).fit()  # R-style formula
m1.params
array([ 0.13582645, 0.13582645, -0.23447937, 0.13582645, -0.23447937, -0.74967182])
m2.params
Intercept    0.407479
x           -0.468959
I(x ** 2)   -0.749672
dtype: float64
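Note how the coefficients of m1 are split across the duplicated columns: the three identical constant columns share the intercept ($3 \times 0.135826 \approx 0.407479$) and the two identical linear columns share the slope ($2 \times (-0.234479) \approx -0.468959$). The coefficients of m2, in contrast, can be read off directly. A quick check confirms that both models nevertheless produce identical fitted values:

# both design matrices span the same column space, so the fits agree
print(np.allclose(m1.fittedvalues, m2.fittedvalues))  # expected: True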
Now that we know the notation, we build six different models with $k = 1, 2, 3, 5, 8, 10$. For each model we calculate the RMSE. Finally, we plot the data together with the regression curve of each particular model. For convenience we construct a loop to reduce the amount of coding.
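Recall the definition of the root mean squared error that we compute inside the loop:

$$\mathrm{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (\hat{y}_i - y_i)^2}$$

where $\hat{y}_i$ denotes the fitted (or predicted) value and $y_i$ the observed value.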
# plotting setup
fig, ax = plt.subplots(3, 2, figsize=(12, 8), sharex=True, sharey=True)

# setup
vals = pd.DataFrame.from_dict({"x": np.arange(poly_data.x.min(), poly_data.x.max(), 0.01)})  # set up vector used for prediction
rmse_values = []

# run loop
k = [1, 2, 3, 5, 8, 10]  # k-th order
for i, axis in enumerate(ax.flat):
    # build model of order k[i]
    polynomial_features = PolynomialFeatures(degree=k[i])
    xp = polynomial_features.fit_transform(sm.add_constant(x))
    model = sm.OLS(y, xp).fit()
    # calculate RMSE and store it for further usage
    rmse_values.append(np.sqrt(np.sum((model.fittedvalues - poly_data.y)**2) / len(poly_data.y)))
    # predict on the grid
    xvals = polynomial_features.fit_transform(sm.add_constant(vals))
    predictions = model.predict(xvals)
    # plot
    axis.scatter(poly_data.x, poly_data.y)
    axis.plot(vals, predictions)
    axis.grid()
    axis.set_title(f"degree of polynomial = {k[i]}, RMSE = {np.round(rmse_values[i], 3)}")

plt.tight_layout()
plt.show()
RMSE = pd.DataFrame.from_dict({"kth_order": k, "value": rmse_values})
Awesome, pretty plots! The figure shows that as we increase $k$, the order of the polynomial, the curve becomes more flexible and fits the data better and better. The better the data is fitted, the lower the error (RMSE) becomes.
In order to visualize this behavior we plot the RMSE against $k$:
plt.plot(RMSE["kth_order"], RMSE["value"], 'o-')
plt.grid()
plt.xlabel("$k^{th}$-order")
plt.ylabel("RMSE")
plt.show()
Hence, once again the question arises: which is the best polynomial to fit the data? Do we believe that the 10th-order polynomial fits the underlying data generation process best? Even though we obtain an excellent fit to the observed data by increasing the order of the polynomial, it remains questionable whether a high-order polynomial generalizes well. Imagine we conduct a new measurement campaign and receive new data based on the same data generation process. Do you believe the wildly oscillating curve of a high-order polynomial will still fit the data well? No, probably not!
This problem is known as overfitting. Recall that the goal is to learn the model parameters from the data. Thus, we are interested in a model that generalizes well, not necessarily one that fits the observations perfectly.
So, how do we solve the problem? How do we determine the best $k^{th}$-order polynomial for our data set? Well, there are many methods and strategies to counteract overfitting. In this section we follow a simple approach. First, we split the data set into two parts. We call one part the training set and the other part the validation set. Then we use all the data in the training set to learn the model parameters $\beta_i$, in the same fashion as we did above. Thereafter, we apply the trained model to predict the data of the validation set and evaluate the model performance by computing the RMSE. Thus, we use the validation set to optimize the model's complexity, given by $k$.
Unfortunately, if we want to learn from data we ultimately need data to learn from. So far we worked with 25 observations. That is not much. In real life applications we would probably have to obtain new observations by conducting a new measurement campaign. In our exercise, however, we may generate more data fairly easily. Thus, we continue this example with a new data set of 150 observations.
Let us plot the data!
n1 = 150
x = rs.uniform(0, 1, n1)
y = np.sin(2 * np.pi * x) + rs.normal(0, 0.3, n1)
new_poly_data = pd.DataFrame.from_dict({"x": x, "y": y})
plt.scatter(new_poly_data.x, new_poly_data.y)
plt.grid()
plt.xlabel("x")
plt.ylabel("y")
plt.show()
Now we are ready to build our training and validation set.
For this we use the pandas.DataFrame.sample() method to draw a random sample as our training data and then drop these rows (identified by their index) to obtain our validation data, stored as test in the code below.
We split the data in such a way that 65% of the data is assigned to the training set and the remaining 35% to the validation set.
train = new_poly_data.sample(frac=.65, random_state=0)
test = new_poly_data.drop(train.index)
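A quick sanity check of the resulting split sizes (with frac=.65 of 150 observations we expect roughly a 98/52 split):

# verify the 65/35 split
print(len(train), len(test))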
In this exercise we are going to build 14 models with $k = 1, 2, \ldots, 14$. We evaluate each of the 14 polynomial regression models by calculating the RMSE on the training set. Further, after building the model and obtaining the model parameters $\beta_i$, we use the model to predict the response variable in the validation set. Again, we rely on the RMSE to evaluate the predictions on the validation set. Finally, we plot the RMSE of each model for both the training set and the validation set. Based on the RMSE we assess the generalization of the model.
We already developed a nice piece of code above. Thus, we only modify and extend the existing code:
rmse_train_values = []
rmse_test_values = []

# run loop
k = np.arange(1, 15)  # k-th order
for i in k:
    # build model of order i on the training set
    polynomial_features = PolynomialFeatures(degree=i)
    xp = polynomial_features.fit_transform(sm.add_constant(train.x))
    model = sm.OLS(train.y, xp).fit()
    # calculate RMSE on the training set and store it for further usage
    rmse_train_values.append(
        np.sqrt(np.sum((model.fittedvalues - train.y)**2) / len(train.y))
    )
    # predict the validation set and calculate its RMSE
    xvals = polynomial_features.fit_transform(sm.add_constant(test.x))
    predictions = model.predict(xvals)
    rmse_test_values.append(
        np.sqrt(np.sum((predictions - test.y)**2) / len(test.y))
    )

RMSE = pd.DataFrame.from_dict({
    "kth_order": k,
    "test_values": rmse_test_values,
    "train_values": rmse_train_values
})
plt.figure(figsize=(12, 4))
plt.plot(RMSE.kth_order, RMSE.train_values, 'o-', label="train values")
plt.plot(RMSE.kth_order, RMSE.test_values, 'o-', label="test values")
plt.grid()
plt.legend()
plt.xlabel("$k^{th}$-order")
plt.ylabel("RMSE")
plt.show()
The figure shows that the error on the training data (blue line) decreases continuously. This makes perfect sense: the more complex the model becomes by increasing $k$, the better the model fits the training data. We observed the same behavior in the section above, when we trained our model with just 25 observations. If we take a look at the RMSE for the validation set (orange line), we see that with increasing $k$, and thus increasing model complexity, the error decreases at first. However, there is a sweet spot, indicated by the lowest RMSE, where the model is just complex enough to generalize well to the so far unseen validation data. If the model complexity increases further, the RMSE starts to increase, too. This indicates that the model is overfitting: it memorizes the training data well, but it becomes less predictive for so far unseen data, such as the data of the validation set. Take a look at the figure above to see that the lowest error on the validation set is obtained for a regression model of 3rd order.
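We can confirm this programmatically by extracting the order with the lowest validation RMSE from the RMSE data frame:

# polynomial order with the lowest validation RMSE
best_k = RMSE.loc[RMSE.test_values.idxmin(), "kth_order"]
print(best_k)  # expected: 3, in line with the discussion above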
In the previous section we found that a 3rd-order polynomial regression model performs best on the validation set. Now we plot that model.
# setup
# set up vector used for prediction
vals = np.arange(new_poly_data.x.min(), new_poly_data.x.max(), 0.01)

# refit the 3rd-order model on the training set
polynomial_features = PolynomialFeatures(degree=3)
xp = polynomial_features.fit_transform(sm.add_constant(train.x))
model = sm.OLS(train.y, xp).fit()

# predict on the grid
xvals = polynomial_features.fit_transform(sm.add_constant(vals))
predictions = model.predict(xvals)
fig, ax = plt.subplots()
ax.plot(train.x, train.y, "o", label="train data")
ax.plot(test.x, test.y, "o", label="test data")
ax.plot(vals, np.sin(2 * np.pi * vals), '-', label="signal")
ax.plot(vals, predictions, "-", label="predictions")
ax.legend(loc="best")
plt.show()
The figure shows that our model does a decent job of fitting the data, so we can be quite satisfied with it. The green curve also uncovers the secret from the beginning of this section: the data was generated by the signal $\sin(2 \pi x)$ plus Gaussian noise.
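As a final check (a small sketch reusing the fitted model from above), we can compute the RMSE of this 3rd-order model on the validation set; it should match the value for $k = 3$ in the RMSE data frame:

# out-of-sample RMSE of the final 3rd-order model
xtest = polynomial_features.fit_transform(sm.add_constant(test.x))
rmse_final = np.sqrt(np.mean((model.predict(xtest) - test.y)**2))
print(np.round(rmse_final, 3))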
Citation
The E-Learning project SOGA-Py was developed at the Department of Earth Sciences by Annette Rudolph, Joachim Krois and Kai Hartmann. You can reach us via email at soga[at]zedat.fu-berlin.de.
Please cite as follows: Rudolph, A., Krois, J., Hartmann, K. (2023): Statistics and Geodata Analysis using Python (SOGA-Py). Department of Earth Sciences, Freie Universitaet Berlin.