Regression validation, or regression diagnostics, is a set of procedures used to assess the numerical results of a regression analysis. These procedures include graphical and quantitative analyses as well as formal statistical hypothesis tests. In this section we focus on the first two, graphical and quantitative analysis. Statistical hypothesis tests for regression problems are covered in the section on hypothesis tests.

Coefficient of Determination¶

The Coefficient of Determination, also denoted as $R^2$, is the proportion of variation in the observed values explained by the regression equation. In other words, $R^2$ is a statistical measure of how well the regression line approximates the real data points; thus, it is a measure of the goodness of fit of the model.

The total variation of the response variable $y$ is based on the deviation of each observed value $y_i$ from the mean value $\bar y$. This quantity is called total sum of squares, SST, and is given by

$$SST = \sum (y_i - \bar y)^2\text{.}$$

This total sum of squares (SST) can be decomposed into two parts: the deviation explained by the regression line, $\hat y_i-\bar y$, and the remaining unexplained deviation, $y_i-\hat y_i$. The amount of variation that is explained by the regression is called the sum of squares due to regression, SSR, and is given by

$$SSR = \sum (\hat y_i- \bar y)^2\text{.}$$

The ratio of the sum of squares due to regression (SSR) and the total sum of squares (SST) is called the coefficient of determination and is denoted $R^2$:

$$R^2 = \frac{SSR}{SST}\, .$$

$R^2$ lies between 0 and 1. A value near 0 suggests that the regression equation explains little of the variation in the data. An $R^2$ of 1 indicates that the regression line fits the data perfectly.

Just for the sake of completeness, the variation in the observed values of the response variable that is not explained by the regression is called the sum of squared errors of prediction, SSE, and is given by

$$SSE = \sum (y_i-\hat y_i)^2\text{.} $$

Recall that SSE is the quantity that is minimized to obtain the regression line that best describes the data; this is known as the ordinary least squares (OLS) method. Note that $SST = SSR + SSE$, so the coefficient of determination can equivalently be written as $R^2 = 1 - SSE/SST$.
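To make the decomposition concrete, here is a small self-contained sketch (synthetic toy data, made up for illustration only) that computes SST, SSR and SSE by hand and checks the identities above:

In [ ]:
import numpy as np

# synthetic toy data, for illustration only
rng = np.random.default_rng(0)
x = rng.uniform(150, 200, 30)                   # e.g. heights
y = -55 + 0.75 * x + rng.normal(0, 5, 30)       # e.g. weights with noise

slope, intercept = np.polyfit(x, y, 1)          # ordinary least squares fit
y_hat = intercept + slope * x                   # fitted values

sst = np.sum((y - y.mean()) ** 2)               # total sum of squares
ssr = np.sum((y_hat - y.mean()) ** 2)           # sum of squares due to regression
sse = np.sum((y - y_hat) ** 2)                  # sum of squared residuals

print(np.isclose(sst, ssr + sse))               # SST = SSR + SSE
print(ssr / sst, 1 - sse / sst)                 # both equal R^2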

The statsmodels summary() method¶

A fundamental tool for regression diagnostics in the statsmodels package is the summary() method of the RegressionResults class. Calling OLS() returns an instance of the OLS model class; calling its fit() method returns an instance of the RegressionResults class. This object contains the model results, which can be inspected by applying the summary() method.

For demonstration purposes we set up the same model as in the previous section.

In [186]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
%matplotlib inline 
In [2]:
students = pd.read_csv("https://userpage.fu-berlin.de/soga/200/2010_data_sets/students.csv")
n = 24
students_sample = students.sample(n, random_state=14)
df = students_sample[['height', 'weight']]
x = df['height']
y = df['weight']
x_train = sm.add_constant(x)
model = sm.OLS(y, x_train)
fit = model.fit()
fit.summary()
Out[2]:
OLS Regression Results
Dep. Variable: weight R-squared: 0.891
Model: OLS Adj. R-squared: 0.886
Method: Least Squares F-statistic: 178.9
Date: Thu, 09 Mar 2023 Prob (F-statistic): 4.81e-12
Time: 08:53:19 Log-Likelihood: -54.440
No. Observations: 24 AIC: 112.9
Df Residuals: 22 BIC: 115.2
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const -55.1467 9.568 -5.764 0.000 -74.990 -35.304
height 0.7494 0.056 13.376 0.000 0.633 0.866
Omnibus: 3.844 Durbin-Watson: 2.262
Prob(Omnibus): 0.146 Jarque-Bera (JB): 2.935
Skew: 0.855 Prob(JB): 0.230
Kurtosis: 2.900 Cond. No. 3.28e+03


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.28e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
  • The left column of the summary() output starts with information about the dependent variable, the model and the estimation method.

  • Below, we find information about the date and time of the analysis, the number of observations, the degrees of freedom of the residuals and of the model, and the covariance type.

  • The right column starts with $R^2$, the squared Pearson correlation coefficient, also known as the coefficient of determination, and the adjusted $R^2$, a measure used for feature selection in regression analyses with multiple predictors (multiple regression).

  • The lines below show the $F$-statistic and its p-value, followed by further measures of model quality (log-likelihood, Akaike information criterion and Bayesian information criterion).

  • The next lines give the intercept (const) and the regression coefficient for height, together with their standard errors, t-values, p-values and confidence intervals.

  • Finally, we find more advanced parameters that we will not touch upon in this course.
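Most of the quantities reported by summary() can also be accessed directly as attributes of the RegressionResults object, which is convenient for further processing. A brief sketch, using the fit object created above:

In [ ]:
# selected attributes of the RegressionResults instance `fit`
print(fit.params)                      # intercept (const) and slope (height)
print(fit.bse)                         # standard errors of the coefficients
print(fit.tvalues, fit.pvalues)        # t-statistics and p-values
print(fit.conf_int())                  # 95% confidence intervals
print(fit.rsquared, fit.rsquared_adj)  # R^2 and adjusted R^2
print(fit.fvalue, fit.f_pvalue)        # F-statistic and its p-value
print(fit.aic, fit.bic)                # information criteria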

Diagnostic plots¶

It is important to realize that running a linear regression analysis with the statsmodels package, or any other statistical software suite, produces a set of numbers, including a p-value, so you can immediately state whether the results are statistically significant (or not). But are we done after reporting the significance of the results?

Consider a very famous data set, known as Anscombe's quartet. Anscombe's quartet consists of four data sets (I, II, III, IV). It is included in the seaborn package and has the following form:

In [3]:
anscombe = sns.load_dataset("anscombe")
anscombe
Out[3]:
dataset x y
0 I 10.0 8.04
1 I 8.0 6.95
2 I 13.0 7.58
3 I 9.0 8.81
4 I 11.0 8.33
5 I 14.0 9.96
6 I 6.0 7.24
7 I 4.0 4.26
8 I 12.0 10.84
9 I 7.0 4.82
10 I 5.0 5.68
11 II 10.0 9.14
12 II 8.0 8.14
13 II 13.0 8.74
14 II 9.0 8.77
15 II 11.0 9.26
16 II 14.0 8.10
17 II 6.0 6.13
18 II 4.0 3.10
19 II 12.0 9.13
20 II 7.0 7.26
21 II 5.0 4.74
22 III 10.0 7.46
23 III 8.0 6.77
24 III 13.0 12.74
25 III 9.0 7.11
26 III 11.0 7.81
27 III 14.0 8.84
28 III 6.0 6.08
29 III 4.0 5.39
30 III 12.0 8.15
31 III 7.0 6.42
32 III 5.0 5.73
33 IV 8.0 6.58
34 IV 8.0 5.76
35 IV 8.0 7.71
36 IV 8.0 8.84
37 IV 8.0 8.47
38 IV 8.0 7.04
39 IV 8.0 5.25
40 IV 19.0 12.50
41 IV 8.0 5.56
42 IV 8.0 7.91
43 IV 8.0 6.89

With the Anscombe data set we can visualize several issues that may arise with regression models. We therefore calculate some descriptive statistics for each of the four $(x, y)$ data sets.

First, we split the data into the four subsets and calculate the mean of x and y for each subset.

In [4]:
ds_i = anscombe.query("dataset == 'I'")
ds_ii = anscombe.query("dataset == 'II'")
ds_iii = anscombe.query("dataset == 'III'")
ds_iv = anscombe.query("dataset == 'IV'")
mean_i = ds_i.mean(numeric_only=True)
mean_ii = ds_ii.mean(numeric_only=True)
mean_iii = ds_iii.mean(numeric_only=True)
mean_iv = ds_iv.mean(numeric_only=True)
print(mean_i['x'], mean_ii['x'], mean_iii['x'], mean_iv['x'])
print(mean_i['y'], mean_ii['y'], mean_iii['y'], mean_iv['y'])
9.0 9.0 9.0 9.0
7.500909090909093 7.50090909090909 7.5 7.500909090909091

The values either match exactly or are very close to each other!

Now we calculate the variance of x and y for each subset.

In [5]:
var_i = ds_i[['x', 'y']].var(ddof=0)    # population variance (ddof=0), as np.var computes
var_ii = ds_ii[['x', 'y']].var(ddof=0)
var_iii = ds_iii[['x', 'y']].var(ddof=0)
var_iv = ds_iv[['x', 'y']].var(ddof=0)
print(var_i['x'], var_ii['x'], var_iii['x'], var_iv['x'])
print(var_i['y'], var_ii['y'], var_iii['y'], var_iv['y'])
10.0 10.0 10.0 10.0
3.7520628099173554 3.752390082644628 3.747836363636364 3.7484082644628103
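As an aside, the same per-group summaries can be obtained more compactly with a pandas groupby; a minimal sketch (note that the built-in var aggregation uses the sample variance, ddof=1, whereas the computation above uses ddof=0):

In [ ]:
# mean and variance of x and y for all four data sets in one call
anscombe.groupby("dataset")[["x", "y"]].agg(["mean", "var"])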
In [6]:
ds_i
Out[6]:
dataset x y
0 I 10.0 8.04
1 I 8.0 6.95
2 I 13.0 7.58
3 I 9.0 8.81
4 I 11.0 8.33
5 I 14.0 9.96
6 I 6.0 7.24
7 I 4.0 4.26
8 I 12.0 10.84
9 I 7.0 4.82
10 I 5.0 5.68

Not exactly the same, but definitely very close to each other. Finally, we build a linear model for each subset using the OLS() function and calculate the model coefficients and $R^2$, the coefficient of determination.

In [7]:
x_train_i = sm.add_constant(ds_i['x'])
x_train_ii = sm.add_constant(ds_ii['x'])
x_train_iii = sm.add_constant(ds_iii['x'])
x_train_iv = sm.add_constant(ds_iv['x'])
model_i = sm.OLS(ds_i['y'], x_train_i).fit()
model_ii = sm.OLS(ds_ii['y'], x_train_ii).fit()
model_iii = sm.OLS(ds_iii['y'], x_train_iii).fit()
model_iv = sm.OLS(ds_iv['y'], x_train_iv).fit()
print(model_i.params, "\n", model_ii.params, "\n", model_iii.params, "\n", model_iv.params)
const    3.000091
x        0.500091
dtype: float64 
 const    3.000909
x        0.500000
dtype: float64 
 const    3.002455
x        0.499727
dtype: float64 
 const    3.001727
x        0.499909
dtype: float64

Amazing! They are nearly the same! And now $R^2$:

In [8]:
print(model_i.rsquared, model_ii.rsquared, model_iii.rsquared, model_iv.rsquared)
0.666542459508775 0.6662420337274844 0.6663240410665593 0.6667072568984652

Wow, what an analysis! We have thrown a lot of different statistical methods at the four data sets, and honestly, they appear very similar to each other.

Are we now done with our analysis? No, not yet! We should always check whether the model actually works well for the data. An easy way to do that is to visualize the data. Let us plot Anscombe's data sets, including the regression lines.

In [9]:
sns.lmplot(
    x="x", 
    y="y", 
    data=anscombe, 
    hue="dataset", 
    col="dataset", 
    col_wrap=2, 
    truncate=False,
    ci=95,
    height=4,
)
Out[9]:
<seaborn.axisgrid.FacetGrid at 0x7ff75ddf36a0>

What a surprise! The main takeaway of this exercise is that we must check in many different ways whether a model works well for the data. We pay attention to regression results such as slope coefficients, p-values or $R^2$, which tell us how well a model represents the given data. However, that is not the whole story. We also need to apply visual diagnostics. Visual inspection helps to evaluate whether the linear regression assumptions are met and to identify outliers, influential observations and so-called [leverage points](https://en.wikipedia.org/wiki/Leverage_(statistics)), which affect the numerical output of the regression analysis.

Residual analysis¶

The residual of an observation is the difference between the observed value and the fitted value, $(y_i-\hat y_i)$. It is what is left over after fitting a model to the data. The sum of squared errors of prediction (SSE), also known as the sum of squared residuals or the error sum of squares, is an indicator of how well a model represents the data.

If the absolute residual, defined for observation $i$ as $e_i = y_i - \hat y_i$, is unusually large, the observation may come from a different population, or there may have been an error in making or recording the observation.

Compare the plots of data sets I and III above. Obviously, one data point in Anscombe's data set III shows an unusually large residual. Such a data point needs special attention, as it influences the regression analysis. There is no overarching rule for how to deal with outliers, but depending on the domain knowledge of the researcher, there are cases where one may decide to exclude such an outlier from the analysis.
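As a quick illustration with the students model fitted above (stored in fit), the residuals can be extracted directly and the largest absolute residuals inspected; the choice to look at the two largest values is arbitrary and only for illustration:

In [ ]:
resid = fit.resid                                        # e_i = y_i - y_hat_i
print(resid.abs().sort_values(ascending=False).head(2))  # largest absolute residuals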

In addition, we may analyse the residuals to check whether the linear regression assumptions are met. Regression residuals should be approximately normally distributed; that is, the regression should explain the structure in the data, and whatever is left over should be just noise, caused by measurement errors or many small uncorrelated factors. The assumptions can be checked graphically by a plot of the residuals against the values of the predictor variable. In such a residual plot, the residuals should be randomly scattered about $0$ and the variation around $0$ should be constant.

Prior to plotting, it is common to standardize the residuals (a short sketch for the students model follows the list below).

If the assumptions for regression inferences are met, the following two conditions should hold (Weiss 2010):

  • A plot of the residuals (residual plot) against the values of the predictor variable should fall roughly in a horizontal band centered and symmetric about the x-axis.

  • A normal probability plot of the residuals should be roughly linear.
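A minimal sketch of both checks for the students model fitted above, using the internally studentized (standardized) residuals provided by statsmodels (x and fit refer to the objects created earlier):

In [ ]:
infl = fit.get_influence()
std_resid = infl.resid_studentized_internal      # standardized residuals

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# residual plot: standardized residuals vs. predictor
ax1.scatter(x, std_resid)
ax1.axhline(0, color="grey", linestyle="--")
ax1.set_xlabel("height")
ax1.set_ylabel("standardized residual")

# normal probability (Q-Q) plot of the standardized residuals
sm.qqplot(std_resid, line="s", ax=ax2)
plt.tight_layout()
plt.show()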

In [10]:
sns.set_theme(style="whitegrid")

rs = np.random.RandomState(7)
n = 200
x = rs.uniform(-100, 100, n)
y_1 = 5 * x + rs.uniform(0, 25, n)
y_2 = x ** 2 + rs.uniform(0, 25**2, n)

def h(x):
    return 100+.5*x**2
# eps = abs(rs.normal(0, h(x), n))

eps = np.arange(0, n)*h(x)

y_3 = 4*abs(x) + rs.normal(25, eps, n)

fig1 = plt.figure(figsize=(12, 3))
sns.residplot(x=x, y=y_1, color="g")
plt.title("no violation")
plt.show()

fig2 = plt.figure(figsize=(12, 3))
sns.residplot(x=x, y=y_2, color="g")
plt.title("violation of linearity")
plt.show()

fig3 = plt.figure(figsize=(12, 3))
sns.residplot(x=abs(x), y=y_3, color="g")
plt.title("violation of constant standard deviation (Heteroscedasticity)")
plt.show()

Only in the uppermost plot are the residuals fairly evenly scattered around zero; in the two lower plots this is not the case, indicating that the linear model assumptions are not fulfilled for those models.

In [11]:
from scipy import stats

def standardize(x):
    return (x - x.mean())/(x.std())

df = pd.DataFrame({
    'x': standardize(x), 
    'y_1': standardize(y_1), 
    'y_2': standardize(y_2), 
    'y_3': standardize(y_3), 
})
df.head(10)
Out[11]:
x y_1 y_2 y_3
0 -1.454424 -1.432784 1.388205 -0.061084
1 1.022916 0.983533 -0.034598 -0.056711
2 -0.179504 -0.158630 -1.061309 -0.059461
3 0.824148 0.862644 -0.437975 -0.101608
4 1.720302 1.728537 2.080581 -0.085717
5 0.172891 0.153483 -1.142410 -0.061537
6 0.041296 0.055769 -0.999634 -0.062519
7 -1.469413 -1.451283 1.447360 -0.099979
8 -0.777952 -0.764790 -0.420411 -0.116076
9 0.036937 0.005165 -1.112621 -0.066166
In [12]:
fig = sm.qqplot(df['y_1'], line ='45')
plt.show()

fig = sm.qqplot(df['y_2'], line ='45')
plt.show()

fig = sm.qqplot(df['y_3'], line ='45')
plt.show()

The normal probability plots, often referred to as Q-Q plots, show that only in the first plot (for y_1) do the data points fall roughly on a straight line. This is not the case for the other two plots, indicating that the linear model assumptions are not fulfilled there.

Outliers and influential points¶

Outliers are points that fall far away from the cloud of data points. Outliers that fall horizontally away from the center of the cloud, without influencing the slope of the regression line, are called leverage points. Outliers that actually influence the slope of the regression line are called influential points; they are usually high-leverage points.

Let us build a toy data set to examine the concept of influential observations:

In [163]:
rs = np.random.RandomState(1)  # set seed for reproducibility

# random data generation
n = 100
beta0 = 5
beta1 = 2.5

x = rs.uniform(0, 10, n)
y = beta0 + beta1 * x + rs.normal(loc = 0, scale = 12, size=n) # add random noise

# generate leverage points
n_lev = int(np.floor(n * 0.05))
x_lev = rs.uniform(0, 8, n_lev)
y_lev = beta0**1.5 + beta1**3 * x_lev + rs.normal(0, 12, n_lev)

# generate influential points
n_inf = int(np.floor(n * 0.02))
x_inf = rs.uniform(10, 15, n_inf)
y_inf = beta0 + beta1**2.5 * x_inf + rs.normal(0, 12, n_inf)

# concatenate data sets
x_lev = np.concatenate([x, x_lev])
y_lev = np.concatenate([y, y_lev])

x_out = np.concatenate([x_lev, x_inf])
y_out = np.concatenate([y_lev, y_inf])

# build the linear model (note: no constant term is added, so the fit has no intercept)
toy_model_out = sm.OLS(y_out, x_out).fit()

fig1, ax1 = plt.subplots(figsize=(12, 6))

sns.regplot(
    x="x", 
    y="y", 
    data=pd.DataFrame.from_dict({'x': x_out, 'y': y_out}),
    truncate=False,
    ax=ax1,
    label="influential points (105 - 106)"
)

sns.regplot(
    x="x", 
    y="y", 
    data=pd.DataFrame.from_dict({'x': x_lev, 'y': y_lev}),
    truncate=False,
    ax=ax1,
    label="leverage points (101 - 104)"
)

sns.regplot(
    x="x", 
    y="y", 
    data=pd.DataFrame.from_dict({'x': x, 'y': y}),
    truncate=False,
    ax=ax1,
    label="base points"
)

ax1.legend()
plt.show()

The figure above clearly shows the impact of the different types of outliers. The green line shows the regression line without outliers. The orange line shows the regression line when the leverage points are included. The blue line shows the regression line when the influential points are added as well, so that all data is included. The largest effect on the slope of the regression line is due to the inclusion of the blue (influential) data points!

The leverage of an observation indicates its potential to move the regression model. Such observations are not necessarily errors, but they should be identified and verified. Leverage is measured by the hat value, which quantifies the overall influence of a single observation on the model predictions (Dalgaard, 2008). It takes values between 0 and 1. A point with zero leverage has no effect on the regression model; the higher the hat value, the higher the influence of that particular point on the regression model. This can be seen in the plot of leverage vs. normalized squared residuals below.

In [168]:
fig1, ax1 = plt.subplots(figsize=(12, 4))
sm.graphics.plot_leverage_resid2(toy_model_out, ax=ax1)
plt.tight_layout()
plt.show()

We can see that even though points 101, 102 and 103 show large deviations from the regression line, they do not have large leverage values. Points 105 and 106, our influential points, on the other hand, show both large deviations from the regression line and large leverage. Thus they have a greater influence on the regression and need to be treated with care.
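The hat values themselves can also be extracted numerically from the influence object; a brief sketch for the toy model above (the cut-off $2p/n$ is a common rule of thumb, not a statsmodels default):

In [ ]:
infl_toy = toy_model_out.get_influence()
hat = infl_toy.hat_matrix_diag                 # leverage (hat value) of each observation

p = len(toy_model_out.params)                  # number of estimated parameters
n_obs = int(toy_model_out.nobs)                # number of observations
print(np.where(hat > 2 * p / n_obs)[0])        # indices of high-leverage points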

Cook’s distance¶

Another method to identify influential outliers is Cook's distance. This measure combines the leverage and the residual of each observation: the higher the leverage and the residual, the higher Cook's distance. Typically, points with a Cook's distance greater than 1 are classified as influential. We show Cook's distance in the plots below.
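For reference, Cook's distance for observation $i$ is commonly defined as

$$D_i = \frac{e_i^2}{p\, s^2} \cdot \frac{h_{ii}}{(1-h_{ii})^2}\text{,}$$

where $e_i$ is the residual, $h_{ii}$ the hat value (leverage), $p$ the number of model parameters and $s^2$ the estimated error variance; this is the textbook formulation, not anything specific to statsmodels.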

In [177]:
fig1, [ax1, ax2]  = plt.subplots(2, figsize=(12, 8))
#create instance of influence
influence = toy_model_out.get_influence()
#obtain Cook's distance for each observation
cooks = influence.cooks_distance

ax1.scatter(np.arange(len(cooks[0])), cooks[0])
ax1.set_xlabel('point number')
ax1.set_ylabel('Cooks Distance')
sm.graphics.influence_plot(toy_model_out, criterion="cooks", ax=ax2)
plt.tight_layout()
plt.show()

Both plots identify the most influential points in the model very well.
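To list the flagged observations numerically rather than reading them off the plots, the distances can be compared against a chosen cut-off; a short sketch reusing the cooks tuple computed above (the cut-offs 1 and 4/n are common conventions, not statsmodels defaults):

In [ ]:
d = cooks[0]                        # Cook's distances computed above
print(np.where(d > 1)[0])           # influential by the strict D > 1 rule
print(np.where(d > 4 / len(d))[0])  # influential by the more liberal 4/n rule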

Other useful regression diagnostics¶

Another useful regression diagnostic is the DFFITS measure, which expresses how much an observation affects its associated fitted value. Below we plot the DFFITS value of each observation, together with an influence plot based on DFFITS instead of Cook's distance.

In [211]:
fig1, [ax1, ax2]  = plt.subplots(2, figsize=(12, 8))
influence = toy_model_out.get_influence()
dffits = influence.dffits

ax1.scatter(np.arange(len(dffits[0])), dffits[0])
ax1.set_xlabel('point number')
ax1.set_ylabel('dffits')
sm.graphics.influence_plot(toy_model_out, criterion="DFFITS", ax=ax2)
plt.tight_layout()
plt.show()
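Analogously to Cook's distance, influence.dffits returns a conventional threshold ($2\sqrt{p/n}$) alongside the values, so the flagged observations can be listed directly; a brief sketch reusing the dffits tuple from the cell above:

In [ ]:
dffits_values, dffits_threshold = dffits                       # tuple computed above
print(dffits_threshold)
print(np.where(np.abs(dffits_values) > dffits_threshold)[0])   # flagged observations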

Citation

The E-Learning project SOGA-Py was developed at the Department of Earth Sciences by Annette Rudolph, Joachim Krois and Kai Hartmann. You can reach us via e-mail at soga[at]zedat.fu-berlin.de.

You may use this project freely under the Creative Commons Attribution-ShareAlike 4.0 International License.

Please cite as follows: Rudolph, A., Krois, J., Hartmann, K. (2023): Statistics and Geodata Analysis using Python (SOGA-Py). Department of Earth Sciences, Freie Universitaet Berlin.