The variance is the mean squared deviation from the mean. The variance for population data is denoted by $\sigma^2$ (read as sigma squared) and the variance calculated for sample data is denoted by $s^2$. Mathematically it is given by:
$$ \sigma^2 = \frac{\sum_{i=1}^N (x_i-\mu)^2}{N}\, , $$ and
$$ s^2 = \frac{\sum_{i=1}^n (x_i-\bar x)^2}{n-1}\, , $$ where $\sigma^2$ is called the population variance and $s^2$ the sample variance. The quantity $x_i-\mu$ (respectively $x_i-\bar x$) in the above formulas is called the deviation of the value $x_i$ ($x_1, x_2, \ldots, x_n$) from the mean (Mann 2012).
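As a quick numerical illustration (a minimal sketch with made-up numbers; `numpy` is imported here just for this snippet), both formulas can be written out directly and compared against `np.var`, which uses the population formula by default (`ddof=0`) and the sample formula with `ddof=1`:
import numpy as np

x = np.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])  # hypothetical sample

# population variance: sum of squared deviations, divided by N
pop_var = np.sum((x - x.mean()) ** 2) / len(x)
# sample variance: divide by n - 1 instead of n
samp_var = np.sum((x - x.mean()) ** 2) / (len(x) - 1)

print(pop_var, np.var(x))           # 4.0, matches np.var with default ddof=0
print(samp_var, np.var(x, ddof=1))  # ~4.57, matches np.var with ddof=1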
The standard deviation is the most-used measure of dispersion. The value of the standard deviation tells us how closely the values of a data set are clustered around the mean. In general, a lower value of the standard deviation indicates that the values of the data set are spread over a relatively smaller range around the mean. In contrast, a larger value of the standard deviation for a data set indicates that the values of that data set are spread over a relatively larger range around the mean (Mann 2012).
# First, let's import all the needed libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import random
# random data generation
n = 50
low_sd = np.random.normal(loc=0, scale=0.3, size=n) # loc = mu, scale = sigma
high_sd = np.random.normal(loc=0, scale=1, size=n)
x_sd = np.linspace(0, 1, n)  # n equally spaced x positions, one per data point
### generate plot ###
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
# plot 1
axs[0].plot(x_sd, low_sd, linestyle="", marker="o", color="black")
axs[0].get_xaxis().set_visible(False)
axs[0].get_yaxis().set_visible(False)
axs[0].set_ylim((-3, 3))
axs[0].title.set_text("Small spread around the mean")
axs[0].axhline(y=0, color="r", linestyle="-", label="Mean")
axs[0].legend()
# plot 2
axs[1].plot(x_sd, high_sd, linestyle="", marker="o", color="black")
axs[1].get_xaxis().set_visible(False)
axs[1].get_yaxis().set_visible(False)
axs[1].set_ylim((-3, 3))
axs[1].title.set_text("Large spread around the mean")
axs[1].axhline(y=0, color="r", linestyle="-")
plt.show()
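To connect the two panels back to the numbers, we can compare the spreads directly; a quick check (the sample standard deviations should land near the `scale` arguments 0.3 and 1 used above):
print(np.std(low_sd, ddof=1))   # small spread, close to 0.3
print(np.std(high_sd, ddof=1))  # large spread, close to 1.0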
The standard deviation is obtained by taking the square root of the variance. Consequently, the standard deviation calculated for population data is denoted by $\sigma$ and the standard deviation calculated for sample data is denoted by $s$.
$$ \sigma = \sqrt{\frac{\sum_{i=1}^N (x_i-\mu)^2}{N}} $$ and
$$ s = \sqrt{\frac{\sum_{i=1}^n (x_i-\bar x)^2}{n-1}}\, , $$ where $\sigma$ is the standard deviation of the population and $s$ is the standard deviation of the sample.
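Since the standard deviation is just the square root of the variance, a quick check on the simulated `high_sd` vector from above confirms the relationship:
# the sample standard deviation equals the square root of the sample variance
print(np.sqrt(np.var(high_sd, ddof=1)))
print(np.std(high_sd, ddof=1))  # same value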
As an exercise we compute the mean, the median, the variance and the standard deviation for some numerical variables of interest in the `students` data set, and present them in a nicely formatted table.
students = pd.read_csv(
"https://userpage.fu-berlin.de/soga/200/2010_data_sets/students.csv"
)
quant_vars = ["name", "age", "nc.score", "height", "weight"]
students_quant = students[quant_vars]
students_quant.head(10)
| | name | age | nc.score | height | weight |
|---|---|---|---|---|---|
| 1 | Gonzales, Christina | 19 | 1.91 | 160 | 64.8 |
| 2 | Lozano, T'Hani | 19 | 1.56 | 172 | 73.0 |
| 3 | Williams, Hanh | 22 | 1.24 | 168 | 70.6 |
| 4 | Nem, Denzel | 19 | 1.37 | 183 | 79.7 |
| 5 | Powell, Heather | 21 | 1.46 | 175 | 71.4 |
| 6 | Perez, Jadrian | 19 | 1.34 | 189 | 85.8 |
| 7 | Clardy, Anita | 21 | 1.11 | 156 | 65.9 |
| 8 | Allen, Rebecca Marie | 21 | 2.03 | 167 | 65.7 |
| 9 | Tracy, Robert | 18 | 1.29 | 195 | 94.4 |
| 10 | Nimmons, Laura | 18 | 1.19 | 165 | 66.0 |
# mean
students_quant_mean = students_quant.loc[:, students_quant.columns != "name"].apply(
np.mean
)
# median
students_quant_median = students_quant.loc[:, students_quant.columns != "name"].apply(
np.median
)
# variance (note: np.var defaults to ddof=0, i.e. the population formula)
students_quant_var = students_quant.loc[:, students_quant.columns != "name"].apply(
    np.var
)
# standard deviation (np.std likewise defaults to ddof=0)
students_quant_sd = students_quant.loc[:, students_quant.columns != "name"].apply(
    np.std
)
# concatenate the vectors and round to 2 digits
students_quant_stats = pd.concat(
[students_quant_mean, students_quant_median, students_quant_var, students_quant_sd],
axis=1,
).round(2)
# rename columns
students_quant_stats.columns = ["mean", "median", "variance", "standard deviation"]
students_quant_stats
| | mean | median | variance | standard deviation |
|---|---|---|---|---|
| age | 22.54 | 21.00 | 36.78 | 6.06 |
| nc.score | 2.17 | 2.04 | 0.66 | 0.81 |
| height | 171.38 | 171.00 | 122.70 | 11.08 |
| weight | 73.00 | 71.80 | 74.56 | 8.63 |
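As an aside, pandas can produce a similar overview in one line with `describe()`; note that pandas' built-in `std()` uses the sample formula (`ddof=1`), whereas `np.std` applied above defaults to the population formula (`ddof=0`), so the standard deviations may differ very slightly:
students_quant.loc[:, students_quant.columns != "name"].describe().round(2)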
By using the mean and standard deviation we can find the proportion or percentage of the total observations that fall within a given interval around the mean.
Chebyshev's theorem gives a lower bound for the area under a curve between two points that are on opposite sides of the mean and at the same distance from the mean.
For any number $k$ greater than 1, at least $1-1/k^2$ of the data values lie within $k$ standard deviations of the mean.
Let us use Python to gain some intuition for Chebyshev's theorem.
k = np.arange(1, 4, 0.1)
auc = 1 - (1 / k**2)
auc_percent = np.round(auc * 100, 0)
pd.DataFrame(data=zip(k, auc_percent), columns=["k", "auc_percent"])
| | k | auc_percent |
|---|---|---|
| 0 | 1.0 | 0.0 |
| 1 | 1.1 | 17.0 |
| 2 | 1.2 | 31.0 |
| 3 | 1.3 | 41.0 |
| 4 | 1.4 | 49.0 |
| 5 | 1.5 | 56.0 |
| 6 | 1.6 | 61.0 |
| 7 | 1.7 | 65.0 |
| 8 | 1.8 | 69.0 |
| 9 | 1.9 | 72.0 |
| 10 | 2.0 | 75.0 |
| 11 | 2.1 | 77.0 |
| 12 | 2.2 | 79.0 |
| 13 | 2.3 | 81.0 |
| 14 | 2.4 | 83.0 |
| 15 | 2.5 | 84.0 |
| 16 | 2.6 | 85.0 |
| 17 | 2.7 | 86.0 |
| 18 | 2.8 | 87.0 |
| 19 | 2.9 | 88.0 |
| 20 | 3.0 | 89.0 |
| 21 | 3.1 | 90.0 |
| 22 | 3.2 | 90.0 |
| 23 | 3.3 | 91.0 |
| 24 | 3.4 | 91.0 |
| 25 | 3.5 | 92.0 |
| 26 | 3.6 | 92.0 |
| 27 | 3.7 | 93.0 |
| 28 | 3.8 | 93.0 |
| 29 | 3.9 | 93.0 |
random.seed(3)
random_k = random.sample(list(range(2, 5)), 1)[0]
percent = round((1 - (1 / random_k**2)) * 100)
random_k
2
To put it in words: let us pick a value for $k$, say $k = 2$. This means that at least 75 % of the data values lie within 2 standard deviations of the mean.
Let us plot Chebyshev's theorem with Python:
plt.plot(k, auc_percent, color="blue", linestyle="", marker="o")
plt.title("Chebyshev's theorem")
plt.xlabel("k")
plt.ylabel("percent")
plt.show()
The theorem applies to both sample and population data. Note that Chebyshev's theorem is applicable to a distribution of any shape. However, Chebyshev's theorem can be used only for $k>1$. This is so because when $k = 1$, the value of $(1-1/k^2)$ is zero, and when $k<1$, the value of $(1-1/k^2)$ is negative (Mann 2012).
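To see the "any shape" part in action, here is a small sketch (the exponential distribution is merely an illustrative choice) that checks Chebyshev's bound on strongly skewed data:
# Chebyshev's bound on a strongly right-skewed (exponential) sample
y_exp = np.random.exponential(scale=1, size=100000)
mu_exp, sd_exp = np.mean(y_exp), np.std(y_exp)
for k in [1.5, 2, 3]:
    observed = np.mean(np.abs(y_exp - mu_exp) < k * sd_exp) * 100
    guaranteed = (1 - 1 / k**2) * 100
    print(f"k = {k}: observed {observed:.1f} % >= guaranteed {guaranteed:.1f} %")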
While Chebyshev's theorem is applicable to any kind of distribution, the empirical rule applies only to a specific type of distribution called a bell-shaped distribution or normal distribution. There are 3 rules:
For a bell-shaped distribution, approximately
1. 68 % of the observations lie within one standard deviation of the mean,
2. 95 % of the observations lie within two standard deviations of the mean, and
3. 99.7 % of the observations lie within three standard deviations of the mean.
Since we have sufficient coding abilities by now, we will try to test whether the three rules are valid: (1) first, we will explore the `np.random.normal` function to generate normally distributed data, and (2) second, we will go back to our `students` data set and validate those rules on that data set.
The normal distribution belongs to the family of continuous distributions. To generate data from a normal distribution one may use the `np.random.normal()` function, which is a random variable generator for the normal distribution. We can sample `n` values (`size`) from a normal distribution with a given mean (`loc`, default is 0) and standard deviation (`scale`, default is 1) by calling the function as follows:
np.random.normal(loc=0, scale=1, size=1)
Let us give it a try:
np.random.normal(loc=0, scale=1, size=1)
array([0.45902641])
np.random.normal(loc=0, scale=1, size=1)
array([0.43802701])
np.random.normal(loc=0, scale=1, size=1)
array([-0.26417081])
np.random.normal(loc=0, scale=1, size=1)
array([-0.75827295])
As we can see, the `np.random.normal()` function returns (pseudo-)random numbers. We can just as easily ask the function to draw hundreds, thousands, or even more (pseudo-)random numbers:
np.random.normal(loc=0, scale=1, size=10)
array([-1.00630876, -1.14747191, 0.86455021, 0.73029863, 0.6698138 , 1.69625483, 1.49958574, 0.3067514 , -1.13151667, 0.59911283])
np.random.normal(loc=0, scale=1, size=100)
array([-0.64472868, 0.82416068, -0.48287926, -0.13179342, -0.85811936, 0.17101546, -0.26441321, 0.56892705, -0.74559648, 0.06014769, -0.57026134, -1.16148582, 0.03272348, -0.02985863, 1.9923957 , -0.99597035, 0.9013824 , 0.34768361, -0.27206932, 1.80370874, 0.39248569, -0.12413372, -1.20019879, -0.3714098 , 0.63675294, 0.04750553, 0.91652386, 2.62531454, -0.1470434 , 1.30759756, -1.48102079, -1.08443638, -0.5839856 , 1.61104204, 0.64629644, -2.2245357 , 0.46002925, 0.22820627, 1.05746817, -1.60256217, -0.89938963, -1.53339855, -0.65171997, -0.26315613, 2.08796249, -0.1035856 , 0.42925875, -0.14966285, 0.52128303, 0.0413093 , -0.41921695, -0.55160899, 0.37500005, 1.6576148 , 1.30368154, 0.56196418, -1.59381006, -0.43146528, 0.77847134, -0.41060199, 1.12247349, 0.41745093, 0.03662133, -0.39901725, 0.63272497, -1.47368664, -0.64885638, 0.54238935, 1.16010944, -0.16719073, -1.37905476, 0.50256983, 0.02330603, 0.37909805, -0.26203286, 0.28839444, 1.74441161, -1.67415176, 0.50040111, 0.57480205, 0.51622608, -0.47715421, 1.68055466, 2.8784563 , 0.79844855, -0.31182496, -0.41331883, 1.43617819, -0.49701804, 1.55579661, 1.23160013, 0.05532344, 0.9608541 , -0.18589206, -1.78512244, -0.10734673, -1.23490236, -0.63866249, 0.34815736, 1.76756782])
y_norm = np.random.normal(loc=0, scale=1, size=100000)
If we plot a histogram of those numbers, we see the characteristic bell shape of the normal distribution.
plt.hist(y_norm, bins=100, color="lightgrey", edgecolor="grey")
plt.title("Normal distribution")
plt.ylabel("Frequency")
plt.show()
We already know the mean and the standard deviation of the `y_norm` vector, as we explicitly called `np.random.normal()` with `loc=0` (mean) and `scale=1` (standard deviation). So, to validate the three rules claimed above, we just have to count the values of the `y_norm` vector that lie within 1, 2, and 3 standard deviations of the mean, and relate those counts to the length of the vector, in our case 100000.
sd1 = len(y_norm[(y_norm > -1) & (y_norm < 1)]) / len(y_norm) * 100
sd2 = len(y_norm[(y_norm > -2) & (y_norm < 2)]) / len(y_norm) * 100
sd3 = len(y_norm[(y_norm > -3) & (y_norm < 3)]) / len(y_norm) * 100
pd.DataFrame({"sd": ["1sd", "2sd", "3sd"], "value": [sd1, sd2, sd3]})
| | sd | value |
|---|---|---|
| 0 | 1sd | 68.320 |
| 1 | 2sd | 95.487 |
| 2 | 3sd | 99.764 |
A very close match! The three empirical rules obviously hold. To visualize our findings we re-plot the histogram and add some annotations. Please note that in the `plt.hist()` function we set the argument `density=1`. As a consequence, the resulting histogram no longer shows counts on the y-axis but density values (counts normalized by sample size and bin width), which means that the bar areas sum to 1.
plt.hist(y_norm, bins=100, edgecolor="grey", color="lightgrey", density=1)
plt.show()
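We can verify numerically that the bar areas indeed sum to 1, using `np.histogram`, which returns the densities together with the bin edges:
densities, edges = np.histogram(y_norm, bins=100, density=True)
print(np.sum(densities * np.diff(edges)))  # total bar area: 1.0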
y_norm = pd.Series(y_norm)
ax = y_norm.plot.hist(bins=100, density=1)
for bar in ax.containers[0]:
    # get x midpoint of bar
    x = bar.get_x() + 0.5 * bar.get_width()
    # set bar color based on how far the midpoint lies from the mean (0)
    if abs(x) < 1:
        bar.set_color("red")
    elif abs(x) < 2:
        bar.set_color("green")
    elif abs(x) < 3:
        bar.set_color("blue")
    else:
        bar.set_color("grey")
    bar.set_edgecolor("grey")
plt.title("Normal distribution")
plt.ylabel("Density")
plt.ylim(0, 0.65)
# horizontal lines marking the 1, 2 and 3 s.d. intervals
plt.hlines(y=0.41, xmin=-1, xmax=1, linewidth=2, color="r")
plt.hlines(y=0.48, xmin=-2, xmax=2, linewidth=2, color="g")
plt.hlines(y=0.55, xmin=-3, xmax=3, linewidth=2, color="b")
# vertical lines
plt.vlines(x=1, ymin=0, ymax=0.41, linewidth=2, color="r")
plt.vlines(x=-1, ymin=0, ymax=0.41, linewidth=2, color="r")
plt.vlines(x=2, ymin=0, ymax=0.48, linewidth=2, color="g")
plt.vlines(x=-2, ymin=0, ymax=0.48, linewidth=2, color="g")
plt.vlines(x=3, ymin=0, ymax=0.55, linewidth=2, color="b")
plt.vlines(x=-3, ymin=0, ymax=0.55, linewidth=2, color="b")
# text
plt.text(0, 0.44, "68 %", color="r", size=12)
plt.text(0, 0.51, "95 %", color="g", size=12)
plt.text(0, 0.58, "99.7 %", color="b", size=12)
plt.show()
Well, now let us work on our second task: validating the three empirical rules on the `students` data set. For this, we have to check whether any of the numeric variables in the `students` data set are normally distributed. We start by extracting the numeric variables of interest from the `students` data set and then inspect the data by calling the `head()` method.
cont_vars = ["age", "nc.score", "height", "weight", "score1", "score2", "salary"]
students_quant = students[cont_vars]
students_quant.head(10)
| | age | nc.score | height | weight | score1 | score2 | salary |
|---|---|---|---|---|---|---|---|
| 1 | 19 | 1.91 | 160 | 64.8 | NaN | NaN | NaN |
| 2 | 19 | 1.56 | 172 | 73.0 | NaN | NaN | NaN |
| 3 | 22 | 1.24 | 168 | 70.6 | 45.0 | 46.0 | NaN |
| 4 | 19 | 1.37 | 183 | 79.7 | NaN | NaN | NaN |
| 5 | 21 | 1.46 | 175 | 71.4 | NaN | NaN | NaN |
| 6 | 19 | 1.34 | 189 | 85.8 | NaN | NaN | NaN |
| 7 | 21 | 1.11 | 156 | 65.9 | NaN | NaN | NaN |
| 8 | 21 | 2.03 | 167 | 65.7 | 58.0 | 62.0 | NaN |
| 9 | 18 | 1.29 | 195 | 94.4 | 57.0 | 67.0 | NaN |
| 10 | 18 | 1.19 | 165 | 66.0 | NaN | NaN | NaN |
To get an overview of the shape of the distribution of each particular variable, we apply the `hist()` method, which pandas provides for whole data frames (it relies on `matplotlib` under the hood). Each column of the data frame is plotted as a histogram subplot.
students_quant.hist(
bins=50, density=1, color="lightgrey", edgecolor="grey", figsize=(12, 5)
)
plt.tight_layout()
plt.show()
We immediately realize that some variables are positively skewed, thus we exclude them and keep those that appear to be normally distributed.
students_quant[["height", "salary"]].hist(
bins=50, density=1, color="lightgrey", edgecolor="grey", figsize=(12, 5)
)
plt.tight_layout()
plt.show()
Both the `height` and the `salary` variables seem to more or less follow a normal distribution, so the choice of which to pick for further analysis is a matter of taste. For now, we stick to the `salary` variable and check whether the three empirical rules claimed above hold.
Let us switch to Python and validate those rules, calculating the mean and the standard deviations first. Please note that the `salary` variable includes empty cells marked by `NaN`. Thus, we first exclude all `NaN` values by applying the `dropna()` method or the `notnull()` method; either yields the same result.
salary_vector = students[
"salary"
].dropna() # or use: students["salary"][students["salary"].notnull()]
# calculate mean
salary_vector_mean = np.mean(salary_vector)
# calculate standard deviations
salary_vector_sd1_pos = salary_vector_mean + np.std(salary_vector)
salary_vector_sd2_pos = salary_vector_mean + np.std(salary_vector) * 2
salary_vector_sd3_pos = salary_vector_mean + np.std(salary_vector) * 3
salary_vector_sd1_neg = salary_vector_mean - np.std(salary_vector)
salary_vector_sd2_neg = salary_vector_mean - np.std(salary_vector) * 2
salary_vector_sd3_neg = salary_vector_mean - np.std(salary_vector) * 3
As in the generic example above, we count the number of values that lie outside ±1 s.d., ±2 s.d., and ±3 s.d. of the mean, relate them to the length of the vector, and subtract the resulting percentage from 100 %.
salary_sd1 = (
100
- len(
salary_vector[
(salary_vector > salary_vector_sd1_pos)
| (salary_vector < salary_vector_sd1_neg)
]
)
/ len(salary_vector)
* 100
)
salary_sd2 = (
100
- len(
salary_vector[
(salary_vector > salary_vector_sd2_pos)
| (salary_vector < salary_vector_sd2_neg)
]
)
/ len(salary_vector)
* 100
)
salary_sd3 = (
100
- len(
salary_vector[
(salary_vector > salary_vector_sd3_pos)
| (salary_vector < salary_vector_sd3_neg)
]
)
/ len(salary_vector)
* 100
)
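As an aside, the three nearly identical blocks above can be collapsed into a single list comprehension; this sketch is equivalent up to values lying exactly on an interval boundary:
# percentage of salaries within k standard deviations of the mean, k = 1, 2, 3
salary_within = [
    np.mean(np.abs(salary_vector - salary_vector_mean) < k * np.std(salary_vector))
    * 100
    for k in (1, 2, 3)
]
print(salary_within)  # reproduces salary_sd1, salary_sd2 and salary_sd3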
pd.DataFrame(
    {
        "sd": ["1sd", "2sd", "3sd"],
        "normal_value": [round(sd1), round(sd2), round(sd3, 1)],
        "salary_value": [
            round(salary_sd1, 4),
            round(salary_sd2, 4),
            round(salary_sd3, 4),
        ],
    }
)
| | sd | normal_value | salary_value |
|---|---|---|---|
| 0 | 1sd | 68.0 | 67.0850 |
| 1 | 2sd | 95.0 | 95.5505 |
| 2 | 3sd | 99.8 | 99.7718 |
Wow, quite close! Obviously the `salary` variable shows a strong tendency to support the so-called empirical rule. To confirm our impression, we plot a histogram of the `salary` variable and colorize the one, two, and three standard deviation intervals for a better visual impression.
ax = salary_vector.plot.hist(bins=100, density=1, edgecolor="black", figsize=(10, 5))
for bar in ax.containers[0]:
# get x midpoint of bar
x = bar.get_x() + 0.5 * bar.get_width()
# set bar color based on x
if x < salary_vector_sd1_pos and x > salary_vector_sd1_neg:
bar.set_color("red")
bar.set_edgecolor("grey")
elif x < salary_vector_sd2_pos and x > salary_vector_sd1_pos:
bar.set_color("green")
bar.set_edgecolor("grey")
elif x > salary_vector_sd2_neg and x < salary_vector_sd1_neg:
bar.set_color("green")
bar.set_edgecolor("grey")
elif x < salary_vector_sd3_pos and x > salary_vector_sd2_pos:
bar.set_color("blue")
bar.set_edgecolor("grey")
elif x > salary_vector_sd3_neg and x < salary_vector_sd2_neg:
bar.set_color("blue")
bar.set_edgecolor("grey")
else:
bar.set_color("grey")
bar.set_edgecolor("grey")
plt.title("Normal distribution")
plt.ylabel("Density")
plt.xlabel("Annual Salary in EUR")
plt.text(70000, 0.00005, "1 s.d.", color="red")
plt.text(70000, 0.000045, "2 s.d.", color="green")
plt.text(70000, 0.00004, "3 s.d.", color="blue")
plt.show()
We can now extend our visualization approach by plotting the empirical density estimate, using pandas' `plot(kind="density")` method, and checking its shape. We display the empirical density estimate as a dashed line by setting the linestyle argument to `linestyle='--'` or `linestyle='dashed'`, with a line width of 3 (argument `linewidth=3`).
ax = salary_vector.plot.hist(bins=100, density=1, edgecolor="black", figsize=(10, 5))
salary_vector.plot(
kind="density", ax=ax, color="black", linestyle="dashed", linewidth=3
)
for bar in ax.containers[0]:
# get x midpoint of bar
x = bar.get_x() + 0.5 * bar.get_width()
# set bar color based on x
if x < salary_vector_sd1_pos and x > salary_vector_sd1_neg:
bar.set_color("red")
bar.set_edgecolor("grey")
elif x < salary_vector_sd2_pos and x > salary_vector_sd1_pos:
bar.set_color("green")
bar.set_edgecolor("grey")
elif x > salary_vector_sd2_neg and x < salary_vector_sd1_neg:
bar.set_color("green")
bar.set_edgecolor("grey")
elif x < salary_vector_sd3_pos and x > salary_vector_sd2_pos:
bar.set_color("blue")
bar.set_edgecolor("grey")
elif x > salary_vector_sd3_neg and x < salary_vector_sd2_neg:
bar.set_color("blue")
bar.set_edgecolor("grey")
else:
bar.set_color("grey")
bar.set_edgecolor("grey")
plt.title("Normal distribution")
plt.ylabel("Density")
plt.xlabel("Annual Salary in EUR")
plt.text(70000, 0.00005, "1 s.d.", color="red")
plt.text(70000, 0.000045, "2 s.d.", color="green")
plt.text(70000, 0.00004, "3 s.d.", color="blue")
plt.show()
Finally, we compare our empirical density estimate to the theoretical probability density function based on the actual mean and standard deviation of the data (`salary_vector`). For a better visual comparison, we switch back to a non-colorized histogram plot.
ax = salary_vector.plot.hist(
bins=100, density=1, edgecolor="grey", color="lightgrey", figsize=(10, 5)
)
salary_vector.plot(
kind="density",
ax=ax,
color="black",
linestyle="dashed",
linewidth=3,
label="emp. density",
)
mu = np.mean(salary_vector)
variance = np.var(salary_vector)
sigma = np.sqrt(variance)
x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 100)
plt.plot(x, stats.norm.pdf(x, mu, sigma), label="pdf", color="red")
plt.title("Normal distribution")
plt.ylabel("Density")
plt.xlabel("Annual Salary in EUR")
plt.show()
We may conclude that the `salary` variable in the `students` data set is roughly normally distributed. However, the plot indicates that the distribution of the `salary` variable is slightly skewed to the left, visible as a small deviation between the empirical density estimate and the theoretical probability density function.
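If we want to quantify that visual impression, `scipy.stats` (imported above as `stats`) provides a sample skewness estimate; a negative value supports the slight left skew:
print(stats.skew(salary_vector))  # a negative value indicates left skew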
Citation
The E-Learning project SOGA-Py was developed at the Department of Earth Sciences by Annette Rudolph, Joachim Krois and Kai Hartmann. You can reach us via e-mail at soga[at]zedat.fu-berlin.de.
Please cite as follows: Rudolph, A., Krois, J., Hartmann, K. (2023): Statistics and Geodata Analysis using Python (SOGA-Py). Department of Earth Sciences, Freie Universitaet Berlin.