Correlation analysis is a widely used method to examine the relationship between quantitative variables. The most common statistic is the linear correlation coefficient, $r$, which is also known as the Pearson product moment correlation coefficient in honor of its developer, Karl Pearson. It is given by
$$r = \frac{\sum_{i=1}^n(x_i- \bar x)(y_i - \bar y)}{\sqrt{\sum_{i=1}^n(x_i- \bar x)^2}\sqrt{\sum_{i=1}^n(y_i- \bar y)^2}}=\frac{s_{xy}}{s_x s_y}\text{,}$$where $s_{xy}$ is the covariance of $x$ and $y$, and $s_x$ and $s_y$ are the standard deviations of $x$ and $y$, respectively. By dividing by the sample standard deviations, $s_x$ and $s_y$, the linear correlation coefficient, $r$, becomes scale independent and takes values between $-1$ and $1$.
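This scale independence is easy to verify numerically: rescaling $x$ and $y$ by positive linear transformations (e.g. a change of units) leaves $r$ unchanged, as the following minimal sketch with synthetic data shows.
# A minimal sketch (synthetic data): r is invariant under positive linear rescaling.
import numpy as np

np.random.seed(0)
x = np.random.uniform(low=-1, high=1, size=50)
y = 3 * x + np.random.normal(loc=0, scale=0.5, size=50)

r_original = np.corrcoef(x, y)[0, 1]
r_rescaled = np.corrcoef(100 * x + 7, 0.5 * y - 3)[0, 1]  # e.g. different units
print(r_original, r_rescaled)  # identical up to floating-point rounding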
The linear correlation coefficient measures the strength of the linear relationship between two variables. If $r$ is close to $\pm 1$, the two variables are highly correlated and if plotted on a scatter plot, the data points cluster around a line. If $r$ is far from $\pm 1$, the data points are more widely scattered. If $r$ is near $0$, the data points are essentially scattered around a horizontal line indicating that there is almost no linear relationship between the variables.
# First, let's import all the needed libraries.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy.stats as stats

# Seed NumPy's random number generator; all draws below use np.random,
# so seeding the standard library's random module would have no effect.
np.random.seed(1)
n = 100
# plot1
x1 = np.random.uniform(low=-1, high=1.0, size=n)
y1 = x1 * 3 + np.random.normal(loc=0, scale=0.15, size=n)
# plot2
x2 = np.random.uniform(low=-1, high=1.0, size=n)
y2 = x2 * -3 + np.random.normal(loc=0, scale=0.15, size=n)
# plot3
x3 = np.random.normal(loc=0, scale=1, size=n)
y3 = np.random.normal(loc=0, scale=1, size=n)
# plot4
x4 = np.random.uniform(low=-1, high=1.0, size=n)
y4 = x4 * 3 + np.random.normal(loc=0, scale=2, size=n)
# plot5
x5 = np.random.uniform(low=-1, high=1.0, size=n)
y5 = x5 * -3 + np.random.normal(loc=0, scale=2, size=n)
# plot6
x6 = np.random.uniform(low=-1, high=1.0, size=n)
y6 = x6**2 + np.random.normal(loc=0, scale=0.08, size=n)
### generate plot ###
fig, axs = plt.subplots(2, 3, figsize=(12, 7))
# plot 1
coef = np.polyfit(x1, y1, 1)
lm = np.poly1d(coef) # lm = function which takes in x and returns an estimate for y
axs[0, 0].plot(x1, y1, "bo", x1, lm(x1), "-r", linewidth=2.5)
axs[0, 0].get_xaxis().set_visible(False)
axs[0, 0].get_yaxis().set_visible(False)
axs[0, 0].title.set_text("strong positive linear correlation")
axs[0, 0].text(x=-1, y=2, s=f"r = {round(np.corrcoef(x1, y1)[0, 1], 3)}")
# plot 2
coef = np.polyfit(x2, y2, 1)
lm = np.poly1d(coef)
axs[0, 1].plot(x2, y2, "bo", x2, lm(x2), "-r", linewidth=2.5)
axs[0, 1].get_xaxis().set_visible(False)
axs[0, 1].get_yaxis().set_visible(False)
axs[0, 1].title.set_text("strong negative linear correlation")
axs[0, 1].text(x=-1, y=-2, s=f"r = {round(np.corrcoef(x2, y2)[0, 1], 3)}")
# plot 3 (no line fit needed; the data scatter around a horizontal line)
axs[0, 2].plot(x3, y3, "bo")
axs[0, 2].axhline(y=0, color="r", linestyle="-", linewidth=2.5)
axs[0, 2].get_xaxis().set_visible(False)
axs[0, 2].get_yaxis().set_visible(False)
axs[0, 2].title.set_text("no linear correlation")
axs[0, 2].text(x=1.5, y=2, s=f"r = {round(np.corrcoef(x3, y3)[0, 1], 3)}")
# plot 4
coef = np.polyfit(x4, y4, 1)
lm = np.poly1d(coef)
axs[1, 0].plot(x4, y4, "bo", x4, lm(x4), "-r", linewidth=2.5)
axs[1, 0].get_xaxis().set_visible(False)
axs[1, 0].get_yaxis().set_visible(False)
axs[1, 0].title.set_text("weak to medium positive\nlinear correlation")
axs[1, 0].text(x=-1, y=3, s=f"r = {round(np.corrcoef(x4, y4)[0, 1], 3)}")
# plot 5
coef = np.polyfit(x5, y5, 1)
lm = np.poly1d(coef)
axs[1, 1].plot(x5, y5, "bo", x5, lm(x5), "-r", linewidth=2.5)
axs[1, 1].get_xaxis().set_visible(False)
axs[1, 1].get_yaxis().set_visible(False)
axs[1, 1].title.set_text("weak to medium negative\nlinear correlation")
axs[1, 1].text(x=0.3, y=4, s=f"r = {round(np.corrcoef(x5, y5)[0, 1], 3)}")
# plot 6 (quadratic relationship; a straight-line fit would be misleading)
axs[1, 2].plot(x6, y6, "bo")
axs[1, 2].plot(np.sort(x6), np.sort(x6) ** 2, "-r", linewidth=2.5)
axs[1, 2].get_xaxis().set_visible(False)
axs[1, 2].get_yaxis().set_visible(False)
axs[1, 2].title.set_text("no linear correlation")
axs[1, 2].text(x=0, y=0.9, s=f"r = {round(np.corrcoef(x6, y6)[0, 1], 3)}")
plt.show()
An interesting property of $r$ is that its sign reflects the slope of the linear relationship between two variables. A positive value of $r$ suggests that the variables are positively linearly correlated, indicating that $y$ tends to increase linearly as $x$ increases. A negative value of $r$ suggests that the variables are negatively linearly correlated, indicating that $y$ tends to decrease linearly as $x$ increases.
There is no unambiguous classification rule for the strength of a linear relationship between two variables. However, the following table may serve as a rule of thumb for interpreting the absolute value, $|r|$, of the Pearson product moment correlation coefficient:
$$ \begin{array}{lc} \hline \ \text{Strong linear relationship} & |r| > 0.9 \\ \ \text{Medium linear relationship} & 0.7 < |r| \le 0.9\\ \ \text{Weak linear relationship} & 0.5 < |r| \le 0.7 \\ \ \text{No or doubtful linear relationship} & 0 < |r| \le 0.5 \\ \hline \end{array} $$
Pearson's correlation assumes the variables to be roughly normally distributed, and it is not robust in the presence of outliers.
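The sensitivity to outliers is easy to demonstrate. In the following minimal sketch (synthetic data), a single extreme observation is enough to push a strong positive correlation down dramatically:
# A minimal sketch (synthetic data): Pearson's r is not robust to outliers.
import numpy as np

np.random.seed(2)
x = np.random.uniform(low=-1, high=1, size=30)
y = 3 * x + np.random.normal(loc=0, scale=0.5, size=30)
print(np.corrcoef(x, y)[0, 1])  # close to 1

# add one extreme observation
x_out = np.append(x, 10)
y_out = np.append(y, -30)
print(np.corrcoef(x_out, y_out)[0, 1])  # r drops sharply and even changes sign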
In a later section on linear regression we discuss the coefficient of determination, $R^2$, a descriptive measure for the quality of linear models. There is a close relation between $R^2$ and the linear correlation coefficient, $r$. The coefficient of determination, $R^2$, equals the square of the linear correlation coefficient, $r$:
$$\text{coefficient of determination } (R^2) = r^2$$
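This identity is easy to check numerically. A minimal sketch with synthetic data, using scipy.stats.linregress(), whose rvalue attribute is Pearson's $r$:
# A minimal sketch (synthetic data): R^2 of a simple linear fit equals r squared.
import numpy as np
import scipy.stats as stats

np.random.seed(3)
x = np.random.uniform(low=-1, high=1, size=50)
y = 2 * x + np.random.normal(loc=0, scale=0.7, size=50)

r = np.corrcoef(x, y)[0, 1]
fit = stats.linregress(x, y)  # fit.rvalue is Pearson's r
print(r**2, fit.rvalue**2)  # identical up to floating-point rounding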
In order to get some intuition we calculate the Pearson product moment correlation coefficient in an example. To that end, we load the students data set into our workspace (you may download the students.csv file or read it directly from its URL using pd.read_csv()).
students = pd.read_csv(
"https://userpage.fu-berlin.de/soga/200/2010_data_sets/students.csv"
)
df_rows, df_cols = students.shape
df_colnames = students.columns
n = 37
The students data set consists of 8239 rows, each of them representing a particular student, and 16 columns, each of them corresponding to a variable/feature related to that particular student. These self-explanatory variables are: ['stud.id', 'name', 'gender', 'age', 'height', 'weight', 'religion', 'nc.score', 'semester', 'major', 'minor', 'score1', 'score2', 'online.tutorial', 'graduated', 'salary'].
In this example we assess the linear relationship between the weight and the height of students. For this we randomly pick 37 students and extract the weight and height variables from the data set.
sample = students.sample(n=n)  # draw a random sample of 37 students
weight = sample["weight"]
height = sample["height"]
plt.plot(height, weight, "o")
plt.xlabel("height")
plt.ylabel("weight")
plt.show()
The scatter plot indicates that there exists a linear relationship between the two variables under consideration.
For the sake of this exercise we calculate the linear correlation coefficient by hand at first, and then we apply the np.corrcoef() function from the NumPy package. Recall the equation from above:
x = height.copy()
y = weight.copy()
x_bar = np.mean(height)
y_bar = np.mean(weight)
np.sum((x - x_bar) * (y - y_bar)) / (
    np.sqrt(np.sum((x - x_bar) ** 2)) * np.sqrt(np.sum((y - y_bar) ** 2))
)
0.9734419536082125
As a sanity check we calculate the ratio of the covariance of $x$ and $y$ to the product of the standard deviations of $x$ and $y$:
$$r = \frac{s_{xy}}{s_x s_y}$$
# np.cov() returns the 2x2 covariance matrix, whose off-diagonal entry [0, 1]
# is the sample covariance s_xy. Since np.cov() uses the sample estimator
# (ddof=1), the standard deviations must be computed with ddof=1 as well.
np.cov(x, y)[0, 1] / (np.std(x, ddof=1) * np.std(y, ddof=1))
0.9734419536082125
Finally, we apply the built-in np.corrcoef() function:
np.corrcoef(x, y)[0, 1] ## indicating [0,1] will give the right output
0.9734419536082127
Perfect. The three calculations yield the same result, up to floating-point rounding! The linear correlation coefficient evaluates to $r \approx 0.973$. Thus, we may conclude that there is a strong linear correlation between the height and the weight of a student.
Of course, a correlation analysis is not restricted to two variables. Thanks to statistical programming languages such as Python, we are able to conduct a pairwise correlation analysis for more than two variables. Let us first prepare the data set. For a better visualization experience we draw 100 randomly picked students from the students data set. Then we select a number of variables to perform the correlation analysis on. The DataFrame.corr() method comes in handy for correlating multiple columns of a pandas DataFrame.
n = 100
sample = students.sample(n=n)  # draw a fresh random sample of 100 students
cols = [
    "height",
    "weight",
    "nc.score",
    "score1",
    "score2",
    "salary",
]  # select variables
students_sample = sample[cols]
students_sample.corr(method="pearson")
|          | height    | weight    | nc.score  | score1    | score2    | salary   |
|----------|-----------|-----------|-----------|-----------|-----------|----------|
| height   | 1.000000  | 0.958346  | 0.076327  | -0.024130 | 0.032678  | 0.495396 |
| weight   | 0.958346  | 1.000000  | 0.083525  | -0.081879 | -0.027481 | 0.456302 |
| nc.score | 0.076327  | 0.083525  | 1.000000  | -0.059677 | 0.013939  | 0.076737 |
| score1   | -0.024130 | -0.081879 | -0.059677 | 1.000000  | 0.920166  | 0.363850 |
| score2   | 0.032678  | -0.027481 | 0.013939  | 0.920166  | 1.000000  | 0.404771 |
| salary   | 0.495396  | 0.456302  | 0.076737  | 0.363850  | 0.404771  | 1.000000 |
The DataFrame.corr() method returns a nice table, also called a correlation matrix, with the pairwise Pearson correlation coefficients. A table is a nice representation for a correlation analysis, but a figure would of course improve the interpretability. The seaborn package provides the pairplot() function for easily visualizing all pairwise relationships in a data set.
g = sns.pairplot(students_sample, height=1.7, aspect=0.9)  # returns a PairGrid
plt.tight_layout()
plt.show()
Immediately, we realize that the majority of the variables do not appear to be linearly correlated. In contrast, the variable pairs height and weight, as well as score1 and score2, appear to be positively correlated.
The seaborn package also allows for more advanced pair plots. It is thus a very flexible package, which comes with many nice plotting features. The example code below defines a custom function, corr_dot, which allows the correlation coefficient to be plotted as well.
def corr_dot(*args, **kwargs):
    # Pearson correlation of the two series handed over by PairGrid.map_upper()
    corr_r = args[0].corr(args[1], "pearson")
    corr_text = f"{corr_r:2.2f}"
    ax = plt.gca()
    ax.set_axis_off()
    # a dot in the panel centre; its size and colour encode the coefficient
    marker_size = abs(corr_r) * 10000
    ax.scatter(
        [0.5],
        [0.5],
        marker_size,
        [corr_r],
        alpha=0.6,
        cmap="coolwarm",
        vmin=-1,
        vmax=1,
        transform=ax.transAxes,
    )
    # print the coefficient; stronger correlations get a larger font
    font_size = abs(corr_r) * 40 + 5
    ax.annotate(
        corr_text,
        [0.5, 0.5],
        xycoords="axes fraction",
        ha="center",
        va="center",
        fontsize=font_size,
    )
sns.set(style="white", font_scale=1.6)
g = sns.PairGrid(students_sample, aspect=1.4, diag_sharey=False)
g.map_lower(plt.scatter)  # scatter plots in the lower triangle
g.map_diag(sns.histplot, kde=True, line_kws={"color": "black"})  # histograms plus density curves
g.map_upper(corr_dot)  # correlation dots in the upper triangle
plt.show()
The code above returns a plot, which shows scatter plots, the variable histograms and the density curves as well as the correlation coefficients.
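If a more compact overview is preferred, the correlation matrix itself can be visualized. A minimal sketch using seaborn's heatmap() function on the students_sample DataFrame from above:
# A minimal sketch: the correlation matrix rendered as an annotated heatmap.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    students_sample.corr(method="pearson"),
    annot=True,  # write the coefficients into the cells
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    ax=ax,
)
plt.tight_layout()
plt.show()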
Note: Correlation functions implemented in Python cover different types of correlation coefficients, such as Pearson's, Spearman's and Kendall's. The DataFrame.corr() method supports all three; to pick one particular formula you add the method argument ("pearson", "spearman" or "kendall") to the function call, Pearson's correlation coefficient being the default setting. np.corrcoef(), in contrast, computes Pearson's coefficient only, while the scipy.stats module provides the dedicated functions pearsonr(), spearmanr() and kendalltau().
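For instance, reusing the students_sample DataFrame from above, the rank-based variants of the correlation matrix are obtained as follows:
# rank-based correlation matrices via the method argument of DataFrame.corr()
students_sample.corr(method="spearman")
students_sample.corr(method="kendall")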
Spearman's rank correlation coefficient, also known as Spearman's $\rho$, is a non-parametric rank correlation coefficient developed by Charles Spearman as an alternative to the Pearson product moment correlation coefficient. It is denoted by $r_s$ for sample data and by $\rho_s$ for population data (Mann 2012). The coefficient assesses the monotonic relationship between two variables and ranges between $-1$ and $1$: it describes the linear correlation between the ranks of the data of variables $x$ and $y$. Spearman's correlation is high when the observations of the two variables have similar ranks and low when their ranks are dissimilar.
To calculate $r_s$ the data for each variable, $x$ and $y$, is ranked separately. For a given bivariate sequence $(x_1, y_1), (x_2, y_2), ..., (x_n, y_n)$ Spearman's $r_s$ is given by
$$r_s=1-\frac{6\sum_{i=1}^n (r_{xi}-r_{yi})^2}{n(n^2-1)}\text{,}$$where $r_{xi}=\text{Rank}(x_i)$, $r_{yi}=\text{Rank}(y_i)$ and $n$ is the sample size. Note that this simplified formula is exact only when there are no ties in the data; tied values are usually assigned their average rank.
In contrast to Pearson's linear correlation coefficient, Spearman's rank correlation coefficient is appropriate for both quantitative and ordinal variables. In addition, rank-based correlations do not depend on the assumption of a normal distribution and are more resistant to outliers (Schumann 2010).
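A minimal sketch (synthetic data) illustrates the difference: for a strictly monotonic but nonlinear relationship, Spearman's coefficient equals 1 because the ranks match perfectly, while Pearson's coefficient is noticeably smaller.
# A minimal sketch (synthetic data): Spearman measures monotonic association.
import numpy as np
import scipy.stats as stats

np.random.seed(4)
x = np.random.uniform(low=0, high=4, size=50)
y = np.exp(x)  # strictly monotonic, but clearly nonlinear

print(stats.pearsonr(x, y)[0])   # noticeably below 1
print(stats.spearmanr(x, y)[0])  # exactly 1, since the ranks agree perfectly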
Let us consider an example. The population of a community along a river believes that recent increases in peak discharge rates are due to deforestation by a logging company. We calculate Spearman's rank correlation coefficient to assess whether there is a correlation between peak discharge and the fraction of deforested area in the watershed (data modified after McCuen 2003, p. 112).
def cf2m3(q):
    # convert discharge from cubic feet per second to cubic metres per second
    return q * 0.028316846592
Q = np.array([8000, 8800, 7400, 6700, 11100, 12200, 5700, 9400, 14200, 7600, 5800, 14300, 11600, 10400])
np.round(cf2m3(Q))
array([227., 249., 210., 190., 314., 345., 161., 266., 402., 215., 164., 405., 328., 294.])
Let us construct our data vectors. We assign the discharge values to the variable Q and the logging area to the variable logged.
Q = np.array([227, 249, 210, 190, 314, 345, 161, 266, 402, 215, 164, 405, 328, 294])
logged = np.array([53, 56, 57, 58, 55, 54, 51, 50, 49, 47, 46, 44, 43, 42])
First, we calculate Spearman's rank correlation coefficient by hand. Recall the equation
$$r_s=1-\frac{6\sum_{i=1}^n (r_{xi}-r_{yi})^2}{n(n^2-1)}\text{,}$$where $r_{xi}=\text{Rank}(x_i)$, $r_{yi}=\text{Rank}(y_i)$ and $n$ is the sample size.
We apply the rankdata() function from the scipy.stats module to calculate the rank of the values of each variable.
# set number of observations
n = len(Q)
# calculate rank
r_xi = stats.rankdata(Q)
r_yi = stats.rankdata(logged)
# plug into equation
rs = 1 - ((6 * np.sum((r_xi - r_yi) ** 2)) / (n * (n**2 - 1)))
rs
-0.34065934065934056
Alternatively, we may apply the spearmanr() function from the scipy.stats module.
stats.spearmanr(Q, logged)
SpearmanrResult(correlation=-0.3406593406593406, pvalue=0.23331605103682404)
Spearman's rank correlation coefficient is nothing other than Pearson's linear correlation coefficient computed on the ranked data. Thus, the following line of code should reproduce the previous results:
np.corrcoef(stats.rankdata(Q), stats.rankdata(logged))[0, 1]
-0.3406593406593406
Perfect! We got the same result using all three calculations, which yield a Spearman's rank correlation coefficient of $r_s \approx -0.341$. The result indicates that there is no more than a weak negative correlation between peak discharge and logging area. In other words, the discharge tends, if anything, to decrease when the logging area increases. Thus, the perception of the population cannot be confirmed by our statistical analysis.
Note: It is always recommended to follow up with a statistical test, in order to assess whether the result is statistically significant or whether the variation is just due to chance.
Check out the sections on Hypothesis Testing for further information!
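As a quick preview, scipy.stats already provides such a test: spearmanr() returns the coefficient together with a two-sided p-value for the null hypothesis of zero correlation.
# spearmanr() returns the coefficient and a two-sided p-value; here
# p ≈ 0.233 > 0.05, so the weak negative correlation is not significant.
rs, p_value = stats.spearmanr(Q, logged)
print(f"r_s = {rs:.3f}, p = {p_value:.3f}")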
Citation
The E-Learning project SOGA-Py was developed at the Department of Earth Sciences by Annette Rudolph, Joachim Krois and Kai Hartmann. You can reach us via e-mail at soga[at]zedat.fu-berlin.de.
Please cite as follows: Rudolph, A., Krois, J., Hartmann, K. (2023): Statistics and Geodata Analysis using Python (SOGA-Py). Department of Earth Sciences, Freie Universitaet Berlin.