In the previous sections we learned about PCA. We worked out an example from scratch to emphasize the mechanics behind PCA.

In this section we revisit the food-texture data set and briefly showcase PCA.


The data¶

Consider the food-texture data set (download here). Here we apply the Python PCA machinery from scikit-learn to this data set.

In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

The data set consists of 50 rows (observations) and 5 columns (features). By calculating the mean and the standard deviation of the features we realize that the variables are not on the same scale. Hence, we center and scale the variables to have a mean of zero and standard deviation of one.

In [2]:
food = pd.read_csv("https://userpage.fu-berlin.de/soga/data/raw-data/food-texture.csv")
# exclude first column
food = food.iloc[:, 1:]
food.head()
Out[2]:
Oil Density Crispy Fracture Hardness
0 16.5 2955 10 23 97
1 17.7 2660 14 9 139
2 16.2 2870 12 17 143
3 16.7 2920 10 31 95
4 16.3 2975 11 26 143
In [3]:
# center and scale the data
food_scaled = (food - food.mean()) / food.std()
food_scaled.head()

# Calculate the PCA
food_pca = PCA().fit(food_scaled)

The fitted object stores the eigenvectors as a matrix in the components_ attribute. The name relates to the term rotation matrix and emphasizes that a matrix multiplication of the data matrix with the rotation matrix returns the coordinates of the data in the rotated coordinate system, and thus the principal component scores.

In [4]:
print(f"Rotation matrix:\n{food_pca.components_}")
Rotation matrix:
[[-0.45753343  0.4787455  -0.53238767  0.50447688 -0.15340262]
 [ 0.37043885 -0.35674997 -0.19766103  0.22123992 -0.8046661 ]
 [-0.6590302  -0.01623973  0.17888443 -0.54227938 -0.48923298]
 [-0.46794489 -0.71846318  0.13252692  0.45693168  0.19618432]
 [-0.01204121 -0.35648161 -0.79242064 -0.44011646  0.22614798]]
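To see this relation concretely, the following sketch checks that transform() is exactly this matrix multiplication. It uses synthetic data in place of the food data, so the numbers are illustrative:

```python
import numpy as np
from sklearn.decomposition import PCA

# Synthetic stand-in for the scaled food data: 50 observations, 5 features
rng = np.random.default_rng(42)
X = rng.normal(size=(50, 5))
X_scaled = (X - X.mean(axis=0)) / X.std(axis=0, ddof=1)

pca = PCA().fit(X_scaled)

# Scores via transform() ...
scores = pca.transform(X_scaled)

# ... equal the centered data multiplied by the transposed rotation matrix
scores_manual = (X_scaled - pca.mean_) @ pca.components_.T
print(np.allclose(scores, scores_manual))  # True
```

The same check works on food_scaled from above, since PCA.transform applies exactly this centering and rotation.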

The standard deviation of each principal component is the square root of the corresponding variance, which is stored in the explained_variance_ attribute.

In [5]:
print(f"Standard deviation of each principal component:\n{np.sqrt(food_pca.explained_variance_)}")
Standard deviation of each principal component:
[1.74103796 1.13829072 0.55682074 0.49185372 0.34801098]

The variance explained by each principal component can be obtained directly from the explained_variance_ attribute.

In [6]:
print(f"Variance explained by each principal component:\n{food_pca.explained_variance_}")
Variance explained by each principal component:
[3.03121317 1.29570576 0.31004934 0.24192008 0.12111165]
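These variances are the eigenvalues of the sample covariance matrix of the (scaled) data. A minimal sketch with synthetic data (the variable names are illustrative) confirms this:

```python
import numpy as np
from sklearn.decomposition import PCA

# Synthetic stand-in data: 50 observations, 5 features
rng = np.random.default_rng(0)
X = rng.normal(size=(50, 5))

pca = PCA().fit(X)

# Eigenvalues of the sample covariance matrix, sorted in descending order
eigvals = np.sort(np.linalg.eigvalsh(np.cov(X, rowvar=False)))[::-1]
print(np.allclose(pca.explained_variance_, eigvals))  # True
```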

To compute the proportion of variance explained by each principal component, we simply divide the variance explained by each principal component by the total variance explained by all principal components:

In [7]:
print(f"Proportion of variance explained by each principal component:\n{food_pca.explained_variance_ratio_}")
Proportion of variance explained by each principal component:
[0.60624263 0.25914115 0.06200987 0.04838402 0.02422233]
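The ratio can also be computed by hand from explained_variance_; a short sketch with synthetic data:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(1)
X = rng.normal(size=(50, 5))
pca = PCA().fit(X)

# Proportion = variance of each component / total variance of all components
ratio_manual = pca.explained_variance_ / pca.explained_variance_.sum()
print(np.allclose(ratio_manual, pca.explained_variance_ratio_))  # True
```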

We see that the first principal component explains 61% of the variance in the data, the second principal component explains 26% of the variance, and so forth.
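A common next step is to plot the cumulative proportion of variance to decide how many components to keep; for the food data, the first two components together already cover about 87%. A sketch, again using synthetic data for illustration:

```python
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA

rng = np.random.default_rng(2)
X = rng.normal(size=(50, 5))
pca = PCA().fit(X)

# Cumulative proportion of variance explained by the first k components
cum_var = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots()
ax.plot(range(1, len(cum_var) + 1), cum_var, marker="o")
ax.set_xlabel("Number of principal components")
ax.set_ylabel("Cumulative proportion of variance")
plt.show()
```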

Finally, we are interested in the principal component scores, which correspond to the projection of the original data on the directions of the principal components. The principal component scores can be extracted as follows. We will print only the first 10 rows here.

In [8]:
scores = food_pca.transform(food_scaled)
print(f"Principal component scores:\n{scores[:10, :]}")
Principal component scores:
[[ 1.38321116  0.61940609  0.40251151 -0.48680785  0.00594816]
 [-2.79447694 -0.35372463  1.07602689  0.25580128  0.48877325]
 [-0.23755641 -0.86145822  0.61154857  0.02952179  0.17632331]
 [ 1.93934057  1.14173664 -0.43794713  0.31253089 -0.55402198]
 [ 1.26793706 -0.66346458 -0.43716522  0.07190367 -0.40345029]
 [-1.99745643 -1.29835426 -1.1015333  -0.0802596   0.35187991]
 [-1.48818776  0.63265249  0.27302642 -0.03277424 -0.15370096]
 [ 0.82886358  2.3825485   0.24943519  0.32334864  0.03952609]
 [ 1.18515331 -0.350181    0.42578589 -0.01315433 -0.24539644]
 [ 0.99346261 -0.35082583 -0.10334451 -0.02088279 -0.23718893]]

The columns of the matrix correspond to the principal component score vectors. That is, the $k^{th}$ column is the $k^{th}$ principal component score vector.


Visualization and interpretation¶

The biplot is a very popular way to visualize the results of a PCA, as it combines both the principal component scores and the loading vectors in a single display. We can create a biplot in Python as follows:

In [9]:
# biplot
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(
    scores[:, 0],
    scores[:, 1],
    color="none",
    edgecolor="k",
    alpha=0.5,
)
for i in range(food_pca.components_.shape[1]):
    ax.arrow(
        0,
        0,
        food_pca.components_[0, i],
        food_pca.components_[1, i],
        head_width=0.1,
        head_length=0.1,
        linewidth=2,
        color="red",
    )
    ax.text(
        food_pca.components_[0, i] + 0.1,
        food_pca.components_[1, i] + 0.1,
        food.columns[i],
        color="red",
        ha="center",
        va="center",
    )
for i in range(scores.shape[0]):
    ax.text(
        scores[i, 0] + 0.2,
        scores[i, 1],
        food.index[i],
        color="blue",
        ha="center",
        va="center",
    )
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("Biplot")
plt.show()

In the biplot the observations are labeled by the observation number (e.g. the row name in the data frame). The position in the plot represents the scores for the first two principal components. The original variables are shown as vectors (arrows). They begin at the origin $[0,0]$ and extend to coordinates given by the first two principal component loading vectors. For example, the loading for Oil on the first principal component is $-0.46$, and its loading on the second principal component is $0.37$ (the label "Oil" is centered at the point $(-0.46, 0.37)$).
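To read loadings off more comfortably than from the raw components_ array, they can be wrapped in a labeled DataFrame. A sketch reusing the food column names (the data here are synthetic, so the values differ from the output above):

```python
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

# Synthetic stand-in with the same column names as the food-texture data
rng = np.random.default_rng(3)
cols = ["Oil", "Density", "Crispy", "Fracture", "Hardness"]
X = pd.DataFrame(rng.normal(size=(50, 5)), columns=cols)
X_scaled = (X - X.mean()) / X.std()

pca = PCA().fit(X_scaled)

# Loadings as a labeled table: rows are variables, columns are components
loadings = pd.DataFrame(
    pca.components_.T,
    index=cols,
    columns=[f"PC{i + 1}" for i in range(pca.n_components_)],
)
print(loadings.round(2))
```

Each column of this table is a unit-length eigenvector, so the squared entries of a column sum to one.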


Citation

The E-Learning project SOGA-Py was developed at the Department of Earth Sciences by Annette Rudolph, Joachim Krois and Kai Hartmann. You can reach us via mail by soga[at]zedat.fu-berlin.de.

You may use this project freely under the Creative Commons Attribution-ShareAlike 4.0 International License.

Please cite as follows: Rudolph, A., Krois, J., Hartmann, K. (2023): Statistics and Geodata Analysis using Python (SOGA-Py). Department of Earth Sciences, Freie Universitaet Berlin.