# import libraries
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd


# load data from previous sections
food_pca = pd.read_feather('https://userpage.fu-berlin.de/soga/data/py-data/food-texture-scaled.feather')
food_pca.head()


# compute eigenvalues and eigenvectors
eigvals, eigvecs = np.linalg.eig(food_pca.cov())

# get the loadings for the first two principal components
pca_loading = eigvecs[:, :2]
print(pca_loading)

[[-0.45753343  0.37043885]
 [ 0.4787455  -0.35674997]
 [-0.53238767 -0.19766103]
 [ 0.50447688  0.22123992]
 [-0.15340262 -0.8046661 ]]


# calculate the squared sum of the loadings
ssl = [sum(pca_loading[:, 0]**2), sum(pca_loading[:, 1]**2)]
print(ssl)

assert round(ssl[0], 2) == 1
assert round(ssl[1], 2) == 1

[1.0, 1.0000000000000002]


# calculate the scores for the first observation projected on PC1
score_pc1 = pca_loading[0, 0] * food_pca.iloc[0, 0] + \
    pca_loading[1, 0] * food_pca.iloc[0, 1] + \
    pca_loading[2, 0] * food_pca.iloc[0, 2] + \
    pca_loading[3, 0] * food_pca.iloc[0, 3] + \
    pca_loading[4, 0] * food_pca.iloc[0, 4]
# calculate the scores for the first observation projected on PC2
score_pc2 = pca_loading[0, 1] * food_pca.iloc[0, 0] + \
    pca_loading[1, 1] * food_pca.iloc[0, 1] + \
    pca_loading[2, 1] * food_pca.iloc[0, 2] + \
    pca_loading[3, 1] * food_pca.iloc[0, 3] + \
    pca_loading[4, 1] * food_pca.iloc[0, 4]

# print the scores
print('The score for the first observation projected on PC1 is: %.2f' % score_pc1)
print('The score for the first observation projected on PC2 is: %.2f' % score_pc2)

The score for the first observation projected on PC1 is: 1.38
The score for the first observation projected on PC2 is: 0.62


# calculate the scores for the first two principal components
scores = food_pca.dot(pca_loading)
# print the scores
print(scores)
# get the dimensions of the scores
print('The dimensions of the scores are: %s' % str(scores.shape))

           0         1
0   1.383211  0.619406
1  -2.794477 -0.353725
2  -0.237556 -0.861458
3   1.939341  1.141737
4   1.267937 -0.663465
5  -1.997456 -1.298354
6  -1.488188  0.632652
7   0.828864  2.382549
8   1.185153 -0.350181
9   0.993463 -0.350826
10 -0.984844  0.159759
11 -0.767642 -1.191135
12 -2.194773 -1.484354
13  0.845210 -2.260022
14  2.222673  1.588128
15 -1.868510 -1.936841
16 -2.039383  1.045645
17 -0.471855 -0.636961
18 -1.565497  0.310337
19 -2.744157 -0.285558
20  2.335870 -2.067209
21  1.542362 -0.923113
22 -1.740901 -0.575164
23  1.110698  1.469588
24 -1.265394 -0.735256
25 -0.983461 -0.695585
26  1.485818  0.674728
27  1.132639  0.268144
28 -0.724824 -0.179062
29 -1.560514  2.507137
30  1.991962 -0.111044
31  1.004783  0.454984
32  4.171399  0.767562
33 -0.843251 -0.111782
34  0.091710 -2.122006
35 -3.609710  1.759391
36 -1.721008  1.756569
37 -2.732756  0.239765
38  0.486516  0.764292
39  1.640091  0.523255
40  1.253794 -0.556927
41 -0.034552 -0.083409
42  0.721771 -0.486811
43  2.783942 -0.592922
44 -0.147274  0.222492
45  0.779787  0.275631
46  0.354090 -1.620391
47  2.120465  0.079597
48 -2.558568  1.312003
49  1.402999  1.578208
The dimensions of the scores are: (50, 2)


# plot the scores and plot the 2-dimensional feature space
plt.scatter(scores.iloc[:, 0], scores.iloc[:, 1])

# plot a horizontal line through the origin
plt.axhline(y=0, color='k', linestyle='--')

# plot a vertical line through the origin
plt.axvline(x=0, color='k', linestyle='--')

# add labels to the plot
plt.title('Scores of the first two principal components')

# add labels to the axes
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

	Oil	Density	Crispy	Fracture	Hardness
0	-0.440953	0.782329	-0.856063	0.391506	-1.001684
1	0.312813	-1.587149	1.396734	-2.169748	0.347602
2	-0.629394	0.099598	0.270336	-0.706174	0.476105
3	-0.315325	0.501205	-0.856063	1.855079	-1.065936
4	-0.566580	0.942972	-0.292864	0.940346	0.476105