# -----------------------------------------------------------------------------
# Copyright 2019 (C) Jeremy Fix
# Released under a BSD two-clauses license
#
# Reference: Pearson, K. (1901). "On Lines and Planes of Closest Fit to Systems
# of Points in Space". Philosophical Magazine. 2 (11): 559–572.
# doi:10.1080/14786440109462720
# -----------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import matplotlib.cm as cmx
import matplotlib.colors as colors


def get_cmap(N):
    '''Returns a function that maps each index in 0, 1, ..., N-1 to a distinct
    RGB color.'''
    color_norm = colors.Normalize(vmin=0, vmax=N)
    scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv')

    def map_index_to_rgb_color(index):
        return scalar_map.to_rgba(index)
    return map_index_to_rgb_color
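
# A usage sketch: `cmap = get_cmap(10)` builds the mapping and `cmap(3)`
# returns the RGBA tuple assigned to index 3; this is how the digit classes
# are colored in the second example below.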


class PCA:
    '''PCA class'''

    def __init__(self, n_components):
        self.n_components = n_components

    def fit(self, X):
        '''
        Performs the PCA of X and stores the principal components.
        The datapoints are assumed to be stored in the row vectors of X.
        Only the n_components projection vectors associated with the
        n_components largest eigenvalues of the covariance matrix
        (X - mean).T (X - mean) are used for the projection.
        '''
        # Center the datapoints
        self.centroid = np.mean(X, axis=0)
        # Compute the covariance matrix of the centered data
        sigma = np.dot((X - self.centroid).T, X - self.centroid)
        # Compute the eigenvalue/eigenvector decomposition of sigma
        eigvals, eigvecs = np.linalg.eigh(sigma)
        # Note: the eigenvalues returned by eigh are sorted in ascending order.
        # We could keep only the n_components eigenvectors/eigenvalues
        # associated with the largest eigenvalues:
        # self.eigvals = eigvals[-self.n_components:]
        # self.eigvecs = eigvecs[:, -self.n_components:]
        # Instead, store all of them; this is useful for later computing
        # statistics on the fraction of variance we keep.
        self.eigvals = eigvals
        self.eigvecs = eigvecs
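        # Note: an equivalent route (a sketch, not used by this class) is the
        # SVD of the centered data, often preferred for numerical stability:
        #   _, svals, vt = np.linalg.svd(X - self.centroid, full_matrices=False)
        # The rows of vt are the principal directions (in descending order of
        # variance, up to sign) and svals**2 equals the eigenvalues above.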

    def transform(self, Z):
        '''
        Uses a fitted PCA to transform the row vectors of Z.
        Remember the eigenvectors are ordered by ascending eigenvalue.
        Denoting Z_trans = transform(Z),
        the first component is Z_trans[:, -1],
        the second component is Z_trans[:, -2],
        ...
        '''
        return np.dot(Z - self.centroid, self.eigvecs[:, -self.n_components:])
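

# A small helper sketch (an addition, not part of the original class): the
# fraction of the total variance carried by each kept component, computed
# from the statistics already stored by `fit`.
def explained_variance_ratios(pca):
    '''Returns the n_components variance ratios of a fitted PCA,
    largest component first. Assumes `pca.fit` has been called.'''
    ratios = pca.eigvals[::-1] / pca.eigvals.sum()
    return ratios[:pca.n_components]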


if __name__ == '__main__':
    # Example 1 : Random 2D data
    # -------------------------------------------------------------------------
    print("Random data example")
    print('-' * 70)
    N = 500

    ## Generate some random normal data, rotated and translated
    X = np.column_stack([np.random.normal(0.0, 1.0, (N, )),
                         np.random.normal(0.0, 0.2, (N, ))])
    X = np.dot(X, np.array([[np.cos(1.0), np.sin(1.0)],
                            [-np.sin(1.0), np.cos(1.0)]]))
    X = np.array([0.5, 1.0]) + X

    ## Extract the first 2 principal component vectors
    ## With a 2D dataset, these are the only interesting components
    n_components = 2
    pca = PCA(n_components=n_components)
    pca.fit(X)

    ## Project the original data
    X_trans = pca.transform(X)
    print("{:.2f}% of the variance along the first axis (green)".format(
        100 * pca.eigvals[-1] / pca.eigvals.sum()))
    print("{:.2f}% of the variance along the second axis (red)".format(
        100 * pca.eigvals[-2] / pca.eigvals.sum()))

    ## Plot
    plt.figure()
    plt.scatter(X[:, 0], X[:, 1], alpha=.25)
    ## Plot the first projection axis in green
    plt.plot([pca.centroid[0] - 2 * pca.eigvecs[0, -1],
              pca.centroid[0] + 2 * pca.eigvecs[0, -1]],
             [pca.centroid[1] - 2 * pca.eigvecs[1, -1],
              pca.centroid[1] + 2 * pca.eigvecs[1, -1]],
             'g', linewidth=3)
    ## Plot the second projection axis in red
    plt.plot([pca.centroid[0] - 2 * pca.eigvecs[0, 0],
              pca.centroid[0] + 2 * pca.eigvecs[0, 0]],
             [pca.centroid[1] - 2 * pca.eigvecs[1, 0],
              pca.centroid[1] + 2 * pca.eigvecs[1, 0]],
             'r', linewidth=3)
    plt.xlim([-3, 3])
    plt.ylim([-3, 3])
    plt.gca().set_aspect('equal')
    plt.savefig("random_points.png")
    print()
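
    ## A possible check (a sketch, not in the original script): since the
    ## eigenvectors are orthonormal, projected points can be mapped back to
    ## the input space with
    ##   X_back = pca.centroid + np.dot(X_trans, pca.eigvecs[:, -n_components:].T)
    ## and with n_components == 2 on this 2D data, X_back recovers X exactly
    ## (up to floating point error).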

    # Example 2 : 28 x 28 images of digits
    # -------------------------------------------------------------------------
    print("Digits example")
    print('-' * 70)
    samples = np.load('dig_app_text.cb.npy')
    X, y = samples['input'], samples['label']

    ## Extract the first 10 principal component vectors
    n_components = 10
    pca = PCA(n_components=n_components)
    pca.fit(X)

    ## Project the original data
    X_trans = pca.transform(X)
    print("{:.2f}% of the variance is kept with {} components".format(
        100 * pca.eigvals[-n_components:].sum() / pca.eigvals.sum(),
        n_components))

    ## Plot
    fig = plt.figure(figsize=(10, 4), tight_layout=True)
    gs = GridSpec(3, 10)
    ### Coordinates in the projected space;
    ### the colors show how the digits get separated by the principal vectors
    ax = fig.add_subplot(gs[0:2, :-2])
    cmap = get_cmap(10)
    ax.scatter(X_trans[:, -1], X_trans[:, -2], c=[cmap(l) for l in y],
               alpha=.25)
    ax = fig.add_subplot(gs[0:2, -2:])
    ax.set_axis_off()
    for i in range(10):
        col = cmap(i)
        rect = plt.Rectangle((0.1, i / 10.), 0.4, 1.0 / 14., facecolor=col)
        ax.add_artist(rect)
        ax.text(0.6, i / 10. + 1. / 28., str(i), fontsize=12, color="k",
                ha='center', va='center')
    ### Projection vectors
    for i in range(n_components):
        ax = fig.add_subplot(gs[2, i])
        ax.imshow(pca.eigvecs[:, -(i + 1)].reshape((28, 28)), cmap='gray_r')
        ax.set_xticks([])
        ax.set_yticks([])
    plt.savefig("digits.png")
    plt.show()