# -----------------------------------------------------------------------------
# Copyright 2019 (C) Jeremy Fix
# Released under a BSD two-clauses license
#
# Reference: Pearson, K. (1901). "On Lines and Planes of Closest Fit to Systems
# of Points in Space". Philosophical Magazine. 2 (11): 559–572.
# doi:10.1080/14786440109462720
# -----------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import matplotlib.cm as cmx
import matplotlib.colors as colors


def get_cmap(N):
    '''Returns a function that maps each index in 0, 1, ..., N-1 to a distinct
    RGB color.'''
    color_norm = colors.Normalize(vmin=0, vmax=N)
    scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv')

    def map_index_to_rgb_color(index):
        return scalar_map.to_rgba(index)
    return map_index_to_rgb_color
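
# A usage sketch: `cmap = get_cmap(10)` builds the mapping and `cmap(3)`
# returns the RGBA tuple assigned to index 3; this is how the digit classes
# are colored in the second example below.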


class PCA:
    '''PCA class'''

    def __init__(self, n_components):
        self.n_components = n_components

    def fit(self, X):
        '''
        Performs the PCA of X and stores the principal components.
        The datapoints are assumed to be stored in the row vectors of X.
        Only the n_components projection vectors associated with the
        n_components largest eigenvalues of the covariance matrix
        (X - mean).T (X - mean) are used for the projection.
        '''
        # Center the datapoints
        self.centroid = np.mean(X, axis=0)
        # Compute the covariance matrix of the centered data
        sigma = np.dot((X - self.centroid).T, X - self.centroid)
        # Compute the eigenvalue/eigenvector decomposition of sigma
        eigvals, eigvecs = np.linalg.eigh(sigma)
        # Note: the eigenvalues returned by eigh are sorted in ascending order.
        # We could keep only the n_components eigenvectors/eigenvalues
        # associated with the largest eigenvalues:
        # self.eigvals = eigvals[-self.n_components:]
        # self.eigvecs = eigvecs[:, -self.n_components:]
        # Instead, store all of them; this is useful for later computing
        # statistics on the fraction of variance we keep.
        self.eigvals = eigvals
        self.eigvecs = eigvecs
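        # Note: an equivalent route (a sketch, not used by this class) is the
        # SVD of the centered data, often preferred for numerical stability:
        #   _, svals, vt = np.linalg.svd(X - self.centroid, full_matrices=False)
        # The rows of vt are the principal directions (in descending order of
        # variance, up to sign) and svals**2 equals the eigenvalues above.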

    def transform(self, Z):
        '''
        Uses a fitted PCA to transform the row vectors of Z.
        Remember the eigenvectors are ordered by ascending eigenvalue.
        Denoting Z_trans = transform(Z),
        the first component is Z_trans[:, -1],
        the second component is Z_trans[:, -2],
        ...
        '''
        return np.dot(Z - self.centroid, self.eigvecs[:, -self.n_components:])
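

# A small helper sketch (an addition, not part of the original class): the
# fraction of the total variance carried by each kept component, computed
# from the statistics already stored by `fit`.
def explained_variance_ratios(pca):
    '''Returns the n_components variance ratios of a fitted PCA,
    largest component first. Assumes `pca.fit` has been called.'''
    ratios = pca.eigvals[::-1] / pca.eigvals.sum()
    return ratios[:pca.n_components]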


if __name__ == '__main__':
    # Example 1 : Random 2D data
    # -------------------------------------------------------------------------
    print("Random data example")
    print('-' * 70)
    N = 500

    ## Generate some random normal data, rotated and translated
    X = np.column_stack([np.random.normal(0.0, 1.0, (N, )),
                         np.random.normal(0.0, 0.2, (N, ))])
    X = np.dot(X, np.array([[np.cos(1.0), np.sin(1.0)],
                            [-np.sin(1.0), np.cos(1.0)]]))
    X = np.array([0.5, 1.0]) + X

    ## Extract the first 2 principal component vectors
    ## With a 2D dataset, these are the only interesting components
    n_components = 2
    pca = PCA(n_components=n_components)
    pca.fit(X)

    ## Project the original data
    X_trans = pca.transform(X)
    print("{:.2f}% of the variance along the first axis (green)".format(
        100 * pca.eigvals[-1] / pca.eigvals.sum()))
    print("{:.2f}% of the variance along the second axis (red)".format(
        100 * pca.eigvals[-2] / pca.eigvals.sum()))

    ## Plot
    plt.figure()
    plt.scatter(X[:, 0], X[:, 1], alpha=.25)
    ## Plot the first projection axis in green
    plt.plot([pca.centroid[0] - 2 * pca.eigvecs[0, -1],
              pca.centroid[0] + 2 * pca.eigvecs[0, -1]],
             [pca.centroid[1] - 2 * pca.eigvecs[1, -1],
              pca.centroid[1] + 2 * pca.eigvecs[1, -1]],
             'g', linewidth=3)
    ## Plot the second projection axis in red
    plt.plot([pca.centroid[0] - 2 * pca.eigvecs[0, 0],
              pca.centroid[0] + 2 * pca.eigvecs[0, 0]],
             [pca.centroid[1] - 2 * pca.eigvecs[1, 0],
              pca.centroid[1] + 2 * pca.eigvecs[1, 0]],
             'r', linewidth=3)
    plt.xlim([-3, 3])
    plt.ylim([-3, 3])
    plt.gca().set_aspect('equal')
    plt.savefig("random_points.png")
    print()
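
    ## A possible check (a sketch, not in the original script): since the
    ## eigenvectors are orthonormal, projected points can be mapped back to
    ## the input space with
    ##   X_back = pca.centroid + np.dot(X_trans, pca.eigvecs[:, -n_components:].T)
    ## and with n_components == 2 on this 2D data, X_back recovers X exactly
    ## (up to floating point error).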

    # Example 2 : 28 x 28 images of digits
    # -------------------------------------------------------------------------
    print("Digits example")
    print('-' * 70)
    samples = np.load('dig_app_text.cb.npy')
    X, y = samples['input'], samples['label']

    ## Extract the first 10 principal component vectors
    n_components = 10
    pca = PCA(n_components=n_components)
    pca.fit(X)

    ## Project the original data
    X_trans = pca.transform(X)
    print("{:.2f}% of the variance is kept with {} components".format(
        100 * pca.eigvals[-n_components:].sum() / pca.eigvals.sum(),
        n_components))

    ## Plot
    fig = plt.figure(figsize=(10, 4), tight_layout=True)
    gs = GridSpec(3, 10)
    ### Coordinates in the projected space;
    ### the colors show how the digits get separated by the principal vectors
    ax = fig.add_subplot(gs[0:2, :-2])
    cmap = get_cmap(10)
    ax.scatter(X_trans[:, -1], X_trans[:, -2], c=[cmap(l) for l in y],
               alpha=.25)
    ax = fig.add_subplot(gs[0:2, -2:])
    ax.set_axis_off()
    for i in range(10):
        col = cmap(i)
        rect = plt.Rectangle((0.1, i / 10.), 0.4, 1.0 / 14., facecolor=col)
        ax.add_artist(rect)
        ax.text(0.6, i / 10. + 1. / 28., str(i), fontsize=12, color="k",
                ha='center', va='center')
    ### Projection vectors
    for i in range(n_components):
        ax = fig.add_subplot(gs[2, i])
        ax.imshow(pca.eigvecs[:, -(i + 1)].reshape((28, 28)), cmap='gray_r')
        ax.set_xticks([])
        ax.set_yticks([])
    plt.savefig("digits.png")
    plt.show()