# Source code for turbustat.statistics.pca.pca

# Licensed under an MIT open source license - see LICENSE


import numpy as np

from ..threeD_to_twoD import var_cov_cube


class PCA(object):
    '''
    Implementation of Principal Component Analysis (Heyer & Brunt, 2002).

    Computes the eigenvalues of the spectral covariance matrix of a data
    cube; the eigenvalue spectrum characterizes the turbulent structure.

    Parameters
    ----------
    cube : numpy.ndarray
        Data cube. Axis 0 is assumed to be the spectral (velocity) axis,
        as it is passed directly to ``var_cov_cube``.
    n_eigs : int, optional
        Number of eigenvalues to keep.
    '''

    def __init__(self, cube, n_eigs=50):
        super(PCA, self).__init__()

        # Copy so zeroing the NaNs below does not clobber the caller's
        # array (the previous version mutated the input in place).
        self.cube = np.asarray(cube).copy()
        self.n_eigs = n_eigs

        # Remove NaNs
        self.cube[np.isnan(self.cube)] = 0

        self.n_velchan = self.cube.shape[0]
        # Filled in by compute_pca().
        self.eigvals = None

    def compute_pca(self, mean_sub=False, normalize=True):
        '''
        Create the covariance matrix and compute its eigenvalues.

        Parameters
        ----------
        mean_sub : bool, optional
            Subtract the mean of each channel before forming the
            covariance matrix (passed through to ``var_cov_cube``).
        normalize : bool, optional
            Normalize the set of eigenvalues by the 0th component.

        Returns
        -------
        self
        '''
        self.cov_matrix = var_cov_cube(self.cube, mean_sub=mean_sub)

        # The covariance matrix is symmetric, so use eigvalsh: it
        # guarantees real eigenvalues (np.linalg.eig can return spurious
        # complex parts from round-off) and skips the unused eigenvectors.
        all_eigsvals = np.linalg.eigvalsh(self.cov_matrix)
        all_eigsvals = np.sort(all_eigsvals)[::-1]  # Sort by maximum

        # Fraction of the total variance captured by the kept eigenvalues.
        self._var_prop = np.sum(all_eigsvals[:self.n_eigs]) / \
            np.sum(all_eigsvals)

        if normalize:
            self.eigvals = all_eigsvals[:self.n_eigs] / all_eigsvals[0]
        else:
            self.eigvals = all_eigsvals[:self.n_eigs]

        return self

    @property
    def var_proportion(self):
        # Proportion of the total variance contained in the first
        # n_eigs eigenvalues; set by compute_pca().
        return self._var_prop

    def run(self, verbose=False, normalize=True):
        '''
        Run method. Needed to maintain package standards.

        Parameters
        ----------
        verbose : bool, optional
            Enables plotting.
        normalize : bool, optional
            See ``compute_pca``.

        Returns
        -------
        self
        '''
        self.compute_pca(normalize=normalize)

        if verbose:
            # Imported lazily so matplotlib is only required for plotting.
            import matplotlib.pyplot as p

            print('Proportion of Variance kept: %s' % (self.var_proportion))

            p.subplot(121)
            p.imshow(self.cov_matrix, origin="lower",
                     interpolation="nearest")
            p.colorbar()
            p.subplot(122)
            p.bar(np.arange(1, self.n_eigs + 1), self.eigvals, 0.5,
                  color='r')
            p.xlim([0, self.n_eigs + 1])
            p.xlabel('Eigenvalues')
            p.ylabel('Variance')
            p.show()

        return self
class PCA_Distance(object):
    '''
    Compare two data cubes based on the eigenvalues of the PCA
    decomposition. The distance is the Euclidean distance between the
    eigenvalue spectra.

    Parameters
    ----------
    cube1 : numpy.ndarray
        Data cube.
    cube2 : numpy.ndarray
        Data cube.
    n_eigs : int, optional
        Number of eigenvalues to compute.
    fiducial_model : PCA, optional
        Computed PCA object for ``cube1``. Use to avoid recomputing.
    normalize : bool, optional
        Sets whether to normalize the eigenvalues by the 0th eigenvalue.
    '''

    def __init__(self, cube1, cube2, n_eigs=50, fiducial_model=None,
                 normalize=True):
        super(PCA_Distance, self).__init__()
        self.cube1 = cube1
        self.cube2 = cube2

        # Reuse a pre-computed decomposition for cube1 when one is given.
        if fiducial_model is not None:
            self.pca1 = fiducial_model
        else:
            self.pca1 = PCA(self.cube1, n_eigs=n_eigs)
            self.pca1.run(normalize=normalize)

        self.pca2 = PCA(self.cube2, n_eigs=n_eigs)
        self.pca2.run(normalize=normalize)

        # Filled in by distance_metric().
        self.distance = None

    def distance_metric(self, verbose=False):
        '''
        Computes the distance between the cubes.

        Parameters
        ----------
        verbose : bool, optional
            Enables plotting.

        Returns
        -------
        self
        '''
        # Euclidean distance between the eigenvalue spectra. Equivalent
        # to the hand-rolled sqrt(sum(abs(diff ** 2))) it replaces.
        self.distance = np.linalg.norm(self.pca1.eigvals -
                                       self.pca2.eigvals)

        if verbose:
            # Imported lazily so matplotlib is only required for plotting.
            import matplotlib.pyplot as p

            print("Proportions of total variance: 1 - %0.3f, 2 - %0.3f" %
                  (self.pca1.var_proportion, self.pca2.var_proportion))

            p.subplot(2, 2, 1)
            p.imshow(
                self.pca1.cov_matrix, origin="lower",
                interpolation="nearest")
            p.colorbar()
            p.title("PCA1")
            p.subplot(2, 2, 3)
            p.bar(np.arange(1, self.pca1.n_eigs + 1), self.pca1.eigvals,
                  0.5, color='r')
            p.xlim([0, self.pca1.n_eigs + 1])
            p.xlabel('Eigenvalues')
            p.ylabel('Variance')
            p.subplot(2, 2, 2)
            p.imshow(
                self.pca2.cov_matrix, origin="lower",
                interpolation="nearest")
            p.colorbar()
            p.title("PCA2")
            p.subplot(2, 2, 4)
            p.bar(np.arange(1, self.pca2.n_eigs + 1), self.pca2.eigvals,
                  0.5, color='r')
            p.xlim([0, self.pca2.n_eigs + 1])
            p.xlabel('Eigenvalues')
            p.tight_layout()
            p.show()

        return self