Source code for turbustat.statistics.pdf.compare_pdf

# Licensed under an MIT open source license - see LICENSE

'''

The density PDF as described by Kowal et al. (2007)

'''

import numpy as np
from scipy.stats import ks_2samp, anderson_ksamp

from ..stats_utils import hellinger, standardize, common_histogram_bins


[docs]class PDF(object): ''' Create the PDF of a given array. ''' def __init__(self, img, min_val=0.0, bins=None, weights=None, norm=False): super(PDF, self).__init__() self.img = img # We want to remove NaNs and value below the threshold. self.data = img[np.isfinite(img)] self.data = self.data[self.data > min_val] # Do the same for the weights, then apply weights to the data. if weights is not None: self.weights = weights[np.isfinite(img)] self.weights = self.weights[self.data > min_val] self.data *= self.weights if norm: # Normalize by the average self.data /= np.mean(self.data, axis=None) self._bins = bins
[docs] def make_pdf(self, bins=None): ''' Create the PDF. ''' if bins is not None: self._bins = bins # If the number of bins is not given, use sqrt of data length. if self.bins is None: self._bins = np.sqrt(self.data.shape[0]) self._bins = int(np.round(self.bins)) norm_weights = np.ones_like(self.data) / self.data.shape[0] self._pdf, bin_edges = np.histogram(self.data, bins=self.bins, density=False, weights=norm_weights) self._bins = (bin_edges[:-1] + bin_edges[1:]) / 2 return self
@property def pdf(self): return self._pdf @property def bins(self): return self._bins
[docs] def make_ecdf(self): ''' Create the ECDF. ''' self._ecdf = np.cumsum(np.sort(self.data.ravel())) / np.sum(self.data) return self
@property def ecdf(self): return self._ecdf
[docs] def run(self, verbose=False): ''' Run the whole thing. ''' self.make_pdf() self.make_ecdf() if verbose: import matplotlib.pyplot as p # PDF p.subplot(131) p.loglog(self.bins[self.pdf > 0], self.pdf[self.pdf > 0], 'b-') p.grid(True) p.xlabel(r"$\Sigma/\overline{\Sigma}$") p.ylabel("PDF") # ECDF p.subplot(132) p.semilogx(np.sort(self.data.ravel()), self.ecdf, 'b-') p.grid(True) p.xlabel(r"$\Sigma/\overline{\Sigma}$") p.ylabel("ECDF") # Array representation. p.subplot(133) if self.img.ndim == 1: p.plot(self.img, 'b-') elif self.img.ndim == 2: p.imshow(self.img, origin="lower", interpolation="nearest", cmap="binary") elif self.img.ndim == 3: p.imshow(np.nansum(self.img, axis=0), origin="lower", interpolation="nearest", cmap="binary") else: print("Visual representation works only up to 3D.") p.show() return self
[docs]class PDF_Distance(object): ''' Calculate the distance between two arrays using their PDFs. Parameters ---------- img1 : numpy.ndarray Array (1-3D). img2 : numpy.ndarray Array (1-3D). min_val1 : float, optional Minimum value to keep in img1 min_val2 : float, optional Minimum value to keep in img2 weights1 : numpy.ndarray, optional Weights to be used with img1 weights2 : numpy.ndarray, optional Weights to be used with img2 ''' def __init__(self, img1, img2, min_val1=0.0, min_val2=0.0, weights1=None, weights2=None): super(PDF_Distance, self).__init__() self.img1 = img1 self.img2 = img2 if weights1 is None: weights1 = np.ones(img1.shape) if weights2 is None: weights2 = np.ones(img2.shape) # We want to make sure we're using the same set of bins for the # comparisons. Unfortunately, we have redundant calculations to # do this, but it is somewhat necessary to keep PDF standalone. stand1 = standardize((img1 * weights1)[np.isfinite(img1) | (img1 > min_val1)]) stand2 = standardize((img2 * weights2)[np.isfinite(img2) | (img2 > min_val2)]) self.bins = common_histogram_bins(stand1, stand2) self.PDF1 = PDF(stand1, bins=self.bins) self.PDF1.run(verbose=False) self.PDF2 = PDF(stand2, bins=self.bins) self.PDF2.run(verbose=False)
[docs] def compute_hellinger_distance(self): ''' Computes the Hellinger Distance between the two PDFs. ''' self.hellinger_distance = hellinger(self.PDF1.pdf, self.PDF2.pdf) return self
[docs] def compute_ks_distance(self): ''' Compute the distance using the KS Test. ''' D, p = ks_2samp(self.PDF1.data, self.PDF2.data) self.ks_distance = D self.ks_pval = p return self
[docs] def compute_ad_distance(self): ''' Compute the distance using the Anderson Darling Test. ''' D, _, p = anderson_ksamp([self.PDF1.data, self.PDF2.data]) self.ad_distance = D self.ad_pval = p
[docs] def distance_metric(self, statistic='both', labels=None, verbose=False): ''' Calculate the distance. *NOTE:* The data are standardized before comparing to ensure the distance is calculated on the same scales. Parameters ---------- labels : tuple, optional Sets the labels in the output plot. verbose : bool, optional Enables plotting. ''' if statistic is 'both': self.compute_hellinger_distance() self.compute_ks_distance() self.compute_ad_distance() elif statistic is 'hellinger': self.compute_hellinger_distance() elif statistic is 'ks': self.compute_ks_distance() elif statistic is 'ad': self.compute_ad_distance() else: raise TypeError("statistic must be 'both', 'hellinger', or 'ks'.") if verbose: import matplotlib.pyplot as p # PDF if labels is None: label1 = "Input 1" label2 = "Input 2" else: label1 = labels[0] label2 = labels[1] p.subplot(131) p.loglog(self.PDF1.bins[self.PDF1.pdf > 0], self.PDF1.pdf[self.PDF1.pdf > 0], 'b-', label=label1) p.loglog(self.PDF1.bins[self.PDF2.pdf > 0], self.PDF2.pdf[self.PDF2.pdf > 0], 'g-', label=label2) p.legend(loc="best") p.grid(True) p.xlabel(r"$\Sigma/\overline{\Sigma}$") p.ylabel("PDF") # ECDF p.subplot(132) p.semilogx(self.PDF1.bins, self.PDF1.ecdf, 'b-') p.semilogx(self.PDF2.bins, self.PDF2.ecdf, 'g-') p.grid(True) p.xlabel(r"$\Sigma/\overline{\Sigma}$") p.ylabel("ECDF") # Array representation. p.subplot(233) if self.img1.ndim == 1: p.plot(self.img1, 'b-') elif self.img1.ndim == 2: p.imshow(self.img1, origin="lower", interpolation="nearest", cmap="binary") elif self.img1.ndim == 3: p.imshow(np.nansum(self.img1, axis=0), origin="lower", interpolation="nearest", cmap="binary") else: print("Visual representation works only up to 3D.") p.subplot(236) if self.img2.ndim == 1: p.plot(self.img2, 'b-') elif self.img2.ndim == 2: p.imshow(self.img2, origin="lower", interpolation="nearest", cmap="binary") elif self.img2.ndim == 3: p.imshow(np.nansum(self.img2, axis=0), origin="lower", interpolation="nearest", cmap="binary") else: print("Visual representation works only up to 3D.") p.show() return self