# Licensed under an MIT open source license - see LICENSE
'''
The density PDF as described by Kowal et al. (2007)
'''
import numpy as np
from scipy.stats import ks_2samp, anderson_ksamp
from ..stats_utils import hellinger, standardize, common_histogram_bins
[docs]class PDF(object):
'''
Create the PDF of a given array.
'''
def __init__(self, img, min_val=0.0, bins=None, weights=None, norm=False):
super(PDF, self).__init__()
self.img = img
# We want to remove NaNs and value below the threshold.
self.data = img[np.isfinite(img)]
self.data = self.data[self.data > min_val]
# Do the same for the weights, then apply weights to the data.
if weights is not None:
self.weights = weights[np.isfinite(img)]
self.weights = self.weights[self.data > min_val]
self.data *= self.weights
if norm:
# Normalize by the average
self.data /= np.mean(self.data, axis=None)
self._bins = bins
[docs] def make_pdf(self, bins=None):
'''
Create the PDF.
'''
if bins is not None:
self._bins = bins
# If the number of bins is not given, use sqrt of data length.
if self.bins is None:
self._bins = np.sqrt(self.data.shape[0])
self._bins = int(np.round(self.bins))
norm_weights = np.ones_like(self.data) / self.data.shape[0]
self._pdf, bin_edges = np.histogram(self.data, bins=self.bins,
density=False,
weights=norm_weights)
self._bins = (bin_edges[:-1] + bin_edges[1:]) / 2
return self
@property
def pdf(self):
return self._pdf
@property
def bins(self):
return self._bins
[docs] def make_ecdf(self):
'''
Create the ECDF.
'''
self._ecdf = np.cumsum(np.sort(self.data.ravel())) / np.sum(self.data)
return self
@property
def ecdf(self):
return self._ecdf
[docs] def run(self, verbose=False):
'''
Run the whole thing.
'''
self.make_pdf()
self.make_ecdf()
if verbose:
import matplotlib.pyplot as p
# PDF
p.subplot(131)
p.loglog(self.bins[self.pdf > 0], self.pdf[self.pdf > 0], 'b-')
p.grid(True)
p.xlabel(r"$\Sigma/\overline{\Sigma}$")
p.ylabel("PDF")
# ECDF
p.subplot(132)
p.semilogx(np.sort(self.data.ravel()), self.ecdf, 'b-')
p.grid(True)
p.xlabel(r"$\Sigma/\overline{\Sigma}$")
p.ylabel("ECDF")
# Array representation.
p.subplot(133)
if self.img.ndim == 1:
p.plot(self.img, 'b-')
elif self.img.ndim == 2:
p.imshow(self.img, origin="lower", interpolation="nearest",
cmap="binary")
elif self.img.ndim == 3:
p.imshow(np.nansum(self.img, axis=0), origin="lower",
interpolation="nearest", cmap="binary")
else:
print("Visual representation works only up to 3D.")
p.show()
return self
[docs]class PDF_Distance(object):
'''
Calculate the distance between two arrays using their PDFs.
Parameters
----------
img1 : numpy.ndarray
Array (1-3D).
img2 : numpy.ndarray
Array (1-3D).
min_val1 : float, optional
Minimum value to keep in img1
min_val2 : float, optional
Minimum value to keep in img2
weights1 : numpy.ndarray, optional
Weights to be used with img1
weights2 : numpy.ndarray, optional
Weights to be used with img2
'''
def __init__(self, img1, img2, min_val1=0.0, min_val2=0.0,
weights1=None, weights2=None):
super(PDF_Distance, self).__init__()
self.img1 = img1
self.img2 = img2
if weights1 is None:
weights1 = np.ones(img1.shape)
if weights2 is None:
weights2 = np.ones(img2.shape)
# We want to make sure we're using the same set of bins for the
# comparisons. Unfortunately, we have redundant calculations to
# do this, but it is somewhat necessary to keep PDF standalone.
stand1 = standardize((img1 * weights1)[np.isfinite(img1) |
(img1 > min_val1)])
stand2 = standardize((img2 * weights2)[np.isfinite(img2) |
(img2 > min_val2)])
self.bins = common_histogram_bins(stand1, stand2)
self.PDF1 = PDF(stand1, bins=self.bins)
self.PDF1.run(verbose=False)
self.PDF2 = PDF(stand2, bins=self.bins)
self.PDF2.run(verbose=False)
[docs] def compute_hellinger_distance(self):
'''
Computes the Hellinger Distance between the two PDFs.
'''
self.hellinger_distance = hellinger(self.PDF1.pdf, self.PDF2.pdf)
return self
[docs] def compute_ks_distance(self):
'''
Compute the distance using the KS Test.
'''
D, p = ks_2samp(self.PDF1.data, self.PDF2.data)
self.ks_distance = D
self.ks_pval = p
return self
[docs] def compute_ad_distance(self):
'''
Compute the distance using the Anderson Darling Test.
'''
D, _, p = anderson_ksamp([self.PDF1.data, self.PDF2.data])
self.ad_distance = D
self.ad_pval = p
[docs] def distance_metric(self, statistic='both', labels=None, verbose=False):
'''
Calculate the distance.
*NOTE:* The data are standardized before comparing to ensure the
distance is calculated on the same scales.
Parameters
----------
labels : tuple, optional
Sets the labels in the output plot.
verbose : bool, optional
Enables plotting.
'''
if statistic is 'both':
self.compute_hellinger_distance()
self.compute_ks_distance()
self.compute_ad_distance()
elif statistic is 'hellinger':
self.compute_hellinger_distance()
elif statistic is 'ks':
self.compute_ks_distance()
elif statistic is 'ad':
self.compute_ad_distance()
else:
raise TypeError("statistic must be 'both', 'hellinger', or 'ks'.")
if verbose:
import matplotlib.pyplot as p
# PDF
if labels is None:
label1 = "Input 1"
label2 = "Input 2"
else:
label1 = labels[0]
label2 = labels[1]
p.subplot(131)
p.loglog(self.PDF1.bins[self.PDF1.pdf > 0],
self.PDF1.pdf[self.PDF1.pdf > 0], 'b-', label=label1)
p.loglog(self.PDF1.bins[self.PDF2.pdf > 0],
self.PDF2.pdf[self.PDF2.pdf > 0], 'g-', label=label2)
p.legend(loc="best")
p.grid(True)
p.xlabel(r"$\Sigma/\overline{\Sigma}$")
p.ylabel("PDF")
# ECDF
p.subplot(132)
p.semilogx(self.PDF1.bins, self.PDF1.ecdf, 'b-')
p.semilogx(self.PDF2.bins, self.PDF2.ecdf, 'g-')
p.grid(True)
p.xlabel(r"$\Sigma/\overline{\Sigma}$")
p.ylabel("ECDF")
# Array representation.
p.subplot(233)
if self.img1.ndim == 1:
p.plot(self.img1, 'b-')
elif self.img1.ndim == 2:
p.imshow(self.img1, origin="lower", interpolation="nearest",
cmap="binary")
elif self.img1.ndim == 3:
p.imshow(np.nansum(self.img1, axis=0), origin="lower",
interpolation="nearest", cmap="binary")
else:
print("Visual representation works only up to 3D.")
p.subplot(236)
if self.img2.ndim == 1:
p.plot(self.img2, 'b-')
elif self.img2.ndim == 2:
p.imshow(self.img2, origin="lower", interpolation="nearest",
cmap="binary")
elif self.img2.ndim == 3:
p.imshow(np.nansum(self.img2, axis=0), origin="lower",
interpolation="nearest", cmap="binary")
else:
print("Visual representation works only up to 3D.")
p.show()
return self