Source code for easyvvuq.comparison.validate
"""Validation by comparing QoI distributions.
"""
import numpy as np
import scipy.stats as st
from . import BaseComparisonElement
__copyright__ = """
Copyright 2018 Robin A. Richardson, David W. Wright
This file is part of EasyVVUQ
EasyVVUQ is free software: you can redistribute it and/or modify
it under the terms of the Lesser GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
EasyVVUQ is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Lesser GNU General Public License for more details.
You should have received a copy of the Lesser GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
__author__ = 'Jalal Lakhlili'
__license__ = "LGPL"
class ValidateSimilarity(BaseComparisonElement):
    """Base comparison element for distance-based validation.

    The metric itself is provided by subclasses through ``dist``;
    ``compare`` only handles dispatching the inputs to it.
    """

    def __init__(self):
        pass

    def dist(self, p, q):
        """Return the distance between ``p`` and ``q``.

        This base implementation always raises ``NotImplementedError``.
        """
        raise NotImplementedError

    def compare(self, dataframe1, dataframe2):
        """Compute distances between two collections of distributions.

        Parameters
        ----------
        dataframe1 : NumPy array or list
        dataframe2 : NumPy array or list

        Returns
        -------
        When ``dataframe1`` is two-dimensional, a list holding one
        ``dist`` result per row index (row ``i`` of ``dataframe1``
        against row ``i`` of ``dataframe2``). Otherwise the single
        result of ``dist`` applied to both inputs as a whole.

        Raises
        ------
        RuntimeError
            If ``len(dataframe1)`` differs from ``len(dataframe2)``.
        """
        n_entries = len(dataframe1)
        if n_entries != len(dataframe2):
            raise RuntimeError("Input dataframe sizes are not equal")

        # Only the dimensionality of the first input decides the mode.
        is_batch = len(np.shape(dataframe1)) == 2
        if not is_batch:
            return self.dist(np.array(dataframe1), np.array(dataframe2))

        # Rows are fetched by index and copied into fresh arrays before
        # being handed to the metric.
        return [
            self.dist(np.array(dataframe1[row]), np.array(dataframe2[row]))
            for row in range(n_entries)
        ]
class ValidateSimilarityHellinger(ValidateSimilarity):
    """Comparison element using the Hellinger distance as metric."""

    def element_name(self):
        """Return the identifier string of this element."""
        return "validate_similarity_hellinger"

    def element_version(self):
        """Return the version string of this element."""
        return "0.1"

    def dist(self, p, q):
        """ Compute Hellinger distance between two discrete probability
        distributions (PDF). The Hellinger distance metric gives an
        output in the range [0,1] with values closer to 0 meaning the
        PDFs are more similar.

        Both inputs are normalised by their sum before the distance is
        evaluated. The caller's arrays are left unmodified, and integer
        inputs (e.g. raw histogram counts) are accepted.

        Parameters
        ----------
        p : NumPy array
        q : NumPy array

        Returns
        -------
        Hellinger distance between distributions p and q.

        https://en.wikipedia.org/wiki/Hellinger_distance
        """
        # Convert to float and normalise out of place: the in-place form
        # (``p /= p.sum()``) fails on integer arrays and would overwrite
        # the caller's data.
        p = np.asarray(p, dtype=float)
        q = np.asarray(q, dtype=float)
        p = p / p.sum()
        q = q / q.sum()

        # Bhattacharyya coefficient; equals 1 for identical distributions.
        coefficient = np.sqrt(p * q).sum()

        # Rounding can push the coefficient marginally above 1, which
        # would make the square root NaN. np.maximum (unlike the builtin
        # max) still propagates a genuine NaN.
        return np.sqrt(np.maximum(1. - coefficient, 0.))
class ValidateSimilarityJensenShannon(ValidateSimilarity):
    """Comparison element using the Jensen-Shannon distance as metric."""

    def element_name(self):
        """Return the identifier string of this element."""
        return "validate_similarity_jensen_shannon"

    def element_version(self):
        """Return the version string of this element."""
        return "0.1"

    def dist(self, p, q):
        """ Compute Jensen-Shannon distance between two discrete
        probability distributions (PDF). It is based on Kullback–Leibler
        divergence and gives an output metric in the range [0,1] with
        values closer to 0 meaning the PDFs are more similar.

        Both inputs are normalised by their sum before the distance is
        evaluated. The caller's arrays are left unmodified, and integer
        inputs (e.g. raw histogram counts) are accepted.

        Parameters
        ----------
        p : NumPy array
        q : NumPy array

        Returns
        -------
        Jensen-Shannon distance between distributions p and q, i.e. the
        square root of the Jensen-Shannon divergence expressed in bits.

        https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence
        https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
        """
        # Convert to float and normalise out of place: the in-place form
        # (``p /= p.sum()``) fails on integer arrays and would overwrite
        # the caller's data.
        p = np.asarray(p, dtype=float)
        q = np.asarray(q, dtype=float)
        p = p / p.sum()
        q = q / q.sum()

        # Mixture distribution against which both inputs are compared.
        m = 0.5 * (p + q)

        # st.entropy(pk, qk) with two arguments is the KL divergence in
        # nats; dividing by log(2) converts the result to bits.
        div = 0.5 * (st.entropy(p, m) + st.entropy(q, m))

        # Clip at zero so a rounding-negative divergence cannot turn the
        # square root into NaN; np.maximum still propagates a real NaN.
        return np.sqrt(np.maximum(div / np.log(2), 0.))
class ValidateSimilarityWasserstein(ValidateSimilarity):
    """Comparison element using the Wasserstein distance as metric."""

    def element_name(self):
        """Return the identifier string of this element."""
        return "validate_similarity_wasserstein"

    def element_version(self):
        """Return the version string of this element."""
        return "0.1"

    def dist(self, p, q):
        """ Compute Wasserstein distance between two discrete cumulative
        distributions (CDF). The Wasserstein distance has an
        unrestricted range with a lower limit of 0. A smaller distance
        indicates a stronger similarity between CDFs.

        The computation is delegated entirely to
        ``scipy.stats.wasserstein_distance``.

        Parameters
        ----------
        p : NumPy array
        q : NumPy array

        Returns
        -------
        Wasserstein distance between distributions p and q.

        https://en.wikipedia.org/wiki/Wasserstein_metric
        """
        # NOTE(review): SciPy documents these two arguments as the values
        # observed in each empirical distribution, not as CDF ordinates;
        # confirm that callers pass what is intended.
        distance = st.wasserstein_distance(u_values=p, v_values=q)
        return distance