Source code for easyvvuq.analysis.qmc_analysis

"""Analysis element for Quasi-Monte Carlo (QMC) sensitivity analysis.

Please refer to the article below for the basic approach used here.
https://en.wikipedia.org/wiki/Variance-based_sensitivity_analysis
"""
import logging
import numpy as np
from easyvvuq import OutputType
from .base import BaseAnalysisElement
from easyvvuq.sampling import QMCSampler
from .results import AnalysisResults
from easyvvuq.sampling import MCSampler
from .ensemble_boot import confidence_interval

__author__ = 'Jalal Lakhlili'
__license__ = "LGPL"

logger = logging.getLogger(__name__)



[docs]
class QMCAnalysisResults(AnalysisResults):
    """Analysis results for the QMCAnalysis Method. Refer to the AnalysisResults base class
    documentation for details on using it.
    """

    def _get_sobols_first(self, qoi, input_):
        raw_dict = AnalysisResults._keys_to_tuples(self.raw_data['sobols_first'])
        return raw_dict[AnalysisResults._to_tuple(qoi)][input_][0]

    def _get_sobols_total(self, qoi, input_):
        raw_dict = AnalysisResults._keys_to_tuples(self.raw_data['sobols_total'])
        return raw_dict[AnalysisResults._to_tuple(qoi)][input_][0]

    def _get_sobols_second(self, qoi, input_):
        raise NotImplementedError

    def _get_sobols_first_conf(self, qoi, input_):
        raw_dict = AnalysisResults._keys_to_tuples(self.raw_data['conf_sobols_first'])
        return [raw_dict[AnalysisResults._to_tuple(qoi)][input_]['low'][0],
                raw_dict[AnalysisResults._to_tuple(qoi)][input_]['high'][0]]

    def _get_sobols_total_conf(self, qoi, input_):
        raw_dict = AnalysisResults._keys_to_tuples(self.raw_data['conf_sobols_total'])
        return [raw_dict[AnalysisResults._to_tuple(qoi)][input_]['low'][0],
                raw_dict[AnalysisResults._to_tuple(qoi)][input_]['high'][0]]


[docs]
    def supported_stats(self):
        """Types of statistics supported by the describe method.

        Returns
        -------
        list of str
        """
        return ['mean', 'var', 'std', 'min', 'max', 'median', 'percentiles', '1%', '10%', '50%', '90%', '99%']


    def _describe(self, qoi, statistic):
        if statistic not in self.supported_stats():
            raise NotImplementedError
        if statistic == '1%':
            return self.raw_data['percentiles'][qoi]['p1']
        if statistic == '10%':
            return self.raw_data['percentiles'][qoi]['p10']
        elif statistic == '50%':
            return self.raw_data['percentiles'][qoi]['p50']
        elif statistic == '90%':
            return self.raw_data['percentiles'][qoi]['p90']
        elif statistic == '99%':
            return self.raw_data['percentiles'][qoi]['p99']
        else:
            return self.raw_data['statistical_moments'][qoi][statistic][0]




[docs]
class QMCAnalysis(BaseAnalysisElement):
    def __init__(self, sampler, qoi_cols=None):
        """Analysis element for Quasi-Monte Carlo (QMC).

        Parameters
        ----------
        sampler : easyvvuq.sampling.qmc.QMCSampler
            Sampler used to initiate the QMC analysis
        qoi_cols : list or None
            Column names for quantities of interest (for which analysis is to be
            performed).
        """
        if not isinstance(sampler, QMCSampler) and not isinstance(sampler, MCSampler):
            raise RuntimeError(
                'QMCAnalysis class relies on the QMCSampler or MCSampler as its sampling component')
        if qoi_cols is None:
            self.qoi_cols = list(sampler.vary.get_keys())
        else:
            self.qoi_cols = qoi_cols
        self.output_type = OutputType.SUMMARY
        self.sampler = sampler


[docs]
    def element_name(self):
        """Name for this element.

        Return
        ------
        str:
            "QMC_Analysis"
        """
        return "QMC_Analysis"



[docs]
    def element_version(self):
        """Version of this element.

        Return
        ------
        str:
            Element version.
        """
        return "0.2"



[docs]
    def contains_nan(self, values):
        """Checks if ``None`` or ``numpy.nan`` exists in `values`. Returns ``True`` if
        any there are at least one occurrence of ``None`` or ``numpy.nan``.
        Parameters
        ----------
        values : array_like, list, number
            `values` where to check for occurrences of ``None`` or ``np.nan``.
            Can be irregular and have any number of nested elements.
        Returns
        -------
        bool
            ``True`` if `values` has at least one occurrence of ``None`` or
            ``numpy.nan``.
        """
        # To speed up we first try the fast option np.any(np.isnan(values))
        try:
            return np.any(np.isnan(values))
        except (ValueError, TypeError):
            if values is None or values is np.nan:
                return True
            # To solve the problem of float/int as well as numpy int/flaot
            elif np.isscalar(values) and np.isnan(values):
                return True
            elif hasattr(values, "__iter__"):
                for value in values:
                    if self.contains_nan(value):
                        return True

                return False
            else:
                return False



[docs]
    def create_mask(self, samples):
        """
        Mask samples that do not give results (anything but np.nan or None).
        Parameters
        ----------
        samples : array_like
            Evaluations for the model.
        Returns
        -------
        masked_samples : list
            The evaluations that have results (not numpy.nan or None).
        mask : boolean array
            The mask itself, used to create the masked arrays.
        """
        masked_samples = []
        mask = np.ones(len(samples), dtype=bool)

        for i, result in enumerate(samples):
            # if np.any(np.isnan(result)):
            if self.contains_nan(result):
                mask[i] = False
            else:
                masked_samples.append(result)

        return masked_samples, mask     



[docs]
    def analyse(self, data_frame):
        """Perform QMC analysis on a given pandas DataFrame.

        Parameters
        ----------
        data_frame : pandas DataFrame
            Input data for analysis.

        Returns
        -------
        easyvvuq.analysis.qmc.QMCAnalysisResults
            AnalysisResults object for QMC.
        """
        if data_frame.empty:
            raise RuntimeError(
                "No data in data frame passed to analyse element")

        qoi_cols = self.qoi_cols

        results = {
            'statistical_moments': {k: {} for k in qoi_cols},
            'percentiles': {k: {} for k in qoi_cols},
            'sobols_first': {k: {} for k in qoi_cols},
            'sobols_total': {k: {} for k in qoi_cols},
            'conf_sobols_first': {k: {} for k in qoi_cols},
            'conf_sobols_total': {k: {} for k in qoi_cols}
        }

        # Extract output values for each quantity of interest from Dataframe
        samples = self.get_samples(data_frame)

        # Compute descriptive statistics for each quantity of interest
        for k in qoi_cols:
            # Find NaNs and create a mask excluding these samples from the analysis
            # https://github.com/simetenn/uncertainpy/blob/ffb2400289743066265b9a8561cdf3b72e478a28/src/uncertainpy/core/uncertainty_calculations.py#L1532
            masked_samples, mask = self.create_mask(samples[k])

            results['statistical_moments'][k] = {'mean': np.mean(masked_samples, axis=0),
                                                 'var': np.var(masked_samples, axis=0),
                                                 'std': np.std(masked_samples, axis=0),
                                                 'min': np.min(masked_samples, axis=0),
                                                 'max': np.max(masked_samples, axis=0),
                                                 'median': np.median(masked_samples, axis=0),
                                                 }
            results['percentiles'][k] = {'p1': np.percentile(masked_samples, 1, 0)[0],
                                         'p10': np.percentile(masked_samples, 10, 0)[0],
                                         'p50': np.percentile(masked_samples, 50, 0)[0],
                                         'p90': np.percentile(masked_samples, 90, 0)[0],
                                         'p99': np.percentile(masked_samples, 99, 0)[0]}

            # Replace Nan values by the mean before proceeding with the SA
            indices = np.where(mask == 0)[0] # samples[~mask] = results[k].mean
            for i in indices:
                samples[k][i] = results['statistical_moments'][k]['mean']

            if not np.all(mask):
                print("Warning: QoI \"{}\" only yields ".format(k) +
                    "results for {}/{} ".format(sum(mask), len(mask)) +
                    "parameter combinations. " +
                    "Runs {} are not valid. ".format(indices+1) +
                    "NaN results are set to the mean when calculating the Sobol indices. " +
                    "This might affect the Sobol indices.")
            
            sobols_first, conf_first, sobols_total, conf_total = \
                self.sobol_bootstrap(samples[k])
            results['sobols_first'][k] = sobols_first
            results['sobols_total'][k] = sobols_total
            results['conf_sobols_first'][k] = conf_first
            results['conf_sobols_total'][k] = conf_total

        return QMCAnalysisResults(raw_data=results, samples=data_frame,
                                  qois=self.qoi_cols, inputs=list(self.sampler.vary.get_keys()))



[docs]
    def get_samples(self, data_frame):
        """
        Converts the Pandas dataframe into a dictionary.

        Parameters
        ----------
        data_frame : pandas DataFrame
            the EasyVVUQ Pandas dataframe from collation.

        Returns
        -------
        dict :
            A dictionary with the QoI names as keys.
            Each element is a list of code evaluations.
        """
        samples = {k: [] for k in self.qoi_cols}
        for run_id in data_frame['run_id'].squeeze().unique():
            for k in self.qoi_cols:
                data = data_frame.loc[data_frame['run_id'].squeeze() == run_id][k]
                samples[k].append(data.values)
        return samples



[docs]
    def sobol_bootstrap_(self, samples, alpha=0.05, n_samples=1000):
        """
        Computes the first order and total order Sobol indices using Saltelli's
        method. To assess the sampling inaccuracy, bootstrap confidence intervals
        are also computed.

        Reference: A. Saltelli, Making best use of model evaluations to compute
        sensitivity indices, Computer Physics Communications, 2002.

        Parameters
        ----------
        samples : list
            The samples for a given QoI.
        alpha: float
            The (1 - alpha) * 100 confidence interval parameter. The default is 0.05.
        n_samples: int
            The number of bootstrap samples. The default is 1000.

        Returns
        -------
        sobols_first_dict, conf_first_dict, sobols_total_dict, conf_total_dict:
        dictionaries containing the first- and total-order Sobol indices for all
        parameters, and (1-alpha)*100 lower and upper confidence bounds.

        """
        assert len(samples) > 0
        assert alpha > 0.0
        assert alpha < 1.0
        assert n_samples > 0

        # convert to array
        samples = np.array(samples)
        # the number of parameter and the number of MC samples in n_mc * (n_params + 2)
        # and the size of the QoI
        n_params = self.sampler.n_params
        n_mc = self.sampler.n_mc_samples
        n_qoi = samples[0].size
        sobols_first_dict = {}
        conf_first_dict = {}
        sobols_total_dict = {}
        conf_total_dict = {}

        for j, param_name in enumerate(self.sampler.vary.get_keys()):
            # code evaluations of input matrices M1, M2 and Ni, i = 1,...,n_params
            # see reference above.
            f_M2, f_M1, f_Ni = self._separate_output_values(samples, n_params, n_mc)
            # our point estimate for the 1st and total order Sobol indices
            value_first = self._first_order(f_M2, f_M1, f_Ni[:, j])
            value_total = self._total_order(f_M2, f_M1, f_Ni[:, j])
            # array for resampled estimates
            sobols_first = np.zeros([n_samples, n_qoi])
            sobols_total = np.zeros([n_samples, n_qoi])
            for i in range(n_samples):
                # resample, must be done on already seperated output due to
                # the specific order in samples
                idx = np.random.randint(0, n_mc - 1, n_mc)
                f_M2_resample = f_M2[idx]
                f_M1_resample = f_M1[idx]
                f_Ni_resample = f_Ni[idx]
                # recompute Sobol indices
                sobols_first[i] = self._first_order(f_M2_resample, f_M1_resample,
                                                    f_Ni_resample[:, j])
                sobols_total[i] = self._total_order(f_M2_resample, f_M1_resample,
                                                    f_Ni_resample[:, j])
            # compute confidence intervals
            _, low_first, high_first = confidence_interval(sobols_first, value_first,
                                                           alpha, pivotal=True)
            _, low_total, high_total = confidence_interval(sobols_total, value_total,
                                                           alpha, pivotal=True)
            # store results
            sobols_first_dict[param_name] = value_first
            conf_first_dict[param_name] = {'low': low_first, 'high': high_first}
            sobols_total_dict[param_name] = value_total
            conf_total_dict[param_name] = {'low': low_total, 'high': high_total}

        return sobols_first_dict, conf_first_dict, sobols_total_dict, conf_total_dict

    

[docs]
    def sobol_bootstrap(self, samples, alpha=0.05, n_bootstrap=1000):
        """
        Computes the first order and total order Sobol indices using Saltelli's
        method. To assess the sampling inaccuracy, bootstrap confidence intervals
        are also computed.

        Reference: A. Saltelli, Making best use of model evaluations to compute
        sensitivity indices, Computer Physics Communications, 2002.

        Parameters
        ----------
        samples : list
            The samples for a given QoI.
        alpha: float
            The (1 - alpha) * 100 confidence interval parameter. The default is 0.05.
        n_samples: int
            The number of bootstrap samples. The default is 1000.

        Returns
        -------
        sobols_first_dict, conf_first_dict, sobols_total_dict, conf_total_dict:
        dictionaries containing the first- and total-order Sobol indices for all
        parameters, and (1-alpha)*100 lower and upper confidence bounds.

        """
        assert len(samples) > 0
        assert alpha > 0.0
        assert alpha < 1.0
        assert n_bootstrap > 0

        # convert to array
        samples = np.array(samples)
        # the number of parameter and the number of MC samples in n_mc * (n_params + 2)
        # and the size of the QoI
        n_params = self.sampler.n_params
        # n_mc = self.sampler.n_mc_samples
        n_mc = int(samples.shape[0]/(n_params + 2))
        n_qoi = samples[0].size
        sobols_first_dict = {}
        conf_first_dict = {}
        sobols_total_dict = {}
        conf_total_dict = {}

        # code evaluations of input matrices M1, M2 and Ni, i = 1,...,n_params
        # see reference above.
        f_M2, f_M1, f_Ni = self._separate_output_values(samples, n_params, n_mc)
        r = np.random.randint(n_mc, size=(n_mc, n_bootstrap))

        for j, param_name in enumerate(self.sampler.vary.get_keys()):

            # our point estimate for the 1st and total order Sobol indices
            value_first = self._first_order(f_M2, f_M1, f_Ni[:, j])
            value_total = self._total_order(f_M2, f_M1, f_Ni[:, j])

            # sobols computed from resampled data points
            if n_mc * n_bootstrap * n_qoi <= 10**7:
                #this is a vectorized computation, Is fast, but f_M2[r] will be of size
                #(n_mc, n_bootstrap, n_qoi), this can become too large and cause a crash, 
                #especially when dealing with large QoI (n_qoi >> 1). So this is only done
                #when n_mc * n_bootstrap * n_qoi <= 10**7
                print("Vectorized bootstrapping")
                sobols_first = self._first_order(f_M2[r], f_M1[r], f_Ni[r, j])
                sobols_total = self._total_order(f_M2[r], f_M1[r], f_Ni[r, j])
            else:
                #array for resampled estimates
                sobols_first = np.zeros([n_bootstrap, n_qoi])
                sobols_total = np.zeros([n_bootstrap, n_qoi])
                print("Sequential bootstrapping")
                #non-vectorized implementation
                for i in range(n_bootstrap):
                    #resampled sample matrices of size (n_mc, n_qoi)
                    indices = r[:, i]  
                    sobols_first[i] = self._first_order(f_M2[indices], f_M1[indices], f_Ni[indices, j])
                    sobols_total[i] = self._total_order(f_M2[indices], f_M1[indices], f_Ni[indices, j])

            # compute confidence intervals based on percentiles
            _, low_first, high_first = confidence_interval(sobols_first, value_first,
                                                           alpha, pivotal=True)
            _, low_total, high_total = confidence_interval(sobols_total, value_total,
                                                           alpha, pivotal=True)
            # store results
            sobols_first_dict[param_name] = value_first
            conf_first_dict[param_name] = {'low': low_first, 'high': high_first}
            sobols_total_dict[param_name] = value_total
            conf_total_dict[param_name] = {'low': low_total, 'high': high_total}

        return sobols_first_dict, conf_first_dict, sobols_total_dict, conf_total_dict



    # Adapted from SALib
    @staticmethod
    def _separate_output_values(samples, n_params, n_mc_samples):
        """There are n_params + 2 different input matrices: M1, M2, N_i,
        i=1,...,n_params.  (see reference under sobol_bootstrap). The
        EasyVVUQ dataframe is stored in the order:

        [sample from M2, sample from N1, N2, ... sample from N_n_params,
         sample from M1, repeat].

        This subroutine separates the output values into the contributions
        of the different input matrices.

        Parameters
        ----------
        samples: list
            The samples for a given QoI
        n_params: int
            The number of uncertain input parameters.
        n_mc_samples: int
            The number of MC samples per input matrix, i.e. the
          number of rows in M1, M2 or Ni.

        Returns
        -------
        NumPy arrays of the separated code evaluations: f_M2, f_M1, f_Ni, where
        f_Ni contains n_params entries corresponding to the n_params Ni matrices.

        """
        evaluations = np.array(samples)

        shape = (n_mc_samples, n_params) + evaluations[0].shape
        step = n_params + 2
        f_Ni = np.zeros(shape)

        f_M2 = evaluations[0:evaluations.shape[0]:step]
        f_M1 = evaluations[(step - 1):evaluations.shape[0]:step]

        for i in range(n_params):
            f_Ni[:, i] = evaluations[(i + 1):evaluations.shape[0]:step]

        return f_M2, f_M1, f_Ni

    @staticmethod
    def _first_order(f_M2, f_M1, f_Ni):
        """Calculate first order sensitivity indices.

        Parameters
        ----------
        f_M2: NumPy array
            Array of code evaluations on input array M2
        f_M1: NumPy array
            Array of code evaluations on input array M1
        f_Ni: NumPy array
            Array of code evaluations on input array Ni, i=1,...,n_params

        Returns
        -------
        A NumPy array of the n_params first-order Sobol indices.
        """
        V = np.var(np.r_[f_M2, f_M1], axis=0)
        return np.mean(f_M1 * (f_Ni - f_M2), axis=0) / (V + (V == 0)) * (V != 0)

    @staticmethod
    def _total_order(f_M2, f_M1, f_Ni):
        """Calculate total order sensitivity indices. See also:

        A Saltelli et al, Variance based sensitivity analysis of model output.
        Design and estimator for the total sensitivity index, 2009.

        Parameters
        ----------
        f_M2: NumPy array
            Array of code evaluations on input array M2 (matrix A in ref above)
        f_M1: NumPy array
            Array of code evaluations on input array M1 (matrix B in ref above)
        f_Ni: NumPy array
            Array of code evaluations on input array Ni, i=1,...,n_params
          (matrix AB in ref above)

        Returns
        -------
        A NumPy array of the n_params total-order Sobol indices.
        """
        V = np.var(np.r_[f_M2, f_M1], axis=0)
        return 0.5 * np.mean((f_M2 - f_Ni) ** 2, axis=0) / (V + (V == 0)) * (V != 0)