Source code for easyvvuq.analysis.ensemble_boot

"""Provides analysis element for ensemble bootstrapping analysis.
"""
import numpy as np
import pandas as pd
from easyvvuq import OutputType
from .base import BaseAnalysisElement

__copyright__ = """

    Copyright 2018 Robin A. Richardson, David W. Wright

    This file is part of EasyVVUQ

    EasyVVUQ is free software: you can redistribute it and/or modify
    it under the terms of the Lesser GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    EasyVVUQ is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    Lesser GNU General Public License for more details.

    You should have received a copy of the Lesser GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
__license__ = "LGPL"


[docs]def confidence_interval(dist, value, alpha, pivotal=False): """ Get the bootstrap confidence interval for a given distribution. Parameters ---------- dist: Array containing distribution of bootstrap results. value: Value of statistic for which we are calculating error bars. alpha: The alpha value for the confidence intervals. pivotal: Use the pivotal method? Default to percentile method. Returns ------- float: Value of the bootstrap statistic float: Highest value of the confidence interval float: Lowest value of the confidence interval """ if len(dist) < 1: raise ValueError("Dist array should be non-empty") if pivotal: low = 2 * value - np.percentile(dist, 100 * (1 - alpha / 2.), axis=0) stat = value high = 2 * value - np.percentile(dist, 100 * (alpha / 2.), axis=0) else: low = np.percentile(dist, 100 * (alpha / 2.), axis=0) stat = np.percentile(dist, 50) high = np.percentile(dist, 100 * (1 - alpha / 2.), axis=0) # if low > high: # (low, high) = (high, low) return stat, low, high
[docs]def bootstrap(data, stat_func, alpha=0.05, sample_size=None, n_samples=1000, pivotal=False): """ Parameters ---------- data : :obj:`pandas.DataFrame` Input data to be analysed. stat_func : function Statistical function to be applied to data for bootstrapping. alpha : float Produce estimate of 100.0*(1-`alpha`) confidence interval. sample_size : int Size of the sample to be drawn from the input data. n_samples : int Number of times samples are to be drawn from the input data. pivotal : bool Use the pivotal method? Default to percentile method. Returns ------- float: Value of the bootstrap statistic float: Highest value of the confidence interval float: Lowest value of the confidence interval """ if data.empty: raise RuntimeError("DataFrame passed to bootstrap has to be non-empty") stat = data.apply(stat_func) if sample_size is None: sample_size = len(data) dist = [] for l in range(n_samples): sample = data.sample(sample_size, replace=True) dist.append(stat_func(sample)) return confidence_interval(dist, stat, alpha, pivotal=pivotal)
[docs]def ensemble_bootstrap(data, groupby=[], qoi_cols=[], stat_func=np.mean, alpha=0.05, sample_size=None, n_samples=1000, pivotal=False, stat_name='boot'): """ Perform bootstrapping analysis on input data. Parameters ---------- data : :obj:`pandas.DataFrame` DataFrame to be analysed. groupby : list or None Columns to use to group the data in `analyse` method before calculating stats. qoi_cols : list or None Columns of quantities of interest (for which stats will be calculated). stat_func : function Statistical function to be applied to data for bootstrapping. alpha : float, default=0.05 Produce estimate of 100.0*(1-`alpha`) confidence interval. sample_size : int Size of the sample to be drawn from the input data. n_samples : int, default=1000 Number of times samples are to be drawn from the input data. pivotal : bool, default=False Use the pivotal method? Default to percentile method. stat_name : str, default='boot' Name to use to describe columns containing output statistic (for example 'mean'). Returns ------- :obj:`pandas.DataFrame` Description of input data using bootstrap statistic and high/low confidence intervals. """ agg_funcs = {} if not qoi_cols: qoi_cols = [ x for x in data.columns if x not in groupby + ['run_id', 'status']] for col in qoi_cols: if col not in data: raise RuntimeError(f"No such attribute: {col}\nAttributes found in data: {data}") agg_funcs[col] = lambda x: bootstrap( x, stat_func=stat_func, alpha=alpha, sample_size=sample_size, n_samples=n_samples, pivotal=pivotal) if not groupby: grouped_data = data.groupby(lambda x: True, sort=False) else: grouped_data = data.groupby(groupby, sort=False) # Apply bootstrapping to all value columns selected # Note results come a tuple per cell results = grouped_data.agg(agg_funcs) outputs = [stat_name, 'low', 'high'] # Split out tuples in each cell and provide sensible naming results = pd.concat({col: results[col].apply( lambda cell: pd.Series(cell, index=outputs) ) for col in qoi_cols}, axis=1) return results
[docs]class EnsembleBoot(BaseAnalysisElement): def __init__(self, groupby=[], qoi_cols=[], stat_func=np.mean, alpha=0.05, sample_size=None, n_boot_samples=1000, pivotal=False, stat_name='boot'): """ Element to perform bootstrapping on collated simulation output. Parameters ---------- groupby : list or None Columns to use to group the data in `analyse` method before calculating stats. qoi_cols : list or None Columns of quantities of interest (for which stats will be calculated). stat_func : function Statistical function to be applied to data for bootstrapping. alpha : float, default=0.05 Produce estimate of 100.0*(1-`alpha`) confidence interval. sample_size : int Size of the sample to be drawn from the input data. n_boot_samples : int, default=1000 Number of times samples are to be drawn from the input data. pivotal : bool, default=False Use the pivotal method? Default to percentile method. stat_name : str, default='boot' Name to use to describe columns containing output statistic (for example 'mean'). """ self.groupby = groupby self.qoi_cols = qoi_cols self.stat_func = stat_func self.alpha = alpha self.sample_size = sample_size self.n_boot_samples = n_boot_samples self.pivotal = pivotal self.stat_name = stat_name self.output_type = OutputType.SUMMARY if self.stat_func is None: raise ValueError('stat_func cannot be None.')
[docs] def element_name(self): """Name for this element for logging purposes""" return "ensemble_boot"
[docs] def element_version(self): """Version of this element for logging purposes""" return "0.1"
[docs] def analyse(self, data_frame=None): """Perform bootstrapping analysis on the input `data_frame`. The data_frame is grouped according to `self.groupby` if specified and analysis is performed on the columns selected in `self.qoi_cols` if set. Parameters ---------- data_frame : :obj:`pandas.DataFrame` Summary data produced through collation of simulation output. Returns ------- :obj:`pandas.DataFrame` Basic statistic for selected columns and groupings of data. """ if data_frame is None: raise RuntimeError( "This VVUQ element needs a data frame to analyse") elif data_frame.empty: raise RuntimeError( "No data in data frame passed to analyse element") results = ensemble_bootstrap( data_frame, groupby=self.groupby, qoi_cols=self.qoi_cols, stat_func=self.stat_func, alpha=self.alpha, sample_size=self.sample_size, n_samples=self.n_boot_samples, pivotal=self.pivotal, stat_name=self.stat_name) return results