"""Provides analysis element for ensemble bootstrapping analysis.
"""
import numpy as np
import pandas as pd
from easyvvuq import OutputType
from .base import BaseAnalysisElement
__copyright__ = """
Copyright 2018 Robin A. Richardson, David W. Wright
This file is part of EasyVVUQ
EasyVVUQ is free software: you can redistribute it and/or modify
it under the terms of the Lesser GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
EasyVVUQ is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Lesser GNU General Public License for more details.
You should have received a copy of the Lesser GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
__license__ = "LGPL"
def confidence_interval(dist, value, alpha, pivotal=False):
    """Get the bootstrap confidence interval for a given distribution.

    Parameters
    ----------
    dist : array_like
        Array containing distribution of bootstrap results. Percentiles are
        taken along the first axis, so each entry may be a scalar or a
        vector of per-column statistics.
    value :
        Value of statistic for which we are calculating error bars. Only
        used by the pivotal method.
    alpha : float
        The alpha value for the confidence intervals.
    pivotal : bool
        Use the pivotal method? Default to percentile method.

    Returns
    -------
    stat :
        Value of the bootstrap statistic: `value` for the pivotal method,
        the median of `dist` for the percentile method.
    low :
        Lowest value of the confidence interval
    high :
        Highest value of the confidence interval

    Raises
    ------
    ValueError
        If `dist` is empty.
    """
    if len(dist) < 1:
        raise ValueError("Dist array should be non-empty")

    lower_pct = np.percentile(dist, 100 * (alpha / 2.), axis=0)
    upper_pct = np.percentile(dist, 100 * (1 - alpha / 2.), axis=0)

    if pivotal:
        # Pivotal interval reflects the percentiles around the observed value.
        return value, 2 * value - upper_pct, 2 * value - lower_pct

    # Use axis=0 here as well so the median keeps the same shape as the
    # low/high bounds when `dist` holds one row per bootstrap sample.
    stat = np.percentile(dist, 50, axis=0)
    return stat, lower_pct, upper_pct
def bootstrap(data, stat_func, alpha=0.05,
              sample_size=None, n_samples=1000,
              pivotal=False):
    """Estimate a statistic and its confidence interval by bootstrapping.

    Parameters
    ----------
    data : :obj:`pandas.DataFrame` or :obj:`pandas.Series`
        Input data to be analysed.
    stat_func : function
        Statistical function to be applied to data for bootstrapping.
    alpha : float
        Produce estimate of 100.0*(1-`alpha`) confidence interval.
    sample_size : int
        Size of the sample to be drawn from the input data. Defaults to the
        length of `data`.
    n_samples : int
        Number of times samples are to be drawn from the input data.
    pivotal : bool
        Use the pivotal method? Default to percentile method.

    Returns
    -------
    stat :
        Value of the bootstrap statistic
    low :
        Lowest value of the confidence interval
    high :
        Highest value of the confidence interval

    Raises
    ------
    RuntimeError
        If `data` is empty.
    """
    if data.empty:
        raise RuntimeError("DataFrame passed to bootstrap has to be non-empty")

    if isinstance(data, pd.Series):
        # Series.apply maps a plain (non-ufunc) function over individual
        # elements, which would not yield the statistic of the whole series.
        stat = stat_func(data)
    else:
        stat = data.apply(stat_func)

    if sample_size is None:
        sample_size = len(data)

    # Resample with replacement and evaluate the statistic on each draw.
    dist = [stat_func(data.sample(sample_size, replace=True))
            for _ in range(n_samples)]

    return confidence_interval(dist, stat, alpha, pivotal=pivotal)
def ensemble_bootstrap(data, groupby=None, qoi_cols=None,
                       stat_func=np.mean, alpha=0.05,
                       sample_size=None, n_samples=1000,
                       pivotal=False, stat_name='boot'):
    """Perform bootstrapping analysis on input data.

    Parameters
    ----------
    data : :obj:`pandas.DataFrame`
        DataFrame to be analysed.
    groupby : list or None
        Columns to use to group the data before calculating stats. If empty
        or None the whole frame is treated as a single group.
    qoi_cols : list or None
        Columns of quantities of interest (for which stats will be
        calculated). If empty or None, every column other than the
        `groupby` columns, 'run_id' and 'status' is used.
    stat_func : function
        Statistical function to be applied to data for bootstrapping.
    alpha : float, default=0.05
        Produce estimate of 100.0*(1-`alpha`) confidence interval.
    sample_size : int
        Size of the sample to be drawn from the input data.
    n_samples : int, default=1000
        Number of times samples are to be drawn from the input data.
    pivotal : bool, default=False
        Use the pivotal method? Default to percentile method.
    stat_name : str, default='boot'
        Name to use to describe columns containing output statistic (for example
        'mean').

    Returns
    -------
    :obj:`pandas.DataFrame`
        Description of input data using bootstrap statistic and high/low
        confidence intervals.

    Raises
    ------
    RuntimeError
        If a requested quantity of interest is not a column of `data`.
    """
    if groupby is None:
        groupby = []

    if not qoi_cols:
        # A single column name is accepted as shorthand for a one-item list.
        group_cols = [groupby] if isinstance(groupby, str) else list(groupby)
        excluded = group_cols + ['run_id', 'status']
        qoi_cols = [col for col in data.columns if col not in excluded]

    for col in qoi_cols:
        if col not in data:
            raise RuntimeError(
                f"No such attribute: {col}\n"
                f"Attributes found in data: {list(data.columns)}")

    def _bootstrap_column(column):
        # Bootstrap one column of one group using the shared settings.
        return bootstrap(
            column,
            stat_func=stat_func,
            alpha=alpha,
            sample_size=sample_size,
            n_samples=n_samples,
            pivotal=pivotal)

    agg_funcs = {col: _bootstrap_column for col in qoi_cols}

    if not groupby:
        # Constant key puts every row into one group.
        grouped_data = data.groupby(lambda x: True, sort=False)
    else:
        grouped_data = data.groupby(groupby, sort=False)

    # Apply bootstrapping to all value columns selected
    # Note results come as a tuple per cell
    results = grouped_data.agg(agg_funcs)

    outputs = [stat_name, 'low', 'high']

    # Split out tuples in each cell and provide sensible naming
    results = pd.concat(
        {col: results[col].apply(lambda cell: pd.Series(cell, index=outputs))
         for col in qoi_cols},
        axis=1)

    return results
class EnsembleBoot(BaseAnalysisElement):
    """Analysis element performing bootstrapping on collated simulation output."""

    def __init__(self, groupby=None, qoi_cols=None,
                 stat_func=np.mean, alpha=0.05,
                 sample_size=None, n_boot_samples=1000,
                 pivotal=False, stat_name='boot'):
        """
        Element to perform bootstrapping on collated simulation output.

        Parameters
        ----------
        groupby : list or None
            Columns to use to group the data in `analyse` method before
            calculating stats.
        qoi_cols : list or None
            Columns of quantities of interest (for which stats will be
            calculated).
        stat_func : function
            Statistical function to be applied to data for bootstrapping.
        alpha : float, default=0.05
            Produce estimate of 100.0*(1-`alpha`) confidence interval.
        sample_size : int
            Size of the sample to be drawn from the input data.
        n_boot_samples : int, default=1000
            Number of times samples are to be drawn from the input data.
        pivotal : bool, default=False
            Use the pivotal method? Default to percentile method.
        stat_name : str, default='boot'
            Name to use to describe columns containing output statistic (for example
            'mean').

        Raises
        ------
        ValueError
            If `stat_func` is None.
        """
        if stat_func is None:
            raise ValueError('stat_func cannot be None.')

        # Give each instance its own list so that mutating one element's
        # settings cannot leak into other instances built with the defaults.
        self.groupby = [] if groupby is None else groupby
        self.qoi_cols = [] if qoi_cols is None else qoi_cols

        self.stat_func = stat_func
        self.alpha = alpha
        self.sample_size = sample_size
        self.n_boot_samples = n_boot_samples
        self.pivotal = pivotal
        self.stat_name = stat_name

        self.output_type = OutputType.SUMMARY

    def element_name(self):
        """Name for this element for logging purposes"""
        return "ensemble_boot"

    def element_version(self):
        """Version of this element for logging purposes"""
        return "0.1"

    def analyse(self, data_frame=None):
        """Perform bootstrapping analysis on the input `data_frame`.

        The data_frame is grouped according to `self.groupby` if specified and
        analysis is performed on the columns selected in `self.qoi_cols` if set.

        Parameters
        ----------
        data_frame : :obj:`pandas.DataFrame`
            Summary data produced through collation of simulation output.

        Returns
        -------
        :obj:`pandas.DataFrame`
            Bootstrap statistic and low/high confidence bounds for selected
            columns and groupings of data.

        Raises
        ------
        RuntimeError
            If `data_frame` is None or empty.
        """
        if data_frame is None:
            raise RuntimeError(
                "This VVUQ element needs a data frame to analyse")
        if data_frame.empty:
            raise RuntimeError(
                "No data in data frame passed to analyse element")

        return ensemble_bootstrap(
            data_frame,
            groupby=self.groupby,
            qoi_cols=self.qoi_cols,
            stat_func=self.stat_func,
            alpha=self.alpha,
            sample_size=self.sample_size,
            n_samples=self.n_boot_samples,
            pivotal=self.pivotal,
            stat_name=self.stat_name)