"""Module containing functions used for evaluating interval regions."""
import pandas as pd
import numpy as np
from typing import Union, List, Tuple, Optional
from .checks import check_type
def gather_intervals(
    lower_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    upper_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    intervals_with_predictions: Optional[np.ndarray] = None,
) -> Tuple[Union[np.ndarray, pd.Series], Union[np.ndarray, pd.Series]]:
    """Validate the passed intervals and return them as a (lower, upper) pair.

    Intervals can be supplied in exactly one of two ways: separately, through
    both lower_interval and upper_interval, or combined, through
    intervals_with_predictions. In the combined case the first column is
    returned as the lower interval and the third column as the upper interval.

    Parameters
    ----------
    lower_interval : np.ndarray, pd.Series or None, default = None
        Lower intervals. Must be passed together with upper_interval.

    upper_interval : np.ndarray, pd.Series or None, default = None
        Upper intervals. Must be passed together with lower_interval.

    intervals_with_predictions : np.ndarray or None, default = None
        2-dimensional array with 3 columns; lower intervals in the first
        column and upper intervals in the third column.

    Returns
    -------
    lower_interval_return : np.ndarray or pd.Series
        Lower intervals.

    upper_interval_return : np.ndarray or pd.Series
        Upper intervals.

    Raises
    ------
    ValueError
        If intervals_with_predictions is None and either lower_interval or
        upper_interval is None.

    ValueError
        If intervals_with_predictions is passed together with lower_interval
        or upper_interval.

    ValueError
        If intervals_with_predictions is not a 2-dimensional array with
        3 columns.

    ValueError
        If the lower and upper intervals have different numbers of rows.

    """
    if intervals_with_predictions is None:
        # separate form: both bounds are required
        if lower_interval is None or upper_interval is None:
            raise ValueError(
                "either lower_interval and upper_interval or "
                "intervals_with_predictions must be specified but both are None"
            )
        lower_interval_return = lower_interval
        upper_interval_return = upper_interval
    else:
        # combined form: mixing it with separately passed bounds is ambiguous
        if lower_interval is not None or upper_interval is not None:
            raise ValueError(
                "either lower_interval and upper_interval or "
                "intervals_with_predictions must be specified but both are "
                "specified"
            )
        check_type(
            intervals_with_predictions, [np.ndarray], "intervals_with_predictions"
        )
        # check ndim first so a 1-d array gives the intended ValueError rather
        # than an IndexError from shape[1]
        if (
            intervals_with_predictions.ndim != 2
            or intervals_with_predictions.shape[1] != 3
        ):
            raise ValueError("expecting intervals_with_predictions to have 3 columns")
        # split out the first and third columns; the middle column is not used
        lower_interval_return = intervals_with_predictions[:, 0]
        upper_interval_return = intervals_with_predictions[:, 2]

    check_type(lower_interval_return, [np.ndarray, pd.Series], "lower_interval_return")
    check_type(upper_interval_return, [np.ndarray, pd.Series], "upper_interval_return")

    if lower_interval_return.shape[0] != upper_interval_return.shape[0]:
        raise ValueError(
            "lower_interval_return and upper_interval_return have different shapes"
        )

    return lower_interval_return, upper_interval_return
def check_response_within_interval(
    response: Union[np.ndarray, pd.Series],
    lower_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    upper_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    intervals_with_predictions: Optional[np.ndarray] = None,
) -> pd.Series:
    """Report how often the response lies within its prediction interval.

    Either both lower_interval and upper_interval or intervals_with_predictions
    must be specified. A response value counts as inside the interval when it
    is greater than or equal to the lower bound and less than or equal to the
    upper bound.

    Parameters
    ----------
    response : np.ndarray, pd.Series
        Response or actual values corresponding to each row in the passed
        intervals.

    lower_interval : np.ndarray, pd.Series or None, default = None
        Lower intervals, if None then lower interval will be taken from the
        first column in intervals_with_predictions.

    upper_interval : np.ndarray, pd.Series or None, default = None
        Upper intervals, if None then upper interval will be taken from the
        third column in intervals_with_predictions.

    intervals_with_predictions : np.ndarray or None, default = None
        Lower intervals and upper intervals combined in a single np array.
        The array must have 3 columns. The lower interval is assumed to be
        the first column and the upper interval is assumed to be the third
        column.

    Returns
    -------
    results : pd.Series
        Value counts of the inside / outside indicator divided by the number
        of rows in response.

    Raises
    ------
    ValueError
        If response and the intervals have different numbers of rows.

    """
    lower, upper = gather_intervals(
        lower_interval=lower_interval,
        upper_interval=upper_interval,
        intervals_with_predictions=intervals_with_predictions,
    )

    check_type(response, [np.ndarray, pd.Series], "response")

    n_rows = response.shape[0]
    if n_rows != lower.shape[0]:
        raise ValueError("response and intervals have different numbers of rows")

    # bounds are inclusive at both ends
    at_or_above_lower = response >= lower
    at_or_below_upper = response <= upper
    inside = pd.Series(at_or_above_lower & at_or_below_upper)

    return inside.value_counts() / n_rows
def check_interval_width(
    lower_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    upper_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    intervals_with_predictions: Optional[np.ndarray] = None,
    quantiles: List[Union[int, float]] = [
        0,
        0.05,
        0.1,
        0.2,
        0.3,
        0.4,
        0.5,
        0.6,
        0.7,
        0.8,
        0.9,
        0.95,
        1,
    ],
) -> pd.Series:
    """Summarise the distribution of prediction interval widths.

    Either both lower_interval and upper_interval or intervals_with_predictions
    must be specified. The width of each interval is the upper bound minus the
    lower bound.

    Parameters
    ----------
    lower_interval : np.ndarray, pd.Series or None, default = None
        Lower intervals, if None then lower interval will be taken from the
        first column in intervals_with_predictions.

    upper_interval : np.ndarray, pd.Series or None, default = None
        Upper intervals, if None then upper interval will be taken from the
        third column in intervals_with_predictions.

    intervals_with_predictions : np.ndarray or None, default = None
        Lower intervals and upper intervals combined in a single np array.
        The array must have 3 columns. The lower interval is assumed to be
        the first column and the upper interval is assumed to be the third
        column.

    quantiles : list
        List of quantiles to report on the distribution of the interval widths.
        The default list is only read, never modified, by this function.

    Returns
    -------
    interval_width_distribution : pd.Series
        The requested quantiles of the interval widths, indexed by quantile,
        followed by entries labelled "mean", "std" and "iqr" (the 0.75
        quantile minus the 0.25 quantile).

    """
    lower, upper = gather_intervals(
        lower_interval=lower_interval,
        upper_interval=upper_interval,
        intervals_with_predictions=intervals_with_predictions,
    )

    widths = pd.Series(upper - lower)

    summary = widths.quantile(quantiles)

    upper_quartile = widths.quantile(0.75)
    lower_quartile = widths.quantile(0.25)

    # append the extra statistics after the quantiles, in this order
    extra_statistics = {
        "mean": widths.mean(),
        "std": widths.std(),
        "iqr": upper_quartile - lower_quartile,
    }
    for label, value in extra_statistics.items():
        summary[label] = value

    return summary
def prepare_prediction_interval_df(
    intervals_with_predictions: np.ndarray, response: Union[np.ndarray, pd.Series]
) -> pd.DataFrame:
    """Put response column and n x 3 array into a pd.DataFrame with columns;
    "lower", "prediction", "upper" and "response".

    Parameters
    ----------
    intervals_with_predictions : np.ndarray
        n by 3 array containing lower interval values, predictions and upper
        interval values. The columns will be added to output in columns;
        "lower", "prediction" and "upper".

    response : pd.Series or np.ndarray
        Response column to be added to output, in "response" column. Must have
        the same number of rows as intervals_with_predictions.

    Returns
    -------
    df : pd.DataFrame
        4 column pd.DataFrame containing values passed in
        intervals_with_predictions and response with columns; "lower",
        "prediction", "upper" and "response".

    Raises
    ------
    ValueError
        If intervals_with_predictions is not a 2-dimensional array with
        3 columns.

    ValueError
        If intervals_with_predictions and response have different numbers of
        rows.

    """
    check_type(intervals_with_predictions, [np.ndarray], "intervals_with_predictions")
    check_type(response, [np.ndarray, pd.Series], "response")

    # check ndim first so a 1-d array gives the intended ValueError rather
    # than an IndexError from shape[1]
    if (
        intervals_with_predictions.ndim != 2
        or intervals_with_predictions.shape[1] != 3
    ):
        raise ValueError("intervals_with_predictions must have 3 columns")

    if intervals_with_predictions.shape[0] != response.shape[0]:
        raise ValueError(
            "intervals_with_predictions and response have different numbers of rows"
        )

    df = pd.DataFrame(
        intervals_with_predictions, columns=["lower", "prediction", "upper"]
    )

    # Assign Series positionally via .values; assigning the Series itself
    # would align on its index against df's default RangeIndex. isinstance
    # (rather than an exact type match) also covers pd.Series subclasses.
    if isinstance(response, pd.Series):
        df["response"] = response.values
    else:
        df["response"] = response

    return df
def create_interval_buckets(
    intervals_with_predictions: pd.DataFrame, cut_function: str = "qcut", **kwargs
) -> pd.DataFrame:
    """Add a column that buckets every row by the width of its interval.

    The width is the "upper" column minus the "lower" column. The new column
    is written onto the passed DataFrame itself, which is then returned.

    Parameters
    ----------
    intervals_with_predictions : pd.DataFrame
        Data to add the column of interval width buckets to. Must have
        columns called "upper" and "lower" that give the limits of the
        intervals for each row.

    cut_function : str
        Type of bucketing to use, must be either "cut" or "qcut". Decides
        whether pd.cut or pd.qcut is used.

    **kwargs : any
        Arbitrary keyword arguments to pass on to the chosen pandas function.

    Returns
    -------
    intervals_with_predictions : pd.DataFrame
        Input data with new column called "interval_width_bucket" that
        splits the data on the width of the intervals in the data (defined
        by the "lower" and "upper" columns).

    Raises
    ------
    ValueError
        If cut_function is not "qcut" or "cut".

    """
    check_type(intervals_with_predictions, [pd.DataFrame], "intervals_with_predictions")
    check_type(cut_function, [str], "cut_function")

    if cut_function not in ("qcut", "cut"):
        raise ValueError("cut_function must be either qcut or cut")

    widths = intervals_with_predictions["upper"] - intervals_with_predictions["lower"]

    # pick the pandas bucketing function once, then make a single call
    bucketing_function = pd.qcut if cut_function == "qcut" else pd.cut
    intervals_with_predictions["interval_width_bucket"] = bucketing_function(
        x=widths, **kwargs
    )

    return intervals_with_predictions