# Source code for pitci.helpers

"""Module containing functions used for evaluating interval regions."""

import pandas as pd
import numpy as np

from typing import Union, List, Tuple, Optional

from .checks import check_type


def gather_intervals(
    lower_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    upper_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    intervals_with_predictions: Optional[np.ndarray] = None,
) -> Tuple[Union[np.ndarray, pd.Series], Union[np.ndarray, pd.Series]]:
    """Validate the passed intervals and return the lower and upper limits.

    Either both lower_interval and upper_interval, or intervals_with_predictions
    alone, must be specified. If intervals_with_predictions is passed the lower
    and upper limits are taken from its first and third columns respectively.

    Parameters
    ----------
    lower_interval : np.ndarray, pd.Series or None, default = None
        Lower limits of the intervals.

    upper_interval : np.ndarray, pd.Series or None, default = None
        Upper limits of the intervals.

    intervals_with_predictions : np.ndarray or None, default = None
        2 dimensional array with 3 columns. The first column is used as the
        lower limits and the third column as the upper limits.

    Returns
    -------
    lower_interval_return : np.ndarray or pd.Series
        Lower limits of the intervals.

    upper_interval_return : np.ndarray or pd.Series
        Upper limits of the intervals.

    Raises
    ------
    ValueError
        If neither a complete lower / upper pair nor intervals_with_predictions
        is passed, if intervals_with_predictions is passed together with
        lower_interval or upper_interval, if intervals_with_predictions is not
        2 dimensional with 3 columns, or if the lower and upper limits have
        different numbers of rows.

    """

    combined_passed = intervals_with_predictions is not None
    any_separate_missing = lower_interval is None or upper_interval is None
    any_separate_passed = lower_interval is not None or upper_interval is not None

    if not combined_passed and any_separate_missing:
        raise ValueError(
            "either lower_interval and upper_interval or intervals_with_predictions"
            " must be specified but both are None"
        )

    if combined_passed and any_separate_passed:
        raise ValueError(
            "either lower_interval and upper_interval or intervals_with_predictions"
            " must be specified but both are specified"
        )

    if combined_passed:

        check_type(
            intervals_with_predictions, [np.ndarray], "intervals_with_predictions"
        )

        # check the number of dimensions first so that a 1d array gives the
        # intended ValueError rather than an IndexError from shape[1]
        if (
            intervals_with_predictions.ndim != 2
            or intervals_with_predictions.shape[1] != 3
        ):
            raise ValueError("expecting intervals_with_predictions to have 3 columns")

        # the middle column holds the predictions and is not needed here
        lower_interval_return = intervals_with_predictions[:, 0]
        upper_interval_return = intervals_with_predictions[:, 2]

    else:

        lower_interval_return = lower_interval
        upper_interval_return = upper_interval

    check_type(lower_interval_return, [np.ndarray, pd.Series], "lower_interval_return")
    check_type(upper_interval_return, [np.ndarray, pd.Series], "upper_interval_return")

    if lower_interval_return.shape[0] != upper_interval_return.shape[0]:
        raise ValueError(
            "lower_interval_return and upper_interval_return have different shapes"
        )

    return lower_interval_return, upper_interval_return


def check_response_within_interval(
    response: Union[np.ndarray, pd.Series],
    lower_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    upper_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    intervals_with_predictions: Optional[np.ndarray] = None,
) -> pd.Series:
    """Function to calculate the proportion of the response that lies within
    the prediction intervals.

    Either both lower_interval and upper_interval or intervals_with_predictions
    must be specified.

    A response value counts as within its interval if it is greater than or
    equal to the lower limit and less than or equal to the upper limit.

    Parameters
    ----------
    response : np.ndarray, pd.Series
        Response or actual values corresponding to each row in the passed
        intervals.

    lower_interval : np.ndarray, pd.Series or None, default = None
        Lower intervals, if None then lower interval will be taken from the
        first column in intervals_with_predictions.

    upper_interval : np.ndarray, pd.Series or None, default = None
        Upper intervals, if None then upper interval will be taken from the
        third column in intervals_with_predictions.

    intervals_with_predictions : np.ndarray or None, default = None
        Lower intervals and upper intervals combined in a single np array.
        The array must have 3 columns. The lower interval is assumed to be
        the first column and the upper interval is assumed to be the third
        column.

    Returns
    -------
    results : pd.Series
        Value counts of the within interval indicator divided by the number
        of rows in response.

    Raises
    ------
    ValueError
        If response and the intervals have different numbers of rows.

    """

    lower_limits, upper_limits = gather_intervals(
        lower_interval=lower_interval,
        upper_interval=upper_interval,
        intervals_with_predictions=intervals_with_predictions,
    )

    check_type(response, [np.ndarray, pd.Series], "response")

    n_rows = response.shape[0]

    if n_rows != lower_limits.shape[0]:
        raise ValueError("response and intervals have different numbers of rows")

    at_or_above_lower = response >= lower_limits
    at_or_below_upper = response <= upper_limits
    inside_interval = at_or_above_lower & at_or_below_upper

    return pd.Series(inside_interval).value_counts() / n_rows


def check_interval_width(
    lower_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    upper_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    intervals_with_predictions: Optional[np.ndarray] = None,
    quantiles: Optional[List[Union[int, float]]] = None,
) -> pd.Series:
    """Function to check the distribution of prediction interval widths.

    Either both lower_interval and upper_interval or intervals_with_predictions
    must be specified.

    The interval width is the upper interval minus the lower interval. The
    specified quantiles of the width distribution, the mean, std and iqr are
    returned in a pd.Series.

    Parameters
    ----------
    lower_interval : np.ndarray, pd.Series or None, default = None
        Lower intervals, if None then lower interval will be taken from the
        first column in intervals_with_predictions.

    upper_interval : np.ndarray, pd.Series or None, default = None
        Upper intervals, if None then upper interval will be taken from the
        third column in intervals_with_predictions.

    intervals_with_predictions : np.ndarray or None, default = None
        Lower intervals and upper intervals combined in a single np array.
        The array must have 3 columns. The lower interval is assumed to be
        the first column and the upper interval is assumed to be the third
        column.

    quantiles : list or None, default = None
        List of quantiles to report on the distribution of the interval widths.
        If None then the quantiles 0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
        0.8, 0.9, 0.95 and 1 are used.

    Returns
    -------
    interval_width_distribution : pd.Series
        Requested quantiles of the interval widths with additional entries
        labelled "mean", "std" and "iqr".

    """

    # build the default inside the function so that no list object is shared
    # between calls through the signature
    if quantiles is None:
        quantiles = [0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1]

    lower_interval, upper_interval = gather_intervals(
        lower_interval=lower_interval,
        upper_interval=upper_interval,
        intervals_with_predictions=intervals_with_predictions,
    )

    interval_width = pd.Series(upper_interval - lower_interval)

    interval_width_distribution = interval_width.quantile(quantiles)

    interval_width_distribution["mean"] = interval_width.mean()
    interval_width_distribution["std"] = interval_width.std()
    interval_width_distribution["iqr"] = interval_width.quantile(
        0.75
    ) - interval_width.quantile(0.25)

    return interval_width_distribution


def prepare_prediction_interval_df(
    intervals_with_predictions: np.ndarray, response: Union[np.ndarray, pd.Series]
) -> pd.DataFrame:
    """Put response column and n x 3 array into a pd.DataFrame with columns;
    "lower", "prediction", "upper" and "response".

    Parameters
    ----------
    intervals_with_predictions : np.ndarray
        n by 3 array containing lower interval values, predictions and upper
        interval values. The columns will be added to output in columns;
        "lower", "prediction" and "upper".

    response : pd.Series or np.ndarray
        Response column to be added to output, in "response" column. Must have
        the same number of rows as intervals_with_predictions.

    Returns
    -------
    df : pd.DataFrame
        4 column pd.DataFrame containing values passed in intervals_with_predictions
        and response with columns; "lower", "prediction", "upper" and "response".

    Raises
    ------
    ValueError
        If intervals_with_predictions is not 2 dimensional with 3 columns or if
        intervals_with_predictions and response have different numbers of rows.

    """

    check_type(intervals_with_predictions, [np.ndarray], "intervals_with_predictions")

    check_type(response, [np.ndarray, pd.Series], "response")

    # check the number of dimensions first so that a 1d array gives the
    # intended ValueError rather than an IndexError from shape[1]
    if (
        intervals_with_predictions.ndim != 2
        or intervals_with_predictions.shape[1] != 3
    ):
        raise ValueError("intervals_with_predictions must have 3 columns")

    if intervals_with_predictions.shape[0] != response.shape[0]:
        raise ValueError(
            "intervals_with_predictions and response have different numbers of rows"
        )

    df = pd.DataFrame(
        intervals_with_predictions, columns=["lower", "prediction", "upper"]
    )

    # assign the underlying values of a Series (including Series subclasses) so
    # the response is added by position, not aligned on the Series index
    if isinstance(response, pd.Series):
        df["response"] = response.values
    else:
        df["response"] = response

    return df


def create_interval_buckets(
    intervals_with_predictions: pd.DataFrame, cut_function: str = "qcut", **kwargs
) -> pd.DataFrame:
    """Function to add a column to a DataFrame that buckets each row on the
    width of its interval.

    The interval width is the "upper" column minus the "lower" column. The new
    column is assigned on the passed DataFrame itself, which is then returned.

    Parameters
    ----------
    intervals_with_predictions : pd.DataFrame
        Data to add the interval width bucket column to. Must have columns
        called "upper" and "lower" that give the limits of the interval for
        each row.

    cut_function : str, default = "qcut"
        Type of bucketing to use, must be either "cut" or "qcut". Decides
        whether pd.cut or pd.qcut is used.

    **kwargs : any
        Arbitrary keyword arguments to pass on to the chosen pandas function.

    Returns
    -------
    intervals_with_predictions : pd.DataFrame
        Input data with new column called "interval_width_bucket" that
        splits the data on the width of the intervals in the data (defined
        by the "lower" and "upper" columns).

    Raises
    ------
    ValueError
        If cut_function is not "qcut" or "cut".

    """

    check_type(intervals_with_predictions, [pd.DataFrame], "intervals_with_predictions")

    check_type(cut_function, [str], "cut_function")

    if cut_function not in ("qcut", "cut"):
        raise ValueError("cut_function must be either qcut or cut")

    widths = intervals_with_predictions["upper"] - intervals_with_predictions["lower"]

    # select the pandas bucketing function once rather than branching on the
    # assignment itself
    bucketing_function = pd.qcut if cut_function == "qcut" else pd.cut

    intervals_with_predictions["interval_width_bucket"] = bucketing_function(
        x=widths, **kwargs
    )

    return intervals_with_predictions