# Source code for pitci.lightgbm

"""Conformal predictor classes for LightGBM models."""

import numpy as np
import pandas as pd

try:

    import lightgbm as lgb

except ModuleNotFoundError as err:

    raise ImportError(
        "lightgbm must be installed to use functionality in pitci.lightgbm"
    ) from err

from typing import List, Union, Any

from .base import (
    AbsoluteErrorConformalPredictor,
    LeafNodeScaledConformalPredictor,
    SplitConformalPredictorMixin,
)
from .checks import check_type, check_allowed_value
from .dispatchers import (
    get_absolute_error_conformal_predictor,
    get_leaf_node_scaled_conformal_predictor,
    get_split_leaf_node_scaled_conformal_predictor,
)
from . import docstrings


def check_objective_supported(
    booster: lgb.Booster, supported_objectives: List[str]
) -> None:
    """Validate that a booster's objective is one of the supported objectives.

    The objective is read from the ``"objective"`` entry of
    ``booster.dump_model()`` and handed to ``check_allowed_value`` together
    with ``supported_objectives``. The arguments themselves are validated
    with ``check_type`` first; any exception raised comes from those helpers.

    Parameters
    ----------
    booster : lgb.Booster
        Model whose objective is to be checked.

    supported_objectives : list
        List of str giving the lightgbm objectives that are accepted.

    """

    check_type(booster, [lgb.basic.Booster], "booster")
    check_type(supported_objectives, [list], "supported_objectives")

    # every entry of the list must itself be a str
    for position, candidate in enumerate(supported_objectives):
        check_type(candidate, [str], f"supported_objectives[{position}]")

    model_dump = booster.dump_model()

    check_allowed_value(
        model_dump["objective"],
        supported_objectives,
        "booster objective not supported",
    )


# Objectives accepted by the conformal predictor classes in this module. The
# classes store this list as their ``SUPPORTED_OBJECTIVES`` attribute and pass
# it to ``check_objective_supported`` when they are initialised.
# The entries commented out at the end of the list are not accepted -
# presumably these objectives are not supported yet; TODO confirm.
SUPPORTED_OBJECTIVES_ABSOLUTE_ERROR = [
    "regression",
    "regression_l1",
    "huber",
    "fair",
    "poisson",
    "quantile",
    "mape",
    "gamma",
    "tweedie",
    "binary",
    # "multiclass",
    # "multiclassova",
    # "cross_entropy",
    # "cross_entropy_lambda",
    # "lambdarank",
    # "rank_xendcg"
]

# Text passed as the ``description`` value when the class docstrings in this
# module are built from the parent class docstrings with ``str.format``.
SUPPORTED_OBJECTIVES_DESCRIPTION = (
    "The currently supported lightgbm objective functions, given the nonconformity\n"
    "    measure that is based on absolute error, are defined in the\n"
    "    ``SUPPORTED_OBJECTIVES`` attribute."
)

# Text passed as the ``attributes`` value when the class docstrings in this
# module are built from the parent class docstrings with ``str.format``.
SUPPORTED_OBJECTIVES_ATTRIBUTE = (
    "SUPPORTED_OBJECTIVES : list\n"
    "\tBooster supported objectives. If a lgb.Booster with a non-supported objective\n"
    "\tis passed when initialising the class object an error will be raised."
)


class LGBMBoosterAbsoluteErrorConformalPredictor(AbsoluteErrorConformalPredictor):

    # The class docstring is produced by filling in the parent class docstring
    # with the lightgbm specific text.
    __doc__ = AbsoluteErrorConformalPredictor.__doc__.format(
        model_type="``lgb.Booster``",
        description=SUPPORTED_OBJECTIVES_DESCRIPTION,
        parameters="",
        calibrate_link=":func:`~pitci.lightgbm.LGBMBoosterAbsoluteErrorConformalPredictor.calibrate`",
        attributes=SUPPORTED_OBJECTIVES_ATTRIBUTE,
    )

    def __init__(self, model: lgb.Booster) -> None:

        # only lgb.Booster objects are accepted by this class
        check_type(model, [lgb.basic.Booster], "model")

        super().__init__(model=model)

        self.SUPPORTED_OBJECTIVES = SUPPORTED_OBJECTIVES_ABSOLUTE_ERROR

        # reject boosters whose objective is not in SUPPORTED_OBJECTIVES
        check_objective_supported(model, self.SUPPORTED_OBJECTIVES)

    # NOTE: no docstring is written on the decorated methods below; the
    # doc_inherit_kwargs decorator is given the parent method and the values to
    # format into it (presumably it builds the docstring from those).
    @docstrings.doc_inherit_kwargs(
        AbsoluteErrorConformalPredictor.calibrate,
        style=docstrings.str_format_merge_style,
        description="",
        data_type="np.ndarray or pd.DataFrame",
        response_type="np.ndarray or pd.Series",
    )
    def calibrate(
        self,
        data: Union[np.ndarray, pd.DataFrame],
        response: Union[np.ndarray, pd.Series],
        alpha: Union[int, float] = 0.95,
    ) -> None:

        # only data is type checked here; response and alpha are passed
        # straight through to the parent class
        check_type(data, [np.ndarray, pd.DataFrame], "data")

        super().calibrate(data=data, alpha=alpha, response=response)

    @docstrings.doc_inherit_kwargs(
        AbsoluteErrorConformalPredictor.predict_with_interval,
        style=docstrings.str_format_merge_style,
        description="",
        data_type="np.ndarray or pd.DataFrame",
    )
    def predict_with_interval(
        self, data: Union[np.ndarray, pd.DataFrame]
    ) -> np.ndarray:

        check_type(data, [np.ndarray, pd.DataFrame], "data")

        return super().predict_with_interval(data)

    def _generate_predictions(
        self, data: Union[pd.DataFrame, np.ndarray]
    ) -> np.ndarray:
        """Method to generate predictions from the lightgbm model.

        The number of trees to predict with is not specified, defaulting to lightgbm's
        default behaviour for the `num_iteration` argument;
        https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html#lightgbm.Booster.predict.

        Parameters
        ----------
        data : np.ndarray or pd.DataFrame
            Data to generate predictions on.

        Returns
        -------
        predictions : np.ndarray
            Output of calling ``predict`` on the underlying model with ``data``.

        """

        return self.model.predict(data)


class LGBMBoosterLeafNodeScaledConformalPredictor(LeafNodeScaledConformalPredictor):

    # The class docstring is produced by filling in the parent class docstring
    # with the lightgbm specific text.
    __doc__ = LeafNodeScaledConformalPredictor.__doc__.format(
        model_type="``lgb.Booster``",
        description=SUPPORTED_OBJECTIVES_DESCRIPTION,
        parameters="",
        attributes=SUPPORTED_OBJECTIVES_ATTRIBUTE,
        calibrate_method="pitci.lightgbm.LGBMBoosterLeafNodeScaledConformalPredictor.calibrate",
    )

    def __init__(self, model: lgb.Booster) -> None:

        # only lgb.Booster objects are accepted by this class
        check_type(model, [lgb.basic.Booster], "model")

        super().__init__(model=model)

        self.SUPPORTED_OBJECTIVES = SUPPORTED_OBJECTIVES_ABSOLUTE_ERROR

        # reject boosters whose objective is not in SUPPORTED_OBJECTIVES
        check_objective_supported(model, self.SUPPORTED_OBJECTIVES)

    # NOTE: no docstring is written on the decorated methods below; the
    # doc_inherit_kwargs decorator is given the parent method and the values to
    # format into it (presumably it builds the docstring from those).
    @docstrings.doc_inherit_kwargs(
        LeafNodeScaledConformalPredictor.calibrate,
        style=docstrings.str_format_merge_style,
        description="",
        predict_with_interval_method="pitci.lightgbm.LGBMBoosterLeafNodeScaledConformalPredictor.predict_with_interval",
        data_type="np.ndarray or pd.DataFrame",
        response_type="np.ndarray or pd.Series",
        train_data_type="np.ndarray, pd.DataFrame or None, default = None",
    )
    def calibrate(
        self,
        data: Union[np.ndarray, pd.DataFrame],
        response: Union[np.ndarray, pd.Series],
        alpha: Union[int, float] = 0.95,
        train_data: Union[np.ndarray, pd.DataFrame] = None,
    ) -> None:

        # data and train_data are type checked here (train_data may be None);
        # response and alpha are passed straight through to the parent class
        check_type(data, [np.ndarray, pd.DataFrame], "data")
        check_type(train_data, [np.ndarray, pd.DataFrame, type(None)], "train_data")

        super().calibrate(
            data=data, response=response, alpha=alpha, train_data=train_data
        )

    @docstrings.doc_inherit_kwargs(
        LeafNodeScaledConformalPredictor.predict_with_interval,
        style=docstrings.str_format_merge_style,
        description="",
        data_type="np.ndarray or pd.DataFrame",
    )
    def predict_with_interval(
        self, data: Union[np.ndarray, pd.DataFrame]
    ) -> np.ndarray:

        check_type(data, [np.ndarray, pd.DataFrame], "data")

        return super().predict_with_interval(data=data)

    def _calibrate_leaf_node_counts(self, data: Any) -> None:
        """Method to get the number of times each leaf node was visited on the training
        dataset.

        LightGBM exposes this information through the `trees_to_dataframe` method. This
        returns a dataframe with tree stats and the `count` column gives the number of records
        in the training data that fall into this node. See
        https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html#lightgbm.Booster.trees_to_dataframe
        for more info.

        The result is stored in the ``leaf_node_counts`` attribute; a list holding
        one dict per tree, in ascending ``tree_index`` order, that maps the leaf
        index to the ``count`` value for that leaf.

        Parameters
        ----------
        data : Any
            Not used by this method, the counts are read from the model itself.

        """

        trees_df = self.model.trees_to_dataframe()

        # rows with no split feature are taken to be the leaf nodes
        leaf_nodes = trees_df.loc[trees_df["split_feature"].isnull()].copy()

        # strip out the number part of the node index; take the piece after the
        # first "-" and drop the "L" prefix
        leaf_nodes["leaf_index"] = (
            leaf_nodes["node_index"]
            .apply(lambda x: x.split("-")[1].replace("L", ""))
            .astype(int)
        )

        # a single sorted groupby visits each tree once, in ascending tree_index
        # order, rather than filtering the whole dataframe once per tree
        self.leaf_node_counts = [
            dict(
                zip(
                    tree_leaf_nodes["leaf_index"].values,
                    tree_leaf_nodes["count"].values,
                )
            )
            for _, tree_leaf_nodes in leaf_nodes.groupby("tree_index", sort=True)
        ]

    def _generate_predictions(
        self, data: Union[pd.DataFrame, np.ndarray]
    ) -> np.ndarray:
        """Method to generate predictions from the lightgbm model.

        The number of trees to predict with is not specified, defaulting to lightgbm's
        default behaviour for the `num_iteration` argument;
        https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html#lightgbm.Booster.predict.

        Parameters
        ----------
        data : np.ndarray or pd.DataFrame
            Data to generate predictions on.

        Returns
        -------
        predictions : np.ndarray
            Output of calling ``predict`` on the underlying model with ``data``.

        """

        return self.model.predict(data)

    def _generate_leaf_node_predictions(
        self, data: Union[pd.DataFrame, np.ndarray]
    ) -> np.ndarray:
        """Method to generate leaf node predictions from the lightgbm model.

        Method calls lgb.Booster.predict with pred_leaf = True. Like the
        _generate_predictions method the number of trees to predict with
        is not specified, defaulting to lightgbm's behaviour.

        If the output of predict is not a 2d matrix the output is shaped to
        be 2d.

        Parameters
        ----------
        data : np.ndarray or pd.DataFrame
            Data to generate predictions on.

        Returns
        -------
        leaf_node_predictions : np.ndarray
            2d array of leaf node predictions, one row per sample.

        """

        # matrix of (nsample, ntrees) with each record giving
        # the leaf node of each sample in each tree
        leaf_node_predictions = self.model.predict(data, pred_leaf=True)

        # a 1d output is treated as one leaf index per sample and reshaped
        # into a single column so the result is always a 2d array
        if leaf_node_predictions.ndim == 1:

            leaf_node_predictions = leaf_node_predictions.reshape(
                (leaf_node_predictions.shape[0], 1)
            )

        return leaf_node_predictions


class LGBMBoosterSplitLeafNodeScaledConformalPredictor(
    SplitConformalPredictorMixin, LGBMBoosterLeafNodeScaledConformalPredictor
):

    # The class docstring is built by docstrings.combine_split_mixin_docs from
    # the two parent classes.
    __doc__ = docstrings.combine_split_mixin_docs(
        SplitConformalPredictorMixin, LGBMBoosterLeafNodeScaledConformalPredictor
    )

    # NOTE: no docstring is written on the decorated methods below; the
    # doc_inherit_kwargs decorator is given the parent method and the values to
    # format into it (presumably it builds the docstring from those).
    @docstrings.doc_inherit_kwargs(
        LeafNodeScaledConformalPredictor.calibrate,
        style=docstrings.str_format_merge_style,
        description="The ``baseline_interval`` values are each calibrated to the required ``alpha``\n"
        "\tlevel on the subsets of the data where the scaling factor values\n"
        "\tfall into the range for that particular bucket.",
        predict_with_interval_method="pitci.lightgbm.LGBMBoosterLeafNodeScaledConformalPredictor.predict_with_interval",
        data_type="np.ndarray or pd.DataFrame",
        response_type="np.ndarray or pd.Series",
        train_data_type="np.ndarray, pd.DataFrame or None, default = None",
    )
    def calibrate(
        self,
        data: Union[np.ndarray, pd.DataFrame],
        response: Union[np.ndarray, pd.Series],
        alpha: Union[int, float] = 0.95,
        train_data: Union[np.ndarray, pd.DataFrame] = None,
    ) -> None:

        # no checks are done here; everything is delegated to the next
        # calibrate implementation in the method resolution order
        super().calibrate(
            data=data, response=response, alpha=alpha, train_data=train_data
        )

    # the documentation values refer to the lightgbm class in this module and
    # use the same data type wording as the other methods in this module
    @docstrings.doc_inherit_kwargs(
        LGBMBoosterLeafNodeScaledConformalPredictor.predict_with_interval,
        style=docstrings.str_format_merge_style,
        predict_with_interval_method="pitci.lightgbm.LGBMBoosterLeafNodeScaledConformalPredictor.predict_with_interval",
        data_type="np.ndarray or pd.DataFrame",
    )
    def predict_with_interval(
        self, data: Union[np.ndarray, pd.DataFrame]
    ) -> np.ndarray:

        # delegated to the next predict_with_interval implementation in the
        # method resolution order
        return super().predict_with_interval(data=data)


@get_absolute_error_conformal_predictor.register(lgb.basic.Booster)
def return_lgb_booster_absolute_error_confromal_predictor(
    model: lgb.Booster,
) -> LGBMBoosterAbsoluteErrorConformalPredictor:
    """Function to return an instance of LGBMBoosterAbsoluteErrorConformalPredictor
    class for the passed lgb.Booster object.

    Registered on ``get_absolute_error_conformal_predictor`` for the
    ``lgb.basic.Booster`` type.

    Parameters
    ----------
    model : lgb.Booster
        Model to create the conformal predictor for.

    """

    return LGBMBoosterAbsoluteErrorConformalPredictor(model=model)


# The function was previously named with an ``xgb`` prefix, unlike the other
# lightgbm functions in this module; the old name is kept as an alias so that
# existing references to it continue to work.
return_xgb_booster_absolute_error_confromal_predictor = (
    return_lgb_booster_absolute_error_confromal_predictor
)


@get_leaf_node_scaled_conformal_predictor.register(lgb.basic.Booster)
def return_lgb_booster_leaf_node_scaled_confromal_predictor(
    model: lgb.Booster,
) -> LGBMBoosterLeafNodeScaledConformalPredictor:
    """Function to return an instance of LGBMBoosterLeafNodeScaledConformalPredictor
    class for the passed lgb.Booster object.

    Registered on ``get_leaf_node_scaled_conformal_predictor`` for the
    ``lgb.basic.Booster`` type.

    Parameters
    ----------
    model : lgb.Booster
        Model to create the conformal predictor for.

    """

    return LGBMBoosterLeafNodeScaledConformalPredictor(model=model)


@get_split_leaf_node_scaled_conformal_predictor.register(lgb.basic.Booster)
def return_lgb_booster_leaf_node_split_confromal_predictor(
    model: lgb.Booster, n_bins: int = 3
) -> LGBMBoosterSplitLeafNodeScaledConformalPredictor:
    """Function to return an instance of LGBMBoosterSplitLeafNodeScaledConformalPredictor
    class for the passed lgb.Booster object.

    Registered on ``get_split_leaf_node_scaled_conformal_predictor`` for the
    ``lgb.basic.Booster`` type.

    Parameters
    ----------
    model : lgb.Booster
        Model to create the conformal predictor for.

    n_bins : int, default = 3
        Passed to the conformal predictor class when it is initialised.

    """

    constructor_kwargs = {"model": model, "n_bins": n_bins}

    return LGBMBoosterSplitLeafNodeScaledConformalPredictor(**constructor_kwargs)