Source code for pitci.lightgbm

"""Conformal predictor classes for LightGBM models."""

import numpy as np
import pandas as pd

try:

    import lightgbm as lgb

except ModuleNotFoundError as err:

    raise ImportError(
        "lightgbm must be installed to use functionality in pitci.lightgbm"
    ) from err

from typing import List, Union, Any

from .base import (
    AbsoluteErrorConformalPredictor,
    LeafNodeScaledConformalPredictor,
    SplitConformalPredictorMixin,
)
from .checks import check_type, check_allowed_value
from .dispatchers import (
    get_absolute_error_conformal_predictor,
    get_leaf_node_scaled_conformal_predictor,
    get_split_leaf_node_scaled_conformal_predictor,
)
from . import docstrings


def check_objective_supported(
    booster: lgb.Booster, supported_objectives: List[str]
) -> None:
    """Validate that the objective of a booster is one of the supported ones.

    The objective is read from the ``"objective"`` entry of the dictionary
    returned by ``booster.dump_model()`` and is handed, together with
    ``supported_objectives``, to ``check_allowed_value``. Reporting of any
    failure is delegated to ``check_type`` and ``check_allowed_value``.

    Parameters
    ----------
    booster : lgb.Booster
        Model whose objective is to be checked.

    supported_objectives : list
        List of str, the lightgbm objectives that are accepted.

    """

    # validate the inputs themselves before the model is inspected
    check_type(booster, [lgb.basic.Booster], "booster")
    check_type(supported_objectives, [list], "supported_objectives")

    # every element of the list must be a str; the position of the element is
    # included in the name passed on so a failure can be traced to its entry
    for position, allowed_objective in enumerate(supported_objectives):

        check_type(allowed_objective, [str], f"supported_objectives[{position}]")

    model_dump = booster.dump_model()

    check_allowed_value(
        model_dump["objective"],
        supported_objectives,
        "booster objective not supported",
    )


# lightgbm objectives accepted by the conformal predictors in this module that
# use an absolute error based nonconformity measure. This has to stay a list
# as ``check_objective_supported`` type checks its ``supported_objectives``
# argument against ``list``.
#
# The following lightgbm objectives are not enabled: "multiclass",
# "multiclassova", "cross_entropy", "cross_entropy_lambda", "lambdarank" and
# "rank_xendcg".
SUPPORTED_OBJECTIVES_ABSOLUTE_ERROR = [
    "regression", "regression_l1", "huber", "fair",
    "poisson", "quantile", "mape",
    "gamma", "tweedie",
    "binary",
]

# Text passed as the ``description`` value when the class docstrings in this
# module are built from the parent class docstrings. Continuation lines are
# indented with four spaces.
SUPPORTED_OBJECTIVES_DESCRIPTION = "\n    ".join(
    (
        "The currently supported lightgbm objective functions, given the nonconformity",
        "measure that is based on absolute error, are defined in the",
        "``SUPPORTED_OBJECTIVES`` attribute.",
    )
)

# Text passed as the ``attributes`` value when the class docstrings in this
# module are built from the parent class docstrings. Continuation lines are
# indented with a tab character.
SUPPORTED_OBJECTIVES_ATTRIBUTE = "\n\t".join(
    (
        "SUPPORTED_OBJECTIVES : list",
        "Booster supported objectives. If a lgb.Booster with a non-supported objective",
        "is passed when initialising the class object an error will be raised.",
    )
)


class LGBMBoosterAbsoluteErrorConformalPredictor(AbsoluteErrorConformalPredictor):

    # The class docstring is the parent class docstring with its placeholders
    # filled in with lightgbm specific text.
    __doc__ = AbsoluteErrorConformalPredictor.__doc__.format(
        attributes=SUPPORTED_OBJECTIVES_ATTRIBUTE.format(model_type="``lgb.Booster``"),
        calibrate_link=":func:`~pitci.lightgbm.LGBMBoosterAbsoluteErrorConformalPredictor.calibrate`",
        parameters="",
        description=SUPPORTED_OBJECTIVES_DESCRIPTION,
        model_type="``lgb.Booster``",
    )

    def __init__(self, model: lgb.Booster) -> None:

        # only lgb.Booster objects are accepted, this is checked before the
        # model is handed to the parent class
        check_type(model, [lgb.basic.Booster], "model")

        super().__init__(model=model)

        # the supported objectives are recorded on the instance and the
        # objective of the passed model is then validated against them
        self.SUPPORTED_OBJECTIVES = SUPPORTED_OBJECTIVES_ABSOLUTE_ERROR

        check_objective_supported(
            booster=model, supported_objectives=self.SUPPORTED_OBJECTIVES
        )

    # No docstring is written on the decorated methods below; their
    # documentation is produced by ``docstrings.doc_inherit_kwargs`` from the
    # parent class method.
    @docstrings.doc_inherit_kwargs(
        AbsoluteErrorConformalPredictor.calibrate,
        style=docstrings.str_format_merge_style,
        description="",
        data_type="np.ndarray or pd.DataFrame",
        response_type="np.ndarray or pd.Series",
    )
    def calibrate(
        self,
        data: Union[np.ndarray, pd.DataFrame],
        response: Union[np.ndarray, pd.Series],
        alpha: Union[int, float] = 0.95,
    ) -> None:

        # lightgbm specific check on the type of data, everything else is
        # handled by the parent class
        check_type(data, [np.ndarray, pd.DataFrame], "data")

        super().calibrate(data=data, response=response, alpha=alpha)

    @docstrings.doc_inherit_kwargs(
        AbsoluteErrorConformalPredictor.predict_with_interval,
        style=docstrings.str_format_merge_style,
        description="",
        data_type="np.ndarray or pd.DataFrame",
    )
    def predict_with_interval(
        self, data: Union[np.ndarray, pd.DataFrame]
    ) -> np.ndarray:

        check_type(data, [np.ndarray, pd.DataFrame], "data")

        return super().predict_with_interval(data)

    def _generate_predictions(
        self, data: Union[pd.DataFrame, np.ndarray]
    ) -> np.ndarray:
        """Method to generate predictions from the underlying lightgbm model.

        ``predict`` is called on the model with ``data`` as the only argument.
        The number of trees to predict with is therefore not specified and
        lightgbm's default behaviour for the `num_iteration` argument applies;
        https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html#lightgbm.Booster.predict.

        Parameters
        ----------
        data : pd.DataFrame or np.ndarray
            Data to generate predictions on.

        Returns
        -------
        np.ndarray
            Output of the model's ``predict`` method for ``data``.

        """

        return self.model.predict(data)
class LGBMBoosterLeafNodeScaledConformalPredictor(LeafNodeScaledConformalPredictor):

    # The class docstring is the parent class docstring with its placeholders
    # filled in with lightgbm specific text.
    __doc__ = LeafNodeScaledConformalPredictor.__doc__.format(
        calibrate_method="pitci.lightgbm.LGBMBoosterLeafNodeScaledConformalPredictor.calibrate",
        attributes=SUPPORTED_OBJECTIVES_ATTRIBUTE,
        parameters="",
        description=SUPPORTED_OBJECTIVES_DESCRIPTION,
        model_type="``lgb.Booster``",
    )

    def __init__(self, model: lgb.Booster) -> None:

        # only lgb.Booster objects are accepted, this is checked before the
        # model is handed to the parent class
        check_type(model, [lgb.basic.Booster], "model")

        super().__init__(model=model)

        # the supported objectives are recorded on the instance and the
        # objective of the passed model is then validated against them
        self.SUPPORTED_OBJECTIVES = SUPPORTED_OBJECTIVES_ABSOLUTE_ERROR

        check_objective_supported(
            booster=model, supported_objectives=self.SUPPORTED_OBJECTIVES
        )

    # No docstring is written on the decorated methods below; their
    # documentation is produced by ``docstrings.doc_inherit_kwargs`` from the
    # parent class method.
    @docstrings.doc_inherit_kwargs(
        LeafNodeScaledConformalPredictor.calibrate,
        style=docstrings.str_format_merge_style,
        description="",
        predict_with_interval_method="pitci.lightgbm.LGBMBoosterLeafNodeScaledConformalPredictor.predict_with_interval",
        data_type="np.ndarray or pd.DataFrame",
        response_type="np.ndarray or pd.Series",
        train_data_type="np.ndarray, pd.DataFrame or None, default = None",
    )
    def calibrate(
        self,
        data: Union[np.ndarray, pd.DataFrame],
        response: Union[np.ndarray, pd.Series],
        alpha: Union[int, float] = 0.95,
        train_data: Union[np.ndarray, pd.DataFrame] = None,
    ) -> None:

        # lightgbm specific type checks; train_data is optional so None is
        # an accepted type for it
        check_type(data, [np.ndarray, pd.DataFrame], "data")
        check_type(train_data, [np.ndarray, pd.DataFrame, type(None)], "train_data")

        super().calibrate(
            data=data, train_data=train_data, response=response, alpha=alpha
        )

    @docstrings.doc_inherit_kwargs(
        LeafNodeScaledConformalPredictor.predict_with_interval,
        style=docstrings.str_format_merge_style,
        description="",
        data_type="np.ndarray or pd.DataFrame",
    )
    def predict_with_interval(
        self, data: Union[np.ndarray, pd.DataFrame]
    ) -> np.ndarray:

        check_type(data, [np.ndarray, pd.DataFrame], "data")

        return super().predict_with_interval(data=data)

    def _calibrate_leaf_node_counts(self, data: Any) -> None:
        """Method to record how often each leaf node was visited on the
        training dataset.

        The information is taken from the dataframe returned by the model's
        `trees_to_dataframe` method, where the `count` column gives the
        number of records in the training data that fall into a node. See
        https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html#lightgbm.Booster.trees_to_dataframe
        for more info.

        The result is stored in the ``leaf_node_counts`` attribute; a list
        with one dict per tree, in ascending order of tree index, mapping the
        leaf index to the count for that leaf.

        Parameters
        ----------
        data : Any
            Not used by this method, the counts come from the model itself.

        """

        def _leaf_number(node_index):
            # keep the part after the "-" and drop the "L" marker from it,
            # what is left is the number of the leaf as a str
            return node_index.split("-")[1].replace("L", "")

        tree_table = self.model.trees_to_dataframe()

        # rows with no split feature are the leaf nodes
        is_leaf = tree_table["split_feature"].isnull()
        leaf_rows = tree_table.loc[is_leaf].copy()

        leaf_rows["leaf_index"] = leaf_rows["node_index"].apply(_leaf_number).astype(int)

        self.leaf_node_counts = []

        for tree_index in np.sort(leaf_rows["tree_index"].unique()):

            in_tree = leaf_rows.loc[
                leaf_rows["tree_index"] == tree_index, ["leaf_index", "count"]
            ]

            # pair up leaf index and count row by row for the current tree
            self.leaf_node_counts.append(
                dict(
                    zip(
                        in_tree["leaf_index"].to_numpy(),
                        in_tree["count"].to_numpy(),
                    )
                )
            )

    def _generate_predictions(
        self, data: Union[pd.DataFrame, np.ndarray]
    ) -> np.ndarray:
        """Method to generate predictions from the underlying lightgbm model.

        ``predict`` is called on the model with ``data`` as the only argument.
        The number of trees to predict with is therefore not specified and
        lightgbm's default behaviour for the `num_iteration` argument applies;
        https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html#lightgbm.Booster.predict.

        Parameters
        ----------
        data : pd.DataFrame or np.ndarray
            Data to generate predictions on.

        Returns
        -------
        np.ndarray
            Output of the model's ``predict`` method for ``data``.

        """

        return self.model.predict(data)

    def _generate_leaf_node_predictions(
        self, data: Union[pd.DataFrame, np.ndarray]
    ) -> np.ndarray:
        """Method to generate leaf node predictions from the underlying
        lightgbm model.

        ``predict`` is called on the model with ``pred_leaf = True``. As in
        the ``_generate_predictions`` method the number of trees to predict
        with is not specified, so lightgbm's default behaviour applies.

        If the output of predict is a 1d array it is reshaped into a 2d array
        with a single column.

        Parameters
        ----------
        data : pd.DataFrame or np.ndarray
            Data to generate predictions on.

        Returns
        -------
        np.ndarray
            2d array giving, for each record, the leaf node it falls into in
            each tree.

        """

        # expected to be a matrix of (nsample, ntrees)
        leaf_nodes = self.model.predict(data, pred_leaf=True)

        if leaf_nodes.ndim == 1:

            # turn the 1d output into a single column 2d array
            (n_rows,) = leaf_nodes.shape
            leaf_nodes = leaf_nodes.reshape((n_rows, 1))

        return leaf_nodes
class LGBMBoosterSplitLeafNodeScaledConformalPredictor(
    SplitConformalPredictorMixin, LGBMBoosterLeafNodeScaledConformalPredictor
):

    # The class docstring is assembled from the docstrings of the mixin and of
    # the lightgbm leaf node scaled conformal predictor class.
    __doc__ = docstrings.combine_split_mixin_docs(
        SplitConformalPredictorMixin, LGBMBoosterLeafNodeScaledConformalPredictor
    )

    # No docstring is written on the decorated methods below; their
    # documentation is produced by ``docstrings.doc_inherit_kwargs`` from the
    # method passed to the decorator.
    @docstrings.doc_inherit_kwargs(
        LeafNodeScaledConformalPredictor.calibrate,
        style=docstrings.str_format_merge_style,
        description="The ``baseline_interval`` values are each calibrated to the required ``alpha``\n"
        "\tlevel on the subsets of the data where the scaling factor values\n"
        "\tfall into the range for that particular bucket.",
        predict_with_interval_method="pitci.lightgbm.LGBMBoosterLeafNodeScaledConformalPredictor.predict_with_interval",
        data_type="np.ndarray or pd.DataFrame",
        response_type="np.ndarray or pd.Series",
        train_data_type="np.ndarray, pd.DataFrame or None, default = None",
    )
    def calibrate(
        self,
        data: Union[np.ndarray, pd.DataFrame],
        response: Union[np.ndarray, pd.Series],
        alpha: Union[int, float] = 0.95,
        train_data: Union[np.ndarray, pd.DataFrame] = None,
    ) -> None:

        # all of the work is delegated to the next class in the method
        # resolution order; the method is redefined here so that it carries
        # documentation specific to this class
        super().calibrate(
            data=data, response=response, alpha=alpha, train_data=train_data
        )

    # The documentation values below refer to the lightgbm classes of this
    # module and use the same "np.ndarray or pd.DataFrame" wording as the
    # other methods. Previously they named the xgboost class and read
    # "pd.DataFrame of np.ndarray".
    @docstrings.doc_inherit_kwargs(
        LGBMBoosterLeafNodeScaledConformalPredictor.predict_with_interval,
        style=docstrings.str_format_merge_style,
        predict_with_interval_method="pitci.lightgbm.LGBMBoosterLeafNodeScaledConformalPredictor.predict_with_interval",
        data_type="np.ndarray or pd.DataFrame",
    )
    def predict_with_interval(
        self, data: Union[np.ndarray, pd.DataFrame]
    ) -> np.ndarray:

        # delegated to the next class in the method resolution order
        return super().predict_with_interval(data=data)
# The three functions below are registered against ``lgb.basic.Booster`` on
# the dispatcher functions imported from ``.dispatchers``.


# NOTE(review): the function name refers to xgb although the function handles
# lgb.Booster objects; the name is left unchanged so existing references to
# it keep working.
@get_absolute_error_conformal_predictor.register(lgb.basic.Booster)
def return_xgb_booster_absolute_error_confromal_predictor(
    model: lgb.Booster,
) -> LGBMBoosterAbsoluteErrorConformalPredictor:
    """Function to return an instance of the
    LGBMBoosterAbsoluteErrorConformalPredictor class for the passed
    lgb.Booster object.
    """

    return LGBMBoosterAbsoluteErrorConformalPredictor(model=model)


@get_leaf_node_scaled_conformal_predictor.register(lgb.basic.Booster)
def return_lgb_booster_leaf_node_scaled_confromal_predictor(
    model: lgb.Booster,
) -> LGBMBoosterLeafNodeScaledConformalPredictor:
    """Function to return an instance of the
    LGBMBoosterLeafNodeScaledConformalPredictor class for the passed
    lgb.Booster object.
    """

    return LGBMBoosterLeafNodeScaledConformalPredictor(model=model)


@get_split_leaf_node_scaled_conformal_predictor.register(lgb.basic.Booster)
def return_lgb_booster_leaf_node_split_confromal_predictor(
    model: lgb.Booster, n_bins: int = 3
) -> LGBMBoosterSplitLeafNodeScaledConformalPredictor:
    """Function to return an instance of the
    LGBMBoosterSplitLeafNodeScaledConformalPredictor class for the passed
    lgb.Booster object.

    The ``n_bins`` argument is passed on to the class unchanged.
    """

    return LGBMBoosterSplitLeafNodeScaledConformalPredictor(
        n_bins=n_bins, model=model
    )