1178 lines
40 KiB
Python
1178 lines
40 KiB
Python
"""
|
|
This module contains loss classes suitable for fitting.
|
|
|
|
It is not part of the public API.
|
|
Specific losses are used for regression, binary classification or multiclass
|
|
classification.
|
|
"""
|
|
# Goals:
|
|
# - Provide a common private module for loss functions/classes.
|
|
# - To be used in:
|
|
# - LogisticRegression
|
|
# - PoissonRegressor, GammaRegressor, TweedieRegressor
|
|
# - HistGradientBoostingRegressor, HistGradientBoostingClassifier
|
|
# - GradientBoostingRegressor, GradientBoostingClassifier
|
|
# - SGDRegressor, SGDClassifier
|
|
# - Replace link module of GLMs.
|
|
|
|
import numbers
|
|
|
|
import numpy as np
|
|
from scipy.special import xlogy
|
|
|
|
from ..utils import check_scalar
|
|
from ..utils.stats import _weighted_percentile
|
|
from ._loss import (
|
|
CyAbsoluteError,
|
|
CyExponentialLoss,
|
|
CyHalfBinomialLoss,
|
|
CyHalfGammaLoss,
|
|
CyHalfMultinomialLoss,
|
|
CyHalfPoissonLoss,
|
|
CyHalfSquaredError,
|
|
CyHalfTweedieLoss,
|
|
CyHalfTweedieLossIdentity,
|
|
CyHuberLoss,
|
|
CyPinballLoss,
|
|
)
|
|
from .link import (
|
|
HalfLogitLink,
|
|
IdentityLink,
|
|
Interval,
|
|
LogitLink,
|
|
LogLink,
|
|
MultinomialLogit,
|
|
)
|
|
|
|
|
|
# Note: The shape of raw_prediction for multiclass classifications are
|
|
# - GradientBoostingClassifier: (n_samples, n_classes)
|
|
# - HistGradientBoostingClassifier: (n_classes, n_samples)
|
|
#
|
|
# Note: Instead of inheritance like
|
|
#
|
|
# class BaseLoss(BaseLink, CyLossFunction):
|
|
# ...
|
|
#
|
|
# # Note: Naturally, we would inherit in the following order
|
|
# # class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss)
|
|
# # But because of https://github.com/cython/cython/issues/4350 we set BaseLoss as
|
|
# # the last one. This, of course, changes the MRO.
|
|
# class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss):
|
|
#
|
|
# we use composition. This way we improve maintainability by avoiding the above
|
|
# mentioned Cython edge case and have easier to understand code (which method calls
|
|
# which code).
|
|
class BaseLoss:
|
|
"""Base class for a loss function of 1-dimensional targets.
|
|
|
|
Conventions:
|
|
|
|
- y_true.shape = sample_weight.shape = (n_samples,)
|
|
- y_pred.shape = raw_prediction.shape = (n_samples,)
|
|
- If is_multiclass is true (multiclass classification), then
|
|
y_pred.shape = raw_prediction.shape = (n_samples, n_classes)
|
|
Note that this corresponds to the return value of decision_function.
|
|
|
|
y_true, y_pred, sample_weight and raw_prediction must either be all float64
|
|
or all float32.
|
|
gradient and hessian must be either both float64 or both float32.
|
|
|
|
Note that y_pred = link.inverse(raw_prediction).
|
|
|
|
Specific loss classes can inherit specific link classes to satisfy
|
|
BaseLink's abstractmethods.
|
|
|
|
Parameters
|
|
----------
|
|
sample_weight : {None, ndarray}
|
|
If sample_weight is None, the hessian might be constant.
|
|
n_classes : {None, int}
|
|
The number of classes for classification, else None.
|
|
|
|
Attributes
|
|
----------
|
|
closs: CyLossFunction
|
|
link : BaseLink
|
|
interval_y_true : Interval
|
|
Valid interval for y_true
|
|
interval_y_pred : Interval
|
|
Valid Interval for y_pred
|
|
differentiable : bool
|
|
Indicates whether or not loss function is differentiable in
|
|
raw_prediction everywhere.
|
|
need_update_leaves_values : bool
|
|
Indicates whether decision trees in gradient boosting need to uptade
|
|
leave values after having been fit to the (negative) gradients.
|
|
approx_hessian : bool
|
|
Indicates whether the hessian is approximated or exact. If,
|
|
approximated, it should be larger or equal to the exact one.
|
|
constant_hessian : bool
|
|
Indicates whether the hessian is one for this loss.
|
|
is_multiclass : bool
|
|
Indicates whether n_classes > 2 is allowed.
|
|
"""
|
|
|
|
# For gradient boosted decision trees:
|
|
# This variable indicates whether the loss requires the leaves values to
|
|
# be updated once the tree has been trained. The trees are trained to
|
|
# predict a Newton-Raphson step (see grower._finalize_leaf()). But for
|
|
# some losses (e.g. least absolute deviation) we need to adjust the tree
|
|
# values to account for the "line search" of the gradient descent
|
|
# procedure. See the original paper Greedy Function Approximation: A
|
|
# Gradient Boosting Machine by Friedman
|
|
# (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory.
|
|
differentiable = True
|
|
need_update_leaves_values = False
|
|
is_multiclass = False
|
|
|
|
def __init__(self, closs, link, n_classes=None):
|
|
self.closs = closs
|
|
self.link = link
|
|
self.approx_hessian = False
|
|
self.constant_hessian = False
|
|
self.n_classes = n_classes
|
|
self.interval_y_true = Interval(-np.inf, np.inf, False, False)
|
|
self.interval_y_pred = self.link.interval_y_pred
|
|
|
|
def in_y_true_range(self, y):
|
|
"""Return True if y is in the valid range of y_true.
|
|
|
|
Parameters
|
|
----------
|
|
y : ndarray
|
|
"""
|
|
return self.interval_y_true.includes(y)
|
|
|
|
def in_y_pred_range(self, y):
|
|
"""Return True if y is in the valid range of y_pred.
|
|
|
|
Parameters
|
|
----------
|
|
y : ndarray
|
|
"""
|
|
return self.interval_y_pred.includes(y)
|
|
|
|
def loss(
|
|
self,
|
|
y_true,
|
|
raw_prediction,
|
|
sample_weight=None,
|
|
loss_out=None,
|
|
n_threads=1,
|
|
):
|
|
"""Compute the pointwise loss value for each input.
|
|
|
|
Parameters
|
|
----------
|
|
y_true : C-contiguous array of shape (n_samples,)
|
|
Observed, true target values.
|
|
raw_prediction : C-contiguous array of shape (n_samples,) or array of \
|
|
shape (n_samples, n_classes)
|
|
Raw prediction values (in link space).
|
|
sample_weight : None or C-contiguous array of shape (n_samples,)
|
|
Sample weights.
|
|
loss_out : None or C-contiguous array of shape (n_samples,)
|
|
A location into which the result is stored. If None, a new array
|
|
might be created.
|
|
n_threads : int, default=1
|
|
Might use openmp thread parallelism.
|
|
|
|
Returns
|
|
-------
|
|
loss : array of shape (n_samples,)
|
|
Element-wise loss function.
|
|
"""
|
|
if loss_out is None:
|
|
loss_out = np.empty_like(y_true)
|
|
# Be graceful to shape (n_samples, 1) -> (n_samples,)
|
|
if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
|
|
raw_prediction = raw_prediction.squeeze(1)
|
|
|
|
self.closs.loss(
|
|
y_true=y_true,
|
|
raw_prediction=raw_prediction,
|
|
sample_weight=sample_weight,
|
|
loss_out=loss_out,
|
|
n_threads=n_threads,
|
|
)
|
|
return loss_out
|
|
|
|
def loss_gradient(
|
|
self,
|
|
y_true,
|
|
raw_prediction,
|
|
sample_weight=None,
|
|
loss_out=None,
|
|
gradient_out=None,
|
|
n_threads=1,
|
|
):
|
|
"""Compute loss and gradient w.r.t. raw_prediction for each input.
|
|
|
|
Parameters
|
|
----------
|
|
y_true : C-contiguous array of shape (n_samples,)
|
|
Observed, true target values.
|
|
raw_prediction : C-contiguous array of shape (n_samples,) or array of \
|
|
shape (n_samples, n_classes)
|
|
Raw prediction values (in link space).
|
|
sample_weight : None or C-contiguous array of shape (n_samples,)
|
|
Sample weights.
|
|
loss_out : None or C-contiguous array of shape (n_samples,)
|
|
A location into which the loss is stored. If None, a new array
|
|
might be created.
|
|
gradient_out : None or C-contiguous array of shape (n_samples,) or array \
|
|
of shape (n_samples, n_classes)
|
|
A location into which the gradient is stored. If None, a new array
|
|
might be created.
|
|
n_threads : int, default=1
|
|
Might use openmp thread parallelism.
|
|
|
|
Returns
|
|
-------
|
|
loss : array of shape (n_samples,)
|
|
Element-wise loss function.
|
|
|
|
gradient : array of shape (n_samples,) or (n_samples, n_classes)
|
|
Element-wise gradients.
|
|
"""
|
|
if loss_out is None:
|
|
if gradient_out is None:
|
|
loss_out = np.empty_like(y_true)
|
|
gradient_out = np.empty_like(raw_prediction)
|
|
else:
|
|
loss_out = np.empty_like(y_true, dtype=gradient_out.dtype)
|
|
elif gradient_out is None:
|
|
gradient_out = np.empty_like(raw_prediction, dtype=loss_out.dtype)
|
|
|
|
# Be graceful to shape (n_samples, 1) -> (n_samples,)
|
|
if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
|
|
raw_prediction = raw_prediction.squeeze(1)
|
|
if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
|
|
gradient_out = gradient_out.squeeze(1)
|
|
|
|
self.closs.loss_gradient(
|
|
y_true=y_true,
|
|
raw_prediction=raw_prediction,
|
|
sample_weight=sample_weight,
|
|
loss_out=loss_out,
|
|
gradient_out=gradient_out,
|
|
n_threads=n_threads,
|
|
)
|
|
return loss_out, gradient_out
|
|
|
|
def gradient(
|
|
self,
|
|
y_true,
|
|
raw_prediction,
|
|
sample_weight=None,
|
|
gradient_out=None,
|
|
n_threads=1,
|
|
):
|
|
"""Compute gradient of loss w.r.t raw_prediction for each input.
|
|
|
|
Parameters
|
|
----------
|
|
y_true : C-contiguous array of shape (n_samples,)
|
|
Observed, true target values.
|
|
raw_prediction : C-contiguous array of shape (n_samples,) or array of \
|
|
shape (n_samples, n_classes)
|
|
Raw prediction values (in link space).
|
|
sample_weight : None or C-contiguous array of shape (n_samples,)
|
|
Sample weights.
|
|
gradient_out : None or C-contiguous array of shape (n_samples,) or array \
|
|
of shape (n_samples, n_classes)
|
|
A location into which the result is stored. If None, a new array
|
|
might be created.
|
|
n_threads : int, default=1
|
|
Might use openmp thread parallelism.
|
|
|
|
Returns
|
|
-------
|
|
gradient : array of shape (n_samples,) or (n_samples, n_classes)
|
|
Element-wise gradients.
|
|
"""
|
|
if gradient_out is None:
|
|
gradient_out = np.empty_like(raw_prediction)
|
|
|
|
# Be graceful to shape (n_samples, 1) -> (n_samples,)
|
|
if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
|
|
raw_prediction = raw_prediction.squeeze(1)
|
|
if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
|
|
gradient_out = gradient_out.squeeze(1)
|
|
|
|
self.closs.gradient(
|
|
y_true=y_true,
|
|
raw_prediction=raw_prediction,
|
|
sample_weight=sample_weight,
|
|
gradient_out=gradient_out,
|
|
n_threads=n_threads,
|
|
)
|
|
return gradient_out
|
|
|
|
def gradient_hessian(
|
|
self,
|
|
y_true,
|
|
raw_prediction,
|
|
sample_weight=None,
|
|
gradient_out=None,
|
|
hessian_out=None,
|
|
n_threads=1,
|
|
):
|
|
"""Compute gradient and hessian of loss w.r.t raw_prediction.
|
|
|
|
Parameters
|
|
----------
|
|
y_true : C-contiguous array of shape (n_samples,)
|
|
Observed, true target values.
|
|
raw_prediction : C-contiguous array of shape (n_samples,) or array of \
|
|
shape (n_samples, n_classes)
|
|
Raw prediction values (in link space).
|
|
sample_weight : None or C-contiguous array of shape (n_samples,)
|
|
Sample weights.
|
|
gradient_out : None or C-contiguous array of shape (n_samples,) or array \
|
|
of shape (n_samples, n_classes)
|
|
A location into which the gradient is stored. If None, a new array
|
|
might be created.
|
|
hessian_out : None or C-contiguous array of shape (n_samples,) or array \
|
|
of shape (n_samples, n_classes)
|
|
A location into which the hessian is stored. If None, a new array
|
|
might be created.
|
|
n_threads : int, default=1
|
|
Might use openmp thread parallelism.
|
|
|
|
Returns
|
|
-------
|
|
gradient : arrays of shape (n_samples,) or (n_samples, n_classes)
|
|
Element-wise gradients.
|
|
|
|
hessian : arrays of shape (n_samples,) or (n_samples, n_classes)
|
|
Element-wise hessians.
|
|
"""
|
|
if gradient_out is None:
|
|
if hessian_out is None:
|
|
gradient_out = np.empty_like(raw_prediction)
|
|
hessian_out = np.empty_like(raw_prediction)
|
|
else:
|
|
gradient_out = np.empty_like(hessian_out)
|
|
elif hessian_out is None:
|
|
hessian_out = np.empty_like(gradient_out)
|
|
|
|
# Be graceful to shape (n_samples, 1) -> (n_samples,)
|
|
if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
|
|
raw_prediction = raw_prediction.squeeze(1)
|
|
if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
|
|
gradient_out = gradient_out.squeeze(1)
|
|
if hessian_out.ndim == 2 and hessian_out.shape[1] == 1:
|
|
hessian_out = hessian_out.squeeze(1)
|
|
|
|
self.closs.gradient_hessian(
|
|
y_true=y_true,
|
|
raw_prediction=raw_prediction,
|
|
sample_weight=sample_weight,
|
|
gradient_out=gradient_out,
|
|
hessian_out=hessian_out,
|
|
n_threads=n_threads,
|
|
)
|
|
return gradient_out, hessian_out
|
|
|
|
def __call__(self, y_true, raw_prediction, sample_weight=None, n_threads=1):
|
|
"""Compute the weighted average loss.
|
|
|
|
Parameters
|
|
----------
|
|
y_true : C-contiguous array of shape (n_samples,)
|
|
Observed, true target values.
|
|
raw_prediction : C-contiguous array of shape (n_samples,) or array of \
|
|
shape (n_samples, n_classes)
|
|
Raw prediction values (in link space).
|
|
sample_weight : None or C-contiguous array of shape (n_samples,)
|
|
Sample weights.
|
|
n_threads : int, default=1
|
|
Might use openmp thread parallelism.
|
|
|
|
Returns
|
|
-------
|
|
loss : float
|
|
Mean or averaged loss function.
|
|
"""
|
|
return np.average(
|
|
self.loss(
|
|
y_true=y_true,
|
|
raw_prediction=raw_prediction,
|
|
sample_weight=None,
|
|
loss_out=None,
|
|
n_threads=n_threads,
|
|
),
|
|
weights=sample_weight,
|
|
)
|
|
|
|
def fit_intercept_only(self, y_true, sample_weight=None):
|
|
"""Compute raw_prediction of an intercept-only model.
|
|
|
|
This can be used as initial estimates of predictions, i.e. before the
|
|
first iteration in fit.
|
|
|
|
Parameters
|
|
----------
|
|
y_true : array-like of shape (n_samples,)
|
|
Observed, true target values.
|
|
sample_weight : None or array of shape (n_samples,)
|
|
Sample weights.
|
|
|
|
Returns
|
|
-------
|
|
raw_prediction : numpy scalar or array of shape (n_classes,)
|
|
Raw predictions of an intercept-only model.
|
|
"""
|
|
# As default, take weighted average of the target over the samples
|
|
# axis=0 and then transform into link-scale (raw_prediction).
|
|
y_pred = np.average(y_true, weights=sample_weight, axis=0)
|
|
eps = 10 * np.finfo(y_pred.dtype).eps
|
|
|
|
if self.interval_y_pred.low == -np.inf:
|
|
a_min = None
|
|
elif self.interval_y_pred.low_inclusive:
|
|
a_min = self.interval_y_pred.low
|
|
else:
|
|
a_min = self.interval_y_pred.low + eps
|
|
|
|
if self.interval_y_pred.high == np.inf:
|
|
a_max = None
|
|
elif self.interval_y_pred.high_inclusive:
|
|
a_max = self.interval_y_pred.high
|
|
else:
|
|
a_max = self.interval_y_pred.high - eps
|
|
|
|
if a_min is None and a_max is None:
|
|
return self.link.link(y_pred)
|
|
else:
|
|
return self.link.link(np.clip(y_pred, a_min, a_max))
|
|
|
|
def constant_to_optimal_zero(self, y_true, sample_weight=None):
|
|
"""Calculate term dropped in loss.
|
|
|
|
With this term added, the loss of perfect predictions is zero.
|
|
"""
|
|
return np.zeros_like(y_true)
|
|
|
|
def init_gradient_and_hessian(self, n_samples, dtype=np.float64, order="F"):
|
|
"""Initialize arrays for gradients and hessians.
|
|
|
|
Unless hessians are constant, arrays are initialized with undefined values.
|
|
|
|
Parameters
|
|
----------
|
|
n_samples : int
|
|
The number of samples, usually passed to `fit()`.
|
|
dtype : {np.float64, np.float32}, default=np.float64
|
|
The dtype of the arrays gradient and hessian.
|
|
order : {'C', 'F'}, default='F'
|
|
Order of the arrays gradient and hessian. The default 'F' makes the arrays
|
|
contiguous along samples.
|
|
|
|
Returns
|
|
-------
|
|
gradient : C-contiguous array of shape (n_samples,) or array of shape \
|
|
(n_samples, n_classes)
|
|
Empty array (allocated but not initialized) to be used as argument
|
|
gradient_out.
|
|
hessian : C-contiguous array of shape (n_samples,), array of shape
|
|
(n_samples, n_classes) or shape (1,)
|
|
Empty (allocated but not initialized) array to be used as argument
|
|
hessian_out.
|
|
If constant_hessian is True (e.g. `HalfSquaredError`), the array is
|
|
initialized to ``1``.
|
|
"""
|
|
if dtype not in (np.float32, np.float64):
|
|
raise ValueError(
|
|
"Valid options for 'dtype' are np.float32 and np.float64. "
|
|
f"Got dtype={dtype} instead."
|
|
)
|
|
|
|
if self.is_multiclass:
|
|
shape = (n_samples, self.n_classes)
|
|
else:
|
|
shape = (n_samples,)
|
|
gradient = np.empty(shape=shape, dtype=dtype, order=order)
|
|
|
|
if self.constant_hessian:
|
|
# If the hessians are constant, we consider them equal to 1.
|
|
# - This is correct for HalfSquaredError
|
|
# - For AbsoluteError, hessians are actually 0, but they are
|
|
# always ignored anyway.
|
|
hessian = np.ones(shape=(1,), dtype=dtype)
|
|
else:
|
|
hessian = np.empty(shape=shape, dtype=dtype, order=order)
|
|
|
|
return gradient, hessian
|
|
|
|
|
|
# Note: Naturally, we would inherit in the following order
|
|
# class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss)
|
|
# But because of https://github.com/cython/cython/issues/4350 we
|
|
# set BaseLoss as the last one. This, of course, changes the MRO.
|
|
class HalfSquaredError(BaseLoss):
|
|
"""Half squared error with identity link, for regression.
|
|
|
|
Domain:
|
|
y_true and y_pred all real numbers
|
|
|
|
Link:
|
|
y_pred = raw_prediction
|
|
|
|
For a given sample x_i, half squared error is defined as::
|
|
|
|
loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2
|
|
|
|
The factor of 0.5 simplifies the computation of gradients and results in a
|
|
unit hessian (and is consistent with what is done in LightGBM). It is also
|
|
half the Normal distribution deviance.
|
|
"""
|
|
|
|
def __init__(self, sample_weight=None):
|
|
super().__init__(closs=CyHalfSquaredError(), link=IdentityLink())
|
|
self.constant_hessian = sample_weight is None
|
|
|
|
|
|
class AbsoluteError(BaseLoss):
|
|
"""Absolute error with identity link, for regression.
|
|
|
|
Domain:
|
|
y_true and y_pred all real numbers
|
|
|
|
Link:
|
|
y_pred = raw_prediction
|
|
|
|
For a given sample x_i, the absolute error is defined as::
|
|
|
|
loss(x_i) = |y_true_i - raw_prediction_i|
|
|
|
|
Note that the exact hessian = 0 almost everywhere (except at one point, therefore
|
|
differentiable = False). Optimization routines like in HGBT, however, need a
|
|
hessian > 0. Therefore, we assign 1.
|
|
"""
|
|
|
|
differentiable = False
|
|
need_update_leaves_values = True
|
|
|
|
def __init__(self, sample_weight=None):
|
|
super().__init__(closs=CyAbsoluteError(), link=IdentityLink())
|
|
self.approx_hessian = True
|
|
self.constant_hessian = sample_weight is None
|
|
|
|
def fit_intercept_only(self, y_true, sample_weight=None):
|
|
"""Compute raw_prediction of an intercept-only model.
|
|
|
|
This is the weighted median of the target, i.e. over the samples
|
|
axis=0.
|
|
"""
|
|
if sample_weight is None:
|
|
return np.median(y_true, axis=0)
|
|
else:
|
|
return _weighted_percentile(y_true, sample_weight, 50)
|
|
|
|
|
|
class PinballLoss(BaseLoss):
|
|
"""Quantile loss aka pinball loss, for regression.
|
|
|
|
Domain:
|
|
y_true and y_pred all real numbers
|
|
quantile in (0, 1)
|
|
|
|
Link:
|
|
y_pred = raw_prediction
|
|
|
|
For a given sample x_i, the pinball loss is defined as::
|
|
|
|
loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i)
|
|
|
|
rho_{quantile}(u) = u * (quantile - 1_{u<0})
|
|
= -u *(1 - quantile) if u < 0
|
|
u * quantile if u >= 0
|
|
|
|
Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError().
|
|
|
|
Note that the exact hessian = 0 almost everywhere (except at one point, therefore
|
|
differentiable = False). Optimization routines like in HGBT, however, need a
|
|
hessian > 0. Therefore, we assign 1.
|
|
|
|
Additional Attributes
|
|
---------------------
|
|
quantile : float
|
|
The quantile level of the quantile to be estimated. Must be in range (0, 1).
|
|
"""
|
|
|
|
differentiable = False
|
|
need_update_leaves_values = True
|
|
|
|
def __init__(self, sample_weight=None, quantile=0.5):
|
|
check_scalar(
|
|
quantile,
|
|
"quantile",
|
|
target_type=numbers.Real,
|
|
min_val=0,
|
|
max_val=1,
|
|
include_boundaries="neither",
|
|
)
|
|
super().__init__(
|
|
closs=CyPinballLoss(quantile=float(quantile)),
|
|
link=IdentityLink(),
|
|
)
|
|
self.approx_hessian = True
|
|
self.constant_hessian = sample_weight is None
|
|
|
|
def fit_intercept_only(self, y_true, sample_weight=None):
|
|
"""Compute raw_prediction of an intercept-only model.
|
|
|
|
This is the weighted median of the target, i.e. over the samples
|
|
axis=0.
|
|
"""
|
|
if sample_weight is None:
|
|
return np.percentile(y_true, 100 * self.closs.quantile, axis=0)
|
|
else:
|
|
return _weighted_percentile(
|
|
y_true, sample_weight, 100 * self.closs.quantile
|
|
)
|
|
|
|
|
|
class HuberLoss(BaseLoss):
|
|
"""Huber loss, for regression.
|
|
|
|
Domain:
|
|
y_true and y_pred all real numbers
|
|
quantile in (0, 1)
|
|
|
|
Link:
|
|
y_pred = raw_prediction
|
|
|
|
For a given sample x_i, the Huber loss is defined as::
|
|
|
|
loss(x_i) = 1/2 * abserr**2 if abserr <= delta
|
|
delta * (abserr - delta/2) if abserr > delta
|
|
|
|
abserr = |y_true_i - raw_prediction_i|
|
|
delta = quantile(abserr, self.quantile)
|
|
|
|
Note: HuberLoss(quantile=1) equals HalfSquaredError and HuberLoss(quantile=0)
|
|
equals delta * (AbsoluteError() - delta/2).
|
|
|
|
Additional Attributes
|
|
---------------------
|
|
quantile : float
|
|
The quantile level which defines the breaking point `delta` to distinguish
|
|
between absolute error and squared error. Must be in range (0, 1).
|
|
|
|
Reference
|
|
---------
|
|
.. [1] Friedman, J.H. (2001). :doi:`Greedy function approximation: A gradient
|
|
boosting machine <10.1214/aos/1013203451>`.
|
|
Annals of Statistics, 29, 1189-1232.
|
|
"""
|
|
|
|
differentiable = False
|
|
need_update_leaves_values = True
|
|
|
|
def __init__(self, sample_weight=None, quantile=0.9, delta=0.5):
|
|
check_scalar(
|
|
quantile,
|
|
"quantile",
|
|
target_type=numbers.Real,
|
|
min_val=0,
|
|
max_val=1,
|
|
include_boundaries="neither",
|
|
)
|
|
self.quantile = quantile # This is better stored outside of Cython.
|
|
super().__init__(
|
|
closs=CyHuberLoss(delta=float(delta)),
|
|
link=IdentityLink(),
|
|
)
|
|
self.approx_hessian = True
|
|
self.constant_hessian = False
|
|
|
|
def fit_intercept_only(self, y_true, sample_weight=None):
|
|
"""Compute raw_prediction of an intercept-only model.
|
|
|
|
This is the weighted median of the target, i.e. over the samples
|
|
axis=0.
|
|
"""
|
|
# See formula before algo 4 in Friedman (2001), but we apply it to y_true,
|
|
# not to the residual y_true - raw_prediction. An estimator like
|
|
# HistGradientBoostingRegressor might then call it on the residual, e.g.
|
|
# fit_intercept_only(y_true - raw_prediction).
|
|
if sample_weight is None:
|
|
median = np.percentile(y_true, 50, axis=0)
|
|
else:
|
|
median = _weighted_percentile(y_true, sample_weight, 50)
|
|
diff = y_true - median
|
|
term = np.sign(diff) * np.minimum(self.closs.delta, np.abs(diff))
|
|
return median + np.average(term, weights=sample_weight)
|
|
|
|
|
|
class HalfPoissonLoss(BaseLoss):
|
|
"""Half Poisson deviance loss with log-link, for regression.
|
|
|
|
Domain:
|
|
y_true in non-negative real numbers
|
|
y_pred in positive real numbers
|
|
|
|
Link:
|
|
y_pred = exp(raw_prediction)
|
|
|
|
For a given sample x_i, half the Poisson deviance is defined as::
|
|
|
|
loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i))
|
|
- y_true_i + exp(raw_prediction_i)
|
|
|
|
Half the Poisson deviance is actually the negative log-likelihood up to
|
|
constant terms (not involving raw_prediction) and simplifies the
|
|
computation of the gradients.
|
|
We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`.
|
|
"""
|
|
|
|
def __init__(self, sample_weight=None):
|
|
super().__init__(closs=CyHalfPoissonLoss(), link=LogLink())
|
|
self.interval_y_true = Interval(0, np.inf, True, False)
|
|
|
|
def constant_to_optimal_zero(self, y_true, sample_weight=None):
|
|
term = xlogy(y_true, y_true) - y_true
|
|
if sample_weight is not None:
|
|
term *= sample_weight
|
|
return term
|
|
|
|
|
|
class HalfGammaLoss(BaseLoss):
|
|
"""Half Gamma deviance loss with log-link, for regression.
|
|
|
|
Domain:
|
|
y_true and y_pred in positive real numbers
|
|
|
|
Link:
|
|
y_pred = exp(raw_prediction)
|
|
|
|
For a given sample x_i, half Gamma deviance loss is defined as::
|
|
|
|
loss(x_i) = log(exp(raw_prediction_i)/y_true_i)
|
|
+ y_true/exp(raw_prediction_i) - 1
|
|
|
|
Half the Gamma deviance is actually proportional to the negative log-
|
|
likelihood up to constant terms (not involving raw_prediction) and
|
|
simplifies the computation of the gradients.
|
|
We also skip the constant term `-log(y_true_i) - 1`.
|
|
"""
|
|
|
|
def __init__(self, sample_weight=None):
|
|
super().__init__(closs=CyHalfGammaLoss(), link=LogLink())
|
|
self.interval_y_true = Interval(0, np.inf, False, False)
|
|
|
|
def constant_to_optimal_zero(self, y_true, sample_weight=None):
|
|
term = -np.log(y_true) - 1
|
|
if sample_weight is not None:
|
|
term *= sample_weight
|
|
return term
|
|
|
|
|
|
class HalfTweedieLoss(BaseLoss):
|
|
"""Half Tweedie deviance loss with log-link, for regression.
|
|
|
|
Domain:
|
|
y_true in real numbers for power <= 0
|
|
y_true in non-negative real numbers for 0 < power < 2
|
|
y_true in positive real numbers for 2 <= power
|
|
y_pred in positive real numbers
|
|
power in real numbers
|
|
|
|
Link:
|
|
y_pred = exp(raw_prediction)
|
|
|
|
For a given sample x_i, half Tweedie deviance loss with p=power is defined
|
|
as::
|
|
|
|
loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
|
|
- y_true_i * exp(raw_prediction_i)**(1-p) / (1-p)
|
|
+ exp(raw_prediction_i)**(2-p) / (2-p)
|
|
|
|
Taking the limits for p=0, 1, 2 gives HalfSquaredError with a log link,
|
|
HalfPoissonLoss and HalfGammaLoss.
|
|
|
|
We also skip constant terms, but those are different for p=0, 1, 2.
|
|
Therefore, the loss is not continuous in `power`.
|
|
|
|
Note furthermore that although no Tweedie distribution exists for
|
|
0 < power < 1, it still gives a strictly consistent scoring function for
|
|
the expectation.
|
|
"""
|
|
|
|
def __init__(self, sample_weight=None, power=1.5):
|
|
super().__init__(
|
|
closs=CyHalfTweedieLoss(power=float(power)),
|
|
link=LogLink(),
|
|
)
|
|
if self.closs.power <= 0:
|
|
self.interval_y_true = Interval(-np.inf, np.inf, False, False)
|
|
elif self.closs.power < 2:
|
|
self.interval_y_true = Interval(0, np.inf, True, False)
|
|
else:
|
|
self.interval_y_true = Interval(0, np.inf, False, False)
|
|
|
|
def constant_to_optimal_zero(self, y_true, sample_weight=None):
|
|
if self.closs.power == 0:
|
|
return HalfSquaredError().constant_to_optimal_zero(
|
|
y_true=y_true, sample_weight=sample_weight
|
|
)
|
|
elif self.closs.power == 1:
|
|
return HalfPoissonLoss().constant_to_optimal_zero(
|
|
y_true=y_true, sample_weight=sample_weight
|
|
)
|
|
elif self.closs.power == 2:
|
|
return HalfGammaLoss().constant_to_optimal_zero(
|
|
y_true=y_true, sample_weight=sample_weight
|
|
)
|
|
else:
|
|
p = self.closs.power
|
|
term = np.power(np.maximum(y_true, 0), 2 - p) / (1 - p) / (2 - p)
|
|
if sample_weight is not None:
|
|
term *= sample_weight
|
|
return term
|
|
|
|
|
|
class HalfTweedieLossIdentity(BaseLoss):
|
|
"""Half Tweedie deviance loss with identity link, for regression.
|
|
|
|
Domain:
|
|
y_true in real numbers for power <= 0
|
|
y_true in non-negative real numbers for 0 < power < 2
|
|
y_true in positive real numbers for 2 <= power
|
|
y_pred in positive real numbers for power != 0
|
|
y_pred in real numbers for power = 0
|
|
power in real numbers
|
|
|
|
Link:
|
|
y_pred = raw_prediction
|
|
|
|
For a given sample x_i, half Tweedie deviance loss with p=power is defined
|
|
as::
|
|
|
|
loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
|
|
- y_true_i * raw_prediction_i**(1-p) / (1-p)
|
|
+ raw_prediction_i**(2-p) / (2-p)
|
|
|
|
Note that the minimum value of this loss is 0.
|
|
|
|
Note furthermore that although no Tweedie distribution exists for
|
|
0 < power < 1, it still gives a strictly consistent scoring function for
|
|
the expectation.
|
|
"""
|
|
|
|
def __init__(self, sample_weight=None, power=1.5):
|
|
super().__init__(
|
|
closs=CyHalfTweedieLossIdentity(power=float(power)),
|
|
link=IdentityLink(),
|
|
)
|
|
if self.closs.power <= 0:
|
|
self.interval_y_true = Interval(-np.inf, np.inf, False, False)
|
|
elif self.closs.power < 2:
|
|
self.interval_y_true = Interval(0, np.inf, True, False)
|
|
else:
|
|
self.interval_y_true = Interval(0, np.inf, False, False)
|
|
|
|
if self.closs.power == 0:
|
|
self.interval_y_pred = Interval(-np.inf, np.inf, False, False)
|
|
else:
|
|
self.interval_y_pred = Interval(0, np.inf, False, False)
|
|
|
|
|
|
class HalfBinomialLoss(BaseLoss):
|
|
"""Half Binomial deviance loss with logit link, for binary classification.
|
|
|
|
This is also know as binary cross entropy, log-loss and logistic loss.
|
|
|
|
Domain:
|
|
y_true in [0, 1], i.e. regression on the unit interval
|
|
y_pred in (0, 1), i.e. boundaries excluded
|
|
|
|
Link:
|
|
y_pred = expit(raw_prediction)
|
|
|
|
For a given sample x_i, half Binomial deviance is defined as the negative
|
|
log-likelihood of the Binomial/Bernoulli distribution and can be expressed
|
|
as::
|
|
|
|
loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i
|
|
|
|
See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,
|
|
section 4.4.1 (about logistic regression).
|
|
|
|
Note that the formulation works for classification, y = {0, 1}, as well as
|
|
logistic regression, y = [0, 1].
|
|
If you add `constant_to_optimal_zero` to the loss, you get half the
|
|
Bernoulli/binomial deviance.
|
|
|
|
More details: Inserting the predicted probability y_pred = expit(raw_prediction)
|
|
in the loss gives the well known::
|
|
|
|
loss(x_i) = - y_true_i * log(y_pred_i) - (1 - y_true_i) * log(1 - y_pred_i)
|
|
"""
|
|
|
|
def __init__(self, sample_weight=None):
|
|
super().__init__(
|
|
closs=CyHalfBinomialLoss(),
|
|
link=LogitLink(),
|
|
n_classes=2,
|
|
)
|
|
self.interval_y_true = Interval(0, 1, True, True)
|
|
|
|
def constant_to_optimal_zero(self, y_true, sample_weight=None):
|
|
# This is non-zero only if y_true is neither 0 nor 1.
|
|
term = xlogy(y_true, y_true) + xlogy(1 - y_true, 1 - y_true)
|
|
if sample_weight is not None:
|
|
term *= sample_weight
|
|
return term
|
|
|
|
def predict_proba(self, raw_prediction):
|
|
"""Predict probabilities.
|
|
|
|
Parameters
|
|
----------
|
|
raw_prediction : array of shape (n_samples,) or (n_samples, 1)
|
|
Raw prediction values (in link space).
|
|
|
|
Returns
|
|
-------
|
|
proba : array of shape (n_samples, 2)
|
|
Element-wise class probabilities.
|
|
"""
|
|
# Be graceful to shape (n_samples, 1) -> (n_samples,)
|
|
if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
|
|
raw_prediction = raw_prediction.squeeze(1)
|
|
proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype)
|
|
proba[:, 1] = self.link.inverse(raw_prediction)
|
|
proba[:, 0] = 1 - proba[:, 1]
|
|
return proba
|
|
|
|
|
|
class HalfMultinomialLoss(BaseLoss):
|
|
"""Categorical cross-entropy loss, for multiclass classification.
|
|
|
|
Domain:
|
|
y_true in {0, 1, 2, 3, .., n_classes - 1}
|
|
y_pred has n_classes elements, each element in (0, 1)
|
|
|
|
Link:
|
|
y_pred = softmax(raw_prediction)
|
|
|
|
Note: We assume y_true to be already label encoded. The inverse link is
|
|
softmax. But the full link function is the symmetric multinomial logit
|
|
function.
|
|
|
|
For a given sample x_i, the categorical cross-entropy loss is defined as
|
|
the negative log-likelihood of the multinomial distribution, it
|
|
generalizes the binary cross-entropy to more than 2 classes::
|
|
|
|
loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1))
|
|
- sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1)
|
|
|
|
See [1].
|
|
|
|
Note that for the hessian, we calculate only the diagonal part in the
|
|
classes: If the full hessian for classes k and l and sample i is H_i_k_l,
|
|
we calculate H_i_k_k, i.e. k=l.
|
|
|
|
Reference
|
|
---------
|
|
.. [1] :arxiv:`Simon, Noah, J. Friedman and T. Hastie.
|
|
"A Blockwise Descent Algorithm for Group-penalized Multiresponse and
|
|
Multinomial Regression".
|
|
<1311.6529>`
|
|
"""
|
|
|
|
is_multiclass = True
|
|
|
|
def __init__(self, sample_weight=None, n_classes=3):
|
|
super().__init__(
|
|
closs=CyHalfMultinomialLoss(),
|
|
link=MultinomialLogit(),
|
|
n_classes=n_classes,
|
|
)
|
|
self.interval_y_true = Interval(0, np.inf, True, False)
|
|
self.interval_y_pred = Interval(0, 1, False, False)
|
|
|
|
def in_y_true_range(self, y):
|
|
"""Return True if y is in the valid range of y_true.
|
|
|
|
Parameters
|
|
----------
|
|
y : ndarray
|
|
"""
|
|
return self.interval_y_true.includes(y) and np.all(y.astype(int) == y)
|
|
|
|
def fit_intercept_only(self, y_true, sample_weight=None):
|
|
"""Compute raw_prediction of an intercept-only model.
|
|
|
|
This is the softmax of the weighted average of the target, i.e. over
|
|
the samples axis=0.
|
|
"""
|
|
out = np.zeros(self.n_classes, dtype=y_true.dtype)
|
|
eps = np.finfo(y_true.dtype).eps
|
|
for k in range(self.n_classes):
|
|
out[k] = np.average(y_true == k, weights=sample_weight, axis=0)
|
|
out[k] = np.clip(out[k], eps, 1 - eps)
|
|
return self.link.link(out[None, :]).reshape(-1)
|
|
|
|
def predict_proba(self, raw_prediction):
|
|
"""Predict probabilities.
|
|
|
|
Parameters
|
|
----------
|
|
raw_prediction : array of shape (n_samples, n_classes)
|
|
Raw prediction values (in link space).
|
|
|
|
Returns
|
|
-------
|
|
proba : array of shape (n_samples, n_classes)
|
|
Element-wise class probabilities.
|
|
"""
|
|
return self.link.inverse(raw_prediction)
|
|
|
|
def gradient_proba(
|
|
self,
|
|
y_true,
|
|
raw_prediction,
|
|
sample_weight=None,
|
|
gradient_out=None,
|
|
proba_out=None,
|
|
n_threads=1,
|
|
):
|
|
"""Compute gradient and class probabilities fow raw_prediction.
|
|
|
|
Parameters
|
|
----------
|
|
y_true : C-contiguous array of shape (n_samples,)
|
|
Observed, true target values.
|
|
raw_prediction : array of shape (n_samples, n_classes)
|
|
Raw prediction values (in link space).
|
|
sample_weight : None or C-contiguous array of shape (n_samples,)
|
|
Sample weights.
|
|
gradient_out : None or array of shape (n_samples, n_classes)
|
|
A location into which the gradient is stored. If None, a new array
|
|
might be created.
|
|
proba_out : None or array of shape (n_samples, n_classes)
|
|
A location into which the class probabilities are stored. If None,
|
|
a new array might be created.
|
|
n_threads : int, default=1
|
|
Might use openmp thread parallelism.
|
|
|
|
Returns
|
|
-------
|
|
gradient : array of shape (n_samples, n_classes)
|
|
Element-wise gradients.
|
|
|
|
proba : array of shape (n_samples, n_classes)
|
|
Element-wise class probabilities.
|
|
"""
|
|
if gradient_out is None:
|
|
if proba_out is None:
|
|
gradient_out = np.empty_like(raw_prediction)
|
|
proba_out = np.empty_like(raw_prediction)
|
|
else:
|
|
gradient_out = np.empty_like(proba_out)
|
|
elif proba_out is None:
|
|
proba_out = np.empty_like(gradient_out)
|
|
|
|
self.closs.gradient_proba(
|
|
y_true=y_true,
|
|
raw_prediction=raw_prediction,
|
|
sample_weight=sample_weight,
|
|
gradient_out=gradient_out,
|
|
proba_out=proba_out,
|
|
n_threads=n_threads,
|
|
)
|
|
return gradient_out, proba_out
|
|
|
|
|
|
class ExponentialLoss(BaseLoss):
|
|
"""Exponential loss with (half) logit link, for binary classification.
|
|
|
|
This is also know as boosting loss.
|
|
|
|
Domain:
|
|
y_true in [0, 1], i.e. regression on the unit interval
|
|
y_pred in (0, 1), i.e. boundaries excluded
|
|
|
|
Link:
|
|
y_pred = expit(2 * raw_prediction)
|
|
|
|
For a given sample x_i, the exponential loss is defined as::
|
|
|
|
loss(x_i) = y_true_i * exp(-raw_pred_i)) + (1 - y_true_i) * exp(raw_pred_i)
|
|
|
|
See:
|
|
- J. Friedman, T. Hastie, R. Tibshirani.
|
|
"Additive logistic regression: a statistical view of boosting (With discussion
|
|
and a rejoinder by the authors)." Ann. Statist. 28 (2) 337 - 407, April 2000.
|
|
https://doi.org/10.1214/aos/1016218223
|
|
- A. Buja, W. Stuetzle, Y. Shen. (2005).
|
|
"Loss Functions for Binary Class Probability Estimation and Classification:
|
|
Structure and Applications."
|
|
|
|
Note that the formulation works for classification, y = {0, 1}, as well as
|
|
"exponential logistic" regression, y = [0, 1].
|
|
Note that this is a proper scoring rule, but without it's canonical link.
|
|
|
|
More details: Inserting the predicted probability
|
|
y_pred = expit(2 * raw_prediction) in the loss gives::
|
|
|
|
loss(x_i) = y_true_i * sqrt((1 - y_pred_i) / y_pred_i)
|
|
+ (1 - y_true_i) * sqrt(y_pred_i / (1 - y_pred_i))
|
|
"""
|
|
|
|
def __init__(self, sample_weight=None):
|
|
super().__init__(
|
|
closs=CyExponentialLoss(),
|
|
link=HalfLogitLink(),
|
|
n_classes=2,
|
|
)
|
|
self.interval_y_true = Interval(0, 1, True, True)
|
|
|
|
def constant_to_optimal_zero(self, y_true, sample_weight=None):
|
|
# This is non-zero only if y_true is neither 0 nor 1.
|
|
term = -2 * np.sqrt(y_true * (1 - y_true))
|
|
if sample_weight is not None:
|
|
term *= sample_weight
|
|
return term
|
|
|
|
def predict_proba(self, raw_prediction):
|
|
"""Predict probabilities.
|
|
|
|
Parameters
|
|
----------
|
|
raw_prediction : array of shape (n_samples,) or (n_samples, 1)
|
|
Raw prediction values (in link space).
|
|
|
|
Returns
|
|
-------
|
|
proba : array of shape (n_samples, 2)
|
|
Element-wise class probabilities.
|
|
"""
|
|
# Be graceful to shape (n_samples, 1) -> (n_samples,)
|
|
if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
|
|
raw_prediction = raw_prediction.squeeze(1)
|
|
proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype)
|
|
proba[:, 1] = self.link.inverse(raw_prediction)
|
|
proba[:, 0] = 1 - proba[:, 1]
|
|
return proba
|
|
|
|
|
|
_LOSSES = {
|
|
"squared_error": HalfSquaredError,
|
|
"absolute_error": AbsoluteError,
|
|
"pinball_loss": PinballLoss,
|
|
"huber_loss": HuberLoss,
|
|
"poisson_loss": HalfPoissonLoss,
|
|
"gamma_loss": HalfGammaLoss,
|
|
"tweedie_loss": HalfTweedieLoss,
|
|
"binomial_loss": HalfBinomialLoss,
|
|
"multinomial_loss": HalfMultinomialLoss,
|
|
"exponential_loss": ExponentialLoss,
|
|
}
|