import pickle

import numpy as np
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from pytest import approx
from scipy.optimize import (
    LinearConstraint,
    minimize,
    minimize_scalar,
    newton,
)
from scipy.special import logsumexp

from sklearn._loss.link import IdentityLink, _inclusive_low_high
from sklearn._loss.loss import (
    _LOSSES,
    AbsoluteError,
    BaseLoss,
    HalfBinomialLoss,
    HalfGammaLoss,
    HalfMultinomialLoss,
    HalfPoissonLoss,
    HalfSquaredError,
    HalfTweedieLoss,
    HalfTweedieLossIdentity,
    HuberLoss,
    PinballLoss,
)
from sklearn.utils import _IS_WASM, assert_all_finite
from sklearn.utils._testing import create_memmap_backed_data, skip_if_32bit

ALL_LOSSES = list(_LOSSES.values())

LOSS_INSTANCES = [loss() for loss in ALL_LOSSES]
# HalfTweedieLoss(power=1.5) is already there as default
LOSS_INSTANCES += [
    PinballLoss(quantile=0.25),
    HuberLoss(quantile=0.75),
    HalfTweedieLoss(power=-1.5),
    HalfTweedieLoss(power=0),
    HalfTweedieLoss(power=1),
    HalfTweedieLoss(power=2),
    HalfTweedieLoss(power=3.0),
    HalfTweedieLossIdentity(power=0),
    HalfTweedieLossIdentity(power=1),
    HalfTweedieLossIdentity(power=2),
    HalfTweedieLossIdentity(power=3.0),
]


def loss_instance_name(param):
    if isinstance(param, BaseLoss):
        loss = param
        name = loss.__class__.__name__
        if isinstance(loss, PinballLoss):
            name += f"(quantile={loss.closs.quantile})"
        elif isinstance(loss, HuberLoss):
            name += f"(quantile={loss.quantile})"
        elif hasattr(loss, "closs") and hasattr(loss.closs, "power"):
            name += f"(power={loss.closs.power})"
        return name
    else:
        return str(param)


def random_y_true_raw_prediction(
    loss, n_samples, y_bound=(-100, 100), raw_bound=(-5, 5), seed=42
):
    """Randomly generate y_true and raw_prediction in valid range."""
    rng = np.random.RandomState(seed)
    if loss.is_multiclass:
        raw_prediction = np.empty((n_samples, loss.n_classes))
        raw_prediction.flat[:] = rng.uniform(
            low=raw_bound[0],
            high=raw_bound[1],
            size=n_samples * loss.n_classes,
        )
        y_true = np.arange(n_samples).astype(float) % loss.n_classes
    else:
        # If link is identity, we must respect the interval of y_pred:
        if isinstance(loss.link, IdentityLink):
            low, high = _inclusive_low_high(loss.interval_y_pred)
            low = np.amax([low, raw_bound[0]])
            high = np.amin([high, raw_bound[1]])
            raw_bound = (low, high)
        raw_prediction = rng.uniform(
            low=raw_bound[0], high=raw_bound[1], size=n_samples
        )
        # generate a y_true in valid range
        low, high = _inclusive_low_high(loss.interval_y_true)
        low = max(low, y_bound[0])
        high = min(high, y_bound[1])
        y_true = rng.uniform(low, high, size=n_samples)
        # set some values at special boundaries
        if loss.interval_y_true.low == 0 and loss.interval_y_true.low_inclusive:
            y_true[:: (n_samples // 3)] = 0
        if loss.interval_y_true.high == 1 and loss.interval_y_true.high_inclusive:
            y_true[1 :: (n_samples // 3)] = 1

    return y_true, raw_prediction


def numerical_derivative(func, x, eps):
    """Helper function for numerical (first) derivatives."""
    # For numerical derivatives, see
    # https://en.wikipedia.org/wiki/Numerical_differentiation
    # https://en.wikipedia.org/wiki/Finite_difference_coefficient
    # We use central finite differences of accuracy 4.
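    # With step size h = eps, this computes
    #     f'(x) ~ (-f(x + 2h) + 8 f(x + h) - 8 f(x - h) + f(x - 2h)) / (12 h)
    # whose truncation error is O(h**4).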
    h = np.full_like(x, fill_value=eps)
    f_minus_2h = func(x - 2 * h)
    f_minus_1h = func(x - h)
    f_plus_1h = func(x + h)
    f_plus_2h = func(x + 2 * h)
    return (-f_plus_2h + 8 * f_plus_1h - 8 * f_minus_1h + f_minus_2h) / (12.0 * eps)


@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name)
def test_loss_boundary(loss):
    """Test interval ranges of y_true and y_pred in losses."""
    # make sure low and high are always within the interval, used for linspace
    if loss.is_multiclass:
        y_true = np.linspace(0, 9, num=10)
    else:
        low, high = _inclusive_low_high(loss.interval_y_true)
        y_true = np.linspace(low, high, num=10)

    # add boundaries if they are included
    if loss.interval_y_true.low_inclusive:
        y_true = np.r_[y_true, loss.interval_y_true.low]
    if loss.interval_y_true.high_inclusive:
        y_true = np.r_[y_true, loss.interval_y_true.high]

    assert loss.in_y_true_range(y_true)

    n = y_true.shape[0]
    low, high = _inclusive_low_high(loss.interval_y_pred)
    if loss.is_multiclass:
        y_pred = np.empty((n, 3))
        y_pred[:, 0] = np.linspace(low, high, num=n)
        y_pred[:, 1] = 0.5 * (1 - y_pred[:, 0])
        y_pred[:, 2] = 0.5 * (1 - y_pred[:, 0])
    else:
        y_pred = np.linspace(low, high, num=n)

    assert loss.in_y_pred_range(y_pred)

    # calculating losses should not fail
    raw_prediction = loss.link.link(y_pred)
    loss.loss(y_true=y_true, raw_prediction=raw_prediction)


# Fixture to test valid value ranges.
Y_COMMON_PARAMS = [
    # (loss, [y success], [y fail])
    (HalfSquaredError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]),
    (AbsoluteError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]),
    (PinballLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]),
    (HuberLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]),
    (HalfPoissonLoss(), [0.1, 100], [-np.inf, -3, -0.1, np.inf]),
    (HalfGammaLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]),
    (HalfTweedieLoss(power=-3), [0.1, 100], [-np.inf, np.inf]),
    (HalfTweedieLoss(power=0), [0.1, 100], [-np.inf, np.inf]),
    (HalfTweedieLoss(power=1.5), [0.1, 100], [-np.inf, -3, -0.1, np.inf]),
    (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]),
    (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]),
    (HalfTweedieLossIdentity(power=-3), [0.1, 100], [-np.inf, np.inf]),
    (HalfTweedieLossIdentity(power=0), [-3, -0.1, 0, 0.1, 100], [-np.inf, np.inf]),
    (HalfTweedieLossIdentity(power=1.5), [0.1, 100], [-np.inf, -3, -0.1, np.inf]),
    (HalfTweedieLossIdentity(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]),
    (HalfTweedieLossIdentity(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]),
    (HalfBinomialLoss(), [0.1, 0.5, 0.9], [-np.inf, -1, 2, np.inf]),
    (HalfMultinomialLoss(), [], [-np.inf, -1, 1.1, np.inf]),
]
# y_pred and y_true do not always have the same domain (valid value range).
# Hence, we define extra sets of parameters for each of them.
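# For example, HalfPoissonLoss accepts y_true == 0 but requires y_pred > 0:
# below, 0 appears as a success value for y_true and a failure value for y_pred.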
Y_TRUE_PARAMS = [  # type: ignore
    # (loss, [y success], [y fail])
    (HalfPoissonLoss(), [0], []),
    (HuberLoss(), [0], []),
    (HalfTweedieLoss(power=-3), [-100, -0.1, 0], []),
    (HalfTweedieLoss(power=0), [-100, 0], []),
    (HalfTweedieLoss(power=1.5), [0], []),
    (HalfTweedieLossIdentity(power=-3), [-100, -0.1, 0], []),
    (HalfTweedieLossIdentity(power=0), [-100, 0], []),
    (HalfTweedieLossIdentity(power=1.5), [0], []),
    (HalfBinomialLoss(), [0, 1], []),
    (HalfMultinomialLoss(), [0.0, 1.0, 2], []),
]
Y_PRED_PARAMS = [
    # (loss, [y success], [y fail])
    (HalfPoissonLoss(), [], [0]),
    (HalfTweedieLoss(power=-3), [], [-3, -0.1, 0]),
    (HalfTweedieLoss(power=0), [], [-3, -0.1, 0]),
    (HalfTweedieLoss(power=1.5), [], [0]),
    (HalfTweedieLossIdentity(power=-3), [], [-3, -0.1, 0]),
    (HalfTweedieLossIdentity(power=0), [-3, -0.1, 0], []),
    (HalfTweedieLossIdentity(power=1.5), [], [0]),
    (HalfBinomialLoss(), [], [0, 1]),
    (HalfMultinomialLoss(), [0.1, 0.5], [0, 1]),
]


@pytest.mark.parametrize(
    "loss, y_true_success, y_true_fail", Y_COMMON_PARAMS + Y_TRUE_PARAMS
)
def test_loss_boundary_y_true(loss, y_true_success, y_true_fail):
    """Test boundaries of y_true for loss functions."""
    for y in y_true_success:
        assert loss.in_y_true_range(np.array([y]))
    for y in y_true_fail:
        assert not loss.in_y_true_range(np.array([y]))


@pytest.mark.parametrize(
    "loss, y_pred_success, y_pred_fail", Y_COMMON_PARAMS + Y_PRED_PARAMS  # type: ignore
)
def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail):
    """Test boundaries of y_pred for loss functions."""
    for y in y_pred_success:
        assert loss.in_y_pred_range(np.array([y]))
    for y in y_pred_fail:
        assert not loss.in_y_pred_range(np.array([y]))


@pytest.mark.parametrize(
    "loss, y_true, raw_prediction, loss_true, gradient_true, hessian_true",
    [
        (HalfSquaredError(), 1.0, 5.0, 8, 4, 1),
        (AbsoluteError(), 1.0, 5.0, 4.0, 1.0, None),
        (PinballLoss(quantile=0.5), 1.0, 5.0, 2, 0.5, None),
        (PinballLoss(quantile=0.25), 1.0, 5.0, 4 * (1 - 0.25), 1 - 0.25, None),
        (PinballLoss(quantile=0.25), 5.0, 1.0, 4 * 0.25, -0.25, None),
        (HuberLoss(quantile=0.5, delta=3), 1.0, 5.0, 3 * (4 - 3 / 2), None, None),
        (HuberLoss(quantile=0.5, delta=3), 1.0, 3.0, 0.5 * 2**2, None, None),
        (HalfPoissonLoss(), 2.0, np.log(4), 4 - 2 * np.log(4), 4 - 2, 4),
        (HalfGammaLoss(), 2.0, np.log(4), np.log(4) + 2 / 4, 1 - 2 / 4, 2 / 4),
        (HalfTweedieLoss(power=3), 2.0, np.log(4), -1 / 4 + 1 / 4**2, None, None),
        (HalfTweedieLossIdentity(power=1), 2.0, 4.0, 2 - 2 * np.log(2), None, None),
        (HalfTweedieLossIdentity(power=2), 2.0, 4.0, np.log(2) - 1 / 2, None, None),
        (
            HalfTweedieLossIdentity(power=3),
            2.0,
            4.0,
            -1 / 4 + 1 / 4**2 + 1 / 2 / 2,
            None,
            None,
        ),
        (
            HalfBinomialLoss(),
            0.25,
            np.log(4),
            np.log1p(4) - 0.25 * np.log(4),
            None,
            None,
        ),
        # Extreme log loss cases, checked with mpmath:
        # import mpmath as mp
        #
        # # Stolen from scipy
        # def mpf2float(x):
        #     return float(mp.nstr(x, 17, min_fixed=0, max_fixed=0))
        #
        # def mp_logloss(y_true, raw):
        #     with mp.workdps(100):
        #         y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw))
        #         out = mp.log1p(mp.exp(raw)) - y_true * raw
        #     return mpf2float(out)
        #
        # def mp_gradient(y_true, raw):
        #     with mp.workdps(100):
        #         y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw))
        #         out = mp.mpf(1) / (mp.mpf(1) + mp.exp(-raw)) - y_true
        #     return mpf2float(out)
        #
        # def mp_hessian(y_true, raw):
        #     with mp.workdps(100):
        #         y_true, raw = mp.mpf(float(y_true)), mp.mpf(float(raw))
        #         p = mp.mpf(1) / (mp.mpf(1) + mp.exp(-raw))
        #         out = p * (mp.mpf(1) - p)
        #     return mpf2float(out)
        #
        # y, raw = 0.0, 37.
        # mp_logloss(y, raw), mp_gradient(y, raw), mp_hessian(y, raw)
        (HalfBinomialLoss(), 0.0, -1e20, 0, 0, 0),
        (HalfBinomialLoss(), 1.0, -1e20, 1e20, -1, 0),
        (HalfBinomialLoss(), 0.0, -1e3, 0, 0, 0),
        (HalfBinomialLoss(), 1.0, -1e3, 1e3, -1, 0),
        (HalfBinomialLoss(), 1.0, -37.5, 37.5, -1, 0),
        (HalfBinomialLoss(), 1.0, -37.0, 37, 1e-16 - 1, 8.533047625744065e-17),
        (HalfBinomialLoss(), 0.0, -37.0, *[8.533047625744065e-17] * 3),
        (HalfBinomialLoss(), 1.0, -36.9, 36.9, 1e-16 - 1, 9.430476078526806e-17),
        (HalfBinomialLoss(), 0.0, -36.9, *[9.430476078526806e-17] * 3),
        (HalfBinomialLoss(), 0.0, 37.0, 37, 1 - 1e-16, 8.533047625744065e-17),
        (HalfBinomialLoss(), 1.0, 37.0, *[8.533047625744066e-17] * 3),
        (HalfBinomialLoss(), 0.0, 37.5, 37.5, 1, 5.175555005801868e-17),
        (HalfBinomialLoss(), 0.0, 232.8, 232.8, 1, 1.4287342391028437e-101),
        (HalfBinomialLoss(), 1.0, 1e20, 0, 0, 0),
        (HalfBinomialLoss(), 0.0, 1e20, 1e20, 1, 0),
        (
            HalfBinomialLoss(),
            1.0,
            232.8,
            0,
            -1.4287342391028437e-101,
            1.4287342391028437e-101,
        ),
        (HalfBinomialLoss(), 1.0, 232.9, 0, 0, 0),
        (HalfBinomialLoss(), 1.0, 1e3, 0, 0, 0),
        (HalfBinomialLoss(), 0.0, 1e3, 1e3, 1, 0),
        (
            HalfMultinomialLoss(n_classes=3),
            0.0,
            [0.2, 0.5, 0.3],
            logsumexp([0.2, 0.5, 0.3]) - 0.2,
            None,
            None,
        ),
        (
            HalfMultinomialLoss(n_classes=3),
            1.0,
            [0.2, 0.5, 0.3],
            logsumexp([0.2, 0.5, 0.3]) - 0.5,
            None,
            None,
        ),
        (
            HalfMultinomialLoss(n_classes=3),
            2.0,
            [0.2, 0.5, 0.3],
            logsumexp([0.2, 0.5, 0.3]) - 0.3,
            None,
            None,
        ),
        (
            HalfMultinomialLoss(n_classes=3),
            2.0,
            [1e4, 0, 7e-7],
            logsumexp([1e4, 0, 7e-7]) - (7e-7),
            None,
            None,
        ),
    ],
    ids=loss_instance_name,
)
def test_loss_on_specific_values(
    loss, y_true, raw_prediction, loss_true, gradient_true, hessian_true
):
    """Test losses, gradients and hessians at specific values."""
    loss1 = loss(y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction]))
    grad1 = loss.gradient(
        y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction])
    )
    loss2, grad2 = loss.loss_gradient(
        y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction])
    )
    grad3, hess = loss.gradient_hessian(
        y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction])
    )

    assert loss1 == approx(loss_true, rel=1e-15, abs=1e-15)
    assert loss2 == approx(loss_true, rel=1e-15, abs=1e-15)

    if gradient_true is not None:
        assert grad1 == approx(gradient_true, rel=1e-15, abs=1e-15)
        assert grad2 == approx(gradient_true, rel=1e-15, abs=1e-15)
        assert grad3 == approx(gradient_true, rel=1e-15, abs=1e-15)

    if hessian_true is not None:
        assert hess == approx(hessian_true, rel=1e-15, abs=1e-15)


@pytest.mark.parametrize("loss", ALL_LOSSES)
@pytest.mark.parametrize("readonly_memmap", [False, True])
@pytest.mark.parametrize("dtype_in", [np.float32, np.float64])
@pytest.mark.parametrize("dtype_out", [np.float32, np.float64])
@pytest.mark.parametrize("sample_weight", [None, 1])
@pytest.mark.parametrize("out1", [None, 1])
@pytest.mark.parametrize("out2", [None, 1])
@pytest.mark.parametrize("n_threads", [1, 2])
def test_loss_dtype(
    loss, readonly_memmap, dtype_in, dtype_out, sample_weight, out1, out2, n_threads
):
    """Test acceptance of dtypes, readonly and writeable arrays in loss functions.

    Check that the loss accepts input arrays that are either all float32 or
    all float64, and output arrays that are either all float32 or all float64.
    Also check that input arrays can be readonly, e.g. memory mapped.
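
    Since dtype_in and dtype_out are parametrized independently, mixed
    input/output precision (float32 inputs with float64 outputs and vice
    versa) is exercised as well.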
""" if _IS_WASM and readonly_memmap: # pragma: nocover pytest.xfail(reason="memmap not fully supported") loss = loss() # generate a y_true and raw_prediction in valid range n_samples = 5 y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, n_samples=n_samples, y_bound=(-100, 100), raw_bound=(-10, 10), seed=42, ) y_true = y_true.astype(dtype_in) raw_prediction = raw_prediction.astype(dtype_in) if sample_weight is not None: sample_weight = np.array([2.0] * n_samples, dtype=dtype_in) if out1 is not None: out1 = np.empty_like(y_true, dtype=dtype_out) if out2 is not None: out2 = np.empty_like(raw_prediction, dtype=dtype_out) if readonly_memmap: y_true = create_memmap_backed_data(y_true) raw_prediction = create_memmap_backed_data(raw_prediction) if sample_weight is not None: sample_weight = create_memmap_backed_data(sample_weight) loss.loss( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, loss_out=out1, n_threads=n_threads, ) loss.gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, gradient_out=out2, n_threads=n_threads, ) loss.loss_gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, loss_out=out1, gradient_out=out2, n_threads=n_threads, ) if out1 is not None and loss.is_multiclass: out1 = np.empty_like(raw_prediction, dtype=dtype_out) loss.gradient_hessian( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, gradient_out=out1, hessian_out=out2, n_threads=n_threads, ) loss(y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight) loss.fit_intercept_only(y_true=y_true, sample_weight=sample_weight) loss.constant_to_optimal_zero(y_true=y_true, sample_weight=sample_weight) if hasattr(loss, "predict_proba"): loss.predict_proba(raw_prediction=raw_prediction) if hasattr(loss, "gradient_proba"): loss.gradient_proba( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, gradient_out=out1, proba_out=out2, n_threads=n_threads, ) @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_loss_same_as_C_functions(loss, sample_weight): """Test that Python and Cython functions return same results.""" y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, n_samples=20, y_bound=(-100, 100), raw_bound=(-10, 10), seed=42, ) if sample_weight == "range": sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) out_l1 = np.empty_like(y_true) out_l2 = np.empty_like(y_true) out_g1 = np.empty_like(raw_prediction) out_g2 = np.empty_like(raw_prediction) out_h1 = np.empty_like(raw_prediction) out_h2 = np.empty_like(raw_prediction) loss.loss( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, loss_out=out_l1, ) loss.closs.loss( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, loss_out=out_l2, ), assert_allclose(out_l1, out_l2) loss.gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, gradient_out=out_g1, ) loss.closs.gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, gradient_out=out_g2, ) assert_allclose(out_g1, out_g2) loss.closs.loss_gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, loss_out=out_l1, gradient_out=out_g1, ) loss.closs.loss_gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, loss_out=out_l2, gradient_out=out_g2, ) assert_allclose(out_l1, out_l2) 
    assert_allclose(out_g1, out_g2)
    loss.gradient_hessian(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        gradient_out=out_g1,
        hessian_out=out_h1,
    )
    loss.closs.gradient_hessian(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        gradient_out=out_g2,
        hessian_out=out_h2,
    )
    assert_allclose(out_g1, out_g2)
    assert_allclose(out_h1, out_h2)


@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name)
@pytest.mark.parametrize("sample_weight", [None, "range"])
def test_loss_gradients_are_the_same(loss, sample_weight, global_random_seed):
    """Test that loss and gradient are the same across different functions.

    Also test that output arguments contain correct results.
    """
    y_true, raw_prediction = random_y_true_raw_prediction(
        loss=loss,
        n_samples=20,
        y_bound=(-100, 100),
        raw_bound=(-10, 10),
        seed=global_random_seed,
    )
    if sample_weight == "range":
        sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0])

    out_l1 = np.empty_like(y_true)
    out_l2 = np.empty_like(y_true)
    out_g1 = np.empty_like(raw_prediction)
    out_g2 = np.empty_like(raw_prediction)
    out_g3 = np.empty_like(raw_prediction)
    out_h3 = np.empty_like(raw_prediction)

    l1 = loss.loss(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        loss_out=out_l1,
    )
    g1 = loss.gradient(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        gradient_out=out_g1,
    )
    l2, g2 = loss.loss_gradient(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        loss_out=out_l2,
        gradient_out=out_g2,
    )
    g3, h3 = loss.gradient_hessian(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        gradient_out=out_g3,
        hessian_out=out_h3,
    )
    assert_allclose(l1, l2)
    assert_array_equal(l1, out_l1)
    assert np.shares_memory(l1, out_l1)
    assert_array_equal(l2, out_l2)
    assert np.shares_memory(l2, out_l2)
    assert_allclose(g1, g2)
    assert_allclose(g1, g3)
    assert_array_equal(g1, out_g1)
    assert np.shares_memory(g1, out_g1)
    assert_array_equal(g2, out_g2)
    assert np.shares_memory(g2, out_g2)
    assert_array_equal(g3, out_g3)
    assert np.shares_memory(g3, out_g3)

    if hasattr(loss, "gradient_proba"):
        assert loss.is_multiclass  # only for HalfMultinomialLoss
        out_g4 = np.empty_like(raw_prediction)
        out_proba = np.empty_like(raw_prediction)
        g4, proba = loss.gradient_proba(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=out_g4,
            proba_out=out_proba,
        )
        assert_allclose(g1, out_g4)
        assert_allclose(g1, g4)
        assert_allclose(proba, out_proba)
        assert_allclose(np.sum(proba, axis=1), 1, rtol=1e-11)


@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name)
@pytest.mark.parametrize("sample_weight", ["ones", "random"])
def test_sample_weight_multiplies(loss, sample_weight, global_random_seed):
    """Test sample weights in loss, gradients and hessians.

    Make sure that passing sample weights to loss, gradient and hessian
    computation methods is equivalent to multiplying by the weights.
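
    In other words, loss(..., sample_weight=sw) should equal
    sw * loss(..., sample_weight=None) elementwise, and likewise for gradients
    and hessians (broadcast per class in the multiclass case).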
""" n_samples = 100 y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, n_samples=n_samples, y_bound=(-100, 100), raw_bound=(-5, 5), seed=global_random_seed, ) if sample_weight == "ones": sample_weight = np.ones(shape=n_samples, dtype=np.float64) else: rng = np.random.RandomState(global_random_seed) sample_weight = rng.normal(size=n_samples).astype(np.float64) assert_allclose( loss.loss( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, ), sample_weight * loss.loss( y_true=y_true, raw_prediction=raw_prediction, sample_weight=None, ), ) losses, gradient = loss.loss_gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=None, ) losses_sw, gradient_sw = loss.loss_gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, ) assert_allclose(losses * sample_weight, losses_sw) if not loss.is_multiclass: assert_allclose(gradient * sample_weight, gradient_sw) else: assert_allclose(gradient * sample_weight[:, None], gradient_sw) gradient, hessian = loss.gradient_hessian( y_true=y_true, raw_prediction=raw_prediction, sample_weight=None, ) gradient_sw, hessian_sw = loss.gradient_hessian( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, ) if not loss.is_multiclass: assert_allclose(gradient * sample_weight, gradient_sw) assert_allclose(hessian * sample_weight, hessian_sw) else: assert_allclose(gradient * sample_weight[:, None], gradient_sw) assert_allclose(hessian * sample_weight[:, None], hessian_sw) @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) def test_graceful_squeezing(loss): """Test that reshaped raw_prediction gives same results.""" y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, n_samples=20, y_bound=(-100, 100), raw_bound=(-10, 10), seed=42, ) if raw_prediction.ndim == 1: raw_prediction_2d = raw_prediction[:, None] assert_allclose( loss.loss(y_true=y_true, raw_prediction=raw_prediction_2d), loss.loss(y_true=y_true, raw_prediction=raw_prediction), ) assert_allclose( loss.loss_gradient(y_true=y_true, raw_prediction=raw_prediction_2d), loss.loss_gradient(y_true=y_true, raw_prediction=raw_prediction), ) assert_allclose( loss.gradient(y_true=y_true, raw_prediction=raw_prediction_2d), loss.gradient(y_true=y_true, raw_prediction=raw_prediction), ) assert_allclose( loss.gradient_hessian(y_true=y_true, raw_prediction=raw_prediction_2d), loss.gradient_hessian(y_true=y_true, raw_prediction=raw_prediction), ) @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_loss_of_perfect_prediction(loss, sample_weight): """Test value of perfect predictions. Loss of y_pred = y_true plus constant_to_optimal_zero should sums up to zero. """ if not loss.is_multiclass: # Use small values such that exp(value) is not nan. 
        raw_prediction = np.array([-10, -0.1, 0, 0.1, 3, 10])
        # If link is identity, we must respect the interval of y_pred:
        if isinstance(loss.link, IdentityLink):
            eps = 1e-10
            low = loss.interval_y_pred.low
            if not loss.interval_y_pred.low_inclusive:
                low = low + eps
            high = loss.interval_y_pred.high
            if not loss.interval_y_pred.high_inclusive:
                high = high - eps
            raw_prediction = np.clip(raw_prediction, low, high)
        y_true = loss.link.inverse(raw_prediction)
    else:
        # HalfMultinomialLoss
        y_true = np.arange(loss.n_classes).astype(float)
        # raw_prediction with entries -exp(10), but +exp(10) on the diagonal
        # this is close enough to np.inf which would produce nan
        raw_prediction = np.full(
            shape=(loss.n_classes, loss.n_classes),
            fill_value=-np.exp(10),
            dtype=float,
        )
        raw_prediction.flat[:: loss.n_classes + 1] = np.exp(10)

    if sample_weight == "range":
        sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0])

    loss_value = loss.loss(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
    )
    constant_term = loss.constant_to_optimal_zero(
        y_true=y_true, sample_weight=sample_weight
    )
    # Comparing loss_value + constant_term to zero would result in large
    # round-off errors.
    assert_allclose(loss_value, -constant_term, atol=1e-14, rtol=1e-15)


@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name)
@pytest.mark.parametrize("sample_weight", [None, "range"])
def test_gradients_hessians_numerically(loss, sample_weight, global_random_seed):
    """Test gradients and hessians with numerical derivatives.

    Gradients should equal the numerical derivatives of the loss function.
    Hessians should equal the numerical derivatives of gradients.
    """
    n_samples = 20
    y_true, raw_prediction = random_y_true_raw_prediction(
        loss=loss,
        n_samples=n_samples,
        y_bound=(-100, 100),
        raw_bound=(-5, 5),
        seed=global_random_seed,
    )
    if sample_weight == "range":
        sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0])

    g, h = loss.gradient_hessian(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
    )
    assert g.shape == raw_prediction.shape
    assert h.shape == raw_prediction.shape

    if not loss.is_multiclass:

        def loss_func(x):
            return loss.loss(
                y_true=y_true,
                raw_prediction=x,
                sample_weight=sample_weight,
            )

        g_numeric = numerical_derivative(loss_func, raw_prediction, eps=1e-6)
        assert_allclose(g, g_numeric, rtol=5e-6, atol=1e-10)

        def grad_func(x):
            return loss.gradient(
                y_true=y_true,
                raw_prediction=x,
                sample_weight=sample_weight,
            )

        h_numeric = numerical_derivative(grad_func, raw_prediction, eps=1e-6)
        if loss.approx_hessian:
            # TODO: What could we test if loss.approx_hessian?
            pass
        else:
            assert_allclose(h, h_numeric, rtol=5e-6, atol=1e-10)
    else:
        # For multiclass loss, we should only change the predictions of the
        # class for which the derivative is taken, e.g. offset[:, k] = eps
        # for class k.
        # As a softmax is computed, offsetting the whole array by a constant
        # would have no effect on the probabilities, and thus on the loss.
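        # This is because softmax is shift invariant: for any constant c,
        #     exp(x_k + c) / sum_j exp(x_j + c) = exp(x_k) / sum_j exp(x_j),
        # i.e. softmax(x + c) = softmax(x).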
        for k in range(loss.n_classes):

            def loss_func(x):
                raw = raw_prediction.copy()
                raw[:, k] = x
                return loss.loss(
                    y_true=y_true,
                    raw_prediction=raw,
                    sample_weight=sample_weight,
                )

            g_numeric = numerical_derivative(loss_func, raw_prediction[:, k], eps=1e-5)
            assert_allclose(g[:, k], g_numeric, rtol=5e-6, atol=1e-10)

            def grad_func(x):
                raw = raw_prediction.copy()
                raw[:, k] = x
                return loss.gradient(
                    y_true=y_true,
                    raw_prediction=raw,
                    sample_weight=sample_weight,
                )[:, k]

            h_numeric = numerical_derivative(grad_func, raw_prediction[:, k], eps=1e-6)
            if loss.approx_hessian:
                # TODO: What could we test if loss.approx_hessian?
                pass
            else:
                assert_allclose(h[:, k], h_numeric, rtol=5e-6, atol=1e-10)


@pytest.mark.parametrize(
    "loss, x0, y_true",
    [
        ("squared_error", -2.0, 42),
        ("squared_error", 117.0, 1.05),
        ("squared_error", 0.0, 0.0),
        # The argmin of binomial_loss for y_true=0 and y_true=1 is resp.
        # -inf and +inf due to logit, cf. "complete separation". Therefore, we
        # use 0 < y_true < 1.
        ("binomial_loss", 0.3, 0.1),
        ("binomial_loss", -12, 0.2),
        ("binomial_loss", 30, 0.9),
        ("poisson_loss", 12.0, 1.0),
        ("poisson_loss", 0.0, 2.0),
        ("poisson_loss", -22.0, 10.0),
    ],
)
@skip_if_32bit
def test_derivatives(loss, x0, y_true):
    """Test that gradients are zero at the minimum of the loss.

    We check this on a single value/sample using Halley's method with the
    first and second order derivatives computed by the Loss instance.
    Note that methods of Loss instances operate on arrays while the newton
    root finder expects a scalar or a one-element array for this purpose.
    """
    loss = _LOSSES[loss](sample_weight=None)
    y_true = np.array([y_true], dtype=np.float64)
    x0 = np.array([x0], dtype=np.float64)

    def func(x: np.ndarray) -> np.ndarray:
        """Compute loss plus constant term.

        The constant term is such that the minimum function value is zero,
        which is required by the Newton method.
        """
        return loss.loss(
            y_true=y_true, raw_prediction=x
        ) + loss.constant_to_optimal_zero(y_true=y_true)

    def fprime(x: np.ndarray) -> np.ndarray:
        return loss.gradient(y_true=y_true, raw_prediction=x)

    def fprime2(x: np.ndarray) -> np.ndarray:
        return loss.gradient_hessian(y_true=y_true, raw_prediction=x)[1]

    optimum = newton(
        func,
        x0=x0,
        fprime=fprime,
        fprime2=fprime2,
        maxiter=100,
        tol=5e-8,
    )

    # Need to ravel arrays because assert_allclose requires matching
    # dimensions.
    y_true = y_true.ravel()
    optimum = optimum.ravel()
    assert_allclose(loss.link.inverse(optimum), y_true)
    assert_allclose(func(optimum), 0, atol=1e-14)
    assert_allclose(loss.gradient(y_true=y_true, raw_prediction=optimum), 0, atol=5e-7)


@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name)
@pytest.mark.parametrize("sample_weight", [None, "range"])
def test_loss_intercept_only(loss, sample_weight):
    """Test that fit_intercept_only returns the argmin of the loss.

    Also test that the gradient is zero at the minimum.
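
    The reference argmin is computed numerically: minimize_scalar for
    single-output losses and, in the multiclass case, minimize with a
    sum-to-zero constraint on the raw predictions.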
""" n_samples = 50 if not loss.is_multiclass: y_true = loss.link.inverse(np.linspace(-4, 4, num=n_samples)) else: y_true = np.arange(n_samples).astype(np.float64) % loss.n_classes y_true[::5] = 0 # exceedance of class 0 if sample_weight == "range": sample_weight = np.linspace(0.1, 2, num=n_samples) a = loss.fit_intercept_only(y_true=y_true, sample_weight=sample_weight) # find minimum by optimization def fun(x): if not loss.is_multiclass: raw_prediction = np.full(shape=(n_samples), fill_value=x) else: raw_prediction = np.ascontiguousarray( np.broadcast_to(x, shape=(n_samples, loss.n_classes)) ) return loss( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, ) if not loss.is_multiclass: opt = minimize_scalar(fun, tol=1e-7, options={"maxiter": 100}) grad = loss.gradient( y_true=y_true, raw_prediction=np.full_like(y_true, a), sample_weight=sample_weight, ) assert a.shape == tuple() # scalar assert a.dtype == y_true.dtype assert_all_finite(a) a == approx(opt.x, rel=1e-7) grad.sum() == approx(0, abs=1e-12) else: # The constraint corresponds to sum(raw_prediction) = 0. Without it, we would # need to apply loss.symmetrize_raw_prediction to opt.x before comparing. opt = minimize( fun, np.zeros((loss.n_classes)), tol=1e-13, options={"maxiter": 100}, method="SLSQP", constraints=LinearConstraint(np.ones((1, loss.n_classes)), 0, 0), ) grad = loss.gradient( y_true=y_true, raw_prediction=np.tile(a, (n_samples, 1)), sample_weight=sample_weight, ) assert a.dtype == y_true.dtype assert_all_finite(a) assert_allclose(a, opt.x, rtol=5e-6, atol=1e-12) assert_allclose(grad.sum(axis=0), 0, atol=1e-12) @pytest.mark.parametrize( "loss, func, random_dist", [ (HalfSquaredError(), np.mean, "normal"), (AbsoluteError(), np.median, "normal"), (PinballLoss(quantile=0.25), lambda x: np.percentile(x, q=25), "normal"), (HalfPoissonLoss(), np.mean, "poisson"), (HalfGammaLoss(), np.mean, "exponential"), (HalfTweedieLoss(), np.mean, "exponential"), (HalfBinomialLoss(), np.mean, "binomial"), ], ) def test_specific_fit_intercept_only(loss, func, random_dist, global_random_seed): """Test that fit_intercept_only returns the correct functional. We test the functional for specific, meaningful distributions, e.g. squared error estimates the expectation of a probability distribution. """ rng = np.random.RandomState(global_random_seed) if random_dist == "binomial": y_train = rng.binomial(1, 0.5, size=100) else: y_train = getattr(rng, random_dist)(size=100) baseline_prediction = loss.fit_intercept_only(y_true=y_train) # Make sure baseline prediction is the expected functional=func, e.g. mean # or median. 
    assert_all_finite(baseline_prediction)
    assert baseline_prediction == approx(loss.link.link(func(y_train)))
    assert loss.link.inverse(baseline_prediction) == approx(func(y_train))
    if isinstance(loss.link, IdentityLink):
        assert_allclose(loss.link.inverse(baseline_prediction), baseline_prediction)

    # Test baseline at boundary
    if loss.interval_y_true.low_inclusive:
        y_train.fill(loss.interval_y_true.low)
        baseline_prediction = loss.fit_intercept_only(y_true=y_train)
        assert_all_finite(baseline_prediction)
    if loss.interval_y_true.high_inclusive:
        y_train.fill(loss.interval_y_true.high)
        baseline_prediction = loss.fit_intercept_only(y_true=y_train)
        assert_all_finite(baseline_prediction)


def test_multinomial_loss_fit_intercept_only():
    """Test that fit_intercept_only returns the mean functional for CCE."""
    rng = np.random.RandomState(0)
    n_classes = 4
    loss = HalfMultinomialLoss(n_classes=n_classes)
    # Same logic as test_specific_fit_intercept_only. Here inverse link
    # function = softmax and link function = log - symmetry term.
    y_train = rng.randint(0, n_classes + 1, size=100).astype(np.float64)
    baseline_prediction = loss.fit_intercept_only(y_true=y_train)
    assert baseline_prediction.shape == (n_classes,)
    p = np.zeros(n_classes, dtype=y_train.dtype)
    for k in range(n_classes):
        p[k] = (y_train == k).mean()
    assert_allclose(baseline_prediction, np.log(p) - np.mean(np.log(p)))
    assert_allclose(baseline_prediction[None, :], loss.link.link(p[None, :]))

    for y_train in (np.zeros(shape=10), np.ones(shape=10)):
        y_train = y_train.astype(np.float64)
        baseline_prediction = loss.fit_intercept_only(y_true=y_train)
        assert baseline_prediction.dtype == y_train.dtype
        assert_all_finite(baseline_prediction)


def test_binomial_and_multinomial_loss(global_random_seed):
    """Test that multinomial loss with n_classes = 2 is the same as binomial loss."""
    rng = np.random.RandomState(global_random_seed)
    n_samples = 20
    binom = HalfBinomialLoss()
    multinom = HalfMultinomialLoss(n_classes=2)
    y_train = rng.randint(0, 2, size=n_samples).astype(np.float64)
    raw_prediction = rng.normal(size=n_samples)
    raw_multinom = np.empty((n_samples, 2))
    raw_multinom[:, 0] = -0.5 * raw_prediction
    raw_multinom[:, 1] = 0.5 * raw_prediction
    assert_allclose(
        binom.loss(y_true=y_train, raw_prediction=raw_prediction),
        multinom.loss(y_true=y_train, raw_prediction=raw_multinom),
    )


@pytest.mark.parametrize("y_true", (np.array([0.0, 0, 0]), np.array([1.0, 1, 1])))
@pytest.mark.parametrize("y_pred", (np.array([-5.0, -5, -5]), np.array([3.0, 3, 3])))
def test_binomial_vs_alternative_formulation(y_true, y_pred, global_dtype):
    """Test that both formulations of the binomial deviance agree.

    Often, the binomial deviance or log loss is written in terms of a variable
    z in {-1, +1}, but we use y in {0, 1}, hence z = 2 * y - 1.

    ESL II Eq. (10.18):

        -loglike(z, f) = log(1 + exp(-2 * z * f))

    Note:
    - ESL 2*f = raw_prediction, hence the factor 2 of ESL disappears.
    - Deviance = -2*loglike + .., but HalfBinomialLoss is half of the
      deviance, hence the factor of 2 cancels in the comparison.
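
    As a quick consistency check, with f = raw_prediction:
    - y=1 (z=+1): log(1 + exp(-f)) = log1p(exp(f)) - f
    - y=0 (z=-1): log(1 + exp(f)) = log1p(exp(f))
    which matches HalfBinomialLoss = log1p(exp(f)) - y * f.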
""" def alt_loss(y, raw_pred): z = 2 * y - 1 return np.mean(np.log(1 + np.exp(-z * raw_pred))) def alt_gradient(y, raw_pred): # alternative gradient formula according to ESL z = 2 * y - 1 return -z / (1 + np.exp(z * raw_pred)) bin_loss = HalfBinomialLoss() y_true = y_true.astype(global_dtype) y_pred = y_pred.astype(global_dtype) datum = (y_true, y_pred) assert bin_loss(*datum) == approx(alt_loss(*datum)) assert_allclose(bin_loss.gradient(*datum), alt_gradient(*datum)) @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) def test_predict_proba(loss, global_random_seed): """Test that predict_proba and gradient_proba work as expected.""" n_samples = 20 y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, n_samples=n_samples, y_bound=(-100, 100), raw_bound=(-5, 5), seed=global_random_seed, ) if hasattr(loss, "predict_proba"): proba = loss.predict_proba(raw_prediction) assert proba.shape == (n_samples, loss.n_classes) assert np.sum(proba, axis=1) == approx(1, rel=1e-11) if hasattr(loss, "gradient_proba"): for grad, proba in ( (None, None), (None, np.empty_like(raw_prediction)), (np.empty_like(raw_prediction), None), (np.empty_like(raw_prediction), np.empty_like(raw_prediction)), ): grad, proba = loss.gradient_proba( y_true=y_true, raw_prediction=raw_prediction, sample_weight=None, gradient_out=grad, proba_out=proba, ) assert proba.shape == (n_samples, loss.n_classes) assert np.sum(proba, axis=1) == approx(1, rel=1e-11) assert_allclose( grad, loss.gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=None, gradient_out=None, ), ) @pytest.mark.parametrize("loss", ALL_LOSSES) @pytest.mark.parametrize("sample_weight", [None, "range"]) @pytest.mark.parametrize("dtype", (np.float32, np.float64)) @pytest.mark.parametrize("order", ("C", "F")) def test_init_gradient_and_hessians(loss, sample_weight, dtype, order): """Test that init_gradient_and_hessian works as expected. passing sample_weight to a loss correctly influences the constant_hessian attribute, and consequently the shape of the hessian array. 
""" n_samples = 5 if sample_weight == "range": sample_weight = np.ones(n_samples) loss = loss(sample_weight=sample_weight) gradient, hessian = loss.init_gradient_and_hessian( n_samples=n_samples, dtype=dtype, order=order, ) if loss.constant_hessian: assert gradient.shape == (n_samples,) assert hessian.shape == (1,) elif loss.is_multiclass: assert gradient.shape == (n_samples, loss.n_classes) assert hessian.shape == (n_samples, loss.n_classes) else: assert hessian.shape == (n_samples,) assert hessian.shape == (n_samples,) assert gradient.dtype == dtype assert hessian.dtype == dtype if order == "C": assert gradient.flags.c_contiguous assert hessian.flags.c_contiguous else: assert gradient.flags.f_contiguous assert hessian.flags.f_contiguous @pytest.mark.parametrize("loss", ALL_LOSSES) @pytest.mark.parametrize( "params, err_msg", [ ( {"dtype": np.int64}, f"Valid options for 'dtype' are .* Got dtype={np.int64} instead.", ), ], ) def test_init_gradient_and_hessian_raises(loss, params, err_msg): """Test that init_gradient_and_hessian raises errors for invalid input.""" loss = loss() with pytest.raises((ValueError, TypeError), match=err_msg): gradient, hessian = loss.init_gradient_and_hessian(n_samples=5, **params) @pytest.mark.parametrize( "loss, params, err_type, err_msg", [ ( PinballLoss, {"quantile": None}, TypeError, "quantile must be an instance of float, not NoneType.", ), ( PinballLoss, {"quantile": 0}, ValueError, "quantile == 0, must be > 0.", ), (PinballLoss, {"quantile": 1.1}, ValueError, "quantile == 1.1, must be < 1."), ( HuberLoss, {"quantile": None}, TypeError, "quantile must be an instance of float, not NoneType.", ), ( HuberLoss, {"quantile": 0}, ValueError, "quantile == 0, must be > 0.", ), (HuberLoss, {"quantile": 1.1}, ValueError, "quantile == 1.1, must be < 1."), ], ) def test_loss_init_parameter_validation(loss, params, err_type, err_msg): """Test that loss raises errors for invalid input.""" with pytest.raises(err_type, match=err_msg): loss(**params) @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) def test_loss_pickle(loss): """Test that losses can be pickled.""" n_samples = 20 y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, n_samples=n_samples, y_bound=(-100, 100), raw_bound=(-5, 5), seed=42, ) pickled_loss = pickle.dumps(loss) unpickled_loss = pickle.loads(pickled_loss) assert loss(y_true=y_true, raw_prediction=raw_prediction) == approx( unpickled_loss(y_true=y_true, raw_prediction=raw_prediction) ) @pytest.mark.parametrize("p", [-1.5, 0, 1, 1.5, 2, 3]) def test_tweedie_log_identity_consistency(p): """Test for identical losses when only the link function is different.""" half_tweedie_log = HalfTweedieLoss(power=p) half_tweedie_identity = HalfTweedieLossIdentity(power=p) n_samples = 10 y_true, raw_prediction = random_y_true_raw_prediction( loss=half_tweedie_log, n_samples=n_samples, seed=42 ) y_pred = half_tweedie_log.link.inverse(raw_prediction) # exp(raw_prediction) # Let's compare the loss values, up to some constant term that is dropped # in HalfTweedieLoss but not in HalfTweedieLossIdentity. loss_log = half_tweedie_log.loss( y_true=y_true, raw_prediction=raw_prediction ) + half_tweedie_log.constant_to_optimal_zero(y_true) loss_identity = half_tweedie_identity.loss( y_true=y_true, raw_prediction=y_pred ) + half_tweedie_identity.constant_to_optimal_zero(y_true) # Note that HalfTweedieLoss ignores different constant terms than # HalfTweedieLossIdentity. Constant terms means terms not depending on # raw_prediction. 
    # By adding these terms, `constant_to_optimal_zero`, both losses give the
    # same values.
    assert_allclose(loss_log, loss_identity)

    # For gradients and hessians, the constant terms do not matter. We have,
    # however, to account for the chain rule, i.e. with x=raw_prediction
    #     gradient_log(x) = d/dx loss_log(x)
    #                     = d/dx loss_identity(exp(x))
    #                     = exp(x) * gradient_identity(exp(x))
    # Similarly,
    #     hessian_log(x) = exp(x) * gradient_identity(exp(x))
    #                    + exp(x)**2 * hessian_identity(exp(x))
    gradient_log, hessian_log = half_tweedie_log.gradient_hessian(
        y_true=y_true, raw_prediction=raw_prediction
    )
    gradient_identity, hessian_identity = half_tweedie_identity.gradient_hessian(
        y_true=y_true, raw_prediction=y_pred
    )
    assert_allclose(gradient_log, y_pred * gradient_identity)
    assert_allclose(
        hessian_log, y_pred * gradient_identity + y_pred**2 * hessian_identity
    )
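

# Illustrative sketch, not tied to any particular loss: a minimal sanity check
# that the accuracy-4 central difference in `numerical_derivative` recovers the
# exact derivative d/dx sin(x) = cos(x) up to floating point rounding. The test
# name is hypothetical; it only exercises the helper defined at the top of this
# file.
def test_numerical_derivative_on_known_function():
    x = np.linspace(-2.0, 2.0, num=9)
    d_numeric = numerical_derivative(np.sin, x, eps=1e-4)
    assert_allclose(d_numeric, np.cos(x), rtol=1e-8)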