"""
Testing for the gradient boosting module (sklearn.ensemble.gradient_boosting).
"""
import re
import warnings

import numpy as np
import pytest
from numpy.testing import assert_allclose

from sklearn import datasets
from sklearn.base import clone
from sklearn.datasets import make_classification, make_regression
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble._gb import _safe_divide
from sklearn.ensemble._gradient_boosting import predict_stages
from sklearn.exceptions import DataConversionWarning, NotFittedError
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.svm import NuSVR
from sklearn.utils import check_random_state, tosequence
from sklearn.utils._mocking import NoSampleWeightWrapper
from sklearn.utils._param_validation import InvalidParameterError
from sklearn.utils._testing import (
    assert_array_almost_equal,
    assert_array_equal,
    skip_if_32bit,
)
from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS

GRADIENT_BOOSTING_ESTIMATORS = [GradientBoostingClassifier, GradientBoostingRegressor]

# toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]
T = [[-1, -1], [2, 2], [3, 2]]
true_result = [-1, 1, 1]

# also make regression dataset
X_reg, y_reg = make_regression(
    n_samples=100, n_features=4, n_informative=8, noise=10, random_state=7
)
y_reg = scale(y_reg)

rng = np.random.RandomState(0)
# also load the iris dataset
# and randomly permute it
iris = datasets.load_iris()
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]


def test_exponential_n_classes_gt_2():
    """Test exponential loss raises for n_classes > 2."""
    clf = GradientBoostingClassifier(loss="exponential")
    msg = "loss='exponential' is only suitable for a binary classification"
    with pytest.raises(ValueError, match=msg):
        clf.fit(iris.data, iris.target)


def test_raise_if_init_has_no_predict_proba():
    """Test raise if init_ has no predict_proba method."""
    clf = GradientBoostingClassifier(init=GradientBoostingRegressor)
    msg = (
        "The 'init' parameter of GradientBoostingClassifier must be a str among "
        "{'zero'}, None or an object implementing 'fit' and 'predict_proba'."
    )
    with pytest.raises(ValueError, match=msg):
        clf.fit(X, y)


@pytest.mark.parametrize("loss", ("log_loss", "exponential"))
def test_classification_toy(loss, global_random_seed):
    # Check classification on a toy dataset.
    clf = GradientBoostingClassifier(
        loss=loss, n_estimators=10, random_state=global_random_seed
    )

    with pytest.raises(ValueError):
        clf.predict(T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert 10 == len(clf.estimators_)

    log_loss_decrease = clf.train_score_[:-1] - clf.train_score_[1:]
    assert np.any(log_loss_decrease >= 0.0)

    leaves = clf.apply(X)
    assert leaves.shape == (6, 10, 1)


@pytest.mark.parametrize("loss", ("log_loss", "exponential"))
def test_classification_synthetic(loss, global_random_seed):
    # Test GradientBoostingClassifier on synthetic dataset used by
    # Hastie et al. in ESLII - Figure 10.9
    # Note that Figure 10.9 reuses the dataset generated for figure 10.2
    # and should have 2_000 train data points and 10_000 test data points.
    # Here we intentionally use a smaller variant to make the test run faster,
    # but the conclusions are still the same, despite the smaller datasets.
    X, y = datasets.make_hastie_10_2(n_samples=2000, random_state=global_random_seed)

    split_idx = 500
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]

    # Increasing the number of trees should decrease the test error
    common_params = {
        "max_depth": 1,
        "learning_rate": 1.0,
        "loss": loss,
        "random_state": global_random_seed,
    }
    gbrt_10_stumps = GradientBoostingClassifier(n_estimators=10, **common_params)
    gbrt_10_stumps.fit(X_train, y_train)

    gbrt_50_stumps = GradientBoostingClassifier(n_estimators=50, **common_params)
    gbrt_50_stumps.fit(X_train, y_train)

    assert gbrt_10_stumps.score(X_test, y_test) < gbrt_50_stumps.score(X_test, y_test)

    # Decision stumps are better suited for this dataset with a large number of
    # estimators.
    common_params = {
        "n_estimators": 200,
        "learning_rate": 1.0,
        "loss": loss,
        "random_state": global_random_seed,
    }
    gbrt_stumps = GradientBoostingClassifier(max_depth=1, **common_params)
    gbrt_stumps.fit(X_train, y_train)

    gbrt_10_nodes = GradientBoostingClassifier(max_leaf_nodes=10, **common_params)
    gbrt_10_nodes.fit(X_train, y_train)

    assert gbrt_stumps.score(X_test, y_test) > gbrt_10_nodes.score(X_test, y_test)


@pytest.mark.parametrize("loss", ("squared_error", "absolute_error", "huber"))
@pytest.mark.parametrize("subsample", (1.0, 0.5))
def test_regression_dataset(loss, subsample, global_random_seed):
    # Check consistency on regression dataset with least squares
    # and least absolute deviation.
    ones = np.ones(len(y_reg))
    last_y_pred = None
    for sample_weight in [None, ones, 2 * ones]:
        # learning_rate, max_depth and n_estimators were adjusted to get a mode
        # that is accurate enough to reach a low MSE on the training set while
        # keeping the resource used to execute this test low enough.
        reg = GradientBoostingRegressor(
            n_estimators=30,
            loss=loss,
            max_depth=4,
            subsample=subsample,
            min_samples_split=2,
            random_state=global_random_seed,
            learning_rate=0.5,
        )

        reg.fit(X_reg, y_reg, sample_weight=sample_weight)
        leaves = reg.apply(X_reg)
        assert leaves.shape == (100, 30)

        y_pred = reg.predict(X_reg)
        mse = mean_squared_error(y_reg, y_pred)
        assert mse < 0.05

        if last_y_pred is not None:
            # FIXME: We temporarily bypass this test. This is due to the fact
            # that GBRT with and without `sample_weight` do not use the same
            # implementation of the median during the initialization with the
            # `DummyRegressor`. In the future, we should make sure that both
            # implementations should be the same. See PR #17377 for more.
            # assert_allclose(last_y_pred, y_pred)
            pass

        last_y_pred = y_pred


@pytest.mark.parametrize("subsample", (1.0, 0.5))
@pytest.mark.parametrize("sample_weight", (None, 1))
def test_iris(subsample, sample_weight, global_random_seed):
    if sample_weight == 1:
        sample_weight = np.ones(len(iris.target))
    # Check consistency on dataset iris.
    clf = GradientBoostingClassifier(
        n_estimators=100,
        loss="log_loss",
        random_state=global_random_seed,
        subsample=subsample,
    )
    clf.fit(iris.data, iris.target, sample_weight=sample_weight)
    score = clf.score(iris.data, iris.target)
    assert score > 0.9

    leaves = clf.apply(iris.data)
    assert leaves.shape == (150, 100, 3)


def test_regression_synthetic(global_random_seed):
    # Test on synthetic regression datasets used in Leo Breiman,
    # `Bagging Predictors?. Machine Learning 24(2): 123-140 (1996).
    random_state = check_random_state(global_random_seed)
    regression_params = {
        "n_estimators": 100,
        "max_depth": 4,
        "min_samples_split": 2,
        "learning_rate": 0.1,
        "loss": "squared_error",
        "random_state": global_random_seed,
    }

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200, random_state=random_state, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 6.5

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 2500.0

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 0.025


@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (GradientBoostingRegressor, X_reg, y_reg),
        (GradientBoostingClassifier, iris.data, iris.target),
    ],
)
def test_feature_importances(GradientBoosting, X, y):
    # smoke test to check that the gradient boosting expose an attribute
    # feature_importances_
    gbdt = GradientBoosting()
    assert not hasattr(gbdt, "feature_importances_")
    gbdt.fit(X, y)
    assert hasattr(gbdt, "feature_importances_")


def test_probability_log(global_random_seed):
    # Predict probabilities.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=global_random_seed)

    with pytest.raises(ValueError):
        clf.predict_proba(T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # check if probabilities are in [0, 1].
    y_proba = clf.predict_proba(T)
    assert np.all(y_proba >= 0.0)
    assert np.all(y_proba <= 1.0)

    # derive predictions from probabilities
    y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0)
    assert_array_equal(y_pred, true_result)


def test_single_class_with_sample_weight():
    sample_weight = [0, 0, 0, 1, 1, 1]
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    msg = (
        "y contains 1 class after sample_weight trimmed classes with "
        "zero weights, while a minimum of 2 classes are required."
    )
    with pytest.raises(ValueError, match=msg):
        clf.fit(X, y, sample_weight=sample_weight)


@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_check_inputs_predict_stages(csc_container):
    # check that predict_stages through an error if the type of X is not
    # supported
    x, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    x_sparse_csc = csc_container(x)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(x, y)
    score = np.zeros((y.shape)).reshape(-1, 1)
    err_msg = "When X is a sparse matrix, a CSR format is expected"
    with pytest.raises(ValueError, match=err_msg):
        predict_stages(clf.estimators_, x_sparse_csc, clf.learning_rate, score)
    x_fortran = np.asfortranarray(x)
    with pytest.raises(ValueError, match="X should be C-ordered np.ndarray"):
        predict_stages(clf.estimators_, x_fortran, clf.learning_rate, score)


def test_max_feature_regression(global_random_seed):
    # Test to make sure random state is set properly.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=global_random_seed)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(
        n_estimators=100,
        min_samples_split=5,
        max_depth=2,
        learning_rate=0.1,
        max_features=2,
        random_state=global_random_seed,
    )
    gbrt.fit(X_train, y_train)
    log_loss = gbrt._loss(y_test, gbrt.decision_function(X_test))
    assert log_loss < 0.5, "GB failed with deviance %.4f" % log_loss


def test_feature_importance_regression(
    fetch_california_housing_fxt, global_random_seed
):
    """Test that Gini importance is calculated correctly.

    This test follows the example from [1]_ (pg. 373).

    .. [1] Friedman, J., Hastie, T., & Tibshirani, R. (2001). The elements
       of statistical learning. New York: Springer series in statistics.
    """
    california = fetch_california_housing_fxt()
    X, y = california.data, california.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=global_random_seed
    )

    reg = GradientBoostingRegressor(
        loss="huber",
        learning_rate=0.1,
        max_leaf_nodes=6,
        n_estimators=100,
        random_state=global_random_seed,
    )
    reg.fit(X_train, y_train)
    sorted_idx = np.argsort(reg.feature_importances_)[::-1]
    sorted_features = [california.feature_names[s] for s in sorted_idx]

    # The most important feature is the median income by far.
    assert sorted_features[0] == "MedInc"

    # The three subsequent features are the following. Their relative ordering
    # might change a bit depending on the randomness of the trees and the
    # train / test split.
    assert set(sorted_features[1:4]) == {"Longitude", "AveOccup", "Latitude"}


def test_max_features():
    # Test if max features is set properly for floats and str.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    _, n_features = X.shape

    X_train = X[:2000]
    y_train = y[:2000]

    gbrt = GradientBoostingClassifier(n_estimators=1, max_features=None)
    gbrt.fit(X_train, y_train)
    assert gbrt.max_features_ == n_features

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features=None)
    gbrt.fit(X_train, y_train)
    assert gbrt.max_features_ == n_features

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features=0.3)
    gbrt.fit(X_train, y_train)
    assert gbrt.max_features_ == int(n_features * 0.3)

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features="sqrt")
    gbrt.fit(X_train, y_train)
    assert gbrt.max_features_ == int(np.sqrt(n_features))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features="log2")
    gbrt.fit(X_train, y_train)
    assert gbrt.max_features_ == int(np.log2(n_features))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features=0.01 / X.shape[1])
    gbrt.fit(X_train, y_train)
    assert gbrt.max_features_ == 1


def test_staged_predict():
    # Test whether staged decision function eventually gives
    # the same prediction.
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test = X[200:]
    clf = GradientBoostingRegressor()
    # test raise ValueError if not fitted
    with pytest.raises(ValueError):
        np.fromiter(clf.staged_predict(X_test), dtype=np.float64)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # test if prediction for last stage equals ``predict``
    for y in clf.staged_predict(X_test):
        assert y.shape == y_pred.shape

    assert_array_almost_equal(y_pred, y)


def test_staged_predict_proba():
    # Test whether staged predict proba eventually gives
    # the same prediction.
    X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingClassifier(n_estimators=20)
    # test raise NotFittedError if not
    with pytest.raises(NotFittedError):
        np.fromiter(clf.staged_predict_proba(X_test), dtype=np.float64)

    clf.fit(X_train, y_train)

    # test if prediction for last stage equals ``predict``
    for y_pred in clf.staged_predict(X_test):
        assert y_test.shape == y_pred.shape

    assert_array_equal(clf.predict(X_test), y_pred)

    # test if prediction for last stage equals ``predict_proba``
    for staged_proba in clf.staged_predict_proba(X_test):
        assert y_test.shape[0] == staged_proba.shape[0]
        assert 2 == staged_proba.shape[1]

    assert_array_almost_equal(clf.predict_proba(X_test), staged_proba)


@pytest.mark.parametrize("Estimator", GRADIENT_BOOSTING_ESTIMATORS)
def test_staged_functions_defensive(Estimator, global_random_seed):
    # test that staged_functions make defensive copies
    rng = np.random.RandomState(global_random_seed)
    X = rng.uniform(size=(10, 3))
    y = (4 * X[:, 0]).astype(int) + 1  # don't predict zeros
    estimator = Estimator()
    estimator.fit(X, y)
    for func in ["predict", "decision_function", "predict_proba"]:
        staged_func = getattr(estimator, "staged_" + func, None)
        if staged_func is None:
            # regressor has no staged_predict_proba
            continue
        with warnings.catch_warnings(record=True):
            staged_result = list(staged_func(X))
        staged_result[1][:] = 0
        assert np.all(staged_result[0] != 0)


def test_serialization():
    # Check model serialization.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert 100 == len(clf.estimators_)

    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    serialized_clf = pickle.dumps(clf, protocol=pickle.HIGHEST_PROTOCOL)
    clf = None
    clf = pickle.loads(serialized_clf)
    assert_array_equal(clf.predict(T), true_result)
    assert 100 == len(clf.estimators_)


def test_degenerate_targets():
    # Check if we can fit even though all targets are equal.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    # classifier should raise exception
    with pytest.raises(ValueError):
        clf.fit(X, np.ones(len(X)))

    clf = GradientBoostingRegressor(n_estimators=100, random_state=1)
    clf.fit(X, np.ones(len(X)))
    clf.predict([rng.rand(2)])
    assert_array_equal(np.ones((1,), dtype=np.float64), clf.predict([rng.rand(2)]))


def test_quantile_loss(global_random_seed):
    # Check if quantile loss with alpha=0.5 equals absolute_error.
    clf_quantile = GradientBoostingRegressor(
        n_estimators=100,
        loss="quantile",
        max_depth=4,
        alpha=0.5,
        random_state=global_random_seed,
    )

    clf_quantile.fit(X_reg, y_reg)
    y_quantile = clf_quantile.predict(X_reg)

    clf_ae = GradientBoostingRegressor(
        n_estimators=100,
        loss="absolute_error",
        max_depth=4,
        random_state=global_random_seed,
    )

    clf_ae.fit(X_reg, y_reg)
    y_ae = clf_ae.predict(X_reg)
    assert_allclose(y_quantile, y_ae)


def test_symbol_labels():
    # Test with non-integer class labels.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    symbol_y = tosequence(map(str, y))

    clf.fit(X, symbol_y)
    assert_array_equal(clf.predict(T), tosequence(map(str, true_result)))
    assert 100 == len(clf.estimators_)


def test_float_class_labels():
    # Test with float class labels.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    float_y = np.asarray(y, dtype=np.float32)

    clf.fit(X, float_y)
    assert_array_equal(clf.predict(T), np.asarray(true_result, dtype=np.float32))
    assert 100 == len(clf.estimators_)


def test_shape_y():
    # Test with float class labels.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    y_ = np.asarray(y, dtype=np.int32)
    y_ = y_[:, np.newaxis]

    # This will raise a DataConversionWarning that we want to
    # "always" raise, elsewhere the warnings gets ignored in the
    # later tests, and the tests that check for this warning fail
    warn_msg = (
        "A column-vector y was passed when a 1d array was expected. "
        "Please change the shape of y to \\(n_samples, \\), for "
        "example using ravel()."
    )
    with pytest.warns(DataConversionWarning, match=warn_msg):
        clf.fit(X, y_)
    assert_array_equal(clf.predict(T), true_result)
    assert 100 == len(clf.estimators_)


def test_mem_layout():
    # Test with different memory layouts of X and y
    X_ = np.asfortranarray(X)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X_, y)
    assert_array_equal(clf.predict(T), true_result)
    assert 100 == len(clf.estimators_)

    X_ = np.ascontiguousarray(X)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X_, y)
    assert_array_equal(clf.predict(T), true_result)
    assert 100 == len(clf.estimators_)

    y_ = np.asarray(y, dtype=np.int32)
    y_ = np.ascontiguousarray(y_)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X, y_)
    assert_array_equal(clf.predict(T), true_result)
    assert 100 == len(clf.estimators_)

    y_ = np.asarray(y, dtype=np.int32)
    y_ = np.asfortranarray(y_)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X, y_)
    assert_array_equal(clf.predict(T), true_result)
    assert 100 == len(clf.estimators_)


@pytest.mark.parametrize("GradientBoostingEstimator", GRADIENT_BOOSTING_ESTIMATORS)
def test_oob_improvement(GradientBoostingEstimator):
    # Test if oob improvement has correct shape and regression test.
    estimator = GradientBoostingEstimator(
        n_estimators=100, random_state=1, subsample=0.5
    )
    estimator.fit(X, y)
    assert estimator.oob_improvement_.shape[0] == 100
    # hard-coded regression test - change if modification in OOB computation
    assert_array_almost_equal(
        estimator.oob_improvement_[:5],
        np.array([0.19, 0.15, 0.12, -0.11, 0.11]),
        decimal=2,
    )


@pytest.mark.parametrize("GradientBoostingEstimator", GRADIENT_BOOSTING_ESTIMATORS)
def test_oob_scores(GradientBoostingEstimator):
    # Test if oob scores has correct shape and regression test.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    estimator = GradientBoostingEstimator(
        n_estimators=100, random_state=1, subsample=0.5
    )
    estimator.fit(X, y)
    assert estimator.oob_scores_.shape[0] == 100
    assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_)

    estimator = GradientBoostingEstimator(
        n_estimators=100,
        random_state=1,
        subsample=0.5,
        n_iter_no_change=5,
    )
    estimator.fit(X, y)
    assert estimator.oob_scores_.shape[0] < 100
    assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_)


@pytest.mark.parametrize(
    "GradientBoostingEstimator, oob_attribute",
    [
        (GradientBoostingClassifier, "oob_improvement_"),
        (GradientBoostingClassifier, "oob_scores_"),
        (GradientBoostingClassifier, "oob_score_"),
        (GradientBoostingRegressor, "oob_improvement_"),
        (GradientBoostingRegressor, "oob_scores_"),
        (GradientBoostingRegressor, "oob_score_"),
    ],
)
def test_oob_attributes_error(GradientBoostingEstimator, oob_attribute):
    """
    Check that we raise an AttributeError when the OOB statistics were not computed.
    """
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    estimator = GradientBoostingEstimator(
        n_estimators=100,
        random_state=1,
        subsample=1.0,
    )
    estimator.fit(X, y)
    with pytest.raises(AttributeError):
        estimator.oob_attribute


def test_oob_multilcass_iris():
    # Check OOB improvement on multi-class dataset.
    estimator = GradientBoostingClassifier(
        n_estimators=100, loss="log_loss", random_state=1, subsample=0.5
    )
    estimator.fit(iris.data, iris.target)
    score = estimator.score(iris.data, iris.target)
    assert score > 0.9
    assert estimator.oob_improvement_.shape[0] == estimator.n_estimators
    assert estimator.oob_scores_.shape[0] == estimator.n_estimators
    assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_)

    estimator = GradientBoostingClassifier(
        n_estimators=100,
        loss="log_loss",
        random_state=1,
        subsample=0.5,
        n_iter_no_change=5,
    )
    estimator.fit(iris.data, iris.target)
    score = estimator.score(iris.data, iris.target)
    assert estimator.oob_improvement_.shape[0] < estimator.n_estimators
    assert estimator.oob_scores_.shape[0] < estimator.n_estimators
    assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_)

    # hard-coded regression test - change if modification in OOB computation
    # FIXME: the following snippet does not yield the same results on 32 bits
    # assert_array_almost_equal(estimator.oob_improvement_[:5],
    #                           np.array([12.68, 10.45, 8.18, 6.43, 5.13]),
    #                           decimal=2)


def test_verbose_output():
    # Check verbose=1 does not cause error.
    import sys
    from io import StringIO

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    clf = GradientBoostingClassifier(
        n_estimators=100, random_state=1, verbose=1, subsample=0.8
    )
    clf.fit(X, y)
    verbose_output = sys.stdout
    sys.stdout = old_stdout

    # check output
    verbose_output.seek(0)
    header = verbose_output.readline().rstrip()
    # with OOB
    true_header = " ".join(["%10s"] + ["%16s"] * 3) % (
        "Iter",
        "Train Loss",
        "OOB Improve",
        "Remaining Time",
    )
    assert true_header == header

    n_lines = sum(1 for l in verbose_output.readlines())
    # one for 1-10 and then 9 for 20-100
    assert 10 + 9 == n_lines


def test_more_verbose_output():
    # Check verbose=2 does not cause error.
    import sys
    from io import StringIO

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1, verbose=2)
    clf.fit(X, y)
    verbose_output = sys.stdout
    sys.stdout = old_stdout

    # check output
    verbose_output.seek(0)
    header = verbose_output.readline().rstrip()
    # no OOB
    true_header = " ".join(["%10s"] + ["%16s"] * 2) % (
        "Iter",
        "Train Loss",
        "Remaining Time",
    )
    assert true_header == header

    n_lines = sum(1 for l in verbose_output.readlines())
    # 100 lines for n_estimators==100
    assert 100 == n_lines


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start(Cls, global_random_seed):
    # Test if warm start equals fit.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=global_random_seed)
    est = Cls(n_estimators=200, max_depth=1, random_state=global_random_seed)
    est.fit(X, y)

    est_ws = Cls(
        n_estimators=100, max_depth=1, warm_start=True, random_state=global_random_seed
    )
    est_ws.fit(X, y)
    est_ws.set_params(n_estimators=200)
    est_ws.fit(X, y)

    if Cls is GradientBoostingRegressor:
        assert_allclose(est_ws.predict(X), est.predict(X))
    else:
        # Random state is preserved and hence predict_proba must also be
        # same
        assert_array_equal(est_ws.predict(X), est.predict(X))
        assert_allclose(est_ws.predict_proba(X), est.predict_proba(X))


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_n_estimators(Cls, global_random_seed):
    # Test if warm start equals fit - set n_estimators.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=global_random_seed)
    est = Cls(n_estimators=300, max_depth=1, random_state=global_random_seed)
    est.fit(X, y)

    est_ws = Cls(
        n_estimators=100, max_depth=1, warm_start=True, random_state=global_random_seed
    )
    est_ws.fit(X, y)
    est_ws.set_params(n_estimators=300)
    est_ws.fit(X, y)

    assert_allclose(est_ws.predict(X), est.predict(X))


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_max_depth(Cls):
    # Test if possible to fit trees of different depth in ensemble.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est.fit(X, y)
    est.set_params(n_estimators=110, max_depth=2)
    est.fit(X, y)

    # last 10 trees have different depth
    assert est.estimators_[0, 0].max_depth == 1
    for i in range(1, 11):
        assert est.estimators_[-i, 0].max_depth == 2


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_clear(Cls):
    # Test if fit clears state.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1)
    est.fit(X, y)

    est_2 = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est_2.fit(X, y)  # inits state
    est_2.set_params(warm_start=False)
    est_2.fit(X, y)  # clears old state and equals est

    assert_array_almost_equal(est_2.predict(X), est.predict(X))


@pytest.mark.parametrize("GradientBoosting", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_state_oob_scores(GradientBoosting):
    """
    Check that the states of the OOB scores are cleared when used with `warm_start`.
    """
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    n_estimators = 100
    estimator = GradientBoosting(
        n_estimators=n_estimators,
        max_depth=1,
        subsample=0.5,
        warm_start=True,
        random_state=1,
    )
    estimator.fit(X, y)
    oob_scores, oob_score = estimator.oob_scores_, estimator.oob_score_
    assert len(oob_scores) == n_estimators
    assert oob_scores[-1] == pytest.approx(oob_score)

    n_more_estimators = 200
    estimator.set_params(n_estimators=n_more_estimators).fit(X, y)
    assert len(estimator.oob_scores_) == n_more_estimators
    assert_allclose(estimator.oob_scores_[:n_estimators], oob_scores)

    estimator.set_params(n_estimators=n_estimators, warm_start=False).fit(X, y)
    assert estimator.oob_scores_ is not oob_scores
    assert estimator.oob_score_ is not oob_score
    assert_allclose(estimator.oob_scores_, oob_scores)
    assert estimator.oob_score_ == pytest.approx(oob_score)
    assert oob_scores[-1] == pytest.approx(oob_score)


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_smaller_n_estimators(Cls):
    # Test if warm start with smaller n_estimators raises error
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est.fit(X, y)
    est.set_params(n_estimators=99)
    with pytest.raises(ValueError):
        est.fit(X, y)


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_equal_n_estimators(Cls):
    # Test if warm start with equal n_estimators does nothing
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1)
    est.fit(X, y)

    est2 = clone(est)
    est2.set_params(n_estimators=est.n_estimators, warm_start=True)
    est2.fit(X, y)

    assert_array_almost_equal(est2.predict(X), est.predict(X))


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_oob_switch(Cls):
    # Test if oob can be turned on during warm start.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est.fit(X, y)
    est.set_params(n_estimators=110, subsample=0.5)
    est.fit(X, y)

    assert_array_equal(est.oob_improvement_[:100], np.zeros(100))
    assert_array_equal(est.oob_scores_[:100], np.zeros(100))

    # the last 10 are not zeros
    assert (est.oob_improvement_[-10:] != 0.0).all()
    assert (est.oob_scores_[-10:] != 0.0).all()

    assert est.oob_scores_[-1] == pytest.approx(est.oob_score_)


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_oob(Cls):
    # Test if warm start OOB equals fit.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=200, max_depth=1, subsample=0.5, random_state=1)
    est.fit(X, y)

    est_ws = Cls(
        n_estimators=100, max_depth=1, subsample=0.5, random_state=1, warm_start=True
    )
    est_ws.fit(X, y)
    est_ws.set_params(n_estimators=200)
    est_ws.fit(X, y)

    assert_array_almost_equal(est_ws.oob_improvement_[:100], est.oob_improvement_[:100])
    assert_array_almost_equal(est_ws.oob_scores_[:100], est.oob_scores_[:100])
    assert est.oob_scores_[-1] == pytest.approx(est.oob_score_)
    assert est_ws.oob_scores_[-1] == pytest.approx(est_ws.oob_score_)


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
@pytest.mark.parametrize(
    "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS
)
def test_warm_start_sparse(Cls, sparse_container):
    # Test that all sparse matrix types are supported
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est_dense = Cls(
        n_estimators=100, max_depth=1, subsample=0.5, random_state=1, warm_start=True
    )
    est_dense.fit(X, y)
    est_dense.predict(X)
    est_dense.set_params(n_estimators=200)
    est_dense.fit(X, y)
    y_pred_dense = est_dense.predict(X)

    X_sparse = sparse_container(X)

    est_sparse = Cls(
        n_estimators=100,
        max_depth=1,
        subsample=0.5,
        random_state=1,
        warm_start=True,
    )
    est_sparse.fit(X_sparse, y)
    est_sparse.predict(X)
    est_sparse.set_params(n_estimators=200)
    est_sparse.fit(X_sparse, y)
    y_pred_sparse = est_sparse.predict(X)

    assert_array_almost_equal(
        est_dense.oob_improvement_[:100], est_sparse.oob_improvement_[:100]
    )
    assert est_dense.oob_scores_[-1] == pytest.approx(est_dense.oob_score_)
    assert_array_almost_equal(est_dense.oob_scores_[:100], est_sparse.oob_scores_[:100])
    assert est_sparse.oob_scores_[-1] == pytest.approx(est_sparse.oob_score_)
    assert_array_almost_equal(y_pred_dense, y_pred_sparse)


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_fortran(Cls, global_random_seed):
    # Test that feeding a X in Fortran-ordered is giving the same results as
    # in C-ordered
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=global_random_seed)
    est_c = Cls(n_estimators=1, random_state=global_random_seed, warm_start=True)
    est_fortran = Cls(n_estimators=1, random_state=global_random_seed, warm_start=True)

    est_c.fit(X, y)
    est_c.set_params(n_estimators=11)
    est_c.fit(X, y)

    X_fortran = np.asfortranarray(X)
    est_fortran.fit(X_fortran, y)
    est_fortran.set_params(n_estimators=11)
    est_fortran.fit(X_fortran, y)

    assert_allclose(est_c.predict(X), est_fortran.predict(X))


def early_stopping_monitor(i, est, locals):
    """Returns True on the 10th iteration."""
    if i == 9:
        return True
    else:
        return False


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_monitor_early_stopping(Cls):
    # Test if monitor return value works.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5)
    est.fit(X, y, monitor=early_stopping_monitor)
    assert est.n_estimators == 20  # this is not altered
    assert est.estimators_.shape[0] == 10
    assert est.train_score_.shape[0] == 10
    assert est.oob_improvement_.shape[0] == 10
    assert est.oob_scores_.shape[0] == 10
    assert est.oob_scores_[-1] == pytest.approx(est.oob_score_)

    # try refit
    est.set_params(n_estimators=30)
    est.fit(X, y)
    assert est.n_estimators == 30
    assert est.estimators_.shape[0] == 30
    assert est.train_score_.shape[0] == 30
    assert est.oob_improvement_.shape[0] == 30
    assert est.oob_scores_.shape[0] == 30
    assert est.oob_scores_[-1] == pytest.approx(est.oob_score_)

    est = Cls(
        n_estimators=20, max_depth=1, random_state=1, subsample=0.5, warm_start=True
    )
    est.fit(X, y, monitor=early_stopping_monitor)
    assert est.n_estimators == 20
    assert est.estimators_.shape[0] == 10
    assert est.train_score_.shape[0] == 10
    assert est.oob_improvement_.shape[0] == 10
    assert est.oob_scores_.shape[0] == 10
    assert est.oob_scores_[-1] == pytest.approx(est.oob_score_)

    # try refit
    est.set_params(n_estimators=30, warm_start=False)
    est.fit(X, y)
    assert est.n_estimators == 30
    assert est.train_score_.shape[0] == 30
    assert est.estimators_.shape[0] == 30
    assert est.oob_improvement_.shape[0] == 30
    assert est.oob_scores_.shape[0] == 30
    assert est.oob_scores_[-1] == pytest.approx(est.oob_score_)


def test_complete_classification():
    # Test greedy trees with max_depth + 1 leafs.
    from sklearn.tree._tree import TREE_LEAF

    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    k = 4

    est = GradientBoostingClassifier(
        n_estimators=20, max_depth=None, random_state=1, max_leaf_nodes=k + 1
    )
    est.fit(X, y)

    tree = est.estimators_[0, 0].tree_
    assert tree.max_depth == k
    assert tree.children_left[tree.children_left == TREE_LEAF].shape[0] == k + 1


def test_complete_regression():
    # Test greedy trees with max_depth + 1 leafs.
    from sklearn.tree._tree import TREE_LEAF

    k = 4

    est = GradientBoostingRegressor(
        n_estimators=20, max_depth=None, random_state=1, max_leaf_nodes=k + 1
    )
    est.fit(X_reg, y_reg)

    tree = est.estimators_[-1, 0].tree_
    assert tree.children_left[tree.children_left == TREE_LEAF].shape[0] == k + 1


def test_zero_estimator_reg(global_random_seed):
    # Test if init='zero' works for regression by checking that it is better
    # than a simple baseline.

    baseline = DummyRegressor(strategy="mean").fit(X_reg, y_reg)
    mse_baseline = mean_squared_error(baseline.predict(X_reg), y_reg)
    est = GradientBoostingRegressor(
        n_estimators=5,
        max_depth=1,
        random_state=global_random_seed,
        init="zero",
        learning_rate=0.5,
    )
    est.fit(X_reg, y_reg)
    y_pred = est.predict(X_reg)
    mse_gbdt = mean_squared_error(y_reg, y_pred)
    assert mse_gbdt < mse_baseline


def test_zero_estimator_clf(global_random_seed):
    # Test if init='zero' works for classification.
    X = iris.data
    y = np.array(iris.target)

    est = GradientBoostingClassifier(
        n_estimators=20, max_depth=1, random_state=global_random_seed, init="zero"
    )
    est.fit(X, y)

    assert est.score(X, y) > 0.96

    # binary clf
    mask = y != 0
    y[mask] = 1
    y[~mask] = 0
    est = GradientBoostingClassifier(
        n_estimators=20, max_depth=1, random_state=global_random_seed, init="zero"
    )
    est.fit(X, y)
    assert est.score(X, y) > 0.96


@pytest.mark.parametrize("GBEstimator", GRADIENT_BOOSTING_ESTIMATORS)
def test_max_leaf_nodes_max_depth(GBEstimator):
    # Test precedence of max_leaf_nodes over max_depth.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    k = 4

    est = GBEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y)
    tree = est.estimators_[0, 0].tree_
    assert tree.max_depth == 1

    est = GBEstimator(max_depth=1).fit(X, y)
    tree = est.estimators_[0, 0].tree_
    assert tree.max_depth == 1


@pytest.mark.parametrize("GBEstimator", GRADIENT_BOOSTING_ESTIMATORS)
def test_min_impurity_decrease(GBEstimator):
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    est = GBEstimator(min_impurity_decrease=0.1)
    est.fit(X, y)
    for tree in est.estimators_.flat:
        # Simply check if the parameter is passed on correctly. Tree tests
        # will suffice for the actual working of this param
        assert tree.min_impurity_decrease == 0.1


def test_warm_start_wo_nestimators_change():
    # Test if warm_start does nothing if n_estimators is not changed.
    # Regression test for #3513.
    clf = GradientBoostingClassifier(n_estimators=10, warm_start=True)
    clf.fit([[0, 1], [2, 3]], [0, 1])
    assert clf.estimators_.shape[0] == 10
    clf.fit([[0, 1], [2, 3]], [0, 1])
    assert clf.estimators_.shape[0] == 10


@pytest.mark.parametrize(
    ("loss", "value"),
    [
        ("squared_error", 0.5),
        ("absolute_error", 0.0),
        ("huber", 0.5),
        ("quantile", 0.5),
    ],
)
def test_non_uniform_weights_toy_edge_case_reg(loss, value):
    X = [[1, 0], [1, 0], [1, 0], [0, 1]]
    y = [0, 0, 1, 0]
    # ignore the first 2 training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1]
    gb = GradientBoostingRegressor(learning_rate=1.0, n_estimators=2, loss=loss)
    gb.fit(X, y, sample_weight=sample_weight)
    assert gb.predict([[1, 0]])[0] >= value


def test_non_uniform_weights_toy_edge_case_clf():
    X = [[1, 0], [1, 0], [1, 0], [0, 1]]
    y = [0, 0, 1, 0]
    # ignore the first 2 training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1]
    for loss in ("log_loss", "exponential"):
        gb = GradientBoostingClassifier(n_estimators=5, loss=loss)
        gb.fit(X, y, sample_weight=sample_weight)
        assert_array_equal(gb.predict([[1, 0]]), [1])


@skip_if_32bit
@pytest.mark.parametrize(
    "EstimatorClass", (GradientBoostingClassifier, GradientBoostingRegressor)
)
@pytest.mark.parametrize(
    "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS
)
def test_sparse_input(EstimatorClass, sparse_container):
    y, X = datasets.make_multilabel_classification(
        random_state=0, n_samples=50, n_features=1, n_classes=20
    )
    y = y[:, 0]
    X_sparse = sparse_container(X)

    dense = EstimatorClass(
        n_estimators=10, random_state=0, max_depth=2, min_impurity_decrease=1e-7
    ).fit(X, y)
    sparse = EstimatorClass(
        n_estimators=10, random_state=0, max_depth=2, min_impurity_decrease=1e-7
    ).fit(X_sparse, y)

    assert_array_almost_equal(sparse.apply(X), dense.apply(X))
    assert_array_almost_equal(sparse.predict(X), dense.predict(X))
    assert_array_almost_equal(sparse.feature_importances_, dense.feature_importances_)

    assert_array_almost_equal(sparse.predict(X_sparse), dense.predict(X))
    assert_array_almost_equal(dense.predict(X_sparse), sparse.predict(X))

    if issubclass(EstimatorClass, GradientBoostingClassifier):
        assert_array_almost_equal(sparse.predict_proba(X), dense.predict_proba(X))
        assert_array_almost_equal(
            sparse.predict_log_proba(X), dense.predict_log_proba(X)
        )

        assert_array_almost_equal(
            sparse.decision_function(X_sparse), sparse.decision_function(X)
        )
        assert_array_almost_equal(
            dense.decision_function(X_sparse), sparse.decision_function(X)
        )
        for res_sparse, res in zip(
            sparse.staged_decision_function(X_sparse),
            sparse.staged_decision_function(X),
        ):
            assert_array_almost_equal(res_sparse, res)


@pytest.mark.parametrize(
    "GradientBoostingEstimator", [GradientBoostingClassifier, GradientBoostingRegressor]
)
def test_gradient_boosting_early_stopping(GradientBoostingEstimator):
    # Check if early stopping works as expected, that is empirically check that the
    # number of trained estimators is increasing when the tolerance decreases.

    X, y = make_classification(n_samples=1000, random_state=0)
    n_estimators = 1000

    gb_large_tol = GradientBoostingEstimator(
        n_estimators=n_estimators,
        n_iter_no_change=10,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
        tol=1e-1,
    )

    gb_small_tol = GradientBoostingEstimator(
        n_estimators=n_estimators,
        n_iter_no_change=10,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
        tol=1e-3,
    )

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    gb_large_tol.fit(X_train, y_train)
    gb_small_tol.fit(X_train, y_train)

    assert gb_large_tol.n_estimators_ < gb_small_tol.n_estimators_ < n_estimators

    assert gb_large_tol.score(X_test, y_test) > 0.7
    assert gb_small_tol.score(X_test, y_test) > 0.7


def test_gradient_boosting_without_early_stopping():
    # When early stopping is not used, the number of trained estimators
    # must be the one specified.
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(
        n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42
    )
    gbc.fit(X, y)
    gbr = GradientBoostingRegressor(
        n_estimators=30, learning_rate=0.1, max_depth=3, random_state=42
    )
    gbr.fit(X, y)

    # The number of trained estimators must be the one specified.
    assert gbc.n_estimators_ == 50
    assert gbr.n_estimators_ == 30


def test_gradient_boosting_validation_fraction():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(
        n_estimators=100,
        n_iter_no_change=10,
        validation_fraction=0.1,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
    )
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(
        n_estimators=100,
        n_iter_no_change=10,
        learning_rate=0.1,
        max_depth=3,
        validation_fraction=0.1,
        random_state=42,
    )
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # Check if validation_fraction has an effect
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # Check if n_estimators_ increase monotonically with n_iter_no_change
    # Set validation
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_


def test_early_stopping_stratified():
    # Make sure data splitting for early stopping is stratified
    X = [[1, 2], [2, 3], [3, 4], [4, 5]]
    y = [0, 0, 0, 1]

    gbc = GradientBoostingClassifier(n_iter_no_change=5)
    with pytest.raises(
        ValueError, match="The least populated class in y has only 1 member"
    ):
        gbc.fit(X, y)


def _make_multiclass():
    return make_classification(n_classes=3, n_clusters_per_class=1)


@pytest.mark.parametrize(
    "gb, dataset_maker, init_estimator",
    [
        (GradientBoostingClassifier, make_classification, DummyClassifier),
        (GradientBoostingClassifier, _make_multiclass, DummyClassifier),
        (GradientBoostingRegressor, make_regression, DummyRegressor),
    ],
    ids=["binary classification", "multiclass classification", "regression"],
)
def test_gradient_boosting_with_init(
    gb, dataset_maker, init_estimator, global_random_seed
):
    # Check that GradientBoostingRegressor works when init is a sklearn
    # estimator.
    # Check that an error is raised if trying to fit with sample weight but
    # initial estimator does not support sample weight

    X, y = dataset_maker()
    sample_weight = np.random.RandomState(global_random_seed).rand(100)

    # init supports sample weights
    init_est = init_estimator()
    gb(init=init_est).fit(X, y, sample_weight=sample_weight)

    # init does not support sample weights
    init_est = NoSampleWeightWrapper(init_estimator())
    gb(init=init_est).fit(X, y)  # ok no sample weights
    with pytest.raises(ValueError, match="estimator.*does not support sample weights"):
        gb(init=init_est).fit(X, y, sample_weight=sample_weight)


def test_gradient_boosting_with_init_pipeline():
    # Check that the init estimator can be a pipeline (see issue #13466)

    X, y = make_regression(random_state=0)
    init = make_pipeline(LinearRegression())
    gb = GradientBoostingRegressor(init=init)
    gb.fit(X, y)  # pipeline without sample_weight works fine

    with pytest.raises(
        ValueError,
        match="The initial estimator Pipeline does not support sample weights",
    ):
        gb.fit(X, y, sample_weight=np.ones(X.shape[0]))

    # Passing sample_weight to a pipeline raises a ValueError. This test makes
    # sure we make the distinction between ValueError raised by a pipeline that
    # was passed sample_weight, and a InvalidParameterError raised by a regular
    # estimator whose input checking failed.
    invalid_nu = 1.5
    err_msg = (
        "The 'nu' parameter of NuSVR must be a float in the"
        f" range (0.0, 1.0]. Got {invalid_nu} instead."
    )
    with pytest.raises(InvalidParameterError, match=re.escape(err_msg)):
        # Note that NuSVR properly supports sample_weight
        init = NuSVR(gamma="auto", nu=invalid_nu)
        gb = GradientBoostingRegressor(init=init)
        gb.fit(X, y, sample_weight=np.ones(X.shape[0]))


def test_early_stopping_n_classes():
    # when doing early stopping (_, , y_train, _ = train_test_split(X, y))
    # there might be classes in y that are missing in y_train. As the init
    # estimator will be trained on y_train, we need to raise an error if this
    # happens.

    X = [[1]] * 10
    y = [0, 0] + [1] * 8  # only 2 negative class over 10 samples
    gb = GradientBoostingClassifier(
        n_iter_no_change=5, random_state=0, validation_fraction=0.8
    )
    with pytest.raises(
        ValueError, match="The training data after the early stopping split"
    ):
        gb.fit(X, y)

    # No error if we let training data be big enough
    gb = GradientBoostingClassifier(
        n_iter_no_change=5, random_state=0, validation_fraction=0.4
    )


def test_gbr_degenerate_feature_importances():
    # growing an ensemble of single node trees. See #13620
    X = np.zeros((10, 10))
    y = np.ones((10,))
    gbr = GradientBoostingRegressor().fit(X, y)
    assert_array_equal(gbr.feature_importances_, np.zeros(10, dtype=np.float64))


def test_huber_vs_mean_and_median():
    """Check that huber lies between absolute and squared error."""
    n_rep = 100
    n_samples = 10
    y = np.tile(np.arange(n_samples), n_rep)
    x1 = np.minimum(y, n_samples / 2)
    x2 = np.minimum(-y, -n_samples / 2)
    X = np.c_[x1, x2]

    rng = np.random.RandomState(42)
    # We want an asymmetric distribution.
    y = y + rng.exponential(scale=1, size=y.shape)

    gbt_absolute_error = GradientBoostingRegressor(loss="absolute_error").fit(X, y)
    gbt_huber = GradientBoostingRegressor(loss="huber").fit(X, y)
    gbt_squared_error = GradientBoostingRegressor().fit(X, y)

    gbt_huber_predictions = gbt_huber.predict(X)
    assert np.all(gbt_absolute_error.predict(X) <= gbt_huber_predictions)
    assert np.all(gbt_huber_predictions <= gbt_squared_error.predict(X))


def test_safe_divide():
    """Test that _safe_divide handles division by zero."""
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        assert _safe_divide(np.float64(1e300), 0) == 0
        assert _safe_divide(np.float64(0.0), np.float64(0.0)) == 0
    with pytest.warns(RuntimeWarning, match="overflow"):
        # np.finfo(float).max = 1.7976931348623157e+308
        _safe_divide(np.float64(1e300), 1e-10)


def test_squared_error_exact_backward_compat():
    """Test squared error GBT backward compat on a simple dataset.

    The results to compare against are taken from scikit-learn v1.2.0.
    """
    n_samples = 10
    y = np.arange(n_samples)
    x1 = np.minimum(y, n_samples / 2)
    x2 = np.minimum(-y, -n_samples / 2)
    X = np.c_[x1, x2]
    gbt = GradientBoostingRegressor(loss="squared_error", n_estimators=100).fit(X, y)

    pred_result = np.array(
        [
            1.39245726e-04,
            1.00010468e00,
            2.00007043e00,
            3.00004051e00,
            4.00000802e00,
            4.99998972e00,
            5.99996312e00,
            6.99993395e00,
            7.99989372e00,
            8.99985660e00,
        ]
    )
    assert_allclose(gbt.predict(X), pred_result, rtol=1e-8)

    train_score = np.array(
        [
            4.87246390e-08,
            3.95590036e-08,
            3.21267865e-08,
            2.60970300e-08,
            2.11820178e-08,
            1.71995782e-08,
            1.39695549e-08,
            1.13391770e-08,
            9.19931587e-09,
            7.47000575e-09,
        ]
    )
    assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-8)

    # Same but with sample_weights
    sample_weights = np.tile([1, 10], n_samples // 2)
    gbt = GradientBoostingRegressor(loss="squared_error", n_estimators=100).fit(
        X, y, sample_weight=sample_weights
    )

    pred_result = np.array(
        [
            1.52391462e-04,
            1.00011168e00,
            2.00007724e00,
            3.00004638e00,
            4.00001302e00,
            4.99999873e00,
            5.99997093e00,
            6.99994329e00,
            7.99991290e00,
            8.99988727e00,
        ]
    )
    assert_allclose(gbt.predict(X), pred_result, rtol=1e-6, atol=1e-5)

    train_score = np.array(
        [
            4.12445296e-08,
            3.34418322e-08,
            2.71151383e-08,
            2.19782469e-08,
            1.78173649e-08,
            1.44461976e-08,
            1.17120123e-08,
            9.49485678e-09,
            7.69772505e-09,
            6.24155316e-09,
        ]
    )
    assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-3, atol=1e-11)


@skip_if_32bit
def test_huber_exact_backward_compat():
    """Test huber GBT backward compat on a simple dataset.

    The results to compare against are taken from scikit-learn v1.2.0.
    """
    n_samples = 10
    y = np.arange(n_samples)
    x1 = np.minimum(y, n_samples / 2)
    x2 = np.minimum(-y, -n_samples / 2)
    X = np.c_[x1, x2]
    gbt = GradientBoostingRegressor(loss="huber", n_estimators=100, alpha=0.8).fit(X, y)

    assert_allclose(gbt._loss.closs.delta, 0.0001655688041282133)

    pred_result = np.array(
        [
            1.48120765e-04,
            9.99949174e-01,
            2.00116957e00,
            2.99986716e00,
            4.00012064e00,
            5.00002462e00,
            5.99998898e00,
            6.99692549e00,
            8.00006356e00,
            8.99985099e00,
        ]
    )
    assert_allclose(gbt.predict(X), pred_result, rtol=1e-8)

    train_score = np.array(
        [
            2.59484709e-07,
            2.19165900e-07,
            1.89644782e-07,
            1.64556454e-07,
            1.38705110e-07,
            1.20373736e-07,
            1.04746082e-07,
            9.13835687e-08,
            8.20245756e-08,
            7.17122188e-08,
        ]
    )
    assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-8)


def test_binomial_error_exact_backward_compat():
    """Test binary log_loss GBT backward compat on a simple dataset.

    The results to compare against are taken from scikit-learn v1.2.0.
    """
    n_samples = 10
    y = np.arange(n_samples) % 2
    x1 = np.minimum(y, n_samples / 2)
    x2 = np.minimum(-y, -n_samples / 2)
    X = np.c_[x1, x2]
    gbt = GradientBoostingClassifier(loss="log_loss", n_estimators=100).fit(X, y)

    pred_result = np.array(
        [
            [9.99978098e-01, 2.19017313e-05],
            [2.19017313e-05, 9.99978098e-01],
            [9.99978098e-01, 2.19017313e-05],
            [2.19017313e-05, 9.99978098e-01],
            [9.99978098e-01, 2.19017313e-05],
            [2.19017313e-05, 9.99978098e-01],
            [9.99978098e-01, 2.19017313e-05],
            [2.19017313e-05, 9.99978098e-01],
            [9.99978098e-01, 2.19017313e-05],
            [2.19017313e-05, 9.99978098e-01],
        ]
    )
    assert_allclose(gbt.predict_proba(X), pred_result, rtol=1e-8)

    train_score = np.array(
        [
            1.07742210e-04,
            9.74889078e-05,
            8.82113863e-05,
            7.98167784e-05,
            7.22210566e-05,
            6.53481907e-05,
            5.91293869e-05,
            5.35023988e-05,
            4.84109045e-05,
            4.38039423e-05,
        ]
    )
    assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-8)


def test_multinomial_error_exact_backward_compat():
    """Test multiclass log_loss GBT backward compat on a simple dataset.

    The results to compare against are taken from scikit-learn v1.2.0.
    """
    n_samples = 10
    y = np.arange(n_samples) % 4
    x1 = np.minimum(y, n_samples / 2)
    x2 = np.minimum(-y, -n_samples / 2)
    X = np.c_[x1, x2]
    gbt = GradientBoostingClassifier(loss="log_loss", n_estimators=100).fit(X, y)

    pred_result = np.array(
        [
            [9.99999727e-01, 1.11956255e-07, 8.04921671e-08, 8.04921668e-08],
            [1.11956254e-07, 9.99999727e-01, 8.04921671e-08, 8.04921668e-08],
            [1.19417637e-07, 1.19417637e-07, 9.99999675e-01, 8.60526098e-08],
            [1.19417637e-07, 1.19417637e-07, 8.60526088e-08, 9.99999675e-01],
            [9.99999727e-01, 1.11956255e-07, 8.04921671e-08, 8.04921668e-08],
            [1.11956254e-07, 9.99999727e-01, 8.04921671e-08, 8.04921668e-08],
            [1.19417637e-07, 1.19417637e-07, 9.99999675e-01, 8.60526098e-08],
            [1.19417637e-07, 1.19417637e-07, 8.60526088e-08, 9.99999675e-01],
            [9.99999727e-01, 1.11956255e-07, 8.04921671e-08, 8.04921668e-08],
            [1.11956254e-07, 9.99999727e-01, 8.04921671e-08, 8.04921668e-08],
        ]
    )
    assert_allclose(gbt.predict_proba(X), pred_result, rtol=1e-8)

    train_score = np.array(
        [
            1.13300150e-06,
            9.75183397e-07,
            8.39348103e-07,
            7.22433588e-07,
            6.21804338e-07,
            5.35191943e-07,
            4.60643966e-07,
            3.96479930e-07,
            3.41253434e-07,
            2.93719550e-07,
        ]
    )
    assert_allclose(gbt.train_score_[-10:], train_score, rtol=1e-8)


def test_gb_denominator_zero(global_random_seed):
    """Test _update_terminal_regions denominator is not zero.

    For instance for log loss based binary classification, the line search step might
    become nan/inf as denominator = hessian = prob * (1 - prob) and prob = 0 or 1 can
    happen.
    Here, we create a situation were this happens (at least with roughly 80%) based
    on the random seed.
    """
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=20)

    params = {
        "learning_rate": 1.0,
        "subsample": 0.5,
        "n_estimators": 100,
        "max_leaf_nodes": 4,
        "max_depth": None,
        "random_state": global_random_seed,
        "min_samples_leaf": 2,
    }

    clf = GradientBoostingClassifier(**params)
    # _safe_devide would raise a RuntimeWarning
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        clf.fit(X, y)