"""Testing for the boost module (sklearn.ensemble.boost).""" import re import numpy as np import pytest from sklearn import datasets from sklearn.base import BaseEstimator, clone from sklearn.dummy import DummyClassifier, DummyRegressor from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor from sklearn.ensemble._weight_boosting import _samme_proba from sklearn.linear_model import LinearRegression from sklearn.model_selection import GridSearchCV, train_test_split from sklearn.svm import SVC, SVR from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils import shuffle from sklearn.utils._mocking import NoSampleWeightWrapper from sklearn.utils._testing import ( assert_allclose, assert_array_almost_equal, assert_array_equal, assert_array_less, ) from sklearn.utils.fixes import ( COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS, DOK_CONTAINERS, LIL_CONTAINERS, ) # Common random state rng = np.random.RandomState(0) # Toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y_class = ["foo", "foo", "foo", 1, 1, 1] # test string class labels y_regr = [-1, -1, -1, 1, 1, 1] T = [[-1, -1], [2, 2], [3, 2]] y_t_class = ["foo", 1, 1] y_t_regr = [-1, 1, 1] # Load the iris dataset and randomly permute it iris = datasets.load_iris() perm = rng.permutation(iris.target.size) iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng) # Load the diabetes dataset and randomly permute it diabetes = datasets.load_diabetes() diabetes.data, diabetes.target = shuffle( diabetes.data, diabetes.target, random_state=rng ) def test_samme_proba(): # Test the `_samme_proba` helper function. # Define some example (bad) `predict_proba` output. probs = np.array( [[1, 1e-6, 0], [0.19, 0.6, 0.2], [-999, 0.51, 0.5], [1e-6, 1, 1e-9]] ) probs /= np.abs(probs.sum(axis=1))[:, np.newaxis] # _samme_proba calls estimator.predict_proba. # Make a mock object so I can control what gets returned. class MockEstimator: def predict_proba(self, X): assert_array_equal(X.shape, probs.shape) return probs mock = MockEstimator() samme_proba = _samme_proba(mock, 3, np.ones_like(probs)) assert_array_equal(samme_proba.shape, probs.shape) assert np.isfinite(samme_proba).all() # Make sure that the correct elements come out as smallest -- # `_samme_proba` should preserve the ordering in each example. assert_array_equal(np.argmin(samme_proba, axis=1), [2, 0, 0, 2]) assert_array_equal(np.argmax(samme_proba, axis=1), [0, 1, 1, 1]) def test_oneclass_adaboost_proba(): # Test predict_proba robustness for one class label input. # In response to issue #7501 # https://github.com/scikit-learn/scikit-learn/issues/7501 y_t = np.ones(len(X)) clf = AdaBoostClassifier(algorithm="SAMME").fit(X, y_t) assert_array_almost_equal(clf.predict_proba(X), np.ones((len(X), 1))) # TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed # and substituted with the SAMME algorithm as a default; also re-write test to # only consider "SAMME" @pytest.mark.filterwarnings("ignore:The SAMME.R algorithm") @pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) def test_classification_toy(algorithm): # Check classification on a toy dataset. clf = AdaBoostClassifier(algorithm=algorithm, random_state=0) clf.fit(X, y_class) assert_array_equal(clf.predict(T), y_t_class) assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_) assert clf.predict_proba(T).shape == (len(T), 2) assert clf.decision_function(T).shape == (len(T),) def test_regression_toy(): # Check classification on a toy dataset. clf = AdaBoostRegressor(random_state=0) clf.fit(X, y_regr) assert_array_equal(clf.predict(T), y_t_regr) # TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed # and substituted with the SAMME algorithm as a default; also re-write test to # only consider "SAMME" @pytest.mark.filterwarnings("ignore:The SAMME.R algorithm") def test_iris(): # Check consistency on dataset iris. classes = np.unique(iris.target) clf_samme = prob_samme = None for alg in ["SAMME", "SAMME.R"]: clf = AdaBoostClassifier(algorithm=alg) clf.fit(iris.data, iris.target) assert_array_equal(classes, clf.classes_) proba = clf.predict_proba(iris.data) if alg == "SAMME": clf_samme = clf prob_samme = proba assert proba.shape[1] == len(classes) assert clf.decision_function(iris.data).shape[1] == len(classes) score = clf.score(iris.data, iris.target) assert score > 0.9, "Failed with algorithm %s and score = %f" % (alg, score) # Check we used multiple estimators assert len(clf.estimators_) > 1 # Check for distinct random states (see issue #7408) assert len(set(est.random_state for est in clf.estimators_)) == len( clf.estimators_ ) # Somewhat hacky regression test: prior to # ae7adc880d624615a34bafdb1d75ef67051b8200, # predict_proba returned SAMME.R values for SAMME. clf_samme.algorithm = "SAMME.R" assert_array_less(0, np.abs(clf_samme.predict_proba(iris.data) - prob_samme)) @pytest.mark.parametrize("loss", ["linear", "square", "exponential"]) def test_diabetes(loss): # Check consistency on dataset diabetes. reg = AdaBoostRegressor(loss=loss, random_state=0) reg.fit(diabetes.data, diabetes.target) score = reg.score(diabetes.data, diabetes.target) assert score > 0.55 # Check we used multiple estimators assert len(reg.estimators_) > 1 # Check for distinct random states (see issue #7408) assert len(set(est.random_state for est in reg.estimators_)) == len(reg.estimators_) # TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed # and substituted with the SAMME algorithm as a default; also re-write test to # only consider "SAMME" @pytest.mark.filterwarnings("ignore:The SAMME.R algorithm") @pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) def test_staged_predict(algorithm): # Check staged predictions. rng = np.random.RandomState(0) iris_weights = rng.randint(10, size=iris.target.shape) diabetes_weights = rng.randint(10, size=diabetes.target.shape) clf = AdaBoostClassifier(algorithm=algorithm, n_estimators=10) clf.fit(iris.data, iris.target, sample_weight=iris_weights) predictions = clf.predict(iris.data) staged_predictions = [p for p in clf.staged_predict(iris.data)] proba = clf.predict_proba(iris.data) staged_probas = [p for p in clf.staged_predict_proba(iris.data)] score = clf.score(iris.data, iris.target, sample_weight=iris_weights) staged_scores = [ s for s in clf.staged_score(iris.data, iris.target, sample_weight=iris_weights) ] assert len(staged_predictions) == 10 assert_array_almost_equal(predictions, staged_predictions[-1]) assert len(staged_probas) == 10 assert_array_almost_equal(proba, staged_probas[-1]) assert len(staged_scores) == 10 assert_array_almost_equal(score, staged_scores[-1]) # AdaBoost regression clf = AdaBoostRegressor(n_estimators=10, random_state=0) clf.fit(diabetes.data, diabetes.target, sample_weight=diabetes_weights) predictions = clf.predict(diabetes.data) staged_predictions = [p for p in clf.staged_predict(diabetes.data)] score = clf.score(diabetes.data, diabetes.target, sample_weight=diabetes_weights) staged_scores = [ s for s in clf.staged_score( diabetes.data, diabetes.target, sample_weight=diabetes_weights ) ] assert len(staged_predictions) == 10 assert_array_almost_equal(predictions, staged_predictions[-1]) assert len(staged_scores) == 10 assert_array_almost_equal(score, staged_scores[-1]) def test_gridsearch(): # Check that base trees can be grid-searched. # AdaBoost classification boost = AdaBoostClassifier(estimator=DecisionTreeClassifier()) parameters = { "n_estimators": (1, 2), "estimator__max_depth": (1, 2), "algorithm": ("SAMME", "SAMME.R"), } clf = GridSearchCV(boost, parameters) clf.fit(iris.data, iris.target) # AdaBoost regression boost = AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=0) parameters = {"n_estimators": (1, 2), "estimator__max_depth": (1, 2)} clf = GridSearchCV(boost, parameters) clf.fit(diabetes.data, diabetes.target) # TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed # and substituted with the SAMME algorithm as a default; also re-write test to # only consider "SAMME" @pytest.mark.filterwarnings("ignore:The SAMME.R algorithm") def test_pickle(): # Check pickability. import pickle # Adaboost classifier for alg in ["SAMME", "SAMME.R"]: obj = AdaBoostClassifier(algorithm=alg) obj.fit(iris.data, iris.target) score = obj.score(iris.data, iris.target) s = pickle.dumps(obj) obj2 = pickle.loads(s) assert type(obj2) == obj.__class__ score2 = obj2.score(iris.data, iris.target) assert score == score2 # Adaboost regressor obj = AdaBoostRegressor(random_state=0) obj.fit(diabetes.data, diabetes.target) score = obj.score(diabetes.data, diabetes.target) s = pickle.dumps(obj) obj2 = pickle.loads(s) assert type(obj2) == obj.__class__ score2 = obj2.score(diabetes.data, diabetes.target) assert score == score2 # TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed # and substituted with the SAMME algorithm as a default; also re-write test to # only consider "SAMME" @pytest.mark.filterwarnings("ignore:The SAMME.R algorithm") def test_importances(): # Check variable importances. X, y = datasets.make_classification( n_samples=2000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=1, ) for alg in ["SAMME", "SAMME.R"]: clf = AdaBoostClassifier(algorithm=alg) clf.fit(X, y) importances = clf.feature_importances_ assert importances.shape[0] == 10 assert (importances[:3, np.newaxis] >= importances[3:]).all() def test_adaboost_classifier_sample_weight_error(): # Test that it gives proper exception on incorrect sample weight. clf = AdaBoostClassifier() msg = re.escape("sample_weight.shape == (1,), expected (6,)") with pytest.raises(ValueError, match=msg): clf.fit(X, y_class, sample_weight=np.asarray([-1])) def test_estimator(): # Test different estimators. from sklearn.ensemble import RandomForestClassifier # XXX doesn't work with y_class because RF doesn't support classes_ # Shouldn't AdaBoost run a LabelBinarizer? clf = AdaBoostClassifier(RandomForestClassifier(), algorithm="SAMME") clf.fit(X, y_regr) clf = AdaBoostClassifier(SVC(), algorithm="SAMME") clf.fit(X, y_class) from sklearn.ensemble import RandomForestRegressor clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0) clf.fit(X, y_regr) clf = AdaBoostRegressor(SVR(), random_state=0) clf.fit(X, y_regr) # Check that an empty discrete ensemble fails in fit, not predict. X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]] y_fail = ["foo", "bar", 1, 2] clf = AdaBoostClassifier(SVC(), algorithm="SAMME") with pytest.raises(ValueError, match="worse than random"): clf.fit(X_fail, y_fail) def test_sample_weights_infinite(): msg = "Sample weights have reached infinite values" clf = AdaBoostClassifier(n_estimators=30, learning_rate=23.0, algorithm="SAMME") with pytest.warns(UserWarning, match=msg): clf.fit(iris.data, iris.target) @pytest.mark.parametrize( "sparse_container, expected_internal_type", zip( [ *CSC_CONTAINERS, *CSR_CONTAINERS, *LIL_CONTAINERS, *COO_CONTAINERS, *DOK_CONTAINERS, ], CSC_CONTAINERS + 4 * CSR_CONTAINERS, ), ) def test_sparse_classification(sparse_container, expected_internal_type): # Check classification with sparse input. class CustomSVC(SVC): """SVC variant that records the nature of the training set.""" def fit(self, X, y, sample_weight=None): """Modification on fit caries data type for later verification.""" super().fit(X, y, sample_weight=sample_weight) self.data_type_ = type(X) return self X, y = datasets.make_multilabel_classification( n_classes=1, n_samples=15, n_features=5, random_state=42 ) # Flatten y to a 1d array y = np.ravel(y) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) X_train_sparse = sparse_container(X_train) X_test_sparse = sparse_container(X_test) # Trained on sparse format sparse_classifier = AdaBoostClassifier( estimator=CustomSVC(probability=True), random_state=1, algorithm="SAMME", ).fit(X_train_sparse, y_train) # Trained on dense format dense_classifier = AdaBoostClassifier( estimator=CustomSVC(probability=True), random_state=1, algorithm="SAMME", ).fit(X_train, y_train) # predict sparse_clf_results = sparse_classifier.predict(X_test_sparse) dense_clf_results = dense_classifier.predict(X_test) assert_array_equal(sparse_clf_results, dense_clf_results) # decision_function sparse_clf_results = sparse_classifier.decision_function(X_test_sparse) dense_clf_results = dense_classifier.decision_function(X_test) assert_array_almost_equal(sparse_clf_results, dense_clf_results) # predict_log_proba sparse_clf_results = sparse_classifier.predict_log_proba(X_test_sparse) dense_clf_results = dense_classifier.predict_log_proba(X_test) assert_array_almost_equal(sparse_clf_results, dense_clf_results) # predict_proba sparse_clf_results = sparse_classifier.predict_proba(X_test_sparse) dense_clf_results = dense_classifier.predict_proba(X_test) assert_array_almost_equal(sparse_clf_results, dense_clf_results) # score sparse_clf_results = sparse_classifier.score(X_test_sparse, y_test) dense_clf_results = dense_classifier.score(X_test, y_test) assert_array_almost_equal(sparse_clf_results, dense_clf_results) # staged_decision_function sparse_clf_results = sparse_classifier.staged_decision_function(X_test_sparse) dense_clf_results = dense_classifier.staged_decision_function(X_test) for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results): assert_array_almost_equal(sparse_clf_res, dense_clf_res) # staged_predict sparse_clf_results = sparse_classifier.staged_predict(X_test_sparse) dense_clf_results = dense_classifier.staged_predict(X_test) for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results): assert_array_equal(sparse_clf_res, dense_clf_res) # staged_predict_proba sparse_clf_results = sparse_classifier.staged_predict_proba(X_test_sparse) dense_clf_results = dense_classifier.staged_predict_proba(X_test) for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results): assert_array_almost_equal(sparse_clf_res, dense_clf_res) # staged_score sparse_clf_results = sparse_classifier.staged_score(X_test_sparse, y_test) dense_clf_results = dense_classifier.staged_score(X_test, y_test) for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results): assert_array_equal(sparse_clf_res, dense_clf_res) # Verify sparsity of data is maintained during training types = [i.data_type_ for i in sparse_classifier.estimators_] assert all([t == expected_internal_type for t in types]) @pytest.mark.parametrize( "sparse_container, expected_internal_type", zip( [ *CSC_CONTAINERS, *CSR_CONTAINERS, *LIL_CONTAINERS, *COO_CONTAINERS, *DOK_CONTAINERS, ], CSC_CONTAINERS + 4 * CSR_CONTAINERS, ), ) def test_sparse_regression(sparse_container, expected_internal_type): # Check regression with sparse input. class CustomSVR(SVR): """SVR variant that records the nature of the training set.""" def fit(self, X, y, sample_weight=None): """Modification on fit caries data type for later verification.""" super().fit(X, y, sample_weight=sample_weight) self.data_type_ = type(X) return self X, y = datasets.make_regression( n_samples=15, n_features=50, n_targets=1, random_state=42 ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) X_train_sparse = sparse_container(X_train) X_test_sparse = sparse_container(X_test) # Trained on sparse format sparse_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit( X_train_sparse, y_train ) # Trained on dense format dense_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit( X_train, y_train ) # predict sparse_regr_results = sparse_regressor.predict(X_test_sparse) dense_regr_results = dense_regressor.predict(X_test) assert_array_almost_equal(sparse_regr_results, dense_regr_results) # staged_predict sparse_regr_results = sparse_regressor.staged_predict(X_test_sparse) dense_regr_results = dense_regressor.staged_predict(X_test) for sparse_regr_res, dense_regr_res in zip(sparse_regr_results, dense_regr_results): assert_array_almost_equal(sparse_regr_res, dense_regr_res) types = [i.data_type_ for i in sparse_regressor.estimators_] assert all([t == expected_internal_type for t in types]) def test_sample_weight_adaboost_regressor(): """ AdaBoostRegressor should work without sample_weights in the base estimator The random weighted sampling is done internally in the _boost method in AdaBoostRegressor. """ class DummyEstimator(BaseEstimator): def fit(self, X, y): pass def predict(self, X): return np.zeros(X.shape[0]) boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3) boost.fit(X, y_regr) assert len(boost.estimator_weights_) == len(boost.estimator_errors_) def test_multidimensional_X(): """ Check that the AdaBoost estimators can work with n-dimensional data matrix """ rng = np.random.RandomState(0) X = rng.randn(51, 3, 3) yc = rng.choice([0, 1], 51) yr = rng.randn(51) boost = AdaBoostClassifier( DummyClassifier(strategy="most_frequent"), algorithm="SAMME" ) boost.fit(X, yc) boost.predict(X) boost.predict_proba(X) boost = AdaBoostRegressor(DummyRegressor()) boost.fit(X, yr) boost.predict(X) # TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed # and substituted with the SAMME algorithm as a default; also re-write test to # only consider "SAMME" @pytest.mark.filterwarnings("ignore:The SAMME.R algorithm") @pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) def test_adaboostclassifier_without_sample_weight(algorithm): X, y = iris.data, iris.target estimator = NoSampleWeightWrapper(DummyClassifier()) clf = AdaBoostClassifier(estimator=estimator, algorithm=algorithm) err_msg = "{} doesn't support sample_weight".format(estimator.__class__.__name__) with pytest.raises(ValueError, match=err_msg): clf.fit(X, y) def test_adaboostregressor_sample_weight(): # check that giving weight will have an influence on the error computed # for a weak learner rng = np.random.RandomState(42) X = np.linspace(0, 100, num=1000) y = (0.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001) X = X.reshape(-1, 1) # add an arbitrary outlier X[-1] *= 10 y[-1] = 10000 # random_state=0 ensure that the underlying bootstrap will use the outlier regr_no_outlier = AdaBoostRegressor( estimator=LinearRegression(), n_estimators=1, random_state=0 ) regr_with_weight = clone(regr_no_outlier) regr_with_outlier = clone(regr_no_outlier) # fit 3 models: # - a model containing the outlier # - a model without the outlier # - a model containing the outlier but with a null sample-weight regr_with_outlier.fit(X, y) regr_no_outlier.fit(X[:-1], y[:-1]) sample_weight = np.ones_like(y) sample_weight[-1] = 0 regr_with_weight.fit(X, y, sample_weight=sample_weight) score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1]) score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1]) score_with_weight = regr_with_weight.score(X[:-1], y[:-1]) assert score_with_outlier < score_no_outlier assert score_with_outlier < score_with_weight assert score_no_outlier == pytest.approx(score_with_weight) # TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed # and substituted with the SAMME algorithm as a default; also re-write test to # only consider "SAMME" @pytest.mark.filterwarnings("ignore:The SAMME.R algorithm") @pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) def test_adaboost_consistent_predict(algorithm): # check that predict_proba and predict give consistent results # regression test for: # https://github.com/scikit-learn/scikit-learn/issues/14084 X_train, X_test, y_train, y_test = train_test_split( *datasets.load_digits(return_X_y=True), random_state=42 ) model = AdaBoostClassifier(algorithm=algorithm, random_state=42) model.fit(X_train, y_train) assert_array_equal( np.argmax(model.predict_proba(X_test), axis=1), model.predict(X_test) ) @pytest.mark.parametrize( "model, X, y", [ (AdaBoostClassifier(), iris.data, iris.target), (AdaBoostRegressor(), diabetes.data, diabetes.target), ], ) def test_adaboost_negative_weight_error(model, X, y): sample_weight = np.ones_like(y) sample_weight[-1] = -10 err_msg = "Negative values in data passed to `sample_weight`" with pytest.raises(ValueError, match=err_msg): model.fit(X, y, sample_weight=sample_weight) def test_adaboost_numerically_stable_feature_importance_with_small_weights(): """Check that we don't create NaN feature importance with numerically instable inputs. Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/20320 """ rng = np.random.RandomState(42) X = rng.normal(size=(1000, 10)) y = rng.choice([0, 1], size=1000) sample_weight = np.ones_like(y) * 1e-263 tree = DecisionTreeClassifier(max_depth=10, random_state=12) ada_model = AdaBoostClassifier( estimator=tree, n_estimators=20, algorithm="SAMME", random_state=12 ) ada_model.fit(X, y, sample_weight=sample_weight) assert np.isnan(ada_model.feature_importances_).sum() == 0 # TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed # and substituted with the SAMME algorithm as a default; also re-write test to # only consider "SAMME" @pytest.mark.filterwarnings("ignore:The SAMME.R algorithm") @pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"]) def test_adaboost_decision_function(algorithm, global_random_seed): """Check that the decision function respects the symmetric constraint for weak learners. Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/26520 """ n_classes = 3 X, y = datasets.make_classification( n_classes=n_classes, n_clusters_per_class=1, random_state=global_random_seed ) clf = AdaBoostClassifier( n_estimators=1, random_state=global_random_seed, algorithm=algorithm ).fit(X, y) y_score = clf.decision_function(X) assert_allclose(y_score.sum(axis=1), 0, atol=1e-8) if algorithm == "SAMME": # With a single learner, we expect to have a decision function in # {1, - 1 / (n_classes - 1)}. assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)} # We can assert the same for staged_decision_function since we have a single learner for y_score in clf.staged_decision_function(X): assert_allclose(y_score.sum(axis=1), 0, atol=1e-8) if algorithm == "SAMME": # With a single learner, we expect to have a decision function in # {1, - 1 / (n_classes - 1)}. assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)} clf.set_params(n_estimators=5).fit(X, y) y_score = clf.decision_function(X) assert_allclose(y_score.sum(axis=1), 0, atol=1e-8) for y_score in clf.staged_decision_function(X): assert_allclose(y_score.sum(axis=1), 0, atol=1e-8) # TODO(1.6): remove def test_deprecated_samme_r_algorithm(): adaboost_clf = AdaBoostClassifier(n_estimators=1) with pytest.warns( FutureWarning, match=re.escape("The SAMME.R algorithm (the default) is deprecated"), ): adaboost_clf.fit(X, y_class)