706 lines
25 KiB
Python
706 lines
25 KiB
Python
|
"""Testing for the boost module (sklearn.ensemble.boost)."""
|
||
|
|
||
|
import re
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from sklearn import datasets
|
||
|
from sklearn.base import BaseEstimator, clone
|
||
|
from sklearn.dummy import DummyClassifier, DummyRegressor
|
||
|
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
|
||
|
from sklearn.ensemble._weight_boosting import _samme_proba
|
||
|
from sklearn.linear_model import LinearRegression
|
||
|
from sklearn.model_selection import GridSearchCV, train_test_split
|
||
|
from sklearn.svm import SVC, SVR
|
||
|
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||
|
from sklearn.utils import shuffle
|
||
|
from sklearn.utils._mocking import NoSampleWeightWrapper
|
||
|
from sklearn.utils._testing import (
|
||
|
assert_allclose,
|
||
|
assert_array_almost_equal,
|
||
|
assert_array_equal,
|
||
|
assert_array_less,
|
||
|
)
|
||
|
from sklearn.utils.fixes import (
|
||
|
COO_CONTAINERS,
|
||
|
CSC_CONTAINERS,
|
||
|
CSR_CONTAINERS,
|
||
|
DOK_CONTAINERS,
|
||
|
LIL_CONTAINERS,
|
||
|
)
|
||
|
|
||
|
# Common random state
rng = np.random.RandomState(0)

# Toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y_class = ["foo", "foo", "foo", 1, 1, 1]  # test string class labels
y_regr = [-1, -1, -1, 1, 1, 1]
T = [[-1, -1], [2, 2], [3, 2]]
y_t_class = ["foo", 1, 1]
y_t_regr = [-1, 1, 1]

# Load the iris dataset and randomly permute it
iris = datasets.load_iris()
# NOTE(review): `perm` is never read, but the permutation call advances
# `rng`'s internal state; removing it would change the result of every
# shuffle below and silently alter the fixtures all tests depend on.
perm = rng.permutation(iris.target.size)
iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng)

# Load the diabetes dataset and randomly permute it
diabetes = datasets.load_diabetes()
diabetes.data, diabetes.target = shuffle(
    diabetes.data, diabetes.target, random_state=rng
)
|
||
|
|
||
|
|
||
|
def test_samme_proba():
    """Sanity-check the `_samme_proba` helper on degenerate probabilities."""
    # Deliberately pathological `predict_proba` output: zeros, negatives,
    # and rows that do not sum to one.
    raw_probas = np.array(
        [[1, 1e-6, 0], [0.19, 0.6, 0.2], [-999, 0.51, 0.5], [1e-6, 1, 1e-9]]
    )
    raw_probas /= np.abs(raw_probas.sum(axis=1))[:, np.newaxis]

    # _samme_proba calls estimator.predict_proba; stub it out so the exact
    # returned values are under the test's control.
    class StubEstimator:
        def predict_proba(self, X):
            assert_array_equal(X.shape, raw_probas.shape)
            return raw_probas

    result = _samme_proba(StubEstimator(), 3, np.ones_like(raw_probas))

    # Shape is preserved and no NaN/inf leaks out despite the bad input.
    assert_array_equal(result.shape, raw_probas.shape)
    assert np.isfinite(result).all()

    # The transform must preserve per-row ordering: the smallest and the
    # largest entry of each example stay in the same column.
    assert_array_equal(np.argmin(result, axis=1), [2, 0, 0, 2])
    assert_array_equal(np.argmax(result, axis=1), [0, 1, 1, 1])
|
||
|
|
||
|
|
||
|
def test_oneclass_adaboost_proba():
    """predict_proba must be robust when only one class label is present.

    In response to issue #7501
    https://github.com/scikit-learn/scikit-learn/issues/7501
    """
    single_class_y = np.ones(len(X))
    model = AdaBoostClassifier(algorithm="SAMME").fit(X, single_class_y)
    # With a single class, every probability is trivially 1.
    expected = np.ones((len(X), 1))
    assert_array_almost_equal(model.predict_proba(X), expected)
|
||
|
|
||
|
|
||
|
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_classification_toy(algorithm):
    """Fit AdaBoost on the toy problem and check its basic outputs."""
    clf = AdaBoostClassifier(algorithm=algorithm, random_state=0)
    clf.fit(X, y_class)

    # Predictions and the learned label set match the toy ground truth.
    assert_array_equal(clf.predict(T), y_t_class)
    assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)

    # Binary problem: two probability columns, 1-D decision values.
    n_test = len(T)
    assert clf.predict_proba(T).shape == (n_test, 2)
    assert clf.decision_function(T).shape == (n_test,)
|
||
|
|
||
|
|
||
|
def test_regression_toy():
    # Check regression on a toy dataset (the previous comment said
    # "classification", but this exercises AdaBoostRegressor).
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(X, y_regr)
    assert_array_equal(clf.predict(T), y_t_regr)
|
||
|
|
||
|
|
||
|
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)
    # Keep a reference to the SAMME fit and its probabilities so the two
    # algorithms can be compared after the loop.
    clf_samme = prob_samme = None

    for alg in ["SAMME", "SAMME.R"]:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            clf_samme = clf
            prob_samme = proba
        # One column per class for both probabilities and decision values.
        assert proba.shape[1] == len(classes)
        assert clf.decision_function(iris.data).shape[1] == len(classes)

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % (alg, score)

        # Check we used multiple estimators
        assert len(clf.estimators_) > 1
        # Check for distinct random states (see issue #7408)
        assert len(set(est.random_state for est in clf.estimators_)) == len(
            clf.estimators_
        )

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    # Flipping the algorithm attribute on the already-fitted SAMME model must
    # change every probability, proving the two prediction paths differ.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0, np.abs(clf_samme.predict_proba(iris.data) - prob_samme))
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("loss", ["linear", "square", "exponential"])
def test_diabetes(loss):
    """Check AdaBoostRegressor consistency on the diabetes dataset."""
    reg = AdaBoostRegressor(loss=loss, random_state=0)
    reg.fit(diabetes.data, diabetes.target)
    assert reg.score(diabetes.data, diabetes.target) > 0.55

    # The ensemble should actually boost: more than one weak learner ...
    assert len(reg.estimators_) > 1
    # ... each with its own random state (see issue #7408).
    distinct_states = {est.random_state for est in reg.estimators_}
    assert len(distinct_states) == len(reg.estimators_)
|
||
|
|
||
|
|
||
|
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_staged_predict(algorithm):
    """Check that staged_* generators agree with the final predictions."""
    rng = np.random.RandomState(0)
    iris_weights = rng.randint(10, size=iris.target.shape)
    diabetes_weights = rng.randint(10, size=diabetes.target.shape)

    # AdaBoost classification
    clf = AdaBoostClassifier(algorithm=algorithm, n_estimators=10)
    clf.fit(iris.data, iris.target, sample_weight=iris_weights)

    predictions = clf.predict(iris.data)
    # list(...) instead of [p for p in ...]: an identity comprehension is
    # just a slower spelling of the constructor (ruff C416/PERF402).
    staged_predictions = list(clf.staged_predict(iris.data))
    proba = clf.predict_proba(iris.data)
    staged_probas = list(clf.staged_predict_proba(iris.data))
    score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
    staged_scores = list(
        clf.staged_score(iris.data, iris.target, sample_weight=iris_weights)
    )

    # One staged result per boosting round; the last equals the final output.
    assert len(staged_predictions) == 10
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert len(staged_probas) == 10
    assert_array_almost_equal(proba, staged_probas[-1])
    assert len(staged_scores) == 10
    assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10, random_state=0)
    clf.fit(diabetes.data, diabetes.target, sample_weight=diabetes_weights)

    predictions = clf.predict(diabetes.data)
    staged_predictions = list(clf.staged_predict(diabetes.data))
    score = clf.score(diabetes.data, diabetes.target, sample_weight=diabetes_weights)
    staged_scores = list(
        clf.staged_score(
            diabetes.data, diabetes.target, sample_weight=diabetes_weights
        )
    )

    assert len(staged_predictions) == 10
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert len(staged_scores) == 10
    assert_array_almost_equal(score, staged_scores[-1])
|
||
|
|
||
|
|
||
|
def test_gridsearch():
    """Grid-search over AdaBoost and nested base-tree hyper-parameters."""
    # AdaBoost classification
    param_grid = {
        "n_estimators": (1, 2),
        "estimator__max_depth": (1, 2),
        "algorithm": ("SAMME", "SAMME.R"),
    }
    search = GridSearchCV(
        AdaBoostClassifier(estimator=DecisionTreeClassifier()), param_grid
    )
    search.fit(iris.data, iris.target)

    # AdaBoost regression
    param_grid = {"n_estimators": (1, 2), "estimator__max_depth": (1, 2)}
    search = GridSearchCV(
        AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=0),
        param_grid,
    )
    search.fit(diabetes.data, diabetes.target)
|
||
|
|
||
|
|
||
|
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
def test_pickle():
    """Check that fitted AdaBoost estimators survive a pickle round-trip."""
    import pickle

    # Adaboost classifier
    for alg in ["SAMME", "SAMME.R"]:
        obj = AdaBoostClassifier(algorithm=alg)
        obj.fit(iris.data, iris.target)
        score = obj.score(iris.data, iris.target)
        obj2 = pickle.loads(pickle.dumps(obj))
        # `is` rather than `==` for the exact-class check: class identity,
        # not equality, is what is being asserted here.
        assert type(obj2) is obj.__class__
        # The unpickled model must score identically to the original.
        assert obj2.score(iris.data, iris.target) == score

    # Adaboost regressor
    obj = AdaBoostRegressor(random_state=0)
    obj.fit(diabetes.data, diabetes.target)
    score = obj.score(diabetes.data, diabetes.target)
    obj2 = pickle.loads(pickle.dumps(obj))
    assert type(obj2) is obj.__class__
    assert obj2.score(diabetes.data, diabetes.target) == score
|
||
|
|
||
|
|
||
|
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
def test_importances():
    """Informative features must dominate the feature importances."""
    # With shuffle=False the first three columns are the informative ones.
    X, y = datasets.make_classification(
        n_samples=2000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=1,
    )

    for algorithm in ["SAMME", "SAMME.R"]:
        model = AdaBoostClassifier(algorithm=algorithm)
        model.fit(X, y)
        imp = model.feature_importances_

        assert imp.shape[0] == 10
        # Broadcast so each informative column is compared against every
        # uninformative one.
        assert (imp[:3, np.newaxis] >= imp[3:]).all()
|
||
|
|
||
|
|
||
|
def test_adaboost_classifier_sample_weight_error():
    """Fitting with a mis-shaped sample_weight must raise a clear error."""
    clf = AdaBoostClassifier()
    expected = re.escape("sample_weight.shape == (1,), expected (6,)")
    with pytest.raises(ValueError, match=expected):
        clf.fit(X, y_class, sample_weight=np.asarray([-1]))
|
||
|
|
||
|
|
||
|
def test_estimator():
    """AdaBoost should accept a variety of base estimators."""
    from sklearn.ensemble import RandomForestClassifier

    # XXX doesn't work with y_class because RF doesn't support classes_
    # Shouldn't AdaBoost run a LabelBinarizer?
    ensemble = AdaBoostClassifier(RandomForestClassifier(), algorithm="SAMME")
    ensemble.fit(X, y_regr)

    ensemble = AdaBoostClassifier(SVC(), algorithm="SAMME")
    ensemble.fit(X, y_class)

    from sklearn.ensemble import RandomForestRegressor

    ensemble = AdaBoostRegressor(RandomForestRegressor(), random_state=0)
    ensemble.fit(X, y_regr)

    ensemble = AdaBoostRegressor(SVR(), random_state=0)
    ensemble.fit(X, y_regr)

    # Check that an empty discrete ensemble fails in fit, not predict.
    X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]]
    y_fail = ["foo", "bar", 1, 2]
    ensemble = AdaBoostClassifier(SVC(), algorithm="SAMME")
    with pytest.raises(ValueError, match="worse than random"):
        ensemble.fit(X_fail, y_fail)
|
||
|
|
||
|
|
||
|
def test_sample_weights_infinite():
    """An extreme learning rate drives the boosting weights to infinity."""
    clf = AdaBoostClassifier(n_estimators=30, learning_rate=23.0, algorithm="SAMME")
    with pytest.warns(UserWarning, match="Sample weights have reached infinite values"):
        clf.fit(iris.data, iris.target)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "sparse_container, expected_internal_type",
    zip(
        [
            *CSC_CONTAINERS,
            *CSR_CONTAINERS,
            *LIL_CONTAINERS,
            *COO_CONTAINERS,
            *DOK_CONTAINERS,
        ],
        CSC_CONTAINERS + 4 * CSR_CONTAINERS,
    ),
)
def test_sparse_classification(sparse_container, expected_internal_type):
    """Check classification with sparse input.

    Sparse- and dense-trained ensembles must produce identical results, and
    AdaBoost must hand its base estimators the expected internal format.
    """

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(
        n_classes=1, n_samples=15, n_features=5, random_state=42
    )
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    X_train_sparse = sparse_container(X_train)
    X_test_sparse = sparse_container(X_test)

    # Trained on sparse format
    sparse_classifier = AdaBoostClassifier(
        estimator=CustomSVC(probability=True),
        random_state=1,
        algorithm="SAMME",
    ).fit(X_train_sparse, y_train)

    # Trained on dense format
    dense_classifier = AdaBoostClassifier(
        estimator=CustomSVC(probability=True),
        random_state=1,
        algorithm="SAMME",
    ).fit(X_train, y_train)

    # predict
    sparse_clf_results = sparse_classifier.predict(X_test_sparse)
    dense_clf_results = dense_classifier.predict(X_test)
    assert_array_equal(sparse_clf_results, dense_clf_results)

    # decision_function
    sparse_clf_results = sparse_classifier.decision_function(X_test_sparse)
    dense_clf_results = dense_classifier.decision_function(X_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)

    # predict_log_proba
    sparse_clf_results = sparse_classifier.predict_log_proba(X_test_sparse)
    dense_clf_results = dense_classifier.predict_log_proba(X_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)

    # predict_proba
    sparse_clf_results = sparse_classifier.predict_proba(X_test_sparse)
    dense_clf_results = dense_classifier.predict_proba(X_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)

    # score
    sparse_clf_results = sparse_classifier.score(X_test_sparse, y_test)
    dense_clf_results = dense_classifier.score(X_test, y_test)
    assert_array_almost_equal(sparse_clf_results, dense_clf_results)

    # staged_decision_function
    sparse_clf_results = sparse_classifier.staged_decision_function(X_test_sparse)
    dense_clf_results = dense_classifier.staged_decision_function(X_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_almost_equal(sparse_clf_res, dense_clf_res)

    # staged_predict
    sparse_clf_results = sparse_classifier.staged_predict(X_test_sparse)
    dense_clf_results = dense_classifier.staged_predict(X_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_equal(sparse_clf_res, dense_clf_res)

    # staged_predict_proba
    sparse_clf_results = sparse_classifier.staged_predict_proba(X_test_sparse)
    dense_clf_results = dense_classifier.staged_predict_proba(X_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_almost_equal(sparse_clf_res, dense_clf_res)

    # staged_score
    sparse_clf_results = sparse_classifier.staged_score(X_test_sparse, y_test)
    dense_clf_results = dense_classifier.staged_score(X_test, y_test)
    for sparse_clf_res, dense_clf_res in zip(sparse_clf_results, dense_clf_results):
        assert_array_equal(sparse_clf_res, dense_clf_res)

    # Verify sparsity of data is maintained during training
    types = [i.data_type_ for i in sparse_classifier.estimators_]

    # Generator expression instead of a materialized list inside all()
    # (ruff C419): same result, no throwaway list.
    assert all(t == expected_internal_type for t in types)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "sparse_container, expected_internal_type",
    zip(
        [
            *CSC_CONTAINERS,
            *CSR_CONTAINERS,
            *LIL_CONTAINERS,
            *COO_CONTAINERS,
            *DOK_CONTAINERS,
        ],
        CSC_CONTAINERS + 4 * CSR_CONTAINERS,
    ),
)
def test_sparse_regression(sparse_container, expected_internal_type):
    """Check regression with sparse input.

    Sparse- and dense-trained ensembles must predict identically, and
    AdaBoost must hand its base estimators the expected internal format.
    """

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(
        n_samples=15, n_features=50, n_targets=1, random_state=42
    )

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    X_train_sparse = sparse_container(X_train)
    X_test_sparse = sparse_container(X_test)

    # Trained on sparse format
    sparse_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit(
        X_train_sparse, y_train
    )

    # Trained on dense format
    dense_regressor = AdaBoostRegressor(estimator=CustomSVR(), random_state=1).fit(
        X_train, y_train
    )

    # predict
    sparse_regr_results = sparse_regressor.predict(X_test_sparse)
    dense_regr_results = dense_regressor.predict(X_test)
    assert_array_almost_equal(sparse_regr_results, dense_regr_results)

    # staged_predict
    sparse_regr_results = sparse_regressor.staged_predict(X_test_sparse)
    dense_regr_results = dense_regressor.staged_predict(X_test)
    for sparse_regr_res, dense_regr_res in zip(sparse_regr_results, dense_regr_results):
        assert_array_almost_equal(sparse_regr_res, dense_regr_res)

    # Verify sparsity of data is maintained during training
    types = [i.data_type_ for i in sparse_regressor.estimators_]

    # Generator expression instead of a materialized list inside all()
    # (ruff C419): same result, no throwaway list.
    assert all(t == expected_internal_type for t in types)
|
||
|
|
||
|
|
||
|
def test_sample_weight_adaboost_regressor():
    """
    AdaBoostRegressor must work even when the base estimator does not
    accept sample_weight: the random weighted sampling happens internally
    in AdaBoostRegressor._boost.
    """

    class WeightlessEstimator(BaseEstimator):
        # Minimal regressor API: fit without sample_weight, constant predict.
        def fit(self, X, y):
            pass

        def predict(self, X):
            return np.zeros(X.shape[0])

    booster = AdaBoostRegressor(WeightlessEstimator(), n_estimators=3)
    booster.fit(X, y_regr)
    # Fitting succeeded and the bookkeeping arrays stay in sync.
    assert len(booster.estimator_weights_) == len(booster.estimator_errors_)
|
||
|
|
||
|
|
||
|
def test_multidimensional_X():
    """AdaBoost estimators accept an n-dimensional data matrix."""
    rng = np.random.RandomState(0)

    # 3-D inputs with matching classification and regression targets.
    X = rng.randn(51, 3, 3)
    yc = rng.choice([0, 1], 51)
    yr = rng.randn(51)

    clf = AdaBoostClassifier(
        DummyClassifier(strategy="most_frequent"), algorithm="SAMME"
    )
    clf.fit(X, yc)
    clf.predict(X)
    clf.predict_proba(X)

    reg = AdaBoostRegressor(DummyRegressor())
    reg.fit(X, yr)
    reg.predict(X)
|
||
|
|
||
|
|
||
|
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboostclassifier_without_sample_weight(algorithm):
    """A base estimator that rejects sample_weight must fail loudly at fit."""
    X, y = iris.data, iris.target
    estimator = NoSampleWeightWrapper(DummyClassifier())
    clf = AdaBoostClassifier(estimator=estimator, algorithm=algorithm)
    # f-string instead of str.format: same message, modern idiom.
    err_msg = f"{estimator.__class__.__name__} doesn't support sample_weight"
    with pytest.raises(ValueError, match=err_msg):
        clf.fit(X, y)
|
||
|
|
||
|
|
||
|
def test_adaboostregressor_sample_weight():
    """Sample weights must influence the weak learners' error computation:
    zero-weighting an outlier should be equivalent to removing it."""
    rng = np.random.RandomState(42)
    X = np.linspace(0, 100, num=1000)
    y = (0.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001)
    X = X.reshape(-1, 1)

    # Plant a single extreme outlier at the end of the data.
    X[-1] *= 10
    y[-1] = 10000

    # random_state=0 ensure that the underlying bootstrap will use the outlier
    regr_no_outlier = AdaBoostRegressor(
        estimator=LinearRegression(), n_estimators=1, random_state=0
    )
    regr_with_weight = clone(regr_no_outlier)
    regr_with_outlier = clone(regr_no_outlier)

    # Three fits: outlier included, outlier excluded, and outlier included
    # but carrying a null sample-weight.
    regr_with_outlier.fit(X, y)
    regr_no_outlier.fit(X[:-1], y[:-1])
    sample_weight = np.ones_like(y)
    sample_weight[-1] = 0
    regr_with_weight.fit(X, y, sample_weight=sample_weight)

    # Score everything on the clean (outlier-free) portion.
    score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1])
    score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1])
    score_with_weight = regr_with_weight.score(X[:-1], y[:-1])

    # The outlier hurts; a zero weight neutralises it completely.
    assert score_with_outlier < score_no_outlier
    assert score_with_outlier < score_with_weight
    assert score_no_outlier == pytest.approx(score_with_weight)
|
||
|
|
||
|
|
||
|
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboost_consistent_predict(algorithm):
    """predict must agree with the argmax of predict_proba.

    Regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14084
    """
    X_train, X_test, y_train, y_test = train_test_split(
        *datasets.load_digits(return_X_y=True), random_state=42
    )
    model = AdaBoostClassifier(algorithm=algorithm, random_state=42)
    model.fit(X_train, y_train)

    proba_argmax = np.argmax(model.predict_proba(X_test), axis=1)
    assert_array_equal(proba_argmax, model.predict(X_test))
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "model, X, y",
    [
        (AdaBoostClassifier(), iris.data, iris.target),
        (AdaBoostRegressor(), diabetes.data, diabetes.target),
    ],
)
def test_adaboost_negative_weight_error(model, X, y):
    """A negative sample_weight entry must raise a ValueError at fit time."""
    sample_weight = np.ones_like(y)
    sample_weight[-1] = -10

    with pytest.raises(
        ValueError, match="Negative values in data passed to `sample_weight`"
    ):
        model.fit(X, y, sample_weight=sample_weight)
|
||
|
|
||
|
|
||
|
def test_adaboost_numerically_stable_feature_importance_with_small_weights():
    """Check that we don't create NaN feature importance with numerically
    instable inputs.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20320
    """
    rng = np.random.RandomState(42)
    X = rng.normal(size=(1000, 10))
    y = rng.choice([0, 1], size=1000)
    # Tiny weights exercise the underflow-prone code path.
    sample_weight = np.ones_like(y) * 1e-263

    base_tree = DecisionTreeClassifier(max_depth=10, random_state=12)
    model = AdaBoostClassifier(
        estimator=base_tree, n_estimators=20, algorithm="SAMME", random_state=12
    )
    model.fit(X, y, sample_weight=sample_weight)
    assert not np.isnan(model.feature_importances_).any()
|
||
|
|
||
|
|
||
|
# TODO(1.6): remove "@pytest.mark.filterwarnings" as SAMME.R will be removed
# and substituted with the SAMME algorithm as a default; also re-write test to
# only consider "SAMME"
@pytest.mark.filterwarnings("ignore:The SAMME.R algorithm")
@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboost_decision_function(algorithm, global_random_seed):
    """Check that the decision function respects the symmetric constraint for weak
    learners.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/26520
    """
    n_classes = 3
    X, y = datasets.make_classification(
        n_classes=n_classes, n_clusters_per_class=1, random_state=global_random_seed
    )
    clf = AdaBoostClassifier(
        n_estimators=1, random_state=global_random_seed, algorithm=algorithm
    ).fit(X, y)

    def check_symmetric(scores):
        # Per-sample scores must sum to zero (symmetric constraint).
        assert_allclose(scores.sum(axis=1), 0, atol=1e-8)

    def check_single_learner_values(scores):
        # With a single learner, we expect to have a decision function in
        # {1, - 1 / (n_classes - 1)}.
        assert set(np.unique(scores)) == {1, -1 / (n_classes - 1)}

    y_score = clf.decision_function(X)
    check_symmetric(y_score)
    if algorithm == "SAMME":
        check_single_learner_values(y_score)

    # We can assert the same for staged_decision_function since we have a single learner
    for y_score in clf.staged_decision_function(X):
        check_symmetric(y_score)
    if algorithm == "SAMME":
        check_single_learner_values(y_score)

    # With several learners only the symmetry constraint still holds.
    clf.set_params(n_estimators=5).fit(X, y)

    y_score = clf.decision_function(X)
    check_symmetric(y_score)

    for y_score in clf.staged_decision_function(X):
        check_symmetric(y_score)
|
||
|
|
||
|
|
||
|
# TODO(1.6): remove
def test_deprecated_samme_r_algorithm():
    """The default SAMME.R algorithm must emit a FutureWarning at fit time."""
    clf = AdaBoostClassifier(n_estimators=1)
    expected = re.escape("The SAMME.R algorithm (the default) is deprecated")
    with pytest.warns(FutureWarning, match=expected):
        clf.fit(X, y_class)
|