263 lines
8.9 KiB
Python
263 lines
8.9 KiB
Python
import numpy as np
|
|
import pytest
|
|
|
|
from sklearn.base import ClassifierMixin, clone, is_classifier
|
|
from sklearn.datasets import (
|
|
load_diabetes,
|
|
load_iris,
|
|
make_classification,
|
|
make_regression,
|
|
)
|
|
from sklearn.ensemble import (
|
|
RandomForestClassifier,
|
|
RandomForestRegressor,
|
|
StackingClassifier,
|
|
StackingRegressor,
|
|
VotingClassifier,
|
|
VotingRegressor,
|
|
)
|
|
from sklearn.impute import SimpleImputer
|
|
from sklearn.linear_model import LinearRegression, LogisticRegression
|
|
from sklearn.pipeline import make_pipeline
|
|
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR
|
|
|
|
X, y = load_iris(return_X_y=True)
|
|
|
|
X_r, y_r = load_diabetes(return_X_y=True)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"X, y, estimator",
|
|
[
|
|
(
|
|
*make_classification(n_samples=10),
|
|
StackingClassifier(
|
|
estimators=[
|
|
("lr", LogisticRegression()),
|
|
("svm", LinearSVC(dual="auto")),
|
|
("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
|
|
],
|
|
cv=2,
|
|
),
|
|
),
|
|
(
|
|
*make_classification(n_samples=10),
|
|
VotingClassifier(
|
|
estimators=[
|
|
("lr", LogisticRegression()),
|
|
("svm", LinearSVC(dual="auto")),
|
|
("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
|
|
]
|
|
),
|
|
),
|
|
(
|
|
*make_regression(n_samples=10),
|
|
StackingRegressor(
|
|
estimators=[
|
|
("lr", LinearRegression()),
|
|
("svm", LinearSVR(dual="auto")),
|
|
("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
|
|
],
|
|
cv=2,
|
|
),
|
|
),
|
|
(
|
|
*make_regression(n_samples=10),
|
|
VotingRegressor(
|
|
estimators=[
|
|
("lr", LinearRegression()),
|
|
("svm", LinearSVR(dual="auto")),
|
|
("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
|
|
]
|
|
),
|
|
),
|
|
],
|
|
ids=[
|
|
"stacking-classifier",
|
|
"voting-classifier",
|
|
"stacking-regressor",
|
|
"voting-regressor",
|
|
],
|
|
)
|
|
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
|
|
# check that the behavior of `estimators`, `estimators_`,
|
|
# `named_estimators`, `named_estimators_` is consistent across all
|
|
# ensemble classes and when using `set_params()`.
|
|
|
|
# before fit
|
|
assert "svm" in estimator.named_estimators
|
|
assert estimator.named_estimators.svm is estimator.estimators[1][1]
|
|
assert estimator.named_estimators.svm is estimator.named_estimators["svm"]
|
|
|
|
# check fitted attributes
|
|
estimator.fit(X, y)
|
|
assert len(estimator.named_estimators) == 3
|
|
assert len(estimator.named_estimators_) == 3
|
|
assert sorted(list(estimator.named_estimators_.keys())) == sorted(
|
|
["lr", "svm", "rf"]
|
|
)
|
|
|
|
# check that set_params() does not add a new attribute
|
|
estimator_new_params = clone(estimator)
|
|
svm_estimator = SVC() if is_classifier(estimator) else SVR()
|
|
estimator_new_params.set_params(svm=svm_estimator).fit(X, y)
|
|
assert not hasattr(estimator_new_params, "svm")
|
|
assert (
|
|
estimator_new_params.named_estimators.lr.get_params()
|
|
== estimator.named_estimators.lr.get_params()
|
|
)
|
|
assert (
|
|
estimator_new_params.named_estimators.rf.get_params()
|
|
== estimator.named_estimators.rf.get_params()
|
|
)
|
|
|
|
# check the behavior when setting an dropping an estimator
|
|
estimator_dropped = clone(estimator)
|
|
estimator_dropped.set_params(svm="drop")
|
|
estimator_dropped.fit(X, y)
|
|
assert len(estimator_dropped.named_estimators) == 3
|
|
assert estimator_dropped.named_estimators.svm == "drop"
|
|
assert len(estimator_dropped.named_estimators_) == 3
|
|
assert sorted(list(estimator_dropped.named_estimators_.keys())) == sorted(
|
|
["lr", "svm", "rf"]
|
|
)
|
|
for sub_est in estimator_dropped.named_estimators_:
|
|
# check that the correspondence is correct
|
|
assert not isinstance(sub_est, type(estimator.named_estimators.svm))
|
|
|
|
# check that we can set the parameters of the underlying classifier
|
|
estimator.set_params(svm__C=10.0)
|
|
estimator.set_params(rf__max_depth=5)
|
|
assert (
|
|
estimator.get_params()["svm__C"]
|
|
== estimator.get_params()["svm"].get_params()["C"]
|
|
)
|
|
assert (
|
|
estimator.get_params()["rf__max_depth"]
|
|
== estimator.get_params()["rf"].get_params()["max_depth"]
|
|
)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"Ensemble",
|
|
[VotingClassifier, StackingRegressor, VotingRegressor],
|
|
)
|
|
def test_ensemble_heterogeneous_estimators_type(Ensemble):
|
|
# check that ensemble will fail during validation if the underlying
|
|
# estimators are not of the same type (i.e. classifier or regressor)
|
|
# StackingClassifier can have an underlying regresor so it's not checked
|
|
if issubclass(Ensemble, ClassifierMixin):
|
|
X, y = make_classification(n_samples=10)
|
|
estimators = [("lr", LinearRegression())]
|
|
ensemble_type = "classifier"
|
|
else:
|
|
X, y = make_regression(n_samples=10)
|
|
estimators = [("lr", LogisticRegression())]
|
|
ensemble_type = "regressor"
|
|
ensemble = Ensemble(estimators=estimators)
|
|
|
|
err_msg = "should be a {}".format(ensemble_type)
|
|
with pytest.raises(ValueError, match=err_msg):
|
|
ensemble.fit(X, y)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"X, y, Ensemble",
|
|
[
|
|
(*make_classification(n_samples=10), StackingClassifier),
|
|
(*make_classification(n_samples=10), VotingClassifier),
|
|
(*make_regression(n_samples=10), StackingRegressor),
|
|
(*make_regression(n_samples=10), VotingRegressor),
|
|
],
|
|
)
|
|
def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble):
|
|
# raise an error when the name contains dunder
|
|
if issubclass(Ensemble, ClassifierMixin):
|
|
estimators = [("lr__", LogisticRegression())]
|
|
else:
|
|
estimators = [("lr__", LinearRegression())]
|
|
ensemble = Ensemble(estimators=estimators)
|
|
|
|
err_msg = r"Estimator names must not contain __: got \['lr__'\]"
|
|
with pytest.raises(ValueError, match=err_msg):
|
|
ensemble.fit(X, y)
|
|
|
|
# raise an error when the name is not unique
|
|
if issubclass(Ensemble, ClassifierMixin):
|
|
estimators = [("lr", LogisticRegression()), ("lr", LogisticRegression())]
|
|
else:
|
|
estimators = [("lr", LinearRegression()), ("lr", LinearRegression())]
|
|
ensemble = Ensemble(estimators=estimators)
|
|
|
|
err_msg = r"Names provided are not unique: \['lr', 'lr'\]"
|
|
with pytest.raises(ValueError, match=err_msg):
|
|
ensemble.fit(X, y)
|
|
|
|
# raise an error when the name conflicts with the parameters
|
|
if issubclass(Ensemble, ClassifierMixin):
|
|
estimators = [("estimators", LogisticRegression())]
|
|
else:
|
|
estimators = [("estimators", LinearRegression())]
|
|
ensemble = Ensemble(estimators=estimators)
|
|
|
|
err_msg = "Estimator names conflict with constructor arguments"
|
|
with pytest.raises(ValueError, match=err_msg):
|
|
ensemble.fit(X, y)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"X, y, estimator",
|
|
[
|
|
(
|
|
*make_classification(n_samples=10),
|
|
StackingClassifier(estimators=[("lr", LogisticRegression())]),
|
|
),
|
|
(
|
|
*make_classification(n_samples=10),
|
|
VotingClassifier(estimators=[("lr", LogisticRegression())]),
|
|
),
|
|
(
|
|
*make_regression(n_samples=10),
|
|
StackingRegressor(estimators=[("lr", LinearRegression())]),
|
|
),
|
|
(
|
|
*make_regression(n_samples=10),
|
|
VotingRegressor(estimators=[("lr", LinearRegression())]),
|
|
),
|
|
],
|
|
ids=[
|
|
"stacking-classifier",
|
|
"voting-classifier",
|
|
"stacking-regressor",
|
|
"voting-regressor",
|
|
],
|
|
)
|
|
def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
|
|
# check that we raise a consistent error when all estimators are
|
|
# dropped
|
|
estimator.set_params(lr="drop")
|
|
with pytest.raises(ValueError, match="All estimators are dropped."):
|
|
estimator.fit(X, y)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"Ensemble, Estimator, X, y",
|
|
[
|
|
(StackingClassifier, LogisticRegression, X, y),
|
|
(StackingRegressor, LinearRegression, X_r, y_r),
|
|
(VotingClassifier, LogisticRegression, X, y),
|
|
(VotingRegressor, LinearRegression, X_r, y_r),
|
|
],
|
|
)
|
|
# FIXME: we should move this test in `estimator_checks` once we are able
|
|
# to construct meta-estimator instances
|
|
def test_heterogeneous_ensemble_support_missing_values(Ensemble, Estimator, X, y):
|
|
# check that Voting and Stacking predictor delegate the missing values
|
|
# validation to the underlying estimator.
|
|
X = X.copy()
|
|
mask = np.random.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool)
|
|
X[mask] = np.nan
|
|
pipe = make_pipeline(SimpleImputer(), Estimator())
|
|
ensemble = Ensemble(estimators=[("pipe1", pipe), ("pipe2", pipe)])
|
|
ensemble.fit(X, y).score(X, y)
|