# Authors: # # Giorgio Patrini # # License: BSD 3 clause import re import warnings import numpy as np import numpy.linalg as la import pytest from scipy import sparse, stats from sklearn import datasets from sklearn.base import clone from sklearn.exceptions import NotFittedError from sklearn.metrics.pairwise import linear_kernel from sklearn.model_selection import cross_val_predict from sklearn.pipeline import Pipeline from sklearn.preprocessing import ( Binarizer, KernelCenterer, MaxAbsScaler, MinMaxScaler, Normalizer, PowerTransformer, QuantileTransformer, RobustScaler, StandardScaler, add_dummy_feature, maxabs_scale, minmax_scale, normalize, power_transform, quantile_transform, robust_scale, scale, ) from sklearn.preprocessing._data import BOUNDS_THRESHOLD, _handle_zeros_in_scale from sklearn.svm import SVR from sklearn.utils import gen_batches, shuffle from sklearn.utils._array_api import ( yield_namespace_device_dtype_combinations, ) from sklearn.utils._testing import ( _convert_container, assert_allclose, assert_allclose_dense_sparse, assert_almost_equal, assert_array_almost_equal, assert_array_equal, assert_array_less, skip_if_32bit, ) from sklearn.utils.estimator_checks import ( _get_check_estimator_ids, check_array_api_input_and_values, ) from sklearn.utils.fixes import ( COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS, ) from sklearn.utils.sparsefuncs import mean_variance_axis iris = datasets.load_iris() # Make some data to be used many times rng = np.random.RandomState(0) n_features = 30 n_samples = 1000 offsets = rng.uniform(-1, 1, size=n_features) scales = rng.uniform(1, 10, size=n_features) X_2d = rng.randn(n_samples, n_features) * scales + offsets X_1row = X_2d[0, :].reshape(1, n_features) X_1col = X_2d[:, 0].reshape(n_samples, 1) X_list_1row = X_1row.tolist() X_list_1col = X_1col.tolist() def toarray(a): if hasattr(a, "toarray"): a = a.toarray() return a def _check_dim_1axis(a): return np.asarray(a).shape[0] def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen): if batch_stop != n: assert (i + 1) * chunk_size == n_samples_seen else: assert i * chunk_size + (batch_stop - batch_start) == n_samples_seen def test_raises_value_error_if_sample_weights_greater_than_1d(): # Sample weights must be either scalar or 1D n_sampless = [2, 3] n_featuress = [3, 2] for n_samples, n_features in zip(n_sampless, n_featuress): X = rng.randn(n_samples, n_features) y = rng.randn(n_samples) scaler = StandardScaler() # make sure Error is raised the sample weights greater than 1d sample_weight_notOK = rng.randn(n_samples, 1) ** 2 with pytest.raises(ValueError): scaler.fit(X, y, sample_weight=sample_weight_notOK) @pytest.mark.parametrize( ["Xw", "X", "sample_weight"], [ ([[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [1, 2, 3], [4, 5, 6]], [2.0, 1.0]), ( [[1, 0, 1], [0, 0, 1]], [[1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]], np.array([1, 3]), ), ( [[1, np.nan, 1], [np.nan, np.nan, 1]], [ [1, np.nan, 1], [np.nan, np.nan, 1], [np.nan, np.nan, 1], [np.nan, np.nan, 1], ], np.array([1, 3]), ), ], ) @pytest.mark.parametrize("array_constructor", ["array", "sparse_csr", "sparse_csc"]) def test_standard_scaler_sample_weight(Xw, X, sample_weight, array_constructor): with_mean = not array_constructor.startswith("sparse") X = _convert_container(X, array_constructor) Xw = _convert_container(Xw, array_constructor) # weighted StandardScaler yw = np.ones(Xw.shape[0]) scaler_w = StandardScaler(with_mean=with_mean) scaler_w.fit(Xw, yw, sample_weight=sample_weight) # unweighted, but with repeated samples y = np.ones(X.shape[0]) scaler = StandardScaler(with_mean=with_mean) scaler.fit(X, y) X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]] assert_almost_equal(scaler.mean_, scaler_w.mean_) assert_almost_equal(scaler.var_, scaler_w.var_) assert_almost_equal(scaler.transform(X_test), scaler_w.transform(X_test)) def test_standard_scaler_1d(): # Test scaling of dataset along single axis for X in [X_1row, X_1col, X_list_1row, X_list_1row]: scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) if isinstance(X, list): X = np.array(X) # cast only after scaling done if _check_dim_1axis(X) == 1: assert_almost_equal(scaler.mean_, X.ravel()) assert_almost_equal(scaler.scale_, np.ones(n_features)) assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features)) assert_array_almost_equal(X_scaled.std(axis=0), np.zeros_like(n_features)) else: assert_almost_equal(scaler.mean_, X.mean()) assert_almost_equal(scaler.scale_, X.std()) assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features)) assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) assert_array_almost_equal(X_scaled.std(axis=0), 1.0) assert scaler.n_samples_seen_ == X.shape[0] # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert_array_almost_equal(X_scaled_back, X) # Constant feature X = np.ones((5, 1)) scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert_almost_equal(scaler.mean_, 1.0) assert_almost_equal(scaler.scale_, 1.0) assert_array_almost_equal(X_scaled.mean(axis=0), 0.0) assert_array_almost_equal(X_scaled.std(axis=0), 0.0) assert scaler.n_samples_seen_ == X.shape[0] @pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) @pytest.mark.parametrize("add_sample_weight", [False, True]) def test_standard_scaler_dtype(add_sample_weight, sparse_container): # Ensure scaling does not affect dtype rng = np.random.RandomState(0) n_samples = 10 n_features = 3 if add_sample_weight: sample_weight = np.ones(n_samples) else: sample_weight = None with_mean = True if sparse_container is not None: # scipy sparse containers do not support float16, see # https://github.com/scipy/scipy/issues/7408 for more details. supported_dtype = [np.float64, np.float32] else: supported_dtype = [np.float64, np.float32, np.float16] for dtype in supported_dtype: X = rng.randn(n_samples, n_features).astype(dtype) if sparse_container is not None: X = sparse_container(X) with_mean = False scaler = StandardScaler(with_mean=with_mean) X_scaled = scaler.fit(X, sample_weight=sample_weight).transform(X) assert X.dtype == X_scaled.dtype assert scaler.mean_.dtype == np.float64 assert scaler.scale_.dtype == np.float64 @pytest.mark.parametrize( "scaler", [ StandardScaler(with_mean=False), RobustScaler(with_centering=False), ], ) @pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) @pytest.mark.parametrize("add_sample_weight", [False, True]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("constant", [0, 1.0, 100.0]) def test_standard_scaler_constant_features( scaler, add_sample_weight, sparse_container, dtype, constant ): if isinstance(scaler, RobustScaler) and add_sample_weight: pytest.skip(f"{scaler.__class__.__name__} does not yet support sample_weight") rng = np.random.RandomState(0) n_samples = 100 n_features = 1 if add_sample_weight: fit_params = dict(sample_weight=rng.uniform(size=n_samples) * 2) else: fit_params = {} X_array = np.full(shape=(n_samples, n_features), fill_value=constant, dtype=dtype) X = X_array if sparse_container is None else sparse_container(X_array) X_scaled = scaler.fit(X, **fit_params).transform(X) if isinstance(scaler, StandardScaler): # The variance info should be close to zero for constant features. assert_allclose(scaler.var_, np.zeros(X.shape[1]), atol=1e-7) # Constant features should not be scaled (scale of 1.): assert_allclose(scaler.scale_, np.ones(X.shape[1])) assert X_scaled is not X # make sure we make a copy assert_allclose_dense_sparse(X_scaled, X) if isinstance(scaler, StandardScaler) and not add_sample_weight: # Also check consistency with the standard scale function. X_scaled_2 = scale(X, with_mean=scaler.with_mean) assert X_scaled_2 is not X # make sure we did a copy assert_allclose_dense_sparse(X_scaled_2, X) @pytest.mark.parametrize("n_samples", [10, 100, 10_000]) @pytest.mark.parametrize("average", [1e-10, 1, 1e10]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) def test_standard_scaler_near_constant_features( n_samples, sparse_container, average, dtype ): # Check that when the variance is too small (var << mean**2) the feature # is considered constant and not scaled. scale_min, scale_max = -30, 19 scales = np.array([10**i for i in range(scale_min, scale_max + 1)], dtype=dtype) n_features = scales.shape[0] X = np.empty((n_samples, n_features), dtype=dtype) # Make a dataset of known var = scales**2 and mean = average X[: n_samples // 2, :] = average + scales X[n_samples // 2 :, :] = average - scales X_array = X if sparse_container is None else sparse_container(X) scaler = StandardScaler(with_mean=False).fit(X_array) # StandardScaler uses float64 accumulators even if the data has a float32 # dtype. eps = np.finfo(np.float64).eps # if var < bound = N.eps.var + N².eps².mean², the feature is considered # constant and the scale_ attribute is set to 1. bounds = n_samples * eps * scales**2 + n_samples**2 * eps**2 * average**2 within_bounds = scales**2 <= bounds # Check that scale_min is small enough to have some scales below the # bound and therefore detected as constant: assert np.any(within_bounds) # Check that such features are actually treated as constant by the scaler: assert all(scaler.var_[within_bounds] <= bounds[within_bounds]) assert_allclose(scaler.scale_[within_bounds], 1.0) # Depending the on the dtype of X, some features might not actually be # representable as non constant for small scales (even if above the # precision bound of the float64 variance estimate). Such feature should # be correctly detected as constants with 0 variance by StandardScaler. representable_diff = X[0, :] - X[-1, :] != 0 assert_allclose(scaler.var_[np.logical_not(representable_diff)], 0) assert_allclose(scaler.scale_[np.logical_not(representable_diff)], 1) # The other features are scaled and scale_ is equal to sqrt(var_) assuming # that scales are large enough for average + scale and average - scale to # be distinct in X (depending on X's dtype). common_mask = np.logical_and(scales**2 > bounds, representable_diff) assert_allclose(scaler.scale_[common_mask], np.sqrt(scaler.var_)[common_mask]) def test_scale_1d(): # 1-d inputs X_list = [1.0, 3.0, 5.0, 0.0] X_arr = np.array(X_list) for X in [X_list, X_arr]: X_scaled = scale(X) assert_array_almost_equal(X_scaled.mean(), 0.0) assert_array_almost_equal(X_scaled.std(), 1.0) assert_array_equal(scale(X, with_mean=False, with_std=False), X) @skip_if_32bit def test_standard_scaler_numerical_stability(): # Test numerical stability of scaling # np.log(1e-5) is taken because of its floating point representation # was empirically found to cause numerical problems with np.mean & np.std. x = np.full(8, np.log(1e-5), dtype=np.float64) # This does not raise a warning as the number of samples is too low # to trigger the problem in recent numpy with warnings.catch_warnings(): warnings.simplefilter("error", UserWarning) scale(x) assert_array_almost_equal(scale(x), np.zeros(8)) # with 2 more samples, the std computation run into numerical issues: x = np.full(10, np.log(1e-5), dtype=np.float64) warning_message = "standard deviation of the data is probably very close to 0" with pytest.warns(UserWarning, match=warning_message): x_scaled = scale(x) assert_array_almost_equal(x_scaled, np.zeros(10)) x = np.full(10, 1e-100, dtype=np.float64) with warnings.catch_warnings(): warnings.simplefilter("error", UserWarning) x_small_scaled = scale(x) assert_array_almost_equal(x_small_scaled, np.zeros(10)) # Large values can cause (often recoverable) numerical stability issues: x_big = np.full(10, 1e100, dtype=np.float64) warning_message = "Dataset may contain too large values" with pytest.warns(UserWarning, match=warning_message): x_big_scaled = scale(x_big) assert_array_almost_equal(x_big_scaled, np.zeros(10)) assert_array_almost_equal(x_big_scaled, x_small_scaled) with pytest.warns(UserWarning, match=warning_message): x_big_centered = scale(x_big, with_std=False) assert_array_almost_equal(x_big_centered, np.zeros(10)) assert_array_almost_equal(x_big_centered, x_small_scaled) def test_scaler_2d_arrays(): # Test scaling of 2d array along first axis rng = np.random.RandomState(0) n_features = 5 n_samples = 4 X = rng.randn(n_samples, n_features) X[:, 0] = 0.0 # first feature is always of zero scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert not np.any(np.isnan(X_scaled)) assert scaler.n_samples_seen_ == n_samples assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) # Check that X has been copied assert X_scaled is not X # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert X_scaled_back is not X assert X_scaled_back is not X_scaled assert_array_almost_equal(X_scaled_back, X) X_scaled = scale(X, axis=1, with_std=False) assert not np.any(np.isnan(X_scaled)) assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0]) X_scaled = scale(X, axis=1, with_std=True) assert not np.any(np.isnan(X_scaled)) assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0]) assert_array_almost_equal(X_scaled.std(axis=1), n_samples * [1.0]) # Check that the data hasn't been modified assert X_scaled is not X X_scaled = scaler.fit(X).transform(X, copy=False) assert not np.any(np.isnan(X_scaled)) assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) # Check that X has not been copied assert X_scaled is X X = rng.randn(4, 5) X[:, 0] = 1.0 # first feature is a constant, non zero feature scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert not np.any(np.isnan(X_scaled)) assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) # Check that X has not been copied assert X_scaled is not X def test_scaler_float16_overflow(): # Test if the scaler will not overflow on float16 numpy arrays rng = np.random.RandomState(0) # float16 has a maximum of 65500.0. On the worst case 5 * 200000 is 100000 # which is enough to overflow the data type X = rng.uniform(5, 10, [200000, 1]).astype(np.float16) with np.errstate(over="raise"): scaler = StandardScaler().fit(X) X_scaled = scaler.transform(X) # Calculate the float64 equivalent to verify result X_scaled_f64 = StandardScaler().fit_transform(X.astype(np.float64)) # Overflow calculations may cause -inf, inf, or nan. Since there is no nan # input, all of the outputs should be finite. This may be redundant since a # FloatingPointError exception will be thrown on overflow above. assert np.all(np.isfinite(X_scaled)) # The normal distribution is very unlikely to go above 4. At 4.0-8.0 the # float16 precision is 2^-8 which is around 0.004. Thus only 2 decimals are # checked to account for precision differences. assert_array_almost_equal(X_scaled, X_scaled_f64, decimal=2) def test_handle_zeros_in_scale(): s1 = np.array([0, 1e-16, 1, 2, 3]) s2 = _handle_zeros_in_scale(s1, copy=True) assert_allclose(s1, np.array([0, 1e-16, 1, 2, 3])) assert_allclose(s2, np.array([1, 1, 1, 2, 3])) def test_minmax_scaler_partial_fit(): # Test if partial_fit run over many batches of size 1 and 50 # gives the same results as fit X = X_2d n = X.shape[0] for chunk_size in [1, 2, 50, n, n + 42]: # Test mean at the end of the process scaler_batch = MinMaxScaler().fit(X) scaler_incr = MinMaxScaler() for batch in gen_batches(n_samples, chunk_size): scaler_incr = scaler_incr.partial_fit(X[batch]) assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_) assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_) assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_) assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_) # Test std after 1 step batch0 = slice(0, chunk_size) scaler_batch = MinMaxScaler().fit(X[batch0]) scaler_incr = MinMaxScaler().partial_fit(X[batch0]) assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_) assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_) assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_) assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_) # Test std until the end of partial fits, and scaler_batch = MinMaxScaler().fit(X) scaler_incr = MinMaxScaler() # Clean estimator for i, batch in enumerate(gen_batches(n_samples, chunk_size)): scaler_incr = scaler_incr.partial_fit(X[batch]) assert_correct_incr( i, batch_start=batch.start, batch_stop=batch.stop, n=n, chunk_size=chunk_size, n_samples_seen=scaler_incr.n_samples_seen_, ) def test_standard_scaler_partial_fit(): # Test if partial_fit run over many batches of size 1 and 50 # gives the same results as fit X = X_2d n = X.shape[0] for chunk_size in [1, 2, 50, n, n + 42]: # Test mean at the end of the process scaler_batch = StandardScaler(with_std=False).fit(X) scaler_incr = StandardScaler(with_std=False) for batch in gen_batches(n_samples, chunk_size): scaler_incr = scaler_incr.partial_fit(X[batch]) assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_) assert scaler_batch.var_ == scaler_incr.var_ # Nones assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ # Test std after 1 step batch0 = slice(0, chunk_size) scaler_incr = StandardScaler().partial_fit(X[batch0]) if chunk_size == 1: assert_array_almost_equal( np.zeros(n_features, dtype=np.float64), scaler_incr.var_ ) assert_array_almost_equal( np.ones(n_features, dtype=np.float64), scaler_incr.scale_ ) else: assert_array_almost_equal(np.var(X[batch0], axis=0), scaler_incr.var_) assert_array_almost_equal( np.std(X[batch0], axis=0), scaler_incr.scale_ ) # no constants # Test std until the end of partial fits, and scaler_batch = StandardScaler().fit(X) scaler_incr = StandardScaler() # Clean estimator for i, batch in enumerate(gen_batches(n_samples, chunk_size)): scaler_incr = scaler_incr.partial_fit(X[batch]) assert_correct_incr( i, batch_start=batch.start, batch_stop=batch.stop, n=n, chunk_size=chunk_size, n_samples_seen=scaler_incr.n_samples_seen_, ) assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_) assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ @pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) def test_standard_scaler_partial_fit_numerical_stability(sparse_container): # Test if the incremental computation introduces significative errors # for large datasets with values of large magniture rng = np.random.RandomState(0) n_features = 2 n_samples = 100 offsets = rng.uniform(-1e15, 1e15, size=n_features) scales = rng.uniform(1e3, 1e6, size=n_features) X = rng.randn(n_samples, n_features) * scales + offsets scaler_batch = StandardScaler().fit(X) scaler_incr = StandardScaler() for chunk in X: scaler_incr = scaler_incr.partial_fit(chunk.reshape(1, n_features)) # Regardless of abs values, they must not be more diff 6 significant digits tol = 10 ** (-6) assert_allclose(scaler_incr.mean_, scaler_batch.mean_, rtol=tol) assert_allclose(scaler_incr.var_, scaler_batch.var_, rtol=tol) assert_allclose(scaler_incr.scale_, scaler_batch.scale_, rtol=tol) # NOTE Be aware that for much larger offsets std is very unstable (last # assert) while mean is OK. # Sparse input size = (100, 3) scale = 1e20 X = sparse_container(rng.randint(0, 2, size).astype(np.float64) * scale) # with_mean=False is required with sparse input scaler = StandardScaler(with_mean=False).fit(X) scaler_incr = StandardScaler(with_mean=False) for chunk in X: scaler_incr = scaler_incr.partial_fit(chunk) # Regardless of magnitude, they must not differ more than of 6 digits tol = 10 ** (-6) assert scaler.mean_ is not None assert_allclose(scaler_incr.var_, scaler.var_, rtol=tol) assert_allclose(scaler_incr.scale_, scaler.scale_, rtol=tol) @pytest.mark.parametrize("sample_weight", [True, None]) @pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) def test_partial_fit_sparse_input(sample_weight, sparse_container): # Check that sparsity is not destroyed X = sparse_container(np.array([[1.0], [0.0], [0.0], [5.0]])) if sample_weight: sample_weight = rng.rand(X.shape[0]) null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) X_null = null_transform.partial_fit(X, sample_weight=sample_weight).transform(X) assert_array_equal(X_null.toarray(), X.toarray()) X_orig = null_transform.inverse_transform(X_null) assert_array_equal(X_orig.toarray(), X_null.toarray()) assert_array_equal(X_orig.toarray(), X.toarray()) @pytest.mark.parametrize("sample_weight", [True, None]) def test_standard_scaler_trasform_with_partial_fit(sample_weight): # Check some postconditions after applying partial_fit and transform X = X_2d[:100, :] if sample_weight: sample_weight = rng.rand(X.shape[0]) scaler_incr = StandardScaler() for i, batch in enumerate(gen_batches(X.shape[0], 1)): X_sofar = X[: (i + 1), :] chunks_copy = X_sofar.copy() if sample_weight is None: scaled_batch = StandardScaler().fit_transform(X_sofar) scaler_incr = scaler_incr.partial_fit(X[batch]) else: scaled_batch = StandardScaler().fit_transform( X_sofar, sample_weight=sample_weight[: i + 1] ) scaler_incr = scaler_incr.partial_fit( X[batch], sample_weight=sample_weight[batch] ) scaled_incr = scaler_incr.transform(X_sofar) assert_array_almost_equal(scaled_batch, scaled_incr) assert_array_almost_equal(X_sofar, chunks_copy) # No change right_input = scaler_incr.inverse_transform(scaled_incr) assert_array_almost_equal(X_sofar, right_input) zero = np.zeros(X.shape[1]) epsilon = np.finfo(float).eps assert_array_less(zero, scaler_incr.var_ + epsilon) # as less or equal assert_array_less(zero, scaler_incr.scale_ + epsilon) if sample_weight is None: # (i+1) because the Scaler has been already fitted assert (i + 1) == scaler_incr.n_samples_seen_ else: assert np.sum(sample_weight[: i + 1]) == pytest.approx( scaler_incr.n_samples_seen_ ) def test_standard_check_array_of_inverse_transform(): # Check if StandardScaler inverse_transform is # converting the integer array to float x = np.array( [ [1, 1, 1, 0, 1, 0], [1, 1, 1, 0, 1, 0], [0, 8, 0, 1, 0, 0], [1, 4, 1, 1, 0, 0], [0, 1, 0, 0, 1, 0], [0, 4, 0, 1, 0, 1], ], dtype=np.int32, ) scaler = StandardScaler() scaler.fit(x) # The of inverse_transform should be converted # to a float array. # If not X *= self.scale_ will fail. scaler.inverse_transform(x) @pytest.mark.parametrize( "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations() ) @pytest.mark.parametrize( "check", [check_array_api_input_and_values], ids=_get_check_estimator_ids, ) @pytest.mark.parametrize( "estimator", [ MaxAbsScaler(), MinMaxScaler(), KernelCenterer(), Normalizer(norm="l1"), Normalizer(norm="l2"), Normalizer(norm="max"), ], ids=_get_check_estimator_ids, ) def test_scaler_array_api_compliance( estimator, check, array_namespace, device, dtype_name ): name = estimator.__class__.__name__ check(name, estimator, array_namespace, device=device, dtype_name=dtype_name) def test_min_max_scaler_iris(): X = iris.data scaler = MinMaxScaler() # default params X_trans = scaler.fit_transform(X) assert_array_almost_equal(X_trans.min(axis=0), 0) assert_array_almost_equal(X_trans.max(axis=0), 1) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # not default params: min=1, max=2 scaler = MinMaxScaler(feature_range=(1, 2)) X_trans = scaler.fit_transform(X) assert_array_almost_equal(X_trans.min(axis=0), 1) assert_array_almost_equal(X_trans.max(axis=0), 2) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # min=-.5, max=.6 scaler = MinMaxScaler(feature_range=(-0.5, 0.6)) X_trans = scaler.fit_transform(X) assert_array_almost_equal(X_trans.min(axis=0), -0.5) assert_array_almost_equal(X_trans.max(axis=0), 0.6) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # raises on invalid range scaler = MinMaxScaler(feature_range=(2, 1)) with pytest.raises(ValueError): scaler.fit(X) def test_min_max_scaler_zero_variance_features(): # Check min max scaler on toy data with zero variance features X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]] X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] # default params scaler = MinMaxScaler() X_trans = scaler.fit_transform(X) X_expected_0_1 = [[0.0, 0.0, 0.5], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0]] assert_array_almost_equal(X_trans, X_expected_0_1) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) X_trans_new = scaler.transform(X_new) X_expected_0_1_new = [[+0.0, 1.0, 0.500], [-1.0, 0.0, 0.083], [+0.0, 0.0, 1.333]] assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2) # not default params scaler = MinMaxScaler(feature_range=(1, 2)) X_trans = scaler.fit_transform(X) X_expected_1_2 = [[1.0, 1.0, 1.5], [1.0, 1.0, 1.0], [1.0, 1.0, 2.0]] assert_array_almost_equal(X_trans, X_expected_1_2) # function interface X_trans = minmax_scale(X) assert_array_almost_equal(X_trans, X_expected_0_1) X_trans = minmax_scale(X, feature_range=(1, 2)) assert_array_almost_equal(X_trans, X_expected_1_2) def test_minmax_scale_axis1(): X = iris.data X_trans = minmax_scale(X, axis=1) assert_array_almost_equal(np.min(X_trans, axis=1), 0) assert_array_almost_equal(np.max(X_trans, axis=1), 1) def test_min_max_scaler_1d(): # Test scaling of dataset along single axis for X in [X_1row, X_1col, X_list_1row, X_list_1row]: scaler = MinMaxScaler(copy=True) X_scaled = scaler.fit(X).transform(X) if isinstance(X, list): X = np.array(X) # cast only after scaling done if _check_dim_1axis(X) == 1: assert_array_almost_equal(X_scaled.min(axis=0), np.zeros(n_features)) assert_array_almost_equal(X_scaled.max(axis=0), np.zeros(n_features)) else: assert_array_almost_equal(X_scaled.min(axis=0), 0.0) assert_array_almost_equal(X_scaled.max(axis=0), 1.0) assert scaler.n_samples_seen_ == X.shape[0] # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert_array_almost_equal(X_scaled_back, X) # Constant feature X = np.ones((5, 1)) scaler = MinMaxScaler() X_scaled = scaler.fit(X).transform(X) assert X_scaled.min() >= 0.0 assert X_scaled.max() <= 1.0 assert scaler.n_samples_seen_ == X.shape[0] # Function interface X_1d = X_1row.ravel() min_ = X_1d.min() max_ = X_1d.max() assert_array_almost_equal( (X_1d - min_) / (max_ - min_), minmax_scale(X_1d, copy=True) ) @pytest.mark.parametrize("sample_weight", [True, None]) @pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) def test_scaler_without_centering(sample_weight, sparse_container): rng = np.random.RandomState(42) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero X_sparse = sparse_container(X) if sample_weight: sample_weight = rng.rand(X.shape[0]) with pytest.raises(ValueError): StandardScaler().fit(X_sparse) scaler = StandardScaler(with_mean=False).fit(X, sample_weight=sample_weight) X_scaled = scaler.transform(X, copy=True) assert not np.any(np.isnan(X_scaled)) scaler_sparse = StandardScaler(with_mean=False).fit( X_sparse, sample_weight=sample_weight ) X_sparse_scaled = scaler_sparse.transform(X_sparse, copy=True) assert not np.any(np.isnan(X_sparse_scaled.data)) assert_array_almost_equal(scaler.mean_, scaler_sparse.mean_) assert_array_almost_equal(scaler.var_, scaler_sparse.var_) assert_array_almost_equal(scaler.scale_, scaler_sparse.scale_) assert_array_almost_equal(scaler.n_samples_seen_, scaler_sparse.n_samples_seen_) if sample_weight is None: assert_array_almost_equal( X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2 ) assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) X_sparse_scaled_mean, X_sparse_scaled_var = mean_variance_axis(X_sparse_scaled, 0) assert_array_almost_equal(X_sparse_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_sparse_scaled_var, X_scaled.var(axis=0)) # Check that X has not been modified (copy) assert X_scaled is not X assert X_sparse_scaled is not X_sparse X_scaled_back = scaler.inverse_transform(X_scaled) assert X_scaled_back is not X assert X_scaled_back is not X_scaled assert_array_almost_equal(X_scaled_back, X) X_sparse_scaled_back = scaler_sparse.inverse_transform(X_sparse_scaled) assert X_sparse_scaled_back is not X_sparse assert X_sparse_scaled_back is not X_sparse_scaled assert_array_almost_equal(X_sparse_scaled_back.toarray(), X) if sparse_container in CSR_CONTAINERS: null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) X_null = null_transform.fit_transform(X_sparse) assert_array_equal(X_null.data, X_sparse.data) X_orig = null_transform.inverse_transform(X_null) assert_array_equal(X_orig.data, X_sparse.data) @pytest.mark.parametrize("with_mean", [True, False]) @pytest.mark.parametrize("with_std", [True, False]) @pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS + CSR_CONTAINERS) def test_scaler_n_samples_seen_with_nan(with_mean, with_std, sparse_container): X = np.array( [[0, 1, 3], [np.nan, 6, 10], [5, 4, np.nan], [8, 0, np.nan]], dtype=np.float64 ) if sparse_container is not None: X = sparse_container(X) if sparse.issparse(X) and with_mean: pytest.skip("'with_mean=True' cannot be used with sparse matrix.") transformer = StandardScaler(with_mean=with_mean, with_std=with_std) transformer.fit(X) assert_array_equal(transformer.n_samples_seen_, np.array([3, 4, 2])) def _check_identity_scalers_attributes(scaler_1, scaler_2): assert scaler_1.mean_ is scaler_2.mean_ is None assert scaler_1.var_ is scaler_2.var_ is None assert scaler_1.scale_ is scaler_2.scale_ is None assert scaler_1.n_samples_seen_ == scaler_2.n_samples_seen_ @pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) def test_scaler_return_identity(sparse_container): # test that the scaler return identity when with_mean and with_std are # False X_dense = np.array([[0, 1, 3], [5, 6, 0], [8, 0, 10]], dtype=np.float64) X_sparse = sparse_container(X_dense) transformer_dense = StandardScaler(with_mean=False, with_std=False) X_trans_dense = transformer_dense.fit_transform(X_dense) assert_allclose(X_trans_dense, X_dense) transformer_sparse = clone(transformer_dense) X_trans_sparse = transformer_sparse.fit_transform(X_sparse) assert_allclose_dense_sparse(X_trans_sparse, X_sparse) _check_identity_scalers_attributes(transformer_dense, transformer_sparse) transformer_dense.partial_fit(X_dense) transformer_sparse.partial_fit(X_sparse) _check_identity_scalers_attributes(transformer_dense, transformer_sparse) transformer_dense.fit(X_dense) transformer_sparse.fit(X_sparse) _check_identity_scalers_attributes(transformer_dense, transformer_sparse) @pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) def test_scaler_int(sparse_container): # test that scaler converts integer input to floating # for both sparse and dense matrices rng = np.random.RandomState(42) X = rng.randint(20, size=(4, 5)) X[:, 0] = 0 # first feature is always of zero X_sparse = sparse_container(X) with warnings.catch_warnings(record=True): scaler = StandardScaler(with_mean=False).fit(X) X_scaled = scaler.transform(X, copy=True) assert not np.any(np.isnan(X_scaled)) with warnings.catch_warnings(record=True): scaler_sparse = StandardScaler(with_mean=False).fit(X_sparse) X_sparse_scaled = scaler_sparse.transform(X_sparse, copy=True) assert not np.any(np.isnan(X_sparse_scaled.data)) assert_array_almost_equal(scaler.mean_, scaler_sparse.mean_) assert_array_almost_equal(scaler.var_, scaler_sparse.var_) assert_array_almost_equal(scaler.scale_, scaler_sparse.scale_) assert_array_almost_equal( X_scaled.mean(axis=0), [0.0, 1.109, 1.856, 21.0, 1.559], 2 ) assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) X_sparse_scaled_mean, X_sparse_scaled_std = mean_variance_axis( X_sparse_scaled.astype(float), 0 ) assert_array_almost_equal(X_sparse_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_sparse_scaled_std, X_scaled.std(axis=0)) # Check that X has not been modified (copy) assert X_scaled is not X assert X_sparse_scaled is not X_sparse X_scaled_back = scaler.inverse_transform(X_scaled) assert X_scaled_back is not X assert X_scaled_back is not X_scaled assert_array_almost_equal(X_scaled_back, X) X_sparse_scaled_back = scaler_sparse.inverse_transform(X_sparse_scaled) assert X_sparse_scaled_back is not X_sparse assert X_sparse_scaled_back is not X_sparse_scaled assert_array_almost_equal(X_sparse_scaled_back.toarray(), X) if sparse_container in CSR_CONTAINERS: null_transform = StandardScaler(with_mean=False, with_std=False, copy=True) with warnings.catch_warnings(record=True): X_null = null_transform.fit_transform(X_sparse) assert_array_equal(X_null.data, X_sparse.data) X_orig = null_transform.inverse_transform(X_null) assert_array_equal(X_orig.data, X_sparse.data) @pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) def test_scaler_without_copy(sparse_container): # Check that StandardScaler.fit does not change input rng = np.random.RandomState(42) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero X_sparse = sparse_container(X) X_copy = X.copy() StandardScaler(copy=False).fit(X) assert_array_equal(X, X_copy) X_sparse_copy = X_sparse.copy() StandardScaler(with_mean=False, copy=False).fit(X_sparse) assert_array_equal(X_sparse.toarray(), X_sparse_copy.toarray()) @pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS) def test_scale_sparse_with_mean_raise_exception(sparse_container): rng = np.random.RandomState(42) X = rng.randn(4, 5) X_sparse = sparse_container(X) # check scaling and fit with direct calls on sparse data with pytest.raises(ValueError): scale(X_sparse, with_mean=True) with pytest.raises(ValueError): StandardScaler(with_mean=True).fit(X_sparse) # check transform and inverse_transform after a fit on a dense array scaler = StandardScaler(with_mean=True).fit(X) with pytest.raises(ValueError): scaler.transform(X_sparse) X_transformed_sparse = sparse_container(scaler.transform(X)) with pytest.raises(ValueError): scaler.inverse_transform(X_transformed_sparse) def test_scale_input_finiteness_validation(): # Check if non finite inputs raise ValueError X = [[np.inf, 5, 6, 7, 8]] with pytest.raises( ValueError, match="Input contains infinity or a value too large" ): scale(X) def test_robust_scaler_error_sparse(): X_sparse = sparse.rand(1000, 10) scaler = RobustScaler(with_centering=True) err_msg = "Cannot center sparse matrices" with pytest.raises(ValueError, match=err_msg): scaler.fit(X_sparse) @pytest.mark.parametrize("with_centering", [True, False]) @pytest.mark.parametrize("with_scaling", [True, False]) @pytest.mark.parametrize("X", [np.random.randn(10, 3), sparse.rand(10, 3, density=0.5)]) def test_robust_scaler_attributes(X, with_centering, with_scaling): # check consistent type of attributes if with_centering and sparse.issparse(X): pytest.skip("RobustScaler cannot center sparse matrix") scaler = RobustScaler(with_centering=with_centering, with_scaling=with_scaling) scaler.fit(X) if with_centering: assert isinstance(scaler.center_, np.ndarray) else: assert scaler.center_ is None if with_scaling: assert isinstance(scaler.scale_, np.ndarray) else: assert scaler.scale_ is None @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_robust_scaler_col_zero_sparse(csr_container): # check that the scaler is working when there is not data materialized in a # column of a sparse matrix X = np.random.randn(10, 5) X[:, 0] = 0 X = csr_container(X) scaler = RobustScaler(with_centering=False) scaler.fit(X) assert scaler.scale_[0] == pytest.approx(1) X_trans = scaler.transform(X) assert_allclose(X[:, [0]].toarray(), X_trans[:, [0]].toarray()) def test_robust_scaler_2d_arrays(): # Test robust scaling of 2d array along first axis rng = np.random.RandomState(0) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero scaler = RobustScaler() X_scaled = scaler.fit(X).transform(X) assert_array_almost_equal(np.median(X_scaled, axis=0), 5 * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0)[0], 0) @pytest.mark.parametrize("density", [0, 0.05, 0.1, 0.5, 1]) @pytest.mark.parametrize("strictly_signed", ["positive", "negative", "zeros", None]) def test_robust_scaler_equivalence_dense_sparse(density, strictly_signed): # Check the equivalence of the fitting with dense and sparse matrices X_sparse = sparse.rand(1000, 5, density=density).tocsc() if strictly_signed == "positive": X_sparse.data = np.abs(X_sparse.data) elif strictly_signed == "negative": X_sparse.data = -np.abs(X_sparse.data) elif strictly_signed == "zeros": X_sparse.data = np.zeros(X_sparse.data.shape, dtype=np.float64) X_dense = X_sparse.toarray() scaler_sparse = RobustScaler(with_centering=False) scaler_dense = RobustScaler(with_centering=False) scaler_sparse.fit(X_sparse) scaler_dense.fit(X_dense) assert_allclose(scaler_sparse.scale_, scaler_dense.scale_) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_robust_scaler_transform_one_row_csr(csr_container): # Check RobustScaler on transforming csr matrix with one row rng = np.random.RandomState(0) X = rng.randn(4, 5) single_row = np.array([[0.1, 1.0, 2.0, 0.0, -1.0]]) scaler = RobustScaler(with_centering=False) scaler = scaler.fit(X) row_trans = scaler.transform(csr_container(single_row)) row_expected = single_row / scaler.scale_ assert_array_almost_equal(row_trans.toarray(), row_expected) row_scaled_back = scaler.inverse_transform(row_trans) assert_array_almost_equal(single_row, row_scaled_back.toarray()) def test_robust_scaler_iris(): X = iris.data scaler = RobustScaler() X_trans = scaler.fit_transform(X) assert_array_almost_equal(np.median(X_trans, axis=0), 0) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) q = np.percentile(X_trans, q=(25, 75), axis=0) iqr = q[1] - q[0] assert_array_almost_equal(iqr, 1) def test_robust_scaler_iris_quantiles(): X = iris.data scaler = RobustScaler(quantile_range=(10, 90)) X_trans = scaler.fit_transform(X) assert_array_almost_equal(np.median(X_trans, axis=0), 0) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) q = np.percentile(X_trans, q=(10, 90), axis=0) q_range = q[1] - q[0] assert_array_almost_equal(q_range, 1) @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_quantile_transform_iris(csc_container): X = iris.data # uniform output distribution transformer = QuantileTransformer(n_quantiles=30) X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # normal output distribution transformer = QuantileTransformer(n_quantiles=30, output_distribution="normal") X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # make sure it is possible to take the inverse of a sparse matrix # which contain negative value; this is the case in the iris dataset X_sparse = csc_container(X) X_sparse_tran = transformer.fit_transform(X_sparse) X_sparse_tran_inv = transformer.inverse_transform(X_sparse_tran) assert_array_almost_equal(X_sparse.toarray(), X_sparse_tran_inv.toarray()) @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_quantile_transform_check_error(csc_container): X = np.transpose( [ [0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [2, 4, 0, 0, 6, 8, 0, 10, 0, 0], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1], ] ) X = csc_container(X) X_neg = np.transpose( [ [0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1], ] ) X_neg = csc_container(X_neg) err_msg = ( "The number of quantiles cannot be greater than " "the number of samples used. Got 1000 quantiles " "and 10 samples." ) with pytest.raises(ValueError, match=err_msg): QuantileTransformer(subsample=10).fit(X) transformer = QuantileTransformer(n_quantiles=10) err_msg = "QuantileTransformer only accepts non-negative sparse matrices." with pytest.raises(ValueError, match=err_msg): transformer.fit(X_neg) transformer.fit(X) err_msg = "QuantileTransformer only accepts non-negative sparse matrices." with pytest.raises(ValueError, match=err_msg): transformer.transform(X_neg) X_bad_feat = np.transpose( [[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]] ) err_msg = ( "X has 2 features, but QuantileTransformer is expecting 3 features as input." ) with pytest.raises(ValueError, match=err_msg): transformer.inverse_transform(X_bad_feat) transformer = QuantileTransformer(n_quantiles=10).fit(X) # check that an error is raised if input is scalar with pytest.raises(ValueError, match="Expected 2D array, got scalar array instead"): transformer.transform(10) # check that a warning is raised is n_quantiles > n_samples transformer = QuantileTransformer(n_quantiles=100) warn_msg = "n_quantiles is set to n_samples" with pytest.warns(UserWarning, match=warn_msg) as record: transformer.fit(X) assert len(record) == 1 assert transformer.n_quantiles_ == X.shape[0] @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_quantile_transform_sparse_ignore_zeros(csc_container): X = np.array([[0, 1], [0, 0], [0, 2], [0, 2], [0, 1]]) X_sparse = csc_container(X) transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) # dense case -> warning raise warning_message = ( "'ignore_implicit_zeros' takes effect" " only with sparse matrix. This parameter has no" " effect." ) with pytest.warns(UserWarning, match=warning_message): transformer.fit(X) X_expected = np.array([[0, 0], [0, 0], [0, 1], [0, 1], [0, 0]]) X_trans = transformer.fit_transform(X_sparse) assert_almost_equal(X_expected, X_trans.toarray()) # consider the case where sparse entries are missing values and user-given # zeros are to be considered X_data = np.array([0, 0, 1, 0, 2, 2, 1, 0, 1, 2, 0]) X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]) X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8]) X_sparse = csc_container((X_data, (X_row, X_col))) X_trans = transformer.fit_transform(X_sparse) X_expected = np.array( [ [0.0, 0.5], [0.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 0.5], [0.0, 0.0], [0.0, 0.5], [0.0, 1.0], [0.0, 0.0], ] ) assert_almost_equal(X_expected, X_trans.toarray()) transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5) X_data = np.array([-1, -1, 1, 0, 0, 0, 1, -1, 1]) X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1]) X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6]) X_sparse = csc_container((X_data, (X_row, X_col))) X_trans = transformer.fit_transform(X_sparse) X_expected = np.array( [[0, 1], [0, 0.375], [0, 0.375], [0, 0.375], [0, 1], [0, 0], [0, 1]] ) assert_almost_equal(X_expected, X_trans.toarray()) assert_almost_equal( X_sparse.toarray(), transformer.inverse_transform(X_trans).toarray() ) # check in conjunction with subsampling transformer = QuantileTransformer( ignore_implicit_zeros=True, n_quantiles=5, subsample=8, random_state=0 ) X_trans = transformer.fit_transform(X_sparse) assert_almost_equal(X_expected, X_trans.toarray()) assert_almost_equal( X_sparse.toarray(), transformer.inverse_transform(X_trans).toarray() ) def test_quantile_transform_dense_toy(): X = np.array( [[0, 2, 2.6], [25, 4, 4.1], [50, 6, 2.3], [75, 8, 9.5], [100, 10, 0.1]] ) transformer = QuantileTransformer(n_quantiles=5) transformer.fit(X) # using a uniform output, each entry of X should be map between 0 and 1 # and equally spaced X_trans = transformer.fit_transform(X) X_expected = np.tile(np.linspace(0, 1, num=5), (3, 1)).T assert_almost_equal(np.sort(X_trans, axis=0), X_expected) X_test = np.array( [ [-1, 1, 0], [101, 11, 10], ] ) X_expected = np.array( [ [0, 0, 0], [1, 1, 1], ] ) assert_array_almost_equal(transformer.transform(X_test), X_expected) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) def test_quantile_transform_subsampling(): # Test that subsampling the input yield to a consistent results We check # that the computed quantiles are almost mapped to a [0, 1] vector where # values are equally spaced. The infinite norm is checked to be smaller # than a given threshold. This is repeated 5 times. # dense support n_samples = 1000000 n_quantiles = 1000 X = np.sort(np.random.sample((n_samples, 1)), axis=0) ROUND = 5 inf_norm_arr = [] for random_state in range(ROUND): transformer = QuantileTransformer( random_state=random_state, n_quantiles=n_quantiles, subsample=n_samples // 10, ) transformer.fit(X) diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_) inf_norm = np.max(np.abs(diff)) assert inf_norm < 1e-2 inf_norm_arr.append(inf_norm) # each random subsampling yield a unique approximation to the expected # linspace CDF assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr) # sparse support X = sparse.rand(n_samples, 1, density=0.99, format="csc", random_state=0) inf_norm_arr = [] for random_state in range(ROUND): transformer = QuantileTransformer( random_state=random_state, n_quantiles=n_quantiles, subsample=n_samples // 10, ) transformer.fit(X) diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_) inf_norm = np.max(np.abs(diff)) assert inf_norm < 1e-1 inf_norm_arr.append(inf_norm) # each random subsampling yield a unique approximation to the expected # linspace CDF assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr) @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_quantile_transform_sparse_toy(csc_container): X = np.array( [ [0.0, 2.0, 0.0], [25.0, 4.0, 0.0], [50.0, 0.0, 2.6], [0.0, 0.0, 4.1], [0.0, 6.0, 0.0], [0.0, 8.0, 0.0], [75.0, 0.0, 2.3], [0.0, 10.0, 0.0], [0.0, 0.0, 9.5], [100.0, 0.0, 0.1], ] ) X = csc_container(X) transformer = QuantileTransformer(n_quantiles=10) transformer.fit(X) X_trans = transformer.fit_transform(X) assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0) assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) transformer_dense = QuantileTransformer(n_quantiles=10).fit(X.toarray()) X_trans = transformer_dense.transform(X) assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0) assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0) X_trans_inv = transformer_dense.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_trans_inv.toarray()) def test_quantile_transform_axis1(): X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]]) X_trans_a0 = quantile_transform(X.T, axis=0, n_quantiles=5) X_trans_a1 = quantile_transform(X, axis=1, n_quantiles=5) assert_array_almost_equal(X_trans_a0, X_trans_a1.T) @pytest.mark.parametrize("csc_container", CSC_CONTAINERS) def test_quantile_transform_bounds(csc_container): # Lower and upper bounds are manually mapped. We checked that in the case # of a constant feature and binary feature, the bounds are properly mapped. X_dense = np.array([[0, 0], [0, 0], [1, 0]]) X_sparse = csc_container(X_dense) # check sparse and dense are consistent X_trans = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(X_dense) assert_array_almost_equal(X_trans, X_dense) X_trans_sp = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform( X_sparse ) assert_array_almost_equal(X_trans_sp.toarray(), X_dense) assert_array_almost_equal(X_trans, X_trans_sp.toarray()) # check the consistency of the bounds by learning on 1 matrix # and transforming another X = np.array([[0, 1], [0, 0.5], [1, 0]]) X1 = np.array([[0, 0.1], [0, 0.5], [1, 0.1]]) transformer = QuantileTransformer(n_quantiles=3).fit(X) X_trans = transformer.transform(X1) assert_array_almost_equal(X_trans, X1) # check that values outside of the range learned will be mapped properly. X = np.random.random((1000, 1)) transformer = QuantileTransformer() transformer.fit(X) assert transformer.transform([[-10]]) == transformer.transform([[np.min(X)]]) assert transformer.transform([[10]]) == transformer.transform([[np.max(X)]]) assert transformer.inverse_transform([[-10]]) == transformer.inverse_transform( [[np.min(transformer.references_)]] ) assert transformer.inverse_transform([[10]]) == transformer.inverse_transform( [[np.max(transformer.references_)]] ) def test_quantile_transform_and_inverse(): X_1 = iris.data X_2 = np.array([[0.0], [BOUNDS_THRESHOLD / 10], [1.5], [2], [3], [3], [4]]) for X in [X_1, X_2]: transformer = QuantileTransformer(n_quantiles=1000, random_state=0) X_trans = transformer.fit_transform(X) X_trans_inv = transformer.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv, decimal=9) def test_quantile_transform_nan(): X = np.array([[np.nan, 0, 0, 1], [np.nan, np.nan, 0, 0.5], [np.nan, 1, 1, 0]]) transformer = QuantileTransformer(n_quantiles=10, random_state=42) transformer.fit_transform(X) # check that the quantile of the first column is all NaN assert np.isnan(transformer.quantiles_[:, 0]).all() # all other column should not contain NaN assert not np.isnan(transformer.quantiles_[:, 1:]).any() @pytest.mark.parametrize("array_type", ["array", "sparse"]) def test_quantile_transformer_sorted_quantiles(array_type): # Non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/15733 # Taken from upstream bug report: # https://github.com/numpy/numpy/issues/14685 X = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10) X = 0.1 * X.reshape(-1, 1) X = _convert_container(X, array_type) n_quantiles = 100 qt = QuantileTransformer(n_quantiles=n_quantiles).fit(X) # Check that the estimated quantile thresholds are monotically # increasing: quantiles = qt.quantiles_[:, 0] assert len(quantiles) == 100 assert all(np.diff(quantiles) >= 0) def test_robust_scaler_invalid_range(): for range_ in [ (-1, 90), (-2, -3), (10, 101), (100.5, 101), (90, 50), ]: scaler = RobustScaler(quantile_range=range_) with pytest.raises(ValueError, match=r"Invalid quantile range: \("): scaler.fit(iris.data) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_scale_function_without_centering(csr_container): rng = np.random.RandomState(42) X = rng.randn(4, 5) X[:, 0] = 0.0 # first feature is always of zero X_csr = csr_container(X) X_scaled = scale(X, with_mean=False) assert not np.any(np.isnan(X_scaled)) X_csr_scaled = scale(X_csr, with_mean=False) assert not np.any(np.isnan(X_csr_scaled.data)) # test csc has same outcome X_csc_scaled = scale(X_csr.tocsc(), with_mean=False) assert_array_almost_equal(X_scaled, X_csc_scaled.toarray()) # raises value error on axis != 0 with pytest.raises(ValueError): scale(X_csr, with_mean=False, axis=1) assert_array_almost_equal( X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2 ) assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0]) # Check that X has not been copied assert X_scaled is not X X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0) assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0)) assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0)) # null scale X_csr_scaled = scale(X_csr, with_mean=False, with_std=False, copy=True) assert_array_almost_equal(X_csr.toarray(), X_csr_scaled.toarray()) def test_robust_scale_axis1(): X = iris.data X_trans = robust_scale(X, axis=1) assert_array_almost_equal(np.median(X_trans, axis=1), 0) q = np.percentile(X_trans, q=(25, 75), axis=1) iqr = q[1] - q[0] assert_array_almost_equal(iqr, 1) def test_robust_scale_1d_array(): X = iris.data[:, 1] X_trans = robust_scale(X) assert_array_almost_equal(np.median(X_trans), 0) q = np.percentile(X_trans, q=(25, 75)) iqr = q[1] - q[0] assert_array_almost_equal(iqr, 1) def test_robust_scaler_zero_variance_features(): # Check RobustScaler on toy data with zero variance features X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]] scaler = RobustScaler() X_trans = scaler.fit_transform(X) # NOTE: for such a small sample size, what we expect in the third column # depends HEAVILY on the method used to calculate quantiles. The values # here were calculated to fit the quantiles produces by np.percentile # using numpy 1.9 Calculating quantiles with # scipy.stats.mstats.scoreatquantile or scipy.stats.mstats.mquantiles # would yield very different results! X_expected = [[0.0, 0.0, +0.0], [0.0, 0.0, -1.0], [0.0, 0.0, +1.0]] assert_array_almost_equal(X_trans, X_expected) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # make sure new data gets transformed correctly X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] X_trans_new = scaler.transform(X_new) X_expected_new = [[+0.0, 1.0, +0.0], [-1.0, 0.0, -0.83333], [+0.0, 0.0, +1.66667]] assert_array_almost_equal(X_trans_new, X_expected_new, decimal=3) def test_robust_scaler_unit_variance(): # Check RobustScaler with unit_variance=True on standard normal data with # outliers rng = np.random.RandomState(42) X = rng.randn(1000000, 1) X_with_outliers = np.vstack([X, np.ones((100, 1)) * 100, np.ones((100, 1)) * -100]) quantile_range = (1, 99) robust_scaler = RobustScaler(quantile_range=quantile_range, unit_variance=True).fit( X_with_outliers ) X_trans = robust_scaler.transform(X) assert robust_scaler.center_ == pytest.approx(0, abs=1e-3) assert robust_scaler.scale_ == pytest.approx(1, abs=1e-2) assert X_trans.std() == pytest.approx(1, abs=1e-2) @pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS) def test_maxabs_scaler_zero_variance_features(sparse_container): # Check MaxAbsScaler on toy data with zero variance features X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.3], [0.0, 1.0, +1.5], [0.0, 0.0, +0.0]] scaler = MaxAbsScaler() X_trans = scaler.fit_transform(X) X_expected = [ [0.0, 1.0, 1.0 / 3.0], [0.0, 1.0, -0.2], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], ] assert_array_almost_equal(X_trans, X_expected) X_trans_inv = scaler.inverse_transform(X_trans) assert_array_almost_equal(X, X_trans_inv) # make sure new data gets transformed correctly X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]] X_trans_new = scaler.transform(X_new) X_expected_new = [[+0.0, 2.0, 1.0 / 3.0], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.0]] assert_array_almost_equal(X_trans_new, X_expected_new, decimal=2) # function interface X_trans = maxabs_scale(X) assert_array_almost_equal(X_trans, X_expected) # sparse data X_sparse = sparse_container(X) X_trans_sparse = scaler.fit_transform(X_sparse) X_expected = [ [0.0, 1.0, 1.0 / 3.0], [0.0, 1.0, -0.2], [0.0, 1.0, 1.0], [0.0, 0.0, 0.0], ] assert_array_almost_equal(X_trans_sparse.toarray(), X_expected) X_trans_sparse_inv = scaler.inverse_transform(X_trans_sparse) assert_array_almost_equal(X, X_trans_sparse_inv.toarray()) def test_maxabs_scaler_large_negative_value(): # Check MaxAbsScaler on toy data with a large negative value X = [ [0.0, 1.0, +0.5, -1.0], [0.0, 1.0, -0.3, -0.5], [0.0, 1.0, -100.0, 0.0], [0.0, 0.0, +0.0, -2.0], ] scaler = MaxAbsScaler() X_trans = scaler.fit_transform(X) X_expected = [ [0.0, 1.0, 0.005, -0.5], [0.0, 1.0, -0.003, -0.25], [0.0, 1.0, -1.0, 0.0], [0.0, 0.0, 0.0, -1.0], ] assert_array_almost_equal(X_trans, X_expected) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_maxabs_scaler_transform_one_row_csr(csr_container): # Check MaxAbsScaler on transforming csr matrix with one row X = csr_container([[0.5, 1.0, 1.0]]) scaler = MaxAbsScaler() scaler = scaler.fit(X) X_trans = scaler.transform(X) X_expected = csr_container([[1.0, 1.0, 1.0]]) assert_array_almost_equal(X_trans.toarray(), X_expected.toarray()) X_scaled_back = scaler.inverse_transform(X_trans) assert_array_almost_equal(X.toarray(), X_scaled_back.toarray()) def test_maxabs_scaler_1d(): # Test scaling of dataset along single axis for X in [X_1row, X_1col, X_list_1row, X_list_1row]: scaler = MaxAbsScaler(copy=True) X_scaled = scaler.fit(X).transform(X) if isinstance(X, list): X = np.array(X) # cast only after scaling done if _check_dim_1axis(X) == 1: assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), np.ones(n_features)) else: assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0) assert scaler.n_samples_seen_ == X.shape[0] # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) assert_array_almost_equal(X_scaled_back, X) # Constant feature X = np.ones((5, 1)) scaler = MaxAbsScaler() X_scaled = scaler.fit(X).transform(X) assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0) assert scaler.n_samples_seen_ == X.shape[0] # function interface X_1d = X_1row.ravel() max_abs = np.abs(X_1d).max() assert_array_almost_equal(X_1d / max_abs, maxabs_scale(X_1d, copy=True)) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_maxabs_scaler_partial_fit(csr_container): # Test if partial_fit run over many batches of size 1 and 50 # gives the same results as fit X = X_2d[:100, :] n = X.shape[0] for chunk_size in [1, 2, 50, n, n + 42]: # Test mean at the end of the process scaler_batch = MaxAbsScaler().fit(X) scaler_incr = MaxAbsScaler() scaler_incr_csr = MaxAbsScaler() scaler_incr_csc = MaxAbsScaler() for batch in gen_batches(n, chunk_size): scaler_incr = scaler_incr.partial_fit(X[batch]) X_csr = csr_container(X[batch]) scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr) X_csc = csr_container(X[batch]) scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc) assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_) assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csr.max_abs_) assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csc.max_abs_) assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ assert scaler_batch.n_samples_seen_ == scaler_incr_csr.n_samples_seen_ assert scaler_batch.n_samples_seen_ == scaler_incr_csc.n_samples_seen_ assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_) assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_) assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X)) # Test std after 1 step batch0 = slice(0, chunk_size) scaler_batch = MaxAbsScaler().fit(X[batch0]) scaler_incr = MaxAbsScaler().partial_fit(X[batch0]) assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_) assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_ assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_) assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X)) # Test std until the end of partial fits, and scaler_batch = MaxAbsScaler().fit(X) scaler_incr = MaxAbsScaler() # Clean estimator for i, batch in enumerate(gen_batches(n, chunk_size)): scaler_incr = scaler_incr.partial_fit(X[batch]) assert_correct_incr( i, batch_start=batch.start, batch_stop=batch.stop, n=n, chunk_size=chunk_size, n_samples_seen=scaler_incr.n_samples_seen_, ) def check_normalizer(norm, X_norm): """ Convenient checking function for `test_normalizer_l1_l2_max` and `test_normalizer_l1_l2_max_non_csr` """ if norm == "l1": row_sums = np.abs(X_norm).sum(axis=1) for i in range(3): assert_almost_equal(row_sums[i], 1.0) assert_almost_equal(row_sums[3], 0.0) elif norm == "l2": for i in range(3): assert_almost_equal(la.norm(X_norm[i]), 1.0) assert_almost_equal(la.norm(X_norm[3]), 0.0) elif norm == "max": row_maxs = abs(X_norm).max(axis=1) for i in range(3): assert_almost_equal(row_maxs[i], 1.0) assert_almost_equal(row_maxs[3], 0.0) @pytest.mark.parametrize("norm", ["l1", "l2", "max"]) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_normalizer_l1_l2_max(norm, csr_container): rng = np.random.RandomState(0) X_dense = rng.randn(4, 5) X_sparse_unpruned = csr_container(X_dense) # set the row number 3 to zero X_dense[3, :] = 0.0 # set the row number 3 to zero without pruning (can happen in real life) indptr_3 = X_sparse_unpruned.indptr[3] indptr_4 = X_sparse_unpruned.indptr[4] X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0 # build the pruned variant using the regular constructor X_sparse_pruned = csr_container(X_dense) # check inputs that support the no-copy optim for X in (X_dense, X_sparse_pruned, X_sparse_unpruned): normalizer = Normalizer(norm=norm, copy=True) X_norm1 = normalizer.transform(X) assert X_norm1 is not X X_norm1 = toarray(X_norm1) normalizer = Normalizer(norm=norm, copy=False) X_norm2 = normalizer.transform(X) assert X_norm2 is X X_norm2 = toarray(X_norm2) for X_norm in (X_norm1, X_norm2): check_normalizer(norm, X_norm) @pytest.mark.parametrize("norm", ["l1", "l2", "max"]) @pytest.mark.parametrize( "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + LIL_CONTAINERS ) def test_normalizer_l1_l2_max_non_csr(norm, sparse_container): rng = np.random.RandomState(0) X_dense = rng.randn(4, 5) # set the row number 3 to zero X_dense[3, :] = 0.0 X = sparse_container(X_dense) X_norm = Normalizer(norm=norm, copy=False).transform(X) assert X_norm is not X assert sparse.issparse(X_norm) and X_norm.format == "csr" X_norm = toarray(X_norm) check_normalizer(norm, X_norm) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_normalizer_max_sign(csr_container): # check that we normalize by a positive number even for negative data rng = np.random.RandomState(0) X_dense = rng.randn(4, 5) # set the row number 3 to zero X_dense[3, :] = 0.0 # check for mixed data where the value with # largest magnitude is negative X_dense[2, abs(X_dense[2, :]).argmax()] *= -1 X_all_neg = -np.abs(X_dense) X_all_neg_sparse = csr_container(X_all_neg) for X in (X_dense, X_all_neg, X_all_neg_sparse): normalizer = Normalizer(norm="max") X_norm = normalizer.transform(X) assert X_norm is not X X_norm = toarray(X_norm) assert_array_equal(np.sign(X_norm), np.sign(toarray(X))) @pytest.mark.parametrize("csr_container", CSR_CONTAINERS) def test_normalize(csr_container): # Test normalize function # Only tests functionality not used by the tests for Normalizer. X = np.random.RandomState(37).randn(3, 2) assert_array_equal(normalize(X, copy=False), normalize(X.T, axis=0, copy=False).T) rs = np.random.RandomState(0) X_dense = rs.randn(10, 5) X_sparse = csr_container(X_dense) ones = np.ones((10)) for X in (X_dense, X_sparse): for dtype in (np.float32, np.float64): for norm in ("l1", "l2"): X = X.astype(dtype) X_norm = normalize(X, norm=norm) assert X_norm.dtype == dtype X_norm = toarray(X_norm) if norm == "l1": row_sums = np.abs(X_norm).sum(axis=1) else: X_norm_squared = X_norm**2 row_sums = X_norm_squared.sum(axis=1) assert_array_almost_equal(row_sums, ones) # Test return_norm X_dense = np.array([[3.0, 0, 4.0], [1.0, 0.0, 0.0], [2.0, 3.0, 0.0]]) for norm in ("l1", "l2", "max"): _, norms = normalize(X_dense, norm=norm, return_norm=True) if norm == "l1": assert_array_almost_equal(norms, np.array([7.0, 1.0, 5.0])) elif norm == "l2": assert_array_almost_equal(norms, np.array([5.0, 1.0, 3.60555127])) else: assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0])) X_sparse = csr_container(X_dense) for norm in ("l1", "l2"): with pytest.raises(NotImplementedError): normalize(X_sparse, norm=norm, return_norm=True) _, norms = normalize(X_sparse, norm="max", return_norm=True) assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0])) @pytest.mark.parametrize( "constructor", [np.array, list] + CSC_CONTAINERS + CSR_CONTAINERS ) def test_binarizer(constructor): X_ = np.array([[1, 0, 5], [2, 3, -1]]) X = constructor(X_.copy()) binarizer = Binarizer(threshold=2.0, copy=True) X_bin = toarray(binarizer.transform(X)) assert np.sum(X_bin == 0) == 4 assert np.sum(X_bin == 1) == 2 X_bin = binarizer.transform(X) assert sparse.issparse(X) == sparse.issparse(X_bin) binarizer = Binarizer(copy=True).fit(X) X_bin = toarray(binarizer.transform(X)) assert X_bin is not X assert np.sum(X_bin == 0) == 2 assert np.sum(X_bin == 1) == 4 binarizer = Binarizer(copy=True) X_bin = binarizer.transform(X) assert X_bin is not X X_bin = toarray(X_bin) assert np.sum(X_bin == 0) == 2 assert np.sum(X_bin == 1) == 4 binarizer = Binarizer(copy=False) X_bin = binarizer.transform(X) if constructor is not list: assert X_bin is X binarizer = Binarizer(copy=False) X_float = np.array([[1, 0, 5], [2, 3, -1]], dtype=np.float64) X_bin = binarizer.transform(X_float) if constructor is not list: assert X_bin is X_float X_bin = toarray(X_bin) assert np.sum(X_bin == 0) == 2 assert np.sum(X_bin == 1) == 4 binarizer = Binarizer(threshold=-0.5, copy=True) if constructor in (np.array, list): X = constructor(X_.copy()) X_bin = toarray(binarizer.transform(X)) assert np.sum(X_bin == 0) == 1 assert np.sum(X_bin == 1) == 5 X_bin = binarizer.transform(X) # Cannot use threshold < 0 for sparse if constructor in CSC_CONTAINERS: with pytest.raises(ValueError): binarizer.transform(constructor(X)) def test_center_kernel(): # Test that KernelCenterer is equivalent to StandardScaler # in feature space rng = np.random.RandomState(0) X_fit = rng.random_sample((5, 4)) scaler = StandardScaler(with_std=False) scaler.fit(X_fit) X_fit_centered = scaler.transform(X_fit) K_fit = np.dot(X_fit, X_fit.T) # center fit time matrix centerer = KernelCenterer() K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T) K_fit_centered2 = centerer.fit_transform(K_fit) assert_array_almost_equal(K_fit_centered, K_fit_centered2) # center predict time matrix X_pred = rng.random_sample((2, 4)) K_pred = np.dot(X_pred, X_fit.T) X_pred_centered = scaler.transform(X_pred) K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T) K_pred_centered2 = centerer.transform(K_pred) assert_array_almost_equal(K_pred_centered, K_pred_centered2) # check the results coherence with the method proposed in: # B. Schölkopf, A. Smola, and K.R. Müller, # "Nonlinear component analysis as a kernel eigenvalue problem" # equation (B.3) # K_centered3 = (I - 1_M) K (I - 1_M) # = K - 1_M K - K 1_M + 1_M K 1_M ones_M = np.ones_like(K_fit) / K_fit.shape[0] K_fit_centered3 = K_fit - ones_M @ K_fit - K_fit @ ones_M + ones_M @ K_fit @ ones_M assert_allclose(K_fit_centered, K_fit_centered3) # K_test_centered3 = (K_test - 1'_M K)(I - 1_M) # = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M ones_prime_M = np.ones_like(K_pred) / K_fit.shape[0] K_pred_centered3 = ( K_pred - ones_prime_M @ K_fit - K_pred @ ones_M + ones_prime_M @ K_fit @ ones_M ) assert_allclose(K_pred_centered, K_pred_centered3) def test_kernelcenterer_non_linear_kernel(): """Check kernel centering for non-linear kernel.""" rng = np.random.RandomState(0) X, X_test = rng.randn(100, 50), rng.randn(20, 50) def phi(X): """Our mapping function phi.""" return np.vstack( [ np.clip(X, a_min=0, a_max=None), -np.clip(X, a_min=None, a_max=0), ] ) phi_X = phi(X) phi_X_test = phi(X_test) # centered the projection scaler = StandardScaler(with_std=False) phi_X_center = scaler.fit_transform(phi_X) phi_X_test_center = scaler.transform(phi_X_test) # create the different kernel K = phi_X @ phi_X.T K_test = phi_X_test @ phi_X.T K_center = phi_X_center @ phi_X_center.T K_test_center = phi_X_test_center @ phi_X_center.T kernel_centerer = KernelCenterer() kernel_centerer.fit(K) assert_allclose(kernel_centerer.transform(K), K_center) assert_allclose(kernel_centerer.transform(K_test), K_test_center) # check the results coherence with the method proposed in: # B. Schölkopf, A. Smola, and K.R. Müller, # "Nonlinear component analysis as a kernel eigenvalue problem" # equation (B.3) # K_centered = (I - 1_M) K (I - 1_M) # = K - 1_M K - K 1_M + 1_M K 1_M ones_M = np.ones_like(K) / K.shape[0] K_centered = K - ones_M @ K - K @ ones_M + ones_M @ K @ ones_M assert_allclose(kernel_centerer.transform(K), K_centered) # K_test_centered = (K_test - 1'_M K)(I - 1_M) # = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M ones_prime_M = np.ones_like(K_test) / K.shape[0] K_test_centered = ( K_test - ones_prime_M @ K - K_test @ ones_M + ones_prime_M @ K @ ones_M ) assert_allclose(kernel_centerer.transform(K_test), K_test_centered) def test_cv_pipeline_precomputed(): # Cross-validate a regression on four coplanar points with the same # value. Use precomputed kernel to ensure Pipeline with KernelCenterer # is treated as a pairwise operation. X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]]) y_true = np.ones((4,)) K = X.dot(X.T) kcent = KernelCenterer() pipeline = Pipeline([("kernel_centerer", kcent), ("svr", SVR())]) # did the pipeline set the pairwise attribute? assert pipeline._get_tags()["pairwise"] # test cross-validation, score should be almost perfect # NB: this test is pretty vacuous -- it's mainly to test integration # of Pipeline and KernelCenterer y_pred = cross_val_predict(pipeline, K, y_true, cv=2) assert_array_almost_equal(y_true, y_pred) def test_fit_transform(): rng = np.random.RandomState(0) X = rng.random_sample((5, 4)) for obj in (StandardScaler(), Normalizer(), Binarizer()): X_transformed = obj.fit(X).transform(X) X_transformed2 = obj.fit_transform(X) assert_array_equal(X_transformed, X_transformed2) def test_add_dummy_feature(): X = [[1, 0], [0, 1], [0, 1]] X = add_dummy_feature(X) assert_array_equal(X, [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) @pytest.mark.parametrize( "sparse_container", COO_CONTAINERS + CSC_CONTAINERS + CSR_CONTAINERS ) def test_add_dummy_feature_sparse(sparse_container): X = sparse_container([[1, 0], [0, 1], [0, 1]]) desired_format = X.format X = add_dummy_feature(X) assert sparse.issparse(X) and X.format == desired_format, X assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]]) def test_fit_cold_start(): X = iris.data X_2d = X[:, :2] # Scalers that have a partial_fit method scalers = [ StandardScaler(with_mean=False, with_std=False), MinMaxScaler(), MaxAbsScaler(), ] for scaler in scalers: scaler.fit_transform(X) # with a different shape, this may break the scaler unless the internal # state is reset scaler.fit_transform(X_2d) @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) def test_power_transformer_notfitted(method): pt = PowerTransformer(method=method) X = np.abs(X_1col) with pytest.raises(NotFittedError): pt.transform(X) with pytest.raises(NotFittedError): pt.inverse_transform(X) @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) @pytest.mark.parametrize("standardize", [True, False]) @pytest.mark.parametrize("X", [X_1col, X_2d]) def test_power_transformer_inverse(method, standardize, X): # Make sure we get the original input when applying transform and then # inverse transform X = np.abs(X) if method == "box-cox" else X pt = PowerTransformer(method=method, standardize=standardize) X_trans = pt.fit_transform(X) assert_almost_equal(X, pt.inverse_transform(X_trans)) def test_power_transformer_1d(): X = np.abs(X_1col) for standardize in [True, False]: pt = PowerTransformer(method="box-cox", standardize=standardize) X_trans = pt.fit_transform(X) X_trans_func = power_transform(X, method="box-cox", standardize=standardize) X_expected, lambda_expected = stats.boxcox(X.flatten()) if standardize: X_expected = scale(X_expected) assert_almost_equal(X_expected.reshape(-1, 1), X_trans) assert_almost_equal(X_expected.reshape(-1, 1), X_trans_func) assert_almost_equal(X, pt.inverse_transform(X_trans)) assert_almost_equal(lambda_expected, pt.lambdas_[0]) assert len(pt.lambdas_) == X.shape[1] assert isinstance(pt.lambdas_, np.ndarray) def test_power_transformer_2d(): X = np.abs(X_2d) for standardize in [True, False]: pt = PowerTransformer(method="box-cox", standardize=standardize) X_trans_class = pt.fit_transform(X) X_trans_func = power_transform(X, method="box-cox", standardize=standardize) for X_trans in [X_trans_class, X_trans_func]: for j in range(X_trans.shape[1]): X_expected, lmbda = stats.boxcox(X[:, j].flatten()) if standardize: X_expected = scale(X_expected) assert_almost_equal(X_trans[:, j], X_expected) assert_almost_equal(lmbda, pt.lambdas_[j]) # Test inverse transformation X_inv = pt.inverse_transform(X_trans) assert_array_almost_equal(X_inv, X) assert len(pt.lambdas_) == X.shape[1] assert isinstance(pt.lambdas_, np.ndarray) def test_power_transformer_boxcox_strictly_positive_exception(): # Exceptions should be raised for negative arrays and zero arrays when # method is boxcox pt = PowerTransformer(method="box-cox") pt.fit(np.abs(X_2d)) X_with_negatives = X_2d not_positive_message = "strictly positive" with pytest.raises(ValueError, match=not_positive_message): pt.transform(X_with_negatives) with pytest.raises(ValueError, match=not_positive_message): pt.fit(X_with_negatives) with pytest.raises(ValueError, match=not_positive_message): power_transform(X_with_negatives, method="box-cox") with pytest.raises(ValueError, match=not_positive_message): pt.transform(np.zeros(X_2d.shape)) with pytest.raises(ValueError, match=not_positive_message): pt.fit(np.zeros(X_2d.shape)) with pytest.raises(ValueError, match=not_positive_message): power_transform(np.zeros(X_2d.shape), method="box-cox") @pytest.mark.parametrize("X", [X_2d, np.abs(X_2d), -np.abs(X_2d), np.zeros(X_2d.shape)]) def test_power_transformer_yeojohnson_any_input(X): # Yeo-Johnson method should support any kind of input power_transform(X, method="yeo-johnson") @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) def test_power_transformer_shape_exception(method): pt = PowerTransformer(method=method) X = np.abs(X_2d) pt.fit(X) # Exceptions should be raised for arrays with different num_columns # than during fitting wrong_shape_message = ( r"X has \d+ features, but PowerTransformer is " r"expecting \d+ features" ) with pytest.raises(ValueError, match=wrong_shape_message): pt.transform(X[:, 0:1]) with pytest.raises(ValueError, match=wrong_shape_message): pt.inverse_transform(X[:, 0:1]) def test_power_transformer_lambda_zero(): pt = PowerTransformer(method="box-cox", standardize=False) X = np.abs(X_2d)[:, 0:1] # Test the lambda = 0 case pt.lambdas_ = np.array([0]) X_trans = pt.transform(X) assert_array_almost_equal(pt.inverse_transform(X_trans), X) def test_power_transformer_lambda_one(): # Make sure lambda = 1 corresponds to the identity for yeo-johnson pt = PowerTransformer(method="yeo-johnson", standardize=False) X = np.abs(X_2d)[:, 0:1] pt.lambdas_ = np.array([1]) X_trans = pt.transform(X) assert_array_almost_equal(X_trans, X) @pytest.mark.parametrize( "method, lmbda", [ ("box-cox", 0.1), ("box-cox", 0.5), ("yeo-johnson", 0.1), ("yeo-johnson", 0.5), ("yeo-johnson", 1.0), ], ) def test_optimization_power_transformer(method, lmbda): # Test the optimization procedure: # - set a predefined value for lambda # - apply inverse_transform to a normal dist (we get X_inv) # - apply fit_transform to X_inv (we get X_inv_trans) # - check that X_inv_trans is roughly equal to X rng = np.random.RandomState(0) n_samples = 20000 X = rng.normal(loc=0, scale=1, size=(n_samples, 1)) pt = PowerTransformer(method=method, standardize=False) pt.lambdas_ = [lmbda] X_inv = pt.inverse_transform(X) pt = PowerTransformer(method=method, standardize=False) X_inv_trans = pt.fit_transform(X_inv) assert_almost_equal(0, np.linalg.norm(X - X_inv_trans) / n_samples, decimal=2) assert_almost_equal(0, X_inv_trans.mean(), decimal=1) assert_almost_equal(1, X_inv_trans.std(), decimal=1) def test_yeo_johnson_darwin_example(): # test from original paper "A new family of power transformations to # improve normality or symmetry" by Yeo and Johnson. X = [6.1, -8.4, 1.0, 2.0, 0.7, 2.9, 3.5, 5.1, 1.8, 3.6, 7.0, 3.0, 9.3, 7.5, -6.0] X = np.array(X).reshape(-1, 1) lmbda = PowerTransformer(method="yeo-johnson").fit(X).lambdas_ assert np.allclose(lmbda, 1.305, atol=1e-3) @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) def test_power_transformer_nans(method): # Make sure lambda estimation is not influenced by NaN values # and that transform() supports NaN silently X = np.abs(X_1col) pt = PowerTransformer(method=method) pt.fit(X) lmbda_no_nans = pt.lambdas_[0] # concat nans at the end and check lambda stays the same X = np.concatenate([X, np.full_like(X, np.nan)]) X = shuffle(X, random_state=0) pt.fit(X) lmbda_nans = pt.lambdas_[0] assert_almost_equal(lmbda_no_nans, lmbda_nans, decimal=5) X_trans = pt.transform(X) assert_array_equal(np.isnan(X_trans), np.isnan(X)) @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) @pytest.mark.parametrize("standardize", [True, False]) def test_power_transformer_fit_transform(method, standardize): # check that fit_transform() and fit().transform() return the same values X = X_1col if method == "box-cox": X = np.abs(X) pt = PowerTransformer(method, standardize=standardize) assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X)) @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) @pytest.mark.parametrize("standardize", [True, False]) def test_power_transformer_copy_True(method, standardize): # Check that neither fit, transform, fit_transform nor inverse_transform # modify X inplace when copy=True X = X_1col if method == "box-cox": X = np.abs(X) X_original = X.copy() assert X is not X_original # sanity checks assert_array_almost_equal(X, X_original) pt = PowerTransformer(method, standardize=standardize, copy=True) pt.fit(X) assert_array_almost_equal(X, X_original) X_trans = pt.transform(X) assert X_trans is not X X_trans = pt.fit_transform(X) assert_array_almost_equal(X, X_original) assert X_trans is not X X_inv_trans = pt.inverse_transform(X_trans) assert X_trans is not X_inv_trans @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"]) @pytest.mark.parametrize("standardize", [True, False]) def test_power_transformer_copy_False(method, standardize): # check that when copy=False fit doesn't change X inplace but transform, # fit_transform and inverse_transform do. X = X_1col if method == "box-cox": X = np.abs(X) X_original = X.copy() assert X is not X_original # sanity checks assert_array_almost_equal(X, X_original) pt = PowerTransformer(method, standardize=standardize, copy=False) pt.fit(X) assert_array_almost_equal(X, X_original) # fit didn't change X X_trans = pt.transform(X) assert X_trans is X if method == "box-cox": X = np.abs(X) X_trans = pt.fit_transform(X) assert X_trans is X X_inv_trans = pt.inverse_transform(X_trans) assert X_trans is X_inv_trans def test_power_transformer_box_cox_raise_all_nans_col(): """Check that box-cox raises informative when a column contains all nans. Non-regression test for gh-26303 """ X = rng.random_sample((4, 5)) X[:, 0] = np.nan err_msg = "Column must not be all nan." pt = PowerTransformer(method="box-cox") with pytest.raises(ValueError, match=err_msg): pt.fit_transform(X) @pytest.mark.parametrize( "X_2", [sparse.random(10, 1, density=0.8, random_state=0)] + [ csr_container(np.full((10, 1), fill_value=np.nan)) for csr_container in CSR_CONTAINERS ], ) def test_standard_scaler_sparse_partial_fit_finite_variance(X_2): # non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/16448 X_1 = sparse.random(5, 1, density=0.8) scaler = StandardScaler(with_mean=False) scaler.fit(X_1).partial_fit(X_2) assert np.isfinite(scaler.var_[0]) @pytest.mark.parametrize("feature_range", [(0, 1), (-10, 10)]) def test_minmax_scaler_clip(feature_range): # test behaviour of the parameter 'clip' in MinMaxScaler X = iris.data scaler = MinMaxScaler(feature_range=feature_range, clip=True).fit(X) X_min, X_max = np.min(X, axis=0), np.max(X, axis=0) X_test = [np.r_[X_min[:2] - 10, X_max[2:] + 10]] X_transformed = scaler.transform(X_test) assert_allclose( X_transformed, [[feature_range[0], feature_range[0], feature_range[1], feature_range[1]]], ) def test_standard_scaler_raise_error_for_1d_input(): """Check that `inverse_transform` from `StandardScaler` raises an error with 1D array. Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/19518 """ scaler = StandardScaler().fit(X_2d) err_msg = "Expected 2D array, got 1D array instead" with pytest.raises(ValueError, match=err_msg): scaler.inverse_transform(X_2d[:, 0]) def test_power_transformer_significantly_non_gaussian(): """Check that significantly non-Gaussian data before transforms correctly. For some explored lambdas, the transformed data may be constant and will be rejected. Non-regression test for https://github.com/scikit-learn/scikit-learn/issues/14959 """ X_non_gaussian = 1e6 * np.array( [0.6, 2.0, 3.0, 4.0] * 4 + [11, 12, 12, 16, 17, 20, 85, 90], dtype=np.float64 ).reshape(-1, 1) pt = PowerTransformer() with warnings.catch_warnings(): warnings.simplefilter("error", RuntimeWarning) X_trans = pt.fit_transform(X_non_gaussian) assert not np.any(np.isnan(X_trans)) assert X_trans.mean() == pytest.approx(0.0) assert X_trans.std() == pytest.approx(1.0) assert X_trans.min() > -2 assert X_trans.max() < 2 @pytest.mark.parametrize( "Transformer", [ MinMaxScaler, MaxAbsScaler, RobustScaler, StandardScaler, QuantileTransformer, PowerTransformer, ], ) def test_one_to_one_features(Transformer): """Check one-to-one transformers give correct feature names.""" tr = Transformer().fit(iris.data) names_out = tr.get_feature_names_out(iris.feature_names) assert_array_equal(names_out, iris.feature_names) @pytest.mark.parametrize( "Transformer", [ MinMaxScaler, MaxAbsScaler, RobustScaler, StandardScaler, QuantileTransformer, PowerTransformer, Normalizer, Binarizer, ], ) def test_one_to_one_features_pandas(Transformer): """Check one-to-one transformers give correct feature names.""" pd = pytest.importorskip("pandas") df = pd.DataFrame(iris.data, columns=iris.feature_names) tr = Transformer().fit(df) names_out_df_default = tr.get_feature_names_out() assert_array_equal(names_out_df_default, iris.feature_names) names_out_df_valid_in = tr.get_feature_names_out(iris.feature_names) assert_array_equal(names_out_df_valid_in, iris.feature_names) msg = re.escape("input_features is not equal to feature_names_in_") with pytest.raises(ValueError, match=msg): invalid_names = list("abcd") tr.get_feature_names_out(invalid_names) def test_kernel_centerer_feature_names_out(): """Test that kernel centerer `feature_names_out`.""" rng = np.random.RandomState(0) X = rng.random_sample((6, 4)) X_pairwise = linear_kernel(X) centerer = KernelCenterer().fit(X_pairwise) names_out = centerer.get_feature_names_out() samples_out2 = X_pairwise.shape[1] assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)]) @pytest.mark.parametrize("standardize", [True, False]) def test_power_transformer_constant_feature(standardize): """Check that PowerTransfomer leaves constant features unchanged.""" X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]] pt = PowerTransformer(method="yeo-johnson", standardize=standardize).fit(X) assert_allclose(pt.lambdas_, [1, 1, 1]) Xft = pt.fit_transform(X) Xt = pt.transform(X) for Xt_ in [Xft, Xt]: if standardize: assert_allclose(Xt_, np.zeros_like(X)) else: assert_allclose(Xt_, X)