# Author: Henry Lin
#         Tom Dupré la Tour
# License: BSD

import warnings
from numbers import Integral

import numpy as np

from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import _safe_indexing
from ..utils._param_validation import Hidden, Interval, Options, StrOptions
from ..utils.stats import _weighted_percentile
from ..utils.validation import (
    _check_feature_names_in,
    _check_sample_weight,
    check_array,
    check_is_fitted,
    check_random_state,
)
from ._encoders import OneHotEncoder


class KBinsDiscretizer(TransformerMixin, BaseEstimator):
    """
    Bin continuous data into intervals.

    Read more in the :ref:`User Guide <preprocessing_discretization>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    n_bins : int or array-like of shape (n_features,), default=5
        The number of bins to produce. Raises ValueError if ``n_bins < 2``.

    encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
        Method used to encode the transformed result.

        - 'onehot': Encode the transformed result with one-hot encoding
          and return a sparse matrix. Ignored features are always
          stacked to the right.
        - 'onehot-dense': Encode the transformed result with one-hot encoding
          and return a dense array. Ignored features are always
          stacked to the right.
        - 'ordinal': Return the bin identifier encoded as an integer value.

    strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins.

        - 'uniform': All bins in each feature have identical widths.
        - 'quantile': All bins in each feature have the same number of points.
        - 'kmeans': Values in each bin have the same nearest center of a 1D
          k-means cluster.

        For an example of the different strategies see:
        :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`.

    dtype : {np.float32, np.float64}, default=None
        The desired data-type for the output. If None, output dtype is
        consistent with input dtype. Only np.float32 and np.float64 are
        supported.

        .. versionadded:: 0.24

    subsample : int or None, default='warn'
        Maximum number of samples, used to fit the model, for computational
        efficiency. Defaults to 200_000 when `strategy='quantile'` and to
        `None` when `strategy='uniform'` or `strategy='kmeans'`.
        `subsample=None` means that all the training samples are used when
        computing the quantiles that determine the binning thresholds.
        Since quantile computation relies on sorting each column of `X` and
        that sorting has an `n log(n)` time complexity, it is recommended
        to use subsampling on datasets with a very large number of samples.

        .. versionchanged:: 1.3
            The default value of `subsample` changed from `None` to `200_000`
            when `strategy="quantile"`.

        .. versionchanged:: 1.5
            The default value of `subsample` changed from `None` to `200_000`
            when `strategy="uniform"` or `strategy="kmeans"`.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for subsampling.
        Pass an int for reproducible results across multiple function calls.
        See the `subsample` parameter for more details.
        See :term:`Glossary <random_state>`.

        .. versionadded:: 1.1

    Attributes
    ----------
    bin_edges_ : ndarray of ndarray of shape (n_features,)
        The edges of each bin. Contains arrays of varying shapes
        ``(n_bins_, )``. Ignored features will have empty arrays.

    n_bins_ : ndarray of shape (n_features,), dtype=np.int64
        Number of bins per feature. Bins whose widths are too small
        (i.e., <= 1e-8) are removed with a warning.

    n_features_in_ : int
        Number of features seen during :term:`fit`.
        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Binarizer : Class used to bin values as ``0`` or ``1`` based on a
        parameter ``threshold``.

    Notes
    -----

    For a visualization of discretization on different datasets refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`.
    On the effect of discretization on linear models see:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`.

    In bin edges for feature ``i``, the first and last values are used only
    for ``inverse_transform``. During transform, bin edges are extended to::

      np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])

    You can combine ``KBinsDiscretizer`` with
    :class:`~sklearn.compose.ColumnTransformer` if you only want to
    preprocess part of the features.

    ``KBinsDiscretizer`` might produce constant features (e.g., when
    ``encode = 'onehot'`` and certain bins do not contain any data). These
    features can be removed with feature selection algorithms
    (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).

    Examples
    --------
    >>> from sklearn.preprocessing import KBinsDiscretizer
    >>> X = [[-2, 1, -4,   -1],
    ...      [-1, 2, -3, -0.5],
    ...      [ 0, 3, -2,  0.5],
    ...      [ 1, 4, -1,    2]]
    >>> est = KBinsDiscretizer(
    ...     n_bins=3, encode='ordinal', strategy='uniform', subsample=None
    ... )
    >>> est.fit(X)
    KBinsDiscretizer(...)
    >>> Xt = est.transform(X)
    >>> Xt  # doctest: +SKIP
    array([[ 0., 0., 0., 0.],
           [ 1., 1., 1., 0.],
           [ 2., 2., 2., 1.],
           [ 2., 2., 2., 2.]])

    Sometimes it may be useful to convert the data back into the original
    feature space. The ``inverse_transform`` function converts the binned
    data into the original feature space. Each value will be equal to the
    mean of the two bin edges.

    >>> est.bin_edges_[0]
    array([-2., -1.,  0.,  1.])
    >>> est.inverse_transform(Xt)
    array([[-1.5,  1.5, -3.5, -0.5],
           [-0.5,  2.5, -2.5, -0.5],
           [ 0.5,  3.5, -1.5,  0.5],
           [ 0.5,  3.5, -1.5,  1.5]])
    """

    _parameter_constraints: dict = {
        "n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"],
        "encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})],
        "strategy": [StrOptions({"uniform", "quantile", "kmeans"})],
        "dtype": [Options(type, {np.float64, np.float32}), None],
        "subsample": [
            Interval(Integral, 1, None, closed="left"),
            None,
            Hidden(StrOptions({"warn"})),
        ],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        n_bins=5,
        *,
        encode="onehot",
        strategy="quantile",
        dtype=None,
        subsample="warn",
        random_state=None,
    ):
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.dtype = dtype
        self.subsample = subsample
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """
        Fit the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        sample_weight : ndarray of shape (n_samples,)
            Contains weight values to be associated with each sample.
            Only possible when `strategy` is set to `"quantile"`.

            .. versionadded:: 1.3

        Returns
        -------
        self : object
            Returns the instance itself.
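
        Examples
        --------
        A minimal illustrative sketch with arbitrary toy values: when
        `strategy="quantile"`, per-sample weights pull the learned bin
        edges towards the more heavily weighted samples.

        >>> import numpy as np
        >>> from sklearn.preprocessing import KBinsDiscretizer
        >>> X = np.array([[0.0], [1.0], [2.0], [3.0], [4.0], [5.0]])
        >>> est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
        >>> est.fit(X, sample_weight=[1, 1, 3, 1, 1, 1])
        KBinsDiscretizer(...)
        >>> Xt = est.transform(X)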
""" X = self._validate_data(X, dtype="numeric") if self.dtype in (np.float64, np.float32): output_dtype = self.dtype else: # self.dtype is None output_dtype = X.dtype n_samples, n_features = X.shape if sample_weight is not None and self.strategy == "uniform": raise ValueError( "`sample_weight` was provided but it cannot be " "used with strategy='uniform'. Got strategy=" f"{self.strategy!r} instead." ) if self.strategy in ("uniform", "kmeans") and self.subsample == "warn": warnings.warn( ( "In version 1.5 onwards, subsample=200_000 " "will be used by default. Set subsample explicitly to " "silence this warning in the mean time. Set " "subsample=None to disable subsampling explicitly." ), FutureWarning, ) subsample = self.subsample if subsample == "warn": subsample = 200000 if self.strategy == "quantile" else None if subsample is not None and n_samples > subsample: rng = check_random_state(self.random_state) subsample_idx = rng.choice(n_samples, size=subsample, replace=False) X = _safe_indexing(X, subsample_idx) n_features = X.shape[1] n_bins = self._validate_n_bins(n_features) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) bin_edges = np.zeros(n_features, dtype=object) for jj in range(n_features): column = X[:, jj] col_min, col_max = column.min(), column.max() if col_min == col_max: warnings.warn( "Feature %d is constant and will be replaced with 0." % jj ) n_bins[jj] = 1 bin_edges[jj] = np.array([-np.inf, np.inf]) continue if self.strategy == "uniform": bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1) elif self.strategy == "quantile": quantiles = np.linspace(0, 100, n_bins[jj] + 1) if sample_weight is None: bin_edges[jj] = np.asarray(np.percentile(column, quantiles)) else: bin_edges[jj] = np.asarray( [ _weighted_percentile(column, sample_weight, q) for q in quantiles ], dtype=np.float64, ) elif self.strategy == "kmeans": from ..cluster import KMeans # fixes import loops # Deterministic initialization with uniform spacing uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1) init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5 # 1D k-means procedure km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1) centers = km.fit( column[:, None], sample_weight=sample_weight ).cluster_centers_[:, 0] # Must sort, centers may be unsorted even with sorted init centers.sort() bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5 bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max] # Remove bins whose width are too small (i.e., <= 1e-8) if self.strategy in ("quantile", "kmeans"): mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 bin_edges[jj] = bin_edges[jj][mask] if len(bin_edges[jj]) - 1 != n_bins[jj]: warnings.warn( "Bins whose width are too small (i.e., <= " "1e-8) in feature %d are removed. Consider " "decreasing the number of bins." 
                        % jj
                    )
                    n_bins[jj] = len(bin_edges[jj]) - 1

        self.bin_edges_ = bin_edges
        self.n_bins_ = n_bins

        if "onehot" in self.encode:
            self._encoder = OneHotEncoder(
                categories=[np.arange(i) for i in self.n_bins_],
                sparse_output=self.encode == "onehot",
                dtype=output_dtype,
            )
            # Fit the OneHotEncoder with toy datasets
            # so that it's ready for use after the KBinsDiscretizer is fitted
            self._encoder.fit(np.zeros((1, len(self.n_bins_))))

        return self

    def _validate_n_bins(self, n_features):
        """Returns n_bins_, the number of bins per feature."""
        orig_bins = self.n_bins
        if isinstance(orig_bins, Integral):
            return np.full(n_features, orig_bins, dtype=int)

        n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)

        if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
            raise ValueError(
                "n_bins must be a scalar or array of shape (n_features,)."
            )

        bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)

        violating_indices = np.where(bad_nbins_value)[0]
        if violating_indices.shape[0] > 0:
            indices = ", ".join(str(i) for i in violating_indices)
            raise ValueError(
                "{} received an invalid number "
                "of bins at indices {}. Number of bins "
                "must be at least 2, and must be an int.".format(
                    KBinsDiscretizer.__name__, indices
                )
            )
        return n_bins

    def transform(self, X):
        """
        Discretize the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        Returns
        -------
        Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
            Data in the binned space. Will be a sparse matrix if
            `self.encode='onehot'` and ndarray otherwise.
        """
        check_is_fitted(self)

        # check input and attribute dtypes
        dtype = (np.float64, np.float32) if self.dtype is None else self.dtype
        Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False)

        bin_edges = self.bin_edges_
        for jj in range(Xt.shape[1]):
            Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right")

        if self.encode == "ordinal":
            return Xt

        dtype_init = None
        if "onehot" in self.encode:
            dtype_init = self._encoder.dtype
            self._encoder.dtype = Xt.dtype
        try:
            Xt_enc = self._encoder.transform(Xt)
        finally:
            # revert the initial dtype to avoid modifying self.
            self._encoder.dtype = dtype_init
        return Xt_enc

    def inverse_transform(self, Xt):
        """
        Transform discretized data back to original feature space.

        Note that this function does not regenerate the original data
        due to discretization rounding.

        Parameters
        ----------
        Xt : array-like of shape (n_samples, n_features)
            Transformed data in the binned space.

        Returns
        -------
        Xinv : ndarray, dtype={np.float32, np.float64}
            Data in the original feature space.
        """
        check_is_fitted(self)

        if "onehot" in self.encode:
            Xt = self._encoder.inverse_transform(Xt)

        Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32))
        n_features = self.n_bins_.shape[0]
        if Xinv.shape[1] != n_features:
            raise ValueError(
                "Incorrect number of features. Expecting {}, received {}.".format(
                    n_features, Xinv.shape[1]
                )
            )

        for jj in range(n_features):
            bin_edges = self.bin_edges_[jj]
            bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5
            Xinv[:, jj] = bin_centers[(Xinv[:, jj]).astype(np.int64)]

        return Xinv

    def get_feature_names_out(self, input_features=None):
        """Get output feature names.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not
              defined, then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features`
              must match `feature_names_in_` if `feature_names_in_` is
              defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        input_features = _check_feature_names_in(self, input_features)
        if hasattr(self, "_encoder"):
            return self._encoder.get_feature_names_out(input_features)

        # ordinal encoding
        return input_features
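

if __name__ == "__main__":
    # Minimal usage sketch, not part of the library API: a quick round trip
    # through ``transform`` / ``inverse_transform`` plus a look at
    # ``get_feature_names_out``, using small made-up data. The public import
    # path is used so the same snippet can also be copied outside this module.
    import numpy as np

    from sklearn.preprocessing import KBinsDiscretizer

    X = np.array([[-2.0, 1.0], [-1.0, 2.0], [0.0, 3.0], [1.0, 4.0]])
    est = KBinsDiscretizer(
        n_bins=2, encode="ordinal", strategy="uniform", subsample=None
    ).fit(X)

    Xt = est.transform(X)  # ordinal bin identifiers, one column per feature
    X_back = est.inverse_transform(Xt)  # bin centers, not the original values
    print(Xt)
    print(X_back)
    # With ordinal encoding the output feature names match the input names.
    print(est.get_feature_names_out(["a", "b"]))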