ai-content-maker/.venv/Lib/site-packages/sklearn/preprocessing/_discretization.py

# Author: Henry Lin <hlin117@gmail.com>
#         Tom Dupré la Tour

# License: BSD


import warnings
from numbers import Integral

import numpy as np

from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import _safe_indexing
from ..utils._param_validation import Hidden, Interval, Options, StrOptions
from ..utils.stats import _weighted_percentile
from ..utils.validation import (
    _check_feature_names_in,
    _check_sample_weight,
    check_array,
    check_is_fitted,
    check_random_state,
)
from ._encoders import OneHotEncoder


class KBinsDiscretizer(TransformerMixin, BaseEstimator):
    """
    Bin continuous data into intervals.

    Read more in the :ref:`User Guide <preprocessing_discretization>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    n_bins : int or array-like of shape (n_features,), default=5
        The number of bins to produce. Raises ValueError if ``n_bins < 2``.

    encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
        Method used to encode the transformed result.

        - 'onehot': Encode the transformed result with one-hot encoding
          and return a sparse matrix. Ignored features are always
          stacked to the right.
        - 'onehot-dense': Encode the transformed result with one-hot encoding
          and return a dense array. Ignored features are always
          stacked to the right.
        - 'ordinal': Return the bin identifier encoded as an integer value.

    strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins.

        - 'uniform': All bins in each feature have identical widths.
        - 'quantile': All bins in each feature have the same number of points.
        - 'kmeans': Values in each bin have the same nearest center of a 1D
          k-means cluster.

        For an example of the different strategies see:
        :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`.

    dtype : {np.float32, np.float64}, default=None
        The desired data-type for the output. If None, output dtype is
        consistent with input dtype. Only np.float32 and np.float64 are
        supported.

        .. versionadded:: 0.24

    subsample : int or None, default='warn'
        Maximum number of samples, used to fit the model, for computational
        efficiency. Defaults to 200_000 when `strategy='quantile'` and to `None`
        when `strategy='uniform'` or `strategy='kmeans'`.
        `subsample=None` means that all the training samples are used when
        computing the quantiles that determine the binning thresholds.
        Since quantile computation relies on sorting each column of `X` and
        that sorting has an `n log(n)` time complexity,
        it is recommended to use subsampling on datasets with a
        very large number of samples.

        .. versionchanged:: 1.3
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="quantile"`.

        .. versionchanged:: 1.5
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="uniform"` or `strategy="kmeans"`.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for subsampling.
        Pass an int for reproducible results across multiple function calls.
        See the `subsample` parameter for more details.
        See :term:`Glossary <random_state>`.

        .. versionadded:: 1.1

    Attributes
    ----------
    bin_edges_ : ndarray of ndarray of shape (n_features,)
        The edges of each bin. Contain arrays of varying shapes
        ``(n_bins_ + 1, )``. Constant features are given the edges
        ``[-inf, inf]``.

    n_bins_ : ndarray of shape (n_features,), dtype=np.int64
        Number of bins per feature. Bins whose width are too small
        (i.e., <= 1e-8) are removed with a warning.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Binarizer : Class used to bin values as ``0`` or
        ``1`` based on a parameter ``threshold``.

    Notes
    -----

    For a visualization of discretization on different datasets refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`.
    On the effect of discretization on linear models see:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`.

    In bin edges for feature ``i``, the first and last values are used only for
    ``inverse_transform``. During transform, bin edges are extended to::

      np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])

    You can combine ``KBinsDiscretizer`` with
    :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
    part of the features.

    ``KBinsDiscretizer`` might produce constant features (e.g., when
    ``encode = 'onehot'`` and certain bins do not contain any data).
    These features can be removed with feature selection algorithms
    (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).

    Examples
    --------
    >>> from sklearn.preprocessing import KBinsDiscretizer
    >>> X = [[-2, 1, -4,   -1],
    ...      [-1, 2, -3, -0.5],
    ...      [ 0, 3, -2,  0.5],
    ...      [ 1, 4, -1,    2]]
    >>> est = KBinsDiscretizer(
    ...     n_bins=3, encode='ordinal', strategy='uniform', subsample=None
    ... )
    >>> est.fit(X)
    KBinsDiscretizer(...)
    >>> Xt = est.transform(X)
    >>> Xt  # doctest: +SKIP
    array([[ 0., 0., 0., 0.],
           [ 1., 1., 1., 0.],
           [ 2., 2., 2., 1.],
           [ 2., 2., 2., 2.]])

    Sometimes it may be useful to convert the data back into the original
    feature space. The ``inverse_transform`` function converts the binned
    data into the original feature space. Each value will be equal to the mean
    of the two bin edges.

    >>> est.bin_edges_[0]
    array([-2., -1.,  0.,  1.])
    >>> est.inverse_transform(Xt)
    array([[-1.5,  1.5, -3.5, -0.5],
           [-0.5,  2.5, -2.5, -0.5],
           [ 0.5,  3.5, -1.5,  0.5],
           [ 0.5,  3.5, -1.5,  1.5]])
    """

    _parameter_constraints: dict = {
        "n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"],
        "encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})],
        "strategy": [StrOptions({"uniform", "quantile", "kmeans"})],
        "dtype": [Options(type, {np.float64, np.float32}), None],
        "subsample": [
            Interval(Integral, 1, None, closed="left"),
            None,
            Hidden(StrOptions({"warn"})),
        ],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        n_bins=5,
        *,
        encode="onehot",
        strategy="quantile",
        dtype=None,
        subsample="warn",
        random_state=None,
    ):
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.dtype = dtype
        self.subsample = subsample
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """
        Fit the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        sample_weight : ndarray of shape (n_samples,)
            Contains weight values to be associated with each sample.
            Cannot be used when `strategy` is set to `"uniform"`.
            When `X` is subsampled, the weights of the selected samples
            are used.

            .. versionadded:: 1.3

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        ValueError
            If `sample_weight` is provided together with
            `strategy="uniform"`, or if `n_bins` is invalid.
        """
        X = self._validate_data(X, dtype="numeric")

        if self.dtype in (np.float64, np.float32):
            output_dtype = self.dtype
        else:  # self.dtype is None
            output_dtype = X.dtype

        n_samples, n_features = X.shape

        if sample_weight is not None and self.strategy == "uniform":
            raise ValueError(
                "`sample_weight` was provided but it cannot be "
                "used with strategy='uniform'. Got strategy="
                f"{self.strategy!r} instead."
            )

        if self.strategy in ("uniform", "kmeans") and self.subsample == "warn":
            warnings.warn(
                (
                    "In version 1.5 onwards, subsample=200_000 "
                    "will be used by default. Set subsample explicitly to "
                    "silence this warning in the mean time. Set "
                    "subsample=None to disable subsampling explicitly."
                ),
                FutureWarning,
            )

        subsample = self.subsample
        if subsample == "warn":
            subsample = 200000 if self.strategy == "quantile" else None

        # Draw the row indices once so that X and sample_weight are reduced
        # consistently; None means that no subsampling takes place.
        subsample_idx = None
        if subsample is not None and n_samples > subsample:
            rng = check_random_state(self.random_state)
            subsample_idx = rng.choice(n_samples, size=subsample, replace=False)

        n_bins = self._validate_n_bins(n_features)

        if sample_weight is not None:
            # Validate against the full X (the weights describe every input
            # row), then keep only the weights of the retained rows.
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
            if subsample_idx is not None:
                sample_weight = _safe_indexing(sample_weight, subsample_idx)

        if subsample_idx is not None:
            X = _safe_indexing(X, subsample_idx)

        bin_edges = np.zeros(n_features, dtype=object)
        for jj in range(n_features):
            column = X[:, jj]
            col_min, col_max = column.min(), column.max()

            if col_min == col_max:
                # A constant column cannot be split: use a single catch-all bin.
                warnings.warn(
                    "Feature %d is constant and will be replaced with 0." % jj
                )
                n_bins[jj] = 1
                bin_edges[jj] = np.array([-np.inf, np.inf])
                continue

            if self.strategy == "uniform":
                bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)

            elif self.strategy == "quantile":
                quantiles = np.linspace(0, 100, n_bins[jj] + 1)
                if sample_weight is None:
                    bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
                else:
                    bin_edges[jj] = np.asarray(
                        [
                            _weighted_percentile(column, sample_weight, q)
                            for q in quantiles
                        ],
                        dtype=np.float64,
                    )
            elif self.strategy == "kmeans":
                from ..cluster import KMeans  # fixes import loops

                # Deterministic initialization with uniform spacing
                uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
                init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5

                # 1D k-means procedure
                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
                centers = km.fit(
                    column[:, None], sample_weight=sample_weight
                ).cluster_centers_[:, 0]
                # Must sort, centers may be unsorted even with sorted init
                centers.sort()
                # Inner edges are midpoints between consecutive centers; the
                # outer edges are the observed column extrema.
                bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
                bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]

            # Remove bins whose width are too small (i.e., <= 1e-8)
            if self.strategy in ("quantile", "kmeans"):
                # to_begin=np.inf guarantees that the first edge is kept.
                mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
                bin_edges[jj] = bin_edges[jj][mask]
                if len(bin_edges[jj]) - 1 != n_bins[jj]:
                    warnings.warn(
                        "Bins whose width are too small (i.e., <= "
                        "1e-8) in feature %d are removed. Consider "
                        "decreasing the number of bins." % jj
                    )
                    n_bins[jj] = len(bin_edges[jj]) - 1

        self.bin_edges_ = bin_edges
        self.n_bins_ = n_bins

        if "onehot" in self.encode:
            self._encoder = OneHotEncoder(
                categories=[np.arange(i) for i in self.n_bins_],
                sparse_output=self.encode == "onehot",
                dtype=output_dtype,
            )
            # Fit the OneHotEncoder with toy datasets
            # so that it's ready for use after the KBinsDiscretizer is fitted
            self._encoder.fit(np.zeros((1, len(self.n_bins_))))

        return self

    def _validate_n_bins(self, n_features):
        """Return the number of bins per feature as an integer array.

        A scalar `n_bins` is broadcast to every feature. An array-like
        `n_bins` must be one-dimensional with `n_features` entries, each an
        integer value of at least 2; otherwise a ValueError is raised.
        """
        orig_bins = self.n_bins
        if isinstance(orig_bins, Integral):
            return np.full(n_features, orig_bins, dtype=int)

        n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)

        if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
            raise ValueError("n_bins must be a scalar or array of shape (n_features,).")

        # Comparing the int-cast array with the original values flags entries
        # that were not integral to begin with (e.g. 2.5 cast to 2).
        bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)

        violating_indices = np.where(bad_nbins_value)[0]
        if violating_indices.shape[0] > 0:
            indices = ", ".join(str(i) for i in violating_indices)
            raise ValueError(
                "{} received an invalid number "
                "of bins at indices {}. Number of bins "
                "must be at least 2, and must be an int.".format(
                    KBinsDiscretizer.__name__, indices
                )
            )
        return n_bins

    def transform(self, X):
        """
        Discretize the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        Returns
        -------
        Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
            Data in the binned space. Will be a sparse matrix if
            `self.encode='onehot'` and ndarray otherwise.
        """
        check_is_fitted(self)

        # check input and attribute dtypes
        dtype = (np.float64, np.float32) if self.dtype is None else self.dtype
        Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False)

        bin_edges = self.bin_edges_
        for jj in range(Xt.shape[1]):
            # Only the inner edges are searched, so values outside the range
            # seen during fit fall into the first or the last bin.
            Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right")

        if self.encode == "ordinal":
            return Xt

        # One-hot encodings: temporarily make the encoder emit Xt's dtype.
        encoder = self._encoder
        dtype_init = encoder.dtype
        encoder.dtype = Xt.dtype
        try:
            Xt_enc = encoder.transform(Xt)
        finally:
            # revert the initial dtype to avoid modifying self.
            encoder.dtype = dtype_init
        return Xt_enc

    def inverse_transform(self, Xt):
        """
        Transform discretized data back to original feature space.

        Note that this function does not regenerate the original data
        due to discretization rounding.

        Parameters
        ----------
        Xt : array-like of shape (n_samples, n_features)
            Transformed data in the binned space.

        Returns
        -------
        Xinv : ndarray, dtype={np.float32, np.float64}
            Data in the original feature space.

        Raises
        ------
        ValueError
            If the number of features of `Xt` (after undoing the one-hot
            encoding, if any) differs from the number seen during `fit`.
        """
        check_is_fitted(self)

        if "onehot" in self.encode:
            Xt = self._encoder.inverse_transform(Xt)

        Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32))
        n_features = self.n_bins_.shape[0]
        if Xinv.shape[1] != n_features:
            raise ValueError(
                "Incorrect number of features. Expecting {}, received {}.".format(
                    n_features, Xinv.shape[1]
                )
            )

        for jj in range(n_features):
            bin_edges = self.bin_edges_[jj]
            # Each bin index is mapped to the midpoint of its two edges.
            bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5
            Xinv[:, jj] = bin_centers[(Xinv[:, jj]).astype(np.int64)]

        return Xinv

    def get_feature_names_out(self, input_features=None):
        """Get output feature names.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        input_features = _check_feature_names_in(self, input_features)
        # The encoder only exists when fit used a one-hot encoding.
        if hasattr(self, "_encoder"):
            return self._encoder.get_feature_names_out(input_features)

        # ordinal encoding
        return input_features
first commit 2024-05-03 04:18:51 +03:00			`# Author: Henry Lin <hlin117@gmail.com>`
			`# Tom Dupré la Tour`

			`# License: BSD`


			`import warnings`
			`from numbers import Integral`

			`import numpy as np`

			`from ..base import BaseEstimator, TransformerMixin, _fit_context`
			`from ..utils import _safe_indexing`
			`from ..utils._param_validation import Hidden, Interval, Options, StrOptions`
			`from ..utils.stats import _weighted_percentile`
			`from ..utils.validation import (`
			`_check_feature_names_in,`
			`_check_sample_weight,`
			`check_array,`
			`check_is_fitted,`
			`check_random_state,`
			`)`
			`from ._encoders import OneHotEncoder`


			`class KBinsDiscretizer(TransformerMixin, BaseEstimator):`
			`"""`
			`Bin continuous data into intervals.`

			Read more in the :ref:`User Guide <preprocessing_discretization>`.

			`.. versionadded:: 0.20`

			`Parameters`
			`----------`
			`n_bins : int or array-like of shape (n_features,), default=5`
			The number of bins to produce. Raises ValueError if ``n_bins < 2``.

			`encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'`
			`Method used to encode the transformed result.`

			`- 'onehot': Encode the transformed result with one-hot encoding`
			`and return a sparse matrix. Ignored features are always`
			`stacked to the right.`
			`- 'onehot-dense': Encode the transformed result with one-hot encoding`
			`and return a dense array. Ignored features are always`
			`stacked to the right.`
			`- 'ordinal': Return the bin identifier encoded as an integer value.`

			`strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'`
			`Strategy used to define the widths of the bins.`

			`- 'uniform': All bins in each feature have identical widths.`
			`- 'quantile': All bins in each feature have the same number of points.`
			`- 'kmeans': Values in each bin have the same nearest center of a 1D`
			`k-means cluster.`

			`For an example of the different strategies see:`
			:ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`.

			`dtype : {np.float32, np.float64}, default=None`
			`The desired data-type for the output. If None, output dtype is`
			`consistent with input dtype. Only np.float32 and np.float64 are`
			`supported.`

			`.. versionadded:: 0.24`

			`subsample : int or None, default='warn'`
			`Maximum number of samples, used to fit the model, for computational`
			efficiency. Defaults to 200_000 when `strategy='quantile'` and to `None`
			when `strategy='uniform'` or `strategy='kmeans'`.
			`subsample=None` means that all the training samples are used when
			`computing the quantiles that determine the binning thresholds.`
			Since quantile computation relies on sorting each column of `X` and
			that sorting has an `n log(n)` time complexity,
			`it is recommended to use subsampling on datasets with a`
			`very large number of samples.`

			`.. versionchanged:: 1.3`
			The default value of `subsample` changed from `None` to `200_000` when
			`strategy="quantile"`.

			`.. versionchanged:: 1.5`
			The default value of `subsample` changed from `None` to `200_000` when
			`strategy="uniform"` or `strategy="kmeans"`.

			`random_state : int, RandomState instance or None, default=None`
			`Determines random number generation for subsampling.`
			`Pass an int for reproducible results across multiple function calls.`
			See the `subsample` parameter for more details.
			See :term:`Glossary <random_state>`.

			`.. versionadded:: 1.1`

			`Attributes`
			`----------`
			`bin_edges_ : ndarray of ndarray of shape (n_features,)`
			The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``
			`Ignored features will have empty arrays.`

			`n_bins_ : ndarray of shape (n_features,), dtype=np.int64`
			`Number of bins per feature. Bins whose width are too small`
			`(i.e., <= 1e-8) are removed with a warning.`

			`n_features_in_ : int`
			Number of features seen during :term:`fit`.

			`.. versionadded:: 0.24`

			feature_names_in_ : ndarray of shape (`n_features_in_`,)
			Names of features seen during :term:`fit`. Defined only when `X`
			`has feature names that are all strings.`

			`.. versionadded:: 1.0`

			`See Also`
			`--------`
			Binarizer : Class used to bin values as ``0`` or
			``1`` based on a parameter ``threshold``.

			`Notes`
			`-----`

			`For a visualization of discretization on different datasets refer to`
			:ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`.
			`On the effect of discretization on linear models see:`
			:ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`.

			In bin edges for feature ``i``, the first and last values are used only for
			``inverse_transform``. During transform, bin edges are extended to::

			`np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])`

			You can combine ``KBinsDiscretizer`` with
			:class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
			`part of the features.`

			``KBinsDiscretizer`` might produce constant features (e.g., when
			``encode = 'onehot'`` and certain bins do not contain any data).
			`These features can be removed with feature selection algorithms`
			(e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).

			`Examples`
			`--------`
			`>>> from sklearn.preprocessing import KBinsDiscretizer`
			`>>> X = [[-2, 1, -4, -1],`
			`... [-1, 2, -3, -0.5],`
			`... [ 0, 3, -2, 0.5],`
			`... [ 1, 4, -1, 2]]`
			`>>> est = KBinsDiscretizer(`
			`... n_bins=3, encode='ordinal', strategy='uniform', subsample=None`
			`... )`
			`>>> est.fit(X)`
			`KBinsDiscretizer(...)`
			`>>> Xt = est.transform(X)`
			`>>> Xt # doctest: +SKIP`
			`array([[ 0., 0., 0., 0.],`
			`[ 1., 1., 1., 0.],`
			`[ 2., 2., 2., 1.],`
			`[ 2., 2., 2., 2.]])`

			`Sometimes it may be useful to convert the data back into the original`
			feature space. The ``inverse_transform`` function converts the binned
			`data into the original feature space. Each value will be equal to the mean`
			`of the two bin edges.`

			`>>> est.bin_edges_[0]`
			`array([-2., -1., 0., 1.])`
			`>>> est.inverse_transform(Xt)`
			`array([[-1.5, 1.5, -3.5, -0.5],`
			`[-0.5, 2.5, -2.5, -0.5],`
			`[ 0.5, 3.5, -1.5, 0.5],`
			`[ 0.5, 3.5, -1.5, 1.5]])`
			`"""`

			`_parameter_constraints: dict = {`
			`"n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"],`
			`"encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})],`
			`"strategy": [StrOptions({"uniform", "quantile", "kmeans"})],`
			`"dtype": [Options(type, {np.float64, np.float32}), None],`
			`"subsample": [`
			`Interval(Integral, 1, None, closed="left"),`
			`None,`
			`Hidden(StrOptions({"warn"})),`
			`],`
			`"random_state": ["random_state"],`
			`}`

			`def __init__(`
			`self,`
			`n_bins=5,`
			`*,`
			`encode="onehot",`
			`strategy="quantile",`
			`dtype=None,`
			`subsample="warn",`
			`random_state=None,`
			`):`
			`self.n_bins = n_bins`
			`self.encode = encode`
			`self.strategy = strategy`
			`self.dtype = dtype`
			`self.subsample = subsample`
			`self.random_state = random_state`

			`@_fit_context(prefer_skip_nested_validation=True)`
			`def fit(self, X, y=None, sample_weight=None):`
			`"""`
			`Fit the estimator.`

			`Parameters`
			`----------`
			`X : array-like of shape (n_samples, n_features)`
			`Data to be discretized.`

			`y : None`
			`Ignored. This parameter exists only for compatibility with`
			:class:`~sklearn.pipeline.Pipeline`.

			`sample_weight : ndarray of shape (n_samples,)`
			`Contains weight values to be associated with each sample.`
			Only possible when `strategy` is set to `"quantile"`.

			`.. versionadded:: 1.3`

			`Returns`
			`-------`
			`self : object`
			`Returns the instance itself.`
			`"""`
			`X = self._validate_data(X, dtype="numeric")`

			`if self.dtype in (np.float64, np.float32):`
			`output_dtype = self.dtype`
			`else: # self.dtype is None`
			`output_dtype = X.dtype`

			`n_samples, n_features = X.shape`

			`if sample_weight is not None and self.strategy == "uniform":`
			`raise ValueError(`
			"`sample_weight` was provided but it cannot be "
			`"used with strategy='uniform'. Got strategy="`
			`f"{self.strategy!r} instead."`
			`)`

			`if self.strategy in ("uniform", "kmeans") and self.subsample == "warn":`
			`warnings.warn(`
			`(`
			`"In version 1.5 onwards, subsample=200_000 "`
			`"will be used by default. Set subsample explicitly to "`
			`"silence this warning in the mean time. Set "`
			`"subsample=None to disable subsampling explicitly."`
			`),`
			`FutureWarning,`
			`)`

			`subsample = self.subsample`
			`if subsample == "warn":`
			`subsample = 200000 if self.strategy == "quantile" else None`
			`if subsample is not None and n_samples > subsample:`
			`rng = check_random_state(self.random_state)`
			`subsample_idx = rng.choice(n_samples, size=subsample, replace=False)`
			`X = _safe_indexing(X, subsample_idx)`

			`n_features = X.shape[1]`
			`n_bins = self._validate_n_bins(n_features)`

			`if sample_weight is not None:`
			`sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)`

			`bin_edges = np.zeros(n_features, dtype=object)`
			`for jj in range(n_features):`
			`column = X[:, jj]`
			`col_min, col_max = column.min(), column.max()`

			`if col_min == col_max:`
			`warnings.warn(`
			`"Feature %d is constant and will be replaced with 0." % jj`
			`)`
			`n_bins[jj] = 1`
			`bin_edges[jj] = np.array([-np.inf, np.inf])`
			`continue`

			`if self.strategy == "uniform":`
			`bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)`

			`elif self.strategy == "quantile":`
			`quantiles = np.linspace(0, 100, n_bins[jj] + 1)`
			`if sample_weight is None:`
			`bin_edges[jj] = np.asarray(np.percentile(column, quantiles))`
			`else:`
			`bin_edges[jj] = np.asarray(`
			`[`
			`_weighted_percentile(column, sample_weight, q)`
			`for q in quantiles`
			`],`
			`dtype=np.float64,`
			`)`
			`elif self.strategy == "kmeans":`
			`from ..cluster import KMeans # fixes import loops`

			`# Deterministic initialization with uniform spacing`
			`uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)`
			`init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5`

			`# 1D k-means procedure`
			`km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)`
			`centers = km.fit(`
			`column[:, None], sample_weight=sample_weight`
			`).cluster_centers_[:, 0]`
			`# Must sort, centers may be unsorted even with sorted init`
			`centers.sort()`
			`bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5`
			`bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]`

			`# Remove bins whose width are too small (i.e., <= 1e-8)`
			`if self.strategy in ("quantile", "kmeans"):`
			`mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8`
			`bin_edges[jj] = bin_edges[jj][mask]`
			`if len(bin_edges[jj]) - 1 != n_bins[jj]:`
			`warnings.warn(`
			`"Bins whose width are too small (i.e., <= "`
			`"1e-8) in feature %d are removed. Consider "`
			`"decreasing the number of bins." % jj`
			`)`
			`n_bins[jj] = len(bin_edges[jj]) - 1`

			`self.bin_edges_ = bin_edges`
			`self.n_bins_ = n_bins`

			`if "onehot" in self.encode:`
			`self._encoder = OneHotEncoder(`
			`categories=[np.arange(i) for i in self.n_bins_],`
			`sparse_output=self.encode == "onehot",`
			`dtype=output_dtype,`
			`)`
			`# Fit the OneHotEncoder with toy datasets`
			`# so that it's ready for use after the KBinsDiscretizer is fitted`
			`self._encoder.fit(np.zeros((1, len(self.n_bins_))))`

			`return self`

			`def _validate_n_bins(self, n_features):`
			`"""Returns n_bins_, the number of bins per feature."""`
			`orig_bins = self.n_bins`
			`if isinstance(orig_bins, Integral):`
			`return np.full(n_features, orig_bins, dtype=int)`

			`n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)`

			`if n_bins.ndim > 1 or n_bins.shape[0] != n_features:`
			`raise ValueError("n_bins must be a scalar or array of shape (n_features,).")`

			`bad_nbins_value = (n_bins < 2) \| (n_bins != orig_bins)`

			`violating_indices = np.where(bad_nbins_value)[0]`
			`if violating_indices.shape[0] > 0:`
			`indices = ", ".join(str(i) for i in violating_indices)`
			`raise ValueError(`
			`"{} received an invalid number "`
			`"of bins at indices {}. Number of bins "`
			`"must be at least 2, and must be an int.".format(`
			`KBinsDiscretizer.__name__, indices`
			`)`
			`)`
			`return n_bins`

			`def transform(self, X):`
			`"""`
			`Discretize the data.`

			`Parameters`
			`----------`
			`X : array-like of shape (n_samples, n_features)`
			`Data to be discretized.`

			`Returns`
			`-------`
			`Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}`
			`Data in the binned space. Will be a sparse matrix if`
			`self.encode='onehot'` and ndarray otherwise.
			`"""`
			`check_is_fitted(self)`

			`# check input and attribute dtypes`
			`dtype = (np.float64, np.float32) if self.dtype is None else self.dtype`
			`Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False)`

			`bin_edges = self.bin_edges_`
			`for jj in range(Xt.shape[1]):`
			`Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right")`

			`if self.encode == "ordinal":`
			`return Xt`

			`dtype_init = None`
			`if "onehot" in self.encode:`
			`dtype_init = self._encoder.dtype`
			`self._encoder.dtype = Xt.dtype`
			`try:`
			`Xt_enc = self._encoder.transform(Xt)`
			`finally:`
			`# revert the initial dtype to avoid modifying self.`
			`self._encoder.dtype = dtype_init`
			`return Xt_enc`

			`def inverse_transform(self, Xt):`
			`"""`
			`Transform discretized data back to original feature space.`

			`Note that this function does not regenerate the original data`
			`due to discretization rounding.`

			`Parameters`
			`----------`
			`Xt : array-like of shape (n_samples, n_features)`
			`Transformed data in the binned space.`

			`Returns`
			`-------`
			`Xinv : ndarray, dtype={np.float32, np.float64}`
			`Data in the original feature space.`
			`"""`
			`check_is_fitted(self)`

			`if "onehot" in self.encode:`
			`Xt = self._encoder.inverse_transform(Xt)`

			`Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32))`
			`n_features = self.n_bins_.shape[0]`
			`if Xinv.shape[1] != n_features:`
			`raise ValueError(`
			`"Incorrect number of features. Expecting {}, received {}.".format(`
			`n_features, Xinv.shape[1]`
			`)`
			`)`

			`for jj in range(n_features):`
			`bin_edges = self.bin_edges_[jj]`
			`bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5`
			`Xinv[:, jj] = bin_centers[(Xinv[:, jj]).astype(np.int64)]`

			`return Xinv`

			`def get_feature_names_out(self, input_features=None):`
			`"""Get output feature names.`

			`Parameters`
			`----------`
			`input_features : array-like of str or None, default=None`
			`Input features.`

			- If `input_features` is `None`, then `feature_names_in_` is
			used as feature names in. If `feature_names_in_` is not defined,
			`then the following input feature names are generated:`
			`["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
			- If `input_features` is an array-like, then `input_features` must
			match `feature_names_in_` if `feature_names_in_` is defined.

			`Returns`
			`-------`
			`feature_names_out : ndarray of str objects`
			`Transformed feature names.`
			`"""`
			`check_is_fitted(self, "n_features_in_")`
			`input_features = _check_feature_names_in(self, input_features)`
			`if hasattr(self, "_encoder"):`
			`return self._encoder.get_feature_names_out(input_features)`

			`# ordinal encoding`
			`return input_features`