ai-content-maker/.venv/Lib/site-packages/sklearn/preprocessing/_discretization.py

473 lines
17 KiB
Python
Raw Permalink Normal View History

2024-05-03 04:18:51 +03:00
# Author: Henry Lin <hlin117@gmail.com>
# Tom Dupré la Tour
# License: BSD
import warnings
from numbers import Integral
import numpy as np
from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import _safe_indexing
from ..utils._param_validation import Hidden, Interval, Options, StrOptions
from ..utils.stats import _weighted_percentile
from ..utils.validation import (
_check_feature_names_in,
_check_sample_weight,
check_array,
check_is_fitted,
check_random_state,
)
from ._encoders import OneHotEncoder
class KBinsDiscretizer(TransformerMixin, BaseEstimator):
"""
Bin continuous data into intervals.
Read more in the :ref:`User Guide <preprocessing_discretization>`.
.. versionadded:: 0.20
Parameters
----------
n_bins : int or array-like of shape (n_features,), default=5
The number of bins to produce. Raises ValueError if ``n_bins < 2``.
encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
Method used to encode the transformed result.
- 'onehot': Encode the transformed result with one-hot encoding
and return a sparse matrix. Ignored features are always
stacked to the right.
- 'onehot-dense': Encode the transformed result with one-hot encoding
and return a dense array. Ignored features are always
stacked to the right.
- 'ordinal': Return the bin identifier encoded as an integer value.
strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
Strategy used to define the widths of the bins.
- 'uniform': All bins in each feature have identical widths.
- 'quantile': All bins in each feature have the same number of points.
- 'kmeans': Values in each bin have the same nearest center of a 1D
k-means cluster.
For an example of the different strategies see:
:ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`.
dtype : {np.float32, np.float64}, default=None
The desired data-type for the output. If None, output dtype is
consistent with input dtype. Only np.float32 and np.float64 are
supported.
.. versionadded:: 0.24
subsample : int or None, default='warn'
Maximum number of samples, used to fit the model, for computational
efficiency. Defaults to 200_000 when `strategy='quantile'` and to `None`
when `strategy='uniform'` or `strategy='kmeans'`.
`subsample=None` means that all the training samples are used when
computing the quantiles that determine the binning thresholds.
Since quantile computation relies on sorting each column of `X` and
that sorting has an `n log(n)` time complexity,
it is recommended to use subsampling on datasets with a
very large number of samples.
.. versionchanged:: 1.3
The default value of `subsample` changed from `None` to `200_000` when
`strategy="quantile"`.
.. versionchanged:: 1.5
The default value of `subsample` changed from `None` to `200_000` when
`strategy="uniform"` or `strategy="kmeans"`.
random_state : int, RandomState instance or None, default=None
Determines random number generation for subsampling.
Pass an int for reproducible results across multiple function calls.
See the `subsample` parameter for more details.
See :term:`Glossary <random_state>`.
.. versionadded:: 1.1
Attributes
----------
bin_edges_ : ndarray of ndarray of shape (n_features,)
The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``
Ignored features will have empty arrays.
n_bins_ : ndarray of shape (n_features,), dtype=np.int64
Number of bins per feature. Bins whose width are too small
(i.e., <= 1e-8) are removed with a warning.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
See Also
--------
Binarizer : Class used to bin values as ``0`` or
``1`` based on a parameter ``threshold``.
Notes
-----
For a visualization of discretization on different datasets refer to
:ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`.
On the effect of discretization on linear models see:
:ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`.
In bin edges for feature ``i``, the first and last values are used only for
``inverse_transform``. During transform, bin edges are extended to::
np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])
You can combine ``KBinsDiscretizer`` with
:class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
part of the features.
``KBinsDiscretizer`` might produce constant features (e.g., when
``encode = 'onehot'`` and certain bins do not contain any data).
These features can be removed with feature selection algorithms
(e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).
Examples
--------
>>> from sklearn.preprocessing import KBinsDiscretizer
>>> X = [[-2, 1, -4, -1],
... [-1, 2, -3, -0.5],
... [ 0, 3, -2, 0.5],
... [ 1, 4, -1, 2]]
>>> est = KBinsDiscretizer(
... n_bins=3, encode='ordinal', strategy='uniform', subsample=None
... )
>>> est.fit(X)
KBinsDiscretizer(...)
>>> Xt = est.transform(X)
>>> Xt # doctest: +SKIP
array([[ 0., 0., 0., 0.],
[ 1., 1., 1., 0.],
[ 2., 2., 2., 1.],
[ 2., 2., 2., 2.]])
Sometimes it may be useful to convert the data back into the original
feature space. The ``inverse_transform`` function converts the binned
data into the original feature space. Each value will be equal to the mean
of the two bin edges.
>>> est.bin_edges_[0]
array([-2., -1., 0., 1.])
>>> est.inverse_transform(Xt)
array([[-1.5, 1.5, -3.5, -0.5],
[-0.5, 2.5, -2.5, -0.5],
[ 0.5, 3.5, -1.5, 0.5],
[ 0.5, 3.5, -1.5, 1.5]])
"""
_parameter_constraints: dict = {
"n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"],
"encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})],
"strategy": [StrOptions({"uniform", "quantile", "kmeans"})],
"dtype": [Options(type, {np.float64, np.float32}), None],
"subsample": [
Interval(Integral, 1, None, closed="left"),
None,
Hidden(StrOptions({"warn"})),
],
"random_state": ["random_state"],
}
def __init__(
self,
n_bins=5,
*,
encode="onehot",
strategy="quantile",
dtype=None,
subsample="warn",
random_state=None,
):
self.n_bins = n_bins
self.encode = encode
self.strategy = strategy
self.dtype = dtype
self.subsample = subsample
self.random_state = random_state
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None, sample_weight=None):
"""
Fit the estimator.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Data to be discretized.
y : None
Ignored. This parameter exists only for compatibility with
:class:`~sklearn.pipeline.Pipeline`.
sample_weight : ndarray of shape (n_samples,)
Contains weight values to be associated with each sample.
Only possible when `strategy` is set to `"quantile"`.
.. versionadded:: 1.3
Returns
-------
self : object
Returns the instance itself.
"""
X = self._validate_data(X, dtype="numeric")
if self.dtype in (np.float64, np.float32):
output_dtype = self.dtype
else: # self.dtype is None
output_dtype = X.dtype
n_samples, n_features = X.shape
if sample_weight is not None and self.strategy == "uniform":
raise ValueError(
"`sample_weight` was provided but it cannot be "
"used with strategy='uniform'. Got strategy="
f"{self.strategy!r} instead."
)
if self.strategy in ("uniform", "kmeans") and self.subsample == "warn":
warnings.warn(
(
"In version 1.5 onwards, subsample=200_000 "
"will be used by default. Set subsample explicitly to "
"silence this warning in the mean time. Set "
"subsample=None to disable subsampling explicitly."
),
FutureWarning,
)
subsample = self.subsample
if subsample == "warn":
subsample = 200000 if self.strategy == "quantile" else None
if subsample is not None and n_samples > subsample:
rng = check_random_state(self.random_state)
subsample_idx = rng.choice(n_samples, size=subsample, replace=False)
X = _safe_indexing(X, subsample_idx)
n_features = X.shape[1]
n_bins = self._validate_n_bins(n_features)
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
bin_edges = np.zeros(n_features, dtype=object)
for jj in range(n_features):
column = X[:, jj]
col_min, col_max = column.min(), column.max()
if col_min == col_max:
warnings.warn(
"Feature %d is constant and will be replaced with 0." % jj
)
n_bins[jj] = 1
bin_edges[jj] = np.array([-np.inf, np.inf])
continue
if self.strategy == "uniform":
bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)
elif self.strategy == "quantile":
quantiles = np.linspace(0, 100, n_bins[jj] + 1)
if sample_weight is None:
bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
else:
bin_edges[jj] = np.asarray(
[
_weighted_percentile(column, sample_weight, q)
for q in quantiles
],
dtype=np.float64,
)
elif self.strategy == "kmeans":
from ..cluster import KMeans # fixes import loops
# Deterministic initialization with uniform spacing
uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5
# 1D k-means procedure
km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
centers = km.fit(
column[:, None], sample_weight=sample_weight
).cluster_centers_[:, 0]
# Must sort, centers may be unsorted even with sorted init
centers.sort()
bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]
# Remove bins whose width are too small (i.e., <= 1e-8)
if self.strategy in ("quantile", "kmeans"):
mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
bin_edges[jj] = bin_edges[jj][mask]
if len(bin_edges[jj]) - 1 != n_bins[jj]:
warnings.warn(
"Bins whose width are too small (i.e., <= "
"1e-8) in feature %d are removed. Consider "
"decreasing the number of bins." % jj
)
n_bins[jj] = len(bin_edges[jj]) - 1
self.bin_edges_ = bin_edges
self.n_bins_ = n_bins
if "onehot" in self.encode:
self._encoder = OneHotEncoder(
categories=[np.arange(i) for i in self.n_bins_],
sparse_output=self.encode == "onehot",
dtype=output_dtype,
)
# Fit the OneHotEncoder with toy datasets
# so that it's ready for use after the KBinsDiscretizer is fitted
self._encoder.fit(np.zeros((1, len(self.n_bins_))))
return self
def _validate_n_bins(self, n_features):
"""Returns n_bins_, the number of bins per feature."""
orig_bins = self.n_bins
if isinstance(orig_bins, Integral):
return np.full(n_features, orig_bins, dtype=int)
n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)
if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
raise ValueError("n_bins must be a scalar or array of shape (n_features,).")
bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)
violating_indices = np.where(bad_nbins_value)[0]
if violating_indices.shape[0] > 0:
indices = ", ".join(str(i) for i in violating_indices)
raise ValueError(
"{} received an invalid number "
"of bins at indices {}. Number of bins "
"must be at least 2, and must be an int.".format(
KBinsDiscretizer.__name__, indices
)
)
return n_bins
def transform(self, X):
"""
Discretize the data.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Data to be discretized.
Returns
-------
Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
Data in the binned space. Will be a sparse matrix if
`self.encode='onehot'` and ndarray otherwise.
"""
check_is_fitted(self)
# check input and attribute dtypes
dtype = (np.float64, np.float32) if self.dtype is None else self.dtype
Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False)
bin_edges = self.bin_edges_
for jj in range(Xt.shape[1]):
Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right")
if self.encode == "ordinal":
return Xt
dtype_init = None
if "onehot" in self.encode:
dtype_init = self._encoder.dtype
self._encoder.dtype = Xt.dtype
try:
Xt_enc = self._encoder.transform(Xt)
finally:
# revert the initial dtype to avoid modifying self.
self._encoder.dtype = dtype_init
return Xt_enc
def inverse_transform(self, Xt):
"""
Transform discretized data back to original feature space.
Note that this function does not regenerate the original data
due to discretization rounding.
Parameters
----------
Xt : array-like of shape (n_samples, n_features)
Transformed data in the binned space.
Returns
-------
Xinv : ndarray, dtype={np.float32, np.float64}
Data in the original feature space.
"""
check_is_fitted(self)
if "onehot" in self.encode:
Xt = self._encoder.inverse_transform(Xt)
Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32))
n_features = self.n_bins_.shape[0]
if Xinv.shape[1] != n_features:
raise ValueError(
"Incorrect number of features. Expecting {}, received {}.".format(
n_features, Xinv.shape[1]
)
)
for jj in range(n_features):
bin_edges = self.bin_edges_[jj]
bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5
Xinv[:, jj] = bin_centers[(Xinv[:, jj]).astype(np.int64)]
return Xinv
def get_feature_names_out(self, input_features=None):
"""Get output feature names.
Parameters
----------
input_features : array-like of str or None, default=None
Input features.
- If `input_features` is `None`, then `feature_names_in_` is
used as feature names in. If `feature_names_in_` is not defined,
then the following input feature names are generated:
`["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
- If `input_features` is an array-like, then `input_features` must
match `feature_names_in_` if `feature_names_in_` is defined.
Returns
-------
feature_names_out : ndarray of str objects
Transformed feature names.
"""
check_is_fitted(self, "n_features_in_")
input_features = _check_feature_names_in(self, input_features)
if hasattr(self, "_encoder"):
return self._encoder.get_feature_names_out(input_features)
# ordinal encoding
return input_features