# ai-content-maker/.venv/Lib/site-packages/sklearn/utils/__init__.py

"""
The :mod:`sklearn.utils` module includes various utilities.
"""
import math
import numbers
import platform
import struct
import timeit
import warnings
from collections.abc import Sequence
from contextlib import contextmanager, suppress
from itertools import compress, islice
import numpy as np
from scipy.sparse import issparse
from .. import get_config
from ..exceptions import DataConversionWarning
from . import _joblib, metadata_routing
from ._bunch import Bunch
from ._estimator_html_repr import estimator_html_repr
from ._param_validation import Integral, Interval, validate_params
from .class_weight import compute_class_weight, compute_sample_weight
from .deprecation import deprecated
from .discovery import all_estimators
from .fixes import parse_version, threadpool_info
from .murmurhash import murmurhash3_32
from .validation import (
_is_arraylike_not_scalar,
_is_pandas_df,
_is_polars_df,
_use_interchange_protocol,
as_float_array,
assert_all_finite,
check_array,
check_consistent_length,
check_random_state,
check_scalar,
check_symmetric,
check_X_y,
column_or_1d,
indexable,
)
# Do not deprecate parallel_backend and register_parallel_backend as they are
# needed to tune `scikit-learn` behavior and have a different effect if called
# from the vendored version or the site-package version. The others are
# utilities that are independent of scikit-learn, so they are not part of the
# scikit-learn public API.
parallel_backend = _joblib.parallel_backend
register_parallel_backend = _joblib.register_parallel_backend
__all__ = [
"murmurhash3_32",
"as_float_array",
"assert_all_finite",
"check_array",
"check_random_state",
"compute_class_weight",
"compute_sample_weight",
"column_or_1d",
"check_consistent_length",
"check_X_y",
"check_scalar",
"indexable",
"check_symmetric",
"indices_to_mask",
"deprecated",
"parallel_backend",
"register_parallel_backend",
"resample",
"shuffle",
"check_matplotlib_support",
"all_estimators",
"DataConversionWarning",
"estimator_html_repr",
"Bunch",
"metadata_routing",
]
IS_PYPY = platform.python_implementation() == "PyPy"
_IS_32BIT = 8 * struct.calcsize("P") == 32
_IS_WASM = platform.machine() in ["wasm32", "wasm64"]
def _in_unstable_openblas_configuration():
"""Return True if in an unstable configuration for OpenBLAS"""
# Import libraries which might load OpenBLAS.
import numpy # noqa
import scipy # noqa
modules_info = threadpool_info()
open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info)
if not open_blas_used:
return False
# OpenBLAS 0.3.16 fixed instability for arm64, see:
# https://github.com/xianyi/OpenBLAS/blob/1b6db3dbba672b4f8af935bd43a1ff6cff4d20b7/Changelog.txt#L56-L58 # noqa
openblas_arm64_stable_version = parse_version("0.3.16")
for info in modules_info:
if info["internal_api"] != "openblas":
continue
openblas_version = info.get("version")
openblas_architecture = info.get("architecture")
if openblas_version is None or openblas_architecture is None:
# Cannot be sure that OpenBLAS is good enough. Assume unstable:
return True
if (
openblas_architecture == "neoversen1"
and parse_version(openblas_version) < openblas_arm64_stable_version
):
# See discussions in https://github.com/numpy/numpy/issues/19411
return True
return False
@validate_params(
{
"X": ["array-like", "sparse matrix"],
"mask": ["array-like"],
},
prefer_skip_nested_validation=True,
)
def safe_mask(X, mask):
"""Return a mask which is safe to use on X.
Parameters
----------
X : {array-like, sparse matrix}
Data on which to apply mask.
mask : array-like
Mask to be used on X.
Returns
-------
mask : ndarray
Array that is safe to use on X.
Examples
--------
>>> from sklearn.utils import safe_mask
>>> from scipy.sparse import csr_matrix
>>> data = csr_matrix([[1], [2], [3], [4], [5]])
>>> condition = [False, True, True, False, True]
>>> mask = safe_mask(data, condition)
>>> data[mask].toarray()
array([[2],
[3],
[5]])
"""
mask = np.asarray(mask)
if np.issubdtype(mask.dtype, np.signedinteger):
return mask
if hasattr(X, "toarray"):
ind = np.arange(mask.shape[0])
mask = ind[mask]
return mask
def axis0_safe_slice(X, mask, len_mask):
"""Return a mask which is safer to use on X than safe_mask.
This mask is safer than safe_mask since it returns an
empty array when a sparse matrix is sliced with a boolean mask
that is all False, instead of raising an unhelpful error in older
versions of SciPy.
See: https://github.com/scipy/scipy/issues/5361
Also note that we could avoid the dot product in
_huber_loss_and_gradient by checking whether len_mask is zero, but
this is not going to be the bottleneck, since the numbers of outliers
and non_outliers are typically non-zero, and it would make the code
harder to follow.
Parameters
----------
X : {array-like, sparse matrix}
Data on which to apply mask.
mask : ndarray
Mask to be used on X.
len_mask : int
The length of the mask.
Returns
-------
subset : {ndarray, sparse matrix}
The rows of X for which mask is True, or an empty 2D array if
len_mask is zero.
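Examples
--------
For instance, an all-False mask yields an empty 2D array instead of an error:
>>> import numpy as np
>>> from scipy.sparse import csr_matrix
>>> from sklearn.utils import axis0_safe_slice
>>> X = csr_matrix([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
>>> mask = np.array([False, True, True])
>>> axis0_safe_slice(X, mask, mask.sum()).toarray()
array([[3., 4.],
       [5., 6.]])
>>> axis0_safe_slice(X, np.zeros(3, dtype=bool), 0).shape
(0, 2)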
"""
if len_mask != 0:
return X[safe_mask(X, mask), :]
return np.zeros(shape=(0, X.shape[1]))
def _array_indexing(array, key, key_dtype, axis):
"""Index an array or scipy.sparse consistently across NumPy version."""
if issparse(array) and key_dtype == "bool":
key = np.asarray(key)
if isinstance(key, tuple):
key = list(key)
return array[key, ...] if axis == 0 else array[:, key]
def _pandas_indexing(X, key, key_dtype, axis):
"""Index a pandas dataframe or a series."""
if _is_arraylike_not_scalar(key):
key = np.asarray(key)
if key_dtype == "int" and not (isinstance(key, slice) or np.isscalar(key)):
# using take() instead of iloc[] ensures the return value is a "proper"
# copy that will not raise SettingWithCopyWarning
return X.take(key, axis=axis)
else:
# check whether we should index with loc or iloc
indexer = X.iloc if key_dtype == "int" else X.loc
return indexer[:, key] if axis else indexer[key]
def _list_indexing(X, key, key_dtype):
"""Index a Python list."""
if np.isscalar(key) or isinstance(key, slice):
# key is a slice or a scalar
return X[key]
if key_dtype == "bool":
# key is a boolean array-like
return list(compress(X, key))
# key is an integer array-like of indices
return [X[idx] for idx in key]
def _polars_indexing(X, key, key_dtype, axis):
"""Indexing X with polars interchange protocol."""
# Polars behavior is more consistent with lists
if isinstance(key, np.ndarray):
key = key.tolist()
if axis == 1:
return X[:, key]
else:
return X[key]
def _determine_key_type(key, accept_slice=True):
"""Determine the data type of key.
Parameters
----------
key : scalar, slice or array-like
The key from which we want to infer the data type.
accept_slice : bool, default=True
Whether or not to raise an error if the key is a slice.
Returns
-------
dtype : {'int', 'str', 'bool', None}
Returns the data type of key.
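Examples
--------
For instance, scalars, lists and slices map to the type of their elements:
>>> from sklearn.utils import _determine_key_type
>>> _determine_key_type(0)
'int'
>>> _determine_key_type('b')
'str'
>>> _determine_key_type([True, False, True])
'bool'
>>> _determine_key_type(slice(0, 2))
'int'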
"""
err_msg = (
"No valid specification of the columns. Only a scalar, list or "
"slice of all integers or all strings, or boolean mask is "
"allowed"
)
dtype_to_str = {int: "int", str: "str", bool: "bool", np.bool_: "bool"}
array_dtype_to_str = {
"i": "int",
"u": "int",
"b": "bool",
"O": "str",
"U": "str",
"S": "str",
}
if key is None:
return None
if isinstance(key, tuple(dtype_to_str.keys())):
try:
return dtype_to_str[type(key)]
except KeyError:
raise ValueError(err_msg)
if isinstance(key, slice):
if not accept_slice:
raise TypeError(
"Only array-like or scalar are supported. A Python slice was given."
)
if key.start is None and key.stop is None:
return None
key_start_type = _determine_key_type(key.start)
key_stop_type = _determine_key_type(key.stop)
if key_start_type is not None and key_stop_type is not None:
if key_start_type != key_stop_type:
raise ValueError(err_msg)
if key_start_type is not None:
return key_start_type
return key_stop_type
if isinstance(key, (list, tuple)):
unique_key = set(key)
key_type = {_determine_key_type(elt) for elt in unique_key}
if not key_type:
return None
if len(key_type) != 1:
raise ValueError(err_msg)
return key_type.pop()
if hasattr(key, "dtype"):
try:
return array_dtype_to_str[key.dtype.kind]
except KeyError:
raise ValueError(err_msg)
raise ValueError(err_msg)
def _safe_indexing(X, indices, *, axis=0):
"""Return rows, items or columns of X using indices.
.. warning::
This utility is documented, but **private**. This means that
backward compatibility might be broken without any deprecation
cycle.
Parameters
----------
X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
Data from which to sample rows, items or columns. `list` are only
supported when `axis=0`.
indices : bool, int, str, slice, array-like
- If `axis=0`, boolean and integer array-like, integer slice,
and scalar integer are supported.
- If `axis=1`:
- to select a single column, `indices` can be of `int` type for
all `X` types and `str` only for dataframe. The selected subset
will be 1D, unless `X` is a sparse matrix in which case it will
be 2D.
- to select multiple columns, `indices` can be one of the
following: `list`, `array`, `slice`. The type used in
these containers can be one of the following: `int`, `bool` and
`str`. However, `str` is only supported when `X` is a dataframe.
The selected subset will be 2D.
axis : int, default=0
The axis along which `X` will be subsampled. `axis=0` will select
rows while `axis=1` will select columns.
Returns
-------
subset
Subset of X on axis 0 or 1.
Notes
-----
CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
not supported.
Examples
--------
>>> import numpy as np
>>> from sklearn.utils import _safe_indexing
>>> data = np.array([[1, 2], [3, 4], [5, 6]])
>>> _safe_indexing(data, 0, axis=0) # select the first row
array([1, 2])
>>> _safe_indexing(data, 0, axis=1) # select the first column
array([1, 3, 5])
"""
if indices is None:
return X
if axis not in (0, 1):
raise ValueError(
"'axis' should be either 0 (to index rows) or 1 (to index "
" column). Got {} instead.".format(axis)
)
indices_dtype = _determine_key_type(indices)
if axis == 0 and indices_dtype == "str":
raise ValueError("String indexing is not supported with 'axis=0'")
if axis == 1 and isinstance(X, list):
raise ValueError("axis=1 is not supported for lists")
if axis == 1 and hasattr(X, "ndim") and X.ndim != 2:
raise ValueError(
"'X' should be a 2D NumPy array, 2D sparse matrix or pandas "
"dataframe when indexing the columns (i.e. 'axis=1'). "
"Got {} instead with {} dimension(s).".format(type(X), X.ndim)
)
if (
axis == 1
and indices_dtype == "str"
and not (_is_pandas_df(X) or _use_interchange_protocol(X))
):
raise ValueError(
"Specifying the columns using strings is only supported for dataframes."
)
if hasattr(X, "iloc"):
# TODO: we should probably use _is_pandas_df(X) instead but this would
# require updating some tests such as test_train_test_split_mock_pandas.
return _pandas_indexing(X, indices, indices_dtype, axis=axis)
elif _is_polars_df(X):
return _polars_indexing(X, indices, indices_dtype, axis=axis)
elif hasattr(X, "shape"):
return _array_indexing(X, indices, indices_dtype, axis=axis)
else:
return _list_indexing(X, indices, indices_dtype)
def _safe_assign(X, values, *, row_indexer=None, column_indexer=None):
"""Safe assignment to a numpy array, sparse matrix, or pandas dataframe.
Parameters
----------
X : {ndarray, sparse-matrix, dataframe}
Array to be modified. It is expected to be 2-dimensional.
values : ndarray
The values to be assigned to `X`.
row_indexer : array-like, dtype={int, bool}, default=None
A 1-dimensional array to select the rows of interest. If `None`, all
rows are selected.
column_indexer : array-like, dtype={int, bool}, default=None
A 1-dimensional array to select the columns of interest. If `None`, all
columns are selected.
"""
row_indexer = slice(None, None, None) if row_indexer is None else row_indexer
column_indexer = (
slice(None, None, None) if column_indexer is None else column_indexer
)
if hasattr(X, "iloc"): # pandas dataframe
with warnings.catch_warnings():
# pandas >= 1.5 raises a warning when using iloc to set values in a column
# that does not have the same type as the column being set. It happens
# for instance when setting a categorical column with a string.
# In the future the behavior won't change and the warning should disappear.
# TODO(1.3): check if the warning is still raised or remove the filter.
warnings.simplefilter("ignore", FutureWarning)
X.iloc[row_indexer, column_indexer] = values
else: # numpy array or sparse matrix
X[row_indexer, column_indexer] = values
def _get_column_indices_for_bool_or_int(key, n_columns):
# Convert key into list of positive integer indexes
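# For example, with n_columns=3 both key styles resolve to positional indices:
#   _get_column_indices_for_bool_or_int([0, 2], 3)               -> [0, 2]
#   _get_column_indices_for_bool_or_int([True, False, True], 3)  -> [0, 2]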
try:
idx = _safe_indexing(np.arange(n_columns), key)
except IndexError as e:
raise ValueError(
f"all features must be in [0, {n_columns - 1}] or [-{n_columns}, 0]"
) from e
return np.atleast_1d(idx).tolist()
def _get_column_indices(X, key):
"""Get feature column indices for input data X and key.
For accepted values of `key`, see the docstring of
:func:`_safe_indexing`.
"""
key_dtype = _determine_key_type(key)
if _use_interchange_protocol(X):
return _get_column_indices_interchange(X.__dataframe__(), key, key_dtype)
n_columns = X.shape[1]
if isinstance(key, (list, tuple)) and not key:
# we get an empty list
return []
elif key_dtype in ("bool", "int"):
return _get_column_indices_for_bool_or_int(key, n_columns)
else:
try:
all_columns = X.columns
except AttributeError:
raise ValueError(
"Specifying the columns using strings is only supported for dataframes."
)
if isinstance(key, str):
columns = [key]
elif isinstance(key, slice):
start, stop = key.start, key.stop
if start is not None:
start = all_columns.get_loc(start)
if stop is not None:
# pandas indexing with strings is endpoint included
stop = all_columns.get_loc(stop) + 1
else:
stop = n_columns + 1
return list(islice(range(n_columns), start, stop))
else:
columns = list(key)
try:
column_indices = []
for col in columns:
col_idx = all_columns.get_loc(col)
if not isinstance(col_idx, numbers.Integral):
raise ValueError(
f"Selected columns, {columns}, are not unique in dataframe"
)
column_indices.append(col_idx)
except KeyError as e:
raise ValueError("A given column is not a column of the dataframe") from e
return column_indices
def _get_column_indices_interchange(X_interchange, key, key_dtype):
"""Same as _get_column_indices but for X with __dataframe__ protocol."""
n_columns = X_interchange.num_columns()
if isinstance(key, (list, tuple)) and not key:
# we get an empty list
return []
elif key_dtype in ("bool", "int"):
return _get_column_indices_for_bool_or_int(key, n_columns)
else:
column_names = list(X_interchange.column_names())
if isinstance(key, slice):
if key.step not in [1, None]:
raise NotImplementedError("key.step must be 1 or None")
start, stop = key.start, key.stop
if start is not None:
start = column_names.index(start)
if stop is not None:
stop = column_names.index(stop) + 1
else:
stop = n_columns + 1
return list(islice(range(n_columns), start, stop))
selected_columns = [key] if np.isscalar(key) else key
try:
return [column_names.index(col) for col in selected_columns]
except ValueError as e:
raise ValueError("A given column is not a column of the dataframe") from e
@validate_params(
{
"replace": ["boolean"],
"n_samples": [Interval(numbers.Integral, 1, None, closed="left"), None],
"random_state": ["random_state"],
"stratify": ["array-like", None],
},
prefer_skip_nested_validation=True,
)
def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=None):
"""Resample arrays or sparse matrices in a consistent way.
The default strategy implements one step of the bootstrapping
procedure.
Parameters
----------
*arrays : sequence of array-like of shape (n_samples,) or \
(n_samples, n_outputs)
Indexable data-structures can be arrays, lists, dataframes or scipy
sparse matrices with consistent first dimension.
replace : bool, default=True
Implements resampling with replacement. If False, this will implement
(sliced) random permutations.
n_samples : int, default=None
Number of samples to generate. If left to None this is
automatically set to the first dimension of the arrays.
If replace is False it should not be larger than the length of
arrays.
random_state : int, RandomState instance or None, default=None
Determines random number generation for shuffling
the data.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
stratify : array-like of shape (n_samples,) or (n_samples, n_outputs), \
default=None
If not None, data is split in a stratified fashion, using this as
the class labels.
Returns
-------
resampled_arrays : sequence of array-like of shape (n_samples,) or \
(n_samples, n_outputs)
Sequence of resampled copies of the collections. The original arrays
are not impacted.
See Also
--------
shuffle : Shuffle arrays or sparse matrices in a consistent way.
Examples
--------
It is possible to mix sparse and dense arrays in the same run::
>>> import numpy as np
>>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
>>> y = np.array([0, 1, 2])
>>> from scipy.sparse import coo_matrix
>>> X_sparse = coo_matrix(X)
>>> from sklearn.utils import resample
>>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)
>>> X
array([[1., 0.],
[2., 1.],
[1., 0.]])
>>> X_sparse
<3x2 sparse matrix of type '<... 'numpy.float64'>'
with 4 stored elements in Compressed Sparse Row format>
>>> X_sparse.toarray()
array([[1., 0.],
[2., 1.],
[1., 0.]])
>>> y
array([0, 1, 0])
>>> resample(y, n_samples=2, random_state=0)
array([0, 1])
Example using stratification::
>>> y = [0, 0, 1, 1, 1, 1, 1, 1, 1]
>>> resample(y, n_samples=5, replace=False, stratify=y,
... random_state=0)
[1, 1, 1, 0, 1]
"""
max_n_samples = n_samples
random_state = check_random_state(random_state)
if len(arrays) == 0:
return None
first = arrays[0]
n_samples = first.shape[0] if hasattr(first, "shape") else len(first)
if max_n_samples is None:
max_n_samples = n_samples
elif (max_n_samples > n_samples) and (not replace):
raise ValueError(
"Cannot sample %d out of arrays with dim %d when replace is False"
% (max_n_samples, n_samples)
)
check_consistent_length(*arrays)
if stratify is None:
if replace:
indices = random_state.randint(0, n_samples, size=(max_n_samples,))
else:
indices = np.arange(n_samples)
random_state.shuffle(indices)
indices = indices[:max_n_samples]
else:
# Code adapted from StratifiedShuffleSplit()
y = check_array(stratify, ensure_2d=False, dtype=None)
if y.ndim == 2:
# for multi-label y, map each distinct row to a string repr
# using join because str(row) uses an ellipsis if len(row) > 1000
y = np.array([" ".join(row.astype("str")) for row in y])
classes, y_indices = np.unique(y, return_inverse=True)
n_classes = classes.shape[0]
class_counts = np.bincount(y_indices)
# Find the sorted list of instances for each class:
# (np.unique above performs a sort, so the code is O(n log n) already)
class_indices = np.split(
np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1]
)
n_i = _approximate_mode(class_counts, max_n_samples, random_state)
indices = []
for i in range(n_classes):
indices_i = random_state.choice(class_indices[i], n_i[i], replace=replace)
indices.extend(indices_i)
indices = random_state.permutation(indices)
# convert sparse matrices to CSR for row-based indexing
arrays = [a.tocsr() if issparse(a) else a for a in arrays]
resampled_arrays = [_safe_indexing(a, indices) for a in arrays]
if len(resampled_arrays) == 1:
# syntactic sugar for the unit argument case
return resampled_arrays[0]
else:
return resampled_arrays
def shuffle(*arrays, random_state=None, n_samples=None):
"""Shuffle arrays or sparse matrices in a consistent way.
This is a convenience alias to ``resample(*arrays, replace=False)`` to do
random permutations of the collections.
Parameters
----------
*arrays : sequence of indexable data-structures
Indexable data-structures can be arrays, lists, dataframes or scipy
sparse matrices with consistent first dimension.
random_state : int, RandomState instance or None, default=None
Determines random number generation for shuffling
the data.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
n_samples : int, default=None
Number of samples to generate. If left to None this is
automatically set to the first dimension of the arrays. It should
not be larger than the length of arrays.
Returns
-------
shuffled_arrays : sequence of indexable data-structures
Sequence of shuffled copies of the collections. The original arrays
are not impacted.
See Also
--------
resample : Resample arrays or sparse matrices in a consistent way.
Examples
--------
It is possible to mix sparse and dense arrays in the same run::
>>> import numpy as np
>>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
>>> y = np.array([0, 1, 2])
>>> from scipy.sparse import coo_matrix
>>> X_sparse = coo_matrix(X)
>>> from sklearn.utils import shuffle
>>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
>>> X
array([[0., 0.],
[2., 1.],
[1., 0.]])
>>> X_sparse
<3x2 sparse matrix of type '<... 'numpy.float64'>'
with 3 stored elements in Compressed Sparse Row format>
>>> X_sparse.toarray()
array([[0., 0.],
[2., 1.],
[1., 0.]])
>>> y
array([2, 1, 0])
>>> shuffle(y, n_samples=2, random_state=0)
array([0, 1])
"""
return resample(
*arrays, replace=False, n_samples=n_samples, random_state=random_state
)
def safe_sqr(X, *, copy=True):
"""Element wise squaring of array-likes and sparse matrices.
Parameters
----------
X : {array-like, ndarray, sparse matrix}
copy : bool, default=True
Whether to create a copy of X and operate on it or to perform
inplace computation (default behaviour).
Returns
-------
X ** 2 : element wise square
Return the element-wise square of the input.
Examples
--------
>>> from sklearn.utils import safe_sqr
>>> safe_sqr([1, 2, 3])
array([1, 4, 9])
"""
X = check_array(X, accept_sparse=["csr", "csc", "coo"], ensure_2d=False)
if issparse(X):
if copy:
X = X.copy()
X.data **= 2
else:
if copy:
X = X**2
else:
X **= 2
return X
def _chunk_generator(gen, chunksize):
"""Chunk generator, ``gen`` into lists of length ``chunksize``. The last
chunk may have a length less than ``chunksize``."""
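# For example, list(_chunk_generator(iter(range(5)), 2)) == [[0, 1], [2, 3], [4]]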
while True:
chunk = list(islice(gen, chunksize))
if chunk:
yield chunk
else:
return
@validate_params(
{
"n": [Interval(numbers.Integral, 1, None, closed="left")],
"batch_size": [Interval(numbers.Integral, 1, None, closed="left")],
"min_batch_size": [Interval(numbers.Integral, 0, None, closed="left")],
},
prefer_skip_nested_validation=True,
)
def gen_batches(n, batch_size, *, min_batch_size=0):
"""Generator to create slices containing `batch_size` elements from 0 to `n`.
The last slice may contain fewer than `batch_size` elements, when
`batch_size` does not divide `n`.
Parameters
----------
n : int
Size of the sequence.
batch_size : int
Number of elements in each batch.
min_batch_size : int, default=0
Minimum number of elements in each batch.
Yields
------
slice of `batch_size` elements
See Also
--------
gen_even_slices: Generator to create n_packs slices going up to n.
Examples
--------
>>> from sklearn.utils import gen_batches
>>> list(gen_batches(7, 3))
[slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
>>> list(gen_batches(6, 3))
[slice(0, 3, None), slice(3, 6, None)]
>>> list(gen_batches(2, 3))
[slice(0, 2, None)]
>>> list(gen_batches(7, 3, min_batch_size=0))
[slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
>>> list(gen_batches(7, 3, min_batch_size=2))
[slice(0, 3, None), slice(3, 7, None)]
"""
start = 0
for _ in range(int(n // batch_size)):
end = start + batch_size
if end + min_batch_size > n:
continue
yield slice(start, end)
start = end
if start < n:
yield slice(start, n)
@validate_params(
{
"n": [Interval(Integral, 1, None, closed="left")],
"n_packs": [Interval(Integral, 1, None, closed="left")],
"n_samples": [Interval(Integral, 1, None, closed="left"), None],
},
prefer_skip_nested_validation=True,
)
def gen_even_slices(n, n_packs, *, n_samples=None):
"""Generator to create `n_packs` evenly spaced slices going up to `n`.
If `n_packs` does not divide `n`, the first `n % n_packs` slices contain
one more element than the remaining slices.
Parameters
----------
n : int
Size of the sequence.
n_packs : int
Number of slices to generate.
n_samples : int, default=None
Number of samples. Pass `n_samples` when the slices are to be used for
sparse matrix indexing; slicing off-the-end raises an exception, while
it works for NumPy arrays.
Yields
------
`slice` representing a set of indices from 0 to n.
See Also
--------
gen_batches: Generator to create slices containing batch_size elements
from 0 to n.
Examples
--------
>>> from sklearn.utils import gen_even_slices
>>> list(gen_even_slices(10, 1))
[slice(0, 10, None)]
>>> list(gen_even_slices(10, 10))
[slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)]
>>> list(gen_even_slices(10, 5))
[slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)]
>>> list(gen_even_slices(10, 3))
[slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]
"""
start = 0
for pack_num in range(n_packs):
this_n = n // n_packs
if pack_num < n % n_packs:
this_n += 1
if this_n > 0:
end = start + this_n
if n_samples is not None:
end = min(n_samples, end)
yield slice(start, end, None)
start = end
def tosequence(x):
"""Cast iterable x to a Sequence, avoiding a copy if possible.
Parameters
----------
x : iterable
The iterable to be converted.
Returns
-------
x : Sequence
If `x` is a NumPy array, it is returned as an `ndarray`. If `x`
is a `Sequence`, `x` is returned as-is. Otherwise, `x` is cast
to a list.
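Examples
--------
For instance, a bare iterator is copied into a list, while sequences and
arrays are returned unchanged:
>>> import numpy as np
>>> from sklearn.utils import tosequence
>>> tosequence(iter([1, 2, 3]))
[1, 2, 3]
>>> tosequence((1, 2, 3))
(1, 2, 3)
>>> tosequence(np.array([1, 2]))
array([1, 2])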
"""
if isinstance(x, np.ndarray):
return np.asarray(x)
elif isinstance(x, Sequence):
return x
else:
return list(x)
def _to_object_array(sequence):
"""Convert sequence to a 1-D NumPy array of object dtype.
The numpy.array constructor has a similar use, but its output
is ambiguous: it can be a 1-D NumPy array of object dtype if
the input is a ragged array, but if the input is a list of
equal-length arrays, then the output is a 2D numpy.array.
_to_object_array solves this ambiguity by guaranteeing that
the output is a 1-D NumPy array of objects for any input.
Parameters
----------
sequence : array-like of shape (n_elements,)
The sequence to be converted.
Returns
-------
out : ndarray of shape (n_elements,), dtype=object
The input sequence converted to a 1-D NumPy array of object dtype.
Examples
--------
>>> import numpy as np
>>> from sklearn.utils import _to_object_array
>>> _to_object_array([np.array([0]), np.array([1])])
array([array([0]), array([1])], dtype=object)
>>> _to_object_array([np.array([0]), np.array([1, 2])])
array([array([0]), array([1, 2])], dtype=object)
"""
out = np.empty(len(sequence), dtype=object)
out[:] = sequence
return out
def indices_to_mask(indices, mask_length):
"""Convert list of indices to boolean mask.
Parameters
----------
indices : list-like
List of integers treated as indices.
mask_length : int
Length of boolean mask to be generated.
This parameter must be greater than max(indices).
Returns
-------
mask : 1d boolean nd-array
Boolean array that is True where indices are present, else False.
Examples
--------
>>> from sklearn.utils import indices_to_mask
>>> indices = [1, 2, 3, 4]
>>> indices_to_mask(indices, 5)
array([False, True, True, True, True])
"""
if mask_length <= np.max(indices):
raise ValueError("mask_length must be greater than max(indices)")
mask = np.zeros(mask_length, dtype=bool)
mask[indices] = True
return mask
def _message_with_time(source, message, time):
"""Create one line message for logging purposes.
Parameters
----------
source : str
String indicating the source or the reference of the message.
message : str
Short message.
time : int
Time in seconds.
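Examples
--------
The message is padded with dots so that typical messages line up at a fixed
width of 70 characters:
>>> from sklearn.utils import _message_with_time
>>> msg = _message_with_time("Pipeline", "fitting", 1.5)
>>> msg.startswith("[Pipeline] ") and msg.endswith(" fitting, total=   1.5s")
True
>>> len(msg)
70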
"""
start_message = "[%s] " % source
# adapted from joblib.logger.short_format_time without the Windows -.1s
# adjustment
if time > 60:
time_str = "%4.1fmin" % (time / 60)
else:
time_str = " %5.1fs" % time
end_message = " %s, total=%s" % (message, time_str)
dots_len = 70 - len(start_message) - len(end_message)
return "%s%s%s" % (start_message, dots_len * ".", end_message)
@contextmanager
def _print_elapsed_time(source, message=None):
"""Log elapsed time to stdout when the context is exited.
Parameters
----------
source : str
String indicating the source or the reference of the message.
message : str, default=None
Short message. If None, nothing will be printed.
Returns
-------
context_manager
Prints elapsed time upon exit if verbose.
"""
if message is None:
yield
else:
start = timeit.default_timer()
yield
print(_message_with_time(source, message, timeit.default_timer() - start))
def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None):
"""Calculate how many rows can be processed within `working_memory`.
Parameters
----------
row_bytes : int
The expected number of bytes of memory that will be consumed
during the processing of each row.
max_n_rows : int, default=None
The maximum return value.
working_memory : int or float, default=None
The number of rows to fit inside this number of MiB will be
returned. When None (default), the value of
``sklearn.get_config()['working_memory']`` is used.
Returns
-------
int
The number of rows which can be processed within `working_memory`.
Warns
-----
Issues a UserWarning if ``row_bytes`` exceeds ``working_memory`` MiB.
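Examples
--------
For instance, with a 1 MiB budget and 8-byte rows:
>>> from sklearn.utils import get_chunk_n_rows
>>> get_chunk_n_rows(row_bytes=8, working_memory=1)
131072
>>> get_chunk_n_rows(row_bytes=8, max_n_rows=100, working_memory=1)
100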
"""
if working_memory is None:
working_memory = get_config()["working_memory"]
chunk_n_rows = int(working_memory * (2**20) // row_bytes)
if max_n_rows is not None:
chunk_n_rows = min(chunk_n_rows, max_n_rows)
if chunk_n_rows < 1:
warnings.warn(
"Could not adhere to working_memory config. "
"Currently %.0fMiB, %.0fMiB required."
% (working_memory, np.ceil(row_bytes * 2**-20))
)
chunk_n_rows = 1
return chunk_n_rows
def _is_pandas_na(x):
"""Test if x is pandas.NA.
We intentionally do not use this function to return `True` for `pd.NA` in
`is_scalar_nan`, because estimators that support `pd.NA` are the exception
rather than the rule at the moment. When `pd.NA` is more universally
supported, we may reconsider this decision.
Parameters
----------
x : any type
Returns
-------
boolean
"""
with suppress(ImportError):
from pandas import NA
return x is NA
return False
def is_scalar_nan(x):
"""Test if x is NaN.
This function is meant to overcome the issue that np.isnan does not allow
non-numerical types as input, and that np.nan is not float('nan').
Parameters
----------
x : any type
Any scalar value.
Returns
-------
bool
Returns true if x is NaN, and false otherwise.
Examples
--------
>>> import numpy as np
>>> from sklearn.utils import is_scalar_nan
>>> is_scalar_nan(np.nan)
True
>>> is_scalar_nan(float("nan"))
True
>>> is_scalar_nan(None)
False
>>> is_scalar_nan("")
False
>>> is_scalar_nan([np.nan])
False
"""
return (
not isinstance(x, numbers.Integral)
and isinstance(x, numbers.Real)
and math.isnan(x)
)
def _approximate_mode(class_counts, n_draws, rng):
"""Computes approximate mode of multivariate hypergeometric.
This is an approximation to the mode of the multivariate
hypergeometric given by class_counts and n_draws.
It shouldn't be off by more than one.
It is the most likely outcome of drawing n_draws
samples from the population given by class_counts.
Parameters
----------
class_counts : ndarray of int
Population per class.
n_draws : int
Number of draws (samples to draw) from the overall population.
rng : random state
Used to break ties.
Returns
-------
sampled_classes : ndarray of int
Number of samples drawn from each class.
np.sum(sampled_classes) == n_draws
Examples
--------
>>> import numpy as np
>>> from sklearn.utils import _approximate_mode
>>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)
array([2, 1])
>>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)
array([3, 1])
>>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
... n_draws=2, rng=0)
array([0, 1, 1, 0])
>>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
... n_draws=2, rng=42)
array([1, 1, 0, 0])
"""
rng = check_random_state(rng)
# this computes a bad approximation to the mode of the
# multivariate hypergeometric given by class_counts and n_draws
continuous = class_counts / class_counts.sum() * n_draws
# floored means we don't overshoot n_samples, but probably undershoot
floored = np.floor(continuous)
# we add samples according to how much "left over" probability
# they had, until we arrive at n_samples
need_to_add = int(n_draws - floored.sum())
if need_to_add > 0:
remainder = continuous - floored
values = np.sort(np.unique(remainder))[::-1]
# add according to remainder, but break ties
# randomly to avoid biases
for value in values:
(inds,) = np.where(remainder == value)
# if we need_to_add less than what's in inds
# we draw randomly from them.
# if we need to add more, we add them all and
# go to the next value
add_now = min(len(inds), need_to_add)
inds = rng.choice(inds, size=add_now, replace=False)
floored[inds] += 1
need_to_add -= add_now
if need_to_add == 0:
break
return floored.astype(int)
def check_matplotlib_support(caller_name):
"""Raise ImportError with detailed error message if mpl is not installed.
Plot utilities like any of the Display's plotting functions should lazily import
matplotlib and call this helper before any computation.
Parameters
----------
caller_name : str
The name of the caller that requires matplotlib.
"""
try:
import matplotlib # noqa
except ImportError as e:
raise ImportError(
"{} requires matplotlib. You can install matplotlib with "
"`pip install matplotlib`".format(caller_name)
) from e
def check_pandas_support(caller_name):
"""Raise ImportError with detailed error message if pandas is not installed.
Utilities like :func:`fetch_openml` should lazily import
pandas and call this helper before any computation.
Parameters
----------
caller_name : str
The name of the caller that requires pandas.
Returns
-------
pandas
The pandas package.
"""
try:
import pandas # noqa
return pandas
except ImportError as e:
raise ImportError("{} requires pandas.".format(caller_name)) from e