""" The :mod:`sklearn.utils.sparsefuncs` module includes a collection of utilities to work with sparse matrices and arrays. """ # Authors: Manoj Kumar # Thomas Unterthiner # Giorgio Patrini # # License: BSD 3 clause import numpy as np import scipy.sparse as sp from scipy.sparse.linalg import LinearOperator from ..utils.fixes import _sparse_min_max, _sparse_nan_min_max from ..utils.validation import _check_sample_weight from .sparsefuncs_fast import ( csc_mean_variance_axis0 as _csc_mean_var_axis0, ) from .sparsefuncs_fast import ( csr_mean_variance_axis0 as _csr_mean_var_axis0, ) from .sparsefuncs_fast import ( incr_mean_variance_axis0 as _incr_mean_var_axis0, ) def _raise_typeerror(X): """Raises a TypeError if X is not a CSR or CSC matrix""" input_type = X.format if sp.issparse(X) else type(X) err = "Expected a CSR or CSC sparse matrix, got %s." % input_type raise TypeError(err) def _raise_error_wrong_axis(axis): if axis not in (0, 1): raise ValueError( "Unknown axis value: %d. Use 0 for rows, or 1 for columns" % axis ) def inplace_csr_column_scale(X, scale): """Inplace column scaling of a CSR matrix. Scale each feature of the data matrix by multiplying with specific scale provided by the caller assuming a (n_samples, n_features) shape. Parameters ---------- X : sparse matrix of shape (n_samples, n_features) Matrix to normalize using the variance of the features. It should be of CSR format. scale : ndarray of shape (n_features,), dtype={np.float32, np.float64} Array of precomputed feature-wise values to use for scaling. Examples -------- >>> from sklearn.utils import sparsefuncs >>> from scipy import sparse >>> import numpy as np >>> indptr = np.array([0, 3, 4, 4, 4]) >>> indices = np.array([0, 1, 2, 2]) >>> data = np.array([8, 1, 2, 5]) >>> scale = np.array([2, 3, 2]) >>> csr = sparse.csr_matrix((data, indices, indptr)) >>> csr.todense() matrix([[8, 1, 2], [0, 0, 5], [0, 0, 0], [0, 0, 0]]) >>> sparsefuncs.inplace_csr_column_scale(csr, scale) >>> csr.todense() matrix([[16, 3, 4], [ 0, 0, 10], [ 0, 0, 0], [ 0, 0, 0]]) """ assert scale.shape[0] == X.shape[1] X.data *= scale.take(X.indices, mode="clip") def inplace_csr_row_scale(X, scale): """Inplace row scaling of a CSR matrix. Scale each sample of the data matrix by multiplying with specific scale provided by the caller assuming a (n_samples, n_features) shape. Parameters ---------- X : sparse matrix of shape (n_samples, n_features) Matrix to be scaled. It should be of CSR format. scale : ndarray of float of shape (n_samples,) Array of precomputed sample-wise values to use for scaling. """ assert scale.shape[0] == X.shape[0] X.data *= np.repeat(scale, np.diff(X.indptr)) def mean_variance_axis(X, axis, weights=None, return_sum_weights=False): """Compute mean and variance along an axis on a CSR or CSC matrix. Parameters ---------- X : sparse matrix of shape (n_samples, n_features) Input data. It can be of CSR or CSC format. axis : {0, 1} Axis along which the axis should be computed. weights : ndarray of shape (n_samples,) or (n_features,), default=None If axis is set to 0 shape is (n_samples,) or if axis is set to 1 shape is (n_features,). If it is set to None, then samples are equally weighted. .. versionadded:: 0.24 return_sum_weights : bool, default=False If True, returns the sum of weights seen for each feature if `axis=0` or each sample if `axis=1`. .. versionadded:: 0.24 Returns ------- means : ndarray of shape (n_features,), dtype=floating Feature-wise means. variances : ndarray of shape (n_features,), dtype=floating Feature-wise variances. sum_weights : ndarray of shape (n_features,), dtype=floating Returned if `return_sum_weights` is `True`. Examples -------- >>> from sklearn.utils import sparsefuncs >>> from scipy import sparse >>> import numpy as np >>> indptr = np.array([0, 3, 4, 4, 4]) >>> indices = np.array([0, 1, 2, 2]) >>> data = np.array([8, 1, 2, 5]) >>> scale = np.array([2, 3, 2]) >>> csr = sparse.csr_matrix((data, indices, indptr)) >>> csr.todense() matrix([[8, 1, 2], [0, 0, 5], [0, 0, 0], [0, 0, 0]]) >>> sparsefuncs.mean_variance_axis(csr, axis=0) (array([2. , 0.25, 1.75]), array([12. , 0.1875, 4.1875])) """ _raise_error_wrong_axis(axis) if sp.issparse(X) and X.format == "csr": if axis == 0: return _csr_mean_var_axis0( X, weights=weights, return_sum_weights=return_sum_weights ) else: return _csc_mean_var_axis0( X.T, weights=weights, return_sum_weights=return_sum_weights ) elif sp.issparse(X) and X.format == "csc": if axis == 0: return _csc_mean_var_axis0( X, weights=weights, return_sum_weights=return_sum_weights ) else: return _csr_mean_var_axis0( X.T, weights=weights, return_sum_weights=return_sum_weights ) else: _raise_typeerror(X) def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n, weights=None): """Compute incremental mean and variance along an axis on a CSR or CSC matrix. last_mean, last_var are the statistics computed at the last step by this function. Both must be initialized to 0-arrays of the proper size, i.e. the number of features in X. last_n is the number of samples encountered until now. Parameters ---------- X : CSR or CSC sparse matrix of shape (n_samples, n_features) Input data. axis : {0, 1} Axis along which the axis should be computed. last_mean : ndarray of shape (n_features,) or (n_samples,), dtype=floating Array of means to update with the new data X. Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1. last_var : ndarray of shape (n_features,) or (n_samples,), dtype=floating Array of variances to update with the new data X. Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1. last_n : float or ndarray of shape (n_features,) or (n_samples,), \ dtype=floating Sum of the weights seen so far, excluding the current weights If not float, it should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1. If float it corresponds to having same weights for all samples (or features). weights : ndarray of shape (n_samples,) or (n_features,), default=None If axis is set to 0 shape is (n_samples,) or if axis is set to 1 shape is (n_features,). If it is set to None, then samples are equally weighted. .. versionadded:: 0.24 Returns ------- means : ndarray of shape (n_features,) or (n_samples,), dtype=floating Updated feature-wise means if axis = 0 or sample-wise means if axis = 1. variances : ndarray of shape (n_features,) or (n_samples,), dtype=floating Updated feature-wise variances if axis = 0 or sample-wise variances if axis = 1. n : ndarray of shape (n_features,) or (n_samples,), dtype=integral Updated number of seen samples per feature if axis=0 or number of seen features per sample if axis=1. If weights is not None, n is a sum of the weights of the seen samples or features instead of the actual number of seen samples or features. Notes ----- NaNs are ignored in the algorithm. Examples -------- >>> from sklearn.utils import sparsefuncs >>> from scipy import sparse >>> import numpy as np >>> indptr = np.array([0, 3, 4, 4, 4]) >>> indices = np.array([0, 1, 2, 2]) >>> data = np.array([8, 1, 2, 5]) >>> scale = np.array([2, 3, 2]) >>> csr = sparse.csr_matrix((data, indices, indptr)) >>> csr.todense() matrix([[8, 1, 2], [0, 0, 5], [0, 0, 0], [0, 0, 0]]) >>> sparsefuncs.incr_mean_variance_axis( ... csr, axis=0, last_mean=np.zeros(3), last_var=np.zeros(3), last_n=2 ... ) (array([1.3..., 0.1..., 1.1...]), array([8.8..., 0.1..., 3.4...]), array([6., 6., 6.])) """ _raise_error_wrong_axis(axis) if not (sp.issparse(X) and X.format in ("csc", "csr")): _raise_typeerror(X) if np.size(last_n) == 1: last_n = np.full(last_mean.shape, last_n, dtype=last_mean.dtype) if not (np.size(last_mean) == np.size(last_var) == np.size(last_n)): raise ValueError("last_mean, last_var, last_n do not have the same shapes.") if axis == 1: if np.size(last_mean) != X.shape[0]: raise ValueError( "If axis=1, then last_mean, last_n, last_var should be of " f"size n_samples {X.shape[0]} (Got {np.size(last_mean)})." ) else: # axis == 0 if np.size(last_mean) != X.shape[1]: raise ValueError( "If axis=0, then last_mean, last_n, last_var should be of " f"size n_features {X.shape[1]} (Got {np.size(last_mean)})." ) X = X.T if axis == 1 else X if weights is not None: weights = _check_sample_weight(weights, X, dtype=X.dtype) return _incr_mean_var_axis0( X, last_mean=last_mean, last_var=last_var, last_n=last_n, weights=weights ) def inplace_column_scale(X, scale): """Inplace column scaling of a CSC/CSR matrix. Scale each feature of the data matrix by multiplying with specific scale provided by the caller assuming a (n_samples, n_features) shape. Parameters ---------- X : sparse matrix of shape (n_samples, n_features) Matrix to normalize using the variance of the features. It should be of CSC or CSR format. scale : ndarray of shape (n_features,), dtype={np.float32, np.float64} Array of precomputed feature-wise values to use for scaling. Examples -------- >>> from sklearn.utils import sparsefuncs >>> from scipy import sparse >>> import numpy as np >>> indptr = np.array([0, 3, 4, 4, 4]) >>> indices = np.array([0, 1, 2, 2]) >>> data = np.array([8, 1, 2, 5]) >>> scale = np.array([2, 3, 2]) >>> csr = sparse.csr_matrix((data, indices, indptr)) >>> csr.todense() matrix([[8, 1, 2], [0, 0, 5], [0, 0, 0], [0, 0, 0]]) >>> sparsefuncs.inplace_column_scale(csr, scale) >>> csr.todense() matrix([[16, 3, 4], [ 0, 0, 10], [ 0, 0, 0], [ 0, 0, 0]]) """ if sp.issparse(X) and X.format == "csc": inplace_csr_row_scale(X.T, scale) elif sp.issparse(X) and X.format == "csr": inplace_csr_column_scale(X, scale) else: _raise_typeerror(X) def inplace_row_scale(X, scale): """Inplace row scaling of a CSR or CSC matrix. Scale each row of the data matrix by multiplying with specific scale provided by the caller assuming a (n_samples, n_features) shape. Parameters ---------- X : sparse matrix of shape (n_samples, n_features) Matrix to be scaled. It should be of CSR or CSC format. scale : ndarray of shape (n_features,), dtype={np.float32, np.float64} Array of precomputed sample-wise values to use for scaling. Examples -------- >>> from sklearn.utils import sparsefuncs >>> from scipy import sparse >>> import numpy as np >>> indptr = np.array([0, 2, 3, 4, 5]) >>> indices = np.array([0, 1, 2, 3, 3]) >>> data = np.array([8, 1, 2, 5, 6]) >>> scale = np.array([2, 3, 4, 5]) >>> csr = sparse.csr_matrix((data, indices, indptr)) >>> csr.todense() matrix([[8, 1, 0, 0], [0, 0, 2, 0], [0, 0, 0, 5], [0, 0, 0, 6]]) >>> sparsefuncs.inplace_row_scale(csr, scale) >>> csr.todense() matrix([[16, 2, 0, 0], [ 0, 0, 6, 0], [ 0, 0, 0, 20], [ 0, 0, 0, 30]]) """ if sp.issparse(X) and X.format == "csc": inplace_csr_column_scale(X.T, scale) elif sp.issparse(X) and X.format == "csr": inplace_csr_row_scale(X, scale) else: _raise_typeerror(X) def inplace_swap_row_csc(X, m, n): """Swap two rows of a CSC matrix in-place. Parameters ---------- X : sparse matrix of shape (n_samples, n_features) Matrix whose two rows are to be swapped. It should be of CSC format. m : int Index of the row of X to be swapped. n : int Index of the row of X to be swapped. """ for t in [m, n]: if isinstance(t, np.ndarray): raise TypeError("m and n should be valid integers") if m < 0: m += X.shape[0] if n < 0: n += X.shape[0] m_mask = X.indices == m X.indices[X.indices == n] = m X.indices[m_mask] = n def inplace_swap_row_csr(X, m, n): """Swap two rows of a CSR matrix in-place. Parameters ---------- X : sparse matrix of shape (n_samples, n_features) Matrix whose two rows are to be swapped. It should be of CSR format. m : int Index of the row of X to be swapped. n : int Index of the row of X to be swapped. """ for t in [m, n]: if isinstance(t, np.ndarray): raise TypeError("m and n should be valid integers") if m < 0: m += X.shape[0] if n < 0: n += X.shape[0] # The following swapping makes life easier since m is assumed to be the # smaller integer below. if m > n: m, n = n, m indptr = X.indptr m_start = indptr[m] m_stop = indptr[m + 1] n_start = indptr[n] n_stop = indptr[n + 1] nz_m = m_stop - m_start nz_n = n_stop - n_start if nz_m != nz_n: # Modify indptr first X.indptr[m + 2 : n] += nz_n - nz_m X.indptr[m + 1] = m_start + nz_n X.indptr[n] = n_stop - nz_m X.indices = np.concatenate( [ X.indices[:m_start], X.indices[n_start:n_stop], X.indices[m_stop:n_start], X.indices[m_start:m_stop], X.indices[n_stop:], ] ) X.data = np.concatenate( [ X.data[:m_start], X.data[n_start:n_stop], X.data[m_stop:n_start], X.data[m_start:m_stop], X.data[n_stop:], ] ) def inplace_swap_row(X, m, n): """ Swap two rows of a CSC/CSR matrix in-place. Parameters ---------- X : sparse matrix of shape (n_samples, n_features) Matrix whose two rows are to be swapped. It should be of CSR or CSC format. m : int Index of the row of X to be swapped. n : int Index of the row of X to be swapped. Examples -------- >>> from sklearn.utils import sparsefuncs >>> from scipy import sparse >>> import numpy as np >>> indptr = np.array([0, 2, 3, 3, 3]) >>> indices = np.array([0, 2, 2]) >>> data = np.array([8, 2, 5]) >>> csr = sparse.csr_matrix((data, indices, indptr)) >>> csr.todense() matrix([[8, 0, 2], [0, 0, 5], [0, 0, 0], [0, 0, 0]]) >>> sparsefuncs.inplace_swap_row(csr, 0, 1) >>> csr.todense() matrix([[0, 0, 5], [8, 0, 2], [0, 0, 0], [0, 0, 0]]) """ if sp.issparse(X) and X.format == "csc": inplace_swap_row_csc(X, m, n) elif sp.issparse(X) and X.format == "csr": inplace_swap_row_csr(X, m, n) else: _raise_typeerror(X) def inplace_swap_column(X, m, n): """ Swap two columns of a CSC/CSR matrix in-place. Parameters ---------- X : sparse matrix of shape (n_samples, n_features) Matrix whose two columns are to be swapped. It should be of CSR or CSC format. m : int Index of the column of X to be swapped. n : int Index of the column of X to be swapped. Examples -------- >>> from sklearn.utils import sparsefuncs >>> from scipy import sparse >>> import numpy as np >>> indptr = np.array([0, 2, 3, 3, 3]) >>> indices = np.array([0, 2, 2]) >>> data = np.array([8, 2, 5]) >>> csr = sparse.csr_matrix((data, indices, indptr)) >>> csr.todense() matrix([[8, 0, 2], [0, 0, 5], [0, 0, 0], [0, 0, 0]]) >>> sparsefuncs.inplace_swap_column(csr, 0, 1) >>> csr.todense() matrix([[0, 8, 2], [0, 0, 5], [0, 0, 0], [0, 0, 0]]) """ if m < 0: m += X.shape[1] if n < 0: n += X.shape[1] if sp.issparse(X) and X.format == "csc": inplace_swap_row_csr(X, m, n) elif sp.issparse(X) and X.format == "csr": inplace_swap_row_csc(X, m, n) else: _raise_typeerror(X) def min_max_axis(X, axis, ignore_nan=False): """Compute minimum and maximum along an axis on a CSR or CSC matrix. Optionally ignore NaN values. Parameters ---------- X : sparse matrix of shape (n_samples, n_features) Input data. It should be of CSR or CSC format. axis : {0, 1} Axis along which the axis should be computed. ignore_nan : bool, default=False Ignore or passing through NaN values. .. versionadded:: 0.20 Returns ------- mins : ndarray of shape (n_features,), dtype={np.float32, np.float64} Feature-wise minima. maxs : ndarray of shape (n_features,), dtype={np.float32, np.float64} Feature-wise maxima. """ if sp.issparse(X) and X.format in ("csr", "csc"): if ignore_nan: return _sparse_nan_min_max(X, axis=axis) else: return _sparse_min_max(X, axis=axis) else: _raise_typeerror(X) def count_nonzero(X, axis=None, sample_weight=None): """A variant of X.getnnz() with extension to weighting on axis 0. Useful in efficiently calculating multilabel metrics. Parameters ---------- X : sparse matrix of shape (n_samples, n_labels) Input data. It should be of CSR format. axis : {0, 1}, default=None The axis on which the data is aggregated. sample_weight : array-like of shape (n_samples,), default=None Weight for each row of X. Returns ------- nnz : int, float, ndarray of shape (n_samples,) or ndarray of shape (n_features,) Number of non-zero values in the array along a given axis. Otherwise, the total number of non-zero values in the array is returned. """ if axis == -1: axis = 1 elif axis == -2: axis = 0 elif X.format != "csr": raise TypeError("Expected CSR sparse format, got {0}".format(X.format)) # We rely here on the fact that np.diff(Y.indptr) for a CSR # will return the number of nonzero entries in each row. # A bincount over Y.indices will return the number of nonzeros # in each column. See ``csr_matrix.getnnz`` in scipy >= 0.14. if axis is None: if sample_weight is None: return X.nnz else: return np.dot(np.diff(X.indptr), sample_weight) elif axis == 1: out = np.diff(X.indptr) if sample_weight is None: # astype here is for consistency with axis=0 dtype return out.astype("intp") return out * sample_weight elif axis == 0: if sample_weight is None: return np.bincount(X.indices, minlength=X.shape[1]) else: weights = np.repeat(sample_weight, np.diff(X.indptr)) return np.bincount(X.indices, minlength=X.shape[1], weights=weights) else: raise ValueError("Unsupported axis: {0}".format(axis)) def _get_median(data, n_zeros): """Compute the median of data with n_zeros additional zeros. This function is used to support sparse matrices; it modifies data in-place. """ n_elems = len(data) + n_zeros if not n_elems: return np.nan n_negative = np.count_nonzero(data < 0) middle, is_odd = divmod(n_elems, 2) data.sort() if is_odd: return _get_elem_at_rank(middle, data, n_negative, n_zeros) return ( _get_elem_at_rank(middle - 1, data, n_negative, n_zeros) + _get_elem_at_rank(middle, data, n_negative, n_zeros) ) / 2.0 def _get_elem_at_rank(rank, data, n_negative, n_zeros): """Find the value in data augmented with n_zeros for the given rank""" if rank < n_negative: return data[rank] if rank - n_negative < n_zeros: return 0 return data[rank - n_zeros] def csc_median_axis_0(X): """Find the median across axis 0 of a CSC matrix. It is equivalent to doing np.median(X, axis=0). Parameters ---------- X : sparse matrix of shape (n_samples, n_features) Input data. It should be of CSC format. Returns ------- median : ndarray of shape (n_features,) Median. """ if not (sp.issparse(X) and X.format == "csc"): raise TypeError("Expected matrix of CSC format, got %s" % X.format) indptr = X.indptr n_samples, n_features = X.shape median = np.zeros(n_features) for f_ind, (start, end) in enumerate(zip(indptr[:-1], indptr[1:])): # Prevent modifying X in place data = np.copy(X.data[start:end]) nz = n_samples - data.size median[f_ind] = _get_median(data, nz) return median def _implicit_column_offset(X, offset): """Create an implicitly offset linear operator. This is used by PCA on sparse data to avoid densifying the whole data matrix. Params ------ X : sparse matrix of shape (n_samples, n_features) offset : ndarray of shape (n_features,) Returns ------- centered : LinearOperator """ offset = offset[None, :] XT = X.T return LinearOperator( matvec=lambda x: X @ x - offset @ x, matmat=lambda x: X @ x - offset @ x, rmatvec=lambda x: XT @ x - (offset * x.sum()), rmatmat=lambda x: XT @ x - offset.T @ x.sum(axis=0)[None, :], dtype=X.dtype, shape=X.shape, )