750 lines
28 KiB
Python
750 lines
28 KiB
Python
"""Spectral Embedding."""
|
|
|
|
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
|
|
# Wei LI <kuantkid@gmail.com>
|
|
# License: BSD 3 clause
|
|
|
|
|
|
import warnings
|
|
from numbers import Integral, Real
|
|
|
|
import numpy as np
|
|
from scipy import sparse
|
|
from scipy.linalg import eigh
|
|
from scipy.sparse.csgraph import connected_components
|
|
from scipy.sparse.linalg import eigsh, lobpcg
|
|
|
|
from ..base import BaseEstimator, _fit_context
|
|
from ..metrics.pairwise import rbf_kernel
|
|
from ..neighbors import NearestNeighbors, kneighbors_graph
|
|
from ..utils import (
|
|
check_array,
|
|
check_random_state,
|
|
check_symmetric,
|
|
)
|
|
from ..utils._arpack import _init_arpack_v0
|
|
from ..utils._param_validation import Interval, StrOptions
|
|
from ..utils.extmath import _deterministic_vector_sign_flip
|
|
from ..utils.fixes import laplacian as csgraph_laplacian
|
|
from ..utils.fixes import parse_version, sp_version
|
|
|
|
|
|
def _graph_connected_component(graph, node_id):
    """Return the mask of the nodes reachable from a given node.

    The component is grown breadth-first: at every step the rows of the
    adjacency matrix belonging to the current frontier are OR-ed together
    to obtain the next frontier, until no new node is discovered.

    Parameters
    ----------
    graph : {ndarray, sparse matrix} of shape (n_samples, n_samples)
        Adjacency matrix of the graph, non-zero weight means an edge
        between the nodes.

    node_id : int
        The index of the query node of the graph.

    Returns
    -------
    connected_components_matrix : ndarray of shape (n_samples,)
        Boolean mask, True for every node that belongs to the connected
        component containing the query node.
    """
    n_node = graph.shape[0]
    graph_is_sparse = sparse.issparse(graph)
    if graph_is_sparse:
        # CSR gives cheap row-wise access to the connection mask
        graph = graph.tocsr()

    visited = np.zeros(n_node, dtype=bool)
    frontier = np.zeros(n_node, dtype=bool)
    frontier[node_id] = True

    for _ in range(n_node):
        size_before = visited.sum()
        visited |= frontier
        if visited.sum() <= size_before:
            # the last frontier brought nothing new: the component is complete
            break
        frontier_ids = np.flatnonzero(frontier)
        frontier[:] = False
        for row in frontier_ids:
            if graph_is_sparse:
                # scipy not yet implemented 1D sparse slices; can be changed
                # back to `graph[row].toarray().ravel()` once implemented
                row_weights = graph[[row], :].toarray().ravel()
            else:
                row_weights = graph[row]
            np.logical_or(frontier, row_weights, out=frontier)
    return visited
|
|
|
|
|
|
def _graph_is_connected(graph):
    """Tell whether a graph consists of a single connected component.

    Parameters
    ----------
    graph : {array-like, sparse matrix} of shape (n_samples, n_samples)
        Adjacency matrix of the graph, non-zero weight means an edge
        between the nodes.

    Returns
    -------
    is_connected : bool
        True means the graph is fully connected and False means not.
    """
    if not sparse.issparse(graph):
        # dense graph: grow the component of node 0 and check that it
        # covers every node
        reached = _graph_connected_component(graph, 0)
        return reached.sum() == graph.shape[0]

    # Before Scipy 1.11.3, `connected_components` only supports 32-bit indices.
    # PR: https://github.com/scipy/scipy/pull/18913
    # First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279
    # TODO(jjerphan): Once SciPy 1.11.3 is the minimum supported version, use
    # `accept_large_sparse=True`.
    large_indices_ok = sp_version >= parse_version("1.11.3")
    graph = check_array(
        graph, accept_sparse=True, accept_large_sparse=large_indices_ok
    )
    # sparse graph: count all the connected components
    n_connected_components, _ = connected_components(graph)
    return n_connected_components == 1
|
|
|
|
|
|
def _set_diag(laplacian, value, norm_laplacian):
    """Overwrite the Laplacian diagonal and pick a solver-friendly format.

    Parameters
    ----------
    laplacian : {ndarray, sparse matrix}
        The graph laplacian.

    value : float
        The value to write on the diagonal.

    norm_laplacian : bool
        Whether the diagonal should be overwritten with `value`. When False
        the entries are left as they are and only the format conversion
        (for sparse input) takes place.

    Returns
    -------
    laplacian : {ndarray, sparse matrix}
        A dense input is returned as is (modified in place when
        `norm_laplacian` is True). A sparse input is returned in DIA format
        when it has at most 7 distinct diagonals and in CSR format otherwise.

    Notes
    -----
    For sparse input only the diagonal entries that are explicitly stored
    are overwritten.
    """
    if not sparse.issparse(laplacian):
        if norm_laplacian:
            # stepping by n_nodes + 1 through the flattened array walks
            # along the main diagonal
            diagonal_stride = laplacian.shape[0] + 1
            laplacian.flat[::diagonal_stride] = value
        return laplacian

    coo = laplacian.tocoo()
    if norm_laplacian:
        on_diagonal = coo.row == coo.col
        coo.data[on_diagonal] = value

    # If the matrix has a small number of diagonals (as in the case of
    # structured matrices coming from images), the dia format might be best
    # suited for matvec products.
    populated_offsets = np.unique(coo.row - coo.col)
    if populated_offsets.size <= 7:
        # 3 or less outer diagonals on each side
        return coo.todia()
    # csr has the fastest matvec and is thus best suited to arpack
    return coo.tocsr()
|
|
|
|
|
|
def spectral_embedding(
    adjacency,
    *,
    n_components=8,
    eigen_solver=None,
    random_state=None,
    eigen_tol="auto",
    norm_laplacian=True,
    drop_first=True,
):
    """Project the sample on the first eigenvectors of the graph Laplacian.

    The adjacency matrix is used to compute a normalized graph Laplacian
    whose spectrum (especially the eigenvectors associated to the
    smallest eigenvalues) has an interpretation in terms of minimal
    number of cuts necessary to split the graph into comparably sized
    components.

    This embedding can also 'work' even if the ``adjacency`` variable is
    not strictly the adjacency matrix of a graph but more generally
    an affinity or similarity matrix between samples (for instance the
    heat kernel of a euclidean distance matrix or a k-NN matrix).

    However care must be taken to always make the affinity matrix symmetric
    so that the eigenvector decomposition works as expected.

    Note : Laplacian Eigenmaps is the actual algorithm implemented here.

    Read more in the :ref:`User Guide <spectral_embedding>`.

    Parameters
    ----------
    adjacency : {array-like, sparse graph} of shape (n_samples, n_samples)
        The adjacency matrix of the graph to embed.

    n_components : int, default=8
        The dimension of the projection subspace.

    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities. If None, then ``'arpack'`` is
        used.

    random_state : int, RandomState instance or None, default=None
        A pseudo random number generator used to draw the ARPACK starting
        vector when `eigen_solver == 'arpack'` and the initial approximation
        of the eigenvectors handed to lobpcg when `eigen_solver` is
        `'lobpcg'` or `'amg'`. Use an int to make the results deterministic
        across calls (See :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    eigen_tol : float, default="auto"
        Stopping criterion for eigendecomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to their heuristics. See,
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="amg"` values of `tol<1e-5` may lead
        to convergence issues and should be avoided.

        .. versionadded:: 1.2
           Added 'auto' option.

    norm_laplacian : bool, default=True
        If True, then compute symmetric normalized Laplacian.

    drop_first : bool, default=True
        Whether to drop the first eigenvector. For spectral embedding, this
        should be True as the first eigenvector should be constant vector for
        connected graph, but for spectral clustering, this should be kept as
        False to retain the first eigenvector.

    Returns
    -------
    embedding : ndarray of shape (n_samples, n_components)
        The reduced samples.

    Raises
    ------
    ValueError
        If `eigen_solver='amg'` is requested but pyamg cannot be imported,
        if `eigen_solver` is not one of the supported values, or if a
        lobpcg-based solver returns an embedding made of a single vector.

    Notes
    -----
    Spectral Embedding (Laplacian Eigenmaps) is most useful when the graph
    has one connected component. If the graph has many components, the first
    few eigenvectors will simply uncover the connected components of the graph.

    References
    ----------
    * https://en.wikipedia.org/wiki/LOBPCG

    * :doi:`"Toward the Optimal Preconditioned Eigensolver: Locally Optimal
      Block Preconditioned Conjugate Gradient Method",
      Andrew V. Knyazev
      <10.1137/S1064827500366124>`

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.neighbors import kneighbors_graph
    >>> from sklearn.manifold import spectral_embedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> X = X[:100]
    >>> affinity_matrix = kneighbors_graph(
    ...     X, n_neighbors=int(X.shape[0] / 10), include_self=True
    ... )
    >>> # make the matrix symmetric
    >>> affinity_matrix = 0.5 * (affinity_matrix + affinity_matrix.T)
    >>> embedding = spectral_embedding(affinity_matrix, n_components=2, random_state=42)
    >>> embedding.shape
    (100, 2)
    """
    adjacency = check_symmetric(adjacency)

    if eigen_solver == "amg":
        # pyamg is an optional dependency: import it lazily and fail early
        # with an explicit message when it is missing
        try:
            from pyamg import smoothed_aggregation_solver
        except ImportError as e:
            raise ValueError(
                "The eigen_solver was set to 'amg', but pyamg is not available."
            ) from e

    if eigen_solver is None:
        eigen_solver = "arpack"
    elif eigen_solver not in ("arpack", "lobpcg", "amg"):
        raise ValueError(
            "Unknown value for eigen_solver: '%s'. "
            "Should be 'amg', 'arpack', or 'lobpcg'" % eigen_solver
        )

    random_state = check_random_state(random_state)

    n_nodes = adjacency.shape[0]
    # Whether to drop the first eigenvector: one extra eigenvector is
    # requested so that `n_components` remain once the first one is removed
    if drop_first:
        n_components = n_components + 1

    if not _graph_is_connected(adjacency):
        warnings.warn(
            "Graph is not fully connected, spectral embedding may not work as expected."
        )

    laplacian, dd = csgraph_laplacian(
        adjacency, normed=norm_laplacian, return_diag=True
    )
    # ARPACK is used when explicitly requested, and also in place of 'amg'
    # when the laplacian is dense or the problem is small.
    if eigen_solver == "arpack" or (
        eigen_solver != "lobpcg"
        and (not sparse.issparse(laplacian) or n_nodes < 5 * n_components)
    ):
        # lobpcg used with eigen_solver='amg' has bugs for low number of nodes
        # for details see the source code in scipy:
        # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
        # /lobpcg/lobpcg.py#L237
        # or matlab:
        # https://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
        laplacian = _set_diag(laplacian, 1, norm_laplacian)

        # Here we'll use shift-invert mode for fast eigenvalues
        # (see https://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
        #  for a short explanation of what this means)
        # Because the normalized Laplacian has eigenvalues between 0 and 2,
        # I - L has eigenvalues between -1 and 1.  ARPACK is most efficient
        # when finding eigenvalues of largest magnitude (keyword which='LM')
        # and when these eigenvalues are very large compared to the rest.
        # For very large, very sparse graphs, I - L can have many, many
        # eigenvalues very near 1.0.  This leads to slow convergence.  So
        # instead, we'll use ARPACK's shift-invert mode, asking for the
        # eigenvalues near 1.0.  This effectively spreads-out the spectrum
        # near 1.0 and leads to much faster convergence: potentially an
        # orders-of-magnitude speedup over simply using keyword which='LA'
        # in standard mode.
        try:
            # We are computing the opposite of the laplacian inplace so as
            # to spare a memory allocation of a possibly very large array
            tol = 0 if eigen_tol == "auto" else eigen_tol
            laplacian *= -1
            v0 = _init_arpack_v0(laplacian.shape[0], random_state)
            laplacian = check_array(
                laplacian, accept_sparse="csr", accept_large_sparse=False
            )
            _, diffusion_map = eigsh(
                laplacian, k=n_components, sigma=1.0, which="LM", tol=tol, v0=v0
            )
            # one eigenvector per row, taken in reverse order of what eigsh
            # returned
            embedding = diffusion_map.T[n_components::-1]
            if norm_laplacian:
                # recover u = D^-1/2 x from the eigenvector output x
                embedding = embedding / dd
        except RuntimeError:
            # When submatrices are exactly singular, an LU decomposition
            # in arpack fails. We fallback to lobpcg
            eigen_solver = "lobpcg"
            # Revert the laplacian to its opposite to have lobpcg work
            laplacian *= -1

    elif eigen_solver == "amg":
        # Use AMG to get a preconditioner and speed up the eigenvalue
        # problem.
        if not sparse.issparse(laplacian):
            warnings.warn("AMG works better for sparse matrices")
        laplacian = check_array(
            laplacian, dtype=[np.float64, np.float32], accept_sparse=True
        )
        laplacian = _set_diag(laplacian, 1, norm_laplacian)

        # The Laplacian matrix is always singular, having at least one zero
        # eigenvalue, corresponding to the trivial eigenvector, which is a
        # constant. Using a singular matrix for preconditioning may result in
        # random failures in LOBPCG and is not supported by the existing
        # theory:
        #     see https://doi.org/10.1007/s10208-015-9297-1
        # Shift the Laplacian so its diagonal is not all ones. The shift
        # does change the eigenpairs however, so we'll feed the shifted
        # matrix to the solver and afterward set it back to the original.
        diag_shift = 1e-5 * sparse.eye(laplacian.shape[0])
        laplacian += diag_shift
        if hasattr(sparse, "csr_array") and isinstance(laplacian, sparse.csr_array):
            # `pyamg` does not work with `csr_array` and we need to convert it to a
            # `csr_matrix` object.
            laplacian = sparse.csr_matrix(laplacian)
        ml = smoothed_aggregation_solver(check_array(laplacian, accept_sparse="csr"))
        laplacian -= diag_shift

        M = ml.aspreconditioner()
        # Create initial approximation X to eigenvectors
        X = random_state.standard_normal(size=(laplacian.shape[0], n_components + 1))
        X[:, 0] = dd.ravel()
        X = X.astype(laplacian.dtype)

        tol = None if eigen_tol == "auto" else eigen_tol
        _, diffusion_map = lobpcg(laplacian, X, M=M, tol=tol, largest=False)
        embedding = diffusion_map.T
        if norm_laplacian:
            # recover u = D^-1/2 x from the eigenvector output x
            embedding = embedding / dd
        if embedding.shape[0] == 1:
            raise ValueError(
                "The 'amg' eigen solver returned a single eigenvector; "
                "the spectral embedding cannot be computed from it."
            )

    # Deliberately a plain `if` and not an `elif`: the ARPACK branch above
    # switches `eigen_solver` to 'lobpcg' when it fails.
    if eigen_solver == "lobpcg":
        laplacian = check_array(
            laplacian, dtype=[np.float64, np.float32], accept_sparse=True
        )
        if n_nodes < 5 * n_components + 1:
            # see note above under arpack why lobpcg has problems with small
            # number of nodes
            # lobpcg will fallback to eigh, so we short circuit it
            if sparse.issparse(laplacian):
                laplacian = laplacian.toarray()
            _, diffusion_map = eigh(laplacian, check_finite=False)
            embedding = diffusion_map.T[:n_components]
            if norm_laplacian:
                # recover u = D^-1/2 x from the eigenvector output x
                embedding = embedding / dd
        else:
            laplacian = _set_diag(laplacian, 1, norm_laplacian)
            # We increase the number of eigenvectors requested, as lobpcg
            # doesn't behave well in low dimension and create initial
            # approximation X to eigenvectors
            X = random_state.standard_normal(
                size=(laplacian.shape[0], n_components + 1)
            )
            X[:, 0] = dd.ravel()
            X = X.astype(laplacian.dtype)
            tol = None if eigen_tol == "auto" else eigen_tol
            _, diffusion_map = lobpcg(
                laplacian, X, tol=tol, largest=False, maxiter=2000
            )
            embedding = diffusion_map.T[:n_components]
            if norm_laplacian:
                # recover u = D^-1/2 x from the eigenvector output x
                embedding = embedding / dd
            if embedding.shape[0] == 1:
                raise ValueError(
                    "The 'lobpcg' eigen solver returned a single eigenvector; "
                    "the spectral embedding cannot be computed from it."
                )

    embedding = _deterministic_vector_sign_flip(embedding)
    if drop_first:
        # skip the first eigenvector; `n_components` was incremented above
        return embedding[1:n_components].T
    else:
        return embedding[:n_components].T
|
|
|
|
|
|
class SpectralEmbedding(BaseEstimator):
    """Spectral embedding for non-linear dimensionality reduction.

    Forms an affinity matrix given by the specified function and
    applies spectral decomposition to the corresponding graph laplacian.
    The resulting transformation is given by the value of the
    eigenvectors for each data point.

    Note : Laplacian Eigenmaps is the actual algorithm implemented here.

    Read more in the :ref:`User Guide <spectral_embedding>`.

    Parameters
    ----------
    n_components : int, default=2
        The dimension of the projected subspace.

    affinity : {'nearest_neighbors', 'rbf', 'precomputed', \
                'precomputed_nearest_neighbors'} or callable, \
                default='nearest_neighbors'
        How to construct the affinity matrix.

        - 'nearest_neighbors' : construct the affinity matrix by computing a
          graph of nearest neighbors.
        - 'rbf' : construct the affinity matrix by computing a radial basis
          function (RBF) kernel.
        - 'precomputed' : interpret ``X`` as a precomputed affinity matrix.
        - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph
          of precomputed nearest neighbors, and constructs the affinity matrix
          by selecting the ``n_neighbors`` nearest neighbors.
        - callable : use passed in function as affinity
          the function takes in data matrix (n_samples, n_features)
          and return affinity matrix (n_samples, n_samples).

    gamma : float, default=None
        Kernel coefficient for rbf kernel. If None, gamma will be set to
        1/n_features.

    random_state : int, RandomState instance or None, default=None
        A pseudo random number generator forwarded to
        :func:`spectral_embedding`, where it seeds the initialization of the
        eigen solver. Use an int to make the results deterministic across
        calls (See :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems.
        If None, then ``'arpack'`` is used.

    eigen_tol : float, default="auto"
        Stopping criterion for eigendecomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to their heuristics. See,
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
        values of `tol<1e-5` may lead to convergence issues and should be
        avoided.

        .. versionadded:: 1.2

    n_neighbors : int, default=None
        Number of nearest neighbors for nearest_neighbors graph building.
        If None, n_neighbors will be set to max(n_samples/10, 1).

    n_jobs : int, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    embedding_ : ndarray of shape (n_samples, n_components)
        Spectral embedding of the training matrix.

    affinity_matrix_ : {ndarray, sparse matrix} of shape (n_samples, n_samples)
        Affinity_matrix constructed from samples or precomputed.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_neighbors_ : int
        Number of nearest neighbors effectively used. Only set when the
        nearest neighbors graph is built from the samples.

    gamma_ : float
        Kernel coefficient effectively used. Only set when the RBF kernel
        is computed.

    See Also
    --------
    Isomap : Non-linear dimensionality reduction through Isometric Mapping.

    References
    ----------

    - :doi:`A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      <10.1007/s11222-007-9033-z>`

    - `On Spectral Clustering: Analysis and an algorithm, 2001
      Andrew Y. Ng, Michael I. Jordan, Yair Weiss
      <https://citeseerx.ist.psu.edu/doc_view/pid/796c5d6336fc52aa84db575fb821c78918b65f58>`_

    - :doi:`Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      <10.1109/34.868688>`

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import SpectralEmbedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = SpectralEmbedding(n_components=2)
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)
    """

    _parameter_constraints: dict = {
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "affinity": [
            StrOptions(
                {
                    "nearest_neighbors",
                    "rbf",
                    "precomputed",
                    "precomputed_nearest_neighbors",
                },
            ),
            callable,
        ],
        "gamma": [Interval(Real, 0, None, closed="left"), None],
        "random_state": ["random_state"],
        "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
        "eigen_tol": [Interval(Real, 0, None, closed="left"), StrOptions({"auto"})],
        "n_neighbors": [Interval(Integral, 1, None, closed="left"), None],
        "n_jobs": [None, Integral],
    }

    def __init__(
        self,
        n_components=2,
        *,
        affinity="nearest_neighbors",
        gamma=None,
        random_state=None,
        eigen_solver=None,
        eigen_tol="auto",
        n_neighbors=None,
        n_jobs=None,
    ):
        self.n_components = n_components
        self.affinity = affinity
        self.gamma = gamma
        self.random_state = random_state
        self.eigen_solver = eigen_solver
        self.eigen_tol = eigen_tol
        self.n_neighbors = n_neighbors
        self.n_jobs = n_jobs

    def _more_tags(self):
        # The input is a (n_samples, n_samples) matrix for both precomputed
        # affinities.
        return {
            "pairwise": self.affinity in [
                "precomputed",
                "precomputed_nearest_neighbors",
            ]
        }

    def _get_affinity_matrix(self, X, Y=None):
        """Calculate the affinity matrix from data.

        The result is also stored in the `affinity_matrix_` attribute.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            If affinity is "precomputed"
            X : array-like of shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        Y: Ignored

        Returns
        -------
        affinity_matrix of shape (n_samples, n_samples)
        """
        # Work on a local copy of the setting: the sparse-input fallback
        # below must not overwrite the constructor parameter, otherwise it
        # would leak into `get_params`, `clone` and subsequent fits.
        affinity = self.affinity

        if affinity == "precomputed":
            self.affinity_matrix_ = X
            return self.affinity_matrix_

        if affinity == "precomputed_nearest_neighbors":
            # NOTE(review): `self.n_neighbors` is forwarded as is; the `None`
            # default is only resolved in the 'nearest_neighbors' branch
            # below - confirm that NearestNeighbors copes with None here.
            estimator = NearestNeighbors(
                n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed"
            ).fit(X)
            connectivity = estimator.kneighbors_graph(X=X, mode="connectivity")
            # symmetrize the connectivity graph
            self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
            return self.affinity_matrix_

        if affinity == "nearest_neighbors":
            if sparse.issparse(X):
                warnings.warn(
                    "Nearest neighbors affinity currently does "
                    "not support sparse input, falling back to "
                    "rbf affinity"
                )
                affinity = "rbf"
            else:
                self.n_neighbors_ = (
                    self.n_neighbors
                    if self.n_neighbors is not None
                    else max(int(X.shape[0] / 10), 1)
                )
                self.affinity_matrix_ = kneighbors_graph(
                    X, self.n_neighbors_, include_self=True, n_jobs=self.n_jobs
                )
                # currently only symmetric affinity_matrix supported
                self.affinity_matrix_ = 0.5 * (
                    self.affinity_matrix_ + self.affinity_matrix_.T
                )
                return self.affinity_matrix_

        if affinity == "rbf":
            self.gamma_ = self.gamma if self.gamma is not None else 1.0 / X.shape[1]
            self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
            return self.affinity_matrix_

        # None of the named strategies matched: the affinity is a callable
        self.affinity_matrix_ = self.affinity(X)
        return self.affinity_matrix_

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the model from data in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            If affinity is "precomputed"
            X : {array-like, sparse matrix}, shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = self._validate_data(X, accept_sparse="csr", ensure_min_samples=2)

        random_state = check_random_state(self.random_state)

        affinity_matrix = self._get_affinity_matrix(X)
        self.embedding_ = spectral_embedding(
            affinity_matrix,
            n_components=self.n_components,
            eigen_solver=self.eigen_solver,
            eigen_tol=self.eigen_tol,
            random_state=random_state,
        )
        return self

    def fit_transform(self, X, y=None):
        """Fit the model from data in X and transform X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            If affinity is "precomputed"
            X : {array-like, sparse matrix} of shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        X_new : array-like of shape (n_samples, n_components)
            Spectral embedding of the training matrix.
        """
        self.fit(X)
        return self.embedding_
|