ai-content-maker/.venv/Lib/site-packages/sklearn/cluster/_agglomerative.py

1337 lines
48 KiB
Python

"""Hierarchical Agglomerative Clustering
These routines perform some hierarchical agglomerative clustering of some
input data.
Authors : Vincent Michel, Bertrand Thirion, Alexandre Gramfort,
Gael Varoquaux
License: BSD 3 clause
"""
import warnings
from heapq import heapify, heappop, heappush, heappushpop
from numbers import Integral, Real
import numpy as np
from scipy import sparse
from scipy.sparse.csgraph import connected_components
from ..base import (
BaseEstimator,
ClassNamePrefixFeaturesOutMixin,
ClusterMixin,
_fit_context,
)
from ..metrics import DistanceMetric
from ..metrics._dist_metrics import METRIC_MAPPING64
from ..metrics.pairwise import _VALID_METRICS, paired_distances
from ..utils import check_array
from ..utils._fast_dict import IntFloatDict
from ..utils._param_validation import (
HasMethods,
Hidden,
Interval,
StrOptions,
validate_params,
)
from ..utils.graph import _fix_connected_components
from ..utils.validation import check_memory
# mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast'
from . import _hierarchical_fast as _hierarchical # type: ignore
from ._feature_agglomeration import AgglomerationTransform
###############################################################################
# For non fully-connected graphs
def _fix_connectivity(X, connectivity, affinity):
    """
    Validate, symmetrize and, if needed, complete a connectivity matrix.

    The different steps are:

    - converts a dense array-like input to an ndarray
    - checks that its shape is ``(n_samples, n_samples)``
    - makes it symmetric (this builds a new matrix, the input is not
      modified)
    - converts it to LIL if necessary
    - completes it if it has more than one connected component.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix representing `n_samples` samples to be clustered.
        Only ``X.shape`` is read here; `X` itself is forwarded to
        `_fix_connected_components` when the graph has to be completed.

    connectivity : {array-like, sparse matrix} of shape (n_samples, n_samples)
        Connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data.

    affinity : str or callable
        Forwarded as ``metric`` to `_fix_connected_components` when the
        graph has to be completed.

    Returns
    -------
    connectivity : sparse matrix
        The fixed connectivity matrix, in LIL format.

    n_connected_components : int
        The number of connected components found in the symmetrized graph
        *before* completion.

    Raises
    ------
    ValueError
        If `connectivity` is not of shape ``(n_samples, n_samples)``.
    """
    n_samples = X.shape[0]

    # The public tree builders accept any array-like connectivity; nested
    # lists have no `.shape`/`.T`, so normalise dense inputs up front.
    if not sparse.issparse(connectivity):
        connectivity = np.asarray(connectivity)

    # Comparing the whole shape tuple also rejects inputs that are not 2-D
    # with the intended error instead of an IndexError.
    if connectivity.shape != (n_samples, n_samples):
        raise ValueError(
            "Wrong shape for connectivity matrix: %s when X is %s"
            % (connectivity.shape, X.shape)
        )

    # Make the connectivity matrix symmetric:
    connectivity = connectivity + connectivity.T

    # Convert connectivity matrix to LIL
    if not sparse.issparse(connectivity):
        connectivity = sparse.lil_matrix(connectivity)

    # `connectivity` is a sparse matrix at this point
    if connectivity.format != "lil":
        connectivity = connectivity.tolil()

    # Count the connected components of the graph and label each sample
    n_connected_components, labels = connected_components(connectivity)

    if n_connected_components > 1:
        warnings.warn(
            "the number of connected components of the "
            "connectivity matrix is %d > 1. Completing it to avoid "
            "stopping the tree early." % n_connected_components,
            stacklevel=2,
        )
        # XXX: Can we do without completing the matrix?
        connectivity = _fix_connected_components(
            X=X,
            graph=connectivity,
            n_connected_components=n_connected_components,
            component_labels=labels,
            metric=affinity,
            mode="connectivity",
        )

    return connectivity, n_connected_components
def _single_linkage_tree(
    connectivity,
    n_samples,
    n_nodes,
    n_clusters,
    n_connected_components,
    return_distance,
):
    """
    Build a single-linkage tree from a sparse distance graph.

    A minimum spanning tree of `connectivity` is computed with
    scipy.sparse.csgraph, its edges are sorted by weight and handed to
    `_hierarchical._single_linkage_label`; the first two columns of that
    result are used as the children array and the third as the merge
    distances. The parent array is then filled by walking the children.

    Returns ``(children, n_connected_components, n_samples, parent)`` plus
    the merge distances as a fifth element when `return_distance` is true.
    """
    from scipy.sparse.csgraph import minimum_spanning_tree

    # Work in float64; no copy is made when the dtype already matches.
    graph = connectivity.astype(np.float64, copy=False)

    # Explicit zeros would be treated as "no edge": bump them to epsilon
    # for the spanning-tree computation and restore them afterwards.
    eps = np.finfo(dtype=graph.data.dtype).eps
    graph.data[graph.data == 0] = eps
    tree = minimum_spanning_tree(graph.tocsr()).tocoo()
    tree.data[tree.data == eps] = 0

    # One (row, col, weight) triple per edge, ordered by increasing weight
    # with a stable sort.
    edges = np.vstack([tree.row, tree.col, tree.data]).T
    by_weight = np.argsort(edges.T[2], kind="mergesort")
    edges = edges[by_weight, :]

    # Convert the sorted edge list into hierarchical clustering format.
    merges = _hierarchical._single_linkage_label(edges)
    children_ = merges[:, :2].astype(int)

    # Merge number m creates node `n_samples + m`; record it as the parent
    # of both children, ignoring anything at or beyond `n_nodes`.
    parent = np.arange(n_nodes, dtype=np.intp)
    stop_early = n_clusters is not None
    for node, (left, right) in enumerate(children_, n_samples):
        if stop_early and node >= n_nodes:
            break
        if left < n_nodes:
            parent[left] = node
        if right < n_nodes:
            parent[right] = node

    result = (children_, n_connected_components, n_samples, parent)
    if return_distance:
        result = result + (merges[:, 2],)
    return result
###############################################################################
# Hierarchical tree building functions
@validate_params(
    {
        "X": ["array-like"],
        "connectivity": ["array-like", "sparse matrix", None],
        "n_clusters": [Interval(Integral, 1, None, closed="left"), None],
        "return_distance": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False):
    """Ward clustering based on a Feature matrix.

    Recursively merges the pair of clusters that minimally increases
    within-cluster variance.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples. When no connectivity is given, the work is
    delegated to :func:`scipy.cluster.hierarchy.ward`.

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix representing `n_samples` samples to be clustered.
        A 1-D input is treated as a single feature column.

    connectivity : {array-like, sparse matrix}, default=None
        Connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is symmetrized
        internally and only one triangular half is used to seed the merge
        candidates. Default is None, i.e, the Ward algorithm is unstructured.

    n_clusters : int, default=None
        `n_clusters` must not exceed `n_samples`. Stop early the
        construction of the tree at `n_clusters.` This is useful to decrease
        computation time if the number of clusters is not small compared to the
        number of samples. In this case, the complete tree is not computed, thus
        the 'children' output is of limited use, and the 'parents' output should
        rather be used. This option is valid only when specifying a connectivity
        matrix; without one a warning is issued and the full tree is built.

    return_distance : bool, default=False
        If `True`, return the distance between the clusters.

    Returns
    -------
    children : ndarray of shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`.

    n_connected_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree.

    parents : ndarray of shape (n_nodes,) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, otherwise `None` is returned.

    distances : ndarray of shape (n_nodes-1,)
        Only returned if `return_distance` is set to `True` (for compatibility).
        The distances between the centers of the nodes. `distances[i]`
        corresponds to a weighted Euclidean distance between
        the nodes `children[i, 0]` and `children[i, 1]`. If the nodes refer to
        leaves of the tree, then `distances[i]` is their unweighted Euclidean
        distance. Distances are updated in the following way
        (from scipy.hierarchy.linkage):

        The new entry :math:`d(u,v)` is computed as follows,

        .. math::

           d(u,v) = \\sqrt{\\frac{|v|+|s|}
                               {T}d(v,s)^2
                        + \\frac{|v|+|t|}
                               {T}d(v,t)^2
                        - \\frac{|v|}
                               {T}d(s,t)^2}

        where :math:`u` is the newly joined cluster consisting of
        clusters :math:`s` and :math:`t`, :math:`v` is an unused
        cluster in the forest, :math:`T=|v|+|s|+|t|`, and
        :math:`|*|` is the cardinality of its argument. This is also
        known as the incremental algorithm.

    Raises
    ------
    ValueError
        If a connectivity matrix is given and `n_clusters` is larger than
        `n_samples`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.cluster import ward_tree
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [4, 2], [4, 4], [4, 0]])
    >>> children, n_connected_components, n_leaves, parents = ward_tree(X)
    >>> children
    array([[0, 1],
           [3, 5],
           [2, 6],
           [4, 7],
           [8, 9]])
    >>> n_connected_components
    1
    >>> n_leaves
    6
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    if connectivity is None:
        # Imported lazily: only the unstructured path needs scipy's
        # hierarchy module.
        from scipy.cluster import hierarchy

        if n_clusters is not None:
            warnings.warn(
                (
                    "Partial build of the tree is implemented "
                    "only for structured clustering (i.e. with "
                    "explicit connectivity). The algorithm "
                    "will build the full tree and only "
                    "retain the lower branches required "
                    "for the specified number of clusters"
                ),
                stacklevel=2,
            )
        # Hand scipy a writeable array (copies only if X is read-only).
        X = np.require(X, requirements="W")
        out = hierarchy.ward(X)
        children_ = out[:, :2].astype(np.intp)

        if return_distance:
            distances = out[:, 2]
            return children_, 1, n_samples, None, distances
        else:
            return children_, 1, n_samples, None

    connectivity, n_connected_components = _fix_connectivity(
        X, connectivity, affinity="euclidean"
    )
    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        if n_clusters > n_samples:
            raise ValueError(
                "Cannot provide more clusters than samples. "
                "%i n_clusters was asked, and there are %i "
                "samples." % (n_clusters, n_samples)
            )
        n_nodes = 2 * n_samples - n_clusters

    # create inertia matrix
    coord_row = []
    coord_col = []
    # A[node] is the neighbor list of `node`; the LIL row lists themselves
    # are stored and extended in place as new nodes are created.
    A = []
    for ind, row in enumerate(connectivity.rows):
        A.append(row)
        # Each undirected edge is seeded once: keep only neighbors with a
        # smaller index than the current row.
        row = [i for i in row if i < ind]
        coord_row.extend(len(row) * [ind])
        coord_col.extend(row)

    coord_row = np.array(coord_row, dtype=np.intp, order="C")
    coord_col = np.array(coord_col, dtype=np.intp, order="C")

    # build moments as a list
    # moments_1 is initialised to 1 per sample and moments_2 to the sample
    # itself; both are summed on every merge.
    moments_1 = np.zeros(n_nodes, order="C")
    moments_1[:n_samples] = 1
    moments_2 = np.zeros((n_nodes, n_features), order="C")
    moments_2[:n_samples] = X
    inertia = np.empty(len(coord_row), dtype=np.float64, order="C")
    _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, inertia)
    # Min-heap of (inertia, node_a, node_b) candidate merges
    inertia = list(zip(inertia, coord_row, coord_col))
    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=bool)
    children = []
    if return_distance:
        distances = np.empty(n_nodes - n_samples)

    not_visited = np.empty(n_nodes, dtype=bool, order="C")

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge: stale heap entries that reference an already
        # merged node are discarded lazily here
        while True:
            inert, i, j = heappop(inertia)
            if used_node[i] and used_node[j]:
                break
        parent[i], parent[j] = k, k
        children.append((i, j))
        used_node[i] = used_node[j] = False
        if return_distance:  # store inertia value
            distances[k - n_samples] = inert

        # update the moments
        moments_1[k] = moments_1[i] + moments_1[j]
        moments_2[k] = moments_2[i] + moments_2[j]

        # update the structure matrix A and the inertia matrix
        # `_get_parents` appends to `neighbors`; presumably these are the
        # current top-level clusters adjacent to i or j -- confirm against
        # the Cython helper.
        neighbors = []
        not_visited.fill(1)
        not_visited[k] = 0
        _hierarchical._get_parents(A[i], neighbors, parent, not_visited)
        _hierarchical._get_parents(A[j], neighbors, parent, not_visited)
        for col in neighbors:
            A[col].append(k)
        A.append(neighbors)

        # Ward distance between the new node k and each of its neighbors
        cols = np.array(neighbors, dtype=np.intp, order="C")
        rows = np.full(cols.shape, k, dtype=np.intp)
        ini = np.empty(len(rows), dtype=np.float64, order="C")
        _hierarchical.compute_ward_dist(moments_1, moments_2, rows, cols, ini)
        for dist, col in zip(ini, cols):
            heappush(inertia, (dist, k, col))

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples
    # sort children to get consistent output with unstructured version
    children = [c[::-1] for c in children]
    children = np.array(children)  # return numpy array for efficient caching

    if return_distance:
        # 2 is scaling factor to compare w/ unstructured version
        distances = np.sqrt(2.0 * distances)
        return children, n_connected_components, n_leaves, parent, distances
    else:
        return children, n_connected_components, n_leaves, parent
# single average and complete linkage
def linkage_tree(
    X,
    connectivity=None,
    n_clusters=None,
    linkage="complete",
    affinity="euclidean",
    return_distance=False,
):
    """Linkage agglomerative clustering based on a Feature matrix.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples. When no connectivity is given, the tree is
    built by scipy (or by a dedicated minimum-spanning-tree routine for
    single linkage with a supported metric).

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix representing `n_samples` samples to be clustered,
        or a square distance matrix when ``affinity="precomputed"``.

    connectivity : sparse matrix, default=None
        Connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is symmetrized
        internally and only one triangular half is used to seed the merge
        candidates. Default is `None`, i.e, the hierarchical clustering
        algorithm is unstructured.

    n_clusters : int, default=None
        Stop early the construction of the tree at `n_clusters`. This is
        useful to decrease computation time if the number of clusters is
        not small compared to the number of samples. In this case, the
        complete tree is not computed, thus the 'children' output is of
        limited use, and the 'parents' output should rather be used.
        This option is valid only when specifying a connectivity matrix;
        without one a warning is issued and the full tree is built.

    linkage : {"average", "complete", "single"}, default="complete"
        Which linkage criteria to use. The linkage criterion determines which
        distance to use between sets of observation.

        - "average" uses the average of the distances of each observation of
          the two sets.
        - "complete" or maximum linkage uses the maximum distances between
          all observations of the two sets.
        - "single" uses the minimum of the distances between all
          observations of the two sets.

    affinity : str or callable, default='euclidean'
        Which metric to use. Can be 'euclidean', 'manhattan', or any
        distance known to paired distance (see metric.pairwise).

    return_distance : bool, default=False
        Whether or not to return the distances between the clusters.

    Returns
    -------
    children : ndarray of shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`.

    n_connected_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree.

    parents : ndarray of shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, otherwise `None` is returned.

    distances : ndarray of shape (n_nodes-1,)
        Returned when `return_distance` is set to `True`.

        distances[i] refers to the distance between children[i][0] and
        children[i][1] when they are merged.

    Raises
    ------
    ValueError
        If `linkage` is unknown, if cosine affinity is requested while `X`
        contains an all-zero row, if a precomputed distance matrix is not
        square, or if a connectivity matrix is given and `n_clusters` is
        larger than `n_samples`.

    See Also
    --------
    ward_tree : Hierarchical clustering with ward linkage.
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    linkage_choices = {
        "complete": _hierarchical.max_merge,
        "average": _hierarchical.average_merge,
        "single": None,
    }  # Single linkage is handled differently
    try:
        join_func = linkage_choices[linkage]
    except KeyError as e:
        raise ValueError(
            "Unknown linkage option, linkage should be one of %s, but %s was given"
            % (linkage_choices.keys(), linkage)
        ) from e

    # A row of X with no non-zero entry is a zero vector
    if affinity == "cosine" and np.any(~np.any(X, axis=1)):
        raise ValueError("Cosine affinity cannot be used when X contains zero vectors")

    if connectivity is None:
        # Imported lazily: only the unstructured path needs scipy's
        # hierarchy module.
        from scipy.cluster import hierarchy

        if n_clusters is not None:
            warnings.warn(
                (
                    "Partial build of the tree is implemented "
                    "only for structured clustering (i.e. with "
                    "explicit connectivity). The algorithm "
                    "will build the full tree and only "
                    "retain the lower branches required "
                    "for the specified number of clusters"
                ),
                stacklevel=2,
            )

        if affinity == "precomputed":
            # for the linkage function of hierarchy to work on precomputed
            # data, provide as first argument an ndarray of the shape returned
            # by sklearn.metrics.pairwise_distances.
            if X.shape[0] != X.shape[1]:
                raise ValueError(
                    f"Distance matrix should be square, got matrix of shape {X.shape}"
                )
            # Keep only the strict upper triangle, as a flat vector
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        elif affinity == "l2":
            # Translate to something understood by scipy
            affinity = "euclidean"
        elif affinity in ("l1", "manhattan"):
            affinity = "cityblock"
        elif callable(affinity):
            # The callable is applied to X and its result is flattened like
            # a precomputed distance matrix
            X = affinity(X)
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        if (
            linkage == "single"
            and affinity != "precomputed"
            and not callable(affinity)
            and affinity in METRIC_MAPPING64
        ):
            # We need the fast cythonized metric from neighbors
            dist_metric = DistanceMetric.get_metric(affinity)

            # The Cython routines used require contiguous arrays
            X = np.ascontiguousarray(X, dtype=np.double)

            mst = _hierarchical.mst_linkage_core(X, dist_metric)
            # Sort edges of the min_spanning_tree by weight
            mst = mst[np.argsort(mst.T[2], kind="mergesort"), :]

            # Convert edge list into standard hierarchical clustering format
            out = _hierarchical.single_linkage_label(mst)
        else:
            out = hierarchy.linkage(X, method=linkage, metric=affinity)
        children_ = out[:, :2].astype(int, copy=False)

        if return_distance:
            distances = out[:, 2]
            return children_, 1, n_samples, None, distances
        return children_, 1, n_samples, None

    connectivity, n_connected_components = _fix_connectivity(
        X, connectivity, affinity=affinity
    )
    connectivity = connectivity.tocoo()
    # Put the diagonal to zero
    diag_mask = connectivity.row != connectivity.col
    connectivity.row = connectivity.row[diag_mask]
    connectivity.col = connectivity.col[diag_mask]
    connectivity.data = connectivity.data[diag_mask]
    del diag_mask

    if affinity == "precomputed":
        distances = X[connectivity.row, connectivity.col].astype(np.float64, copy=False)
    else:
        # FIXME We compute all the distances, while we could have only computed
        # the "interesting" distances
        distances = paired_distances(
            X[connectivity.row], X[connectivity.col], metric=affinity
        )
    # From here on the graph stores edge distances rather than 0/1 links
    connectivity.data = distances

    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        # Explicit check rather than `assert`, which is stripped under -O;
        # same error as ward_tree for the same condition.
        if n_clusters > n_samples:
            raise ValueError(
                "Cannot provide more clusters than samples. "
                "%i n_clusters was asked, and there are %i "
                "samples." % (n_clusters, n_samples)
            )
        n_nodes = 2 * n_samples - n_clusters

    if linkage == "single":
        return _single_linkage_tree(
            connectivity,
            n_samples,
            n_nodes,
            n_clusters,
            n_connected_components,
            return_distance,
        )

    if return_distance:
        distances = np.empty(n_nodes - n_samples)
    # create inertia heap and connection matrix
    A = np.empty(n_nodes, dtype=object)
    inertia = list()

    # LIL seems to the best format to access the rows quickly,
    # without the numpy overhead of slicing CSR indices and data.
    connectivity = connectivity.tolil()
    # We are storing the graph in a list of IntFloatDict
    for ind, (data, row) in enumerate(zip(connectivity.data, connectivity.rows)):
        A[ind] = IntFloatDict(
            np.asarray(row, dtype=np.intp), np.asarray(data, dtype=np.float64)
        )
        # We keep only one triangular half for the heap (neighbors with a
        # smaller index), so each undirected edge is seeded once
        inertia.extend(
            _hierarchical.WeightedEdge(d, ind, r) for r, d in zip(row, data) if r < ind
        )
    del connectivity

    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    # used_node[n] holds the number of samples in node n while it is still
    # available for merging, and 0 once it has been merged
    used_node = np.ones(n_nodes, dtype=np.intp)
    children = []

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge: stale heap entries that reference an already
        # merged node are discarded lazily here
        while True:
            edge = heappop(inertia)
            if used_node[edge.a] and used_node[edge.b]:
                break
        i = edge.a
        j = edge.b

        if return_distance:
            # store distances
            distances[k - n_samples] = edge.weight

        parent[i] = parent[j] = k
        children.append((i, j))
        # Keep track of the number of elements per cluster
        n_i = used_node[i]
        n_j = used_node[j]
        used_node[k] = n_i + n_j
        used_node[i] = used_node[j] = False

        # update the structure matrix A and the inertia matrix
        # a clever 'min', or 'max' operation between A[i] and A[j]
        coord_col = join_func(A[i], A[j], used_node, n_i, n_j)
        for col, d in coord_col:
            A[col].append(k, d)
            # Here we use the information from coord_col (containing the
            # distances) to update the heap
            heappush(inertia, _hierarchical.WeightedEdge(d, k, col))
        A[k] = coord_col
        # Clear A[i] and A[j] to save memory
        A[i] = A[j] = 0

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples

    # return numpy array for efficient caching; the columns are reversed
    # so that each pair is stored as (j, i)
    children = np.array(children)[:, ::-1]

    if return_distance:
        return children, n_connected_components, n_leaves, parent, distances
    return children, n_connected_components, n_leaves, parent
# Matching names to tree-building strategies
def _complete_linkage(*args, **kwargs):
    """Call `linkage_tree` with ``linkage`` forced to ``"complete"``."""
    kwargs.update(linkage="complete")
    return linkage_tree(*args, **kwargs)
def _average_linkage(*args, **kwargs):
    """Call `linkage_tree` with ``linkage`` forced to ``"average"``."""
    kwargs.update(linkage="average")
    return linkage_tree(*args, **kwargs)
def _single_linkage(*args, **kwargs):
    """Call `linkage_tree` with ``linkage`` forced to ``"single"``."""
    kwargs.update(linkage="single")
    return linkage_tree(*args, **kwargs)
# Maps each supported `linkage` name to the function that builds its tree.
_TREE_BUILDERS = {
    "ward": ward_tree,
    "complete": _complete_linkage,
    "average": _average_linkage,
    "single": _single_linkage,
}
###############################################################################
# Functions for cutting hierarchical clustering tree
def _hc_cut(n_clusters, children, n_leaves):
    """Cut a hierarchical clustering tree into a given number of clusters.

    Parameters
    ----------
    n_clusters : int
        The number of clusters to form.

    children : ndarray of shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`.

    n_leaves : int
        Number of leaves of the tree.

    Returns
    -------
    labels : ndarray of shape (n_leaves,)
        Cluster labels for each point.

    Raises
    ------
    ValueError
        If `n_clusters` is larger than `n_leaves`.
    """
    if n_clusters > n_leaves:
        raise ValueError(
            "Cannot extract more clusters than samples: "
            "%s clusters where given for a tree with %s leaves."
            % (n_clusters, n_leaves)
        )

    # The frontier of the cut is kept as a heap of *negated* node ids:
    # heapq is a min-heap, so frontier[0] is always the node with the
    # largest id, i.e. the one to split next.
    # children[-1] holds the last merge, so the root is one past its
    # largest child.
    root = max(children[-1]) + 1
    frontier = [-root]
    for _ in range(n_clusters - 1):
        pair = children[-frontier[0] - n_leaves]
        # Add the first child, then add the second while popping the
        # smallest entry of the heap in the same operation.
        heappush(frontier, -pair[0])
        heappushpop(frontier, -pair[1])

    label = np.zeros(n_leaves, dtype=np.intp)
    for cluster_id, negated_node in enumerate(frontier):
        members = _hierarchical._hc_get_descendent(-negated_node, children, n_leaves)
        label[members] = cluster_id
    return label
###############################################################################
class AgglomerativeClustering(ClusterMixin, BaseEstimator):
"""
Agglomerative Clustering.
Recursively merges pair of clusters of sample data; uses linkage distance.
Read more in the :ref:`User Guide <hierarchical_clustering>`.
Parameters
----------
n_clusters : int or None, default=2
The number of clusters to find. It must be ``None`` if
``distance_threshold`` is not ``None``.
metric : str or callable, default="euclidean"
Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
"manhattan", "cosine", or "precomputed". If linkage is "ward", only
"euclidean" is accepted. If "precomputed", a distance matrix is needed
as input for the fit method.
.. versionadded:: 1.2
.. deprecated:: 1.4
`metric=None` is deprecated in 1.4 and will be removed in 1.6.
Let `metric` be the default value (i.e. `"euclidean"`) instead.
memory : str or object with the joblib.Memory interface, default=None
Used to cache the output of the computation of the tree.
By default, no caching is done. If a string is given, it is the
path to the caching directory.
connectivity : array-like or callable, default=None
Connectivity matrix. Defines for each sample the neighboring
samples following a given structure of the data.
This can be a connectivity matrix itself or a callable that transforms
the data into a connectivity matrix, such as derived from
`kneighbors_graph`. Default is ``None``, i.e, the
hierarchical clustering algorithm is unstructured.
compute_full_tree : 'auto' or bool, default='auto'
Stop early the construction of the tree at ``n_clusters``. This is
useful to decrease computation time if the number of clusters is not
small compared to the number of samples. This option is useful only
when specifying a connectivity matrix. Note also that when varying the
number of clusters and using caching, it may be advantageous to compute
the full tree. It must be ``True`` if ``distance_threshold`` is not
``None``. By default `compute_full_tree` is "auto", which is equivalent
to `True` when `distance_threshold` is not `None` or that `n_clusters`
is inferior to the maximum between 100 or `0.02 * n_samples`.
Otherwise, "auto" is equivalent to `False`.
linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
Which linkage criterion to use. The linkage criterion determines which
distance to use between sets of observation. The algorithm will merge
the pairs of cluster that minimize this criterion.
- 'ward' minimizes the variance of the clusters being merged.
- 'average' uses the average of the distances of each observation of
the two sets.
- 'complete' or 'maximum' linkage uses the maximum distances between
all observations of the two sets.
- 'single' uses the minimum of the distances between all observations
of the two sets.
.. versionadded:: 0.20
Added the 'single' option
distance_threshold : float, default=None
The linkage distance threshold at or above which clusters will not be
merged. If not ``None``, ``n_clusters`` must be ``None`` and
``compute_full_tree`` must be ``True``.
.. versionadded:: 0.21
compute_distances : bool, default=False
Computes distances between clusters even if `distance_threshold` is not
used. This can be used to make dendrogram visualization, but introduces
a computational and memory overhead.
.. versionadded:: 0.24
Attributes
----------
n_clusters_ : int
The number of clusters found by the algorithm. If
``distance_threshold=None``, it will be equal to the given
``n_clusters``.
labels_ : ndarray of shape (n_samples)
Cluster labels for each point.
n_leaves_ : int
Number of leaves in the hierarchical tree.
n_connected_components_ : int
The estimated number of connected components in the graph.
.. versionadded:: 0.21
``n_connected_components_`` was added to replace ``n_components_``.
n_features_in_ : int
Number of features seen during :term:`fit`.
.. versionadded:: 0.24
feature_names_in_ : ndarray of shape (`n_features_in_`,)
Names of features seen during :term:`fit`. Defined only when `X`
has feature names that are all strings.
.. versionadded:: 1.0
children_ : array-like of shape (n_samples-1, 2)
The children of each non-leaf node. Values less than `n_samples`
correspond to leaves of the tree which are the original samples.
A node `i` greater than or equal to `n_samples` is a non-leaf
node and has children `children_[i - n_samples]`. Alternatively
at the i-th iteration, children[i][0] and children[i][1]
are merged to form node `n_samples + i`.
distances_ : array-like of shape (n_nodes-1,)
Distances between nodes in the corresponding place in `children_`.
Only computed if `distance_threshold` is used or `compute_distances`
is set to `True`.
See Also
--------
FeatureAgglomeration : Agglomerative clustering but for features instead of
samples.
ward_tree : Hierarchical clustering with ward linkage.
Examples
--------
>>> from sklearn.cluster import AgglomerativeClustering
>>> import numpy as np
>>> X = np.array([[1, 2], [1, 4], [1, 0],
... [4, 2], [4, 4], [4, 0]])
>>> clustering = AgglomerativeClustering().fit(X)
>>> clustering
AgglomerativeClustering()
>>> clustering.labels_
array([1, 1, 1, 0, 0, 0])
"""
_parameter_constraints: dict = {
"n_clusters": [Interval(Integral, 1, None, closed="left"), None],
"metric": [
StrOptions(set(_VALID_METRICS) | {"precomputed"}),
callable,
Hidden(None),
],
"memory": [str, HasMethods("cache"), None],
"connectivity": ["array-like", callable, None],
"compute_full_tree": [StrOptions({"auto"}), "boolean"],
"linkage": [StrOptions(set(_TREE_BUILDERS.keys()))],
"distance_threshold": [Interval(Real, 0, None, closed="left"), None],
"compute_distances": ["boolean"],
}
def __init__(
self,
n_clusters=2,
*,
metric="euclidean",
memory=None,
connectivity=None,
compute_full_tree="auto",
linkage="ward",
distance_threshold=None,
compute_distances=False,
):
self.n_clusters = n_clusters
self.distance_threshold = distance_threshold
self.memory = memory
self.connectivity = connectivity
self.compute_full_tree = compute_full_tree
self.linkage = linkage
self.metric = metric
self.compute_distances = compute_distances
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
"""Fit the hierarchical clustering from features, or distance matrix.
Parameters
----------
X : array-like, shape (n_samples, n_features) or \
(n_samples, n_samples)
Training instances to cluster, or distances between instances if
``metric='precomputed'``.
y : Ignored
Not used, present here for API consistency by convention.
Returns
-------
self : object
Returns the fitted instance.
"""
X = self._validate_data(X, ensure_min_samples=2)
return self._fit(X)
def _fit(self, X):
"""Fit without validation
Parameters
----------
X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
Training instances to cluster, or distances between instances if
``affinity='precomputed'``.
Returns
-------
self : object
Returns the fitted instance.
"""
memory = check_memory(self.memory)
# TODO(1.6): remove in 1.6
if self.metric is None:
warnings.warn(
(
"`metric=None` is deprecated in version 1.4 and will be removed in "
"version 1.6. Let `metric` be the default value "
"(i.e. `'euclidean'`) instead."
),
FutureWarning,
)
self._metric = "euclidean"
else:
self._metric = self.metric
if not ((self.n_clusters is None) ^ (self.distance_threshold is None)):
raise ValueError(
"Exactly one of n_clusters and "
"distance_threshold has to be set, and the other "
"needs to be None."
)
if self.distance_threshold is not None and not self.compute_full_tree:
raise ValueError(
"compute_full_tree must be True if distance_threshold is set."
)
if self.linkage == "ward" and self._metric != "euclidean":
raise ValueError(
f"{self._metric} was provided as metric. Ward can only "
"work with euclidean distances."
)
tree_builder = _TREE_BUILDERS[self.linkage]
connectivity = self.connectivity
if self.connectivity is not None:
if callable(self.connectivity):
connectivity = self.connectivity(X)
connectivity = check_array(
connectivity, accept_sparse=["csr", "coo", "lil"]
)
n_samples = len(X)
compute_full_tree = self.compute_full_tree
if self.connectivity is None:
compute_full_tree = True
if compute_full_tree == "auto":
if self.distance_threshold is not None:
compute_full_tree = True
else:
# Early stopping is likely to give a speed up only for
# a large number of clusters. The actual threshold
# implemented here is heuristic
compute_full_tree = self.n_clusters < max(100, 0.02 * n_samples)
n_clusters = self.n_clusters
if compute_full_tree:
n_clusters = None
# Construct the tree
kwargs = {}
if self.linkage != "ward":
kwargs["linkage"] = self.linkage
kwargs["affinity"] = self._metric
distance_threshold = self.distance_threshold
return_distance = (distance_threshold is not None) or self.compute_distances
out = memory.cache(tree_builder)(
X,
connectivity=connectivity,
n_clusters=n_clusters,
return_distance=return_distance,
**kwargs,
)
(self.children_, self.n_connected_components_, self.n_leaves_, parents) = out[
:4
]
if return_distance:
self.distances_ = out[-1]
if self.distance_threshold is not None: # distance_threshold is used
self.n_clusters_ = (
np.count_nonzero(self.distances_ >= distance_threshold) + 1
)
else: # n_clusters is used
self.n_clusters_ = self.n_clusters
# Cut the tree
if compute_full_tree:
self.labels_ = _hc_cut(self.n_clusters_, self.children_, self.n_leaves_)
else:
labels = _hierarchical.hc_get_heads(parents, copy=False)
# copy to avoid holding a reference on the original array
labels = np.copy(labels[:n_samples])
# Reassign cluster numbers
self.labels_ = np.searchsorted(np.unique(labels), labels)
return self
def fit_predict(self, X, y=None):
    """Fit the clustering and return the cluster label of each sample.

    Besides fitting the estimator, this method returns the cluster
    assignment of every sample in the training set. The work is delegated
    to the parent class implementation of ``fit_predict``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features) or \
            (n_samples, n_samples)
        Training instances to cluster, or distances between instances if
        ``metric='precomputed'``.

    y : Ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Cluster labels.
    """
    labels = super().fit_predict(X, y)
    return labels
class FeatureAgglomeration(
    ClassNamePrefixFeaturesOutMixin, AgglomerativeClustering, AgglomerationTransform
):
    """Agglomerate features.

    Recursively merges pairs of clusters of features.

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    n_clusters : int or None, default=2
        The number of clusters to find. It must be ``None`` if
        ``distance_threshold`` is not ``None``.

    metric : str or callable, default="euclidean"
        Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
        "manhattan", "cosine", or "precomputed". If linkage is "ward", only
        "euclidean" is accepted. If "precomputed", a distance matrix is needed
        as input for the fit method.

        .. versionadded:: 1.2

        .. deprecated:: 1.4
           `metric=None` is deprecated in 1.4 and will be removed in 1.6.
           Let `metric` be the default value (i.e. `"euclidean"`) instead.

    memory : str or object with the joblib.Memory interface, default=None
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    connectivity : array-like or callable, default=None
        Connectivity matrix. Defines for each feature the neighboring
        features following a given structure of the data.
        This can be a connectivity matrix itself or a callable that transforms
        the data into a connectivity matrix, such as derived from
        `kneighbors_graph`. Default is `None`, i.e, the
        hierarchical clustering algorithm is unstructured.

    compute_full_tree : 'auto' or bool, default='auto'
        Stop early the construction of the tree at `n_clusters`. This is useful
        to decrease computation time if the number of clusters is not small
        compared to the number of features. This option is useful only when
        specifying a connectivity matrix. Note also that when varying the
        number of clusters and using caching, it may be advantageous to compute
        the full tree. It must be ``True`` if ``distance_threshold`` is not
        ``None``. By default `compute_full_tree` is "auto", which is equivalent
        to `True` when `distance_threshold` is not `None` or when `n_clusters`
        is lower than the maximum of 100 and `0.02 * n_features` (the tree is
        built on the transposed data, so features play the role of samples).
        Otherwise, "auto" is equivalent to `False`.

    linkage : {"ward", "complete", "average", "single"}, default="ward"
        Which linkage criterion to use. The linkage criterion determines which
        distance to use between sets of features. The algorithm will merge
        the pairs of clusters that minimize this criterion.

        - "ward" minimizes the variance of the clusters being merged.
        - "complete" or maximum linkage uses the maximum distances between
          all features of the two sets.
        - "average" uses the average of the distances of each feature of
          the two sets.
        - "single" uses the minimum of the distances between all features
          of the two sets.

    pooling_func : callable, default=np.mean
        This combines the values of agglomerated features into a single
        value, and should accept an array of shape [M, N] and the keyword
        argument `axis=1`, and reduce it to an array of size [M].

    distance_threshold : float, default=None
        The linkage distance threshold at or above which clusters will not be
        merged. If not ``None``, ``n_clusters`` must be ``None`` and
        ``compute_full_tree`` must be ``True``.

        .. versionadded:: 0.21

    compute_distances : bool, default=False
        Computes distances between clusters even if `distance_threshold` is not
        used. This can be used to make dendrogram visualization, but introduces
        a computational and memory overhead.

        .. versionadded:: 0.24

    Attributes
    ----------
    n_clusters_ : int
        The number of clusters found by the algorithm. If
        ``distance_threshold=None``, it will be equal to the given
        ``n_clusters``.

    labels_ : array-like of shape (n_features,)
        Cluster labels for each feature.

    n_leaves_ : int
        Number of leaves in the hierarchical tree.

    n_connected_components_ : int
        The estimated number of connected components in the graph.

        .. versionadded:: 0.21
            ``n_connected_components_`` was added to replace ``n_components_``.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    children_ : array-like of shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_features`
        correspond to leaves of the tree which are the original features.
        A node `i` greater than or equal to `n_features` is a non-leaf
        node and has children `children_[i - n_features]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_features + i`.

    distances_ : array-like of shape (n_nodes-1,)
        Distances between nodes in the corresponding place in `children_`.
        Only computed if `distance_threshold` is used or `compute_distances`
        is set to `True`.

    See Also
    --------
    AgglomerativeClustering : Agglomerative clustering samples instead of
        features.
    ward_tree : Hierarchical clustering with ward linkage.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import datasets, cluster
    >>> digits = datasets.load_digits()
    >>> images = digits.images
    >>> X = np.reshape(images, (len(images), -1))
    >>> agglo = cluster.FeatureAgglomeration(n_clusters=32)
    >>> agglo.fit(X)
    FeatureAgglomeration(n_clusters=32)
    >>> X_reduced = agglo.transform(X)
    >>> X_reduced.shape
    (1797, 32)
    """

    # Accepted types/values for every constructor parameter.
    _parameter_constraints: dict = {
        "n_clusters": [Interval(Integral, 1, None, closed="left"), None],
        "metric": [
            StrOptions({*_VALID_METRICS, "precomputed"}),
            callable,
            Hidden(None),
        ],
        "memory": [str, HasMethods("cache"), None],
        "connectivity": ["array-like", callable, None],
        "compute_full_tree": [StrOptions({"auto"}), "boolean"],
        "linkage": [StrOptions(set(_TREE_BUILDERS))],
        "pooling_func": [callable],
        "distance_threshold": [Interval(Real, 0, None, closed="left"), None],
        "compute_distances": ["boolean"],
    }

    def __init__(
        self,
        n_clusters=2,
        *,
        metric="euclidean",
        memory=None,
        connectivity=None,
        compute_full_tree="auto",
        linkage="ward",
        pooling_func=np.mean,
        distance_threshold=None,
        compute_distances=False,
    ):
        # Everything except `pooling_func` is forwarded to the parent
        # constructor by keyword.
        super().__init__(
            n_clusters=n_clusters,
            metric=metric,
            linkage=linkage,
            memory=memory,
            connectivity=connectivity,
            compute_full_tree=compute_full_tree,
            distance_threshold=distance_threshold,
            compute_distances=compute_distances,
        )
        self.pooling_func = pooling_func

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the hierarchical clustering on the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Returns the transformer.
        """
        validated = self._validate_data(X, ensure_min_features=2)
        # The tree is built on the transpose so that the features, not the
        # samples, are the items being clustered.
        super()._fit(validated.T)
        # One output feature per cluster found.
        self._n_features_out = self.n_clusters_
        return self

    @property
    def fit_predict(self):
        """Unavailable on this class; accessing it raises `AttributeError`.

        Because attribute access raises `AttributeError`,
        ``hasattr(estimator, "fit_predict")`` evaluates to ``False``.
        """
        raise AttributeError