1337 lines
48 KiB
Python
1337 lines
48 KiB
Python
|
"""Hierarchical Agglomerative Clustering
|
||
|
|
||
|
These routines perform some hierarchical agglomerative clustering of some
|
||
|
input data.
|
||
|
|
||
|
Authors : Vincent Michel, Bertrand Thirion, Alexandre Gramfort,
|
||
|
Gael Varoquaux
|
||
|
License: BSD 3 clause
|
||
|
"""
|
||
|
import warnings
|
||
|
from heapq import heapify, heappop, heappush, heappushpop
|
||
|
from numbers import Integral, Real
|
||
|
|
||
|
import numpy as np
|
||
|
from scipy import sparse
|
||
|
from scipy.sparse.csgraph import connected_components
|
||
|
|
||
|
from ..base import (
|
||
|
BaseEstimator,
|
||
|
ClassNamePrefixFeaturesOutMixin,
|
||
|
ClusterMixin,
|
||
|
_fit_context,
|
||
|
)
|
||
|
from ..metrics import DistanceMetric
|
||
|
from ..metrics._dist_metrics import METRIC_MAPPING64
|
||
|
from ..metrics.pairwise import _VALID_METRICS, paired_distances
|
||
|
from ..utils import check_array
|
||
|
from ..utils._fast_dict import IntFloatDict
|
||
|
from ..utils._param_validation import (
|
||
|
HasMethods,
|
||
|
Hidden,
|
||
|
Interval,
|
||
|
StrOptions,
|
||
|
validate_params,
|
||
|
)
|
||
|
from ..utils.graph import _fix_connected_components
|
||
|
from ..utils.validation import check_memory
|
||
|
|
||
|
# mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast'
|
||
|
from . import _hierarchical_fast as _hierarchical # type: ignore
|
||
|
from ._feature_agglomeration import AgglomerationTransform
|
||
|
|
||
|
###############################################################################
|
||
|
# For non fully-connected graphs
|
||
|
|
||
|
|
||
|
def _fix_connectivity(X, connectivity, affinity):
|
||
|
"""
|
||
|
Fixes the connectivity matrix.
|
||
|
|
||
|
The different steps are:
|
||
|
|
||
|
- copies it
|
||
|
- makes it symmetric
|
||
|
- converts it to LIL if necessary
|
||
|
- completes it if necessary.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
X : array-like of shape (n_samples, n_features)
|
||
|
Feature matrix representing `n_samples` samples to be clustered.
|
||
|
|
||
|
connectivity : sparse matrix, default=None
|
||
|
Connectivity matrix. Defines for each sample the neighboring samples
|
||
|
following a given structure of the data. The matrix is assumed to
|
||
|
be symmetric and only the upper triangular half is used.
|
||
|
Default is `None`, i.e, the Ward algorithm is unstructured.
|
||
|
|
||
|
affinity : {"euclidean", "precomputed"}, default="euclidean"
|
||
|
Which affinity to use. At the moment `precomputed` and
|
||
|
``euclidean`` are supported. `euclidean` uses the
|
||
|
negative squared Euclidean distance between points.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
connectivity : sparse matrix
|
||
|
The fixed connectivity matrix.
|
||
|
|
||
|
n_connected_components : int
|
||
|
The number of connected components in the graph.
|
||
|
"""
|
||
|
n_samples = X.shape[0]
|
||
|
if connectivity.shape[0] != n_samples or connectivity.shape[1] != n_samples:
|
||
|
raise ValueError(
|
||
|
"Wrong shape for connectivity matrix: %s when X is %s"
|
||
|
% (connectivity.shape, X.shape)
|
||
|
)
|
||
|
|
||
|
# Make the connectivity matrix symmetric:
|
||
|
connectivity = connectivity + connectivity.T
|
||
|
|
||
|
# Convert connectivity matrix to LIL
|
||
|
if not sparse.issparse(connectivity):
|
||
|
connectivity = sparse.lil_matrix(connectivity)
|
||
|
|
||
|
# `connectivity` is a sparse matrix at this point
|
||
|
if connectivity.format != "lil":
|
||
|
connectivity = connectivity.tolil()
|
||
|
|
||
|
# Compute the number of nodes
|
||
|
n_connected_components, labels = connected_components(connectivity)
|
||
|
|
||
|
if n_connected_components > 1:
|
||
|
warnings.warn(
|
||
|
"the number of connected components of the "
|
||
|
"connectivity matrix is %d > 1. Completing it to avoid "
|
||
|
"stopping the tree early." % n_connected_components,
|
||
|
stacklevel=2,
|
||
|
)
|
||
|
# XXX: Can we do without completing the matrix?
|
||
|
connectivity = _fix_connected_components(
|
||
|
X=X,
|
||
|
graph=connectivity,
|
||
|
n_connected_components=n_connected_components,
|
||
|
component_labels=labels,
|
||
|
metric=affinity,
|
||
|
mode="connectivity",
|
||
|
)
|
||
|
|
||
|
return connectivity, n_connected_components
|
||
|
|
||
|
|
||
|
def _single_linkage_tree(
    connectivity,
    n_samples,
    n_nodes,
    n_clusters,
    n_connected_components,
    return_distance,
):
    """
    Build a single linkage tree from a sparse weighted graph.

    A minimum spanning tree of `connectivity` is computed with
    scipy.sparse.csgraph, its edges are sorted by weight and handed to
    `_hierarchical._single_linkage_label`, and the parent array is then
    filled in by walking through the resulting merges.

    Returns ``(children, n_connected_components, n_samples, parent)``, with
    the merge distances appended as a fifth element when `return_distance`
    is true.

    Note: stored zeros of `connectivity.data` are overwritten in place on the
    float64 view obtained with ``copy=False``, so the caller's matrix may be
    modified.
    """
    from scipy.sparse.csgraph import minimum_spanning_tree

    # Work in float64 so that the epsilon substitution below is well defined.
    weighted = connectivity.astype(np.float64, copy=False)

    # Zero distances must not be ignored: temporarily replace them by epsilon.
    eps = np.finfo(dtype=weighted.data.dtype).eps
    zero_mask = weighted.data == 0
    weighted.data[zero_mask] = eps

    # Minimum spanning tree, converted to COO to expose (row, col, data).
    spanning = minimum_spanning_tree(weighted.tocsr()).tocoo()

    # Restore the true zero distances.
    spanning.data[spanning.data == eps] = 0

    # One (row, col, weight) triple per edge, sorted by weight with a stable
    # sort.
    edges = np.vstack([spanning.row, spanning.col, spanning.data]).T
    order = np.argsort(edges.T[2], kind="mergesort")
    edges = edges[order, :]

    # Convert the edge list into the standard hierarchical clustering format.
    single_linkage_tree = _hierarchical._single_linkage_label(edges)
    children_ = single_linkage_tree[:, :2].astype(int)

    # Each merge creates node `n_samples + offset`; record it as the parent of
    # both children, ignoring anything that falls outside the `n_nodes` range.
    parent = np.arange(n_nodes, dtype=np.intp)
    stop_early = n_clusters is not None
    for offset, (left, right) in enumerate(children_):
        node = n_samples + offset
        if stop_early and node >= n_nodes:
            break
        if left < n_nodes:
            parent[left] = node
        if right < n_nodes:
            parent[right] = node

    result = (children_, n_connected_components, n_samples, parent)
    if return_distance:
        result += (single_linkage_tree[:, 2],)
    return result
|
||
|
|
||
|
|
||
|
###############################################################################
|
||
|
# Hierarchical tree building functions
|
||
|
|
||
|
|
||
|
@validate_params(
    {
        "X": ["array-like"],
        "connectivity": ["array-like", "sparse matrix", None],
        "n_clusters": [Interval(Integral, 1, None, closed="left"), None],
        "return_distance": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False):
    """Ward clustering based on a Feature matrix.

    Recursively merges the pair of clusters that minimally increases
    within-cluster variance.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix representing `n_samples` samples to be clustered.

    connectivity : {array-like, sparse matrix}, default=None
        Connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e, the Ward algorithm is unstructured.

    n_clusters : int, default=None
        `n_clusters` should be less than `n_samples`. Stop early the
        construction of the tree at `n_clusters`. This is useful to decrease
        computation time if the number of clusters is not small compared to the
        number of samples. In this case, the complete tree is not computed, thus
        the 'children' output is of limited use, and the 'parents' output should
        rather be used. This option is valid only when specifying a connectivity
        matrix.

    return_distance : bool, default=False
        If `True`, return the distance between the clusters.

    Returns
    -------
    children : ndarray of shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`.

    n_connected_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree.

    parents : ndarray of shape (n_nodes,) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, otherwise `None` is returned.

    distances : ndarray of shape (n_nodes-1,)
        Only returned if `return_distance` is set to `True` (for compatibility).
        The distances between the centers of the nodes. `distances[i]`
        corresponds to a weighted Euclidean distance between
        the nodes `children[i, 0]` and `children[i, 1]`. If the nodes refer to
        leaves of the tree, then `distances[i]` is their unweighted Euclidean
        distance. Distances are updated in the following way
        (from scipy.cluster.hierarchy.linkage):

        The new entry :math:`d(u,v)` is computed as follows,

        .. math::

           d(u,v) = \\sqrt{\\frac{|v|+|s|}
                               {T}d(v,s)^2
                        + \\frac{|v|+|t|}
                               {T}d(v,t)^2
                        - \\frac{|v|}
                               {T}d(s,t)^2}

        where :math:`u` is the newly joined cluster consisting of
        clusters :math:`s` and :math:`t`, :math:`v` is an unused
        cluster in the forest, :math:`T=|v|+|s|+|t|`, and
        :math:`|*|` is the cardinality of its argument. This is also
        known as the incremental algorithm.

    Raises
    ------
    ValueError
        If a connectivity matrix is given and `n_clusters` is greater than
        the number of samples.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.cluster import ward_tree
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [4, 2], [4, 4], [4, 0]])
    >>> children, n_connected_components, n_leaves, parents = ward_tree(X)
    >>> children
    array([[0, 1],
           [3, 5],
           [2, 6],
           [4, 7],
           [8, 9]])
    >>> n_connected_components
    1
    >>> n_leaves
    6
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    if connectivity is None:
        # Unstructured case: delegate entirely to scipy. The import is local
        # because only this branch needs scipy.cluster.
        from scipy.cluster import hierarchy

        if n_clusters is not None:
            warnings.warn(
                (
                    "Partial build of the tree is implemented "
                    "only for structured clustering (i.e. with "
                    "explicit connectivity). The algorithm "
                    "will build the full tree and only "
                    "retain the lower branches required "
                    "for the specified number of clusters"
                ),
                stacklevel=2,
            )
        X = np.require(X, requirements="W")
        out = hierarchy.ward(X)
        children_ = out[:, :2].astype(np.intp)

        if return_distance:
            distances = out[:, 2]
            return children_, 1, n_samples, None, distances
        return children_, 1, n_samples, None

    connectivity, n_connected_components = _fix_connectivity(
        X, connectivity, affinity="euclidean"
    )
    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        if n_clusters > n_samples:
            raise ValueError(
                "Cannot provide more clusters than samples. "
                "%i n_clusters was asked, and there are %i "
                "samples." % (n_clusters, n_samples)
            )
        n_nodes = 2 * n_samples - n_clusters

    # create inertia matrix
    coord_row = []
    coord_col = []
    # A[node] is the adjacency list of `node`. The row lists of the LIL
    # matrix are stored as-is and extended in place during the merge loop.
    A = []
    for ind, row in enumerate(connectivity.rows):
        A.append(row)
        # Only one triangle of the symmetric graph is needed to seed the
        # heap: keep the neighbors with an index smaller than `ind`.
        lower_neighbors = [neighbor for neighbor in row if neighbor < ind]
        coord_row.extend([ind] * len(lower_neighbors))
        coord_col.extend(lower_neighbors)

    coord_row = np.array(coord_row, dtype=np.intp, order="C")
    coord_col = np.array(coord_col, dtype=np.intp, order="C")

    # build moments as a list
    # moments_1 is initialised to 1 for each sample and moments_2 to the
    # sample itself; a merged node gets the sum of its children's moments.
    moments_1 = np.zeros(n_nodes, order="C")
    moments_1[:n_samples] = 1
    moments_2 = np.zeros((n_nodes, n_features), order="C")
    moments_2[:n_samples] = X
    inertia = np.empty(len(coord_row), dtype=np.float64, order="C")
    _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, inertia)
    inertia = list(zip(inertia, coord_row, coord_col))
    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=bool)
    children = []
    if return_distance:
        distances = np.empty(n_nodes - n_samples)

    not_visited = np.empty(n_nodes, dtype=bool, order="C")

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge: pop edges until both endpoints are still
        # active (entries that mention an already merged node are stale)
        while True:
            inert, i, j = heappop(inertia)
            if used_node[i] and used_node[j]:
                break
        parent[i], parent[j] = k, k
        children.append((i, j))
        used_node[i] = used_node[j] = False
        if return_distance:  # store inertia value
            distances[k - n_samples] = inert

        # update the moments
        moments_1[k] = moments_1[i] + moments_1[j]
        moments_2[k] = moments_2[i] + moments_2[j]

        # update the structure matrix A and the inertia matrix
        coord_col = []
        not_visited.fill(1)
        not_visited[k] = 0
        _hierarchical._get_parents(A[i], coord_col, parent, not_visited)
        _hierarchical._get_parents(A[j], coord_col, parent, not_visited)
        # Register the new node `k` as a neighbor of each collected node.
        for col in coord_col:
            A[col].append(k)
        A.append(coord_col)
        coord_col = np.array(coord_col, dtype=np.intp, order="C")
        coord_row = np.empty(coord_col.shape, dtype=np.intp, order="C")
        coord_row.fill(k)
        n_additions = len(coord_row)
        ini = np.empty(n_additions, dtype=np.float64, order="C")

        _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col, ini)

        # Push the Ward distance between `k` and each of its neighbors.
        for idx in range(n_additions):
            heappush(inertia, (ini[idx], k, coord_col[idx]))

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples
    # sort children to get consistent output with unstructured version
    children = [c[::-1] for c in children]
    children = np.array(children)  # return numpy array for efficient caching

    if return_distance:
        # 2 is scaling factor to compare w/ unstructured version
        distances = np.sqrt(2.0 * distances)
        return children, n_connected_components, n_leaves, parent, distances
    return children, n_connected_components, n_leaves, parent
|
||
|
|
||
|
|
||
|
# single average and complete linkage
|
||
|
def linkage_tree(
    X,
    connectivity=None,
    n_clusters=None,
    linkage="complete",
    affinity="euclidean",
    return_distance=False,
):
    """Linkage agglomerative clustering based on a Feature matrix.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix representing `n_samples` samples to be clustered.

    connectivity : sparse matrix, default=None
        Connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is `None`, i.e, the hierarchical clustering algorithm is
        unstructured.

    n_clusters : int, default=None
        Stop early the construction of the tree at `n_clusters`. This is
        useful to decrease computation time if the number of clusters is
        not small compared to the number of samples. In this case, the
        complete tree is not computed, thus the 'children' output is of
        limited use, and the 'parents' output should rather be used.
        This option is valid only when specifying a connectivity matrix.

    linkage : {"average", "complete", "single"}, default="complete"
        Which linkage criteria to use. The linkage criterion determines which
        distance to use between sets of observation.
            - "average" uses the average of the distances of each observation of
              the two sets.
            - "complete" or maximum linkage uses the maximum distances between
              all observations of the two sets.
            - "single" uses the minimum of the distances between all
              observations of the two sets.

    affinity : str or callable, default='euclidean'
        Which metric to use. Can be 'euclidean', 'manhattan', or any
        distance known to paired distance (see metric.pairwise).

    return_distance : bool, default=False
        Whether or not to return the distances between the clusters.

    Returns
    -------
    children : ndarray of shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`.

    n_connected_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree.

    parents : ndarray of shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, otherwise `None` is returned.

    distances : ndarray of shape (n_nodes-1,)
        Returned when `return_distance` is set to `True`.

        distances[i] refers to the distance between children[i][0] and
        children[i][1] when they are merged.

    Raises
    ------
    ValueError
        If `linkage` is not a known option, if `affinity="cosine"` and `X`
        contains zero vectors, if `affinity="precomputed"` and `X` is not
        square (unstructured case), or if a connectivity matrix is given and
        `n_clusters` is greater than the number of samples.

    See Also
    --------
    ward_tree : Hierarchical clustering with ward linkage.
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    linkage_choices = {
        "complete": _hierarchical.max_merge,
        "average": _hierarchical.average_merge,
        "single": None,
    }  # Single linkage is handled differently
    try:
        join_func = linkage_choices[linkage]
    except KeyError as e:
        raise ValueError(
            "Unknown linkage option, linkage should be one of %s, but %s was given"
            % (linkage_choices.keys(), linkage)
        ) from e

    if affinity == "cosine" and np.any(~np.any(X, axis=1)):
        raise ValueError("Cosine affinity cannot be used when X contains zero vectors")

    if connectivity is None:
        # Unstructured case. The import is local because only this branch
        # needs scipy.cluster.
        from scipy.cluster import hierarchy

        if n_clusters is not None:
            warnings.warn(
                (
                    "Partial build of the tree is implemented "
                    "only for structured clustering (i.e. with "
                    "explicit connectivity). The algorithm "
                    "will build the full tree and only "
                    "retain the lower branches required "
                    "for the specified number of clusters"
                ),
                stacklevel=2,
            )

        if affinity == "precomputed":
            # for the linkage function of hierarchy to work on precomputed
            # data, provide as first argument an ndarray of the shape returned
            # by sklearn.metrics.pairwise_distances.
            if X.shape[0] != X.shape[1]:
                raise ValueError(
                    f"Distance matrix should be square, got matrix of shape {X.shape}"
                )
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        elif affinity == "l2":
            # Translate to something understood by scipy
            affinity = "euclidean"
        elif affinity in ("l1", "manhattan"):
            affinity = "cityblock"
        elif callable(affinity):
            # The callable is applied to X and the strict upper triangle of
            # its result is kept as a flat vector of pairwise distances.
            X = affinity(X)
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        if (
            linkage == "single"
            and affinity != "precomputed"
            and not callable(affinity)
            and affinity in METRIC_MAPPING64
        ):
            # We need the fast cythonized metric from neighbors
            dist_metric = DistanceMetric.get_metric(affinity)

            # The Cython routines used require contiguous arrays
            X = np.ascontiguousarray(X, dtype=np.double)

            mst = _hierarchical.mst_linkage_core(X, dist_metric)
            # Sort edges of the min_spanning_tree by weight
            mst = mst[np.argsort(mst.T[2], kind="mergesort"), :]

            # Convert edge list into standard hierarchical clustering format
            out = _hierarchical.single_linkage_label(mst)
        else:
            out = hierarchy.linkage(X, method=linkage, metric=affinity)
        children_ = out[:, :2].astype(int, copy=False)

        if return_distance:
            distances = out[:, 2]
            return children_, 1, n_samples, None, distances
        return children_, 1, n_samples, None

    connectivity, n_connected_components = _fix_connectivity(
        X, connectivity, affinity=affinity
    )
    connectivity = connectivity.tocoo()
    # Put the diagonal to zero
    diag_mask = connectivity.row != connectivity.col
    connectivity.row = connectivity.row[diag_mask]
    connectivity.col = connectivity.col[diag_mask]
    connectivity.data = connectivity.data[diag_mask]
    del diag_mask

    if affinity == "precomputed":
        distances = X[connectivity.row, connectivity.col].astype(np.float64, copy=False)
    else:
        # FIXME We compute all the distances, while we could have only computed
        # the "interesting" distances
        distances = paired_distances(
            X[connectivity.row], X[connectivity.col], metric=affinity
        )
    # From here on the graph stores pairwise distances as edge weights.
    connectivity.data = distances

    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        # Explicit check instead of `assert`: assertions are stripped when
        # Python runs with -O, and `ward_tree` raises the same error.
        if n_clusters > n_samples:
            raise ValueError(
                "Cannot provide more clusters than samples. "
                "%i n_clusters was asked, and there are %i "
                "samples." % (n_clusters, n_samples)
            )
        n_nodes = 2 * n_samples - n_clusters

    if linkage == "single":
        # `join_func` is None for single linkage; it has a dedicated builder.
        return _single_linkage_tree(
            connectivity,
            n_samples,
            n_nodes,
            n_clusters,
            n_connected_components,
            return_distance,
        )

    if return_distance:
        distances = np.empty(n_nodes - n_samples)
    # create inertia heap and connection matrix
    A = np.empty(n_nodes, dtype=object)
    inertia = []

    # LIL seems to the best format to access the rows quickly,
    # without the numpy overhead of slicing CSR indices and data.
    connectivity = connectivity.tolil()
    # We are storing the graph in a list of IntFloatDict
    for ind, (data, row) in enumerate(zip(connectivity.data, connectivity.rows)):
        A[ind] = IntFloatDict(
            np.asarray(row, dtype=np.intp), np.asarray(data, dtype=np.float64)
        )
        # Only one triangle of the symmetric graph is needed to seed the
        # heap: keep the neighbors with an index smaller than `ind`.
        inertia.extend(
            _hierarchical.WeightedEdge(d, ind, r) for r, d in zip(row, data) if r < ind
        )
    del connectivity

    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    # used_node[n] holds the number of samples in node n while it is still
    # active, and 0 once it has been merged.
    used_node = np.ones(n_nodes, dtype=np.intp)
    children = []

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge: pop edges until both endpoints are still active
        while True:
            edge = heappop(inertia)
            if used_node[edge.a] and used_node[edge.b]:
                break
        i = edge.a
        j = edge.b

        if return_distance:
            # store distances
            distances[k - n_samples] = edge.weight

        parent[i] = parent[j] = k
        children.append((i, j))
        # Keep track of the number of elements per cluster
        n_i = used_node[i]
        n_j = used_node[j]
        used_node[k] = n_i + n_j
        used_node[i] = used_node[j] = False

        # update the structure matrix A and the inertia matrix
        # a clever 'min', or 'max' operation between A[i] and A[j]
        coord_col = join_func(A[i], A[j], used_node, n_i, n_j)
        for col, d in coord_col:
            A[col].append(k, d)
            # Here we use the information from coord_col (containing the
            # distances) to update the heap
            heappush(inertia, _hierarchical.WeightedEdge(d, k, col))
        A[k] = coord_col
        # Clear A[i] and A[j] to save memory
        A[i] = A[j] = 0

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples

    # return numpy array for efficient caching
    children = np.array(children)[:, ::-1]

    if return_distance:
        return children, n_connected_components, n_leaves, parent, distances
    return children, n_connected_components, n_leaves, parent
|
||
|
|
||
|
|
||
|
# Matching names to tree-building strategies
|
||
|
def _complete_linkage(*args, **kwargs):
    """Call `linkage_tree` with ``linkage`` forced to ``"complete"``."""
    return linkage_tree(*args, **{**kwargs, "linkage": "complete"})
|
||
|
|
||
|
|
||
|
def _average_linkage(*args, **kwargs):
    """Call `linkage_tree` with ``linkage`` forced to ``"average"``."""
    return linkage_tree(*args, **{**kwargs, "linkage": "average"})
|
||
|
|
||
|
|
||
|
def _single_linkage(*args, **kwargs):
    """Call `linkage_tree` with ``linkage`` forced to ``"single"``."""
    return linkage_tree(*args, **{**kwargs, "linkage": "single"})
|
||
|
|
||
|
|
||
|
# Lookup table from a linkage name to the function building that tree.
_TREE_BUILDERS = {
    "ward": ward_tree,
    "complete": _complete_linkage,
    "average": _average_linkage,
    "single": _single_linkage,
}
|
||
|
|
||
|
###############################################################################
|
||
|
# Functions for cutting hierarchical clustering tree
|
||
|
|
||
|
|
||
|
def _hc_cut(n_clusters, children, n_leaves):
|
||
|
"""Function cutting the ward tree for a given number of clusters.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
n_clusters : int or ndarray
|
||
|
The number of clusters to form.
|
||
|
|
||
|
children : ndarray of shape (n_nodes-1, 2)
|
||
|
The children of each non-leaf node. Values less than `n_samples`
|
||
|
correspond to leaves of the tree which are the original samples.
|
||
|
A node `i` greater than or equal to `n_samples` is a non-leaf
|
||
|
node and has children `children_[i - n_samples]`. Alternatively
|
||
|
at the i-th iteration, children[i][0] and children[i][1]
|
||
|
are merged to form node `n_samples + i`.
|
||
|
|
||
|
n_leaves : int
|
||
|
Number of leaves of the tree.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
labels : array [n_samples]
|
||
|
Cluster labels for each point.
|
||
|
"""
|
||
|
if n_clusters > n_leaves:
|
||
|
raise ValueError(
|
||
|
"Cannot extract more clusters than samples: "
|
||
|
"%s clusters where given for a tree with %s leaves."
|
||
|
% (n_clusters, n_leaves)
|
||
|
)
|
||
|
# In this function, we store nodes as a heap to avoid recomputing
|
||
|
# the max of the nodes: the first element is always the smallest
|
||
|
# We use negated indices as heaps work on smallest elements, and we
|
||
|
# are interested in largest elements
|
||
|
# children[-1] is the root of the tree
|
||
|
nodes = [-(max(children[-1]) + 1)]
|
||
|
for _ in range(n_clusters - 1):
|
||
|
# As we have a heap, nodes[0] is the smallest element
|
||
|
these_children = children[-nodes[0] - n_leaves]
|
||
|
# Insert the 2 children and remove the largest node
|
||
|
heappush(nodes, -these_children[0])
|
||
|
heappushpop(nodes, -these_children[1])
|
||
|
label = np.zeros(n_leaves, dtype=np.intp)
|
||
|
for i, node in enumerate(nodes):
|
||
|
label[_hierarchical._hc_get_descendent(-node, children, n_leaves)] = i
|
||
|
return label
|
||
|
|
||
|
|
||
|
###############################################################################
|
||
|
|
||
|
|
||
|
class AgglomerativeClustering(ClusterMixin, BaseEstimator):
    """
    Agglomerative Clustering.

    Recursively merges pair of clusters of sample data; uses linkage distance.

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    n_clusters : int or None, default=2
        The number of clusters to find. It must be ``None`` if
        ``distance_threshold`` is not ``None``.

    metric : str or callable, default="euclidean"
        Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
        "manhattan", "cosine", or "precomputed". If linkage is "ward", only
        "euclidean" is accepted. If "precomputed", a distance matrix is needed
        as input for the fit method.

        .. versionadded:: 1.2

        .. deprecated:: 1.4
           `metric=None` is deprecated in 1.4 and will be removed in 1.6.
           Let `metric` be the default value (i.e. `"euclidean"`) instead.

    memory : str or object with the joblib.Memory interface, default=None
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    connectivity : array-like or callable, default=None
        Connectivity matrix. Defines for each sample the neighboring
        samples following a given structure of the data.
        This can be a connectivity matrix itself or a callable that transforms
        the data into a connectivity matrix, such as derived from
        `kneighbors_graph`. Default is ``None``, i.e, the
        hierarchical clustering algorithm is unstructured.

    compute_full_tree : 'auto' or bool, default='auto'
        Stop early the construction of the tree at ``n_clusters``. This is
        useful to decrease computation time if the number of clusters is not
        small compared to the number of samples. This option is useful only
        when specifying a connectivity matrix. Note also that when varying the
        number of clusters and using caching, it may be advantageous to compute
        the full tree. It must be ``True`` if ``distance_threshold`` is not
        ``None``. By default `compute_full_tree` is "auto", which is equivalent
        to `True` when `distance_threshold` is not `None` or when `n_clusters`
        is smaller than the maximum of 100 and `0.02 * n_samples`.
        Otherwise, "auto" is equivalent to `False`.

    linkage : {'ward', 'complete', 'average', 'single'}, default='ward'
        Which linkage criterion to use. The linkage criterion determines which
        distance to use between sets of observation. The algorithm will merge
        the pairs of cluster that minimize this criterion.

        - 'ward' minimizes the variance of the clusters being merged.
        - 'average' uses the average of the distances of each observation of
          the two sets.
        - 'complete' or 'maximum' linkage uses the maximum distances between
          all observations of the two sets.
        - 'single' uses the minimum of the distances between all observations
          of the two sets.

        .. versionadded:: 0.20
            Added the 'single' option

    distance_threshold : float, default=None
        The linkage distance threshold at or above which clusters will not be
        merged. If not ``None``, ``n_clusters`` must be ``None`` and
        ``compute_full_tree`` must be ``True``.

        .. versionadded:: 0.21

    compute_distances : bool, default=False
        Computes distances between clusters even if `distance_threshold` is not
        used. This can be used to make dendrogram visualization, but introduces
        a computational and memory overhead.

        .. versionadded:: 0.24

    Attributes
    ----------
    n_clusters_ : int
        The number of clusters found by the algorithm. If
        ``distance_threshold=None``, it will be equal to the given
        ``n_clusters``.

    labels_ : ndarray of shape (n_samples,)
        Cluster labels for each point.

    n_leaves_ : int
        Number of leaves in the hierarchical tree.

    n_connected_components_ : int
        The estimated number of connected components in the graph.

        .. versionadded:: 0.21
            ``n_connected_components_`` was added to replace ``n_components_``.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    children_ : array-like of shape (n_samples-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`.

    distances_ : array-like of shape (n_nodes-1,)
        Distances between nodes in the corresponding place in `children_`.
        Only computed if `distance_threshold` is used or `compute_distances`
        is set to `True`.

    See Also
    --------
    FeatureAgglomeration : Agglomerative clustering but for features instead of
        samples.
    ward_tree : Hierarchical clustering with ward linkage.

    Examples
    --------
    >>> from sklearn.cluster import AgglomerativeClustering
    >>> import numpy as np
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [4, 2], [4, 4], [4, 0]])
    >>> clustering = AgglomerativeClustering().fit(X)
    >>> clustering
    AgglomerativeClustering()
    >>> clustering.labels_
    array([1, 1, 1, 0, 0, 0])
    """

    _parameter_constraints: dict = {
        "n_clusters": [Interval(Integral, 1, None, closed="left"), None],
        "metric": [
            StrOptions(set(_VALID_METRICS) | {"precomputed"}),
            callable,
            Hidden(None),
        ],
        "memory": [str, HasMethods("cache"), None],
        "connectivity": ["array-like", callable, None],
        "compute_full_tree": [StrOptions({"auto"}), "boolean"],
        "linkage": [StrOptions(set(_TREE_BUILDERS.keys()))],
        "distance_threshold": [Interval(Real, 0, None, closed="left"), None],
        "compute_distances": ["boolean"],
    }

    def __init__(
        self,
        n_clusters=2,
        *,
        metric="euclidean",
        memory=None,
        connectivity=None,
        compute_full_tree="auto",
        linkage="ward",
        distance_threshold=None,
        compute_distances=False,
    ):
        self.n_clusters = n_clusters
        self.distance_threshold = distance_threshold
        self.memory = memory
        self.connectivity = connectivity
        self.compute_full_tree = compute_full_tree
        self.linkage = linkage
        self.metric = metric
        self.compute_distances = compute_distances

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the hierarchical clustering from features, or distance matrix.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Training instances to cluster, or distances between instances if
            ``metric='precomputed'``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Returns the fitted instance.
        """
        X = self._validate_data(X, ensure_min_samples=2)
        return self._fit(X)

    def _fit(self, X):
        """Fit without validation

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
            Training instances to cluster, or distances between instances if
            ``metric='precomputed'``.

        Returns
        -------
        self : object
            Returns the fitted instance.

        Raises
        ------
        ValueError
            If `n_clusters` and `distance_threshold` are both set or both
            ``None``, if `distance_threshold` is set while
            `compute_full_tree` is ``False``, or if ward linkage is requested
            with a metric other than "euclidean".
        """
        memory = check_memory(self.memory)

        # TODO(1.6): remove in 1.6
        if self.metric is None:
            warnings.warn(
                (
                    "`metric=None` is deprecated in version 1.4 and will be removed in "
                    "version 1.6. Let `metric` be the default value "
                    "(i.e. `'euclidean'`) instead."
                ),
                FutureWarning,
            )
            self._metric = "euclidean"
        else:
            self._metric = self.metric

        # The XOR is true only when exactly one of the two stopping criteria
        # is provided.
        if not ((self.n_clusters is None) ^ (self.distance_threshold is None)):
            raise ValueError(
                "Exactly one of n_clusters and "
                "distance_threshold has to be set, and the other "
                "needs to be None."
            )

        # "auto" is truthy, so only an explicit `False` is rejected here.
        if self.distance_threshold is not None and not self.compute_full_tree:
            raise ValueError(
                "compute_full_tree must be True if distance_threshold is set."
            )

        if self.linkage == "ward" and self._metric != "euclidean":
            raise ValueError(
                f"{self._metric} was provided as metric. Ward can only "
                "work with euclidean distances."
            )

        tree_builder = _TREE_BUILDERS[self.linkage]

        connectivity = self.connectivity
        if self.connectivity is not None:
            if callable(self.connectivity):
                connectivity = self.connectivity(X)
            connectivity = check_array(
                connectivity, accept_sparse=["csr", "coo", "lil"]
            )

        n_samples = len(X)
        compute_full_tree = self.compute_full_tree
        if self.connectivity is None:
            # Without a connectivity matrix the full tree is always computed.
            compute_full_tree = True
        if compute_full_tree == "auto":
            if self.distance_threshold is not None:
                compute_full_tree = True
            else:
                # Early stopping is likely to give a speed up only for
                # a large number of clusters. The actual threshold
                # implemented here is heuristic
                compute_full_tree = self.n_clusters < max(100, 0.02 * n_samples)
        n_clusters = self.n_clusters
        if compute_full_tree:
            # The builder receives no cluster count when the whole tree is
            # requested; the tree is cut afterwards.
            n_clusters = None

        # Construct the tree
        kwargs = {}
        if self.linkage != "ward":
            kwargs["linkage"] = self.linkage
            kwargs["affinity"] = self._metric

        distance_threshold = self.distance_threshold

        return_distance = (distance_threshold is not None) or self.compute_distances

        out = memory.cache(tree_builder)(
            X,
            connectivity=connectivity,
            n_clusters=n_clusters,
            return_distance=return_distance,
            **kwargs,
        )
        # The first four outputs are always present; distances, when
        # requested, are read from the last position below.
        (self.children_, self.n_connected_components_, self.n_leaves_, parents) = out[
            :4
        ]

        if return_distance:
            self.distances_ = out[-1]

        if self.distance_threshold is not None:  # distance_threshold is used
            self.n_clusters_ = (
                np.count_nonzero(self.distances_ >= distance_threshold) + 1
            )
        else:  # n_clusters is used
            self.n_clusters_ = self.n_clusters

        # Cut the tree
        if compute_full_tree:
            self.labels_ = _hc_cut(self.n_clusters_, self.children_, self.n_leaves_)
        else:
            labels = _hierarchical.hc_get_heads(parents, copy=False)
            # copy to avoid holding a reference on the original array
            labels = np.copy(labels[:n_samples])
            # Reassign cluster numbers
            self.labels_ = np.searchsorted(np.unique(labels), labels)
        return self

    def fit_predict(self, X, y=None):
        """Fit and return the result of each sample's clustering assignment.

        In addition to fitting, this method also return the result of the
        clustering assignment for each sample in the training set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Training instances to cluster, or distances between instances if
            ``metric='precomputed'``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels.
        """
        return super().fit_predict(X, y)
|
||
|
|
||
|
|
||
|
class FeatureAgglomeration(
    ClassNamePrefixFeaturesOutMixin, AgglomerativeClustering, AgglomerationTransform
):
    """Agglomerate features.

    Recursively merges pair of clusters of features.

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    n_clusters : int or None, default=2
        The number of clusters to find. It must be ``None`` if
        ``distance_threshold`` is not ``None``.

    metric : str or callable, default="euclidean"
        Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
        "manhattan", "cosine", or "precomputed". If linkage is "ward", only
        "euclidean" is accepted. If "precomputed", a distance matrix is needed
        as input for the fit method.

        .. versionadded:: 1.2

        .. deprecated:: 1.4
           `metric=None` is deprecated in 1.4 and will be removed in 1.6.
           Let `metric` be the default value (i.e. `"euclidean"`) instead.

    memory : str or object with the joblib.Memory interface, default=None
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    connectivity : array-like or callable, default=None
        Connectivity matrix. Defines for each feature the neighboring
        features following a given structure of the data.
        This can be a connectivity matrix itself or a callable that transforms
        the data into a connectivity matrix, such as derived from
        `kneighbors_graph`. Default is `None`, i.e, the
        hierarchical clustering algorithm is unstructured.

    compute_full_tree : 'auto' or bool, default='auto'
        Stop early the construction of the tree at `n_clusters`. This is useful
        to decrease computation time if the number of clusters is not small
        compared to the number of features. This option is useful only when
        specifying a connectivity matrix. Note also that when varying the
        number of clusters and using caching, it may be advantageous to compute
        the full tree. It must be ``True`` if ``distance_threshold`` is not
        ``None``. By default `compute_full_tree` is "auto", which is equivalent
        to `True` when `distance_threshold` is not `None` or when `n_clusters`
        is smaller than the maximum of 100 and `0.02 * n_features`.
        Otherwise, "auto" is equivalent to `False`.

    linkage : {"ward", "complete", "average", "single"}, default="ward"
        Which linkage criterion to use. The linkage criterion determines which
        distance to use between sets of features. The algorithm will merge
        the pairs of cluster that minimize this criterion.

        - "ward" minimizes the variance of the clusters being merged.
        - "complete" or maximum linkage uses the maximum distances between
          all features of the two sets.
        - "average" uses the average of the distances of each feature of
          the two sets.
        - "single" uses the minimum of the distances between all features
          of the two sets.

    pooling_func : callable, default=np.mean
        This combines the values of agglomerated features into a single
        value, and should accept an array of shape [M, N] and the keyword
        argument `axis=1`, and reduce it to an array of size [M].

    distance_threshold : float, default=None
        The linkage distance threshold at or above which clusters will not be
        merged. If not ``None``, ``n_clusters`` must be ``None`` and
        ``compute_full_tree`` must be ``True``.

        .. versionadded:: 0.21

    compute_distances : bool, default=False
        Computes distances between clusters even if `distance_threshold` is not
        used. This can be used to make dendrogram visualization, but introduces
        a computational and memory overhead.

        .. versionadded:: 0.24

    Attributes
    ----------
    n_clusters_ : int
        The number of clusters found by the algorithm. If
        ``distance_threshold=None``, it will be equal to the given
        ``n_clusters``.

    labels_ : ndarray of shape (n_features,)
        Cluster labels for each feature.

    n_leaves_ : int
        Number of leaves in the hierarchical tree.

    n_connected_components_ : int
        The estimated number of connected components in the graph.

        .. versionadded:: 0.21
            ``n_connected_components_`` was added to replace ``n_components_``.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    children_ : array-like of shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_features`
        correspond to leaves of the tree which are the original features.
        A node `i` greater than or equal to `n_features` is a non-leaf
        node and has children `children_[i - n_features]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_features + i`.

    distances_ : array-like of shape (n_nodes-1,)
        Distances between nodes in the corresponding place in `children_`.
        Only computed if `distance_threshold` is used or `compute_distances`
        is set to `True`.

    See Also
    --------
    AgglomerativeClustering : Agglomerative clustering samples instead of
        features.
    ward_tree : Hierarchical clustering with ward linkage.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import datasets, cluster
    >>> digits = datasets.load_digits()
    >>> images = digits.images
    >>> X = np.reshape(images, (len(images), -1))
    >>> agglo = cluster.FeatureAgglomeration(n_clusters=32)
    >>> agglo.fit(X)
    FeatureAgglomeration(n_clusters=32)
    >>> X_reduced = agglo.transform(X)
    >>> X_reduced.shape
    (1797, 32)
    """

    _parameter_constraints: dict = {
        "n_clusters": [Interval(Integral, 1, None, closed="left"), None],
        "metric": [
            StrOptions(set(_VALID_METRICS) | {"precomputed"}),
            callable,
            Hidden(None),
        ],
        "memory": [str, HasMethods("cache"), None],
        "connectivity": ["array-like", callable, None],
        "compute_full_tree": [StrOptions({"auto"}), "boolean"],
        "linkage": [StrOptions(set(_TREE_BUILDERS.keys()))],
        "pooling_func": [callable],
        "distance_threshold": [Interval(Real, 0, None, closed="left"), None],
        "compute_distances": ["boolean"],
    }

    def __init__(
        self,
        n_clusters=2,
        *,
        metric="euclidean",
        memory=None,
        connectivity=None,
        compute_full_tree="auto",
        linkage="ward",
        pooling_func=np.mean,
        distance_threshold=None,
        compute_distances=False,
    ):
        super().__init__(
            n_clusters=n_clusters,
            memory=memory,
            connectivity=connectivity,
            compute_full_tree=compute_full_tree,
            linkage=linkage,
            metric=metric,
            distance_threshold=distance_threshold,
            compute_distances=compute_distances,
        )
        self.pooling_func = pooling_func

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the hierarchical clustering on the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Returns the transformer.
        """
        X = self._validate_data(X, ensure_min_features=2)
        # Features are clustered by running the sample-wise algorithm on the
        # transposed data.
        super()._fit(X.T)
        # One output feature per cluster found.
        self._n_features_out = self.n_clusters_
        return self

    @property
    def fit_predict(self):
        """Not available: accessing this attribute raises `AttributeError`.

        Because the property raises `AttributeError`,
        ``hasattr(estimator, "fit_predict")`` is ``False`` for this class.
        """
        raise AttributeError
|