414 lines
12 KiB
Python
414 lines
12 KiB
Python
|
import warnings
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
from numpy.testing import assert_allclose
|
||
|
from scipy.sparse import issparse
|
||
|
|
||
|
from sklearn import datasets
|
||
|
from sklearn.metrics import pairwise_distances
|
||
|
from sklearn.metrics.cluster import (
|
||
|
calinski_harabasz_score,
|
||
|
davies_bouldin_score,
|
||
|
silhouette_samples,
|
||
|
silhouette_score,
|
||
|
)
|
||
|
from sklearn.metrics.cluster._unsupervised import _silhouette_reduce
|
||
|
from sklearn.utils._testing import assert_array_equal
|
||
|
from sklearn.utils.fixes import (
|
||
|
CSC_CONTAINERS,
|
||
|
CSR_CONTAINERS,
|
||
|
DOK_CONTAINERS,
|
||
|
LIL_CONTAINERS,
|
||
|
)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "sparse_container",
    [None] + CSR_CONTAINERS + CSC_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
@pytest.mark.parametrize("sample_size", [None, "half"])
def test_silhouette(sparse_container, sample_size):
    """Check the Silhouette Coefficient on iris with dense and sparse input.

    The score computed from a precomputed euclidean distance matrix must
    match the score computed from the raw features, both on the full data
    (``sample_size=None``) and on a random half of the samples.
    """
    iris = datasets.load_iris()
    features, target = iris.data, iris.target
    if sparse_container is not None:
        features = sparse_container(features)
    if sample_size == "half":
        # Resolve the symbolic parameter to an actual number of samples.
        sample_size = int(features.shape[0] / 2)

    distances = pairwise_distances(features, metric="euclidean")
    # Same sampling arguments for both calls so that the same subset is scored.
    sampling = {"sample_size": sample_size, "random_state": 0}
    score_precomputed = silhouette_score(
        distances, target, metric="precomputed", **sampling
    )
    score_euclidean = silhouette_score(
        features, target, metric="euclidean", **sampling
    )

    # The true labels are used, so the score is expected to be positive.
    assert score_precomputed > 0
    assert score_euclidean > 0
    assert score_precomputed == pytest.approx(score_euclidean)
|
||
|
|
||
|
|
||
|
def test_cluster_size_1():
    """Check per-sample silhouettes for singleton and degenerate clusters.

    Cluster 0 holds a single sample, which gets a score of 0 (Rousseeuw's
    convention). Cluster 2 holds only identical samples; to our knowledge
    this case is not discussed in reference material, and a sample score of
    1 is chosen for it.
    """
    points = [[0.0], [1.0], [1.0], [2.0], [3.0], [3.0]]
    cluster_ids = np.array([0, 1, 1, 1, 2, 2])

    # Expected values, cluster by cluster:
    #   cluster 0: one sample                       -> silhouette = [0]
    #   cluster 1: intra = [.5, .5, 1], inter = [1, 1, 1]
    #                                               -> silhouette = [.5, .5, 0]
    #   cluster 2: intra = [0, 0], inter = arbitrary
    #                                               -> silhouette = [1., 1.]
    expected_per_sample = [0, 0.5, 0.5, 0, 1, 1]

    mean_score = silhouette_score(points, cluster_ids)
    assert not np.isnan(mean_score)

    per_sample = silhouette_samples(points, cluster_ids)
    assert_array_equal(per_sample, expected_per_sample)
|
||
|
|
||
|
|
||
|
def test_silhouette_paper_example():
    """Check per-sample and mean silhouettes against Rousseeuw (1987).

    The dissimilarity matrix comes from Table 1 of the paper; the expected
    silhouette values for a 2-cluster and a 3-cluster partition come from
    Figures 2 and 3 respectively.
    """
    # Data from Table 1: strict lower triangle of the 12 x 12 dissimilarity
    # matrix, listed row by row (the order produced by np.tril_indices).
    # fmt: off
    lower = [
        5.58,
        7.00, 6.50,
        7.08, 7.00, 3.83,
        4.83, 5.08, 8.17, 5.83,
        2.17, 5.75, 6.67, 6.92, 4.92,
        6.42, 5.00, 5.58, 6.00, 4.67, 6.42,
        3.42, 5.50, 6.42, 6.42, 5.00, 3.92, 6.17,
        2.50, 4.92, 6.25, 7.33, 4.50, 2.25, 6.33, 2.75,
        6.08, 6.67, 4.25, 2.67, 6.00, 6.17, 6.17, 6.92, 6.17,
        5.25, 6.83, 4.50, 3.75, 5.75, 5.42, 6.08, 5.83, 6.67, 3.67,
        4.75, 3.00, 6.08, 6.67, 5.00, 5.58, 4.83, 6.17, 5.67, 6.50, 6.92,
    ]
    # fmt: on
    D = np.zeros((12, 12))
    D[np.tril_indices(12, -1)] = lower
    # Mirror the lower triangle to obtain a symmetric matrix (diagonal is 0).
    D += D.T

    # Row/column order of D.
    names = [
        "BEL",
        "BRA",
        "CHI",
        "CUB",
        "EGY",
        "FRA",
        "IND",
        "ISR",
        "USA",
        "USS",
        "YUG",
        "ZAI",
    ]

    # Data from Figure 2
    labels1 = [1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1]
    expected1 = {
        "USA": 0.43,
        "BEL": 0.39,
        "FRA": 0.35,
        "ISR": 0.30,
        "BRA": 0.22,
        "EGY": 0.20,
        "ZAI": 0.19,
        "CUB": 0.40,
        "USS": 0.34,
        "CHI": 0.33,
        "YUG": 0.26,
        "IND": -0.04,
    }
    score1 = 0.28

    # Data from Figure 3
    labels2 = [1, 2, 3, 3, 1, 1, 2, 1, 1, 3, 3, 2]
    expected2 = {
        "USA": 0.47,
        "FRA": 0.44,
        "BEL": 0.42,
        "ISR": 0.37,
        "EGY": 0.02,
        "ZAI": 0.28,
        "BRA": 0.25,
        "IND": 0.17,
        "CUB": 0.48,
        "USS": 0.44,
        "YUG": 0.31,
        "CHI": 0.31,
    }
    score2 = 0.33

    for labels, expected, score in [
        (labels1, expected1, score1),
        (labels2, expected2, score2),
    ]:
        labels = np.array(labels)
        # Reorder the expected values to follow the row order of D.
        expected = [expected[name] for name in names]
        # We check to 2dp because that's what's in the paper. The comparisons
        # must be real assertions: a bare ``pytest.approx(...)`` expression
        # only builds a comparator object and never fails.
        assert_allclose(
            silhouette_samples(D, labels, metric="precomputed"),
            expected,
            rtol=0,
            atol=1e-2,
        )
        assert silhouette_score(D, labels, metric="precomputed") == pytest.approx(
            score, abs=1e-2
        )
|
||
|
|
||
|
|
||
|
def test_correct_labelsize():
    """Check that silhouette_score requires 1 < n_labels < n_samples.

    Two invalid labelings are tried on iris: one distinct label per sample
    (n_labels == n_samples) and a single label for all samples
    (n_labels == 1). Both must raise a ValueError reporting the number of
    labels found.
    """
    X = datasets.load_iris().data
    n_samples = X.shape[0]

    message_template = (
        r"Number of labels is %d\. Valid values are 2 "
        r"to n_samples - 1 \(inclusive\)"
    )
    invalid_labelings = (
        np.arange(n_samples),  # n_labels = n_samples
        np.zeros(n_samples),  # n_labels = 1
    )
    for y in invalid_labelings:
        err_msg = message_template % len(np.unique(y))
        with pytest.raises(ValueError, match=err_msg):
            silhouette_score(X, y)
|
||
|
|
||
|
|
||
|
def test_non_encoded_labels():
    """Check that silhouette results do not depend on the label values.

    Mapping the iris targets to ``target * 2 + 10`` keeps the same partition
    but uses non-consecutive label values; both the mean score and the
    per-sample values must be unchanged.
    """
    iris = datasets.load_iris()
    data, target = iris.data, iris.target
    relabelled = target * 2 + 10

    assert silhouette_score(data, relabelled) == silhouette_score(data, target)
    assert_array_equal(
        silhouette_samples(data, relabelled), silhouette_samples(data, target)
    )
|
||
|
|
||
|
|
||
|
def test_non_numpy_labels():
    """Check that silhouette_score gives the same result for list inputs."""
    iris = datasets.load_iris()
    data, target = iris.data, iris.target

    score_from_lists = silhouette_score(list(data), list(target))
    score_from_arrays = silhouette_score(data, target)
    assert score_from_lists == score_from_arrays
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_silhouette_nonzero_diag(dtype):
    """Check that silhouette_samples requires a zero diagonal when precomputed.

    Non-regression test for #12178.
    """
    # Start from a genuine distance matrix, whose diagonal is zero.
    column = np.array([[0.2, 0.1, 0.12, 1.34, 1.11, 1.6]], dtype=dtype).T
    dists = pairwise_distances(column)
    labels = [0, 0, 0, 1, 1, 1]
    machine_eps = np.finfo(dists.dtype).eps

    # Small values on the diagonal are OK: this call must not raise.
    dists[2, 2] = machine_eps * 10
    silhouette_samples(dists, labels, metric="precomputed")

    # Values bigger than eps * 100 are not.
    dists[2, 2] = machine_eps * 1000
    with pytest.raises(ValueError, match="contains non-zero"):
        silhouette_samples(dists, labels, metric="precomputed")
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "sparse_container",
    CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
def test_silhouette_samples_precomputed_sparse(sparse_container):
    """Check silhouette_samples on a sparse precomputed distance matrix.

    The result must match the one obtained from the equivalent dense matrix.
    """
    values = [0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]
    X = np.array([values], dtype=np.float32).T
    y = [0, 0, 0, 0, 1, 1, 1, 1]

    dense_distances = pairwise_distances(X)
    sparse_distances = sparse_container(dense_distances)
    assert issparse(sparse_distances)

    from_sparse = silhouette_samples(sparse_distances, y, metric="precomputed")
    from_dense = silhouette_samples(dense_distances, y, metric="precomputed")
    assert_allclose(from_sparse, from_dense)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "sparse_container",
    CSC_CONTAINERS + CSR_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS,
)
def test_silhouette_samples_euclidean_sparse(sparse_container):
    """Check silhouette_samples on sparse input with the default metric.

    The same matrix is passed once in sparse and once in dense form, without
    ``metric="precomputed"``; both calls must give matching results.
    """
    values = [0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]
    X = np.array([values], dtype=np.float32).T
    y = [0, 0, 0, 0, 1, 1, 1, 1]

    dense_input = pairwise_distances(X)
    sparse_input = sparse_container(dense_input)
    assert issparse(sparse_input)

    from_sparse = silhouette_samples(sparse_input, y)
    from_dense = silhouette_samples(dense_input, y)
    assert_allclose(from_sparse, from_dense)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
    "sparse_container", CSC_CONTAINERS + DOK_CONTAINERS + LIL_CONTAINERS
)
def test_silhouette_reduce(sparse_container):
    """Check that `_silhouette_reduce` rejects sparse input that is not CSR."""
    values = [0.2, 0.1, 0.1, 0.2, 0.1, 1.6, 0.2, 0.1]
    X = np.array([values], dtype=np.float32).T
    y = [0, 0, 0, 0, 1, 1, 1, 1]

    non_csr_distances = sparse_container(pairwise_distances(X))
    label_freqs = np.bincount(y)

    expected_message = "Expected CSR matrix. Please pass sparse matrix in CSR format."
    with pytest.raises(TypeError, match=expected_message):
        _silhouette_reduce(
            non_csr_distances, start=0, labels=y, label_freqs=label_freqs
        )
|
||
|
|
||
|
|
||
|
def assert_raises_on_only_one_label(func):
    """Assert that `func` raises a ValueError when there is only one label.

    `func` is called as ``func(X, labels)`` with 10 random 2-d samples that
    all carry label 0.
    """
    random_state = np.random.RandomState(seed=0)
    single_label = np.zeros(10)
    with pytest.raises(ValueError, match="Number of labels is"):
        func(random_state.rand(10, 2), single_label)
|
||
|
|
||
|
|
||
|
def assert_raises_on_all_points_same_cluster(func):
    """Assert that `func` raises a ValueError when every point has its own label.

    `func` is called as ``func(X, labels)`` with 10 random 2-d samples and
    labels ``0..9``, i.e. as many clusters as samples (despite what the
    helper's name suggests).
    """
    random_state = np.random.RandomState(seed=0)
    one_label_per_sample = np.arange(10)
    with pytest.raises(ValueError, match="Number of labels is"):
        func(random_state.rand(10, 2), one_label_per_sample)
|
||
|
|
||
|
|
||
|
def test_calinski_harabasz_score():
    """Check calinski_harabasz_score on invalid, degenerate and regular input."""
    # Invalid numbers of labels must raise.
    assert_raises_on_only_one_label(calinski_harabasz_score)
    assert_raises_on_all_points_same_cluster(calinski_harabasz_score)

    # The value is 1. when all samples are equal.
    assert 1.0 == calinski_harabasz_score(np.ones((10, 2)), [0] * 5 + [1] * 5)

    # The value is 0. when all the cluster means are equal.
    assert 0.0 == calinski_harabasz_score([[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10)

    # General case (with non numpy arrays): four clusters of ten points each.
    X = (
        [[0, 0], [1, 1]] * 5
        + [[3, 3], [4, 4]] * 5
        + [[0, 4], [1, 3]] * 5
        + [[3, 1], [4, 0]] * 5
    )
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    # The comparison has to be asserted: a bare ``pytest.approx(...)``
    # expression builds a comparator and discards it without checking anything.
    expected = 45 * (40 - 4) / (5 * (4 - 1))
    assert calinski_harabasz_score(X, labels) == pytest.approx(expected)
|
||
|
|
||
|
|
||
|
def test_davies_bouldin_score():
    """Check davies_bouldin_score on invalid, degenerate and regular input."""
    # Invalid numbers of labels must raise.
    assert_raises_on_only_one_label(davies_bouldin_score)
    assert_raises_on_all_points_same_cluster(davies_bouldin_score)

    # The value is 0. when all samples are equal.
    assert davies_bouldin_score(np.ones((10, 2)), [0] * 5 + [1] * 5) == pytest.approx(
        0.0
    )

    # The value is 0. when all the cluster means are equal.
    assert davies_bouldin_score(
        [[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10
    ) == pytest.approx(0.0)

    # General case (with non numpy arrays): four clusters of ten points each.
    X = (
        [[0, 0], [1, 1]] * 5
        + [[3, 3], [4, 4]] * 5
        + [[0, 4], [1, 3]] * 5
        + [[3, 1], [4, 0]] * 5
    )
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    # The comparison has to be asserted: a bare ``pytest.approx(...)``
    # expression builds a comparator and discards it without checking anything.
    assert davies_bouldin_score(X, labels) == pytest.approx(2 * np.sqrt(0.5) / 3)

    # Ensure divide by zero warning is not raised in general case:
    # RuntimeWarnings are promoted to errors for the duration of the call.
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        davies_bouldin_score(X, labels)

    # General case - clusters 1 and 2 have one sample each.
    X = [[0, 0], [2, 2], [3, 3], [5, 5]]
    labels = [0, 0, 1, 2]
    assert davies_bouldin_score(X, labels) == pytest.approx((5.0 / 4) / 3)
|
||
|
|
||
|
|
||
|
def test_silhouette_score_integer_precomputed():
    """Check silhouette_score on an integer-valued precomputed distance matrix.

    Non-regression test for #22107.
    """
    zero_diagonal = [[0, 1, 2], [1, 0, 1], [2, 1, 0]]
    score = silhouette_score(zero_diagonal, [0, 0, 1], metric="precomputed")
    assert score == pytest.approx(1 / 6)

    # A non-zero entry on the diagonal raises an error for integer input too.
    nonzero_diagonal = [[1, 1, 2], [1, 0, 1], [2, 1, 0]]
    with pytest.raises(ValueError, match="contains non-zero"):
        silhouette_score(nonzero_diagonal, [0, 0, 1], metric="precomputed")
|