import sys
from io import StringIO

import numpy as np
import pytest
from numpy.testing import assert_array_equal
from scipy.linalg import block_diag
from scipy.special import psi

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition._online_lda_fast import (
    _dirichlet_expectation_1d,
    _dirichlet_expectation_2d,
)
from sklearn.exceptions import NotFittedError
from sklearn.utils._testing import (
    assert_allclose,
    assert_almost_equal,
    assert_array_almost_equal,
    if_safe_multiprocessing_with_blas,
)
from sklearn.utils.fixes import CSR_CONTAINERS

# Word-index groups of the block-diagonal matrix built by
# `_build_sparse_array`: words 0-2, 3-5 and 6-8 each co-occur only with
# words of their own group.
_TOPIC_WORD_GROUPS = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]


def _build_sparse_array(csr_container):
    """Return `(n_components, X)` for a 9x9 block-diagonal count matrix.

    `X` is made of three 3x3 blocks filled with the value 3 and is wrapped
    in `csr_container` before being returned.
    """
    # Create 3 topics and each topic has 3 distinct words.
    # (Each word only belongs to a single topic.)
    n_components = 3
    block = np.full((3, 3), n_components, dtype=int)
    blocks = [block] * n_components
    X = block_diag(*blocks)
    X = csr_container(X)
    return (n_components, X)


def _assert_topics_recovered(components):
    """Assert the top 3 words of every component form one word group.

    `components` is iterated row by row; for each row the indices of the 3
    largest entries must match one of `_TOPIC_WORD_GROUPS`.
    """
    for component in components:
        # argsort is ascending, so the last 3 indices are the top 3 words.
        top_idx = tuple(sorted(component.argsort()[-3:].tolist()))
        assert top_idx in _TOPIC_WORD_GROUPS


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_default_prior_params(csr_container):
    # default prior parameter should be `1 / topics`
    # and verbose params should not affect result
    n_components, X = _build_sparse_array(csr_container)
    prior = 1.0 / n_components
    lda_1 = LatentDirichletAllocation(
        n_components=n_components,
        doc_topic_prior=prior,
        topic_word_prior=prior,
        random_state=0,
    )
    lda_2 = LatentDirichletAllocation(n_components=n_components, random_state=0)
    topic_distr_1 = lda_1.fit_transform(X)
    topic_distr_2 = lda_2.fit_transform(X)
    assert_almost_equal(topic_distr_1, topic_distr_2)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_fit_batch(csr_container):
    # Test LDA batch learning (`fit` method with 'batch' learning)
    rng = np.random.RandomState(0)
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        evaluate_every=1,
        learning_method="batch",
        random_state=rng,
    )
    lda.fit(X)

    _assert_topics_recovered(lda.components_)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_fit_online(csr_container):
    # Test LDA online learning (`fit` method with 'online' learning)
    rng = np.random.RandomState(0)
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        learning_offset=10.0,
        evaluate_every=1,
        learning_method="online",
        random_state=rng,
    )
    lda.fit(X)

    _assert_topics_recovered(lda.components_)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_partial_fit(csr_container):
    # Test LDA online learning (`partial_fit` method)
    # (same expectation as the `fit`-based tests above)
    rng = np.random.RandomState(0)
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        learning_offset=10.0,
        total_samples=100,
        random_state=rng,
    )
    for _ in range(3):
        lda.partial_fit(X)

    _assert_topics_recovered(lda.components_)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_dense_input(csr_container):
    # Test LDA with dense input.
    rng = np.random.RandomState(0)
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components, learning_method="batch", random_state=rng
    )
    lda.fit(X.toarray())

    _assert_topics_recovered(lda.components_)


def test_lda_transform():
    # Test LDA transform.
    # Transform result cannot be negative and should be normalized
    rng = np.random.RandomState(0)
    X = rng.randint(5, size=(20, 10))
    n_components = 3
    lda = LatentDirichletAllocation(n_components=n_components, random_state=rng)
    X_trans = lda.fit_transform(X)
    # Every entry must be non-negative; checking `.any()` would pass as soon
    # as a single entry is positive and would not catch negative values.
    assert (X_trans >= 0.0).all()
    assert_array_almost_equal(np.sum(X_trans, axis=1), np.ones(X_trans.shape[0]))


@pytest.mark.parametrize("method", ("online", "batch"))
def test_lda_fit_transform(method):
    # Test LDA fit_transform & transform
    # fit_transform and transform result should be the same
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(50, 20))
    lda = LatentDirichletAllocation(
        n_components=5, learning_method=method, random_state=rng
    )
    X_fit = lda.fit_transform(X)
    X_trans = lda.transform(X)
    assert_array_almost_equal(X_fit, X_trans, 4)


def test_lda_negative_input():
    # Fitting on a dense matrix that contains negative values must raise.
    X = np.full((5, 10), -1.0)
    lda = LatentDirichletAllocation()
    regex = r"^Negative values in data passed"
    with pytest.raises(ValueError, match=regex):
        lda.fit(X)


def test_lda_no_component_error():
    # test `perplexity` before `fit`
    rng = np.random.RandomState(0)
    X = rng.randint(4, size=(20, 10))
    lda = LatentDirichletAllocation()
    regex = (
        "This LatentDirichletAllocation instance is not fitted yet. "
        "Call 'fit' with appropriate arguments before using this "
        "estimator."
    )
    with pytest.raises(NotFittedError, match=regex):
        lda.perplexity(X)


@if_safe_multiprocessing_with_blas
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("method", ("online", "batch"))
def test_lda_multi_jobs(method, csr_container):
    n_components, X = _build_sparse_array(csr_container)
    # Test LDA `fit` (online and batch) with multi CPU
    rng = np.random.RandomState(0)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        n_jobs=2,
        learning_method=method,
        evaluate_every=1,
        random_state=rng,
    )
    lda.fit(X)

    _assert_topics_recovered(lda.components_)


@if_safe_multiprocessing_with_blas
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_partial_fit_multi_jobs(csr_container):
    # Test LDA online training with multi CPU
    rng = np.random.RandomState(0)
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        n_jobs=2,
        learning_offset=5.0,
        total_samples=30,
        random_state=rng,
    )
    for _ in range(2):
        lda.partial_fit(X)

    _assert_topics_recovered(lda.components_)


def test_lda_preplexity_mismatch():
    # test dimension mismatch in `perplexity` method
    rng = np.random.RandomState(0)
    n_components = rng.randint(3, 6)
    n_samples = rng.randint(6, 10)
    # Draw from the seeded generator (not the global `np.random`) so the
    # test data is reproducible.
    X = rng.randint(4, size=(n_samples, 10))
    lda = LatentDirichletAllocation(
        n_components=n_components,
        learning_offset=5.0,
        total_samples=20,
        random_state=rng,
    )
    lda.fit(X)
    # invalid samples
    invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components))
    with pytest.raises(ValueError, match=r"Number of samples"):
        lda._perplexity_precomp_distr(X, invalid_n_samples)
    # invalid topic number
    invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1))
    with pytest.raises(ValueError, match=r"Number of topics"):
        lda._perplexity_precomp_distr(X, invalid_n_components)


@pytest.mark.parametrize("method", ("online", "batch"))
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_perplexity(method, csr_container):
    # Test LDA perplexity for online and batch training
    # perplexity after 10 iterations should not exceed the one after 1
    n_components, X = _build_sparse_array(csr_container)
    lda_1 = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=1,
        learning_method=method,
        total_samples=100,
        random_state=0,
    )
    lda_2 = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=10,
        learning_method=method,
        total_samples=100,
        random_state=0,
    )
    lda_1.fit(X)
    perp_1 = lda_1.perplexity(X, sub_sampling=False)

    lda_2.fit(X)
    perp_2 = lda_2.perplexity(X, sub_sampling=False)
    assert perp_1 >= perp_2

    perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)
    perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)
    assert perp_1_subsampling >= perp_2_subsampling


@pytest.mark.parametrize("method", ("online", "batch"))
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_score(method, csr_container):
    # Test LDA score for online and batch training
    # score after 10 iterations should not be lower than the one after 1
    n_components, X = _build_sparse_array(csr_container)
    lda_1 = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=1,
        learning_method=method,
        total_samples=100,
        random_state=0,
    )
    lda_2 = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=10,
        learning_method=method,
        total_samples=100,
        random_state=0,
    )
    lda_1.fit_transform(X)
    score_1 = lda_1.score(X)

    lda_2.fit_transform(X)
    score_2 = lda_2.score(X)
    assert score_2 >= score_1


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_perplexity_input_format(csr_container):
    # Test LDA perplexity for sparse and dense input
    # score should be the same for both dense and sparse input
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=1,
        learning_method="batch",
        total_samples=100,
        random_state=0,
    )
    lda.fit(X)
    perp_1 = lda.perplexity(X)
    perp_2 = lda.perplexity(X.toarray())
    assert_almost_equal(perp_1, perp_2)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_score_perplexity(csr_container):
    # Test the relationship between LDA score and perplexity
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components, max_iter=10, random_state=0
    )
    lda.fit(X)
    perplexity_1 = lda.perplexity(X, sub_sampling=False)

    score = lda.score(X)
    # perplexity = exp(-score / total word count)
    perplexity_2 = np.exp(-1.0 * (score / np.sum(X.data)))
    assert_almost_equal(perplexity_1, perplexity_2)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_fit_perplexity(csr_container):
    # Test that the perplexity computed during fit is consistent with what is
    # returned by the perplexity method
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=1,
        learning_method="batch",
        random_state=0,
        evaluate_every=1,
    )
    lda.fit(X)

    # Perplexity computed at end of fit method
    perplexity1 = lda.bound_

    # Result of perplexity method on the train set
    perplexity2 = lda.perplexity(X)

    assert_almost_equal(perplexity1, perplexity2)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_empty_docs(csr_container):
    """Test LDA on empty document (all-zero rows)."""
    Z = np.zeros((5, 4))
    for X in [Z, csr_container(Z)]:
        lda = LatentDirichletAllocation(max_iter=750).fit(X)
        assert_almost_equal(
            lda.components_.sum(axis=0), np.ones(lda.components_.shape[1])
        )


def test_dirichlet_expectation():
    """Test Cython version of Dirichlet expectation calculation."""
    x = np.logspace(-100, 10, 10000)
    expectation = np.empty_like(x)
    # The 1d variant writes its result into `expectation` in place.
    _dirichlet_expectation_1d(x, 0, expectation)
    assert_allclose(expectation, np.exp(psi(x) - psi(np.sum(x))), atol=1e-19)

    x = x.reshape(100, 100)
    assert_allclose(
        _dirichlet_expectation_2d(x),
        psi(x) - psi(np.sum(x, axis=1)[:, np.newaxis]),
        rtol=1e-11,
        atol=3e-9,
    )


def check_verbosity(
    verbose, evaluate_every, expected_lines, expected_perplexities, csr_container
):
    """Fit LDA with stdout captured and check what was printed.

    Asserts that the captured output contains `expected_lines` newlines and
    `expected_perplexities` occurrences of the word "perplexity".
    """
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=3,
        learning_method="batch",
        verbose=verbose,
        evaluate_every=evaluate_every,
        random_state=0,
    )
    out = StringIO()
    old_out, sys.stdout = sys.stdout, out
    try:
        lda.fit(X)
    finally:
        # Always restore the real stdout, even if `fit` raises.
        sys.stdout = old_out

    n_lines = out.getvalue().count("\n")
    n_perplexity = out.getvalue().count("perplexity")
    assert expected_lines == n_lines
    assert expected_perplexities == n_perplexity


@pytest.mark.parametrize(
    "verbose,evaluate_every,expected_lines,expected_perplexities",
    [
        (False, 1, 0, 0),
        (False, 0, 0, 0),
        (True, 0, 3, 0),
        (True, 1, 3, 3),
        (True, 2, 3, 1),
    ],
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_verbosity(
    verbose, evaluate_every, expected_lines, expected_perplexities, csr_container
):
    check_verbosity(
        verbose, evaluate_every, expected_lines, expected_perplexities, csr_container
    )


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_feature_names_out(csr_container):
    """Check feature names out for LatentDirichletAllocation."""
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(n_components=n_components).fit(X)

    names = lda.get_feature_names_out()
    assert_array_equal(
        [f"latentdirichletallocation{i}" for i in range(n_components)], names
    )


@pytest.mark.parametrize("learning_method", ("batch", "online"))
def test_lda_dtype_match(learning_method, global_dtype):
    """Check data type preservation of fitted attributes."""
    rng = np.random.RandomState(0)
    X = rng.uniform(size=(20, 10)).astype(global_dtype, copy=False)

    lda = LatentDirichletAllocation(
        n_components=5, random_state=0, learning_method=learning_method
    )
    lda.fit(X)
    assert lda.components_.dtype == global_dtype
    assert lda.exp_dirichlet_component_.dtype == global_dtype


@pytest.mark.parametrize("learning_method", ("batch", "online"))
def test_lda_numerical_consistency(learning_method, global_random_seed):
    """Check numerical consistency between np.float32 and np.float64."""
    # Same uniform data in both precisions; the float32 copy is derived from
    # the float64 draw.
    data_64 = np.random.RandomState(global_random_seed).uniform(size=(20, 10))
    data_32 = data_64.astype(np.float32)

    def _fit(data):
        # Both models share the seed and learning method; only the input
        # dtype differs.
        estimator = LatentDirichletAllocation(
            n_components=5,
            random_state=global_random_seed,
            learning_method=learning_method,
        )
        return estimator.fit(data)

    model_64 = _fit(data_64)
    model_32 = _fit(data_32)

    assert_allclose(model_32.components_, model_64.components_)
    assert_allclose(model_32.transform(data_32), model_64.transform(data_64))