ai-content-maker/.venv/Lib/site-packages/sklearn/decomposition/tests/test_online_lda.py

import sys
from io import StringIO

import numpy as np
import pytest
from numpy.testing import assert_array_equal
from scipy.linalg import block_diag
from scipy.special import psi

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition._online_lda_fast import (
    _dirichlet_expectation_1d,
    _dirichlet_expectation_2d,
)
from sklearn.exceptions import NotFittedError
from sklearn.utils._testing import (
    assert_allclose,
    assert_almost_equal,
    assert_array_almost_equal,
    if_safe_multiprocessing_with_blas,
)
from sklearn.utils.fixes import CSR_CONTAINERS


def _build_sparse_array(csr_container):
    # Create 3 topics and each topic has 3 distinct words.
    # (Each word only belongs to a single topic.)
    n_components = 3
    block = np.full((3, 3), n_components, dtype=int)
    blocks = [block] * n_components
    X = block_diag(*blocks)
    X = csr_container(X)
    return (n_components, X)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_default_prior_params(csr_container):
    """Passing ``1 / n_components`` as both priors must match the defaults."""
    n_components, X = _build_sparse_array(csr_container)
    explicit_prior = 1.0 / n_components

    lda_explicit = LatentDirichletAllocation(
        n_components=n_components,
        doc_topic_prior=explicit_prior,
        topic_word_prior=explicit_prior,
        random_state=0,
    )
    lda_default = LatentDirichletAllocation(
        n_components=n_components, random_state=0
    )

    distr_explicit = lda_explicit.fit_transform(X)
    distr_default = lda_default.fit_transform(X)
    assert_almost_equal(distr_explicit, distr_default)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_fit_batch(csr_container):
    """Fit with ``learning_method="batch"`` and check the recovered topics.

    The three highest-weighted words of every fitted component must be one
    of the word groups (0, 1, 2), (3, 4, 5) or (6, 7, 8).
    """
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        evaluate_every=1,
        learning_method="batch",
        random_state=np.random.RandomState(0),
    )
    lda.fit(X)

    expected_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for topic_weights in lda.components_:
        # argsort is ascending, so the last three indices are the top words
        strongest_words = topic_weights.argsort()[-3:]
        assert tuple(sorted(strongest_words)) in expected_groups


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_fit_online(csr_container):
    """Fit with ``learning_method="online"`` and check the recovered topics.

    The three highest-weighted words of every fitted component must be one
    of the word groups (0, 1, 2), (3, 4, 5) or (6, 7, 8).
    """
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        learning_offset=10.0,
        evaluate_every=1,
        learning_method="online",
        random_state=np.random.RandomState(0),
    )
    lda.fit(X)

    expected_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for topic_weights in lda.components_:
        # argsort is ascending, so the last three indices are the top words
        strongest_words = topic_weights.argsort()[-3:]
        assert tuple(sorted(strongest_words)) in expected_groups


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_partial_fit(csr_container):
    """Train through three ``partial_fit`` calls and check the recovered topics.

    The acceptance criterion is the same as for the ``fit`` based tests: the
    top three words of every component must form one of the word groups.
    """
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        learning_offset=10.0,
        total_samples=100,
        random_state=np.random.RandomState(0),
    )
    for _ in range(3):
        lda.partial_fit(X)

    expected_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for topic_weights in lda.components_:
        strongest_words = topic_weights.argsort()[-3:]
        assert tuple(sorted(strongest_words)) in expected_groups


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_dense_input(csr_container):
    """Fit on the densified toy corpus and check the recovered topics."""
    n_components, X_sparse = _build_sparse_array(csr_container)
    X_dense = X_sparse.toarray()
    lda = LatentDirichletAllocation(
        n_components=n_components,
        learning_method="batch",
        random_state=np.random.RandomState(0),
    )
    lda.fit(X_dense)

    expected_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for topic_weights in lda.components_:
        # argsort is ascending, so the last three indices are the top words
        strongest_words = topic_weights.argsort()[-3:]
        assert tuple(sorted(strongest_words)) in expected_groups


def test_lda_transform():
    """Check that ``fit_transform`` returns valid per-document distributions.

    Every entry of the transformed matrix must be non-negative and every row
    must sum to one.
    """
    rng = np.random.RandomState(0)
    X = rng.randint(5, size=(20, 10))
    n_components = 3
    lda = LatentDirichletAllocation(n_components=n_components, random_state=rng)
    X_trans = lda.fit_transform(X)
    # Use `.all()`: with `.any()` a single positive entry would let negative
    # values elsewhere in the matrix go unnoticed.
    assert (X_trans >= 0.0).all()
    assert_array_almost_equal(np.sum(X_trans, axis=1), np.ones(X_trans.shape[0]))


@pytest.mark.parametrize("method", ("online", "batch"))
def test_lda_fit_transform(method):
    """``fit_transform(X)`` and a later ``transform(X)`` must agree to 4 decimals."""
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(50, 20))
    lda = LatentDirichletAllocation(
        n_components=5,
        learning_method=method,
        random_state=rng,
    )
    distr_from_fit = lda.fit_transform(X)
    distr_from_transform = lda.transform(X)
    assert_array_almost_equal(distr_from_fit, distr_from_transform, decimal=4)


def test_lda_negative_input():
    """Fitting on a dense matrix filled with -1.0 must raise ``ValueError``."""
    all_negative = np.full((5, 10), -1.0)
    lda = LatentDirichletAllocation()
    with pytest.raises(ValueError, match=r"^Negative values in data passed"):
        lda.fit(all_negative)


def test_lda_no_component_error():
    """Calling ``perplexity`` on an unfitted estimator must raise ``NotFittedError``."""
    X = np.random.RandomState(0).randint(4, size=(20, 10))
    unfitted_lda = LatentDirichletAllocation()
    expected_message = (
        "This LatentDirichletAllocation instance is not fitted yet. Call 'fit' "
        "with appropriate arguments before using this estimator."
    )
    with pytest.raises(NotFittedError, match=expected_message):
        unfitted_lda.perplexity(X)


@if_safe_multiprocessing_with_blas
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("method", ("online", "batch"))
def test_lda_multi_jobs(method, csr_container):
    """Fit with ``n_jobs=2`` for each learning method and check the topics."""
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        n_jobs=2,
        learning_method=method,
        evaluate_every=1,
        random_state=np.random.RandomState(0),
    )
    lda.fit(X)

    expected_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for topic_weights in lda.components_:
        # argsort is ascending, so the last three indices are the top words
        strongest_words = topic_weights.argsort()[-3:]
        assert tuple(sorted(strongest_words)) in expected_groups


@if_safe_multiprocessing_with_blas
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_partial_fit_multi_jobs(csr_container):
    """Train through two ``partial_fit`` calls with ``n_jobs=2`` and check the topics."""
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        n_jobs=2,
        learning_offset=5.0,
        total_samples=30,
        random_state=np.random.RandomState(0),
    )
    for _ in range(2):
        lda.partial_fit(X)

    expected_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for topic_weights in lda.components_:
        strongest_words = topic_weights.argsort()[-3:]
        assert tuple(sorted(strongest_words)) in expected_groups


def test_lda_preplexity_mismatch():
    """Check the shape validation done by ``_perplexity_precomp_distr``.

    A precomputed document-topic matrix with the wrong number of rows or the
    wrong number of columns must be rejected with a ``ValueError``.
    """
    rng = np.random.RandomState(0)
    n_components = rng.randint(3, 6)
    n_samples = rng.randint(6, 10)
    # Draw the data from the seeded generator (not the global `np.random`
    # state) so that the test input is reproducible from run to run.
    X = rng.randint(4, size=(n_samples, 10))
    lda = LatentDirichletAllocation(
        n_components=n_components,
        learning_offset=5.0,
        total_samples=20,
        random_state=rng,
    )
    lda.fit(X)
    # invalid samples: one row too many
    invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components))
    with pytest.raises(ValueError, match=r"Number of samples"):
        lda._perplexity_precomp_distr(X, invalid_n_samples)
    # invalid topic number: one column too many
    invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1))
    with pytest.raises(ValueError, match=r"Number of topics"):
        lda._perplexity_precomp_distr(X, invalid_n_components)


@pytest.mark.parametrize("method", ("online", "batch"))
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_perplexity(method, csr_container):
    """Perplexity after 10 iterations must not exceed perplexity after 1.

    The comparison is made both with and without ``sub_sampling``.
    """
    n_components, X = _build_sparse_array(csr_container)
    shared_params = dict(
        n_components=n_components,
        learning_method=method,
        total_samples=100,
        random_state=0,
    )
    lda_short = LatentDirichletAllocation(max_iter=1, **shared_params)
    lda_long = LatentDirichletAllocation(max_iter=10, **shared_params)

    lda_short.fit(X)
    perp_short = lda_short.perplexity(X, sub_sampling=False)

    lda_long.fit(X)
    perp_long = lda_long.perplexity(X, sub_sampling=False)
    assert perp_short >= perp_long

    perp_short_sub = lda_short.perplexity(X, sub_sampling=True)
    perp_long_sub = lda_long.perplexity(X, sub_sampling=True)
    assert perp_short_sub >= perp_long_sub


@pytest.mark.parametrize("method", ("online", "batch"))
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_score(method, csr_container):
    """Score after 10 iterations must be at least the score after 1 iteration."""
    n_components, X = _build_sparse_array(csr_container)
    shared_params = dict(
        n_components=n_components,
        learning_method=method,
        total_samples=100,
        random_state=0,
    )
    lda_short = LatentDirichletAllocation(max_iter=1, **shared_params)
    lda_long = LatentDirichletAllocation(max_iter=10, **shared_params)

    lda_short.fit_transform(X)
    score_short = lda_short.score(X)

    lda_long.fit_transform(X)
    score_long = lda_long.score(X)

    assert score_long >= score_short


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_perplexity_input_format(csr_container):
    """Perplexity must be the same for sparse input and its dense equivalent."""
    n_components, X_sparse = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=1,
        learning_method="batch",
        total_samples=100,
        random_state=0,
    )
    lda.fit(X_sparse)

    perp_sparse = lda.perplexity(X_sparse)
    perp_dense = lda.perplexity(X_sparse.toarray())
    assert_almost_equal(perp_sparse, perp_dense)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_score_perplexity(csr_container):
    """Check that perplexity equals ``exp(-score / sum(X.data))``."""
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=10,
        random_state=0,
    )
    lda.fit(X)

    perplexity_direct = lda.perplexity(X, sub_sampling=False)

    score = lda.score(X)
    total_count = np.sum(X.data)
    perplexity_from_score = np.exp(-1.0 * (score / total_count))

    assert_almost_equal(perplexity_direct, perplexity_from_score)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_fit_perplexity(csr_container):
    """``bound_`` stored by ``fit`` must match ``perplexity`` on the training set."""
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=1,
        learning_method="batch",
        random_state=0,
        evaluate_every=1,
    )
    lda.fit(X)

    # Value recorded by `fit` versus a fresh evaluation on the same data
    perplexity_at_fit = lda.bound_
    perplexity_on_train = lda.perplexity(X)

    assert_almost_equal(perplexity_at_fit, perplexity_on_train)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_empty_docs(csr_container):
    """Fit on all-zero documents, given both as dense and as sparse input.

    After fitting, every column of ``components_`` must sum to one.
    """
    empty_corpus = np.zeros((5, 4))
    for X in (empty_corpus, csr_container(empty_corpus)):
        lda = LatentDirichletAllocation(max_iter=750)
        lda.fit(X)
        column_totals = lda.components_.sum(axis=0)
        assert_almost_equal(column_totals, np.ones(lda.components_.shape[1]))


def test_dirichlet_expectation():
    """Compare the Dirichlet expectation helpers with ``scipy.special.psi``.

    The 1d helper fills its output with ``exp(psi(x) - psi(sum(x)))``; the 2d
    helper returns ``psi(x) - psi(row sums)`` without the exponential.
    """
    values = np.logspace(-100, 10, 10000)
    out_1d = np.empty_like(values)
    _dirichlet_expectation_1d(values, 0, out_1d)
    reference_1d = np.exp(psi(values) - psi(np.sum(values)))
    assert_allclose(out_1d, reference_1d, atol=1e-19)

    values_2d = values.reshape(100, 100)
    row_totals = np.sum(values_2d, axis=1)[:, np.newaxis]
    assert_allclose(
        _dirichlet_expectation_2d(values_2d),
        psi(values_2d) - psi(row_totals),
        rtol=1e-11,
        atol=3e-9,
    )


def check_verbosity(
    verbose, evaluate_every, expected_lines, expected_perplexities, csr_container
):
    """Fit a 3-iteration batch LDA and check what it writes to stdout.

    ``expected_lines`` is compared with the number of newlines printed and
    ``expected_perplexities`` with the number of occurrences of the word
    "perplexity" in the captured output.
    """
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=3,
        learning_method="batch",
        verbose=verbose,
        evaluate_every=evaluate_every,
        random_state=0,
    )

    captured = StringIO()
    original_stdout = sys.stdout
    sys.stdout = captured
    try:
        lda.fit(X)
    finally:
        # Always restore the real stdout, even if `fit` raises
        sys.stdout = original_stdout

    printed = captured.getvalue()
    line_count = printed.count("\n")
    perplexity_count = printed.count("perplexity")
    assert expected_lines == line_count
    assert expected_perplexities == perplexity_count


@pytest.mark.parametrize(
    "verbose,evaluate_every,expected_lines,expected_perplexities",
    [
        (False, 1, 0, 0),
        (False, 0, 0, 0),
        (True, 0, 3, 0),
        (True, 1, 3, 3),
        (True, 2, 3, 1),
    ],
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_verbosity(
    verbose, evaluate_every, expected_lines, expected_perplexities, csr_container
):
    """Run ``check_verbosity`` for each verbose / evaluate_every combination."""
    check_verbosity(
        verbose=verbose,
        evaluate_every=evaluate_every,
        expected_lines=expected_lines,
        expected_perplexities=expected_perplexities,
        csr_container=csr_container,
    )


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_lda_feature_names_out(csr_container):
    """Output feature names must be ``latentdirichletallocation<i>`` per component."""
    n_components, X = _build_sparse_array(csr_container)
    lda = LatentDirichletAllocation(n_components=n_components)
    lda.fit(X)

    expected_names = [f"latentdirichletallocation{i}" for i in range(n_components)]
    assert_array_equal(expected_names, lda.get_feature_names_out())


@pytest.mark.parametrize("learning_method", ("batch", "online"))
def test_lda_dtype_match(learning_method, global_dtype):
    """Fitted ``components_`` and ``exp_dirichlet_component_`` keep the input dtype."""
    X = np.random.RandomState(0).uniform(size=(20, 10))
    X = X.astype(global_dtype, copy=False)

    lda = LatentDirichletAllocation(
        n_components=5,
        random_state=0,
        learning_method=learning_method,
    )
    lda.fit(X)

    assert lda.components_.dtype == global_dtype
    assert lda.exp_dirichlet_component_.dtype == global_dtype


@pytest.mark.parametrize("learning_method", ("batch", "online"))
def test_lda_numerical_consistency(learning_method, global_random_seed):
    """Models fitted on float32 and float64 copies of the data must agree.

    Both ``components_`` and the transformed training data are compared.
    """
    X64 = np.random.RandomState(global_random_seed).uniform(size=(20, 10))
    X32 = X64.astype(np.float32)

    estimator_params = dict(
        n_components=5,
        random_state=global_random_seed,
        learning_method=learning_method,
    )
    lda_64 = LatentDirichletAllocation(**estimator_params).fit(X64)
    lda_32 = LatentDirichletAllocation(**estimator_params).fit(X32)

    assert_allclose(lda_32.components_, lda_64.components_)
    assert_allclose(lda_32.transform(X32), lda_64.transform(X64))
first commit 2024-05-03 04:18:51 +03:00			`import sys`
			`from io import StringIO`

			`import numpy as np`
			`import pytest`
			`from numpy.testing import assert_array_equal`
			`from scipy.linalg import block_diag`
			`from scipy.special import psi`

			`from sklearn.decomposition import LatentDirichletAllocation`
			`from sklearn.decomposition._online_lda_fast import (`
			`_dirichlet_expectation_1d,`
			`_dirichlet_expectation_2d,`
			`)`
			`from sklearn.exceptions import NotFittedError`
			`from sklearn.utils._testing import (`
			`assert_allclose,`
			`assert_almost_equal,`
			`assert_array_almost_equal,`
			`if_safe_multiprocessing_with_blas,`
			`)`
			`from sklearn.utils.fixes import CSR_CONTAINERS`


			`def _build_sparse_array(csr_container):`
			`# Create 3 topics and each topic has 3 distinct words.`
			`# (Each word only belongs to a single topic.)`
			`n_components = 3`
			`block = np.full((3, 3), n_components, dtype=int)`
			`blocks = [block] * n_components`
			`X = block_diag(*blocks)`
			`X = csr_container(X)`
			`return (n_components, X)`


			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`def test_lda_default_prior_params(csr_container):`
			# default prior parameter should be `1 / topics`
			`# and verbose params should not affect result`
			`n_components, X = _build_sparse_array(csr_container)`
			`prior = 1.0 / n_components`
			`lda_1 = LatentDirichletAllocation(`
			`n_components=n_components,`
			`doc_topic_prior=prior,`
			`topic_word_prior=prior,`
			`random_state=0,`
			`)`
			`lda_2 = LatentDirichletAllocation(n_components=n_components, random_state=0)`
			`topic_distr_1 = lda_1.fit_transform(X)`
			`topic_distr_2 = lda_2.fit_transform(X)`
			`assert_almost_equal(topic_distr_1, topic_distr_2)`


			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`def test_lda_fit_batch(csr_container):`
			# Test LDA batch learning_offset (`fit` method with 'batch' learning)
			`rng = np.random.RandomState(0)`
			`n_components, X = _build_sparse_array(csr_container)`
			`lda = LatentDirichletAllocation(`
			`n_components=n_components,`
			`evaluate_every=1,`
			`learning_method="batch",`
			`random_state=rng,`
			`)`
			`lda.fit(X)`

			`correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]`
			`for component in lda.components_:`
			`# Find top 3 words in each LDA component`
			`top_idx = set(component.argsort()[-3:][::-1])`
			`assert tuple(sorted(top_idx)) in correct_idx_grps`


			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`def test_lda_fit_online(csr_container):`
			# Test LDA online learning (`fit` method with 'online' learning)
			`rng = np.random.RandomState(0)`
			`n_components, X = _build_sparse_array(csr_container)`
			`lda = LatentDirichletAllocation(`
			`n_components=n_components,`
			`learning_offset=10.0,`
			`evaluate_every=1,`
			`learning_method="online",`
			`random_state=rng,`
			`)`
			`lda.fit(X)`

			`correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]`
			`for component in lda.components_:`
			`# Find top 3 words in each LDA component`
			`top_idx = set(component.argsort()[-3:][::-1])`
			`assert tuple(sorted(top_idx)) in correct_idx_grps`


			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`def test_lda_partial_fit(csr_container):`
			# Test LDA online learning (`partial_fit` method)
			`# (same as test_lda_batch)`
			`rng = np.random.RandomState(0)`
			`n_components, X = _build_sparse_array(csr_container)`
			`lda = LatentDirichletAllocation(`
			`n_components=n_components,`
			`learning_offset=10.0,`
			`total_samples=100,`
			`random_state=rng,`
			`)`
			`for i in range(3):`
			`lda.partial_fit(X)`

			`correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]`
			`for c in lda.components_:`
			`top_idx = set(c.argsort()[-3:][::-1])`
			`assert tuple(sorted(top_idx)) in correct_idx_grps`


			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`def test_lda_dense_input(csr_container):`
			`# Test LDA with dense input.`
			`rng = np.random.RandomState(0)`
			`n_components, X = _build_sparse_array(csr_container)`
			`lda = LatentDirichletAllocation(`
			`n_components=n_components, learning_method="batch", random_state=rng`
			`)`
			`lda.fit(X.toarray())`

			`correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]`
			`for component in lda.components_:`
			`# Find top 3 words in each LDA component`
			`top_idx = set(component.argsort()[-3:][::-1])`
			`assert tuple(sorted(top_idx)) in correct_idx_grps`


			`def test_lda_transform():`
			`# Test LDA transform.`
			`# Transform result cannot be negative and should be normalized`
			`rng = np.random.RandomState(0)`
			`X = rng.randint(5, size=(20, 10))`
			`n_components = 3`
			`lda = LatentDirichletAllocation(n_components=n_components, random_state=rng)`
			`X_trans = lda.fit_transform(X)`
			`assert (X_trans > 0.0).any()`
			`assert_array_almost_equal(np.sum(X_trans, axis=1), np.ones(X_trans.shape[0]))`


			`@pytest.mark.parametrize("method", ("online", "batch"))`
			`def test_lda_fit_transform(method):`
			`# Test LDA fit_transform & transform`
			`# fit_transform and transform result should be the same`
			`rng = np.random.RandomState(0)`
			`X = rng.randint(10, size=(50, 20))`
			`lda = LatentDirichletAllocation(`
			`n_components=5, learning_method=method, random_state=rng`
			`)`
			`X_fit = lda.fit_transform(X)`
			`X_trans = lda.transform(X)`
			`assert_array_almost_equal(X_fit, X_trans, 4)`


			`def test_lda_negative_input():`
			`# test pass dense matrix with sparse negative input.`
			`X = np.full((5, 10), -1.0)`
			`lda = LatentDirichletAllocation()`
			`regex = r"^Negative values in data passed"`
			`with pytest.raises(ValueError, match=regex):`
			`lda.fit(X)`


			`def test_lda_no_component_error():`
			# test `perplexity` before `fit`
			`rng = np.random.RandomState(0)`
			`X = rng.randint(4, size=(20, 10))`
			`lda = LatentDirichletAllocation()`
			`regex = (`
			`"This LatentDirichletAllocation instance is not fitted yet. "`
			`"Call 'fit' with appropriate arguments before using this "`
			`"estimator."`
			`)`
			`with pytest.raises(NotFittedError, match=regex):`
			`lda.perplexity(X)`


			`@if_safe_multiprocessing_with_blas`
			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`@pytest.mark.parametrize("method", ("online", "batch"))`
			`def test_lda_multi_jobs(method, csr_container):`
			`n_components, X = _build_sparse_array(csr_container)`
			`# Test LDA batch training with multi CPU`
			`rng = np.random.RandomState(0)`
			`lda = LatentDirichletAllocation(`
			`n_components=n_components,`
			`n_jobs=2,`
			`learning_method=method,`
			`evaluate_every=1,`
			`random_state=rng,`
			`)`
			`lda.fit(X)`

			`correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]`
			`for c in lda.components_:`
			`top_idx = set(c.argsort()[-3:][::-1])`
			`assert tuple(sorted(top_idx)) in correct_idx_grps`


			`@if_safe_multiprocessing_with_blas`
			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`def test_lda_partial_fit_multi_jobs(csr_container):`
			`# Test LDA online training with multi CPU`
			`rng = np.random.RandomState(0)`
			`n_components, X = _build_sparse_array(csr_container)`
			`lda = LatentDirichletAllocation(`
			`n_components=n_components,`
			`n_jobs=2,`
			`learning_offset=5.0,`
			`total_samples=30,`
			`random_state=rng,`
			`)`
			`for i in range(2):`
			`lda.partial_fit(X)`

			`correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]`
			`for c in lda.components_:`
			`top_idx = set(c.argsort()[-3:][::-1])`
			`assert tuple(sorted(top_idx)) in correct_idx_grps`


			`def test_lda_preplexity_mismatch():`
			# test dimension mismatch in `perplexity` method
			`rng = np.random.RandomState(0)`
			`n_components = rng.randint(3, 6)`
			`n_samples = rng.randint(6, 10)`
			`X = np.random.randint(4, size=(n_samples, 10))`
			`lda = LatentDirichletAllocation(`
			`n_components=n_components,`
			`learning_offset=5.0,`
			`total_samples=20,`
			`random_state=rng,`
			`)`
			`lda.fit(X)`
			`# invalid samples`
			`invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components))`
			`with pytest.raises(ValueError, match=r"Number of samples"):`
			`lda._perplexity_precomp_distr(X, invalid_n_samples)`
			`# invalid topic number`
			`invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1))`
			`with pytest.raises(ValueError, match=r"Number of topics"):`
			`lda._perplexity_precomp_distr(X, invalid_n_components)`


			`@pytest.mark.parametrize("method", ("online", "batch"))`
			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`def test_lda_perplexity(method, csr_container):`
			`# Test LDA perplexity for batch training`
			`# perplexity should be lower after each iteration`
			`n_components, X = _build_sparse_array(csr_container)`
			`lda_1 = LatentDirichletAllocation(`
			`n_components=n_components,`
			`max_iter=1,`
			`learning_method=method,`
			`total_samples=100,`
			`random_state=0,`
			`)`
			`lda_2 = LatentDirichletAllocation(`
			`n_components=n_components,`
			`max_iter=10,`
			`learning_method=method,`
			`total_samples=100,`
			`random_state=0,`
			`)`
			`lda_1.fit(X)`
			`perp_1 = lda_1.perplexity(X, sub_sampling=False)`

			`lda_2.fit(X)`
			`perp_2 = lda_2.perplexity(X, sub_sampling=False)`
			`assert perp_1 >= perp_2`

			`perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)`
			`perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)`
			`assert perp_1_subsampling >= perp_2_subsampling`


			`@pytest.mark.parametrize("method", ("online", "batch"))`
			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`def test_lda_score(method, csr_container):`
			`# Test LDA score for batch training`
			`# score should be higher after each iteration`
			`n_components, X = _build_sparse_array(csr_container)`
			`lda_1 = LatentDirichletAllocation(`
			`n_components=n_components,`
			`max_iter=1,`
			`learning_method=method,`
			`total_samples=100,`
			`random_state=0,`
			`)`
			`lda_2 = LatentDirichletAllocation(`
			`n_components=n_components,`
			`max_iter=10,`
			`learning_method=method,`
			`total_samples=100,`
			`random_state=0,`
			`)`
			`lda_1.fit_transform(X)`
			`score_1 = lda_1.score(X)`

			`lda_2.fit_transform(X)`
			`score_2 = lda_2.score(X)`
			`assert score_2 >= score_1`


			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`def test_perplexity_input_format(csr_container):`
			`# Test LDA perplexity for sparse and dense input`
			`# score should be the same for both dense and sparse input`
			`n_components, X = _build_sparse_array(csr_container)`
			`lda = LatentDirichletAllocation(`
			`n_components=n_components,`
			`max_iter=1,`
			`learning_method="batch",`
			`total_samples=100,`
			`random_state=0,`
			`)`
			`lda.fit(X)`
			`perp_1 = lda.perplexity(X)`
			`perp_2 = lda.perplexity(X.toarray())`
			`assert_almost_equal(perp_1, perp_2)`


			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`def test_lda_score_perplexity(csr_container):`
			`# Test the relationship between LDA score and perplexity`
			`n_components, X = _build_sparse_array(csr_container)`
			`lda = LatentDirichletAllocation(`
			`n_components=n_components, max_iter=10, random_state=0`
			`)`
			`lda.fit(X)`
			`perplexity_1 = lda.perplexity(X, sub_sampling=False)`

			`score = lda.score(X)`
			`perplexity_2 = np.exp(-1.0 * (score / np.sum(X.data)))`
			`assert_almost_equal(perplexity_1, perplexity_2)`


			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`def test_lda_fit_perplexity(csr_container):`
			`# Test that the perplexity computed during fit is consistent with what is`
			`# returned by the perplexity method`
			`n_components, X = _build_sparse_array(csr_container)`
			`lda = LatentDirichletAllocation(`
			`n_components=n_components,`
			`max_iter=1,`
			`learning_method="batch",`
			`random_state=0,`
			`evaluate_every=1,`
			`)`
			`lda.fit(X)`

			`# Perplexity computed at end of fit method`
			`perplexity1 = lda.bound_`

			`# Result of perplexity method on the train set`
			`perplexity2 = lda.perplexity(X)`

			`assert_almost_equal(perplexity1, perplexity2)`


			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`def test_lda_empty_docs(csr_container):`
			`"""Test LDA on empty document (all-zero rows)."""`
			`Z = np.zeros((5, 4))`
			`for X in [Z, csr_container(Z)]:`
			`lda = LatentDirichletAllocation(max_iter=750).fit(X)`
			`assert_almost_equal(`
			`lda.components_.sum(axis=0), np.ones(lda.components_.shape[1])`
			`)`


			`def test_dirichlet_expectation():`
			`"""Test Cython version of Dirichlet expectation calculation."""`
			`x = np.logspace(-100, 10, 10000)`
			`expectation = np.empty_like(x)`
			`_dirichlet_expectation_1d(x, 0, expectation)`
			`assert_allclose(expectation, np.exp(psi(x) - psi(np.sum(x))), atol=1e-19)`

			`x = x.reshape(100, 100)`
			`assert_allclose(`
			`_dirichlet_expectation_2d(x),`
			`psi(x) - psi(np.sum(x, axis=1)[:, np.newaxis]),`
			`rtol=1e-11,`
			`atol=3e-9,`
			`)`


			`def check_verbosity(`
			`verbose, evaluate_every, expected_lines, expected_perplexities, csr_container`
			`):`
			`n_components, X = _build_sparse_array(csr_container)`
			`lda = LatentDirichletAllocation(`
			`n_components=n_components,`
			`max_iter=3,`
			`learning_method="batch",`
			`verbose=verbose,`
			`evaluate_every=evaluate_every,`
			`random_state=0,`
			`)`
			`out = StringIO()`
			`old_out, sys.stdout = sys.stdout, out`
			`try:`
			`lda.fit(X)`
			`finally:`
			`sys.stdout = old_out`

			`n_lines = out.getvalue().count("\n")`
			`n_perplexity = out.getvalue().count("perplexity")`
			`assert expected_lines == n_lines`
			`assert expected_perplexities == n_perplexity`


			`@pytest.mark.parametrize(`
			`"verbose,evaluate_every,expected_lines,expected_perplexities",`
			`[`
			`(False, 1, 0, 0),`
			`(False, 0, 0, 0),`
			`(True, 0, 3, 0),`
			`(True, 1, 3, 3),`
			`(True, 2, 3, 1),`
			`],`
			`)`
			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`def test_verbosity(`
			`verbose, evaluate_every, expected_lines, expected_perplexities, csr_container`
			`):`
			`check_verbosity(`
			`verbose, evaluate_every, expected_lines, expected_perplexities, csr_container`
			`)`


			`@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)`
			`def test_lda_feature_names_out(csr_container):`
			`"""Check feature names out for LatentDirichletAllocation."""`
			`n_components, X = _build_sparse_array(csr_container)`
			`lda = LatentDirichletAllocation(n_components=n_components).fit(X)`

			`names = lda.get_feature_names_out()`
			`assert_array_equal(`
			`[f"latentdirichletallocation{i}" for i in range(n_components)], names`
			`)`


			`@pytest.mark.parametrize("learning_method", ("batch", "online"))`
			`def test_lda_dtype_match(learning_method, global_dtype):`
			`"""Check data type preservation of fitted attributes."""`
			`rng = np.random.RandomState(0)`
			`X = rng.uniform(size=(20, 10)).astype(global_dtype, copy=False)`

			`lda = LatentDirichletAllocation(`
			`n_components=5, random_state=0, learning_method=learning_method`
			`)`
			`lda.fit(X)`
			`assert lda.components_.dtype == global_dtype`
			`assert lda.exp_dirichlet_component_.dtype == global_dtype`


			`@pytest.mark.parametrize("learning_method", ("batch", "online"))`
			`def test_lda_numerical_consistency(learning_method, global_random_seed):`
			`"""Check numerical consistency between np.float32 and np.float64."""`
			`rng = np.random.RandomState(global_random_seed)`
			`X64 = rng.uniform(size=(20, 10))`
			`X32 = X64.astype(np.float32)`

			`lda_64 = LatentDirichletAllocation(`
			`n_components=5, random_state=global_random_seed, learning_method=learning_method`
			`).fit(X64)`
			`lda_32 = LatentDirichletAllocation(`
			`n_components=5, random_state=global_random_seed, learning_method=learning_method`
			`).fit(X32)`

			`assert_allclose(lda_32.components_, lda_64.components_)`
			`assert_allclose(lda_32.transform(X32), lda_64.transform(X64))`