""" Test the graphical_lasso module. """ import sys from io import StringIO import numpy as np import pytest from numpy.testing import assert_allclose from scipy import linalg from sklearn import datasets from sklearn.covariance import ( GraphicalLasso, GraphicalLassoCV, empirical_covariance, graphical_lasso, ) from sklearn.datasets import make_sparse_spd_matrix from sklearn.utils import check_random_state from sklearn.utils._testing import ( _convert_container, assert_array_almost_equal, assert_array_less, ) def test_graphical_lassos(random_state=1): """Test the graphical lasso solvers. This checks is unstable for some random seeds where the covariance found with "cd" and "lars" solvers are different (4 cases / 100 tries). """ # Sample data from a sparse multivariate normal dim = 20 n_samples = 100 random_state = check_random_state(random_state) prec = make_sparse_spd_matrix(dim, alpha=0.95, random_state=random_state) cov = linalg.inv(prec) X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples) emp_cov = empirical_covariance(X) for alpha in (0.0, 0.1, 0.25): covs = dict() icovs = dict() for method in ("cd", "lars"): cov_, icov_, costs = graphical_lasso( emp_cov, return_costs=True, alpha=alpha, mode=method ) covs[method] = cov_ icovs[method] = icov_ costs, dual_gap = np.array(costs).T # Check that the costs always decrease (doesn't hold if alpha == 0) if not alpha == 0: # use 1e-12 since the cost can be exactly 0 assert_array_less(np.diff(costs), 1e-12) # Check that the 2 approaches give similar results assert_allclose(covs["cd"], covs["lars"], atol=5e-4) assert_allclose(icovs["cd"], icovs["lars"], atol=5e-4) # Smoke test the estimator model = GraphicalLasso(alpha=0.25).fit(X) model.score(X) assert_array_almost_equal(model.covariance_, covs["cd"], decimal=4) assert_array_almost_equal(model.covariance_, covs["lars"], decimal=4) # For a centered matrix, assume_centered could be chosen True or False # Check that this returns indeed the same result for centered data Z = X - X.mean(0) precs = list() for assume_centered in (False, True): prec_ = GraphicalLasso(assume_centered=assume_centered).fit(Z).precision_ precs.append(prec_) assert_array_almost_equal(precs[0], precs[1]) def test_graphical_lasso_when_alpha_equals_0(): """Test graphical_lasso's early return condition when alpha=0.""" X = np.random.randn(100, 10) emp_cov = empirical_covariance(X, assume_centered=True) model = GraphicalLasso(alpha=0, covariance="precomputed").fit(emp_cov) assert_allclose(model.precision_, np.linalg.inv(emp_cov)) _, precision = graphical_lasso(emp_cov, alpha=0) assert_allclose(precision, np.linalg.inv(emp_cov)) @pytest.mark.parametrize("mode", ["cd", "lars"]) def test_graphical_lasso_n_iter(mode): X, _ = datasets.make_classification(n_samples=5_000, n_features=20, random_state=0) emp_cov = empirical_covariance(X) _, _, n_iter = graphical_lasso( emp_cov, 0.2, mode=mode, max_iter=2, return_n_iter=True ) assert n_iter == 2 def test_graphical_lasso_iris(): # Hard-coded solution from R glasso package for alpha=1.0 # (need to set penalize.diagonal to FALSE) cov_R = np.array( [ [0.68112222, 0.0000000, 0.265820, 0.02464314], [0.00000000, 0.1887129, 0.000000, 0.00000000], [0.26582000, 0.0000000, 3.095503, 0.28697200], [0.02464314, 0.0000000, 0.286972, 0.57713289], ] ) icov_R = np.array( [ [1.5190747, 0.000000, -0.1304475, 0.0000000], [0.0000000, 5.299055, 0.0000000, 0.0000000], [-0.1304475, 0.000000, 0.3498624, -0.1683946], [0.0000000, 0.000000, -0.1683946, 1.8164353], ] ) X = datasets.load_iris().data emp_cov = empirical_covariance(X) for method in ("cd", "lars"): cov, icov = graphical_lasso(emp_cov, alpha=1.0, return_costs=False, mode=method) assert_array_almost_equal(cov, cov_R) assert_array_almost_equal(icov, icov_R) def test_graph_lasso_2D(): # Hard-coded solution from Python skggm package # obtained by calling `quic(emp_cov, lam=.1, tol=1e-8)` cov_skggm = np.array([[3.09550269, 1.186972], [1.186972, 0.57713289]]) icov_skggm = np.array([[1.52836773, -3.14334831], [-3.14334831, 8.19753385]]) X = datasets.load_iris().data[:, 2:] emp_cov = empirical_covariance(X) for method in ("cd", "lars"): cov, icov = graphical_lasso(emp_cov, alpha=0.1, return_costs=False, mode=method) assert_array_almost_equal(cov, cov_skggm) assert_array_almost_equal(icov, icov_skggm) def test_graphical_lasso_iris_singular(): # Small subset of rows to test the rank-deficient case # Need to choose samples such that none of the variances are zero indices = np.arange(10, 13) # Hard-coded solution from R glasso package for alpha=0.01 cov_R = np.array( [ [0.08, 0.056666662595, 0.00229729713223, 0.00153153142149], [0.056666662595, 0.082222222222, 0.00333333333333, 0.00222222222222], [0.002297297132, 0.003333333333, 0.00666666666667, 0.00009009009009], [0.001531531421, 0.002222222222, 0.00009009009009, 0.00222222222222], ] ) icov_R = np.array( [ [24.42244057, -16.831679593, 0.0, 0.0], [-16.83168201, 24.351841681, -6.206896552, -12.5], [0.0, -6.206896171, 153.103448276, 0.0], [0.0, -12.499999143, 0.0, 462.5], ] ) X = datasets.load_iris().data[indices, :] emp_cov = empirical_covariance(X) for method in ("cd", "lars"): cov, icov = graphical_lasso( emp_cov, alpha=0.01, return_costs=False, mode=method ) assert_array_almost_equal(cov, cov_R, decimal=5) assert_array_almost_equal(icov, icov_R, decimal=5) def test_graphical_lasso_cv(random_state=1): # Sample data from a sparse multivariate normal dim = 5 n_samples = 6 random_state = check_random_state(random_state) prec = make_sparse_spd_matrix(dim, alpha=0.96, random_state=random_state) cov = linalg.inv(prec) X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples) # Capture stdout, to smoke test the verbose mode orig_stdout = sys.stdout try: sys.stdout = StringIO() # We need verbose very high so that Parallel prints on stdout GraphicalLassoCV(verbose=100, alphas=5, tol=1e-1).fit(X) finally: sys.stdout = orig_stdout @pytest.mark.parametrize("alphas_container_type", ["list", "tuple", "array"]) def test_graphical_lasso_cv_alphas_iterable(alphas_container_type): """Check that we can pass an array-like to `alphas`. Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/22489 """ true_cov = np.array( [ [0.8, 0.0, 0.2, 0.0], [0.0, 0.4, 0.0, 0.0], [0.2, 0.0, 0.3, 0.1], [0.0, 0.0, 0.1, 0.7], ] ) rng = np.random.RandomState(0) X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200) alphas = _convert_container([0.02, 0.03], alphas_container_type) GraphicalLassoCV(alphas=alphas, tol=1e-1, n_jobs=1).fit(X) @pytest.mark.parametrize( "alphas,err_type,err_msg", [ ([-0.02, 0.03], ValueError, "must be > 0"), ([0, 0.03], ValueError, "must be > 0"), (["not_number", 0.03], TypeError, "must be an instance of float"), ], ) def test_graphical_lasso_cv_alphas_invalid_array(alphas, err_type, err_msg): """Check that if an array-like containing a value outside of (0, inf] is passed to `alphas`, a ValueError is raised. Check if a string is passed, a TypeError is raised. """ true_cov = np.array( [ [0.8, 0.0, 0.2, 0.0], [0.0, 0.4, 0.0, 0.0], [0.2, 0.0, 0.3, 0.1], [0.0, 0.0, 0.1, 0.7], ] ) rng = np.random.RandomState(0) X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200) with pytest.raises(err_type, match=err_msg): GraphicalLassoCV(alphas=alphas, tol=1e-1, n_jobs=1).fit(X) def test_graphical_lasso_cv_scores(): splits = 4 n_alphas = 5 n_refinements = 3 true_cov = np.array( [ [0.8, 0.0, 0.2, 0.0], [0.0, 0.4, 0.0, 0.0], [0.2, 0.0, 0.3, 0.1], [0.0, 0.0, 0.1, 0.7], ] ) rng = np.random.RandomState(0) X = rng.multivariate_normal(mean=[0, 0, 0, 0], cov=true_cov, size=200) cov = GraphicalLassoCV(cv=splits, alphas=n_alphas, n_refinements=n_refinements).fit( X ) cv_results = cov.cv_results_ # alpha and one for each split total_alphas = n_refinements * n_alphas + 1 keys = ["alphas"] split_keys = [f"split{i}_test_score" for i in range(splits)] for key in keys + split_keys: assert key in cv_results assert len(cv_results[key]) == total_alphas cv_scores = np.asarray([cov.cv_results_[key] for key in split_keys]) expected_mean = cv_scores.mean(axis=0) expected_std = cv_scores.std(axis=0) assert_allclose(cov.cv_results_["mean_test_score"], expected_mean) assert_allclose(cov.cv_results_["std_test_score"], expected_std) # TODO(1.5): remove in 1.5 def test_graphical_lasso_cov_init_deprecation(): """Check that we raise a deprecation warning if providing `cov_init` in `graphical_lasso`.""" rng, dim, n_samples = np.random.RandomState(0), 20, 100 prec = make_sparse_spd_matrix(dim, alpha=0.95, random_state=0) cov = linalg.inv(prec) X = rng.multivariate_normal(np.zeros(dim), cov, size=n_samples) emp_cov = empirical_covariance(X) with pytest.warns(FutureWarning, match="cov_init parameter is deprecated"): graphical_lasso(emp_cov, alpha=0.1, cov_init=emp_cov)