# Natural Language Toolkit: Language Model Unit Tests # # Copyright (C) 2001-2023 NLTK Project # Author: Ilia Kurenkov # URL: # For license information, see LICENSE.TXT import math from operator import itemgetter import pytest from nltk.lm import ( MLE, AbsoluteDiscountingInterpolated, KneserNeyInterpolated, Laplace, Lidstone, StupidBackoff, Vocabulary, WittenBellInterpolated, ) from nltk.lm.preprocessing import padded_everygrams @pytest.fixture(scope="session") def vocabulary(): return Vocabulary(["a", "b", "c", "d", "z", "", ""], unk_cutoff=1) @pytest.fixture(scope="session") def training_data(): return [["a", "b", "c", "d"], ["e", "g", "a", "d", "b", "e"]] @pytest.fixture(scope="session") def bigram_training_data(training_data): return [list(padded_everygrams(2, sent)) for sent in training_data] @pytest.fixture(scope="session") def trigram_training_data(training_data): return [list(padded_everygrams(3, sent)) for sent in training_data] @pytest.fixture def mle_bigram_model(vocabulary, bigram_training_data): model = MLE(2, vocabulary=vocabulary) model.fit(bigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ ("d", ["c"], 1), # Unseen ngrams should yield 0 ("d", ["e"], 0), # Unigrams should also be 0 ("z", None, 0), # N unigrams = 14 # count('a') = 2 ("a", None, 2.0 / 14), # count('y') = 3 ("y", None, 3.0 / 14), ], ) def test_mle_bigram_scores(mle_bigram_model, word, context, expected_score): assert pytest.approx(mle_bigram_model.score(word, context), 1e-4) == expected_score def test_mle_bigram_logscore_for_zero_score(mle_bigram_model): assert math.isinf(mle_bigram_model.logscore("d", ["e"])) def test_mle_bigram_entropy_perplexity_seen(mle_bigram_model): # ngrams seen during training trained = [ ("", "a"), ("a", "b"), ("b", ""), ("", "a"), ("a", "d"), ("d", ""), ] # Ngram = Log score # , a = -1 # a, b = -1 # b, UNK = -1 # UNK, a = -1.585 # a, d = -1 # d, = -1 # TOTAL logscores = -6.585 # - AVG logscores = 1.0975 H = 1.0975 perplexity = 2.1398 assert pytest.approx(mle_bigram_model.entropy(trained), 1e-4) == H assert pytest.approx(mle_bigram_model.perplexity(trained), 1e-4) == perplexity def test_mle_bigram_entropy_perplexity_unseen(mle_bigram_model): # In MLE, even one unseen ngram should make entropy and perplexity infinite untrained = [("", "a"), ("a", "c"), ("c", "d"), ("d", "")] assert math.isinf(mle_bigram_model.entropy(untrained)) assert math.isinf(mle_bigram_model.perplexity(untrained)) def test_mle_bigram_entropy_perplexity_unigrams(mle_bigram_model): # word = score, log score # = 0.1429, -2.8074 # a = 0.1429, -2.8074 # c = 0.0714, -3.8073 # UNK = 0.2143, -2.2224 # d = 0.1429, -2.8074 # c = 0.0714, -3.8073 # = 0.1429, -2.8074 # TOTAL logscores = -21.6243 # - AVG logscores = 3.0095 H = 3.0095 perplexity = 8.0529 text = [("",), ("a",), ("c",), ("-",), ("d",), ("c",), ("",)] assert pytest.approx(mle_bigram_model.entropy(text), 1e-4) == H assert pytest.approx(mle_bigram_model.perplexity(text), 1e-4) == perplexity @pytest.fixture def mle_trigram_model(trigram_training_data, vocabulary): model = MLE(order=3, vocabulary=vocabulary) model.fit(trigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # count(d | b, c) = 1 # count(b, c) = 1 ("d", ("b", "c"), 1), # count(d | c) = 1 # count(c) = 1 ("d", ["c"], 1), # total number of tokens is 18, of which "a" occurred 2 times ("a", None, 2.0 / 18), # in vocabulary but unseen ("z", None, 0), # out of vocabulary should use "UNK" score ("y", None, 3.0 / 18), ], ) def test_mle_trigram_scores(mle_trigram_model, word, context, expected_score): assert pytest.approx(mle_trigram_model.score(word, context), 1e-4) == expected_score @pytest.fixture def lidstone_bigram_model(bigram_training_data, vocabulary): model = Lidstone(0.1, order=2, vocabulary=vocabulary) model.fit(bigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # count(d | c) = 1 # *count(d | c) = 1.1 # Count(w | c for w in vocab) = 1 # *Count(w | c for w in vocab) = 1.8 ("d", ["c"], 1.1 / 1.8), # Total unigrams: 14 # Vocab size: 8 # Denominator: 14 + 0.8 = 14.8 # count("a") = 2 # *count("a") = 2.1 ("a", None, 2.1 / 14.8), # in vocabulary but unseen # count("z") = 0 # *count("z") = 0.1 ("z", None, 0.1 / 14.8), # out of vocabulary should use "UNK" score # count("") = 3 # *count("") = 3.1 ("y", None, 3.1 / 14.8), ], ) def test_lidstone_bigram_score(lidstone_bigram_model, word, context, expected_score): assert ( pytest.approx(lidstone_bigram_model.score(word, context), 1e-4) == expected_score ) def test_lidstone_entropy_perplexity(lidstone_bigram_model): text = [ ("", "a"), ("a", "c"), ("c", ""), ("", "d"), ("d", "c"), ("c", ""), ] # Unlike MLE this should be able to handle completely novel ngrams # Ngram = score, log score # , a = 0.3929, -1.3479 # a, c = 0.0357, -4.8074 # c, UNK = 0.0(5), -4.1699 # UNK, d = 0.0263, -5.2479 # d, c = 0.0357, -4.8074 # c, = 0.0(5), -4.1699 # TOTAL logscore: −24.5504 # - AVG logscore: 4.0917 H = 4.0917 perplexity = 17.0504 assert pytest.approx(lidstone_bigram_model.entropy(text), 1e-4) == H assert pytest.approx(lidstone_bigram_model.perplexity(text), 1e-4) == perplexity @pytest.fixture def lidstone_trigram_model(trigram_training_data, vocabulary): model = Lidstone(0.1, order=3, vocabulary=vocabulary) model.fit(trigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # Logic behind this is the same as for bigram model ("d", ["c"], 1.1 / 1.8), # if we choose a word that hasn't appeared after (b, c) ("e", ["c"], 0.1 / 1.8), # Trigram score now ("d", ["b", "c"], 1.1 / 1.8), ("e", ["b", "c"], 0.1 / 1.8), ], ) def test_lidstone_trigram_score(lidstone_trigram_model, word, context, expected_score): assert ( pytest.approx(lidstone_trigram_model.score(word, context), 1e-4) == expected_score ) @pytest.fixture def laplace_bigram_model(bigram_training_data, vocabulary): model = Laplace(2, vocabulary=vocabulary) model.fit(bigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # basic sanity-check: # count(d | c) = 1 # *count(d | c) = 2 # Count(w | c for w in vocab) = 1 # *Count(w | c for w in vocab) = 9 ("d", ["c"], 2.0 / 9), # Total unigrams: 14 # Vocab size: 8 # Denominator: 14 + 8 = 22 # count("a") = 2 # *count("a") = 3 ("a", None, 3.0 / 22), # in vocabulary but unseen # count("z") = 0 # *count("z") = 1 ("z", None, 1.0 / 22), # out of vocabulary should use "UNK" score # count("") = 3 # *count("") = 4 ("y", None, 4.0 / 22), ], ) def test_laplace_bigram_score(laplace_bigram_model, word, context, expected_score): assert ( pytest.approx(laplace_bigram_model.score(word, context), 1e-4) == expected_score ) def test_laplace_bigram_entropy_perplexity(laplace_bigram_model): text = [ ("", "a"), ("a", "c"), ("c", ""), ("", "d"), ("d", "c"), ("c", ""), ] # Unlike MLE this should be able to handle completely novel ngrams # Ngram = score, log score # , a = 0.2, -2.3219 # a, c = 0.1, -3.3219 # c, UNK = 0.(1), -3.1699 # UNK, d = 0.(09), 3.4594 # d, c = 0.1 -3.3219 # c, = 0.(1), -3.1699 # Total logscores: −18.7651 # - AVG logscores: 3.1275 H = 3.1275 perplexity = 8.7393 assert pytest.approx(laplace_bigram_model.entropy(text), 1e-4) == H assert pytest.approx(laplace_bigram_model.perplexity(text), 1e-4) == perplexity def test_laplace_gamma(laplace_bigram_model): assert laplace_bigram_model.gamma == 1 @pytest.fixture def wittenbell_trigram_model(trigram_training_data, vocabulary): model = WittenBellInterpolated(3, vocabulary=vocabulary) model.fit(trigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # For unigram scores by default revert to regular MLE # Total unigrams: 18 # Vocab Size = 7 # count('c'): 1 ("c", None, 1.0 / 18), # in vocabulary but unseen # count("z") = 0 ("z", None, 0 / 18), # out of vocabulary should use "UNK" score # count("") = 3 ("y", None, 3.0 / 18), # 2 words follow b and b occurred a total of 2 times # gamma(['b']) = 2 / (2 + 2) = 0.5 # mle.score('c', ['b']) = 0.5 # mle('c') = 1 / 18 = 0.055 # (1 - gamma) * mle + gamma * mle('c') ~= 0.27 + 0.055 ("c", ["b"], (1 - 0.5) * 0.5 + 0.5 * 1 / 18), # building on that, let's try 'a b c' as the trigram # 1 word follows 'a b' and 'a b' occurred 1 time # gamma(['a', 'b']) = 1 / (1 + 1) = 0.5 # mle("c", ["a", "b"]) = 1 ("c", ["a", "b"], (1 - 0.5) + 0.5 * ((1 - 0.5) * 0.5 + 0.5 * 1 / 18)), # P(c|zb) # The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332. ("c", ["z", "b"], ((1 - 0.5) * 0.5 + 0.5 * 1 / 18)), ], ) def test_wittenbell_trigram_score( wittenbell_trigram_model, word, context, expected_score ): assert ( pytest.approx(wittenbell_trigram_model.score(word, context), 1e-4) == expected_score ) ############################################################################### # Notation Explained # ############################################################################### # For all subsequent calculations we use the following notation: # 1. '*': Placeholder for any word/character. E.g. '*b' stands for # all bigrams that end in 'b'. '*b*' stands for all trigrams that # contain 'b' in the middle. # 1. count(ngram): Count all instances (tokens) of an ngram. # 1. unique(ngram): Count unique instances (types) of an ngram. @pytest.fixture def kneserney_trigram_model(trigram_training_data, vocabulary): model = KneserNeyInterpolated(order=3, discount=0.75, vocabulary=vocabulary) model.fit(trigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # P(c) = count('*c') / unique('**') # = 1 / 14 ("c", None, 1.0 / 14), # P(z) = count('*z') / unique('**') # = 0 / 14 # 'z' is in the vocabulary, but it was not seen during training. ("z", None, 0.0 / 14), # P(y) # Out of vocabulary should use "UNK" score. # P(y) = P(UNK) = count('*UNK') / unique('**') ("y", None, 3 / 14), # We start with P(c|b) # P(c|b) = alpha('bc') + gamma('b') * P(c) # alpha('bc') = max(unique('*bc') - discount, 0) / unique('*b*') # = max(1 - 0.75, 0) / 2 # = 0.125 # gamma('b') = discount * unique('b*') / unique('*b*') # = (0.75 * 2) / 2 # = 0.75 ("c", ["b"], (0.125 + 0.75 * (1 / 14))), # Building on that, let's try P(c|ab). # P(c|ab) = alpha('abc') + gamma('ab') * P(c|b) # alpha('abc') = max(count('abc') - discount, 0) / count('ab*') # = max(1 - 0.75, 0) / 1 # = 0.25 # gamma('ab') = (discount * unique('ab*')) / count('ab*') # = 0.75 * 1 / 1 ("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (1 / 14))), # P(c|zb) # The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332. ("c", ["z", "b"], (0.125 + 0.75 * (1 / 14))), ], ) def test_kneserney_trigram_score( kneserney_trigram_model, word, context, expected_score ): assert ( pytest.approx(kneserney_trigram_model.score(word, context), 1e-4) == expected_score ) @pytest.fixture def absolute_discounting_trigram_model(trigram_training_data, vocabulary): model = AbsoluteDiscountingInterpolated(order=3, vocabulary=vocabulary) model.fit(trigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # For unigram scores revert to uniform # P(c) = count('c') / count('**') ("c", None, 1.0 / 18), # in vocabulary but unseen # count('z') = 0 ("z", None, 0.0 / 18), # out of vocabulary should use "UNK" score # count('') = 3 ("y", None, 3 / 18), # P(c|b) = alpha('bc') + gamma('b') * P(c) # alpha('bc') = max(count('bc') - discount, 0) / count('b*') # = max(1 - 0.75, 0) / 2 # = 0.125 # gamma('b') = discount * unique('b*') / count('b*') # = (0.75 * 2) / 2 # = 0.75 ("c", ["b"], (0.125 + 0.75 * (2 / 2) * (1 / 18))), # Building on that, let's try P(c|ab). # P(c|ab) = alpha('abc') + gamma('ab') * P(c|b) # alpha('abc') = max(count('abc') - discount, 0) / count('ab*') # = max(1 - 0.75, 0) / 1 # = 0.25 # gamma('ab') = (discount * unique('ab*')) / count('ab*') # = 0.75 * 1 / 1 ("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (2 / 2) * (1 / 18))), # P(c|zb) # The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332. ("c", ["z", "b"], (0.125 + 0.75 * (2 / 2) * (1 / 18))), ], ) def test_absolute_discounting_trigram_score( absolute_discounting_trigram_model, word, context, expected_score ): assert ( pytest.approx(absolute_discounting_trigram_model.score(word, context), 1e-4) == expected_score ) @pytest.fixture def stupid_backoff_trigram_model(trigram_training_data, vocabulary): model = StupidBackoff(order=3, vocabulary=vocabulary) model.fit(trigram_training_data) return model @pytest.mark.parametrize( "word, context, expected_score", [ # For unigram scores revert to uniform # total bigrams = 18 ("c", None, 1.0 / 18), # in vocabulary but unseen # bigrams ending with z = 0 ("z", None, 0.0 / 18), # out of vocabulary should use "UNK" score # count(''): 3 ("y", None, 3 / 18), # c follows 1 time out of 2 after b ("c", ["b"], 1 / 2), # c always follows ab ("c", ["a", "b"], 1 / 1), # The ngram 'z b c' was not seen, so we backoff to # the score of the ngram 'b c' * smoothing factor ("c", ["z", "b"], (0.4 * (1 / 2))), ], ) def test_stupid_backoff_trigram_score( stupid_backoff_trigram_model, word, context, expected_score ): assert ( pytest.approx(stupid_backoff_trigram_model.score(word, context), 1e-4) == expected_score ) ############################################################################### # Probability Distributions Should Sum up to Unity # ############################################################################### @pytest.fixture(scope="session") def kneserney_bigram_model(bigram_training_data, vocabulary): model = KneserNeyInterpolated(order=2, vocabulary=vocabulary) model.fit(bigram_training_data) return model @pytest.mark.parametrize( "model_fixture", [ "mle_bigram_model", "mle_trigram_model", "lidstone_bigram_model", "laplace_bigram_model", "wittenbell_trigram_model", "absolute_discounting_trigram_model", "kneserney_bigram_model", pytest.param( "stupid_backoff_trigram_model", marks=pytest.mark.xfail( reason="Stupid Backoff is not a valid distribution" ), ), ], ) @pytest.mark.parametrize( "context", [("a",), ("c",), ("",), ("b",), ("",), ("d",), ("e",), ("r",), ("w",)], ids=itemgetter(0), ) def test_sums_to_1(model_fixture, context, request): model = request.getfixturevalue(model_fixture) scores_for_context = sum(model.score(w, context) for w in model.vocab) assert pytest.approx(scores_for_context, 1e-7) == 1.0 ############################################################################### # Generating Text # ############################################################################### def test_generate_one_no_context(mle_trigram_model): assert mle_trigram_model.generate(random_seed=3) == "" def test_generate_one_from_limiting_context(mle_trigram_model): # We don't need random_seed for contexts with only one continuation assert mle_trigram_model.generate(text_seed=["c"]) == "d" assert mle_trigram_model.generate(text_seed=["b", "c"]) == "d" assert mle_trigram_model.generate(text_seed=["a", "c"]) == "d" def test_generate_one_from_varied_context(mle_trigram_model): # When context doesn't limit our options enough, seed the random choice assert mle_trigram_model.generate(text_seed=("a", ""), random_seed=2) == "a" def test_generate_cycle(mle_trigram_model): # Add a cycle to the model: bd -> b, db -> d more_training_text = [padded_everygrams(mle_trigram_model.order, list("bdbdbd"))] mle_trigram_model.fit(more_training_text) # Test that we can escape the cycle assert mle_trigram_model.generate(7, text_seed=("b", "d"), random_seed=5) == [ "b", "d", "b", "d", "b", "d", "", ] def test_generate_with_text_seed(mle_trigram_model): assert mle_trigram_model.generate(5, text_seed=("", "e"), random_seed=3) == [ "", "a", "d", "b", "", ] def test_generate_oov_text_seed(mle_trigram_model): assert mle_trigram_model.generate( text_seed=("aliens",), random_seed=3 ) == mle_trigram_model.generate(text_seed=("",), random_seed=3) def test_generate_None_text_seed(mle_trigram_model): # should crash with type error when we try to look it up in vocabulary with pytest.raises(TypeError): mle_trigram_model.generate(text_seed=(None,)) # This will work assert mle_trigram_model.generate( text_seed=None, random_seed=3 ) == mle_trigram_model.generate(random_seed=3)