ai-content-maker/.venv/Lib/site-packages/nltk/test/unit/test_collocations.py

121 lines
3.5 KiB
Python

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
## Test bigram counters with discontinuous bigrams and repeated words
# Largest absolute difference tolerated between two association scores.
_EPSILON = 1e-8
# Fixture sentence: each of the four words appears twice in a row.
SENT = "this this is is a a test test".split()


def close_enough(x, y):
    """Verify that two sequences of n-gram association values are within
    _EPSILON of each other.

    Each element of ``x`` and ``y`` is indexed at position 1 to obtain its
    score (the tests in this file pass ``(ngram, score)`` pairs); the
    element at position 0 is not compared.

    :param x: iterable of scored n-grams.
    :param y: iterable of scored n-grams to compare against ``x``.
    :return: True when both inputs have the same number of elements and
        every pair of corresponding scores differs by at most _EPSILON;
        False otherwise.
    """
    # Materialize so that arbitrary iterables can be measured with len().
    x, y = list(x), list(y)
    # zip() stops at the shorter input, so without this check a missing or
    # extra score would go unnoticed and the comparison would pass vacuously.
    if len(x) != len(y):
        return False
    return all(abs(x1[1] - y1[1]) <= _EPSILON for x1, y1 in zip(x, y))
def test_bigram2():
    """Check counts and PMI scores for a finder built from SENT without
    passing ``window_size``.
    """
    finder = BigramCollocationFinder.from_words(SENT)

    # Expected bigram frequencies, already in sorted order.
    expected_bigram_counts = [
        (("a", "a"), 1),
        (("a", "test"), 1),
        (("is", "a"), 1),
        (("is", "is"), 1),
        (("test", "test"), 1),
        (("this", "is"), 1),
        (("this", "this"), 1),
    ]
    assert sorted(finder.ngram_fd.items()) == expected_bigram_counts

    expected_word_counts = [("a", 2), ("is", 2), ("test", 2), ("this", 2)]
    assert sorted(finder.word_fd.items()) == expected_word_counts

    # Word total matches the sentence length; bigram total is one less.
    word_total = sum(finder.word_fd.values())
    bigram_total = sum(finder.ngram_fd.values())
    assert len(SENT) == word_total == bigram_total + 1

    # Every expected bigram is expected to score a PMI of exactly 1.0.
    expected_pmi = [(bigram, 1.0) for bigram, _count in expected_bigram_counts]
    actual_pmi = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
    assert close_enough(actual_pmi, expected_pmi)
def test_bigram3():
    """Check counts and PMI scores for a finder built from SENT with
    ``window_size=3``.
    """
    finder = BigramCollocationFinder.from_words(SENT, window_size=3)

    expected_bigram_counts = [
        (("a", "test"), 3),
        (("is", "a"), 3),
        (("this", "is"), 3),
        (("a", "a"), 1),
        (("is", "is"), 1),
        (("test", "test"), 1),
        (("this", "this"), 1),
    ]
    assert sorted(finder.ngram_fd.items()) == sorted(expected_bigram_counts)

    expected_word_counts = [("a", 2), ("is", 2), ("test", 2), ("this", 2)]
    assert sorted(finder.word_fd.items()) == sorted(expected_word_counts)

    # Identity tying the bigram total back to the sentence length for this
    # window size.
    word_total = sum(finder.word_fd.values())
    bigram_total = sum(finder.ngram_fd.values())
    assert len(SENT) == word_total == (bigram_total + 2 + 1) / 2.0

    expected_pmi = [
        (("a", "test"), 1.584962500721156),
        (("is", "a"), 1.584962500721156),
        (("this", "is"), 1.584962500721156),
        (("a", "a"), 0.0),
        (("is", "is"), 0.0),
        (("test", "test"), 0.0),
        (("this", "this"), 0.0),
    ]
    actual_pmi = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
    assert close_enough(actual_pmi, sorted(expected_pmi))
def test_bigram5():
    """Check counts and PMI scores for a finder built from SENT with
    ``window_size=5``.
    """
    finder = BigramCollocationFinder.from_words(SENT, window_size=5)

    expected_bigram_counts = [
        (("a", "test"), 4),
        (("is", "a"), 4),
        (("this", "is"), 4),
        (("is", "test"), 3),
        (("this", "a"), 3),
        (("a", "a"), 1),
        (("is", "is"), 1),
        (("test", "test"), 1),
        (("this", "this"), 1),
    ]
    assert sorted(finder.ngram_fd.items()) == sorted(expected_bigram_counts)

    expected_word_counts = [("a", 2), ("is", 2), ("test", 2), ("this", 2)]
    assert sorted(finder.word_fd.items()) == sorted(expected_word_counts)

    # Identity tying the bigram total back to the sentence length for this
    # window size.
    word_total = sum(finder.word_fd.values())
    bigram_total = sum(finder.ngram_fd.values())
    windowed_total = (bigram_total + 4 + 3 + 2 + 1) / 4.0
    assert len(SENT) == word_total == windowed_total

    expected_pmi = [
        (("a", "test"), 1.0),
        (("is", "a"), 1.0),
        (("this", "is"), 1.0),
        (("is", "test"), 0.5849625007211562),
        (("this", "a"), 0.5849625007211562),
        (("a", "a"), -1.0),
        (("is", "is"), -1.0),
        (("test", "test"), -1.0),
        (("this", "this"), -1.0),
    ]
    actual_pmi = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
    assert close_enough(actual_pmi, sorted(expected_pmi))