.. Copyright (C) 2001-2023 NLTK Project
.. For license information, see LICENSE.TXT

===========
Probability
===========

>>> from nltk.test.probability_fixt import setup_module
>>> setup_module()

>>> import nltk
>>> from nltk.probability import *

FreqDist
--------

>>> text1 = ['no', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '!']
>>> text2 = ['no', 'good', 'porpoise', 'likes', 'to', 'fish', 'fish', 'anywhere', '.']

>>> fd1 = nltk.FreqDist(text1)
>>> fd1 == nltk.FreqDist(text1)
True

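``FreqDist`` supports the usual frequency accessors. A quick sanity check
(each of the nine tokens in ``text1`` occurs exactly once):

>>> fd1.N()
9
>>> fd1['fish']
1
>>> fd1.freq('fish')
0.111...
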
Note that items are sorted in order of decreasing frequency; two items of
the same frequency appear in indeterminate order. (The test below therefore
sorts the tied items within each frequency group before comparing.)

>>> import itertools
>>> both = nltk.FreqDist(text1 + text2)
>>> both_most_common = both.most_common()
>>> list(itertools.chain(*(sorted(ys) for k, ys in itertools.groupby(both_most_common, key=lambda t: t[1]))))
[('fish', 3), ('anywhere', 2), ('good', 2), ('no', 2), ('porpoise', 2), ('!', 1), ('.', 1), ('a', 1), ('goes', 1), ('likes', 1), ('to', 1), ('without', 1)]

>>> both == fd1 + nltk.FreqDist(text2)
True
>>> fd1 == nltk.FreqDist(text1) # But fd1 is unchanged
True

>>> fd2 = nltk.FreqDist(text2)
>>> fd1.update(fd2)
>>> fd1 == both
True

>>> fd1 = nltk.FreqDist(text1)
>>> fd1.update(text2)
>>> fd1 == both
True

>>> fd1 = nltk.FreqDist(text1)
>>> fd2 = nltk.FreqDist(fd1)
>>> fd2 == fd1
True

``nltk.FreqDist`` can be pickled:

>>> import pickle
>>> fd1 = nltk.FreqDist(text1)
>>> pickled = pickle.dumps(fd1)
>>> fd1 == pickle.loads(pickled)
True

Mathematical operations:

>>> FreqDist('abbb') + FreqDist('bcc')
FreqDist({'b': 4, 'c': 2, 'a': 1})
>>> FreqDist('abbbc') - FreqDist('bccd')
FreqDist({'b': 2, 'a': 1})
>>> FreqDist('abbb') | FreqDist('bcc')
FreqDist({'b': 3, 'c': 2, 'a': 1})
>>> FreqDist('abbb') & FreqDist('bcc')
FreqDist({'b': 1})

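These operators follow ``collections.Counter`` semantics (only positive
counts are kept in the result); indeed, ``FreqDist`` is a ``Counter``
subclass:

>>> from collections import Counter
>>> issubclass(FreqDist, Counter)
True
>>> Counter('abbbc') - Counter('bccd')
Counter({'b': 2, 'a': 1})
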
ConditionalFreqDist
-------------------

>>> cfd1 = ConditionalFreqDist()
>>> cfd1[1] = FreqDist('abbbb')
>>> cfd1[2] = FreqDist('xxxxyy')
>>> cfd1
<ConditionalFreqDist with 2 conditions>

>>> cfd2 = ConditionalFreqDist()
>>> cfd2[1] = FreqDist('bbccc')
>>> cfd2[2] = FreqDist('xxxyyyzz')
>>> cfd2[3] = FreqDist('m')
>>> cfd2
<ConditionalFreqDist with 3 conditions>

>>> r = cfd1 + cfd2
>>> [(i,r[i]) for i in r.conditions()]
[(1, FreqDist({'b': 6, 'c': 3, 'a': 1})), (2, FreqDist({'x': 7, 'y': 5, 'z': 2})), (3, FreqDist({'m': 1}))]

>>> r = cfd1 - cfd2
>>> [(i,r[i]) for i in r.conditions()]
[(1, FreqDist({'b': 2, 'a': 1})), (2, FreqDist({'x': 1}))]

>>> r = cfd1 | cfd2
>>> [(i,r[i]) for i in r.conditions()]
[(1, FreqDist({'b': 4, 'c': 3, 'a': 1})), (2, FreqDist({'x': 4, 'y': 3, 'z': 2})), (3, FreqDist({'m': 1}))]

>>> r = cfd1 & cfd2
>>> [(i,r[i]) for i in r.conditions()]
[(1, FreqDist({'b': 2})), (2, FreqDist({'x': 3, 'y': 2}))]

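A ``ConditionalFreqDist`` can also be built directly from (condition, sample)
pairs. For instance, conditioning the words of ``text1`` on their length:

>>> cfd = ConditionalFreqDist((len(w), w) for w in text1)
>>> sorted(cfd.conditions())
[1, 2, 4, 7, 8]
>>> cfd[4].N()
3
>>> cfd[8]['porpoise']
1
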
Testing some HMM estimators
---------------------------

We extract a small part (500 sentences) of the Brown corpus:

>>> corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:500]
>>> print(len(corpus))
500

We create an HMM trainer. Note that we need the tags and symbols
from the whole corpus, not just from the training portion:

>>> from nltk.util import unique_list
>>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
>>> print(len(tag_set))
92
>>> symbols = unique_list(word for sent in corpus for (word,tag) in sent)
>>> print(len(symbols))
1464
>>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)

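``unique_list`` deduplicates while preserving first-occurrence order, so the
tag set and symbol list keep the order in which items appear in the corpus
(a small illustrative check):

>>> unique_list(['a', 'b', 'a'])
['a', 'b']
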
We divide the corpus into 90% training and 10% testing:

>>> train_corpus = []
>>> test_corpus = []
>>> for i in range(len(corpus)):
...     if i % 10:
...         train_corpus += [corpus[i]]
...     else:
...         test_corpus += [corpus[i]]
>>> print(len(train_corpus))
450
>>> print(len(test_corpus))
50

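The same 90/10 split can be written more compactly; an equivalent sketch
(``train_alt`` and ``test_alt`` are illustrative names, not part of the
original test):

>>> train_alt = [sent for i, sent in enumerate(corpus) if i % 10]
>>> test_alt = [sent for i, sent in enumerate(corpus) if i % 10 == 0]
>>> (train_alt, test_alt) == (train_corpus, test_corpus)
True
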
And now we can test the estimators:

>>> def train_and_test(est):
...     hmm = trainer.train_supervised(train_corpus, estimator=est)
...     print('%.2f%%' % (100 * hmm.accuracy(test_corpus)))

Maximum Likelihood Estimation
-----------------------------
- this resulted in an initialization error before r7209

>>> mle = lambda fd, bins: MLEProbDist(fd)
>>> train_and_test(mle)
22.75%

Accuracy is poor here largely because MLE assigns zero probability to any
transition or emission that was not seen in training.

Laplace (= Lidstone with gamma==1)

>>> train_and_test(LaplaceProbDist)
66.04%

Expected Likelihood Estimation (= Lidstone with gamma==0.5)

>>> train_and_test(ELEProbDist)
73.01%

Lidstone Estimation, for gamma==0.1, 0.5 and 1
(the latter two should be exactly equal to the ELE and Laplace results above)

>>> def lidstone(gamma):
...     return lambda fd, bins: LidstoneProbDist(fd, gamma, bins)
>>> train_and_test(lidstone(0.1))
82.51%
>>> train_and_test(lidstone(0.5))
73.01%
>>> train_and_test(lidstone(1.0))
66.04%

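For reference, the Lidstone estimate is ``P(x) = (c(x) + gamma) / (N + gamma * B)``,
where ``c(x)`` is the count of ``x``, ``N`` the total count, and ``B`` the
number of bins. A minimal hand-rolled check (illustrative only, not part of
the original test):

>>> fd = FreqDist('abbb')
>>> gamma, B = 0.5, 3
>>> (fd['b'] + gamma) / (fd.N() + gamma * B)
0.636...
>>> LidstoneProbDist(fd, gamma, B).prob('b')
0.636...
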
Witten Bell Estimation
----------------------
- this resulted in a ZeroDivisionError before r7209

>>> train_and_test(WittenBellProbDist)
88.12%

Good Turing Estimation
----------------------

>>> gt = lambda fd, bins: SimpleGoodTuringProbDist(fd, bins=1e5)
>>> train_and_test(gt)
86.93%

Kneser Ney Estimation
---------------------
Since the Kneser-Ney distribution is best suited for trigrams, we must adjust
our testing accordingly.

>>> corpus = [[((x[0],y[0],z[0]),(x[1],y[1],z[1]))
...            for x, y, z in nltk.trigrams(sent)]
...           for sent in corpus[:100]]

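This turns each tagged sentence into a sequence of (word trigram, tag trigram)
pairs; ``nltk.trigrams`` simply slides a window of length three over a
sequence:

>>> list(nltk.trigrams(['a', 'b', 'c', 'd']))
[('a', 'b', 'c'), ('b', 'c', 'd')]
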
We will then need to redefine the rest of the training/testing variables:

>>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
>>> len(tag_set)
906

>>> symbols = unique_list(word for sent in corpus for (word,tag) in sent)
>>> len(symbols)
1341

>>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
>>> train_corpus = []
>>> test_corpus = []

>>> for i in range(len(corpus)):
...     if i % 10:
...         train_corpus += [corpus[i]]
...     else:
...         test_corpus += [corpus[i]]

>>> len(train_corpus)
90
>>> len(test_corpus)
10

>>> kn = lambda fd, bins: KneserNeyProbDist(fd)
>>> train_and_test(kn)
0.86%

(The very low score is unsurprising: the trigram transformation produces 906
distinct tag states for only 90 training sentences, so the tagger almost never
sees the correct trigram state at test time.)

Still to be added:
- Tests for HeldoutProbDist, CrossValidationProbDist and MutableProbDist

Squashed bugs
-------------

Issue 511: override pop and popitem to invalidate the cache

>>> fd = nltk.FreqDist('a')
>>> list(fd.keys())
['a']
>>> fd.pop('a')
1
>>> list(fd.keys())
[]

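``popitem`` invalidates the cache in the same way (a supplementary check,
assuming that, as for ``dict.popitem``, the single remaining item is
returned):

>>> fd = nltk.FreqDist('a')
>>> fd.popitem()
('a', 1)
>>> list(fd.keys())
[]
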
Issue 533: access cumulative frequencies with no arguments

>>> fd = nltk.FreqDist('aab')
>>> list(fd._cumulative_frequencies(['a']))
[2.0]
>>> list(fd._cumulative_frequencies(['a', 'b']))
[2.0, 3.0]

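The no-argument form named in the issue title should then cover every sample;
assuming samples are visited in insertion order ('a' before 'b'), we would
expect:

>>> list(fd._cumulative_frequencies())
[2.0, 3.0]
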
Issue 579: override clear to reset some variables

>>> fd = FreqDist('aab')
>>> fd.clear()
>>> fd.N()
0

Issue 351: fix the fileids method of CategorizedCorpusReader so that it does
not inadvertently add errant categories

>>> from nltk.corpus import brown
>>> brown.fileids('blah')
Traceback (most recent call last):
...
ValueError: Category blah not found
>>> brown.categories()
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']

Issue 175: add the unseen bin to SimpleGoodTuringProbDist by default,
otherwise any unseen events get a probability of zero, i.e.,
they don't get smoothed

>>> from nltk import SimpleGoodTuringProbDist, FreqDist
>>> fd = FreqDist({'a':1, 'b':1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4, 'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7, 'o': 8, 'p': 9, 'q': 10})
>>> p = SimpleGoodTuringProbDist(fd)
>>> p.prob('a')
0.017649766667026317...
>>> p.prob('o')
0.08433050215340411...
>>> p.prob('z')
0.022727272727272728...
>>> p.prob('foobar')
0.022727272727272728...

``MLEProbDist``, ``ConditionalProbDist``, ``DictionaryConditionalProbDist`` and
``ConditionalFreqDist`` can be pickled:

>>> import pickle
>>> pd = MLEProbDist(fd)
>>> sorted(pd.samples()) == sorted(pickle.loads(pickle.dumps(pd)).samples())
True
>>> dpd = DictionaryConditionalProbDist({'x': pd})
>>> unpickled = pickle.loads(pickle.dumps(dpd))
>>> dpd['x'].prob('a')
0.011363636...
>>> dpd['x'].prob('a') == unpickled['x'].prob('a')
True
>>> cfd = nltk.probability.ConditionalFreqDist()
>>> cfd['foo']['hello'] += 1
>>> cfd['foo']['hello'] += 1
>>> cfd['bar']['hello'] += 1
>>> cfd2 = pickle.loads(pickle.dumps(cfd))
>>> cfd2 == cfd
True
>>> cpd = ConditionalProbDist(cfd, SimpleGoodTuringProbDist)
>>> cpd2 = pickle.loads(pickle.dumps(cpd))
>>> cpd['foo'].prob('hello') == cpd2['foo'].prob('hello')
True