.. Copyright (C) 2001-2023 NLTK Project
.. For license information, see LICENSE.TXT

===========
Probability
===========

    >>> from nltk.test.probability_fixt import setup_module
    >>> setup_module()

    >>> import nltk
    >>> from nltk.probability import *

FreqDist
--------

    >>> text1 = ['no', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '!']
    >>> text2 = ['no', 'good', 'porpoise', 'likes', 'to', 'fish', 'fish', 'anywhere', '.']

    >>> fd1 = nltk.FreqDist(text1)
    >>> fd1 == nltk.FreqDist(text1)
    True

Note that items are sorted in order of decreasing frequency; two items of the same frequency appear in indeterminate order.

    >>> import itertools
    >>> both = nltk.FreqDist(text1 + text2)
    >>> both_most_common = both.most_common()
    >>> list(itertools.chain(*(sorted(ys) for k, ys in itertools.groupby(both_most_common, key=lambda t: t[1]))))
    [('fish', 3), ('anywhere', 2), ('good', 2), ('no', 2), ('porpoise', 2), ('!', 1), ('.', 1), ('a', 1), ('goes', 1), ('likes', 1), ('to', 1), ('without', 1)]

    >>> both == fd1 + nltk.FreqDist(text2)
    True
    >>> fd1 == nltk.FreqDist(text1) # But fd1 is unchanged
    True

    >>> fd2 = nltk.FreqDist(text2)
    >>> fd1.update(fd2)
    >>> fd1 == both
    True

    >>> fd1 = nltk.FreqDist(text1)
    >>> fd1.update(text2)
    >>> fd1 == both
    True

    >>> fd1 = nltk.FreqDist(text1)
    >>> fd2 = nltk.FreqDist(fd1)
    >>> fd2 == fd1
    True

``nltk.FreqDist`` can be pickled:

    >>> import pickle
    >>> fd1 = nltk.FreqDist(text1)
    >>> pickled = pickle.dumps(fd1)
    >>> fd1 == pickle.loads(pickled)
    True

Mathematical operations:

    >>> FreqDist('abbb') + FreqDist('bcc')
    FreqDist({'b': 4, 'c': 2, 'a': 1})
    >>> FreqDist('abbbc') - FreqDist('bccd')
    FreqDist({'b': 2, 'a': 1})
    >>> FreqDist('abbb') | FreqDist('bcc')
    FreqDist({'b': 3, 'c': 2, 'a': 1})
    >>> FreqDist('abbb') & FreqDist('bcc')
    FreqDist({'b': 1})

ConditionalFreqDist
-------------------

    >>> cfd1 = ConditionalFreqDist()
    >>> cfd1[1] = FreqDist('abbbb')
    >>> cfd1[2] = FreqDist('xxxxyy')
    >>> cfd1
    <ConditionalFreqDist with 2 conditions>

    >>> cfd2 = ConditionalFreqDist()
    >>> cfd2[1] = FreqDist('bbccc')
    >>> cfd2[2] = FreqDist('xxxyyyzz')
    >>> cfd2[3] = FreqDist('m')
    >>> cfd2
    <ConditionalFreqDist with 3 conditions>

    >>> r = cfd1 + cfd2
    >>> [(i,r[i]) for i in r.conditions()]
    [(1, FreqDist({'b': 6, 'c': 3, 'a': 1})), (2, FreqDist({'x': 7, 'y': 5, 'z': 2})), (3, FreqDist({'m': 1}))]

    >>> r = cfd1 - cfd2
    >>> [(i,r[i]) for i in r.conditions()]
    [(1, FreqDist({'b': 2, 'a': 1})), (2, FreqDist({'x': 1}))]

    >>> r = cfd1 | cfd2
    >>> [(i,r[i]) for i in r.conditions()]
    [(1, FreqDist({'b': 4, 'c': 3, 'a': 1})), (2, FreqDist({'x': 4, 'y': 3, 'z': 2})), (3, FreqDist({'m': 1}))]

    >>> r = cfd1 & cfd2
    >>> [(i,r[i]) for i in r.conditions()]
    [(1, FreqDist({'b': 2})), (2, FreqDist({'x': 3, 'y': 2}))]

Testing some HMM estimators
---------------------------

We extract a small part (500 sentences) of the Brown corpus:

    >>> corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:500]
    >>> print(len(corpus))
    500

We create an HMM trainer - note that we need the tags and symbols from the whole corpus, not just from the training corpus:

    >>> from nltk.util import unique_list
    >>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
    >>> print(len(tag_set))
    92
    >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent)
    >>> print(len(symbols))
    1464
    >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)

We divide the corpus into 90% training and 10% testing:

    >>> train_corpus = []
    >>> test_corpus = []
    >>> for i in range(len(corpus)):
    ...     if i % 10:
    ...         train_corpus += [corpus[i]]
    ...     else:
    ...         test_corpus += [corpus[i]]
    >>> print(len(train_corpus))
    450
    >>> print(len(test_corpus))
    50
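The same 90/10 split can also be written with list comprehensions. This is just an equivalent sketch for illustration (``train_alt`` and ``test_alt`` are names introduced here, not part of the original tests):

    >>> train_alt = [sent for i, sent in enumerate(corpus) if i % 10]
    >>> test_alt = [sent for i, sent in enumerate(corpus) if i % 10 == 0]
    >>> train_alt == train_corpus and test_alt == test_corpus
    True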
And now we can test the estimators:

    >>> def train_and_test(est):
    ...     hmm = trainer.train_supervised(train_corpus, estimator=est)
    ...     print('%.2f%%' % (100 * hmm.accuracy(test_corpus)))

Maximum Likelihood Estimation
-----------------------------

- This resulted in an initialization error before r7209.

    >>> mle = lambda fd, bins: MLEProbDist(fd)
    >>> train_and_test(mle)
    22.75%

Laplace (= Lidstone with gamma==1):

    >>> train_and_test(LaplaceProbDist)
    66.04%

Expected Likelihood Estimation (= Lidstone with gamma==0.5):

    >>> train_and_test(ELEProbDist)
    73.01%

Lidstone Estimation, for gamma==0.1, 0.5 and 1
(the latter two should be exactly equal to ELE and Laplace above):

    >>> def lidstone(gamma):
    ...     return lambda fd, bins: LidstoneProbDist(fd, gamma, bins)
    >>> train_and_test(lidstone(0.1))
    82.51%
    >>> train_and_test(lidstone(0.5))
    73.01%
    >>> train_and_test(lidstone(1.0))
    66.04%

Witten Bell Estimation
----------------------

- This resulted in a ZeroDivisionError before r7209.

    >>> train_and_test(WittenBellProbDist)
    88.12%

Good Turing Estimation
----------------------

    >>> gt = lambda fd, bins: SimpleGoodTuringProbDist(fd, bins=1e5)
    >>> train_and_test(gt)
    86.93%

Kneser Ney Estimation
---------------------

Since the Kneser-Ney distribution is best suited for trigrams, we must adjust our testing accordingly.

    >>> corpus = [[((x[0],y[0],z[0]),(x[1],y[1],z[1]))
    ...     for x, y, z in nltk.trigrams(sent)]
    ...     for sent in corpus[:100]]

We will then need to redefine the rest of the training/testing variables:

    >>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
    >>> len(tag_set)
    906
    >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent)
    >>> len(symbols)
    1341

    >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
    >>> train_corpus = []
    >>> test_corpus = []
    >>> for i in range(len(corpus)):
    ...     if i % 10:
    ...         train_corpus += [corpus[i]]
    ...     else:
    ...         test_corpus += [corpus[i]]

    >>> len(train_corpus)
    90
    >>> len(test_corpus)
    10

    >>> kn = lambda fd, bins: KneserNeyProbDist(fd)
    >>> train_and_test(kn)
    0.86%

Remains to be added:

- Tests for HeldoutProbDist, CrossValidationProbDist and MutableProbDist

Squashed bugs
-------------

Issue 511: override ``pop`` and ``popitem`` to invalidate the cache:

    >>> fd = nltk.FreqDist('a')
    >>> list(fd.keys())
    ['a']
    >>> fd.pop('a')
    1
    >>> list(fd.keys())
    []

Issue 533: access cumulative frequencies with no arguments:

    >>> fd = nltk.FreqDist('aab')
    >>> list(fd._cumulative_frequencies(['a']))
    [2.0]
    >>> list(fd._cumulative_frequencies(['a', 'b']))
    [2.0, 3.0]

Issue 579: override ``clear`` to reset some variables:

    >>> fd = FreqDist('aab')
    >>> fd.clear()
    >>> fd.N()
    0
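As an extra illustration of the cache invalidation behind Issues 511 and 579 (a small sketch added here, not one of the original regression tests), the cached total count should be recomputed after the distribution is cleared and refilled:

    >>> fd = FreqDist('aab')
    >>> fd.N()
    3
    >>> fd.clear()
    >>> fd.update('wxyz')
    >>> fd.N()
    4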
Issue 351: fix the ``fileids`` method of ``CategorizedCorpusReader`` so that it no longer inadvertently adds errant categories:

    >>> from nltk.corpus import brown
    >>> brown.fileids('blah')
    Traceback (most recent call last):
      ...
    ValueError: Category blah not found

    >>> brown.categories()
    ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']

Issue 175: add the unseen bin to ``SimpleGoodTuringProbDist`` by default; otherwise any unseen events get a probability of zero, i.e., they don't get smoothed:

    >>> from nltk import SimpleGoodTuringProbDist, FreqDist
    >>> fd = FreqDist({'a':1, 'b':1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4, 'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7, 'o': 8, 'p': 9, 'q': 10})
    >>> p = SimpleGoodTuringProbDist(fd)
    >>> p.prob('a')
    0.017649766667026317...
    >>> p.prob('o')
    0.08433050215340411...
    >>> p.prob('z')
    0.022727272727272728...
    >>> p.prob('foobar')
    0.022727272727272728...

``MLEProbDist``, ``ConditionalProbDist``, ``DictionaryConditionalProbDist`` and ``ConditionalFreqDist`` can be pickled:

    >>> import pickle
    >>> pd = MLEProbDist(fd)
    >>> sorted(pd.samples()) == sorted(pickle.loads(pickle.dumps(pd)).samples())
    True
    >>> dpd = DictionaryConditionalProbDist({'x': pd})
    >>> unpickled = pickle.loads(pickle.dumps(dpd))
    >>> dpd['x'].prob('a')
    0.011363636...
    >>> dpd['x'].prob('a') == unpickled['x'].prob('a')
    True
    >>> cfd = nltk.probability.ConditionalFreqDist()
    >>> cfd['foo']['hello'] += 1
    >>> cfd['foo']['hello'] += 1
    >>> cfd['bar']['hello'] += 1
    >>> cfd2 = pickle.loads(pickle.dumps(cfd))
    >>> cfd2 == cfd
    True
    >>> cpd = ConditionalProbDist(cfd, SimpleGoodTuringProbDist)
    >>> cpd2 = pickle.loads(pickle.dumps(cpd))
    >>> cpd['foo'].prob('hello') == cpd2['foo'].prob('hello')
    True
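The same round trip should also hold with a different probability-distribution factory; here is a minimal extra sketch using ``MLEProbDist`` (``cpd_mle`` is a name introduced for illustration, not part of the original test suite). Since ``cfd['foo']`` contains only the sample ``'hello'``, its maximum-likelihood probability is 1.0 before and after pickling:

    >>> cpd_mle = ConditionalProbDist(cfd, MLEProbDist)
    >>> cpd_mle['foo'].prob('hello')
    1.0
    >>> pickle.loads(pickle.dumps(cpd_mle))['foo'].prob('hello')
    1.0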