# Natural Language Toolkit: Classifier Utility Functions # # Copyright (C) 2001-2023 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # URL: # For license information, see LICENSE.TXT """ Utility functions and classes for classifiers. """ import math # from nltk.util import Deprecated import nltk.classify.util # for accuracy & log_likelihood from nltk.util import LazyMap ###################################################################### # { Helper Functions ###################################################################### # alternative name possibility: 'map_featurefunc()'? # alternative name possibility: 'detect_features()'? # alternative name possibility: 'map_featuredetect()'? # or.. just have users use LazyMap directly? def apply_features(feature_func, toks, labeled=None): """ Use the ``LazyMap`` class to construct a lazy list-like object that is analogous to ``map(feature_func, toks)``. In particular, if ``labeled=False``, then the returned list-like object's values are equal to:: [feature_func(tok) for tok in toks] If ``labeled=True``, then the returned list-like object's values are equal to:: [(feature_func(tok), label) for (tok, label) in toks] The primary purpose of this function is to avoid the memory overhead involved in storing all the featuresets for every token in a corpus. Instead, these featuresets are constructed lazily, as-needed. The reduction in memory overhead can be especially significant when the underlying list of tokens is itself lazy (as is the case with many corpus readers). :param feature_func: The function that will be applied to each token. It should return a featureset -- i.e., a dict mapping feature names to feature values. :param toks: The list of tokens to which ``feature_func`` should be applied. If ``labeled=True``, then the list elements will be passed directly to ``feature_func()``. If ``labeled=False``, then the list elements should be tuples ``(tok,label)``, and ``tok`` will be passed to ``feature_func()``. :param labeled: If true, then ``toks`` contains labeled tokens -- i.e., tuples of the form ``(tok, label)``. (Default: auto-detect based on types.) """ if labeled is None: labeled = toks and isinstance(toks[0], (tuple, list)) if labeled: def lazy_func(labeled_token): return (feature_func(labeled_token[0]), labeled_token[1]) return LazyMap(lazy_func, toks) else: return LazyMap(feature_func, toks) def attested_labels(tokens): """ :return: A list of all labels that are attested in the given list of tokens. :rtype: list of (immutable) :param tokens: The list of classified tokens from which to extract labels. A classified token has the form ``(token, label)``. :type tokens: list """ return tuple({label for (tok, label) in tokens}) def log_likelihood(classifier, gold): results = classifier.prob_classify_many([fs for (fs, l) in gold]) ll = [pdist.prob(l) for ((fs, l), pdist) in zip(gold, results)] return math.log(sum(ll) / len(ll)) def accuracy(classifier, gold): results = classifier.classify_many([fs for (fs, l) in gold]) correct = [l == r for ((fs, l), r) in zip(gold, results)] if correct: return sum(correct) / len(correct) else: return 0 class CutoffChecker: """ A helper class that implements cutoff checks based on number of iterations and log likelihood. Accuracy cutoffs are also implemented, but they're almost never a good idea to use. """ def __init__(self, cutoffs): self.cutoffs = cutoffs.copy() if "min_ll" in cutoffs: cutoffs["min_ll"] = -abs(cutoffs["min_ll"]) if "min_lldelta" in cutoffs: cutoffs["min_lldelta"] = abs(cutoffs["min_lldelta"]) self.ll = None self.acc = None self.iter = 1 def check(self, classifier, train_toks): cutoffs = self.cutoffs self.iter += 1 if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]: return True # iteration cutoff. new_ll = nltk.classify.util.log_likelihood(classifier, train_toks) if math.isnan(new_ll): return True if "min_ll" in cutoffs or "min_lldelta" in cutoffs: if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]: return True # log likelihood cutoff if ( "min_lldelta" in cutoffs and self.ll and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"])) ): return True # log likelihood delta cutoff self.ll = new_ll if "max_acc" in cutoffs or "min_accdelta" in cutoffs: new_acc = nltk.classify.util.log_likelihood(classifier, train_toks) if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]: return True # log likelihood cutoff if ( "min_accdelta" in cutoffs and self.acc and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"])) ): return True # log likelihood delta cutoff self.acc = new_acc return False # no cutoff reached. ###################################################################### # { Demos ###################################################################### def names_demo_features(name): features = {} features["alwayson"] = True features["startswith"] = name[0].lower() features["endswith"] = name[-1].lower() for letter in "abcdefghijklmnopqrstuvwxyz": features["count(%s)" % letter] = name.lower().count(letter) features["has(%s)" % letter] = letter in name.lower() return features def binary_names_demo_features(name): features = {} features["alwayson"] = True features["startswith(vowel)"] = name[0].lower() in "aeiouy" features["endswith(vowel)"] = name[-1].lower() in "aeiouy" for letter in "abcdefghijklmnopqrstuvwxyz": features["count(%s)" % letter] = name.lower().count(letter) features["has(%s)" % letter] = letter in name.lower() features["startswith(%s)" % letter] = letter == name[0].lower() features["endswith(%s)" % letter] = letter == name[-1].lower() return features def names_demo(trainer, features=names_demo_features): import random from nltk.corpus import names # Construct a list of classified names, using the names corpus. namelist = [(name, "male") for name in names.words("male.txt")] + [ (name, "female") for name in names.words("female.txt") ] # Randomly split the names into a test & train set. random.seed(123456) random.shuffle(namelist) train = namelist[:5000] test = namelist[5000:5500] # Train up a classifier. print("Training classifier...") classifier = trainer([(features(n), g) for (n, g) in train]) # Run the classifier on the test data. print("Testing classifier...") acc = accuracy(classifier, [(features(n), g) for (n, g) in test]) print("Accuracy: %6.4f" % acc) # For classifiers that can find probabilities, show the log # likelihood and some sample probability distributions. try: test_featuresets = [features(n) for (n, g) in test] pdists = classifier.prob_classify_many(test_featuresets) ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)] print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test))) print() print("Unseen Names P(Male) P(Female)\n" + "-" * 40) for ((name, gender), pdist) in list(zip(test, pdists))[:5]: if gender == "male": fmt = " %-15s *%6.4f %6.4f" else: fmt = " %-15s %6.4f *%6.4f" print(fmt % (name, pdist.prob("male"), pdist.prob("female"))) except NotImplementedError: pass # Return the classifier return classifier def partial_names_demo(trainer, features=names_demo_features): import random from nltk.corpus import names male_names = names.words("male.txt") female_names = names.words("female.txt") random.seed(654321) random.shuffle(male_names) random.shuffle(female_names) # Create a list of male names to be used as positive-labeled examples for training positive = map(features, male_names[:2000]) # Create a list of male and female names to be used as unlabeled examples unlabeled = map(features, male_names[2000:2500] + female_names[:500]) # Create a test set with correctly-labeled male and female names test = [(name, True) for name in male_names[2500:2750]] + [ (name, False) for name in female_names[500:750] ] random.shuffle(test) # Train up a classifier. print("Training classifier...") classifier = trainer(positive, unlabeled) # Run the classifier on the test data. print("Testing classifier...") acc = accuracy(classifier, [(features(n), m) for (n, m) in test]) print("Accuracy: %6.4f" % acc) # For classifiers that can find probabilities, show the log # likelihood and some sample probability distributions. try: test_featuresets = [features(n) for (n, m) in test] pdists = classifier.prob_classify_many(test_featuresets) ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)] print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test))) print() print("Unseen Names P(Male) P(Female)\n" + "-" * 40) for ((name, is_male), pdist) in zip(test, pdists)[:5]: if is_male == True: fmt = " %-15s *%6.4f %6.4f" else: fmt = " %-15s %6.4f *%6.4f" print(fmt % (name, pdist.prob(True), pdist.prob(False))) except NotImplementedError: pass # Return the classifier return classifier _inst_cache = {} def wsd_demo(trainer, word, features, n=1000): import random from nltk.corpus import senseval # Get the instances. print("Reading data...") global _inst_cache if word not in _inst_cache: _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)] instances = _inst_cache[word][:] if n > len(instances): n = len(instances) senses = list({l for (i, l) in instances}) print(" Senses: " + " ".join(senses)) # Randomly split the names into a test & train set. print("Splitting into test & train...") random.seed(123456) random.shuffle(instances) train = instances[: int(0.8 * n)] test = instances[int(0.8 * n) : n] # Train up a classifier. print("Training classifier...") classifier = trainer([(features(i), l) for (i, l) in train]) # Run the classifier on the test data. print("Testing classifier...") acc = accuracy(classifier, [(features(i), l) for (i, l) in test]) print("Accuracy: %6.4f" % acc) # For classifiers that can find probabilities, show the log # likelihood and some sample probability distributions. try: test_featuresets = [features(i) for (i, n) in test] pdists = classifier.prob_classify_many(test_featuresets) ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)] print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test))) except NotImplementedError: pass # Return the classifier return classifier def check_megam_config(): """ Checks whether the MEGAM binary is configured. """ try: _megam_bin except NameError as e: err_msg = str( "Please configure your megam binary first, e.g.\n" ">>> nltk.config_megam('/usr/bin/local/megam')" ) raise NameError(err_msg) from e