import unittest from nltk import RegexpParser class TestChunkRule(unittest.TestCase): def test_tag_pattern2re_pattern_quantifier(self): """Test for bug https://github.com/nltk/nltk/issues/1597 Ensures that curly bracket quantifiers can be used inside a chunk rule. This type of quantifier has been used for the supplementary example in https://www.nltk.org/book/ch07.html#exploring-text-corpora. """ sent = [ ("The", "AT"), ("September-October", "NP"), ("term", "NN"), ("jury", "NN"), ("had", "HVD"), ("been", "BEN"), ("charged", "VBN"), ("by", "IN"), ("Fulton", "NP-TL"), ("Superior", "JJ-TL"), ("Court", "NN-TL"), ("Judge", "NN-TL"), ("Durwood", "NP"), ("Pye", "NP"), ("to", "TO"), ("investigate", "VB"), ("reports", "NNS"), ("of", "IN"), ("possible", "JJ"), ("``", "``"), ("irregularities", "NNS"), ("''", "''"), ("in", "IN"), ("the", "AT"), ("hard-fought", "JJ"), ("primary", "NN"), ("which", "WDT"), ("was", "BEDZ"), ("won", "VBN"), ("by", "IN"), ("Mayor-nominate", "NN-TL"), ("Ivan", "NP"), ("Allen", "NP"), ("Jr.", "NP"), (".", "."), ] # source: brown corpus cp = RegexpParser("CHUNK: {{4,}}") tree = cp.parse(sent) assert ( tree.pformat() == """(S The/AT September-October/NP term/NN jury/NN had/HVD been/BEN charged/VBN by/IN Fulton/NP-TL Superior/JJ-TL (CHUNK Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP) to/TO investigate/VB reports/NNS of/IN possible/JJ ``/`` irregularities/NNS ''/'' in/IN the/AT hard-fought/JJ primary/NN which/WDT was/BEDZ won/VBN by/IN (CHUNK Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP) ./.)""" )