ai-content-maker/.venv/Lib/site-packages/nltk/test/unit/test_tgrep.py

781 lines
30 KiB
Python
Raw Normal View History

2024-05-03 04:18:51 +03:00
#!/usr/bin/env python
#
# Natural Language Toolkit: TGrep search
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Will Roberts <wildwilhelm@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Unit tests for nltk.tgrep.
"""
import unittest
from nltk import tgrep
from nltk.tree import ParentedTree
class TestSequenceFunctions(unittest.TestCase):
    """
    Unit tests for nltk.tgrep: tokenization of search patterns and
    searching of ParentedTree objects.
    """

    @staticmethod
    def _positions(pattern, tree, *extra):
        """
        Search a single tree and return the fully evaluated result.

        ``tree`` is wrapped in a one-element list, any ``extra``
        positional arguments are forwarded unchanged to
        ``tgrep.tgrep_positions``, and everything it produces is
        collected into a list so it can be compared with assertEqual.
        """
        return list(tgrep.tgrep_positions(pattern, [tree], *extra))

    def test_tokenize_simple(self):
        """
        Simple test of tokenization.
        """
        tokens = tgrep.tgrep_tokenize("A .. (B !< C . D) | ![<< (E , F) $ G]")
        # The expected tokens contain no whitespace, so they are spelled
        # as one space-separated string for readability.
        expected = "A .. ( B ! < C . D ) | ! [ << ( E , F ) $ G ]".split()
        self.assertEqual(tokens, expected)

    def test_tokenize_encoding(self):
        """
        Test that tokenization handles bytes and strs the same way.
        """
        self.assertEqual(
            tgrep.tgrep_tokenize(b"A .. (B !< C . D) | ![<< (E , F) $ G]"),
            tgrep.tgrep_tokenize("A .. (B !< C . D) | ![<< (E , F) $ G]"),
        )

    def test_tokenize_link_types(self):
        """
        Test tokenization of basic link types, plain and negated.
        """
        operators = (
            "< > <3 >3 <, >, <-3 >-3 <- >- <' >' <: >: "
            "<< >> <<, >>, <<' >>' <<: >>: "
            ". , .. ,, $ $. $, $.. $,,"
        ).split()
        for op in operators:
            # subTest keeps going after a failure, so one run reports
            # every operator that is tokenized incorrectly.
            with self.subTest(operator=op, negated=False):
                self.assertEqual(tgrep.tgrep_tokenize(f"A{op}B"), ["A", op, "B"])
            with self.subTest(operator=op, negated=True):
                self.assertEqual(
                    tgrep.tgrep_tokenize(f"A!{op}B"), ["A", "!", op, "B"]
                )

    def test_tokenize_examples(self):
        """
        Test tokenization of the TGrep2 manual example patterns.
        """
        # (pattern, expected tokens written as a space-separated string)
        cases = [
            ("NP < PP", "NP < PP"),
            ("/^NP/", "/^NP/"),
            ("NP << PP . VP", "NP << PP . VP"),
            ("NP << PP | . VP", "NP << PP | . VP"),
            ("NP !<< PP [> NP | >> VP]", "NP ! << PP [ > NP | >> VP ]"),
            ("NP << (PP . VP)", "NP << ( PP . VP )"),
            ("NP <' (PP <, (IN < on))", "NP <' ( PP <, ( IN < on ) )"),
            ("S < (A < B) < C", "S < ( A < B ) < C"),
            ("S < ((A < B) < C)", "S < ( ( A < B ) < C )"),
            ("S < (A < B < C)", "S < ( A < B < C )"),
            ("A<B&.C", "A < B & . C"),
        ]
        for pattern, expected in cases:
            with self.subTest(pattern=pattern):
                self.assertEqual(tgrep.tgrep_tokenize(pattern), expected.split())

    def test_tokenize_quoting(self):
        """
        Test tokenization of quoting.
        """
        self.assertEqual(
            tgrep.tgrep_tokenize('"A<<:B"<<:"A $.. B"<"A>3B"<C'),
            ['"A<<:B"', "<<:", '"A $.. B"', "<", '"A>3B"', "<", "C"],
        )

    def test_tokenize_nodenames(self):
        """
        Test tokenization of node names.
        """
        self.assertEqual(tgrep.tgrep_tokenize("Robert"), ["Robert"])
        self.assertEqual(tgrep.tgrep_tokenize("/^[Bb]ob/"), ["/^[Bb]ob/"])
        self.assertEqual(tgrep.tgrep_tokenize("*"), ["*"])
        self.assertEqual(tgrep.tgrep_tokenize("__"), ["__"])
        # test tokenization of NLTK tree position syntax
        self.assertEqual(tgrep.tgrep_tokenize("N()"), ["N(", ")"])
        self.assertEqual(tgrep.tgrep_tokenize("N(0,)"), ["N(", "0", ",", ")"])
        self.assertEqual(tgrep.tgrep_tokenize("N(0,0)"), ["N(", "0", ",", "0", ")"])
        self.assertEqual(
            tgrep.tgrep_tokenize("N(0,0,)"), ["N(", "0", ",", "0", ",", ")"]
        )

    def test_tokenize_macros(self):
        """
        Test tokenization of macro definitions.
        """
        tokens = tgrep.tgrep_tokenize(
            "@ NP /^NP/;\n@ NN /^NN/;\n@NP [!< NP | < @NN] !$.. @NN"
        )
        expected = "@ NP /^NP/ ; @ NN /^NN/ ; @NP [ ! < NP | < @NN ] ! $.. @NN"
        self.assertEqual(tokens, expected.split())

    def test_node_simple(self):
        """
        Test a simple use of tgrep for finding nodes matching a given
        pattern.
        """
        tree = ParentedTree.fromstring(
            "(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))"
        )
        self.assertEqual(self._positions("NN", tree), [[(0, 2), (2, 1)]])
        self.assertEqual(
            list(tgrep.tgrep_nodes("NN", [tree])), [[tree[0, 2], tree[2, 1]]]
        )
        self.assertEqual(self._positions("NN|JJ", tree), [[(0, 1), (0, 2), (2, 1)]])

    def test_node_printing(self):
        """Test that the tgrep print operator ' is properly ignored."""
        tree = ParentedTree.fromstring("(S (n x) (N x))")
        self.assertEqual(self._positions("N", tree), self._positions("'N", tree))
        self.assertEqual(
            self._positions("/[Nn]/", tree), self._positions("'/[Nn]/", tree)
        )

    def test_node_encoding(self):
        """
        Test that tgrep search strings handles bytes and strs the same
        way.
        """
        tree = ParentedTree.fromstring(
            "(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))"
        )
        # Each comparison pairs a bytes pattern with the equivalent str
        # pattern; comparing bytes against bytes would prove nothing.
        self.assertEqual(self._positions(b"NN", tree), self._positions("NN", tree))
        self.assertEqual(
            list(tgrep.tgrep_nodes(b"NN", [tree])),
            list(tgrep.tgrep_nodes("NN", [tree])),
        )
        self.assertEqual(
            self._positions(b"NN|JJ", tree), self._positions("NN|JJ", tree)
        )

    def test_node_nocase(self):
        """
        Test selecting nodes using case insensitive node names.
        """
        tree = ParentedTree.fromstring("(S (n x) (N x))")
        self.assertEqual(self._positions('"N"', tree), [[(1,)]])
        self.assertEqual(self._positions('i@"N"', tree), [[(0,), (1,)]])

    def test_node_quoted(self):
        """
        Test selecting nodes using quoted node names.
        """
        tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
        self.assertEqual(self._positions('"N"', tree), [[()]])
        self.assertEqual(self._positions('"\\"N\\""', tree), [[(0,)]])
        self.assertEqual(self._positions('"N\\""', tree), [[(1,)]])
        self.assertEqual(self._positions('"\\"\\\\\\""', tree), [[(2,)]])

    def test_node_regex(self):
        """
        Test regex matching on nodes.
        """
        tree = ParentedTree.fromstring("(S (NP-SBJ x) (NP x) (NNP x) (VP x))")
        # This is a regular expression that matches any node whose
        # name starts with NP, including NP-SBJ:
        self.assertEqual(self._positions("/^NP/", tree), [[(0,), (1,)]])

    def test_node_regex_2(self):
        """
        Test regex matching on nodes.
        """
        tree = ParentedTree.fromstring("(S (SBJ x) (SBJ1 x) (NP-SBJ x))")
        self.assertEqual(self._positions("/^SBJ/", tree), [[(0,), (1,)]])
        # This is a regular expression that matches any node whose
        # name includes SBJ, including NP-SBJ:
        self.assertEqual(self._positions("/SBJ/", tree), [[(0,), (1,), (2,)]])

    def test_node_tree_position(self):
        """
        Test matching on nodes based on NLTK tree position.
        """
        tree = ParentedTree.fromstring("(S (NP-SBJ x) (NP x) (NNP x) (VP x))")
        # test all tree positions that are not leaves
        leaf_positions = {
            tree.leaf_treeposition(index) for index in range(len(tree.leaves()))
        }
        inner_positions = [
            pos for pos in tree.treepositions() if pos not in leaf_positions
        ]
        for position in inner_positions:
            with self.subTest(position=position):
                # f"N{position}" renders the tuple in the N(...) syntax.
                matches = self._positions(f"N{position}", tree)
                # Exactly one node may match, and it must be this one.
                self.assertEqual(len(matches[0]), 1)
                self.assertEqual(matches[0][0], position)

    def test_node_noleaves(self):
        """
        Test node name matching with the search_leaves flag set to False.
        """
        tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
        self.assertEqual(self._positions("x", tree), [[(0, 0, 0), (1, 0, 0)]])
        self.assertEqual(self._positions("x", tree, False), [[]])

    def tests_rel_dominance(self):
        """
        Test matching nodes based on dominance relations.
        """
        tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
        self.assertEqual(self._positions("* < T", tree), [[(0,)]])
        self.assertEqual(self._positions("* < T > S", tree), [[(0,)]])
        self.assertEqual(
            self._positions("* !< T", tree),
            [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]],
        )
        self.assertEqual(self._positions("* !< T > S", tree), [[(1,)]])
        self.assertEqual(self._positions("* > A", tree), [[(0, 0)]])
        self.assertEqual(self._positions("* > B", tree), [[(1, 0)]])
        self.assertEqual(
            self._positions("* !> B", tree),
            [[(), (0,), (0, 0), (0, 0, 0), (1,), (1, 0, 0)]],
        )
        self.assertEqual(
            self._positions("* !> B >> S", tree), [[(0,), (0, 0), (1,)]]
        )
        self.assertEqual(
            self._positions("* >> S", tree), [[(0,), (0, 0), (1,), (1, 0)]]
        )
        self.assertEqual(self._positions("* >>, S", tree), [[(0,), (0, 0)]])
        self.assertEqual(self._positions("* >>' S", tree), [[(1,), (1, 0)]])
        # Known issue:
        # self.assertEqual(list(tgrep.tgrep_positions('* !>> S', [tree])),
        #                  [[()]])
        self.assertEqual(self._positions("* << T", tree), [[(), (0,)]])
        self.assertEqual(self._positions("* <<' T", tree), [[(0,)]])
        self.assertEqual(self._positions("* <<1 N", tree), [[(1,)]])
        self.assertEqual(
            self._positions("* !<< T", tree),
            [[(0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]],
        )

        tree = ParentedTree.fromstring("(S (A (T x)) (B (T x) (N x )))")
        self.assertEqual(self._positions("* <: T", tree), [[(0,)]])
        self.assertEqual(self._positions("* < T", tree), [[(0,), (1,)]])
        self.assertEqual(
            self._positions("* !<: T", tree),
            [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0)]],
        )
        self.assertEqual(self._positions("* !<: T > S", tree), [[(1,)]])

        tree = ParentedTree.fromstring("(S (T (A x) (B x)) (T (C x)))")
        self.assertEqual(self._positions("* >: T", tree), [[(1, 0)]])
        self.assertEqual(
            self._positions("* !>: T", tree),
            [[(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0, 0)]],
        )

        tree = ParentedTree.fromstring(
            "(S (A (B (C (D (E (T x))))))" " (A (B (C (D (E (T x))) (N x)))))"
        )
        self.assertEqual(
            self._positions("* <<: T", tree),
            [
                [
                    (0,),
                    (0, 0),
                    (0, 0, 0),
                    (0, 0, 0, 0),
                    (0, 0, 0, 0, 0),
                    (1, 0, 0, 0),
                    (1, 0, 0, 0, 0),
                ]
            ],
        )
        self.assertEqual(
            self._positions("* >>: A", tree),
            [
                [
                    (0, 0),
                    (0, 0, 0),
                    (0, 0, 0, 0),
                    (0, 0, 0, 0, 0),
                    (0, 0, 0, 0, 0, 0),
                    (1, 0),
                    (1, 0, 0),
                ]
            ],
        )

    def test_bad_operator(self):
        """
        Test error handling of undefined tgrep operators.
        """
        tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
        # The context-manager form catches the exception whether it is
        # raised when the search is set up or only when it is consumed.
        with self.assertRaises(tgrep.TgrepException):
            self._positions("* >>> S", tree)

    def test_comments(self):
        """
        Test that comments are correctly filtered out of tgrep search
        strings.
        """
        tree = ParentedTree.fromstring("(S (NN x) (NP x) (NN x))")
        search1 = (
            "\n"
            "@ NP /^NP/;\n"
            "@ NN /^NN/;\n"
            "@NN\n"
        )
        self.assertEqual(self._positions(search1, tree), [[(0,), (2,)]])
        # Same search with comment lines mixed in; the result must not change.
        search2 = (
            "\n"
            "# macros\n"
            "@ NP /^NP/;\n"
            "@ NN /^NN/;\n"
            "# search string\n"
            "@NN\n"
        )
        self.assertEqual(self._positions(search2, tree), [[(0,), (2,)]])

    def test_rel_sister_nodes(self):
        """
        Test matching sister nodes in a tree.
        """
        tree = ParentedTree.fromstring("(S (A x) (B x) (C x))")
        cases = [
            ("* $. B", [(0,)]),
            ("* $.. B", [(0,)]),
            ("* $, B", [(2,)]),
            ("* $,, B", [(2,)]),
            ("* $ B", [(0,), (2,)]),
        ]
        for pattern, expected in cases:
            with self.subTest(pattern=pattern):
                self.assertEqual(self._positions(pattern, tree), [expected])

    def tests_rel_indexed_children(self):
        """
        Test matching nodes based on their index in their parent node.
        """
        tree = ParentedTree.fromstring("(S (A x) (B x) (C x))")
        child_cases = [
            ("* >, S", (0,)),
            ("* >1 S", (0,)),
            ("* >2 S", (1,)),
            ("* >3 S", (2,)),
            ("* >' S", (2,)),
            ("* >-1 S", (2,)),
            ("* >-2 S", (1,)),
            ("* >-3 S", (0,)),
        ]
        for pattern, expected in child_cases:
            with self.subTest(pattern=pattern):
                self.assertEqual(self._positions(pattern, tree), [[expected]])

        tree = ParentedTree.fromstring(
            "(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) " "(F (C x) (A x) (B x)))"
        )
        parent_cases = [
            ("* <, A", (0,)),
            ("* <1 A", (0,)),
            ("* <2 A", (2,)),
            ("* <3 A", (1,)),
            ("* <' A", (1,)),
            ("* <-1 A", (1,)),
            ("* <-2 A", (2,)),
            ("* <-3 A", (0,)),
        ]
        for pattern, expected in parent_cases:
            with self.subTest(pattern=pattern):
                self.assertEqual(self._positions(pattern, tree), [[expected]])

    def test_rel_precedence(self):
        """
        Test matching nodes based on precedence relations.
        """
        tree = ParentedTree.fromstring(
            "(S (NP (NP (PP x)) (NP (AP x)))"
            " (VP (AP (X (PP x)) (Y (AP x))))"
            " (NP (RC (NP (AP x)))))"
        )
        self.assertEqual(
            self._positions("* . X", tree), [[(0,), (0, 1), (0, 1, 0)]]
        )
        self.assertEqual(
            self._positions("* . Y", tree), [[(1, 0, 0), (1, 0, 0, 0)]]
        )
        self.assertEqual(
            self._positions("* .. X", tree),
            [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]],
        )
        self.assertEqual(
            self._positions("* .. Y", tree),
            [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1, 0, 0), (1, 0, 0, 0)]],
        )
        self.assertEqual(
            self._positions("* , X", tree), [[(1, 0, 1), (1, 0, 1, 0)]]
        )
        self.assertEqual(
            self._positions("* , Y", tree),
            [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
        )
        self.assertEqual(
            self._positions("* ,, X", tree),
            [[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
        )
        self.assertEqual(
            self._positions("* ,, Y", tree),
            [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]],
        )

    def test_examples(self):
        """
        Test the Basic Examples from the TGrep2 manual.
        """
        tree = ParentedTree.fromstring("(S (NP (AP x)) (NP (PP x)))")
        # This matches any NP node that immediately dominates a PP:
        self.assertEqual(self._positions("NP < PP", tree), [[(1,)]])

        tree = ParentedTree.fromstring("(S (NP x) (VP x) (NP (PP x)) (VP x))")
        # This matches an NP that dominates a PP and is immediately
        # followed by a VP:
        self.assertEqual(self._positions("NP << PP . VP", tree), [[(2,)]])

        tree = ParentedTree.fromstring(
            "(S (NP (AP x)) (NP (PP x)) " "(NP (DET x) (NN x)) (VP x))"
        )
        # This matches an NP that dominates a PP or is immediately
        # followed by a VP:
        self.assertEqual(self._positions("NP << PP | . VP", tree), [[(1,), (2,)]])

        tree = ParentedTree.fromstring(
            "(S (NP (NP (PP x)) (NP (AP x)))"
            " (VP (AP (NP (PP x)) (NP (AP x))))"
            " (NP (RC (NP (AP x)))))"
        )
        # This matches an NP that does not dominate a PP. Also, the NP
        # must either have a parent that is an NP or be dominated by a
        # VP:
        self.assertEqual(
            self._positions("NP !<< PP [> NP | >> VP]", tree),
            [[(0, 1), (1, 0, 1)]],
        )

        tree = ParentedTree.fromstring(
            "(S (NP (AP (PP x) (VP x))) " "(NP (AP (PP x) (NP x))) (NP x))"
        )
        # This matches an NP that dominates a PP which itself is
        # immediately followed by a VP. Note the use of parentheses to
        # group ". VP" with the PP rather than with the NP:
        self.assertEqual(self._positions("NP << (PP . VP)", tree), [[(0,)]])

        tree = ParentedTree.fromstring(
            "(S (NP (DET a) (NN cat) (PP (IN on) (NP x)))"
            " (NP (DET a) (NN cat) (PP (IN on) (NP x)) (PP x))"
            " (NP x))"
        )
        # This matches an NP whose last child is a PP that begins with
        # the preposition "on":
        self.assertEqual(
            self._positions("NP <' (PP <, (IN < on))", tree), [[(0,)]]
        )

        tree = ParentedTree.fromstring(
            "(S (S (C x) (A (B x))) (S (C x) (A x)) " "(S (D x) (A (B x))))"
        )
        # The following pattern matches an S which has a child A and
        # another child that is a C and that the A has a child B:
        self.assertEqual(self._positions("S < (A < B) < C", tree), [[(0,)]])

        tree = ParentedTree.fromstring(
            "(S (S (A (B x) (C x))) (S (S (C x) (A (B x)))))"
        )
        # However, this pattern means that S has child A and that A
        # has children B and C:
        self.assertEqual(self._positions("S < ((A < B) < C)", tree), [[(0,)]])
        # It is equivalent to this:
        self.assertEqual(self._positions("S < (A < B < C)", tree), [[(0,)]])

    def test_use_macros(self):
        """
        Test defining and using tgrep2 macros.
        """
        tree = ParentedTree.fromstring(
            "(VP (VB sold) (NP (DET the) "
            "(NN heiress)) (NP (NN deed) (PREP to) "
            "(NP (DET the) (NN school) (NN house))))"
        )
        self.assertEqual(
            self._positions(
                "@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN", tree
            ),
            [[(1,), (2, 2)]],
        )
        # use undefined macro @CNP
        with self.assertRaises(tgrep.TgrepException):
            self._positions(
                "@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN", tree
            )

    def test_tokenize_node_labels(self):
        """Test tokenization of labeled nodes."""
        self.assertEqual(
            tgrep.tgrep_tokenize("S < @SBJ < (@VP < (@VB $.. @OBJ))"),
            "S < @SBJ < ( @VP < ( @VB $.. @OBJ ) )".split(),
        )
        self.assertEqual(
            tgrep.tgrep_tokenize("S < @SBJ=s < (@VP=v < (@VB $.. @OBJ))"),
            "S < @SBJ = s < ( @VP = v < ( @VB $.. @OBJ ) )".split(),
        )

    def test_tokenize_segmented_patterns(self):
        """Test tokenization of segmented patterns."""
        self.assertEqual(
            tgrep.tgrep_tokenize("S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v"),
            "S < @SBJ = s < ( @VP = v < ( @VB $.. @OBJ ) ) : =s .. =v".split(),
        )

    def test_labeled_nodes(self):
        """
        Test labeled nodes.

        Test case from Emily M. Bender.
        """
        # The macro prelude is kept in its own string so that each search
        # below is built from it explicitly, instead of being carved out
        # of a larger literal by splitting on a blank line.
        macros = (
            "\n"
            "# macros\n"
            "@ SBJ /SBJ/;\n"
            "@ VP /VP/;\n"
            "@ VB /VB/;\n"
            "@ VPoB /V[PB]/;\n"
            "@ OBJ /OBJ/;\n"
        )
        search = (
            macros
            + "# 1 svo\n"
            + "S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v"
        )
        # Same structural pattern without the labels or the ": =s .. =v"
        # segment.
        search_firsthalf = macros + "S < @SBJ < (@VP < (@VB $.. @OBJ))"
        search_rewrite = "S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))"

        sent1 = ParentedTree.fromstring(
            "(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))"
        )
        sent2 = ParentedTree.fromstring(
            "(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))"
        )

        # sent1: every form of the search finds a match.
        self.assertTrue(self._positions(search_firsthalf, sent1)[0])
        self.assertTrue(self._positions(search, sent1)[0])
        self.assertTrue(self._positions(search_rewrite, sent1)[0])
        self.assertEqual(
            self._positions(search, sent1), self._positions(search_rewrite, sent1)
        )

        # sent2: only the search without the ordering segment matches.
        self.assertTrue(self._positions(search_firsthalf, sent2)[0])
        self.assertFalse(self._positions(search, sent2)[0])
        self.assertFalse(self._positions(search_rewrite, sent2)[0])
        self.assertEqual(
            self._positions(search, sent2), self._positions(search_rewrite, sent2)
        )

    def test_multiple_conjs(self):
        """
        Test that multiple (3 or more) conjunctions of node relations are
        handled properly.
        """
        sent = ParentedTree.fromstring("((A (B b) (C c)) (A (B b) (C c) (D d)))")
        self.assertEqual(self._positions("(A < B < C < D)", sent), [[(1,)]])
        self.assertEqual(self._positions("(A < B < C)", sent), [[(0,), (1,)]])

    def test_trailing_semicolon(self):
        """
        Test that semicolons at the end of a tgrep2 search string won't
        cause a parse failure.
        """
        tree = ParentedTree.fromstring(
            "(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))"
        )
        for pattern in ("NN", "NN;", "NN;;"):
            with self.subTest(pattern=pattern):
                self.assertEqual(self._positions(pattern, tree), [[(0, 2), (2, 1)]])