ai-content-maker/.venv/Lib/site-packages/spacy/tests/pipeline/test_attributeruler.py

295 lines
9.3 KiB
Python
Raw Normal View History

2024-05-03 04:18:51 +03:00
import numpy
import pytest
from spacy import registry, util
from spacy.lang.en import English
from spacy.pipeline import AttributeRuler
from spacy.tokens import Doc
from spacy.training import Example
from ..util import make_tempdir
@pytest.fixture
def nlp():
return English()
@pytest.fixture
def pattern_dicts():
return [
{
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
},
# one pattern sets the lemma
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
# another pattern sets the morphology
{
"patterns": [[{"ORTH": "test"}]],
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
"index": 0,
},
]
@pytest.fixture
def tag_map():
return {
".": {"POS": "PUNCT", "PunctType": "peri"},
",": {"POS": "PUNCT", "PunctType": "comm"},
}
@pytest.fixture
def morph_rules():
return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}}
def check_tag_map(ruler):
doc = Doc(
ruler.vocab,
words=["This", "is", "a", "test", "."],
tags=["DT", "VBZ", "DT", "NN", "."],
)
doc = ruler(doc)
for i in range(len(doc)):
if i == 4:
assert doc[i].pos_ == "PUNCT"
assert str(doc[i].morph) == "PunctType=peri"
else:
assert doc[i].pos_ == ""
assert str(doc[i].morph) == ""
def check_morph_rules(ruler):
doc = Doc(
ruler.vocab,
words=["This", "is", "the", "test", "."],
tags=["DT", "VBZ", "DT", "NN", "."],
)
doc = ruler(doc)
for i in range(len(doc)):
if i != 2:
assert doc[i].pos_ == ""
assert str(doc[i].morph) == ""
else:
assert doc[2].pos_ == "DET"
assert doc[2].lemma_ == "a"
assert str(doc[2].morph) == "Case=Nom"
def test_attributeruler_init(nlp, pattern_dicts):
a = nlp.add_pipe("attribute_ruler")
for p in pattern_dicts:
a.add(**p)
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")
def test_attributeruler_init_patterns(nlp, pattern_dicts):
# initialize with patterns
ruler = nlp.add_pipe("attribute_ruler")
ruler.initialize(lambda: [], patterns=pattern_dicts)
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")
nlp.remove_pipe("attribute_ruler")
# initialize with patterns from misc registry
@registry.misc("attribute_ruler_patterns")
def attribute_ruler_patterns():
return [
{
"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
"attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
},
# one pattern sets the lemma
{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},
# another pattern sets the morphology
{
"patterns": [[{"ORTH": "test"}]],
"attrs": {"MORPH": "Case=Nom|Number=Sing"},
"index": 0,
},
]
nlp.config["initialize"]["components"]["attribute_ruler"] = {
"patterns": {"@misc": "attribute_ruler_patterns"}
}
nlp.add_pipe("attribute_ruler")
nlp.initialize()
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
assert doc.has_annotation("LEMMA")
assert doc.has_annotation("MORPH")
def test_attributeruler_init_clear(nlp, pattern_dicts):
"""Test that initialization clears patterns."""
ruler = nlp.add_pipe("attribute_ruler")
assert not len(ruler.matcher)
ruler.add_patterns(pattern_dicts)
assert len(ruler.matcher)
ruler.initialize(lambda: [])
assert not len(ruler.matcher)
def test_attributeruler_score(nlp, pattern_dicts):
# initialize with patterns
ruler = nlp.add_pipe("attribute_ruler")
ruler.initialize(lambda: [], patterns=pattern_dicts)
doc = nlp("This is a test.")
assert doc[2].lemma_ == "the"
assert str(doc[2].morph) == "Case=Nom|Number=Plur"
assert doc[3].lemma_ == "cat"
assert str(doc[3].morph) == "Case=Nom|Number=Sing"
doc = nlp.make_doc("This is a test.")
dev_examples = [Example.from_dict(doc, {"lemmas": ["this", "is", "a", "cat", "."]})]
scores = nlp.evaluate(dev_examples)
# "cat" is the only correct lemma
assert scores["lemma_acc"] == pytest.approx(0.2)
# no morphs are set
assert scores["morph_acc"] is None
nlp.remove_pipe("attribute_ruler")
# test with custom scorer
@registry.misc("weird_scorer.v1")
def make_weird_scorer():
def weird_scorer(examples, weird_score, **kwargs):
return {"weird_score": weird_score}
return weird_scorer
ruler = nlp.add_pipe(
"attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}}
)
ruler.initialize(lambda: [], patterns=pattern_dicts)
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
assert scores["weird_score"] == 0.12345
assert "token_acc" in scores
assert "lemma_acc" not in scores
scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
assert scores["weird_score"] == 0.23456
def test_attributeruler_rule_order(nlp):
a = AttributeRuler(nlp.vocab)
patterns = [
{"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "VERB"}},
{"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "NOUN"}},
]
a.add_patterns(patterns)
doc = Doc(
nlp.vocab,
words=["This", "is", "a", "test", "."],
tags=["DT", "VBZ", "DT", "NN", "."],
)
doc = a(doc)
assert doc[1].pos_ == "NOUN"
def test_attributeruler_tag_map(nlp, tag_map):
ruler = AttributeRuler(nlp.vocab)
ruler.load_from_tag_map(tag_map)
check_tag_map(ruler)
def test_attributeruler_tag_map_initialize(nlp, tag_map):
ruler = nlp.add_pipe("attribute_ruler")
ruler.initialize(lambda: [], tag_map=tag_map)
check_tag_map(ruler)
def test_attributeruler_morph_rules(nlp, morph_rules):
ruler = AttributeRuler(nlp.vocab)
ruler.load_from_morph_rules(morph_rules)
check_morph_rules(ruler)
def test_attributeruler_morph_rules_initialize(nlp, morph_rules):
ruler = nlp.add_pipe("attribute_ruler")
ruler.initialize(lambda: [], morph_rules=morph_rules)
check_morph_rules(ruler)
def test_attributeruler_indices(nlp):
a = nlp.add_pipe("attribute_ruler")
a.add(
[[{"ORTH": "a"}, {"ORTH": "test"}]],
{"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
index=0,
)
a.add(
[[{"ORTH": "This"}, {"ORTH": "is"}]],
{"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"},
index=1,
)
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)
text = "This is a test."
doc = nlp(text)
for i in range(len(doc)):
if i == 1:
assert doc[i].lemma_ == "was"
assert str(doc[i].morph) == "Case=Nom|Number=Sing"
elif i == 2:
assert doc[i].lemma_ == "the"
assert str(doc[i].morph) == "Case=Nom|Number=Plur"
elif i == 3:
assert doc[i].lemma_ == "cat"
else:
assert str(doc[i].morph) == ""
# raises an error when trying to modify a token outside of the match
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)
with pytest.raises(ValueError):
doc = nlp(text)
# raises an error when trying to modify a token outside of the match
a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=10)
with pytest.raises(ValueError):
doc = nlp(text)
def test_attributeruler_patterns_prop(nlp, pattern_dicts):
a = nlp.add_pipe("attribute_ruler")
a.add_patterns(pattern_dicts)
for p1, p2 in zip(pattern_dicts, a.patterns):
assert p1["patterns"] == p2["patterns"]
assert p1["attrs"] == p2["attrs"]
if p1.get("index"):
assert p1["index"] == p2["index"]
def test_attributeruler_serialize(nlp, pattern_dicts):
a = nlp.add_pipe("attribute_ruler")
a.add_patterns(pattern_dicts)
text = "This is a test."
attrs = ["ORTH", "LEMMA", "MORPH"]
doc = nlp(text)
# bytes roundtrip
a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())
assert a.to_bytes() == a_reloaded.to_bytes()
doc1 = a_reloaded(nlp.make_doc(text))
numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs))
assert a.patterns == a_reloaded.patterns
# disk roundtrip
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(text)
assert nlp2.get_pipe("attribute_ruler").to_bytes() == a.to_bytes()
assert numpy.array_equal(doc.to_array(attrs), doc2.to_array(attrs))
assert a.patterns == nlp2.get_pipe("attribute_ruler").patterns