ai-content-maker/.venv/Lib/site-packages/spacy/tests/pipeline/test_attributeruler.py

import numpy
import pytest

from spacy import registry, util
from spacy.lang.en import English
from spacy.pipeline import AttributeRuler
from spacy.tokens import Doc
from spacy.training import Example

from ..util import make_tempdir


@pytest.fixture
def nlp():
    """Provide a newly constructed ``English`` language object."""
    pipeline = English()
    return pipeline


@pytest.fixture
def pattern_dicts():
    """Provide the three attribute-ruler pattern entries shared by the tests.

    Each entry carries ``patterns`` and ``attrs``; only the last one also
    carries an explicit ``index``.
    """
    # Matches "a" (or "irrelevant") and sets both lemma and morphology.
    lemma_and_morph = {
        "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
        "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
    }
    # Matches "test" and sets only the lemma.
    lemma_only = {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}}
    # Matches "test" again and sets only the morphology.
    morph_only = {
        "patterns": [[{"ORTH": "test"}]],
        "attrs": {"MORPH": "Case=Nom|Number=Sing"},
        "index": 0,
    }
    return [lemma_and_morph, lemma_only, morph_only]


@pytest.fixture
def tag_map():
    """Provide a small tag map covering the "." and "," tags."""
    punctuation_tags = {}
    punctuation_tags["."] = {"POS": "PUNCT", "PunctType": "peri"}
    punctuation_tags[","] = {"POS": "PUNCT", "PunctType": "comm"}
    return punctuation_tags


@pytest.fixture
def morph_rules():
    """Provide morph rules with a single entry: tag "DT", word "the"."""
    features_for_the = {"POS": "DET", "LEMMA": "a", "Case": "Nom"}
    return {"DT": {"the": features_for_the}}


def check_tag_map(ruler):
    """Run ``ruler`` over a pre-tagged sample doc and check the tag-map result.

    Only the final "." token (index 4) is expected to receive a POS and
    morphology; every other token must be left without either.
    """
    words = ["This", "is", "a", "test", "."]
    tags = ["DT", "VBZ", "DT", "NN", "."]
    annotated = ruler(Doc(ruler.vocab, words=words, tags=tags))
    for position, token in enumerate(annotated):
        if position == 4:
            expected_pos, expected_morph = "PUNCT", "PunctType=peri"
        else:
            expected_pos, expected_morph = "", ""
        assert token.pos_ == expected_pos
        assert str(token.morph) == expected_morph


def check_morph_rules(ruler):
    """Run ``ruler`` over a pre-tagged sample doc and check the morph-rule result.

    Only "the" (index 2) is expected to receive POS, lemma and morphology;
    every other token must be left without POS or morphology.
    """
    words = ["This", "is", "the", "test", "."]
    tags = ["DT", "VBZ", "DT", "NN", "."]
    annotated = ruler(Doc(ruler.vocab, words=words, tags=tags))
    for position, token in enumerate(annotated):
        if position == 2:
            assert token.pos_ == "DET"
            assert token.lemma_ == "a"
            assert str(token.morph) == "Case=Nom"
            continue
        assert token.pos_ == ""
        assert str(token.morph) == ""


def test_attributeruler_init(nlp, pattern_dicts):
    """Check that patterns added one by one via ``add`` are applied by the pipeline."""
    ruler = nlp.add_pipe("attribute_ruler")
    for entry in pattern_dicts:
        ruler.add(**entry)
    processed = nlp("This is a test.")
    third, fourth = processed[2], processed[3]
    assert third.lemma_ == "the"
    assert str(third.morph) == "Case=Nom|Number=Plur"
    assert fourth.lemma_ == "cat"
    assert str(fourth.morph) == "Case=Nom|Number=Sing"
    for attribute in ("LEMMA", "MORPH"):
        assert processed.has_annotation(attribute)


def test_attributeruler_init_patterns(nlp, pattern_dicts):
    """Check that patterns supplied at initialization time are applied.

    Two routes are exercised: passing ``patterns`` directly to the
    component's ``initialize`` and resolving them from the ``misc`` registry
    through the pipeline's ``initialize`` config.
    """
    sentence = "This is a test."

    def assert_expected_annotations(doc):
        # Both routes must yield the same lemma/morph values on tokens 2 and 3.
        assert doc[2].lemma_ == "the"
        assert str(doc[2].morph) == "Case=Nom|Number=Plur"
        assert doc[3].lemma_ == "cat"
        assert str(doc[3].morph) == "Case=Nom|Number=Sing"
        assert doc.has_annotation("LEMMA")
        assert doc.has_annotation("MORPH")

    # Route 1: initialize the component with the patterns directly.
    component = nlp.add_pipe("attribute_ruler")
    component.initialize(lambda: [], patterns=pattern_dicts)
    assert_expected_annotations(nlp(sentence))
    nlp.remove_pipe("attribute_ruler")

    # Route 2: initialize with patterns looked up in the misc registry.
    @registry.misc("attribute_ruler_patterns")
    def attribute_ruler_patterns():
        # Sets both lemma and morphology on "a" (or "irrelevant").
        lemma_and_morph = {
            "patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],
            "attrs": {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"},
        }
        # one pattern sets the lemma
        lemma_only = {"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}}
        # another pattern sets the morphology
        morph_only = {
            "patterns": [[{"ORTH": "test"}]],
            "attrs": {"MORPH": "Case=Nom|Number=Sing"},
            "index": 0,
        }
        return [lemma_and_morph, lemma_only, morph_only]

    component_settings = {"patterns": {"@misc": "attribute_ruler_patterns"}}
    nlp.config["initialize"]["components"]["attribute_ruler"] = component_settings
    nlp.add_pipe("attribute_ruler")
    nlp.initialize()
    assert_expected_annotations(nlp(sentence))


def test_attributeruler_init_clear(nlp, pattern_dicts):
    """Test that initialization clears patterns."""
    component = nlp.add_pipe("attribute_ruler")
    # A newly added component starts with an empty matcher.
    assert len(component.matcher) == 0
    component.add_patterns(pattern_dicts)
    assert len(component.matcher) > 0
    # Initializing without patterns must leave the matcher empty again.
    component.initialize(lambda: [])
    assert len(component.matcher) == 0


def test_attributeruler_score(nlp, pattern_dicts):
    """Check evaluation scores with the default scorer and with a custom one."""
    sentence = "This is a test."

    # Default scorer: initialize with patterns and sanity-check the output.
    component = nlp.add_pipe("attribute_ruler")
    component.initialize(lambda: [], patterns=pattern_dicts)
    predicted = nlp(sentence)
    assert predicted[2].lemma_ == "the"
    assert str(predicted[2].morph) == "Case=Nom|Number=Plur"
    assert predicted[3].lemma_ == "cat"
    assert str(predicted[3].morph) == "Case=Nom|Number=Sing"

    gold_annotations = {"lemmas": ["this", "is", "a", "cat", "."]}
    reference = Example.from_dict(nlp.make_doc(sentence), gold_annotations)
    dev_examples = [reference]
    default_scores = nlp.evaluate(dev_examples)
    # "cat" is the only correct lemma
    assert default_scores["lemma_acc"] == pytest.approx(0.2)
    # the gold annotations contain no morphs, so no morph score is expected
    assert default_scores["morph_acc"] is None
    nlp.remove_pipe("attribute_ruler")

    # Custom scorer: it simply echoes the value handed over via scorer_cfg.
    @registry.misc("weird_scorer.v1")
    def make_weird_scorer():
        def weird_scorer(examples, weird_score, **kwargs):
            return {"weird_score": weird_score}

        return weird_scorer

    scorer_config = {"scorer": {"@misc": "weird_scorer.v1"}}
    component = nlp.add_pipe("attribute_ruler", config=scorer_config)
    component.initialize(lambda: [], patterns=pattern_dicts)

    first_scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})
    assert first_scores["weird_score"] == 0.12345
    assert "token_acc" in first_scores
    assert "lemma_acc" not in first_scores

    second_scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})
    assert second_scores["weird_score"] == 0.23456


def test_attributeruler_rule_order(nlp):
    """Check which rule determines POS when two rules match the same token.

    Both rules match the "VBZ" tag; the expected POS is the one set by the
    rule that was added second.
    """
    ruler = AttributeRuler(nlp.vocab)
    verb_rule = {"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "VERB"}}
    noun_rule = {"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "NOUN"}}
    ruler.add_patterns([verb_rule, noun_rule])
    words = ["This", "is", "a", "test", "."]
    tags = ["DT", "VBZ", "DT", "NN", "."]
    annotated = ruler(Doc(nlp.vocab, words=words, tags=tags))
    assert annotated[1].pos_ == "NOUN"


def test_attributeruler_tag_map(nlp, tag_map):
    """Load a tag map into a standalone ruler and verify its effect."""
    standalone = AttributeRuler(nlp.vocab)
    standalone.load_from_tag_map(tag_map)
    check_tag_map(standalone)


def test_attributeruler_tag_map_initialize(nlp, tag_map):
    """Pass a tag map through ``initialize`` on a pipeline component and verify its effect."""
    component = nlp.add_pipe("attribute_ruler")
    component.initialize(lambda: [], tag_map=tag_map)
    check_tag_map(component)


def test_attributeruler_morph_rules(nlp, morph_rules):
    """Load morph rules into a standalone ruler and verify their effect."""
    standalone = AttributeRuler(nlp.vocab)
    standalone.load_from_morph_rules(morph_rules)
    check_morph_rules(standalone)


def test_attributeruler_morph_rules_initialize(nlp, morph_rules):
    """Pass morph rules through ``initialize`` on a pipeline component and verify their effect."""
    component = nlp.add_pipe("attribute_ruler")
    component.initialize(lambda: [], morph_rules=morph_rules)
    check_morph_rules(component)


def test_attributeruler_indices(nlp):
    """Check how ``index`` selects the token to modify within a match.

    Valid indices (0, 1, -1) must annotate the selected token; indices that
    point outside the two-token match (2, 10) must raise ``ValueError`` when
    the pipeline is run.
    """
    ruler = nlp.add_pipe("attribute_ruler")
    a_test = [[{"ORTH": "a"}, {"ORTH": "test"}]]
    this_is = [[{"ORTH": "This"}, {"ORTH": "is"}]]
    ruler.add(a_test, {"LEMMA": "the", "MORPH": "Case=Nom|Number=Plur"}, index=0)
    ruler.add(this_is, {"LEMMA": "was", "MORPH": "Case=Nom|Number=Sing"}, index=1)
    ruler.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)

    text = "This is a test."
    # Tokens that must carry both a lemma and a morphology, keyed by position.
    lemma_and_morph = {
        1: ("was", "Case=Nom|Number=Sing"),
        2: ("the", "Case=Nom|Number=Plur"),
    }
    for position, token in enumerate(nlp(text)):
        if position in lemma_and_morph:
            expected_lemma, expected_morph = lemma_and_morph[position]
            assert token.lemma_ == expected_lemma
            assert str(token.morph) == expected_morph
        elif position == 3:
            assert token.lemma_ == "cat"
        else:
            assert str(token.morph) == ""

    # raises an error when trying to modify a token outside of the match;
    # note the index=2 rule is still registered when index=10 is added
    for out_of_range in (2, 10):
        ruler.add(
            [[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=out_of_range
        )
        with pytest.raises(ValueError):
            nlp(text)


def test_attributeruler_patterns_prop(nlp, pattern_dicts):
    """Check that ``patterns`` reports back what was passed to ``add_patterns``.

    Every input entry must be matched by an entry with the same ``patterns``
    and ``attrs``; when the input specifies an ``index``, that must be
    reported unchanged as well.
    """
    a = nlp.add_pipe("attribute_ruler")
    a.add_patterns(pattern_dicts)
    reported = a.patterns
    # zip() stops at the shorter sequence, so compare lengths explicitly to
    # make sure a missing entry cannot go unnoticed.
    assert len(reported) == len(pattern_dicts)
    for p1, p2 in zip(pattern_dicts, reported):
        assert p1["patterns"] == p2["patterns"]
        assert p1["attrs"] == p2["attrs"]
        # Test for key presence rather than truthiness: the only explicit
        # index in the fixture is 0, which a truthiness check would skip.
        if "index" in p1:
            assert p1["index"] == p2["index"]


def test_attributeruler_serialize(nlp, pattern_dicts):
    """Check that the ruler survives a bytes roundtrip and a disk roundtrip.

    After each roundtrip the serialized bytes, the reported patterns and the
    ORTH/LEMMA/MORPH annotations produced on a sample text must be the same
    as for the original component.
    """
    a = nlp.add_pipe("attribute_ruler")
    a.add_patterns(pattern_dicts)
    text = "This is a test."
    attrs = ["ORTH", "LEMMA", "MORPH"]
    doc = nlp(text)
    # bytes roundtrip
    a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())
    assert a.to_bytes() == a_reloaded.to_bytes()
    doc1 = a_reloaded(nlp.make_doc(text))
    # The comparison result must be asserted; a bare array_equal() call only
    # computes a boolean and discards it, so it could never fail the test.
    assert numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs))
    assert a.patterns == a_reloaded.patterns
    # disk roundtrip
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(text)
        assert nlp2.get_pipe("attribute_ruler").to_bytes() == a.to_bytes()
        assert numpy.array_equal(doc.to_array(attrs), doc2.to_array(attrs))
        assert a.patterns == nlp2.get_pipe("attribute_ruler").patterns
first commit 2024-05-03 04:18:51 +03:00			`import numpy`
			`import pytest`

			`from spacy import registry, util`
			`from spacy.lang.en import English`
			`from spacy.pipeline import AttributeRuler`
			`from spacy.tokens import Doc`
			`from spacy.training import Example`

			`from ..util import make_tempdir`


			`@pytest.fixture`
			`def nlp():`
			`return English()`


			`@pytest.fixture`
			`def pattern_dicts():`
			`return [`
			`{`
			`"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],`
			`"attrs": {"LEMMA": "the", "MORPH": "Case=Nom\|Number=Plur"},`
			`},`
			`# one pattern sets the lemma`
			`{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},`
			`# another pattern sets the morphology`
			`{`
			`"patterns": [[{"ORTH": "test"}]],`
			`"attrs": {"MORPH": "Case=Nom\|Number=Sing"},`
			`"index": 0,`
			`},`
			`]`


			`@pytest.fixture`
			`def tag_map():`
			`return {`
			`".": {"POS": "PUNCT", "PunctType": "peri"},`
			`",": {"POS": "PUNCT", "PunctType": "comm"},`
			`}`


			`@pytest.fixture`
			`def morph_rules():`
			`return {"DT": {"the": {"POS": "DET", "LEMMA": "a", "Case": "Nom"}}}`


			`def check_tag_map(ruler):`
			`doc = Doc(`
			`ruler.vocab,`
			`words=["This", "is", "a", "test", "."],`
			`tags=["DT", "VBZ", "DT", "NN", "."],`
			`)`
			`doc = ruler(doc)`
			`for i in range(len(doc)):`
			`if i == 4:`
			`assert doc[i].pos_ == "PUNCT"`
			`assert str(doc[i].morph) == "PunctType=peri"`
			`else:`
			`assert doc[i].pos_ == ""`
			`assert str(doc[i].morph) == ""`


			`def check_morph_rules(ruler):`
			`doc = Doc(`
			`ruler.vocab,`
			`words=["This", "is", "the", "test", "."],`
			`tags=["DT", "VBZ", "DT", "NN", "."],`
			`)`
			`doc = ruler(doc)`
			`for i in range(len(doc)):`
			`if i != 2:`
			`assert doc[i].pos_ == ""`
			`assert str(doc[i].morph) == ""`
			`else:`
			`assert doc[2].pos_ == "DET"`
			`assert doc[2].lemma_ == "a"`
			`assert str(doc[2].morph) == "Case=Nom"`


			`def test_attributeruler_init(nlp, pattern_dicts):`
			`a = nlp.add_pipe("attribute_ruler")`
			`for p in pattern_dicts:`
			`a.add(**p)`
			`doc = nlp("This is a test.")`
			`assert doc[2].lemma_ == "the"`
			`assert str(doc[2].morph) == "Case=Nom\|Number=Plur"`
			`assert doc[3].lemma_ == "cat"`
			`assert str(doc[3].morph) == "Case=Nom\|Number=Sing"`
			`assert doc.has_annotation("LEMMA")`
			`assert doc.has_annotation("MORPH")`


			`def test_attributeruler_init_patterns(nlp, pattern_dicts):`
			`# initialize with patterns`
			`ruler = nlp.add_pipe("attribute_ruler")`
			`ruler.initialize(lambda: [], patterns=pattern_dicts)`
			`doc = nlp("This is a test.")`
			`assert doc[2].lemma_ == "the"`
			`assert str(doc[2].morph) == "Case=Nom\|Number=Plur"`
			`assert doc[3].lemma_ == "cat"`
			`assert str(doc[3].morph) == "Case=Nom\|Number=Sing"`
			`assert doc.has_annotation("LEMMA")`
			`assert doc.has_annotation("MORPH")`
			`nlp.remove_pipe("attribute_ruler")`

			`# initialize with patterns from misc registry`
			`@registry.misc("attribute_ruler_patterns")`
			`def attribute_ruler_patterns():`
			`return [`
			`{`
			`"patterns": [[{"ORTH": "a"}], [{"ORTH": "irrelevant"}]],`
			`"attrs": {"LEMMA": "the", "MORPH": "Case=Nom\|Number=Plur"},`
			`},`
			`# one pattern sets the lemma`
			`{"patterns": [[{"ORTH": "test"}]], "attrs": {"LEMMA": "cat"}},`
			`# another pattern sets the morphology`
			`{`
			`"patterns": [[{"ORTH": "test"}]],`
			`"attrs": {"MORPH": "Case=Nom\|Number=Sing"},`
			`"index": 0,`
			`},`
			`]`

			`nlp.config["initialize"]["components"]["attribute_ruler"] = {`
			`"patterns": {"@misc": "attribute_ruler_patterns"}`
			`}`
			`nlp.add_pipe("attribute_ruler")`
			`nlp.initialize()`
			`doc = nlp("This is a test.")`
			`assert doc[2].lemma_ == "the"`
			`assert str(doc[2].morph) == "Case=Nom\|Number=Plur"`
			`assert doc[3].lemma_ == "cat"`
			`assert str(doc[3].morph) == "Case=Nom\|Number=Sing"`
			`assert doc.has_annotation("LEMMA")`
			`assert doc.has_annotation("MORPH")`


			`def test_attributeruler_init_clear(nlp, pattern_dicts):`
			`"""Test that initialization clears patterns."""`
			`ruler = nlp.add_pipe("attribute_ruler")`
			`assert not len(ruler.matcher)`
			`ruler.add_patterns(pattern_dicts)`
			`assert len(ruler.matcher)`
			`ruler.initialize(lambda: [])`
			`assert not len(ruler.matcher)`


			`def test_attributeruler_score(nlp, pattern_dicts):`
			`# initialize with patterns`
			`ruler = nlp.add_pipe("attribute_ruler")`
			`ruler.initialize(lambda: [], patterns=pattern_dicts)`
			`doc = nlp("This is a test.")`
			`assert doc[2].lemma_ == "the"`
			`assert str(doc[2].morph) == "Case=Nom\|Number=Plur"`
			`assert doc[3].lemma_ == "cat"`
			`assert str(doc[3].morph) == "Case=Nom\|Number=Sing"`
			`doc = nlp.make_doc("This is a test.")`
			`dev_examples = [Example.from_dict(doc, {"lemmas": ["this", "is", "a", "cat", "."]})]`
			`scores = nlp.evaluate(dev_examples)`
			`# "cat" is the only correct lemma`
			`assert scores["lemma_acc"] == pytest.approx(0.2)`
			`# no morphs are set`
			`assert scores["morph_acc"] is None`
			`nlp.remove_pipe("attribute_ruler")`

			`# test with custom scorer`
			`@registry.misc("weird_scorer.v1")`
			`def make_weird_scorer():`
			`def weird_scorer(examples, weird_score, **kwargs):`
			`return {"weird_score": weird_score}`

			`return weird_scorer`

			`ruler = nlp.add_pipe(`
			`"attribute_ruler", config={"scorer": {"@misc": "weird_scorer.v1"}}`
			`)`
			`ruler.initialize(lambda: [], patterns=pattern_dicts)`
			`scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.12345})`
			`assert scores["weird_score"] == 0.12345`
			`assert "token_acc" in scores`
			`assert "lemma_acc" not in scores`
			`scores = nlp.evaluate(dev_examples, scorer_cfg={"weird_score": 0.23456})`
			`assert scores["weird_score"] == 0.23456`


			`def test_attributeruler_rule_order(nlp):`
			`a = AttributeRuler(nlp.vocab)`
			`patterns = [`
			`{"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "VERB"}},`
			`{"patterns": [[{"TAG": "VBZ"}]], "attrs": {"POS": "NOUN"}},`
			`]`
			`a.add_patterns(patterns)`
			`doc = Doc(`
			`nlp.vocab,`
			`words=["This", "is", "a", "test", "."],`
			`tags=["DT", "VBZ", "DT", "NN", "."],`
			`)`
			`doc = a(doc)`
			`assert doc[1].pos_ == "NOUN"`


			`def test_attributeruler_tag_map(nlp, tag_map):`
			`ruler = AttributeRuler(nlp.vocab)`
			`ruler.load_from_tag_map(tag_map)`
			`check_tag_map(ruler)`


			`def test_attributeruler_tag_map_initialize(nlp, tag_map):`
			`ruler = nlp.add_pipe("attribute_ruler")`
			`ruler.initialize(lambda: [], tag_map=tag_map)`
			`check_tag_map(ruler)`


			`def test_attributeruler_morph_rules(nlp, morph_rules):`
			`ruler = AttributeRuler(nlp.vocab)`
			`ruler.load_from_morph_rules(morph_rules)`
			`check_morph_rules(ruler)`


			`def test_attributeruler_morph_rules_initialize(nlp, morph_rules):`
			`ruler = nlp.add_pipe("attribute_ruler")`
			`ruler.initialize(lambda: [], morph_rules=morph_rules)`
			`check_morph_rules(ruler)`


			`def test_attributeruler_indices(nlp):`
			`a = nlp.add_pipe("attribute_ruler")`
			`a.add(`
			`[[{"ORTH": "a"}, {"ORTH": "test"}]],`
			`{"LEMMA": "the", "MORPH": "Case=Nom\|Number=Plur"},`
			`index=0,`
			`)`
			`a.add(`
			`[[{"ORTH": "This"}, {"ORTH": "is"}]],`
			`{"LEMMA": "was", "MORPH": "Case=Nom\|Number=Sing"},`
			`index=1,`
			`)`
			`a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=-1)`

			`text = "This is a test."`
			`doc = nlp(text)`
			`for i in range(len(doc)):`
			`if i == 1:`
			`assert doc[i].lemma_ == "was"`
			`assert str(doc[i].morph) == "Case=Nom\|Number=Sing"`
			`elif i == 2:`
			`assert doc[i].lemma_ == "the"`
			`assert str(doc[i].morph) == "Case=Nom\|Number=Plur"`
			`elif i == 3:`
			`assert doc[i].lemma_ == "cat"`
			`else:`
			`assert str(doc[i].morph) == ""`
			`# raises an error when trying to modify a token outside of the match`
			`a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=2)`
			`with pytest.raises(ValueError):`
			`doc = nlp(text)`
			`# raises an error when trying to modify a token outside of the match`
			`a.add([[{"ORTH": "a"}, {"ORTH": "test"}]], {"LEMMA": "cat"}, index=10)`
			`with pytest.raises(ValueError):`
			`doc = nlp(text)`


			`def test_attributeruler_patterns_prop(nlp, pattern_dicts):`
			`a = nlp.add_pipe("attribute_ruler")`
			`a.add_patterns(pattern_dicts)`
			`for p1, p2 in zip(pattern_dicts, a.patterns):`
			`assert p1["patterns"] == p2["patterns"]`
			`assert p1["attrs"] == p2["attrs"]`
			`if p1.get("index"):`
			`assert p1["index"] == p2["index"]`


			`def test_attributeruler_serialize(nlp, pattern_dicts):`
			`a = nlp.add_pipe("attribute_ruler")`
			`a.add_patterns(pattern_dicts)`
			`text = "This is a test."`
			`attrs = ["ORTH", "LEMMA", "MORPH"]`
			`doc = nlp(text)`
			`# bytes roundtrip`
			`a_reloaded = AttributeRuler(nlp.vocab).from_bytes(a.to_bytes())`
			`assert a.to_bytes() == a_reloaded.to_bytes()`
			`doc1 = a_reloaded(nlp.make_doc(text))`
			`numpy.array_equal(doc.to_array(attrs), doc1.to_array(attrs))`
			`assert a.patterns == a_reloaded.patterns`
			`# disk roundtrip`
			`with make_tempdir() as tmp_dir:`
			`nlp.to_disk(tmp_dir)`
			`nlp2 = util.load_model_from_path(tmp_dir)`
			`doc2 = nlp2(text)`
			`assert nlp2.get_pipe("attribute_ruler").to_bytes() == a.to_bytes()`
			`assert numpy.array_equal(doc.to_array(attrs), doc2.to_array(attrs))`
			`assert a.patterns == nlp2.get_pipe("attribute_ruler").patterns`