113 lines
3.5 KiB
Python
113 lines
3.5 KiB
Python
"""
Unit tests for Senna
"""
||
|
|
||
|
import unittest
|
||
|
from os import environ, path, sep
|
||
|
|
||
|
from nltk.classify import Senna
|
||
|
from nltk.tag import SennaChunkTagger, SennaNERTagger, SennaTagger
|
||
|
|
||
|
# Locate the Senna installation used by these tests: a SENNA environment
# variable takes precedence (normalised, with a trailing separator appended);
# otherwise the hard-coded default location is used verbatim.
SENNA_EXECUTABLE_PATH = (
    path.normpath(environ["SENNA"]) + sep
    if "SENNA" in environ
    else "/usr/share/senna-v3.0"
)

# Gate for the test classes below: they are skipped unless this path exists.
senna_is_installed = path.exists(SENNA_EXECUTABLE_PATH)
|
||
|
|
||
|
|
||
|
@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
class TestSennaPipeline(unittest.TestCase):
    """Tests for the ``Senna`` pipeline class imported from ``nltk.classify``."""

    def test_senna_pipeline(self):
        """Run POS, chunk and NER tagging in one pass and check every token."""
        operations = ["pos", "chk", "ner"]
        senna = Senna(SENNA_EXECUTABLE_PATH, operations)
        words = "Dusseldorf is an international business center".split()

        # Flatten each per-token mapping into a (word, chunk, ner, pos) tuple
        # so the whole sentence can be compared in a single assertion.
        observed = []
        for annotation in senna.tag(words):
            observed.append(
                (
                    annotation["word"],
                    annotation["chk"],
                    annotation["ner"],
                    annotation["pos"],
                )
            )

        wanted = [
            ("Dusseldorf", "B-NP", "B-LOC", "NNP"),
            ("is", "B-VP", "O", "VBZ"),
            ("an", "B-NP", "O", "DT"),
            ("international", "I-NP", "O", "JJ"),
            ("business", "I-NP", "O", "NN"),
            ("center", "I-NP", "O", "NN"),
        ]
        self.assertEqual(observed, wanted)
|
||
|
|
||
|
|
||
|
@unittest.skipUnless(senna_is_installed, "Requires Senna executable")
class TestSennaTagger(unittest.TestCase):
    """Tests for the Senna POS, chunk and NER taggers from ``nltk.tag``."""

    def test_senna_tagger(self):
        """POS-tag a single question and compare against the expected tags."""
        words = "What is the airspeed of an unladen swallow ?".split()
        pos_tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
        tagged = pos_tagger.tag(words)

        # Expected output is one (token, tag) pair per input token, in order.
        pos_tags = ["WP", "VBZ", "DT", "NN", "IN", "DT", "NN", "NN", "."]
        wanted = list(zip(words, pos_tags))
        self.assertEqual(tagged, wanted)

    def test_senna_chunk_tagger(self):
        """Check BIO chunk tags, then the NP chunks derived from them."""
        words = "What is the airspeed of an unladen swallow ?".split()
        chunker = SennaChunkTagger(SENNA_EXECUTABLE_PATH)
        bio_tagged = chunker.tag(words)

        bio_tags = [
            "B-NP",
            "B-VP",
            "B-NP",
            "I-NP",
            "B-PP",
            "B-NP",
            "I-NP",
            "I-NP",
            "O",
        ]
        wanted_bio = list(zip(words, bio_tags))

        # bio_to_chunks is fed the tagger's own output; the expected pairs are
        # (chunk text, "-"-joined token indices), e.g. tokens 2 and 3 form
        # "the airspeed".
        noun_phrases = list(chunker.bio_to_chunks(bio_tagged, chunk_type="NP"))
        wanted_noun_phrases = [
            ("What", "0"),
            ("the airspeed", "2-3"),
            ("an unladen swallow", "5-6-7"),
        ]

        self.assertEqual(bio_tagged, wanted_bio)
        self.assertEqual(noun_phrases, wanted_noun_phrases)

    def test_senna_ner_tagger(self):
        """NER-tag two sentences covering PER, LOC and ORG entities."""
        ner = SennaNERTagger(SENNA_EXECUTABLE_PATH)

        first_words = "Shakespeare theatre was in London .".split()
        first_tagged = ner.tag(first_words)
        first_labels = ["B-PER", "O", "O", "O", "B-LOC", "O"]
        first_wanted = list(zip(first_words, first_labels))

        second_words = "UN headquarters are in NY , USA .".split()
        second_tagged = ner.tag(second_words)
        second_labels = ["B-ORG", "O", "O", "O", "B-LOC", "O", "B-LOC", "O"]
        second_wanted = list(zip(second_words, second_labels))

        self.assertEqual(first_tagged, first_wanted)
        self.assertEqual(second_tagged, second_wanted)
|