158 lines
6.0 KiB
Python
158 lines
6.0 KiB
Python
import unittest
|
|
from contextlib import closing
|
|
|
|
from nltk import data
|
|
from nltk.stem.porter import PorterStemmer
|
|
from nltk.stem.snowball import SnowballStemmer
|
|
|
|
|
|
class SnowballTest(unittest.TestCase):
    """Spot checks of ``SnowballStemmer`` output for several languages."""

    def test_arabic(self):
        """
        Check the Snowball Arabic light stemmer, which deals with
        prefixes and suffixes, under each ``ignore_stopwords`` setting.
        """
        # ignore_stopwords=True: the last two entries are expected to come
        # back unchanged.
        skipping_stopwords = SnowballStemmer("arabic", ignore_stopwords=True)
        for token, expected in (
            ("الْعَرَبِــــــيَّة", "عرب"),
            ("العربية", "عرب"),
            ("فقالوا", "قال"),
            ("الطالبات", "طالب"),
            ("فالطالبات", "طالب"),
            ("والطالبات", "طالب"),
            ("الطالبون", "طالب"),
            ("اللذان", "اللذان"),
            ("من", "من"),
        ):
            assert skipping_stopwords.stem(token) == expected

        # ignore_stopwords=False: the first entry is a stop word and is
        # expected to be stemmed like any other token.
        stemming_stopwords = SnowballStemmer("arabic", ignore_stopwords=False)
        for token, expected in (
            ("اللذان", "اللذ"),
            ("الطالبات", "طالب"),
            ("الكلمات", "كلم"),
        ):
            assert stemming_stopwords.stem(token) == expected

        # Stemmer created without passing any value for ignore_stopwords.
        default_stemmer = SnowballStemmer("arabic")
        for token, expected in (
            ("الْعَرَبِــــــيَّة", "عرب"),
            ("العربية", "عرب"),
            ("فقالوا", "قال"),
            ("الطالبات", "طالب"),
            ("الكلمات", "كلم"),
        ):
            assert default_stemmer.stem(token) == expected

    def test_russian(self):
        """Check the Russian stemmer on a single adjective."""
        assert SnowballStemmer("russian").stem("авантненькая") == "авантненьк"

    def test_german(self):
        """Check the German stemmer with and without ``ignore_stopwords``."""
        plain = SnowballStemmer("german")
        skipping_stopwords = SnowballStemmer("german", ignore_stopwords=True)

        # Both configurations are expected to agree on this word.
        for stemmer in (plain, skipping_stopwords):
            assert stemmer.stem("Schr\xe4nke") == "schrank"

        # "keinen" is stemmed by the plain stemmer but expected back
        # unchanged when ignore_stopwords=True.
        assert plain.stem("keinen") == "kein"
        assert skipping_stopwords.stem("keinen") == "keinen"

    def test_spanish(self):
        """Check the Spanish stemmer, including a former crash case."""
        spanish = SnowballStemmer("spanish")

        assert spanish.stem("Visionado") == "vision"

        # The word 'algue' was raising an IndexError
        assert spanish.stem("algue") == "algu"

    def test_short_strings_bug(self):
        """Check that the English stemmer handles the short token "y's"."""
        english = SnowballStemmer("english")
        assert english.stem("y's") == "y"
|
|
|
|
|
|
class PorterTest(unittest.TestCase):
    """Checks ``PorterStemmer`` against reference vocabularies and past bugs."""

    def _read_lines(self, resource):
        """Return the lines of a UTF-8 text resource located via ``data.find``.

        The stream is wrapped in ``closing`` so it is released even if
        reading fails.

        :param resource: resource path handed to ``data.find``.
        :return: list of lines without their line terminators.
        """
        with closing(data.find(resource).open(encoding="utf-8")) as fp:
            return fp.read().splitlines()

    def _vocabulary(self):
        """Return the words of the Porter test vocabulary, one per line."""
        return self._read_lines("stemmers/porter_test/porter_vocabulary.txt")

    def _test_against_expected_output(self, stemmer_mode, expected_stems):
        """Stem every vocabulary word and compare with ``expected_stems``.

        :param stemmer_mode: value passed as ``mode`` to ``PorterStemmer``.
        :param expected_stems: expected stems, in the same order as the
            words returned by ``_vocabulary``.
        :raises AssertionError: on the first word whose stem differs.
        """
        stemmer = PorterStemmer(mode=stemmer_mode)
        # zip pairs word i with expected stem i and stops at the shorter
        # of the two sequences.
        for word, true_stem in zip(self._vocabulary(), expected_stems):
            our_stem = stemmer.stem(word)
            assert (
                our_stem == true_stem
            ), "{} should stem to {} in {} mode but got {}".format(
                word,
                true_stem,
                stemmer_mode,
                our_stem,
            )

    def test_vocabulary_martin_mode(self):
        """Tests all words from the test vocabulary provided by M Porter

        The sample vocabulary and output were sourced from
        https://tartarus.org/martin/PorterStemmer/voc.txt and
        https://tartarus.org/martin/PorterStemmer/output.txt
        and are linked to from the Porter Stemmer algorithm's homepage
        at https://tartarus.org/martin/PorterStemmer/
        """
        self._test_against_expected_output(
            PorterStemmer.MARTIN_EXTENSIONS,
            self._read_lines("stemmers/porter_test/porter_martin_output.txt"),
        )

    def test_vocabulary_nltk_mode(self):
        """Tests all vocabulary words in ``NLTK_EXTENSIONS`` mode."""
        self._test_against_expected_output(
            PorterStemmer.NLTK_EXTENSIONS,
            self._read_lines("stemmers/porter_test/porter_nltk_output.txt"),
        )

    def test_vocabulary_original_mode(self):
        """Tests all vocabulary words in ``ORIGINAL_ALGORITHM`` mode."""
        # The list of stems for this test was generated by taking the
        # Martin-blessed stemmer from
        # https://tartarus.org/martin/PorterStemmer/c.txt
        # and removing all the --DEPARTURE-- sections from it and
        # running it against Martin's test vocabulary.
        #
        # The comparison runs once, and the expected-output file is read
        # through _read_lines so its handle is always closed.
        self._test_against_expected_output(
            PorterStemmer.ORIGINAL_ALGORITHM,
            self._read_lines("stemmers/porter_test/porter_original_output.txt"),
        )

    def test_oed_bug(self):
        """Test for bug https://github.com/nltk/nltk/issues/1581

        Ensures that 'oed' can be stemmed without throwing an error.
        """
        assert PorterStemmer().stem("oed") == "o"

    def test_lowercase_option(self):
        """Test for improvement on https://github.com/nltk/nltk/issues/2507

        Ensures that stems are lowercased when ``to_lowercase`` is not
        given, and that the input's case is kept with ``to_lowercase=False``.
        """
        porter = PorterStemmer()
        assert porter.stem("On") == "on"
        assert porter.stem("I") == "i"
        assert porter.stem("I", to_lowercase=False) == "I"
        assert porter.stem("Github") == "github"
        assert porter.stem("Github", to_lowercase=False) == "Github"
|