1099 lines
39 KiB
Python
1099 lines
39 KiB
Python
|
#!/usr/bin/env python3
|
|||
|
"""Tests for TextProcessor"""
|
|||
|
import sys
|
|||
|
import unittest
|
|||
|
|
|||
|
from gruut.text_processor import Sentence, TextProcessor, TextProcessorSettings, Word
|
|||
|
from gruut.utils import print_graph
|
|||
|
|
|||
|
# Keyword arguments shared by most words()/sentences() calls in this module;
# every option is passed as False.
WORDS_KWARGS = dict(explicit_lang=False, phonemes=False, pos=False)
|
|||
|
|
|||
|
|
|||
|
class TextProcessorTestCase(unittest.TestCase):
|
|||
|
"""Tests for TextProcessor"""
|
|||
|
|
|||
|
def test_whitespace(self):
    """Whitespace following each word is kept with default settings"""
    text_processor = TextProcessor()
    graph, root = text_processor("This is a test ")
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # Each word carries the single space that followed it in the input,
    # including the trailing space after the last word
    expected_words = [
        Word(idx=position, sent_idx=0, text=token, text_with_ws=token + " ")
        for position, token in enumerate(["This", "is", "a", "test"])
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_no_whitespace(self):
    """Whitespace is dropped when keep_whitespace=False"""
    text_processor = TextProcessor(keep_whitespace=False)
    graph, root = text_processor("This is a test ")
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # text_with_ws is identical to text for every word
    expected_words = [
        Word(idx=position, sent_idx=0, text=token, text_with_ws=token)
        for position, token in enumerate(["This", "is", "a", "test"])
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_punctuation(self):
    """Punctuation and breaks around words become separate words"""
    punctuation_config = {
        "begin_punctuations": {'"', "«"},
        "end_punctuations": {'"', "»"},
        "minor_breaks": {","},
        "major_breaks": {"."},
    }
    text_processor = TextProcessor(**punctuation_config)
    graph, root = text_processor('This «is», a "test".')
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # Quotes and guillemets are flagged as punctuation, "," as a minor
    # break and "." as a major break
    expected_words = [
        Word(idx=0, sent_idx=0, text="This", text_with_ws="This "),
        Word(idx=1, sent_idx=0, text="«", text_with_ws="«", is_punctuation=True),
        Word(idx=2, sent_idx=0, text="is", text_with_ws="is"),
        Word(idx=3, sent_idx=0, text="»", text_with_ws="»", is_punctuation=True),
        Word(idx=4, sent_idx=0, text=",", text_with_ws=", ", is_minor_break=True),
        Word(idx=5, sent_idx=0, text="a", text_with_ws="a "),
        Word(idx=6, sent_idx=0, text='"', text_with_ws='"', is_punctuation=True),
        Word(idx=7, sent_idx=0, text="test", text_with_ws="test"),
        Word(idx=8, sent_idx=0, text='"', text_with_ws='"', is_punctuation=True),
        Word(idx=9, sent_idx=0, text=".", text_with_ws=".", is_major_break=True),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_punctuation_with_inner_break(self):
    """A major break inside quotes keeps the closing quote in its sentence"""
    text_processor = TextProcessor(
        begin_punctuations={'"'}, end_punctuations={'"'}, major_breaks={"."}
    )
    graph, root = text_processor('Test "one." Test two.')
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # The closing quote after "one." still belongs to the first sentence
    first_sentence = [
        Word(idx=0, sent_idx=0, text="Test", text_with_ws="Test "),
        Word(idx=1, sent_idx=0, text='"', text_with_ws='"', is_punctuation=True),
        Word(idx=2, sent_idx=0, text="one", text_with_ws="one"),
        Word(idx=3, sent_idx=0, text=".", text_with_ws=".", is_major_break=True),
        Word(idx=4, sent_idx=0, text='"', text_with_ws='" ', is_punctuation=True),
    ]
    # Word indexes restart at zero in the second sentence
    second_sentence = [
        Word(idx=0, sent_idx=1, text="Test", text_with_ws="Test "),
        Word(idx=1, sent_idx=1, text="two", text_with_ws="two"),
        Word(idx=2, sent_idx=1, text=".", text_with_ws=".", is_major_break=True),
    ]
    self.assertEqual(actual_words, first_sentence + second_sentence)
|
|||
|
|
|||
|
def test_replacements(self):
    """Regex replacements are applied to the text during tokenization"""
    regex_replacements = [
        ("\\B'", '"'),  # replace single quotes
        ("'\\B", '"'),
        ('[\\<\\>\\(\\)\\[\\]"]+', ""),  # drop brackets/quotes
    ]
    text_processor = TextProcessor(
        minor_breaks={","}, major_breaks={"."}, replacements=regex_replacements
    )
    graph, root = text_processor("\"This,\" [is] <a> (test) 'sentence.'")
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # No quote or bracket characters survive; only words and breaks remain
    expected_words = [
        Word(idx=0, sent_idx=0, text="This", text_with_ws="This"),
        Word(idx=1, sent_idx=0, text=",", text_with_ws=", ", is_minor_break=True),
        Word(idx=2, sent_idx=0, text="is", text_with_ws="is "),
        Word(idx=3, sent_idx=0, text="a", text_with_ws="a "),
        Word(idx=4, sent_idx=0, text="test", text_with_ws="test "),
        Word(idx=5, sent_idx=0, text="sentence", text_with_ws="sentence"),
        Word(idx=6, sent_idx=0, text=".", text_with_ws=".", is_major_break=True),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_abbreviations(self):
    """Abbreviations are expanded and keep the case of their first letter"""
    # The captured first letter is re-inserted by the replacement template
    abbreviation_patterns = {
        r"^([dD])r\.": r"\1octor",
        r"^([mM])r\.": r"\1ister",
        r"^([sS])t\.": r"\1treet",
    }
    text_processor = TextProcessor(
        minor_breaks={","},
        major_breaks={".", "?"},
        abbreviations=abbreviation_patterns,
    )
    graph, root = text_processor("Mr.? I'm just a dr., on this St. at least.")
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # "Mr." / "dr." / "St." become "Mister" / "doctor" / "Street"
    expected_words = [
        Word(idx=0, sent_idx=0, text="Mister", text_with_ws="Mister"),
        Word(idx=1, sent_idx=0, text="?", text_with_ws="? ", is_major_break=True),
        Word(idx=0, sent_idx=1, text="I'm", text_with_ws="I'm "),
        Word(idx=1, sent_idx=1, text="just", text_with_ws="just "),
        Word(idx=2, sent_idx=1, text="a", text_with_ws="a "),
        Word(idx=3, sent_idx=1, text="doctor", text_with_ws="doctor"),
        Word(idx=4, sent_idx=1, text=",", text_with_ws=", ", is_minor_break=True),
        Word(idx=5, sent_idx=1, text="on", text_with_ws="on "),
        Word(idx=6, sent_idx=1, text="this", text_with_ws="this "),
        Word(idx=7, sent_idx=1, text="Street", text_with_ws="Street "),
        Word(idx=8, sent_idx=1, text="at", text_with_ws="at "),
        Word(idx=9, sent_idx=1, text="least", text_with_ws="least"),
        Word(idx=10, sent_idx=1, text=".", text_with_ws=".", is_major_break=True),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_multiple_sentences(self):
    """Major breaks split the text into separate sentences"""
    text_processor = TextProcessor(major_breaks={".", "!"})
    graph, root = text_processor("First sentence. Second sentence! ")

    # Expected words per sentence; word indexes restart in each sentence
    first_words = [
        Word(idx=0, sent_idx=0, text="First", text_with_ws="First "),
        Word(idx=1, sent_idx=0, text="sentence", text_with_ws="sentence"),
        Word(idx=2, sent_idx=0, text=".", text_with_ws=". ", is_major_break=True),
    ]
    second_words = [
        Word(idx=0, sent_idx=1, text="Second", text_with_ws="Second "),
        Word(idx=1, sent_idx=1, text="sentence", text_with_ws="sentence"),
        Word(idx=2, sent_idx=1, text="!", text_with_ws="! ", is_major_break=True),
    ]

    # Flat word view: both sentences back to back
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))
    self.assertEqual(actual_words, first_words + second_words)

    # Sentence view: same words grouped; text_spoken leaves out the break
    actual_sentences = list(text_processor.sentences(graph, root, **WORDS_KWARGS))
    expected_sentences = [
        Sentence(
            idx=0,
            text="First sentence.",
            text_with_ws="First sentence. ",
            text_spoken="First sentence",
            words=first_words,
        ),
        Sentence(
            idx=1,
            text="Second sentence!",
            text_with_ws="Second sentence! ",
            text_spoken="Second sentence",
            words=second_words,
        ),
    ]
    self.assertEqual(actual_sentences, expected_sentences)
|
|||
|
|
|||
|
def test_multiple_paragraphs(self):
    """Words inside separate SSML <p> elements get separate paragraph indexes"""
    text_processor = TextProcessor()
    ssml_text = "<speak><p>First paragraph</p><p>Second paragraph</p></speak>"
    graph, root = text_processor(ssml_text, ssml=True)
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # par_idx differs between the paragraphs; idx and sent_idx restart at zero
    expected_words = [
        Word(idx=0, sent_idx=0, par_idx=0, text="First", text_with_ws="First "),
        Word(idx=1, sent_idx=0, par_idx=0, text="paragraph", text_with_ws="paragraph"),
        Word(idx=0, sent_idx=0, par_idx=1, text="Second", text_with_ws="Second "),
        Word(idx=1, sent_idx=0, par_idx=1, text="paragraph", text_with_ws="paragraph"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_explicit_sentence(self):
    """An explicit SSML <s> element prevents splitting at major breaks"""
    text_processor = TextProcessor(major_breaks={".", "!"})
    graph, root = text_processor(
        "<s>First sentence. Second sentence!</s>", ssml=True
    )
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # All words share sent_idx=0 and idx keeps counting past the "."
    expected_words = [
        Word(idx=0, sent_idx=0, text="First", text_with_ws="First "),
        Word(idx=1, sent_idx=0, text="sentence", text_with_ws="sentence"),
        Word(idx=2, sent_idx=0, text=".", text_with_ws=". ", is_major_break=True),
        Word(idx=3, sent_idx=0, text="Second", text_with_ws="Second "),
        Word(idx=4, sent_idx=0, text="sentence", text_with_ws="sentence"),
        Word(idx=5, sent_idx=0, text="!", text_with_ws="!", is_major_break=True),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_minor_breaks(self):
    """A minor (phrase) break is split off from the word before it"""
    text_processor = TextProcessor(minor_breaks={","})
    graph, root = text_processor("this, is a test")
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # The comma is its own word and takes over the following space
    expected_words = [
        Word(idx=0, sent_idx=0, text="this", text_with_ws="this"),
        Word(idx=1, sent_idx=0, text=",", text_with_ws=", ", is_minor_break=True),
        Word(idx=2, sent_idx=0, text="is", text_with_ws="is "),
        Word(idx=3, sent_idx=0, text="a", text_with_ws="a "),
        Word(idx=4, sent_idx=0, text="test", text_with_ws="test"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_word_breaks(self):
    """A word-break character inside a word splits it in two"""
    text_processor = TextProcessor(word_breaks={"-"})
    graph, root = text_processor("ninety-nine")
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # The hyphen disappears; the first part is followed by a space instead
    expected_words = [
        Word(idx=0, sent_idx=0, text="ninety", text_with_ws="ninety "),
        Word(idx=1, sent_idx=0, text="nine", text_with_ws="nine"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_spell_out(self):
    """Test interpret-as="spell-out" in SSML"""
    processor = TextProcessor(default_lang="en_US")
    graph, root = processor(
        '<say-as interpret-as="spell-out">test123</say-as>', ssml=True
    )
    words = list(processor.words(graph, root, **WORDS_KWARGS))

    # Letters are emitted one by one and each digit is verbalized on its own
    self.assertEqual(
        words,
        [
            Word(idx=0, sent_idx=0, text="t", text_with_ws="t "),
            Word(idx=1, sent_idx=0, text="e", text_with_ws="e "),
            Word(idx=2, sent_idx=0, text="s", text_with_ws="s "),
            Word(idx=3, sent_idx=0, text="t", text_with_ws="t "),
            Word(idx=4, sent_idx=0, text="one", text_with_ws="one "),
            Word(idx=5, sent_idx=0, text="two", text_with_ws="two "),
            Word(idx=6, sent_idx=0, text="three", text_with_ws="three"),
        ],
    )
|
|||
|
|
|||
|
def test_initialisms(self):
    """An initialism is spelled out letter by letter"""

    def is_all_caps_word(s):
        # Treat purely alphabetic, fully upper-case tokens as initialisms
        return s.isalpha() and s.isupper()

    text_processor = TextProcessor(
        major_breaks={"."},
        is_initialism=is_all_caps_word,
        split_initialism=list,
    )
    graph, root = text_processor("TTS.")
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # One word per letter, followed by the sentence-final break
    expected_words = [
        Word(idx=0, sent_idx=0, text="T", text_with_ws="T "),
        Word(idx=1, sent_idx=0, text="T", text_with_ws="T "),
        Word(idx=2, sent_idx=0, text="S", text_with_ws="S"),
        Word(idx=3, sent_idx=0, text=".", text_with_ws=".", is_major_break=True),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_numbers_one_language(self):
    """Digits are verbalized as words in the default language"""
    text_processor = TextProcessor(default_lang="en_US")
    graph, root = text_processor("1 2 3")
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # "1 2 3" -> "one two three"
    expected_words = [
        Word(idx=0, sent_idx=0, text="one", text_with_ws="one "),
        Word(idx=1, sent_idx=0, text="two", text_with_ws="two "),
        Word(idx=2, sent_idx=0, text="three", text_with_ws="three"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_numbers_multiple_languages(self):
    """Numbers are verbalized in the language given on each SSML <w>"""
    text_processor = TextProcessor(default_lang="en_US")
    ssml_text = '1 <w lang="es_ES">2</w> <w lang="de_DE">3</w>'
    graph, root = text_processor(ssml_text, ssml=True)
    actual_words = list(text_processor.words(graph, root, phonemes=False))

    # English default, then Spanish and German for the tagged words
    expected_words = [
        Word(lang="en_US", idx=0, sent_idx=0, text="one", text_with_ws="one "),
        Word(lang="es_ES", idx=1, sent_idx=0, text="dos", text_with_ws="dos "),
        Word(lang="de_DE", idx=2, sent_idx=0, text="drei", text_with_ws="drei"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_currency_one_language(self):
    """A currency amount is verbalized in the default language"""
    text_processor = TextProcessor(default_lang="en_US")
    graph, root = text_processor("$10")
    actual_words = list(
        text_processor.words(graph, root, phonemes=False, pos=False)
    )

    # "$10" -> "ten dollars"
    expected_words = [
        Word(lang="en_US", idx=0, sent_idx=0, text="ten", text_with_ws="ten "),
        Word(lang="en_US", idx=1, sent_idx=0, text="dollars", text_with_ws="dollars"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_currency_multiple_language(self):
    """Currency is verbalized per language when SSML <w> sets lang"""
    text_processor = TextProcessor(default_lang="en_US")
    ssml_text = '€10 <w lang="fr_FR">€10</w> <w lang="nl_NL">€10</w>'
    graph, root = text_processor(ssml_text, ssml=True, phonemize=False)
    actual_words = list(
        text_processor.words(graph, root, phonemes=False, pos=False)
    )

    # The same "€10" reads differently in English, French and Dutch
    expected_words = [
        Word(lang="en_US", idx=0, sent_idx=0, text="ten", text_with_ws="ten "),
        Word(lang="en_US", idx=1, sent_idx=0, text="euro", text_with_ws="euro "),
        Word(lang="fr_FR", idx=2, sent_idx=0, text="dix", text_with_ws="dix "),
        Word(lang="fr_FR", idx=3, sent_idx=0, text="euros", text_with_ws="euros "),
        Word(lang="nl_NL", idx=4, sent_idx=0, text="tien", text_with_ws="tien "),
        Word(lang="nl_NL", idx=5, sent_idx=0, text="euro", text_with_ws="euro"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_currency_default(self):
    """The default currency is used when say-as currency has no symbol"""
    text_processor = TextProcessor(default_lang="en_US", default_currency="USD")
    graph, root = text_processor(
        '<say-as interpret-as="currency">10</say-as>', ssml=True
    )
    actual_words = list(
        text_processor.words(graph, root, phonemes=False, pos=False)
    )

    # A bare "10" is still read as dollars, although no "$" is present
    expected_words = [
        Word(lang="en_US", idx=0, sent_idx=0, text="ten", text_with_ws="ten "),
        Word(lang="en_US", idx=1, sent_idx=0, text="dollars", text_with_ws="dollars"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_time(self):
    """A clock time with am/pm suffix is verbalized (English)"""
    text_processor = TextProcessor(default_lang="en_US")
    graph, root = text_processor(" 4:01pm")
    actual_words = list(
        text_processor.words(graph, root, phonemes=False, pos=False)
    )

    # The leading space of the input stays on the first word's text_with_ws
    expected_words = [
        Word(lang="en_US", idx=0, sent_idx=0, text="four", text_with_ws=" four "),
        Word(lang="en_US", idx=1, sent_idx=0, text="oh", text_with_ws="oh "),
        Word(lang="en_US", idx=2, sent_idx=0, text="one", text_with_ws="one "),
        Word(lang="en_US", idx=3, sent_idx=0, text="P", text_with_ws="P "),
        Word(lang="en_US", idx=4, sent_idx=0, text="M", text_with_ws="M"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_time_no_colon(self):
    """A time written without a colon is verbalized (English)"""
    text_processor = TextProcessor(default_lang="en_US")
    graph, root = text_processor("10am")
    actual_words = list(
        text_processor.words(graph, root, phonemes=False, pos=False)
    )

    # "10am" -> "ten A M"
    expected_words = [
        Word(lang="en_US", idx=0, sent_idx=0, text="ten", text_with_ws="ten "),
        Word(lang="en_US", idx=1, sent_idx=0, text="A", text_with_ws="A "),
        Word(lang="en_US", idx=2, sent_idx=0, text="M", text_with_ws="M"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_date_one_language(self):
    """A numeric date is verbalized in the default language"""
    text_processor = TextProcessor(default_lang="en_US", word_breaks={"-"})
    graph, root = text_processor("4/1/1999")
    actual_words = list(
        text_processor.words(graph, root, phonemes=False, pos=False)
    )

    # "4/1/1999" -> "April first, nineteen ninety nine"
    expected_words = [
        Word(lang="en_US", idx=0, sent_idx=0, text="April", text_with_ws="April "),
        Word(lang="en_US", idx=1, sent_idx=0, text="first", text_with_ws="first"),
        Word(
            lang="en_US",
            idx=2,
            sent_idx=0,
            text=",",
            text_with_ws=", ",
            is_minor_break=True,
        ),
        Word(lang="en_US", idx=3, sent_idx=0, text="nineteen", text_with_ws="nineteen "),
        Word(lang="en_US", idx=4, sent_idx=0, text="ninety", text_with_ws="ninety "),
        Word(lang="en_US", idx=5, sent_idx=0, text="nine", text_with_ws="nine"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_date_multiple_languages(self):
    """Dates are verbalized per language when SSML <s> sets lang"""
    text_processor = TextProcessor(default_lang="en_US", word_breaks={"-"})
    ssml_text = '<speak><s>4/1/1999</s> <s lang="fr_FR">4/1/1999</s><s lang="de_DE">01.04.1999</s></speak>'
    graph, root = text_processor(
        ssml_text,
        ssml=True,
        phonemize=False,  # ensure French year is split
    )
    actual_words = list(
        text_processor.words(graph, root, phonemes=False, pos=False)
    )

    # Sentence 0 (English): month, ordinal day, comma, year in three words
    english_words = [
        Word(lang="en_US", idx=0, sent_idx=0, text="April", text_with_ws="April "),
        Word(lang="en_US", idx=1, sent_idx=0, text="first", text_with_ws="first"),
        Word(
            lang="en_US",
            idx=2,
            sent_idx=0,
            text=",",
            text_with_ws=", ",
            is_minor_break=True,
        ),
        Word(lang="en_US", idx=3, sent_idx=0, text="nineteen", text_with_ws="nineteen "),
        Word(lang="en_US", idx=4, sent_idx=0, text="ninety", text_with_ws="ninety "),
        Word(lang="en_US", idx=5, sent_idx=0, text="nine", text_with_ws="nine"),
    ]
    # Sentence 1 (French): day first, then month, year split into words
    french_words = [
        Word(lang="fr_FR", idx=0, sent_idx=1, text="quatrième", text_with_ws="quatrième "),
        Word(lang="fr_FR", idx=1, sent_idx=1, text="janvier", text_with_ws="janvier "),
        Word(lang="fr_FR", idx=2, sent_idx=1, text="mille", text_with_ws="mille "),
        Word(lang="fr_FR", idx=3, sent_idx=1, text="neuf", text_with_ws="neuf "),
        Word(lang="fr_FR", idx=4, sent_idx=1, text="cent", text_with_ws="cent "),
        Word(lang="fr_FR", idx=5, sent_idx=1, text="quatre", text_with_ws="quatre "),
        Word(lang="fr_FR", idx=6, sent_idx=1, text="vingt", text_with_ws="vingt "),
        Word(lang="fr_FR", idx=7, sent_idx=1, text="dix", text_with_ws="dix "),
        Word(lang="fr_FR", idx=8, sent_idx=1, text="neuf", text_with_ws="neuf"),
    ]
    # Sentence 2 (German): the year is a single compound word
    german_words = [
        Word(lang="de_DE", idx=0, sent_idx=2, text="erste", text_with_ws="erste "),
        Word(lang="de_DE", idx=1, sent_idx=2, text="April", text_with_ws="April "),
        Word(
            lang="de_DE",
            idx=2,
            sent_idx=2,
            text="neunzehnhundertneunundneunzig",
            text_with_ws="neunzehnhundertneunundneunzig",
        ),
    ]
    self.assertEqual(actual_words, english_words + french_words + german_words)
|
|||
|
|
|||
|
def test_date_format_ordinal(self):
    """Test SSML say-as date with the "md" (month, day) format"""
    text_processor = TextProcessor(default_lang="en_US")
    graph, root = text_processor(
        '<say-as interpret-as="date" format="md">4/1</say-as>', ssml=True
    )
    actual_words = list(
        text_processor.words(graph, root, phonemes=False, pos=False)
    )

    # "4/1" is forced to be read as a date: month name followed by the day.
    # NOTE(review): the test name says "ordinal", yet the expected day word
    # is the cardinal "one" rather than "first" - confirm which is intended.
    expected_words = [
        Word(lang="en_US", idx=0, sent_idx=0, text="April", text_with_ws="April "),
        Word(lang="en_US", idx=1, sent_idx=0, text="one", text_with_ws="one"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_date_format_cardinal(self):
    """Test SSML say-as date with the "dmy" format (cardinal day)"""
    text_processor = TextProcessor(default_lang="en_US")
    graph, root = text_processor(
        '<say-as interpret-as="date" format="dmy">4/1/2000</say-as>', ssml=True
    )
    actual_words = list(
        text_processor.words(graph, root, phonemes=False, pos=False)
    )

    # Output order is day, month, year; the day is the cardinal "one"
    expected_words = [
        Word(lang="en_US", idx=0, sent_idx=0, text="one", text_with_ws="one "),
        Word(lang="en_US", idx=1, sent_idx=0, text="April", text_with_ws="April "),
        Word(lang="en_US", idx=2, sent_idx=0, text="two", text_with_ws="two "),
        Word(lang="en_US", idx=3, sent_idx=0, text="thousand", text_with_ws="thousand"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_part_of_speech_tagging(self):
    """Tags returned by the configured tagger end up in Word.pos"""

    def upper_case_tagger(words, *args, **kwargs):
        # Made-up tagger that just gives the UPPER of each word back
        return [w.upper() for w in words]

    text_processor = TextProcessor(get_parts_of_speech=upper_case_tagger)
    graph, root = text_processor("a test")
    actual_words = list(
        text_processor.words(graph, root, explicit_lang=False, phonemes=False)
    )

    # Each word's pos is its own text in upper case
    expected_words = [
        Word(idx=0, sent_idx=0, text="a", text_with_ws="a ", pos="A"),
        Word(idx=1, sent_idx=0, text="test", text_with_ws="test", pos="TEST"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_phonemize_one_language(self):
    """Phonemes from the configured lookup end up in Word.phonemes"""

    def letters_as_phonemes(word: str, *args, **kwargs):
        # Made-up phonemizer that just gives back the letters
        return list(word)

    text_processor = TextProcessor(lookup_phonemes=letters_as_phonemes)
    graph, root = text_processor("test")
    actual_words = list(
        text_processor.words(graph, root, pos=False, explicit_lang=False)
    )

    # The single word is "phonemized" into its letters
    expected_words = [
        Word(
            idx=0,
            sent_idx=0,
            text="test",
            text_with_ws="test",
            phonemes=["t", "e", "s", "t"],
        ),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_phonemize_one_language_multiple_roles(self):
    """The role on an SSML <w> is passed through to the phoneme lookup"""

    def role_sensitive_lookup(word, role=None, **kwargs):
        # Made-up phonemizer: upper-case letters if a role is provided,
        # plain letters otherwise
        if role:
            return list(word.upper())
        return list(word)

    text_processor = TextProcessor(lookup_phonemes=role_sensitive_lookup)

    # Second "test" carries a made-up role
    ssml_text = '<speak>test <w role="some_role">test</w></speak>'
    graph, root = text_processor(ssml_text, ssml=True, pos=False)
    actual_words = list(
        text_processor.words(graph, root, pos=False, explicit_lang=False)
    )

    # Same spelling, two different phonemizations depending on role
    expected_words = [
        Word(
            idx=0,
            sent_idx=0,
            text="test",
            text_with_ws="test ",
            phonemes=["t", "e", "s", "t"],
        ),
        Word(
            idx=1,
            sent_idx=0,
            text="test",
            text_with_ws="test",
            phonemes=["T", "E", "S", "T"],
        ),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_phonemize_multiple_languages(self):
    """Each language uses its own phoneme lookup (SSML <w lang=...>)"""

    def lower_letters(word: str, *args, **kwargs):
        # Stand-in English lookup: the letters as written
        return list(word)

    def upper_letters(word: str, *args, **kwargs):
        # Stand-in German lookup: the letters in upper case
        return list(word.upper())

    german_settings = TextProcessorSettings(
        lang="de_DE", lookup_phonemes=upper_letters
    )
    text_processor = TextProcessor(
        default_lang="en_US",
        lookup_phonemes=lower_letters,
        settings={"de_DE": german_settings},
    )
    graph, root = text_processor(
        '<speak>test <w lang="de_DE">test</w></speak>', ssml=True
    )
    actual_words = list(text_processor.words(graph, root))

    # First word goes through the English lookup, second through the German
    expected_words = [
        Word(
            lang="en_US",
            idx=0,
            sent_idx=0,
            text="test",
            text_with_ws="test ",
            phonemes=["t", "e", "s", "t"],
        ),
        Word(
            lang="de_DE",
            idx=1,
            sent_idx=0,
            text="test",
            text_with_ws="test",
            phonemes=["T", "E", "S", "T"],
        ),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_sub(self):
    """SSML <sub alias=...> replaces its content with the alias text"""
    text_processor = TextProcessor(default_lang="en_US")
    ssml_text = '<speak><sub alias="World Wide Web Consortium">W3C</sub></speak>'
    graph, root = text_processor(ssml_text, ssml=True)
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # "W3C" does not appear; the four alias words take its place
    expected_words = [
        Word(idx=0, sent_idx=0, text="World", text_with_ws="World "),
        Word(idx=1, sent_idx=0, text="Wide", text_with_ws="Wide "),
        Word(idx=2, sent_idx=0, text="Web", text_with_ws="Web "),
        Word(idx=3, sent_idx=0, text="Consortium", text_with_ws="Consortium"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_break(self):
    """SSML <break> times are attached to the neighbouring sentence or word"""
    text_processor = TextProcessor(default_lang="en_US", keep_whitespace=False)
    ssml_text = """
<speak>
<break time="1s"/>
<p>
<break time="2s" />
<s>
<break time="3s" />
Break <break time="4s" /> here
</s>
<break time="5s" />
</p>
<break time="6s" />
</speak>
"""
    graph, root = text_processor(ssml_text, ssml=True)
    actual_sentences = list(text_processor.sentences(graph, root, **WORDS_KWARGS))

    # Breaks outside <s> (1s+2s before, 5s+6s after) add up on the sentence;
    # breaks inside <s> (3s before, 4s after) land on the word "Break"
    expected_sentences = [
        Sentence(
            idx=0,
            text="Break here",
            text_with_ws="Break here",
            text_spoken="Break here",
            pause_before_ms=((1 + 2) * 1000),
            pause_after_ms=((5 + 6) * 1000),
            words=[
                Word(
                    idx=0,
                    sent_idx=0,
                    text="Break",
                    text_with_ws="Break",
                    pause_before_ms=(3 * 1000),
                    pause_after_ms=(4 * 1000),
                ),
                Word(idx=1, sent_idx=0, text="here", text_with_ws="here"),
            ],
        ),
    ]
    self.assertEqual(actual_sentences, expected_sentences)
|
|||
|
|
|||
|
def test_mark(self):
    """SSML <mark> names are attached to the neighbouring sentence or word"""
    text_processor = TextProcessor(default_lang="en_US", keep_whitespace=False)
    ssml_text = """
<speak>
<mark name="a"/>
<p>
<mark name="b" />
<s>
<mark name="c" />
Mark <mark name="d" /> here
</s>
<mark name="e" />
</p>
<mark name="f" />
</speak>
"""
    graph, root = text_processor(ssml_text, ssml=True)
    actual_sentences = list(text_processor.sentences(graph, root, **WORDS_KWARGS))

    # Marks outside <s> ("a", "b" before; "e", "f" after) go on the sentence;
    # marks inside <s> ("c" before, "d" after) go on the word "Mark"
    expected_sentences = [
        Sentence(
            idx=0,
            text="Mark here",
            text_with_ws="Mark here",
            text_spoken="Mark here",
            marks_before=["a", "b"],
            marks_after=["e", "f"],
            words=[
                Word(
                    idx=0,
                    sent_idx=0,
                    text="Mark",
                    text_with_ws="Mark",
                    marks_before=["c"],
                    marks_after=["d"],
                ),
                Word(idx=1, sent_idx=0, text="here", text_with_ws="here"),
            ],
        ),
    ]
    self.assertEqual(actual_sentences, expected_sentences)
|
|||
|
|
|||
|
def test_missing_speak(self):
    """SSML without an enclosing <speak> element is still processed"""
    text_processor = TextProcessor()
    graph, root = text_processor("<s>hello</s><s>world</s>", ssml=True)
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # <speak> is automatically added when XML fails to parse, so the two
    # <s> elements come out as two one-word sentences
    expected_words = [
        Word(idx=0, sent_idx=0, text="hello", text_with_ws="hello"),
        Word(idx=0, sent_idx=1, text="world", text_with_ws="world"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_adjacent_voice(self):
    """Adjacent SSML <voice> elements yield separately voiced sentences"""
    text_processor = TextProcessor()
    ssml_text = '<voice name="a">hello.</voice><voice name="b">world.</voice>'
    graph, root = text_processor(ssml_text, ssml=True)

    # major_breaks=False: the "." after each word is not part of the result
    actual_words = list(
        text_processor.words(graph, root, major_breaks=False, **WORDS_KWARGS)
    )

    expected_words = [
        Word(idx=0, sent_idx=0, voice="a", text="hello", text_with_ws="hello"),
        Word(idx=0, sent_idx=1, voice="b", text="world", text_with_ws="world"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_multiple_passes(self):
    """A token that needs several processing passes is fully resolved"""
    text_processor = TextProcessor()
    graph, root = text_processor("ABCD-10")
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # Pass 1: ABCD-10 -> ABCD 10
    # Pass 2: ABCD 10 -> A B C D ten
    expected_words = [
        Word(idx=0, text="A", text_with_ws="A "),
        Word(idx=1, text="B", text_with_ws="B "),
        Word(idx=2, text="C", text_with_ws="C "),
        Word(idx=3, text="D", text_with_ws="D "),
        Word(idx=4, text="ten", text_with_ws="ten"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_number_nonfinite(self):
    """The words "nan" and "inf" are not treated as numbers"""
    text_processor = TextProcessor()
    graph, root = text_processor("nan inf")
    actual_words = list(text_processor.words(graph, root, **WORDS_KWARGS))

    # Both tokens come back unchanged as ordinary words
    expected_words = [
        Word(idx=0, text="nan", text_with_ws="nan "),
        Word(idx=1, text="inf", text_with_ws="inf"),
    ]
    self.assertEqual(actual_words, expected_words)
|
|||
|
|
|||
|
def test_override_initialism(self):
    """An inline SSML lexicon entry stops a word being split as an initialism"""
    text_processor = TextProcessor()

    # Without a lexicon entry, the upper-case word is spelled out by letter
    graph, root = text_processor("ROOFUS")
    spelled_out = list(text_processor.words(graph, root, **WORDS_KWARGS))
    expected_letters = [
        Word(idx=0, text="R", text_with_ws="R "),
        Word(idx=1, text="O", text_with_ws="O "),
        Word(idx=2, text="O", text_with_ws="O "),
        Word(idx=3, text="F", text_with_ws="F "),
        Word(idx=4, text="U", text_with_ws="U "),
        Word(idx=5, text="S", text_with_ws="S"),
    ]
    self.assertEqual(spelled_out, expected_letters)

    # With an inline pronunciation for the same grapheme, it stays one word
    ssml_text = """
<speak>
<lexicon>
<lexeme>
<grapheme>ROOFUS</grapheme>
<phoneme>ɹ ˈu f ə s</phoneme>
</lexeme>
</lexicon>
<s>ROOFUS</s>
</speak>"""
    graph, root = text_processor(ssml_text, ssml=True)
    kept_whole = list(text_processor.words(graph, root, **WORDS_KWARGS))
    expected_whole = [Word(idx=0, text="ROOFUS", text_with_ws="ROOFUS")]
    self.assertEqual(kept_whole, expected_whole)
|
|||
|
|
|||
|
|
|||
|
def print_graph_stderr(graph, root):
    """Print graph to stderr instead of the default output"""

    def write_to_stderr(*parts):
        # Forward whatever print_graph wants printed to standard error
        print(*parts, file=sys.stderr)

    print_graph(graph, root, print_func=write_to_stderr)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
if __name__ == "__main__":
    # Run the tests in this module when the file is executed as a script
    unittest.main()
|