ai-content-maker/.venv/Lib/site-packages/tests/test_text_processor.py

1099 lines
39 KiB
Python
Raw Normal View History

2024-05-03 04:18:51 +03:00
#!/usr/bin/env python3
"""Tests for TextProcessor"""
import sys
import unittest
from gruut.text_processor import Sentence, TextProcessor, TextProcessorSettings, Word
from gruut.utils import print_graph
# Keyword arguments shared by most tests when calling processor.words() or
# processor.sentences(). All three flags are switched off, and the expected
# Word objects in those tests accordingly omit lang, phonemes, and pos.
WORDS_KWARGS = {"explicit_lang": False, "phonemes": False, "pos": False}
class TextProcessorTestCase(unittest.TestCase):
    """Tests for TextProcessor.

    Each test builds a processor, runs it over plain text or SSML, and
    compares the resulting Word (or Sentence) objects with an explicit
    expected list.
    """

    def test_whitespace(self):
        """Whitespace after each word is preserved by default."""
        processor = TextProcessor()
        graph, root = processor("This is a test ")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Whitespace is retained by default (including after the last word)
        expected = [
            Word(idx=0, sent_idx=0, text="This", text_with_ws="This "),
            Word(idx=1, sent_idx=0, text="is", text_with_ws="is "),
            Word(idx=2, sent_idx=0, text="a", text_with_ws="a "),
            Word(idx=3, sent_idx=0, text="test", text_with_ws="test "),
        ]
        self.assertEqual(words, expected)

    def test_no_whitespace(self):
        """Whitespace preservation can be disabled with keep_whitespace=False."""
        processor = TextProcessor(keep_whitespace=False)
        graph, root = processor("This is a test ")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Whitespace is discarded: text_with_ws equals text
        expected = [
            Word(idx=0, sent_idx=0, text="This", text_with_ws="This"),
            Word(idx=1, sent_idx=0, text="is", text_with_ws="is"),
            Word(idx=2, sent_idx=0, text="a", text_with_ws="a"),
            Word(idx=3, sent_idx=0, text="test", text_with_ws="test"),
        ]
        self.assertEqual(words, expected)

    def test_punctuation(self):
        """Punctuation and breaks are split off from the surrounding words."""
        processor = TextProcessor(
            begin_punctuations={'"', "«"},
            end_punctuations={'"', "»"},
            minor_breaks={","},
            major_breaks={"."},
        )
        graph, root = processor('This «is», a "test".')
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Punctuations are separated
        expected = [
            Word(idx=0, sent_idx=0, text="This", text_with_ws="This "),
            Word(idx=1, sent_idx=0, text="«", text_with_ws="«", is_punctuation=True),
            Word(idx=2, sent_idx=0, text="is", text_with_ws="is"),
            Word(idx=3, sent_idx=0, text="»", text_with_ws="»", is_punctuation=True),
            Word(idx=4, sent_idx=0, text=",", text_with_ws=", ", is_minor_break=True),
            Word(idx=5, sent_idx=0, text="a", text_with_ws="a "),
            Word(idx=6, sent_idx=0, text='"', text_with_ws='"', is_punctuation=True),
            Word(idx=7, sent_idx=0, text="test", text_with_ws="test"),
            Word(idx=8, sent_idx=0, text='"', text_with_ws='"', is_punctuation=True),
            Word(idx=9, sent_idx=0, text=".", text_with_ws=".", is_major_break=True),
        ]
        self.assertEqual(words, expected)

    def test_punctuation_with_inner_break(self):
        """A major break inside quotes keeps the closing quote in the same sentence."""
        processor = TextProcessor(
            begin_punctuations={'"'},
            end_punctuations={'"'},
            major_breaks={"."},
        )
        graph, root = processor('Test "one." Test two.')
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # First sentence includes final quote
        first_sentence = [
            Word(idx=0, sent_idx=0, text="Test", text_with_ws="Test "),
            Word(idx=1, sent_idx=0, text='"', text_with_ws='"', is_punctuation=True),
            Word(idx=2, sent_idx=0, text="one", text_with_ws="one"),
            Word(idx=3, sent_idx=0, text=".", text_with_ws=".", is_major_break=True),
            Word(idx=4, sent_idx=0, text='"', text_with_ws='" ', is_punctuation=True),
        ]
        second_sentence = [
            Word(idx=0, sent_idx=1, text="Test", text_with_ws="Test "),
            Word(idx=1, sent_idx=1, text="two", text_with_ws="two"),
            Word(idx=2, sent_idx=1, text=".", text_with_ws=".", is_major_break=True),
        ]
        self.assertEqual(words, first_sentence + second_sentence)

    def test_replacements(self):
        """Regex replacements are applied during tokenization."""
        processor = TextProcessor(
            minor_breaks={","},
            major_breaks={"."},
            replacements=[
                ("\\B'", '"'),  # replace single quotes
                ("'\\B", '"'),
                ('[\\<\\>\\(\\)\\[\\]"]+', ""),  # drop brackets/quotes
            ],
        )
        graph, root = processor("\"This,\" [is] <a> (test) 'sentence.'")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Quotes and brackets are discarded
        expected = [
            Word(idx=0, sent_idx=0, text="This", text_with_ws="This"),
            Word(idx=1, sent_idx=0, text=",", text_with_ws=", ", is_minor_break=True),
            Word(idx=2, sent_idx=0, text="is", text_with_ws="is "),
            Word(idx=3, sent_idx=0, text="a", text_with_ws="a "),
            Word(idx=4, sent_idx=0, text="test", text_with_ws="test "),
            Word(idx=5, sent_idx=0, text="sentence", text_with_ws="sentence"),
            Word(idx=6, sent_idx=0, text=".", text_with_ws=".", is_major_break=True),
        ]
        self.assertEqual(words, expected)

    def test_abbreviations(self):
        """Abbreviations are expanded while keeping the original capitalization."""
        processor = TextProcessor(
            minor_breaks={","},
            major_breaks={".", "?"},
            abbreviations={
                r"^([dD])r\.": r"\1octor",
                r"^([mM])r\.": r"\1ister",
                r"^([sS])t\.": r"\1treet",
            },
        )
        graph, root = processor("Mr.? I'm just a dr., on this St. at least.")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Abbreviations are expanded, maintaining capitalization
        expected = [
            Word(idx=0, sent_idx=0, text="Mister", text_with_ws="Mister"),
            Word(idx=1, sent_idx=0, text="?", text_with_ws="? ", is_major_break=True),
            Word(idx=0, sent_idx=1, text="I'm", text_with_ws="I'm "),
            Word(idx=1, sent_idx=1, text="just", text_with_ws="just "),
            Word(idx=2, sent_idx=1, text="a", text_with_ws="a "),
            Word(idx=3, sent_idx=1, text="doctor", text_with_ws="doctor"),
            Word(idx=4, sent_idx=1, text=",", text_with_ws=", ", is_minor_break=True),
            Word(idx=5, sent_idx=1, text="on", text_with_ws="on "),
            Word(idx=6, sent_idx=1, text="this", text_with_ws="this "),
            Word(idx=7, sent_idx=1, text="Street", text_with_ws="Street "),
            Word(idx=8, sent_idx=1, text="at", text_with_ws="at "),
            Word(idx=9, sent_idx=1, text="least", text_with_ws="least"),
            Word(idx=10, sent_idx=1, text=".", text_with_ws=".", is_major_break=True),
        ]
        self.assertEqual(words, expected)

    def test_multiple_sentences(self):
        """A major break starts a new sentence; checks both words() and sentences()."""
        processor = TextProcessor(major_breaks={".", "!"})
        graph, root = processor("First sentence. Second sentence! ")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        first_words = [
            Word(idx=0, sent_idx=0, text="First", text_with_ws="First "),
            Word(idx=1, sent_idx=0, text="sentence", text_with_ws="sentence"),
            Word(idx=2, sent_idx=0, text=".", text_with_ws=". ", is_major_break=True),
        ]
        second_words = [
            Word(idx=0, sent_idx=1, text="Second", text_with_ws="Second "),
            Word(idx=1, sent_idx=1, text="sentence", text_with_ws="sentence"),
            Word(idx=2, sent_idx=1, text="!", text_with_ws="! ", is_major_break=True),
        ]

        # Separated by a major break
        self.assertEqual(words, first_words + second_words)

        # Check sentences too
        sentences = list(processor.sentences(graph, root, **WORDS_KWARGS))
        expected_sentences = [
            Sentence(
                idx=0,
                text="First sentence.",
                text_with_ws="First sentence. ",
                text_spoken="First sentence",
                words=first_words,
            ),
            Sentence(
                idx=1,
                text="Second sentence!",
                text_with_ws="Second sentence! ",
                text_spoken="Second sentence",
                words=second_words,
            ),
        ]
        self.assertEqual(sentences, expected_sentences)

    def test_multiple_paragraphs(self):
        """Words inside separate SSML <p> elements get distinct paragraph indexes."""
        processor = TextProcessor()
        graph, root = processor(
            "<speak><p>First paragraph</p><p>Second paragraph</p></speak>", ssml=True
        )
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Sentences/words should be in different paragraphs
        expected = [
            Word(idx=0, sent_idx=0, par_idx=0, text="First", text_with_ws="First "),
            Word(idx=1, sent_idx=0, par_idx=0, text="paragraph", text_with_ws="paragraph"),
            Word(idx=0, sent_idx=0, par_idx=1, text="Second", text_with_ws="Second "),
            Word(idx=1, sent_idx=0, par_idx=1, text="paragraph", text_with_ws="paragraph"),
        ]
        self.assertEqual(words, expected)

    def test_explicit_sentence(self):
        """An explicit SSML <s> element suppresses sentence splitting inside it."""
        processor = TextProcessor(major_breaks={".", "!"})
        graph, root = processor("<s>First sentence. Second sentence!</s>", ssml=True)
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Sentences should not be split apart: every word has sent_idx=0
        expected = [
            Word(idx=0, sent_idx=0, text="First", text_with_ws="First "),
            Word(idx=1, sent_idx=0, text="sentence", text_with_ws="sentence"),
            Word(idx=2, sent_idx=0, text=".", text_with_ws=". ", is_major_break=True),
            Word(idx=3, sent_idx=0, text="Second", text_with_ws="Second "),
            Word(idx=4, sent_idx=0, text="sentence", text_with_ws="sentence"),
            Word(idx=5, sent_idx=0, text="!", text_with_ws="!", is_major_break=True),
        ]
        self.assertEqual(words, expected)

    def test_minor_breaks(self):
        """A minor (phrase) break is split from the preceding word."""
        processor = TextProcessor(minor_breaks={","})
        graph, root = processor("this, is a test")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Comma should be split from word
        expected = [
            Word(idx=0, sent_idx=0, text="this", text_with_ws="this"),
            Word(idx=1, sent_idx=0, text=",", text_with_ws=", ", is_minor_break=True),
            Word(idx=2, sent_idx=0, text="is", text_with_ws="is "),
            Word(idx=3, sent_idx=0, text="a", text_with_ws="a "),
            Word(idx=4, sent_idx=0, text="test", text_with_ws="test"),
        ]
        self.assertEqual(words, expected)

    def test_word_breaks(self):
        """An inner-word break character splits one token into two words."""
        processor = TextProcessor(word_breaks={"-"})
        graph, root = processor("ninety-nine")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Word should be split
        expected = [
            Word(idx=0, sent_idx=0, text="ninety", text_with_ws="ninety "),
            Word(idx=1, sent_idx=0, text="nine", text_with_ws="nine"),
        ]
        self.assertEqual(words, expected)

    def test_spell_out(self):
        """SSML interpret-as="spell-out" splits a word into letters and digits."""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor(
            '<say-as interpret-as="spell-out">test123</say-as>', ssml=True
        )
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Word should be split into letters; digits are verbalized one by one
        expected = [
            Word(idx=0, sent_idx=0, text="t", text_with_ws="t "),
            Word(idx=1, sent_idx=0, text="e", text_with_ws="e "),
            Word(idx=2, sent_idx=0, text="s", text_with_ws="s "),
            Word(idx=3, sent_idx=0, text="t", text_with_ws="t "),
            Word(idx=4, sent_idx=0, text="one", text_with_ws="one "),
            Word(idx=5, sent_idx=0, text="two", text_with_ws="two "),
            Word(idx=6, sent_idx=0, text="three", text_with_ws="three"),
        ]
        self.assertEqual(words, expected)

    def test_initialisms(self):
        """A token recognised as an initialism is split by split_initialism."""
        processor = TextProcessor(
            major_breaks={"."},
            is_initialism=lambda s: s.isalpha() and s.isupper(),
            split_initialism=list,
        )
        graph, root = processor("TTS.")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Letters should be split
        expected = [
            Word(idx=0, sent_idx=0, text="T", text_with_ws="T "),
            Word(idx=1, sent_idx=0, text="T", text_with_ws="T "),
            Word(idx=2, sent_idx=0, text="S", text_with_ws="S"),
            Word(idx=3, sent_idx=0, text=".", text_with_ws=".", is_major_break=True),
        ]
        self.assertEqual(words, expected)

    def test_numbers_one_language(self):
        """Numbers are verbalized in the default language."""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor("1 2 3")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Numbers should be verbalized
        expected = [
            Word(idx=0, sent_idx=0, text="one", text_with_ws="one "),
            Word(idx=1, sent_idx=0, text="two", text_with_ws="two "),
            Word(idx=2, sent_idx=0, text="three", text_with_ws="three"),
        ]
        self.assertEqual(words, expected)

    def test_numbers_multiple_languages(self):
        """Numbers are verbalized per word language given via SSML <w lang>."""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor(
            '1 <w lang="es_ES">2</w> <w lang="de_DE">3</w>', ssml=True
        )
        words = list(processor.words(graph, root, phonemes=False))

        # Numbers should be verbalized
        expected = [
            Word(lang="en_US", idx=0, sent_idx=0, text="one", text_with_ws="one "),
            Word(lang="es_ES", idx=1, sent_idx=0, text="dos", text_with_ws="dos "),
            Word(lang="de_DE", idx=2, sent_idx=0, text="drei", text_with_ws="drei"),
        ]
        self.assertEqual(words, expected)

    def test_currency_one_language(self):
        """A currency amount is verbalized in the default language."""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor("$10")
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Currency should be verbalized
        expected = [
            Word(lang="en_US", idx=0, sent_idx=0, text="ten", text_with_ws="ten "),
            Word(lang="en_US", idx=1, sent_idx=0, text="dollars", text_with_ws="dollars"),
        ]
        self.assertEqual(words, expected)

    def test_currency_multiple_language(self):
        """Currency amounts are verbalized per language given via SSML <w lang>."""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor(
            '€10 <w lang="fr_FR">€10</w> <w lang="nl_NL">€10</w>',
            ssml=True,
            phonemize=False,
        )
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Currencies should be verbalized
        expected = [
            Word(lang="en_US", idx=0, sent_idx=0, text="ten", text_with_ws="ten "),
            Word(lang="en_US", idx=1, sent_idx=0, text="euro", text_with_ws="euro "),
            Word(lang="fr_FR", idx=2, sent_idx=0, text="dix", text_with_ws="dix "),
            Word(lang="fr_FR", idx=3, sent_idx=0, text="euros", text_with_ws="euros "),
            Word(lang="nl_NL", idx=4, sent_idx=0, text="tien", text_with_ws="tien "),
            Word(lang="nl_NL", idx=5, sent_idx=0, text="euro", text_with_ws="euro"),
        ]
        self.assertEqual(words, expected)

    def test_currency_default(self):
        """interpret-as="currency" uses default_currency when no symbol is present."""
        processor = TextProcessor(default_lang="en_US", default_currency="USD")
        graph, root = processor(
            '<say-as interpret-as="currency">10</say-as>', ssml=True
        )
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Currency should be verbalized, despite lack of "$" symbol
        expected = [
            Word(lang="en_US", idx=0, sent_idx=0, text="ten", text_with_ws="ten "),
            Word(lang="en_US", idx=1, sent_idx=0, text="dollars", text_with_ws="dollars"),
        ]
        self.assertEqual(words, expected)

    def test_time(self):
        """A clock time with a colon is verbalized (English)."""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor(" 4:01pm")
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Time should be verbalized; the input's leading space stays on the first word
        expected = [
            Word(lang="en_US", idx=0, sent_idx=0, text="four", text_with_ws=" four "),
            Word(lang="en_US", idx=1, sent_idx=0, text="oh", text_with_ws="oh "),
            Word(lang="en_US", idx=2, sent_idx=0, text="one", text_with_ws="one "),
            Word(lang="en_US", idx=3, sent_idx=0, text="P", text_with_ws="P "),
            Word(lang="en_US", idx=4, sent_idx=0, text="M", text_with_ws="M"),
        ]
        self.assertEqual(words, expected)

    def test_time_no_colon(self):
        """A clock time without a colon is verbalized (English)."""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor("10am")
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Time should be verbalized
        expected = [
            Word(lang="en_US", idx=0, sent_idx=0, text="ten", text_with_ws="ten "),
            Word(lang="en_US", idx=1, sent_idx=0, text="A", text_with_ws="A "),
            Word(lang="en_US", idx=2, sent_idx=0, text="M", text_with_ws="M"),
        ]
        self.assertEqual(words, expected)

    def test_date_one_language(self):
        """A date is verbalized in the default language."""
        processor = TextProcessor(default_lang="en_US", word_breaks={"-"})
        graph, root = processor("4/1/1999")
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Date should be verbalized
        expected = [
            Word(lang="en_US", idx=0, sent_idx=0, text="April", text_with_ws="April "),
            Word(lang="en_US", idx=1, sent_idx=0, text="first", text_with_ws="first"),
            Word(
                lang="en_US",
                idx=2,
                sent_idx=0,
                text=",",
                text_with_ws=", ",
                is_minor_break=True,
            ),
            Word(lang="en_US", idx=3, sent_idx=0, text="nineteen", text_with_ws="nineteen "),
            Word(lang="en_US", idx=4, sent_idx=0, text="ninety", text_with_ws="ninety "),
            Word(lang="en_US", idx=5, sent_idx=0, text="nine", text_with_ws="nine"),
        ]
        self.assertEqual(words, expected)

    def test_date_multiple_languages(self):
        """Dates are verbalized per sentence language given via SSML <s lang>."""
        processor = TextProcessor(default_lang="en_US", word_breaks={"-"})
        graph, root = processor(
            '<speak><s>4/1/1999</s> <s lang="fr_FR">4/1/1999</s><s lang="de_DE">01.04.1999</s></speak>',
            ssml=True,
            phonemize=False,  # ensure French year is split
        )
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        english = [
            Word(lang="en_US", idx=0, sent_idx=0, text="April", text_with_ws="April "),
            Word(lang="en_US", idx=1, sent_idx=0, text="first", text_with_ws="first"),
            Word(
                lang="en_US",
                idx=2,
                sent_idx=0,
                text=",",
                text_with_ws=", ",
                is_minor_break=True,
            ),
            Word(lang="en_US", idx=3, sent_idx=0, text="nineteen", text_with_ws="nineteen "),
            Word(lang="en_US", idx=4, sent_idx=0, text="ninety", text_with_ws="ninety "),
            Word(lang="en_US", idx=5, sent_idx=0, text="nine", text_with_ws="nine"),
        ]
        french = [
            Word(lang="fr_FR", idx=0, sent_idx=1, text="quatrième", text_with_ws="quatrième "),
            Word(lang="fr_FR", idx=1, sent_idx=1, text="janvier", text_with_ws="janvier "),
            Word(lang="fr_FR", idx=2, sent_idx=1, text="mille", text_with_ws="mille "),
            Word(lang="fr_FR", idx=3, sent_idx=1, text="neuf", text_with_ws="neuf "),
            Word(lang="fr_FR", idx=4, sent_idx=1, text="cent", text_with_ws="cent "),
            Word(lang="fr_FR", idx=5, sent_idx=1, text="quatre", text_with_ws="quatre "),
            Word(lang="fr_FR", idx=6, sent_idx=1, text="vingt", text_with_ws="vingt "),
            Word(lang="fr_FR", idx=7, sent_idx=1, text="dix", text_with_ws="dix "),
            Word(lang="fr_FR", idx=8, sent_idx=1, text="neuf", text_with_ws="neuf"),
        ]
        german = [
            Word(lang="de_DE", idx=0, sent_idx=2, text="erste", text_with_ws="erste "),
            Word(lang="de_DE", idx=1, sent_idx=2, text="April", text_with_ws="April "),
            Word(
                lang="de_DE",
                idx=2,
                sent_idx=2,
                text="neunzehnhundertneunundneunzig",
                text_with_ws="neunzehnhundertneunundneunzig",
            ),
        ]

        # Date should be verbalized in each sentence's language
        self.assertEqual(words, english + french + german)

    def test_date_format_ordinal(self):
        """Test an explicit month/day date format in SSML (format="md").

        NOTE(review): the test name says "ordinal", but the expected day word
        below is the cardinal "one", whereas the unformatted date tests expect
        "first". Confirm whether an ordinal-day format was intended here.
        """
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor(
            '<say-as interpret-as="date" format="md">4/1</say-as>', ssml=True
        )
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Date is forced to be interpreted as month + day; day is spoken as "one"
        expected = [
            Word(lang="en_US", idx=0, sent_idx=0, text="April", text_with_ws="April "),
            Word(lang="en_US", idx=1, sent_idx=0, text="one", text_with_ws="one"),
        ]
        self.assertEqual(words, expected)

    def test_date_format_cardinal(self):
        """Test date format in SSML (cardinal day, format="dmy")."""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor(
            '<say-as interpret-as="date" format="dmy">4/1/2000</say-as>', ssml=True
        )
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Date is forced to be interpreted and formatted using day cardinal (one)
        expected = [
            Word(lang="en_US", idx=0, sent_idx=0, text="one", text_with_ws="one "),
            Word(lang="en_US", idx=1, sent_idx=0, text="April", text_with_ws="April "),
            Word(lang="en_US", idx=2, sent_idx=0, text="two", text_with_ws="two "),
            Word(lang="en_US", idx=3, sent_idx=0, text="thousand", text_with_ws="thousand"),
        ]
        self.assertEqual(words, expected)

    def test_part_of_speech_tagging(self):
        """Tags returned by get_parts_of_speech end up in Word.pos."""

        def get_parts_of_speech(words, *args, **kwargs):
            # Made-up tagger that just gives the UPPER of the word back
            return [w.upper() for w in words]

        processor = TextProcessor(get_parts_of_speech=get_parts_of_speech)
        graph, root = processor("a test")
        words = list(processor.words(graph, root, explicit_lang=False, phonemes=False))

        # Fake POS tags are added
        expected = [
            Word(idx=0, sent_idx=0, text="a", text_with_ws="a ", pos="A"),
            Word(idx=1, sent_idx=0, text="test", text_with_ws="test", pos="TEST"),
        ]
        self.assertEqual(words, expected)

    def test_phonemize_one_language(self):
        """Phonemes returned by lookup_phonemes end up in Word.phonemes."""

        def lookup_phonemes(word: str, *args, **kwargs):
            # Made-up phonemizer that just gives back the letters
            return list(word)

        processor = TextProcessor(lookup_phonemes=lookup_phonemes)
        graph, root = processor("test")
        words = list(processor.words(graph, root, pos=False, explicit_lang=False))

        # Single word is "phonemized"
        expected = [
            Word(
                idx=0,
                sent_idx=0,
                text="test",
                text_with_ws="test",
                phonemes=["t", "e", "s", "t"],
            ),
        ]
        self.assertEqual(words, expected)

    def test_phonemize_one_language_multiple_roles(self):
        """The SSML <w role> value is passed through to lookup_phonemes."""

        def lookup_phonemes(word, role=None, **kwargs):
            # Made-up phonemizer: upper-case letters when a role is provided
            if role:
                return list(word.upper())
            return list(word)

        processor = TextProcessor(lookup_phonemes=lookup_phonemes)

        # Use made-up role
        graph, root = processor(
            '<speak>test <w role="some_role">test</w></speak>', ssml=True, pos=False
        )
        words = list(processor.words(graph, root, pos=False, explicit_lang=False))

        # Single word is phonemized two different manners depending on role
        expected = [
            Word(
                idx=0,
                sent_idx=0,
                text="test",
                text_with_ws="test ",
                phonemes=["t", "e", "s", "t"],
            ),
            Word(
                idx=1,
                sent_idx=0,
                text="test",
                text_with_ws="test",
                phonemes=["T", "E", "S", "T"],
            ),
        ]
        self.assertEqual(words, expected)

    def test_phonemize_multiple_languages(self):
        """Each language's own lookup_phonemes is used for words in that language."""

        def en_lookup_phonemes(word: str, *args, **kwargs):
            return list(word)

        def de_lookup_phonemes(word: str, *args, **kwargs):
            return list(word.upper())

        german_settings = TextProcessorSettings(
            lang="de_DE", lookup_phonemes=de_lookup_phonemes
        )
        processor = TextProcessor(
            default_lang="en_US",
            lookup_phonemes=en_lookup_phonemes,
            settings={"de_DE": german_settings},
        )
        graph, root = processor(
            '<speak>test <w lang="de_DE">test</w></speak>', ssml=True
        )
        words = list(processor.words(graph, root))

        # Single word is phonemized according to the lexicon with two different languages
        expected = [
            Word(
                lang="en_US",
                idx=0,
                sent_idx=0,
                text="test",
                text_with_ws="test ",
                phonemes=["t", "e", "s", "t"],
            ),
            Word(
                lang="de_DE",
                idx=1,
                sent_idx=0,
                text="test",
                text_with_ws="test",
                phonemes=["T", "E", "S", "T"],
            ),
        ]
        self.assertEqual(words, expected)

    def test_sub(self):
        """SSML <sub alias> replaces the element text with the alias words."""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor(
            '<speak><sub alias="World Wide Web Consortium">W3C</sub></speak>', ssml=True
        )
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Single word is replaced by multiple words
        expected = [
            Word(idx=0, sent_idx=0, text="World", text_with_ws="World "),
            Word(idx=1, sent_idx=0, text="Wide", text_with_ws="Wide "),
            Word(idx=2, sent_idx=0, text="Web", text_with_ws="Web "),
            Word(idx=3, sent_idx=0, text="Consortium", text_with_ws="Consortium"),
        ]
        self.assertEqual(words, expected)

    def test_break(self):
        """SSML <break> times are attached to the neighbouring sentence or word."""
        processor = TextProcessor(default_lang="en_US", keep_whitespace=False)
        # The SSML literal is kept flush-left so that it carries no indentation.
        graph, root = processor(
            """
<speak>
<break time="1s"/>
<p>
<break time="2s" />
<s>
<break time="3s" />
Break <break time="4s" /> here
</s>
<break time="5s" />
</p>
<break time="6s" />
</speak>
""",
            ssml=True,
        )
        sentences = list(processor.sentences(graph, root, **WORDS_KWARGS))

        # Break times are attached to appropriate elements:
        # 1s + 2s precede the sentence, 5s + 6s follow it,
        # 3s precedes and 4s follows the word "Break".
        expected = [
            Sentence(
                idx=0,
                text="Break here",
                text_with_ws="Break here",
                text_spoken="Break here",
                pause_before_ms=((1 + 2) * 1000),
                pause_after_ms=((5 + 6) * 1000),
                words=[
                    Word(
                        idx=0,
                        sent_idx=0,
                        text="Break",
                        text_with_ws="Break",
                        pause_before_ms=(3 * 1000),
                        pause_after_ms=(4 * 1000),
                    ),
                    Word(idx=1, sent_idx=0, text="here", text_with_ws="here"),
                ],
            ),
        ]
        self.assertEqual(sentences, expected)

    def test_mark(self):
        """SSML <mark> names are attached to the neighbouring sentence or word."""
        processor = TextProcessor(default_lang="en_US", keep_whitespace=False)
        # The SSML literal is kept flush-left so that it carries no indentation.
        graph, root = processor(
            """
<speak>
<mark name="a"/>
<p>
<mark name="b" />
<s>
<mark name="c" />
Mark <mark name="d" /> here
</s>
<mark name="e" />
</p>
<mark name="f" />
</speak>
""",
            ssml=True,
        )
        sentences = list(processor.sentences(graph, root, **WORDS_KWARGS))

        # Mark names are attached to appropriate elements:
        # a, b precede the sentence, e, f follow it,
        # c precedes and d follows the word "Mark".
        expected = [
            Sentence(
                idx=0,
                text="Mark here",
                text_with_ws="Mark here",
                text_spoken="Mark here",
                marks_before=["a", "b"],
                marks_after=["e", "f"],
                words=[
                    Word(
                        idx=0,
                        sent_idx=0,
                        text="Mark",
                        text_with_ws="Mark",
                        marks_before=["c"],
                        marks_after=["d"],
                    ),
                    Word(idx=1, sent_idx=0, text="here", text_with_ws="here"),
                ],
            ),
        ]
        self.assertEqual(sentences, expected)

    def test_missing_speak(self):
        """SSML without an enclosing <speak> element is still processed."""
        processor = TextProcessor()
        graph, root = processor("<s>hello</s><s>world</s>", ssml=True)
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # <speak> is automatically added when XML fails to parse
        expected = [
            Word(idx=0, sent_idx=0, text="hello", text_with_ws="hello"),
            Word(idx=0, sent_idx=1, text="world", text_with_ws="world"),
        ]
        self.assertEqual(words, expected)

    def test_adjacent_voice(self):
        """Adjacent SSML <voice> elements yield separate sentences with their voice."""
        processor = TextProcessor()
        graph, root = processor(
            '<voice name="a">hello.</voice><voice name="b">world.</voice>', ssml=True
        )
        words = list(processor.words(graph, root, major_breaks=False, **WORDS_KWARGS))

        expected = [
            Word(idx=0, sent_idx=0, voice="a", text="hello", text_with_ws="hello"),
            Word(idx=0, sent_idx=1, voice="b", text="world", text_with_ws="world"),
        ]
        self.assertEqual(words, expected)

    def test_multiple_passes(self):
        """A token that needs more than one processing pass is fully resolved."""
        processor = TextProcessor()
        graph, root = processor("ABCD-10")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # 1) ABCD-10 -> ABCD 10
        # 2) ABCD 10 -> A B C D ten
        expected = [
            Word(idx=0, text="A", text_with_ws="A "),
            Word(idx=1, text="B", text_with_ws="B "),
            Word(idx=2, text="C", text_with_ws="C "),
            Word(idx=3, text="D", text_with_ws="D "),
            Word(idx=4, text="ten", text_with_ws="ten"),
        ]
        self.assertEqual(words, expected)

    def test_number_nonfinite(self):
        """The words "nan" and "inf" are left as plain words."""
        processor = TextProcessor()
        graph, root = processor("nan inf")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Words should not be parsed as numbers
        expected = [
            Word(idx=0, text="nan", text_with_ws="nan "),
            Word(idx=1, text="inf", text_with_ws="inf"),
        ]
        self.assertEqual(words, expected)

    def test_override_initialism(self):
        """An inline SSML lexicon entry stops a word being split as an initialism."""
        processor = TextProcessor()

        # Without a lexicon entry the word is interpreted as an initialism
        graph, root = processor("ROOFUS")
        words = list(processor.words(graph, root, **WORDS_KWARGS))
        expected_letters = [
            Word(idx=0, text="R", text_with_ws="R "),
            Word(idx=1, text="O", text_with_ws="O "),
            Word(idx=2, text="O", text_with_ws="O "),
            Word(idx=3, text="F", text_with_ws="F "),
            Word(idx=4, text="U", text_with_ws="U "),
            Word(idx=5, text="S", text_with_ws="S"),
        ]
        self.assertEqual(words, expected_letters)

        # The SSML literal is kept flush-left so that it carries no indentation.
        graph, root = processor(
            """
<speak>
<lexicon>
<lexeme>
<grapheme>ROOFUS</grapheme>
<phoneme>ɹ ˈu f ə s</phoneme>
</lexeme>
</lexicon>
<s>ROOFUS</s>
</speak>""",
            ssml=True,
        )
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Word is *not* interpreted as initialism
        expected_word = [Word(idx=0, text="ROOFUS", text_with_ws="ROOFUS")]
        self.assertEqual(words, expected_word)
def print_graph_stderr(graph, root):
    """Print graph to stderr.

    Delegates to print_graph, passing a print function that writes to
    sys.stderr instead of standard output.
    """

    def _emit(*parts):
        print(*parts, file=sys.stderr)

    print_graph(graph, root, print_func=_emit)
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # Run every test case in this module when the file is executed directly.
    unittest.main()