ai-content-maker/.venv/Lib/site-packages/tests/test_text_processor.py

1099 lines
39 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Tests for TextProcessor"""
import sys
import unittest
from gruut.text_processor import Sentence, TextProcessor, TextProcessorSettings, Word
from gruut.utils import print_graph
# Shared keyword arguments for processor.words()/processor.sentences():
# language, phoneme, and part-of-speech output are all switched off so the
# expected Word objects in most tests only need to spell out text fields.
WORDS_KWARGS = dict(explicit_lang=False, phonemes=False, pos=False)
class TextProcessorTestCase(unittest.TestCase):
    """Tests for TextProcessor.

    Every test feeds plain text or SSML to a ``TextProcessor``, collects the
    resulting ``Word`` (or ``Sentence``) objects, and compares them against a
    literal list of expected objects.
    """

    def test_whitespace(self):
        """Test whitespace preservation"""
        processor = TextProcessor()
        graph, root = processor("This is a test ")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Whitespace is retained by default
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, text="This", text_with_ws="This "),
                Word(idx=1, sent_idx=0, text="is", text_with_ws="is "),
                Word(idx=2, sent_idx=0, text="a", text_with_ws="a "),
                Word(idx=3, sent_idx=0, text="test", text_with_ws="test "),
            ],
        )

    def test_no_whitespace(self):
        """Test disabling of whitespace preservation"""
        processor = TextProcessor(keep_whitespace=False)
        graph, root = processor("This is a test ")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Whitespace is discarded
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, text="This", text_with_ws="This"),
                Word(idx=1, sent_idx=0, text="is", text_with_ws="is"),
                Word(idx=2, sent_idx=0, text="a", text_with_ws="a"),
                Word(idx=3, sent_idx=0, text="test", text_with_ws="test"),
            ],
        )

    def test_punctuation(self):
        """Test splitting of punctuation from around words"""
        processor = TextProcessor(
            begin_punctuations={'"', "«"},
            end_punctuations={'"', "»"},
            minor_breaks={","},
            major_breaks={"."},
        )
        graph, root = processor('This «is», a "test".')
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Punctuations are separated
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, text="This", text_with_ws="This "),
                Word(
                    idx=1, sent_idx=0, text="«", text_with_ws="«", is_punctuation=True
                ),
                Word(idx=2, sent_idx=0, text="is", text_with_ws="is"),
                Word(
                    idx=3, sent_idx=0, text="»", text_with_ws="»", is_punctuation=True
                ),
                Word(
                    idx=4, sent_idx=0, text=",", text_with_ws=", ", is_minor_break=True
                ),
                Word(idx=5, sent_idx=0, text="a", text_with_ws="a "),
                Word(
                    idx=6, sent_idx=0, text='"', text_with_ws='"', is_punctuation=True
                ),
                Word(idx=7, sent_idx=0, text="test", text_with_ws="test"),
                Word(
                    idx=8, sent_idx=0, text='"', text_with_ws='"', is_punctuation=True
                ),
                Word(
                    idx=9, sent_idx=0, text=".", text_with_ws=".", is_major_break=True
                ),
            ],
        )

    def test_punctuation_with_inner_break(self):
        """Test break inside of punctuation"""
        processor = TextProcessor(
            begin_punctuations={'"'}, end_punctuations={'"'}, major_breaks={"."},
        )
        graph, root = processor('Test "one." Test two.')
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # First sentence includes final quote
        self.assertEqual(
            words,
            [
                # First sentence
                Word(idx=0, sent_idx=0, text="Test", text_with_ws="Test "),
                Word(
                    idx=1, sent_idx=0, text='"', text_with_ws='"', is_punctuation=True
                ),
                Word(idx=2, sent_idx=0, text="one", text_with_ws="one"),
                Word(
                    idx=3, sent_idx=0, text=".", text_with_ws=".", is_major_break=True
                ),
                Word(
                    idx=4, sent_idx=0, text='"', text_with_ws='" ', is_punctuation=True
                ),
                # Second sentence
                Word(idx=0, sent_idx=1, text="Test", text_with_ws="Test "),
                Word(idx=1, sent_idx=1, text="two", text_with_ws="two"),
                Word(
                    idx=2, sent_idx=1, text=".", text_with_ws=".", is_major_break=True
                ),
            ],
        )

    def test_replacements(self):
        """Test regex replacements during tokenization"""
        processor = TextProcessor(
            minor_breaks={","},
            major_breaks={"."},
            replacements=[
                ("\\B'", '"'),  # replace single quotes
                ("'\\B", '"'),
                ('[\\<\\>\\(\\)\\[\\]"]+', ""),  # drop brackets/quotes
            ],
        )
        graph, root = processor("\"This,\" [is] <a> (test) 'sentence.'")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Quotes and brackets are discarded
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, text="This", text_with_ws="This"),
                Word(
                    idx=1, sent_idx=0, text=",", text_with_ws=", ", is_minor_break=True
                ),
                Word(idx=2, sent_idx=0, text="is", text_with_ws="is "),
                Word(idx=3, sent_idx=0, text="a", text_with_ws="a "),
                Word(idx=4, sent_idx=0, text="test", text_with_ws="test "),
                Word(idx=5, sent_idx=0, text="sentence", text_with_ws="sentence"),
                Word(
                    idx=6, sent_idx=0, text=".", text_with_ws=".", is_major_break=True
                ),
            ],
        )

    def test_abbreviations(self):
        """Test expansion of abbreviations (with case preservation)"""
        processor = TextProcessor(
            minor_breaks={","},
            major_breaks={".", "?"},
            abbreviations={
                r"^([dD])r\.": r"\1octor",
                r"^([mM])r\.": r"\1ister",
                r"^([sS])t\.": r"\1treet",
            },
        )
        graph, root = processor("Mr.? I'm just a dr., on this St. at least.")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Abbreviations are expanded, maintaining capitalization
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, text="Mister", text_with_ws="Mister"),
                Word(
                    idx=1, sent_idx=0, text="?", text_with_ws="? ", is_major_break=True
                ),
                Word(idx=0, sent_idx=1, text="I'm", text_with_ws="I'm "),
                Word(idx=1, sent_idx=1, text="just", text_with_ws="just "),
                Word(idx=2, sent_idx=1, text="a", text_with_ws="a "),
                Word(idx=3, sent_idx=1, text="doctor", text_with_ws="doctor"),
                Word(
                    idx=4, sent_idx=1, text=",", text_with_ws=", ", is_minor_break=True
                ),
                Word(idx=5, sent_idx=1, text="on", text_with_ws="on "),
                Word(idx=6, sent_idx=1, text="this", text_with_ws="this "),
                Word(idx=7, sent_idx=1, text="Street", text_with_ws="Street "),
                Word(idx=8, sent_idx=1, text="at", text_with_ws="at "),
                Word(idx=9, sent_idx=1, text="least", text_with_ws="least"),
                Word(
                    idx=10, sent_idx=1, text=".", text_with_ws=".", is_major_break=True
                ),
            ],
        )

    def test_multiple_sentences(self):
        """Test sentence break"""
        processor = TextProcessor(major_breaks={".", "!"})
        graph, root = processor("First sentence. Second sentence! ")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Separated by a major break
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, text="First", text_with_ws="First "),
                Word(idx=1, sent_idx=0, text="sentence", text_with_ws="sentence"),
                Word(
                    idx=2, sent_idx=0, text=".", text_with_ws=". ", is_major_break=True
                ),
                Word(idx=0, sent_idx=1, text="Second", text_with_ws="Second "),
                Word(idx=1, sent_idx=1, text="sentence", text_with_ws="sentence"),
                Word(
                    idx=2, sent_idx=1, text="!", text_with_ws="! ", is_major_break=True
                ),
            ],
        )

        # Check sentences too
        sentences = list(processor.sentences(graph, root, **WORDS_KWARGS))
        self.assertEqual(
            sentences,
            [
                Sentence(
                    idx=0,
                    text="First sentence.",
                    text_with_ws="First sentence. ",
                    text_spoken="First sentence",
                    words=[
                        Word(idx=0, sent_idx=0, text="First", text_with_ws="First "),
                        Word(
                            idx=1, sent_idx=0, text="sentence", text_with_ws="sentence"
                        ),
                        Word(
                            idx=2,
                            sent_idx=0,
                            text=".",
                            text_with_ws=". ",
                            is_major_break=True,
                        ),
                    ],
                ),
                Sentence(
                    idx=1,
                    text="Second sentence!",
                    text_with_ws="Second sentence! ",
                    text_spoken="Second sentence",
                    words=[
                        Word(idx=0, sent_idx=1, text="Second", text_with_ws="Second "),
                        Word(
                            idx=1, sent_idx=1, text="sentence", text_with_ws="sentence"
                        ),
                        Word(
                            idx=2,
                            sent_idx=1,
                            text="!",
                            text_with_ws="! ",
                            is_major_break=True,
                        ),
                    ],
                ),
            ],
        )

    def test_multiple_paragraphs(self):
        """Test paragraph index"""
        processor = TextProcessor()
        graph, root = processor(
            "<speak><p>First paragraph</p><p>Second paragraph</p></speak>", ssml=True
        )
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Sentences/words should be in different paragraphs
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, par_idx=0, text="First", text_with_ws="First "),
                Word(
                    idx=1,
                    sent_idx=0,
                    par_idx=0,
                    text="paragraph",
                    text_with_ws="paragraph",
                ),
                Word(
                    idx=0, sent_idx=0, par_idx=1, text="Second", text_with_ws="Second "
                ),
                Word(
                    idx=1,
                    sent_idx=0,
                    par_idx=1,
                    text="paragraph",
                    text_with_ws="paragraph",
                ),
            ],
        )

    def test_explicit_sentence(self):
        """Test <s> in SSML for avoiding sentence break"""
        processor = TextProcessor(major_breaks={".", "!"})
        graph, root = processor("<s>First sentence. Second sentence!</s>", ssml=True)
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Sentences should not be split apart
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, text="First", text_with_ws="First "),
                Word(idx=1, sent_idx=0, text="sentence", text_with_ws="sentence"),
                Word(
                    idx=2, sent_idx=0, text=".", text_with_ws=". ", is_major_break=True
                ),
                Word(idx=3, sent_idx=0, text="Second", text_with_ws="Second "),
                Word(idx=4, sent_idx=0, text="sentence", text_with_ws="sentence"),
                Word(
                    idx=5, sent_idx=0, text="!", text_with_ws="!", is_major_break=True
                ),
            ],
        )

    def test_minor_breaks(self):
        """Test minor (phrase) break"""
        processor = TextProcessor(minor_breaks={","})
        graph, root = processor("this, is a test")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Comma should be split from word
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, text="this", text_with_ws="this"),
                Word(
                    idx=1, sent_idx=0, text=",", text_with_ws=", ", is_minor_break=True
                ),
                Word(idx=2, sent_idx=0, text="is", text_with_ws="is "),
                Word(idx=3, sent_idx=0, text="a", text_with_ws="a "),
                Word(idx=4, sent_idx=0, text="test", text_with_ws="test"),
            ],
        )

    def test_word_breaks(self):
        """Test inner-word break"""
        processor = TextProcessor(word_breaks={"-"})
        graph, root = processor("ninety-nine")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Word should be split
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, text="ninety", text_with_ws="ninety "),
                Word(idx=1, sent_idx=0, text="nine", text_with_ws="nine"),
            ],
        )

    def test_spell_out(self):
        """Test interpret-as="spell-out" in SSML"""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor(
            '<say-as interpret-as="spell-out">test123</say-as>', ssml=True
        )
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Word should be split into letters
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, text="t", text_with_ws="t "),
                Word(idx=1, sent_idx=0, text="e", text_with_ws="e "),
                Word(idx=2, sent_idx=0, text="s", text_with_ws="s "),
                Word(idx=3, sent_idx=0, text="t", text_with_ws="t "),
                Word(idx=4, sent_idx=0, text="one", text_with_ws="one "),
                Word(idx=5, sent_idx=0, text="two", text_with_ws="two "),
                Word(idx=6, sent_idx=0, text="three", text_with_ws="three"),
            ],
        )

    def test_initialisms(self):
        """Test initialism spell out"""
        processor = TextProcessor(
            major_breaks={"."},
            is_initialism=lambda s: s.isalpha() and s.isupper(),
            split_initialism=list,
        )
        graph, root = processor("TTS.")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Letters should be split
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, text="T", text_with_ws="T "),
                Word(idx=1, sent_idx=0, text="T", text_with_ws="T "),
                Word(idx=2, sent_idx=0, text="S", text_with_ws="S"),
                Word(
                    idx=3, sent_idx=0, text=".", text_with_ws=".", is_major_break=True
                ),
            ],
        )

    def test_numbers_one_language(self):
        """Test number verbalization (single language)"""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor("1 2 3")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Numbers should be verbalized
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, text="one", text_with_ws="one "),
                Word(idx=1, sent_idx=0, text="two", text_with_ws="two "),
                Word(idx=2, sent_idx=0, text="three", text_with_ws="three"),
            ],
        )

    def test_numbers_multiple_languages(self):
        """Test number verbalization (SSML, multiple languages)"""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor(
            '1 <w lang="es_ES">2</w> <w lang="de_DE">3</w>', ssml=True
        )
        # pos=False, like the other verbalization tests: the expected words
        # carry no part-of-speech tag, so none must be emitted.
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Numbers should be verbalized
        self.assertEqual(
            words,
            [
                Word(lang="en_US", idx=0, sent_idx=0, text="one", text_with_ws="one "),
                Word(lang="es_ES", idx=1, sent_idx=0, text="dos", text_with_ws="dos "),
                Word(lang="de_DE", idx=2, sent_idx=0, text="drei", text_with_ws="drei"),
            ],
        )

    def test_currency_one_language(self):
        """Test currency verbalization (single language)"""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor("$10")
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Currency should be verbalized
        self.assertEqual(
            words,
            [
                Word(lang="en_US", idx=0, sent_idx=0, text="ten", text_with_ws="ten "),
                Word(
                    lang="en_US",
                    idx=1,
                    sent_idx=0,
                    text="dollars",
                    text_with_ws="dollars",
                ),
            ],
        )

    def test_currency_multiple_language(self):
        """Test currency verbalization (SSML, multiple languages)"""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor(
            '€10 <w lang="fr_FR">€10</w> <w lang="nl_NL">€10</w>',
            ssml=True,
            phonemize=False,
        )
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Currencies should be verbalized
        self.assertEqual(
            words,
            [
                Word(lang="en_US", idx=0, sent_idx=0, text="ten", text_with_ws="ten "),
                Word(
                    lang="en_US", idx=1, sent_idx=0, text="euro", text_with_ws="euro "
                ),
                Word(lang="fr_FR", idx=2, sent_idx=0, text="dix", text_with_ws="dix "),
                Word(
                    lang="fr_FR", idx=3, sent_idx=0, text="euros", text_with_ws="euros "
                ),
                Word(
                    lang="nl_NL", idx=4, sent_idx=0, text="tien", text_with_ws="tien "
                ),
                Word(lang="nl_NL", idx=5, sent_idx=0, text="euro", text_with_ws="euro"),
            ],
        )

    def test_currency_default(self):
        """Test default currency use when no currency symbol (interpret-as="currency")"""
        processor = TextProcessor(default_lang="en_US", default_currency="USD")
        graph, root = processor(
            '<say-as interpret-as="currency">10</say-as>', ssml=True
        )
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Currency should be verbalized, despite lack of "$" symbol
        self.assertEqual(
            words,
            [
                Word(lang="en_US", idx=0, sent_idx=0, text="ten", text_with_ws="ten "),
                Word(
                    lang="en_US",
                    idx=1,
                    sent_idx=0,
                    text="dollars",
                    text_with_ws="dollars",
                ),
            ],
        )

    def test_time(self):
        """Test time verbalization (English)"""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor(" 4:01pm")
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Time should be verbalized
        self.assertEqual(
            words,
            [
                Word(
                    lang="en_US", idx=0, sent_idx=0, text="four", text_with_ws=" four "
                ),
                Word(lang="en_US", idx=1, sent_idx=0, text="oh", text_with_ws="oh "),
                Word(lang="en_US", idx=2, sent_idx=0, text="one", text_with_ws="one "),
                Word(lang="en_US", idx=3, sent_idx=0, text="P", text_with_ws="P "),
                Word(lang="en_US", idx=4, sent_idx=0, text="M", text_with_ws="M"),
            ],
        )

    def test_time_no_colon(self):
        """Test time verbalization without a colon (English)"""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor("10am")
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Time should be verbalized
        self.assertEqual(
            words,
            [
                Word(lang="en_US", idx=0, sent_idx=0, text="ten", text_with_ws="ten "),
                Word(lang="en_US", idx=1, sent_idx=0, text="A", text_with_ws="A "),
                Word(lang="en_US", idx=2, sent_idx=0, text="M", text_with_ws="M"),
            ],
        )

    def test_date_one_language(self):
        """Test date verbalization (single language)"""
        processor = TextProcessor(default_lang="en_US", word_breaks={"-"})
        graph, root = processor("4/1/1999")
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Date should be verbalized
        self.assertEqual(
            words,
            [
                Word(
                    lang="en_US", idx=0, sent_idx=0, text="April", text_with_ws="April "
                ),
                Word(
                    lang="en_US", idx=1, sent_idx=0, text="first", text_with_ws="first"
                ),
                Word(
                    lang="en_US",
                    idx=2,
                    sent_idx=0,
                    text=",",
                    text_with_ws=", ",
                    is_minor_break=True,
                ),
                Word(
                    lang="en_US",
                    idx=3,
                    sent_idx=0,
                    text="nineteen",
                    text_with_ws="nineteen ",
                ),
                Word(
                    lang="en_US",
                    idx=4,
                    sent_idx=0,
                    text="ninety",
                    text_with_ws="ninety ",
                ),
                Word(lang="en_US", idx=5, sent_idx=0, text="nine", text_with_ws="nine"),
            ],
        )

    def test_date_multiple_languages(self):
        """Test date verbalization (SSML, multiple languages)"""
        processor = TextProcessor(default_lang="en_US", word_breaks={"-"})
        graph, root = processor(
            '<speak><s>4/1/1999</s> <s lang="fr_FR">4/1/1999</s><s lang="de_DE">01.04.1999</s></speak>',
            ssml=True,
            phonemize=False,  # ensure French year is split
        )
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Date should be verbalized
        self.assertEqual(
            words,
            [
                # English
                Word(
                    lang="en_US", idx=0, sent_idx=0, text="April", text_with_ws="April "
                ),
                Word(
                    lang="en_US", idx=1, sent_idx=0, text="first", text_with_ws="first"
                ),
                Word(
                    lang="en_US",
                    idx=2,
                    sent_idx=0,
                    text=",",
                    text_with_ws=", ",
                    is_minor_break=True,
                ),
                Word(
                    lang="en_US",
                    idx=3,
                    sent_idx=0,
                    text="nineteen",
                    text_with_ws="nineteen ",
                ),
                Word(
                    lang="en_US",
                    idx=4,
                    sent_idx=0,
                    text="ninety",
                    text_with_ws="ninety ",
                ),
                Word(lang="en_US", idx=5, sent_idx=0, text="nine", text_with_ws="nine"),
                # French
                Word(
                    lang="fr_FR",
                    idx=0,
                    sent_idx=1,
                    text="quatrième",
                    text_with_ws="quatrième ",
                ),
                Word(
                    lang="fr_FR",
                    idx=1,
                    sent_idx=1,
                    text="janvier",
                    text_with_ws="janvier ",
                ),
                Word(
                    lang="fr_FR", idx=2, sent_idx=1, text="mille", text_with_ws="mille "
                ),
                Word(
                    lang="fr_FR", idx=3, sent_idx=1, text="neuf", text_with_ws="neuf "
                ),
                Word(
                    lang="fr_FR", idx=4, sent_idx=1, text="cent", text_with_ws="cent "
                ),
                Word(
                    lang="fr_FR",
                    idx=5,
                    sent_idx=1,
                    text="quatre",
                    text_with_ws="quatre ",
                ),
                Word(
                    lang="fr_FR", idx=6, sent_idx=1, text="vingt", text_with_ws="vingt "
                ),
                Word(lang="fr_FR", idx=7, sent_idx=1, text="dix", text_with_ws="dix "),
                Word(lang="fr_FR", idx=8, sent_idx=1, text="neuf", text_with_ws="neuf"),
                # German
                Word(
                    lang="de_DE",
                    idx=0,
                    sent_idx=2,
                    text="erste",
                    text_with_ws="erste ",
                ),
                Word(
                    lang="de_DE",
                    idx=1,
                    sent_idx=2,
                    text="April",
                    text_with_ws="April ",
                ),
                Word(
                    lang="de_DE",
                    idx=2,
                    sent_idx=2,
                    text="neunzehnhundertneunundneunzig",
                    text_with_ws="neunzehnhundertneunundneunzig",
                ),
            ],
        )

    def test_date_format_ordinal(self):
        """Test date format in SSML (ordinal)"""
        processor = TextProcessor(default_lang="en_US")
        # "o" in the format string asks for the ordinal day. The previous "md"
        # selected the cardinal day ("one"), so the ordinal path was untested.
        # NOTE(review): confirm "mo" is accepted by the installed gruut version.
        graph, root = processor(
            '<say-as interpret-as="date" format="mo">4/1</say-as>', ssml=True
        )
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Date is forced to be interpreted and format using day ordinal (first)
        self.assertEqual(
            words,
            [
                Word(
                    lang="en_US", idx=0, sent_idx=0, text="April", text_with_ws="April "
                ),
                Word(
                    lang="en_US", idx=1, sent_idx=0, text="first", text_with_ws="first"
                ),
            ],
        )

    def test_date_format_cardinal(self):
        """Test date format in SSML (cardinal)"""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor(
            '<say-as interpret-as="date" format="dmy">4/1/2000</say-as>', ssml=True
        )
        words = list(processor.words(graph, root, phonemes=False, pos=False))

        # Date is forced to be interpreted as day/month/year and formatted
        # using the day cardinal (one)
        self.assertEqual(
            words,
            [
                Word(lang="en_US", idx=0, sent_idx=0, text="one", text_with_ws="one "),
                Word(
                    lang="en_US", idx=1, sent_idx=0, text="April", text_with_ws="April "
                ),
                Word(lang="en_US", idx=2, sent_idx=0, text="two", text_with_ws="two "),
                Word(
                    lang="en_US",
                    idx=3,
                    sent_idx=0,
                    text="thousand",
                    text_with_ws="thousand",
                ),
            ],
        )

    def test_part_of_speech_tagging(self):
        """Test part-of-speech tagging"""

        def get_parts_of_speech(words, *args, **kwargs):
            return [w.upper() for w in words]

        processor = TextProcessor(
            # Made-up tagger that just gives the UPPER of the word back
            get_parts_of_speech=get_parts_of_speech
        )
        graph, root = processor("a test")
        words = list(processor.words(graph, root, explicit_lang=False, phonemes=False))

        # Fake POS tags are added
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, text="a", text_with_ws="a ", pos="A"),
                Word(idx=1, sent_idx=0, text="test", text_with_ws="test", pos="TEST"),
            ],
        )

    def test_phonemize_one_language(self):
        """Test phonemizer (single language)"""

        def lookup_phonemes(word: str, *args, **kwargs):
            return list(word)

        processor = TextProcessor(
            # Made-up phonemizer that just gives back the letters
            lookup_phonemes=lookup_phonemes,
        )
        graph, root = processor("test")
        words = list(processor.words(graph, root, pos=False, explicit_lang=False))

        # Single word is "phonemized"
        self.assertEqual(
            words,
            [
                Word(
                    idx=0,
                    sent_idx=0,
                    text="test",
                    text_with_ws="test",
                    phonemes=["t", "e", "s", "t"],
                ),
            ],
        )

    def test_phonemize_one_language_multiple_roles(self):
        """Test phonemizer (SSML, multiple word roles)"""

        def lookup_phonemes(word, role=None, **kwargs):
            return list(word) if not role else list(word.upper())

        processor = TextProcessor(
            # Made-up phonemizer that gives back upper-case letters if a role is provided
            lookup_phonemes=lookup_phonemes
        )

        # Use made-up role
        graph, root = processor(
            '<speak>test <w role="some_role">test</w></speak>', ssml=True, pos=False
        )
        words = list(processor.words(graph, root, pos=False, explicit_lang=False))

        # Single word is phonemized two different manners depending on role
        self.assertEqual(
            words,
            [
                Word(
                    idx=0,
                    sent_idx=0,
                    text="test",
                    text_with_ws="test ",
                    phonemes=["t", "e", "s", "t"],
                ),
                Word(
                    idx=1,
                    sent_idx=0,
                    text="test",
                    text_with_ws="test",
                    phonemes=["T", "E", "S", "T"],
                ),
            ],
        )

    def test_phonemize_multiple_languages(self):
        """Test phonemizer (SSML, multiple languages)"""

        def en_lookup_phonemes(word: str, *args, **kwargs):
            return list(word)

        def de_lookup_phonemes(word: str, *args, **kwargs):
            return list(word.upper())

        processor = TextProcessor(
            default_lang="en_US",
            lookup_phonemes=en_lookup_phonemes,
            settings={
                "de_DE": TextProcessorSettings(
                    lang="de_DE", lookup_phonemes=de_lookup_phonemes
                )
            },
        )
        graph, root = processor(
            '<speak>test <w lang="de_DE">test</w></speak>', ssml=True
        )
        words = list(processor.words(graph, root))

        # Single word is phonemized according to the lexicon with two different languages
        self.assertEqual(
            words,
            [
                Word(
                    lang="en_US",
                    idx=0,
                    sent_idx=0,
                    text="test",
                    text_with_ws="test ",
                    phonemes=["t", "e", "s", "t"],
                ),
                Word(
                    lang="de_DE",
                    idx=1,
                    sent_idx=0,
                    text="test",
                    text_with_ws="test",
                    phonemes=["T", "E", "S", "T"],
                ),
            ],
        )

    def test_sub(self):
        """Test SSML substitution"""
        processor = TextProcessor(default_lang="en_US")
        graph, root = processor(
            '<speak><sub alias="World Wide Web Consortium">W3C</sub></speak>', ssml=True
        )
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Single word is replaced by multiple words
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, text="World", text_with_ws="World ",),
                Word(idx=1, sent_idx=0, text="Wide", text_with_ws="Wide ",),
                Word(idx=2, sent_idx=0, text="Web", text_with_ws="Web ",),
                Word(idx=3, sent_idx=0, text="Consortium", text_with_ws="Consortium",),
            ],
        )

    def test_break(self):
        """Test SSML break tag"""
        processor = TextProcessor(default_lang="en_US", keep_whitespace=False)
        # The SSML literal is kept flush-left on purpose: its whitespace is
        # part of the input handed to the processor.
        graph, root = processor(
            """
<speak>
<break time="1s"/>
<p>
<break time="2s" />
<s>
<break time="3s" />
Break <break time="4s" /> here
</s>
<break time="5s" />
</p>
<break time="6s" />
</speak>
""",
            ssml=True,
        )
        sentences = list(processor.sentences(graph, root, **WORDS_KWARGS))

        # Break times are attached to appropriate elements
        self.assertEqual(
            sentences,
            [
                Sentence(
                    idx=0,
                    text="Break here",
                    text_with_ws="Break here",
                    text_spoken="Break here",
                    pause_before_ms=((1 + 2) * 1000),
                    pause_after_ms=((5 + 6) * 1000),
                    words=[
                        Word(
                            idx=0,
                            sent_idx=0,
                            text="Break",
                            text_with_ws="Break",
                            pause_before_ms=(3 * 1000),
                            pause_after_ms=(4 * 1000),
                        ),
                        Word(idx=1, sent_idx=0, text="here", text_with_ws="here",),
                    ],
                ),
            ],
        )

    def test_mark(self):
        """Test SSML mark tag"""
        processor = TextProcessor(default_lang="en_US", keep_whitespace=False)
        # The SSML literal is kept flush-left on purpose: its whitespace is
        # part of the input handed to the processor.
        graph, root = processor(
            """
<speak>
<mark name="a"/>
<p>
<mark name="b" />
<s>
<mark name="c" />
Mark <mark name="d" /> here
</s>
<mark name="e" />
</p>
<mark name="f" />
</speak>
""",
            ssml=True,
        )
        sentences = list(processor.sentences(graph, root, **WORDS_KWARGS))

        # Mark names are attached to appropriate elements
        self.assertEqual(
            sentences,
            [
                Sentence(
                    idx=0,
                    text="Mark here",
                    text_with_ws="Mark here",
                    text_spoken="Mark here",
                    marks_before=["a", "b"],
                    marks_after=["e", "f"],
                    words=[
                        Word(
                            idx=0,
                            sent_idx=0,
                            text="Mark",
                            text_with_ws="Mark",
                            marks_before=["c"],
                            marks_after=["d"],
                        ),
                        Word(idx=1, sent_idx=0, text="here", text_with_ws="here",),
                    ],
                ),
            ],
        )

    def test_missing_speak(self):
        """Test SSML with missing <speak> tag"""
        processor = TextProcessor()
        graph, root = processor("<s>hello</s><s>world</s>", ssml=True,)
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # <speak> is automatically added when XML fails to parse
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, text="hello", text_with_ws="hello",),
                Word(idx=0, sent_idx=1, text="world", text_with_ws="world",),
            ],
        )

    def test_adjacent_voice(self):
        """Test SSML with adjacent <voice> tags"""
        processor = TextProcessor()
        graph, root = processor(
            '<voice name="a">hello.</voice><voice name="b">world.</voice>', ssml=True,
        )
        # major_breaks=False is passed, and the expected list below contains
        # no "." entries
        words = list(processor.words(graph, root, major_breaks=False, **WORDS_KWARGS))

        # Each word carries the voice of its enclosing <voice> element
        self.assertEqual(
            words,
            [
                Word(idx=0, sent_idx=0, voice="a", text="hello", text_with_ws="hello",),
                Word(idx=0, sent_idx=1, voice="b", text="world", text_with_ws="world",),
            ],
        )

    def test_multiple_passes(self):
        """Test sentence that needs multiple passes to fully resolve"""
        processor = TextProcessor()
        graph, root = processor("ABCD-10")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # 1) ABCD-10 -> ABCD 10
        # 2) ABCD 10 -> A B C D ten
        self.assertEqual(
            words,
            [
                Word(idx=0, text="A", text_with_ws="A ",),
                Word(idx=1, text="B", text_with_ws="B ",),
                Word(idx=2, text="C", text_with_ws="C ",),
                Word(idx=3, text="D", text_with_ws="D ",),
                Word(idx=4, text="ten", text_with_ws="ten",),
            ],
        )

    def test_number_nonfinite(self):
        """Test sentence with nan or inf"""
        processor = TextProcessor()
        graph, root = processor("nan inf")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Words should not be parsed as numbers
        self.assertEqual(
            words,
            [
                Word(idx=0, text="nan", text_with_ws="nan ",),
                Word(idx=1, text="inf", text_with_ws="inf",),
            ],
        )

    def test_override_initialism(self):
        """Test use of inline lexicon pronunciation to override an initialism"""
        processor = TextProcessor()
        graph, root = processor("ROOFUS")
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Word is interpreted as initialism
        self.assertEqual(
            words,
            [
                Word(idx=0, text="R", text_with_ws="R ",),
                Word(idx=1, text="O", text_with_ws="O ",),
                Word(idx=2, text="O", text_with_ws="O ",),
                Word(idx=3, text="F", text_with_ws="F ",),
                Word(idx=4, text="U", text_with_ws="U ",),
                Word(idx=5, text="S", text_with_ws="S",),
            ],
        )

        # The SSML literal is kept flush-left on purpose: its whitespace is
        # part of the input handed to the processor.
        graph, root = processor(
            """
<speak>
<lexicon>
<lexeme>
<grapheme>ROOFUS</grapheme>
<phoneme>ɹ ˈu f ə s</phoneme>
</lexeme>
</lexicon>
<s>ROOFUS</s>
</speak>""",
            ssml=True,
        )
        words = list(processor.words(graph, root, **WORDS_KWARGS))

        # Word is *not* interpreted as initialism
        self.assertEqual(
            words, [Word(idx=0, text="ROOFUS", text_with_ws="ROOFUS",)],
        )
def print_graph_stderr(graph, root):
    """Write the printed form of the graph to standard error."""

    def _emit(*parts):
        # Same call print_graph would make by default, but aimed at stderr
        print(*parts, file=sys.stderr)

    print_graph(graph, root, print_func=_emit)
# -----------------------------------------------------------------------------
# Run the tests in this module when the file is executed directly as a script
if __name__ == "__main__":
    unittest.main()