ai-content-maker/.venv/Lib/site-packages/gruut/text_processor.py

2355 lines
87 KiB
Python
Raw Normal View History

2024-05-03 04:18:51 +03:00
#!/usr/bin/env python3
"""Tokenizes, verbalizes, and phonemizes text and SSML"""
import itertools
import logging
import re
import typing
import xml.etree.ElementTree as etree
from decimal import Decimal
from pathlib import Path
import babel
import babel.numbers
import dateparser
import networkx as nx
from gruut_ipa import IPA
from num2words import num2words
from gruut.const import (
DATA_PROP,
PHONEMES_TYPE,
REGEX_PATTERN,
BreakNode,
BreakType,
BreakWordNode,
EndElement,
GraphType,
IgnoreNode,
InlineLexicon,
InterpretAs,
InterpretAsFormat,
Lexeme,
MarkNode,
Node,
ParagraphNode,
PunctuationWordNode,
Sentence,
SentenceNode,
SpeakNode,
SSMLParsingState,
TextProcessorSettings,
Word,
WordNode,
WordRole,
)
from gruut.lang import get_settings
from gruut.utils import (
attrib_no_namespace,
leaves,
load_lexicon,
maybe_split_ipa,
pipeline_split,
pipeline_transform,
resolve_lang,
tag_no_namespace,
text_and_elements,
)
# -----------------------------------------------------------------------------
# Module-level logger used by TextProcessor for debug and error messages
_LOGGER = logging.getLogger("gruut.text_processor")
# Id of the "default" inline lexicon. A <lexicon> without an id attribute is
# stored under this key, and it is consulted after any ids pushed by <lookup>.
DEFAULT_LEXICON_ID = ""
# -----------------------------------------------------------------------------
class TextProcessor:
"""Tokenizes, verbalizes, and phonemizes text and SSML"""
def __init__(
    self,
    default_lang: str = "en_US",
    model_prefix: str = "",
    lang_dirs: typing.Optional[typing.Dict[str, typing.Union[str, Path]]] = None,
    search_dirs: typing.Optional[typing.Iterable[typing.Union[str, Path]]] = None,
    settings: typing.Optional[
        typing.MutableMapping[str, TextProcessorSettings]
    ] = None,
    **kwargs,
):
    """
    Create a text processor.

    Args:
        default_lang: language used when none is given to other methods
        model_prefix: forwarded to get_settings when default settings are created
        lang_dirs: optional map from language to its data directory
        search_dirs: forwarded to get_settings when default settings are created
        settings: optional pre-built settings per language (used as the cache)
        kwargs: extra keyword arguments forwarded to get_settings
    """
    self.default_lang = default_lang
    self.model_prefix = model_prefix
    self.search_dirs = search_dirs
    self.default_settings_kwargs = kwargs

    # Directories may be given as strings; store them uniformly as Paths
    self.lang_dirs = {
        dir_lang: Path(dir_path) for dir_lang, dir_path in (lang_dirs or {}).items()
    }

    # Keep the caller's mapping object when one is supplied (it is mutated
    # later by get_settings), otherwise start with an empty cache.
    self.settings = {} if settings is None else settings
def sentences(
    self,
    graph: GraphType,
    root: Node,
    major_breaks: bool = True,
    minor_breaks: bool = True,
    punctuations: bool = True,
    explicit_lang: bool = True,
    phonemes: bool = True,
    break_phonemes: bool = True,
    pos: bool = True,
) -> typing.Iterable[Sentence]:
    """
    Walk a processed graph and collect its sentences.

    Args:
        graph: text graph produced by process
        root: root node of the graph
        major_breaks: False if major break words should be left out
        minor_breaks: False if minor break words should be left out
        punctuations: False if punctuation words should be left out
        explicit_lang: False if the default language should be reported as ""
        phonemes: False if word phonemes should be left out
        break_phonemes: False if break words should not receive phonemes
        pos: False if part of speech tags should be left out

    Returns:
        list of sentences (each holding its words) in document order
    """
    scope_types = (SentenceNode, ParagraphNode, SpeakNode)

    results: typing.List[Sentence] = []
    par_idx: int = -1
    sent_idx: int = 0

    # Pauses/marks seen before the sentence or word they belong to exists
    sent_pause_before_ms: int = 0
    sent_marks_before: typing.List[str] = []
    word_pause_before_ms: int = 0
    word_marks_before: typing.List[str] = []

    def get_lang(lang: str) -> str:
        # The default language is reported as "" unless explicit_lang is set
        if (not explicit_lang) and (lang == self.default_lang):
            return ""

        return lang

    def add_word(**word_kwargs) -> None:
        """Append a word to the latest sentence and consume pending pause/marks"""
        nonlocal word_pause_before_ms, word_marks_before

        current = results[-1]
        current.words.append(
            Word(
                idx=len(current.words),
                sent_idx=current.idx,
                par_idx=current.par_idx,
                pause_before_ms=word_pause_before_ms,
                marks_before=(word_marks_before or None),
                **word_kwargs,
            )
        )
        word_pause_before_ms = 0
        word_marks_before = []

    def add_mark_after(target, name: str) -> None:
        """Record a mark that follows a word or sentence"""
        if target.marks_after is None:
            target.marks_after = []

        target.marks_after.append(name)

    def is_last_child(parent: Node, child: Node) -> bool:
        """True if child is reached by the final outgoing edge of parent"""
        parent_edges = list(graph.out_edges(parent.node))
        child_idx = parent_edges.index((parent.node, child.node))
        return child_idx == (len(parent_edges) - 1)

    for dfs_node in nx.dfs_preorder_nodes(graph, root.node):
        node = graph.nodes[dfs_node][DATA_PROP]

        if isinstance(node, ParagraphNode):
            # Sentence numbering restarts in every paragraph
            par_idx += 1
            sent_idx = 0
            continue

        if isinstance(node, SentenceNode):
            # New sentence; text fields are filled in after the walk
            results.append(
                Sentence(
                    idx=sent_idx,
                    par_idx=par_idx,
                    text="",
                    text_with_ws="",
                    text_spoken="",
                    voice=node.voice,
                    lang=get_lang(node.lang),
                    pause_before_ms=sent_pause_before_ms,
                    marks_before=(sent_marks_before or None),
                )
            )
            sent_pause_before_ms = 0
            sent_marks_before = []
            sent_idx += 1
            continue

        if graph.out_degree(dfs_node) != 0:
            # Only leaves carry words, breaks, and marks
            continue

        if isinstance(node, WordNode):
            assert results, "No sentence"
            word_node = typing.cast(WordNode, node)
            add_word(
                text=word_node.text,
                text_with_ws=word_node.text_with_ws,
                phonemes=word_node.phonemes if phonemes else None,
                pos=word_node.pos if pos else None,
                lang=get_lang(node.lang),
                voice=node.voice,
            )
        elif isinstance(node, BreakWordNode):
            assert results, "No sentence"
            break_word_node = typing.cast(BreakWordNode, node)
            is_minor_break = break_word_node.break_type == BreakType.MINOR
            is_major_break = break_word_node.break_type == BreakType.MAJOR

            if (minor_breaks and is_minor_break) or (major_breaks and is_major_break):
                if phonemes and break_phonemes:
                    break_word_phonemes = self._phonemes_for_break(
                        break_word_node.break_type, lang=break_word_node.lang
                    )
                else:
                    break_word_phonemes = None

                add_word(
                    text=break_word_node.text,
                    text_with_ws=break_word_node.text_with_ws,
                    phonemes=break_word_phonemes,
                    is_minor_break=is_minor_break,
                    is_major_break=is_major_break,
                    lang=get_lang(node.lang),
                    voice=node.voice,
                )
        elif punctuations and isinstance(node, PunctuationWordNode):
            assert results, "No sentence"
            punct_word_node = typing.cast(PunctuationWordNode, node)
            add_word(
                text=punct_word_node.text,
                text_with_ws=punct_word_node.text_with_ws,
                is_punctuation=True,
                lang=get_lang(punct_word_node.lang),
            )
        elif isinstance(node, BreakNode):
            # Pause for some time
            break_node = typing.cast(BreakNode, node)
            break_parent = self._find_parent(graph, node, scope_types)
            if break_parent is None:
                continue

            break_ms = break_node.get_milliseconds()
            at_end = is_last_child(break_parent, break_node)

            if isinstance(break_parent, SentenceNode):
                assert results
                current_sentence = results[-1]
                if at_end:
                    # End of sentence: pause after the sentence
                    current_sentence.pause_after_ms += break_ms
                elif current_sentence.words:
                    # Between words: pause after the previous word
                    current_sentence.words[-1].pause_after_ms += break_ms
                else:
                    # Before the first word: pause before the next word
                    word_pause_before_ms += break_ms
            elif isinstance(break_parent, ParagraphNode):
                if results and (results[-1].par_idx == par_idx):
                    # Between sentences of this paragraph: pause after the previous one
                    results[-1].pause_after_ms += break_ms
                else:
                    # Pause before the next sentence
                    sent_pause_before_ms += break_ms
            elif isinstance(break_parent, SpeakNode):
                if results:
                    # After paragraphs or sentences
                    results[-1].pause_after_ms += break_ms
                else:
                    # Before any paragraphs or sentences
                    sent_pause_before_ms += break_ms
        elif isinstance(node, MarkNode):
            # User-defined mark
            mark_node = typing.cast(MarkNode, node)
            mark_name = mark_node.name
            mark_parent = self._find_parent(graph, node, scope_types)
            if mark_parent is None:
                continue

            at_end = is_last_child(mark_parent, mark_node)

            if isinstance(mark_parent, SentenceNode):
                assert results
                current_sentence = results[-1]
                if at_end:
                    # End of sentence: mark after the sentence
                    add_mark_after(current_sentence, mark_name)
                elif current_sentence.words:
                    # Between words: mark after the previous word
                    add_mark_after(current_sentence.words[-1], mark_name)
                else:
                    # Before the first word: mark before the next word
                    word_marks_before.append(mark_name)
            elif isinstance(mark_parent, ParagraphNode):
                if results and (results[-1].par_idx == par_idx):
                    # Between sentences of this paragraph: mark after the previous one
                    add_mark_after(results[-1], mark_name)
                else:
                    # Mark before the next sentence
                    sent_marks_before.append(mark_name)
            elif isinstance(mark_parent, SpeakNode):
                if results:
                    # After paragraphs or sentences
                    add_mark_after(results[-1], mark_name)
                else:
                    # Before any paragraphs or sentences
                    sent_marks_before.append(mark_name)

    # Post-process sentences to fix up text, voice, etc.
    for sentence in results:
        settings = self.get_settings(sentence.lang)

        if settings.keep_whitespace:
            # Whitespace was preserved on each word
            sentence.text_with_ws = "".join(w.text_with_ws for w in sentence.words)
        else:
            # Best guess: the join string goes before every spoken word except
            # the first piece, which keeps punctuation attached to words.
            pieces: typing.List[str] = []
            for word in sentence.words:
                if word.is_spoken and pieces:
                    pieces.append(f"{settings.join_str}{word.text}")
                else:
                    pieces.append(word.text)

            sentence.text_with_ws = "".join(pieces)

        sentence.text = settings.normalize_whitespace(sentence.text_with_ws)
        sentence.text_spoken = settings.join_str.join(
            w.text for w in sentence.words if w.is_spoken
        )

        # Find the single voice used by the sentence and its words ("" if mixed)
        sent_voice = sentence.voice
        for word in sentence.words:
            if not word.voice:
                continue

            if sent_voice and (sent_voice != word.voice):
                # Multiple voices
                sent_voice = ""
                break

            sent_voice = word.voice

        if sent_voice:
            # One voice throughout: apply it to the sentence and all words
            sentence.voice = sent_voice
            for word in sentence.words:
                word.voice = sent_voice

    return results
def words(self, graph: GraphType, root: Node, **kwargs) -> typing.Iterable[Word]:
    """Yield every word of every sentence; kwargs are passed on to sentences()"""
    for sentence in self.sentences(graph, root, **kwargs):
        yield from sentence
def get_settings(self, lang: typing.Optional[str] = None) -> TextProcessorSettings:
    """
    Return the settings for a language, creating defaults on first use.

    Lookup order: the language as given, then its resolved form, and finally
    newly created default settings (cached under both names).
    """
    lang = lang or self.default_lang

    cached = self.settings.get(lang)
    if cached is not None:
        return cached

    # Try again with resolved language
    resolved_lang = resolve_lang(lang)
    cached = self.settings.get(resolved_lang)
    if cached is not None:
        # Remember the alias so the next lookup succeeds immediately
        self.settings[lang] = cached
        return cached

    _LOGGER.debug(
        "No custom settings for language %s (%s). Creating default settings.",
        lang,
        resolved_lang,
    )

    # Create default settings for language
    created = get_settings(
        lang,
        lang_dir=self.lang_dirs.get(lang),
        model_prefix=self.model_prefix,
        search_dirs=self.search_dirs,
        **self.default_settings_kwargs,
    )

    for settings_key in (lang, resolved_lang):
        self.settings[settings_key] = created

    return created
# -------------------------------------------------------------------------
# Processing
# -------------------------------------------------------------------------
def __call__(self, *args, **kwargs):
    """Process text or SSML; all arguments are forwarded to process()"""
    graph_and_root = self.process(*args, **kwargs)
    return graph_and_root
def process(
    self,
    text: str,
    lang: typing.Optional[str] = None,
    ssml: bool = False,
    pos: bool = True,
    phonemize: bool = True,
    post_process: bool = True,
    add_speak_tag: bool = True,
    detect_numbers: bool = True,
    detect_currency: bool = True,
    detect_dates: bool = True,
    detect_times: bool = True,
    verbalize_numbers: bool = True,
    verbalize_currency: bool = True,
    verbalize_dates: bool = True,
    verbalize_times: bool = True,
    max_passes: int = 5,
) -> typing.Tuple[GraphType, Node]:
    """
    Processes text or SSML

    Args:
        text: input text or SSML (ssml=True)
        lang: default language of input text
        ssml: True if input text is SSML
        pos: False if part of speech tagging should be disabled
        phonemize: False if phonemization should be disabled
        post_process: False if sentence/graph post-processing should be disabled
        add_speak_tag: True if <speak> should be automatically added to input text when ssml=True
        detect_numbers: True if numbers should be annotated in text (interpret_as="number")
        detect_currency: True if currency amounts should be annotated in text (interpret_as="currency")
        detect_dates: True if dates should be annotated in text (interpret_as="date")
        detect_times: True if clock times should be annotated in text (interpret_as="time")
        verbalize_numbers: True if annotated numbers should be expanded into words
        verbalize_currency: True if annotated currency amounts should be expanded into words
        verbalize_dates: True if annotated dates should be expanded into words
        verbalize_times: True if annotated clock times should be expanded into words
        max_passes: maximum number of split/transform passes over the graph (stops early once a pass changes nothing)

    Returns:
        graph, root: text graph and root node
    """
    if ssml:
        try:
            root_element = etree.fromstring(text)
        except Exception:
            if add_speak_tag:
                # Try wrapping text in <speak> and parsing again
                root_element = etree.fromstring(f"<speak>{text}</speak>")
            else:
                # Log and re-raise exception
                _LOGGER.exception("TextProcessor.process")
                raise

        def iter_elements():
            yield from text_and_elements(root_element)

    else:
        # Not XML
        def iter_elements():
            yield text

    graph = typing.cast(GraphType, nx.DiGraph())

    # Parse XML
    last_paragraph: typing.Optional[ParagraphNode] = None
    last_sentence: typing.Optional[SentenceNode] = None
    last_speak: typing.Optional[SpeakNode] = None
    root: typing.Optional[SpeakNode] = None
    parsing_state = SSMLParsingState.DEFAULT

    # [voice]
    voice_stack: typing.List[str] = []

    # [(interpret_as, format)]
    say_as_stack: typing.List[typing.Tuple[str, str]] = []

    # [(tag, lang)]
    lang_stack: typing.List[typing.Tuple[str, str]] = []

    # Language in effect whenever no element on the stack overrides it.
    # This honors the caller's lang argument, not just self.default_lang.
    base_lang: str = lang or self.default_lang
    current_lang: str = base_lang

    # [lexicon.id]
    lookup_stack: typing.List[str] = []
    lexicon_id: typing.Optional[str] = None
    lexeme: typing.Optional[Lexeme] = None

    # id -> lexicon
    inline_lexicons: typing.Dict[str, InlineLexicon] = {}

    # True if current word is the last one
    is_last_word: bool = False

    # Current word's role
    word_role: typing.Optional[str] = None

    # Alias from <sub>
    last_alias: typing.Optional[str] = None

    # Used to skip <metadata>
    skip_elements: bool = False

    # Phonemes to use for next word(s)
    word_phonemes: typing.Optional[typing.List[typing.List[str]]] = None

    # Create __init__ args for new Node
    def scope_kwargs(target_class):
        scope = {}
        if voice_stack:
            scope["voice"] = voice_stack[-1]

        scope["lang"] = current_lang

        if target_class is WordNode:
            if say_as_stack:
                scope["interpret_as"], scope["format"] = say_as_stack[-1]

            if word_role is not None:
                scope["role"] = word_role

            if lookup_stack:
                # Lexicon ids in order of look up
                scope["lexicon_ids"] = list(reversed(lookup_stack))

        return scope

    def in_inline_lexicon(
        word_text: str, word_role: typing.Optional[str] = None
    ) -> bool:
        if inline_lexicons:
            for inline_lexicon_id in itertools.chain(
                lookup_stack, [DEFAULT_LEXICON_ID]
            ):
                maybe_lexicon = inline_lexicons.get(inline_lexicon_id)
                if maybe_lexicon is None:
                    continue

                maybe_role_phonemes = maybe_lexicon.words.get(word_text)
                if maybe_role_phonemes is None:
                    continue

                if (word_role is not None) and (word_role in maybe_role_phonemes):
                    # Role-specific pronunciation
                    return True

                if WordRole.DEFAULT in maybe_role_phonemes:
                    # Default pronunciation
                    return True

        # No inline pronunciation
        return False

    # Process sub-elements and text chunks
    for elem_or_text in iter_elements():
        if isinstance(elem_or_text, str):
            if skip_elements:
                # Inside <metadata>
                continue

            # Text chunk (kept separate from the `text` argument on purpose)
            chunk_text = typing.cast(str, elem_or_text)

            # <grapheme> inside <lexicon>
            if parsing_state == SSMLParsingState.IN_LEXICON_GRAPHEME:
                assert lexeme is not None
                lexeme.grapheme = chunk_text.strip()
                continue

            # <phoneme> inside <lexicon>
            if parsing_state == SSMLParsingState.IN_LEXICON_PHONEME:
                assert lexeme is not None
                chunk_text = chunk_text.strip()

                # Phonemes will be split on whitespace if at least one
                # space is present, otherwise assume phonemes =
                # graphemes.
                lexeme.phonemes = maybe_split_ipa(chunk_text)
                continue

            if last_alias is not None:
                # Inside a <sub>
                chunk_text = last_alias

            if last_speak is None:
                # Implicit <speak>
                last_speak = SpeakNode(node=len(graph), implicit=True)
                graph.add_node(last_speak.node, data=last_speak)
                if root is None:
                    root = last_speak

            assert last_speak is not None

            if last_paragraph is None:
                # Implicit <p>
                p_node = ParagraphNode(
                    node=len(graph), implicit=True, **scope_kwargs(ParagraphNode)
                )
                graph.add_node(p_node.node, data=p_node)
                graph.add_edge(last_speak.node, p_node.node)
                last_paragraph = p_node

            assert last_paragraph is not None

            if last_sentence is None:
                # Implicit <s>
                s_node = SentenceNode(
                    node=len(graph), implicit=True, **scope_kwargs(SentenceNode)
                )
                graph.add_node(s_node.node, data=s_node)
                graph.add_edge(last_paragraph.node, s_node.node)
                last_sentence = s_node

            assert last_sentence is not None

            if parsing_state == SSMLParsingState.IN_WORD:
                # No splitting
                word_text = chunk_text
                settings = self.get_settings(current_lang)
                if (
                    settings.keep_whitespace
                    and (not is_last_word)
                    and (not word_text.endswith(settings.join_str))
                ):
                    word_text += settings.join_str

                word_kwargs = scope_kwargs(WordNode)
                if word_phonemes:
                    word_kwargs["phonemes"] = word_phonemes.pop()

                word_text_norm = settings.normalize_whitespace(word_text)
                word_node = WordNode(
                    node=len(graph),
                    text=word_text_norm,
                    text_with_ws=word_text,
                    in_lexicon=(
                        in_inline_lexicon(word_text_norm, word_role)
                        or self._is_word_in_lexicon(word_text_norm, settings)
                    ),
                    **word_kwargs,
                )
                graph.add_node(word_node.node, data=word_node)
                graph.add_edge(last_sentence.node, word_node.node)
            else:
                # Split by whitespace
                self._pipeline_tokenize(
                    graph,
                    last_sentence,
                    chunk_text,
                    word_phonemes=word_phonemes,
                    scope_kwargs=scope_kwargs(WordNode),
                    in_inline_lexicon=in_inline_lexicon,
                )
        elif isinstance(elem_or_text, EndElement):
            # End of an element (e.g., </s>)
            end_elem = typing.cast(EndElement, elem_or_text)
            end_tag = tag_no_namespace(end_elem.element.tag)

            if end_tag == "voice":
                if voice_stack:
                    voice_stack.pop()
            elif end_tag == "say-as":
                if say_as_stack:
                    say_as_stack.pop()
            elif end_tag == "lookup":
                if lookup_stack:
                    lookup_stack.pop()
            elif end_tag == "lexicon":
                # Done parsing <lexicon>
                parsing_state = SSMLParsingState.DEFAULT
                lexicon_id = None
            elif (end_tag == "grapheme") and (
                parsing_state == SSMLParsingState.IN_LEXICON_GRAPHEME
            ):
                # Done with lexicon grapheme
                parsing_state = SSMLParsingState.IN_LEXICON
            elif (end_tag == "phoneme") and (
                parsing_state == SSMLParsingState.IN_LEXICON_PHONEME
            ):
                # Done with lexicon phoneme
                parsing_state = SSMLParsingState.IN_LEXICON
            elif (end_tag == "lexeme") and (
                parsing_state == SSMLParsingState.IN_LEXICON
            ):
                # Done with lexicon entry
                assert lexeme is not None, "No lexeme"
                assert (
                    lexeme.phonemes is not None
                ), f"No phoneme for lexeme: {lexeme}"
                assert lexicon_id is not None, "No lexicon id"

                lexicon = inline_lexicons.get(lexicon_id)
                assert lexicon is not None, f"No lexicon for id {lexicon_id}"

                # Get or create role -> phonemes map
                role_phonemes: typing.Dict[str, PHONEMES_TYPE] = lexicon.words.get(
                    lexeme.grapheme, {}
                )

                if lexeme.roles:
                    # Add phonemes for each role
                    for role in lexeme.roles:
                        role_phonemes[role] = lexeme.phonemes
                else:
                    # Default (empty) role only
                    role_phonemes[WordRole.DEFAULT] = lexeme.phonemes

                lexicon.words[lexeme.grapheme] = role_phonemes

                # Reset state
                lexeme = None
            else:
                if lang_stack and (lang_stack[-1][0] == end_tag):
                    lang_stack.pop()

                if lang_stack:
                    current_lang = lang_stack[-1][1]  # tag, lang
                else:
                    # Fall back to the language this call started with so the
                    # caller's `lang` argument survives the end of an element.
                    current_lang = base_lang

                if end_tag in {"w", "token"}:
                    # End of word
                    parsing_state = SSMLParsingState.DEFAULT
                    is_last_word = False
                    word_role = None
                elif end_tag == "s":
                    # End of sentence
                    last_sentence = None
                elif end_tag == "p":
                    # End of paragraph
                    last_paragraph = None
                elif end_tag == "speak":
                    # End of speak
                    last_speak = root
                elif end_tag == "sub":
                    # End of sub
                    last_alias = None
                elif end_tag in {"metadata", "meta"}:
                    # End of metadata
                    skip_elements = False
                elif end_tag == "phoneme":
                    # End of phoneme
                    word_phonemes = None
        else:
            if skip_elements:
                # Inside <metadata>
                continue

            # Start of an element (e.g., <p>)
            elem, elem_metadata = elem_or_text
            elem = typing.cast(etree.Element, elem)

            # Optional metadata for the element
            elem_metadata = typing.cast(
                typing.Optional[typing.Dict[str, typing.Any]], elem_metadata
            )

            elem_tag = tag_no_namespace(elem.tag)

            if elem_tag == "speak":
                # Explicit <speak>
                maybe_lang = attrib_no_namespace(elem, "lang")
                if maybe_lang:
                    lang_stack.append((elem_tag, maybe_lang))
                    current_lang = maybe_lang

                speak_node = SpeakNode(
                    node=len(graph), element=elem, **scope_kwargs(SpeakNode)
                )
                if root is None:
                    root = speak_node

                graph.add_node(speak_node.node, data=root)
                last_speak = root
            elif elem_tag == "voice":
                # Set voice scope
                voice_name = attrib_no_namespace(elem, "name")
                voice_stack.append(voice_name)
            elif elem_tag == "p":
                # Explicit paragraph
                if last_speak is None:
                    # Implicit <speak>
                    last_speak = SpeakNode(node=len(graph), implicit=True)
                    graph.add_node(last_speak.node, data=last_speak)
                    if root is None:
                        root = last_speak

                assert last_speak is not None

                maybe_lang = attrib_no_namespace(elem, "lang")
                if maybe_lang:
                    lang_stack.append((elem_tag, maybe_lang))
                    current_lang = maybe_lang

                p_node = ParagraphNode(
                    node=len(graph), element=elem, **scope_kwargs(ParagraphNode)
                )
                graph.add_node(p_node.node, data=p_node)
                graph.add_edge(last_speak.node, p_node.node)
                last_paragraph = p_node

                # Force a new sentence to begin
                last_sentence = None
            elif elem_tag == "s":
                # Explicit sentence
                if last_speak is None:
                    # Implicit <speak>
                    last_speak = SpeakNode(node=len(graph), implicit=True)
                    graph.add_node(last_speak.node, data=last_speak)
                    if root is None:
                        root = last_speak

                assert last_speak is not None

                if last_paragraph is None:
                    # Implicit paragraph
                    # NOTE(review): unlike the text-chunk path, implicit=True is
                    # not passed here - confirm whether that is intentional.
                    p_node = ParagraphNode(
                        node=len(graph), **scope_kwargs(ParagraphNode)
                    )
                    graph.add_node(p_node.node, data=p_node)
                    graph.add_edge(last_speak.node, p_node.node)
                    last_paragraph = p_node

                maybe_lang = attrib_no_namespace(elem, "lang")
                if maybe_lang:
                    lang_stack.append((elem_tag, maybe_lang))
                    current_lang = maybe_lang

                s_node = SentenceNode(
                    node=len(graph), element=elem, **scope_kwargs(SentenceNode)
                )
                graph.add_node(s_node.node, data=s_node)
                graph.add_edge(last_paragraph.node, s_node.node)
                last_sentence = s_node
            elif elem_tag in {"w", "token"}:
                # Explicit word
                parsing_state = SSMLParsingState.IN_WORD
                is_last_word = (
                    elem_metadata.get("is_last", False) if elem_metadata else False
                )

                maybe_lang = attrib_no_namespace(elem, "lang")
                if maybe_lang:
                    lang_stack.append((elem_tag, maybe_lang))
                    current_lang = maybe_lang

                word_role = attrib_no_namespace(elem, "role")
            elif elem_tag == "break":
                # Break
                last_target = last_sentence or last_paragraph or last_speak
                assert last_target is not None
                break_node = BreakNode(
                    node=len(graph),
                    element=elem,
                    time=attrib_no_namespace(elem, "time", ""),
                )
                graph.add_node(break_node.node, data=break_node)
                graph.add_edge(last_target.node, break_node.node)
            elif elem_tag == "mark":
                # Mark
                last_target = last_sentence or last_paragraph or last_speak
                assert last_target is not None
                mark_node = MarkNode(
                    node=len(graph),
                    element=elem,
                    name=attrib_no_namespace(elem, "name", ""),
                )
                graph.add_node(mark_node.node, data=mark_node)
                graph.add_edge(last_target.node, mark_node.node)
            elif elem_tag == "say-as":
                say_as_stack.append(
                    (
                        attrib_no_namespace(elem, "interpret-as", ""),
                        attrib_no_namespace(elem, "format", ""),
                    )
                )
            elif elem_tag == "sub":
                # Sub
                last_alias = attrib_no_namespace(elem, "alias", "")
            elif elem_tag in {"metadata", "meta"}:
                # Metadata
                skip_elements = True
            elif (elem_tag == "phoneme") and (
                parsing_state != SSMLParsingState.IN_LEXICON
            ):
                # Phonemes
                word_phonemes_strs = attrib_no_namespace(elem, "ph", "").split()
                if word_phonemes_strs:
                    # Phonemes will be split on whitespace if at least one
                    # space is present, otherwise assume phonemes =
                    # graphemes.
                    word_phonemes = [
                        maybe_split_ipa(phoneme_str)
                        for phoneme_str in word_phonemes_strs
                    ]
                else:
                    word_phonemes = None
            elif elem_tag == "lang":
                # Set language
                maybe_lang = attrib_no_namespace(elem, "lang", "")
                if maybe_lang:
                    lang_stack.append((elem_tag, maybe_lang))
                    current_lang = maybe_lang
            elif elem_tag == "lookup":
                lookup_id = attrib_no_namespace(elem, "ref")
                assert lookup_id is not None, f"Lookup id required ({elem})"
                lookup_stack.append(lookup_id)
            elif elem_tag == "lexicon":
                # Inline pronunciation lexicon
                # NOTE: Empty lexicon id means the "default" inline lexicon (<lookup> not required)
                lexicon_id = attrib_no_namespace(elem, "id", DEFAULT_LEXICON_ID)
                assert lexicon_id is not None

                lexicon_alphabet = (
                    attrib_no_namespace(elem, "alphabet", "").strip().lower()
                )
                inline_lexicons[lexicon_id] = InlineLexicon(
                    lexicon_id=lexicon_id, alphabet=lexicon_alphabet
                )

                lexicon_uri = attrib_no_namespace(elem, "uri", "")
                if lexicon_uri:
                    # Lexicon defined externally
                    _LOGGER.debug(
                        "Loading pronunciation lexicon from %s", lexicon_uri
                    )
                    load_lexicon(lexicon_uri, inline_lexicons[lexicon_id])
                else:
                    # Lexicon defined within this document
                    parsing_state = SSMLParsingState.IN_LEXICON
            elif (elem_tag == "lexeme") and (
                parsing_state == SSMLParsingState.IN_LEXICON
            ):
                if lexeme is None:
                    lexeme = Lexeme()

                role_str = attrib_no_namespace(elem, "role")
                if role_str:
                    lexeme.roles = set(role_str.strip().split())
            elif (elem_tag == "grapheme") and (
                parsing_state == SSMLParsingState.IN_LEXICON
            ):
                # Inline pronunciation lexicon (grapheme)
                parsing_state = SSMLParsingState.IN_LEXICON_GRAPHEME
                if lexeme is None:
                    lexeme = Lexeme()
            elif (elem_tag == "phoneme") and (
                parsing_state == SSMLParsingState.IN_LEXICON
            ):
                # Inline pronunciation lexicon (phoneme)
                parsing_state = SSMLParsingState.IN_LEXICON_PHONEME
                if lexeme is None:
                    lexeme = Lexeme()

    assert root is not None

    # Do multiple passes over the graph
    num_passes_left = max_passes
    while num_passes_left > 0:
        was_changed = False

        # Do replacements before minor/major breaks
        if pipeline_split(self._split_replacements, graph, root):
            was_changed = True

        # Split punctuations (quotes, etc.) before breaks
        if pipeline_split(self._split_punctuations, graph, root):
            was_changed = True

        # Split on minor breaks (commas, etc.)
        if pipeline_split(self._split_minor_breaks, graph, root):
            was_changed = True

        # Expand abbreviations before major breaks
        if pipeline_split(self._split_abbreviations, graph, root):
            was_changed = True

        # Break apart initialisms (e.g., TTS or T.T.S.) before major breaks
        if pipeline_split(self._split_initialism, graph, root):
            was_changed = True

        # Split on major breaks (periods, etc.)
        if pipeline_split(self._split_major_breaks, graph, root):
            was_changed = True

        # Break apart sentences using BreakWordNodes
        if self._break_sentences(graph, root):
            was_changed = True

        # spell-out (e.g., abc -> a b c) before number expansion
        if pipeline_split(self._split_spell_out, graph, root):
            was_changed = True

        # Transform text into known classes.
        #
        # The order here is very important, since words with "interpret_as"
        # set will be skipped by later transformations.
        #
        # Dates are detected first so words like "1.1.2000" are not parsed
        # as numbers by Babel (the de_DE locale will parse this as 112000).
        #
        if detect_dates:
            if pipeline_transform(self._transform_date, graph, root):
                was_changed = True

        if detect_currency:
            if pipeline_transform(self._transform_currency, graph, root):
                was_changed = True

        if detect_numbers:
            if pipeline_transform(self._transform_number, graph, root):
                was_changed = True

        if detect_times:
            if pipeline_transform(self._transform_time, graph, root):
                was_changed = True

        # Verbalize known classes
        if verbalize_dates:
            if pipeline_transform(self._verbalize_date, graph, root):
                was_changed = True

        if verbalize_times:
            if pipeline_transform(self._verbalize_time, graph, root):
                was_changed = True

        if verbalize_numbers:
            if pipeline_transform(self._verbalize_number, graph, root):
                was_changed = True

        if verbalize_currency:
            if pipeline_transform(self._verbalize_currency, graph, root):
                was_changed = True

        # Break apart words
        if pipeline_split(self._break_words, graph, root):
            was_changed = True

        # Ignore non-words
        if pipeline_split(self._split_ignore_non_words, graph, root):
            was_changed = True

        if not was_changed:
            # No changes, so we can stop
            break

        num_passes_left -= 1

    # Gather words from leaves of the tree, group by sentence
    def process_sentence(words: typing.List[WordNode], sentence_lang: str = ""):
        """Tag and phonemize the (non-empty) list of words of one sentence"""
        if pos:
            # Use the language of the sentence that owns these words. The
            # enclosing loop variable must not be read here: by the time this
            # runs it already points at the next sentence or the last node.
            pos_settings = self.get_settings(sentence_lang or words[0].lang)
            if pos_settings.get_parts_of_speech is not None:
                pos_tags = pos_settings.get_parts_of_speech(
                    [word.text for word in words]
                )
                for word, pos_tag in zip(words, pos_tags):
                    word.pos = pos_tag

                    if not word.role:
                        word.role = f"gruut:{pos_tag}"

        if phonemize:
            # Add phonemes to word
            for word in words:
                if word.phonemes:
                    # Word already has phonemes
                    continue

                lexicon_ids: typing.List[str] = []
                if word.lexicon_ids:
                    lexicon_ids.extend(word.lexicon_ids)

                lexicon_ids.append(DEFAULT_LEXICON_ID)

                # Look up phonemes from inline <lexicon>
                for lexicon_id in lexicon_ids:
                    lexicon = inline_lexicons.get(lexicon_id)
                    if lexicon is None:
                        continue

                    maybe_role_phonemes = lexicon.words.get(word.text)
                    if maybe_role_phonemes is None:
                        continue

                    maybe_phonemes = maybe_role_phonemes.get(word.role)
                    if (maybe_phonemes is None) and (word.role != WordRole.DEFAULT):
                        # Try again with default role
                        maybe_phonemes = maybe_role_phonemes.get(WordRole.DEFAULT)

                    if maybe_phonemes is not None:
                        # Found inline pronunciation
                        word.phonemes = maybe_phonemes
                        break

                if word.phonemes:
                    # Got phonemes from inline lexicon
                    continue

                phonemize_settings = self.get_settings(word.lang)
                if phonemize_settings.lookup_phonemes is not None:
                    word.phonemes = phonemize_settings.lookup_phonemes(
                        word.text, word.role
                    )

                if (not word.phonemes) and (
                    phonemize_settings.guess_phonemes is not None
                ):
                    word.phonemes = phonemize_settings.guess_phonemes(
                        word.text, word.role
                    )

    # Process tree leaves
    sentence_words: typing.List[WordNode] = []

    # Language of the sentence whose words are currently being collected
    sentence_lang: str = ""

    for dfs_node in nx.dfs_preorder_nodes(graph, root.node):
        node = graph.nodes[dfs_node][DATA_PROP]
        if isinstance(node, SentenceNode):
            if sentence_words:
                # Flush the previous sentence using *its* language
                process_sentence(sentence_words, sentence_lang)
                sentence_words = []

            sentence_lang = node.lang
        elif graph.out_degree(dfs_node) == 0:
            if isinstance(node, WordNode):
                word_node = typing.cast(WordNode, node)
                sentence_words.append(word_node)

    if sentence_words:
        # Final sentence
        process_sentence(sentence_words, sentence_lang)
        sentence_words = []

    if post_process:
        # Post-process sentences
        for dfs_node in nx.dfs_preorder_nodes(graph, root.node):
            node = graph.nodes[dfs_node][DATA_PROP]
            if isinstance(node, SentenceNode):
                sent_node = typing.cast(SentenceNode, node)
                sent_settings = self.get_settings(sent_node.lang)
                if sent_settings.post_process_sentence is not None:
                    sent_settings.post_process_sentence(
                        graph, sent_node, sent_settings
                    )

        # Post process entire graph
        self.post_process_graph(graph, root)

    return graph, root
def post_process_graph(self, graph: GraphType, root: Node):
    """
    User-defined post-processing of entire graph.

    Does nothing here; process() calls it last when post_process is True.
    """
    return None
# -------------------------------------------------------------------------
# Pipeline (custom)
# -------------------------------------------------------------------------
def _break_sentences(self, graph: GraphType, root: Node) -> bool:
    """
    Break sentences apart at BreakWordNode(break_type="major") nodes.

    Only implicit sentences are split; explicit ones are left intact.
    Everything after a major break moves into a new implicit sentence placed
    right after the current one in the parent paragraph. The new sentence
    inherits the language and voice of the sentence it was split from.

    Args:
        graph: text graph (modified in place)
        root: node whose leaves are searched for major breaks

    Returns:
        True if at least one sentence was split
    """
    was_changed = False

    # This involves:
    # 1. Identifying where in the edge list of sentence the break occurs
    # 2. Creating a new sentence next to the existing one in the parent paragraph
    # 3. Moving everything after the break into the new sentence
    #
    # The leaves are copied into a list first because the graph is modified
    # while looping.
    for leaf_node in list(leaves(graph, root)):
        if not isinstance(leaf_node, BreakWordNode):
            # Not a break
            continue

        break_word_node = typing.cast(BreakWordNode, leaf_node)
        if break_word_node.break_type != BreakType.MAJOR:
            # Not a major break
            continue

        # Get the path from the break up to the nearest sentence
        parent_node: int = next(iter(graph.predecessors(break_word_node.node)))
        parent: Node = graph.nodes[parent_node][DATA_PROP]
        s_path: typing.List[Node] = [parent]

        while not isinstance(parent, SentenceNode):
            parent_node = next(iter(graph.predecessors(parent_node)))
            parent = graph.nodes[parent_node][DATA_PROP]
            s_path.append(parent)

        # Should at least be [WordNode, SentenceNode]
        assert len(s_path) >= 2
        s_node = s_path[-1]
        assert isinstance(s_node, SentenceNode)

        if not s_node.implicit:
            # Don't break apart explicit sentences
            continue

        # Probably a WordNode
        below_s_node = s_path[-2]

        # Edges after the break will need to be moved to the new sentence
        s_edges = list(graph.out_edges(s_node.node))
        break_edge_idx = s_edges.index((s_node.node, below_s_node.node))
        edges_to_move = s_edges[break_edge_idx + 1 :]
        if not edges_to_move:
            # Final sentence, nothing to move
            continue

        # Locate parent paragraph so we can create a new sentence
        p_node = self._find_parent(graph, s_node, ParagraphNode)
        assert p_node is not None

        # Find the index of the edge between the paragraph and the current sentence
        p_s_edge = (p_node.node, s_node.node)
        p_edges = list(graph.out_edges(p_node.node))
        s_edge_idx = p_edges.index(p_s_edge)

        # Remove existing edges from the paragraph so that they can be added
        # back in order with the new sentence in the right position
        graph.remove_edges_from(p_edges)

        # Create a sentence and add an edge to it right after the current
        # sentence. Language and voice are copied from the original sentence;
        # without them the new sentence would fall back to the default language.
        new_s_node = SentenceNode(
            node=len(graph),
            implicit=True,
            lang=s_node.lang,
            voice=s_node.voice,
        )
        graph.add_node(new_s_node.node, data=new_s_node)
        p_edges.insert(s_edge_idx + 1, (p_node.node, new_s_node.node))

        # Insert paragraph edges with new sentence
        graph.add_edges_from(p_edges)

        # Move edges from current sentence to new sentence
        graph.remove_edges_from(edges_to_move)
        graph.add_edges_from([(new_s_node.node, v) for (_u, v) in edges_to_move])

        was_changed = True

    return was_changed
def _break_words(self, graph: GraphType, node: Node):
    """Break apart a word according to the language's word breaks pattern.

    This is a generator: it yields ``(WordNode, kwargs)`` pairs, one for each
    non-empty piece of the word. Nothing is yielded when ``node`` is not a
    ``WordNode``, when the word is spoken for (``interpret_as`` set or found
    in a lexicon), when it is an explicit word, when the language has no
    ``word_breaks_pattern``, or when the pattern does not split the text.

    When ``keep_whitespace`` is enabled, the word's original leading
    whitespace is put on the first yielded piece, its original trailing
    whitespace on the last yielded piece, and ``join_str`` after every other
    piece.

    ``graph`` is not used here.
    """
    if not isinstance(node, WordNode):
        return

    word = typing.cast(WordNode, node)
    if word.interpret_as or word.in_lexicon or (not word.implicit):
        # Don't interpret words that are spoken for or explicit words (<w>)
        return

    settings = self.get_settings(word.lang)
    if settings.word_breaks_pattern is None:
        # No pattern set for this language
        return

    parts = settings.word_breaks_pattern.split(word.text)
    if len(parts) < 2:
        # Didn't split
        return

    # Drop empty pieces *before* deciding which piece is first/last.
    # Otherwise a word that starts or ends with a break (e.g. "well-") would
    # lose its original leading/trailing whitespace together with the empty
    # piece that gets skipped.
    kept_parts = []
    for part_text in parts:
        part_text_norm = settings.normalize_whitespace(part_text)
        if part_text_norm:
            kept_parts.append((part_text, part_text_norm))

    # Preserve whitespace
    first_ws, last_ws = settings.get_whitespace(word.text_with_ws)
    last_part_idx = len(kept_parts) - 1

    for part_idx, (part_text, part_text_norm) in enumerate(kept_parts):
        if settings.keep_whitespace:
            if part_idx == 0:
                part_text = first_ws + part_text

            if part_idx == last_part_idx:
                part_text += last_ws
            else:
                part_text += settings.join_str

        yield WordNode, {
            "text": part_text_norm,
            "text_with_ws": part_text,
            "implicit": True,
            "lang": word.lang,
            "voice": word.voice,
            "in_lexicon": self._is_word_in_lexicon(part_text_norm, settings),
            "is_from_broken_word": True,
        }
def _split_punctuations(self, graph: GraphType, node: Node):
    """Peel punctuation off the beginning and end of a word.

    This is a generator. If any punctuation is found it yields, in order:
    one ``PunctuationWordNode`` per leading punctuation, a ``WordNode`` for
    whatever text is left (if any), and one ``PunctuationWordNode`` per
    trailing punctuation. Nothing is yielded when ``node`` is not a
    ``WordNode``, when the word is spoken for, when the language defines no
    punctuation patterns, or when no punctuation is found.

    ``graph`` is not used here.
    """
    if not isinstance(node, WordNode):
        return

    word = typing.cast(WordNode, node)
    if word.interpret_as or word.in_lexicon:
        # Don't interpret words that are spoken for
        return

    settings = self.get_settings(word.lang)
    begin_pattern = settings.begin_punctuations_pattern
    end_pattern = settings.end_punctuations_pattern
    if (begin_pattern is None) and (end_pattern is None):
        # No punctuation patterns
        return

    def split_once(pattern, text):
        """Split text at the first match of pattern, dropping empty pieces."""
        return [piece for piece in pattern.split(text, maxsplit=1) if piece]

    remaining = word.text
    first_ws, last_ws = settings.get_whitespace(word.text_with_ws)
    found_punctuation = False

    # Punctuations at the beginning of the word
    if begin_pattern is not None:
        # Two pieces means (begin punctuation, rest of word)
        pieces = split_once(begin_pattern, remaining)
        is_first = True

        while remaining and (len(pieces) == 2):
            punct_text, remaining = pieces
            if is_first:
                # Preserve leading whitespace
                punct_text = first_ws + punct_text
                is_first = False

            punct_text_norm = settings.normalize_whitespace(punct_text)
            found_punctuation = True

            yield PunctuationWordNode, {
                "text": punct_text_norm,
                "text_with_ws": punct_text,
                "implicit": True,
                "lang": word.lang,
                "voice": word.voice,
            }

            pieces = split_once(begin_pattern, remaining)

    # Punctuations at the end of the word (collected right-to-left)
    trailing: typing.List[str] = []
    if end_pattern is not None:
        # Two pieces means (rest of word, end punctuation)
        pieces = split_once(end_pattern, remaining)

        while remaining and (len(pieces) == 2):
            remaining, punct_text = pieces
            found_punctuation = True
            trailing.append(punct_text)
            pieces = split_once(end_pattern, remaining)

    if not found_punctuation:
        # Leave word as-is
        return

    if settings.keep_whitespace and (not trailing):
        # Preserve trailing whitespace
        remaining = remaining + last_ws

    remaining_norm = settings.normalize_whitespace(remaining)

    if remaining:
        yield WordNode, {
            "text": remaining_norm,
            "text_with_ws": remaining,
            "implicit": True,
            "lang": word.lang,
            "voice": word.voice,
            "in_lexicon": self._is_word_in_lexicon(remaining_norm, settings),
        }

    # Emit the trailing punctuation in its original left-to-right order
    final_idx = len(trailing) - 1
    for idx, punct_text in enumerate(reversed(trailing)):
        if settings.keep_whitespace and (idx == final_idx):
            # Preserve trailing whitespace
            punct_text += last_ws

        yield PunctuationWordNode, {
            "text": punct_text.strip(),
            "text_with_ws": punct_text,
            "implicit": True,
            "lang": word.lang,
            "voice": word.voice,
        }
def _split_major_breaks(self, graph: GraphType, node: Node):
    """Separate a major break from the text that precedes it.

    This is a generator. The word's text (with whitespace) is split with the
    language's ``major_breaks_pattern``; only the first two pieces of the
    split are used. A ``WordNode`` is yielded for the first piece unless it is
    only whitespace, followed by a ``BreakWordNode`` of type
    ``BreakType.MAJOR`` for the second piece. Nothing is yielded when the
    node is not a ``WordNode``, is spoken for, no pattern is set, or the split
    produces fewer than two pieces.

    ``graph`` is not used here.
    """
    if not isinstance(node, WordNode):
        return

    word = typing.cast(WordNode, node)
    if word.interpret_as or word.in_lexicon:
        # Don't interpret words that are spoken for
        return

    settings = self.get_settings(word.lang)
    pattern = settings.major_breaks_pattern
    if pattern is None:
        # No pattern set for this language
        return

    pieces = pattern.split(word.text_with_ws)
    if len(pieces) < 2:
        return

    before_break, break_text = pieces[0], pieces[1]

    if not before_break.strip():
        # Nothing but whitespace before the break: keep it on the break
        break_text = before_break + break_text
    else:
        before_break_norm = settings.normalize_whitespace(before_break)
        yield WordNode, {
            "text": before_break_norm,
            "text_with_ws": before_break,
            "implicit": True,
            "lang": word.lang,
            "voice": word.voice,
            "in_lexicon": self._is_word_in_lexicon(before_break_norm, settings),
        }

    yield BreakWordNode, {
        "break_type": BreakType.MAJOR,
        "text": settings.normalize_whitespace(break_text),
        "text_with_ws": break_text,
        "implicit": True,
        "lang": word.lang,
        "voice": word.voice,
    }
def _split_minor_breaks(self, graph: GraphType, node: Node):
    """Separate a minor break from the text that precedes it.

    This is a generator. The word's text (with whitespace) is split with the
    language's ``minor_breaks_pattern``; only the first two pieces of the
    split are used. A ``WordNode`` is yielded for the first piece unless it is
    only whitespace, followed by a ``BreakWordNode`` of type
    ``BreakType.MINOR`` for the second piece. Nothing is yielded when the
    node is not a ``WordNode``, is spoken for, no pattern is set, or the split
    produces fewer than two pieces.

    ``graph`` is not used here.
    """
    if not isinstance(node, WordNode):
        return

    word = typing.cast(WordNode, node)
    if word.interpret_as or word.in_lexicon:
        # Don't interpret words that are spoken for
        return

    settings = self.get_settings(word.lang)
    pattern = settings.minor_breaks_pattern
    if pattern is None:
        # No pattern set for this language
        return

    pieces = pattern.split(word.text_with_ws)
    if len(pieces) < 2:
        return

    before_break = pieces[0]
    if before_break.strip():
        # Only yield word if there's anything but whitespace
        before_break_norm = settings.normalize_whitespace(before_break)
        yield WordNode, {
            "text": before_break_norm,
            "text_with_ws": before_break,
            "implicit": True,
            "lang": word.lang,
            "voice": word.voice,
            "in_lexicon": self._is_word_in_lexicon(before_break_norm, settings),
        }

    break_text = pieces[1]
    yield BreakWordNode, {
        "break_type": BreakType.MINOR,
        "text": settings.normalize_whitespace(break_text),
        "text_with_ws": break_text,
        "implicit": True,
        "lang": word.lang,
        "voice": word.voice,
    }
def _find_parent(self, graph, node, *classes):
    """Search the graph above ``node`` for a node of one of ``classes``.

    The direct predecessors of ``node`` are checked first, in the order the
    graph reports them. If none of them matches, each predecessor is searched
    recursively in that same order. Returns the first matching node object,
    or ``None`` when nothing above ``node`` matches.
    """
    unmatched = []
    for pred_id in graph.predecessors(node.node):
        candidate = graph.nodes[pred_id][DATA_PROP]
        if isinstance(candidate, classes):
            return candidate

        unmatched.append(candidate)

    # No direct predecessor matched; look further up through each of them
    for candidate in unmatched:
        found = self._find_parent(graph, candidate, *classes)
        if found is not None:
            return found

    return None
# pylint: disable=no-self-use
def _phonemes_for_break(
    self,
    break_type: typing.Union[str, BreakType],
    lang: typing.Optional[str] = None,
) -> typing.Optional[PHONEMES_TYPE]:
    """Return the phonemes that stand for a break.

    A major break maps to ``[IPA.BREAK_MAJOR.value]`` and a minor break to
    ``[IPA.BREAK_MINOR.value]``. Any other ``break_type`` gives ``None``.
    ``lang`` is accepted but not used here.
    """
    phonemes: typing.Optional[PHONEMES_TYPE] = None

    if break_type == BreakType.MAJOR:
        phonemes = [IPA.BREAK_MAJOR.value]
    elif break_type == BreakType.MINOR:
        phonemes = [IPA.BREAK_MINOR.value]

    return phonemes
# -------------------------------------------------------------------------
def _pipeline_tokenize(
    self,
    graph,
    parent_node,
    text,
    word_phonemes: typing.Optional[typing.List[typing.List[str]]] = None,
    scope_kwargs=None,
    in_inline_lexicon: typing.Optional[
        typing.Callable[[str, typing.Optional[str]], bool]
    ] = None,
):
    """Split text into word nodes and attach them under ``parent_node``.

    The language is taken from ``scope_kwargs["lang"]`` when present,
    otherwise from ``self.default_lang``. The text is pre-processed (if the
    language settings provide ``pre_process_text``), split into words, and a
    ``WordNode`` is added to ``graph`` for every word that is not empty after
    whitespace normalization.

    While ``word_phonemes`` is non-empty, each new word takes its
    ``phonemes`` by popping the last entry of that list (the list is
    modified). ``in_inline_lexicon``, when given, is consulted before the
    language lexicon to decide the word's ``in_lexicon`` flag.
    """
    if scope_kwargs is None:
        scope_kwargs = {}

    lang = scope_kwargs.get("lang", self.default_lang)
    settings = self.get_settings(lang)
    assert settings is not None, f"No settings for {lang}"

    if settings.pre_process_text is not None:
        # Pre-process text
        text = settings.pre_process_text(text)

    # Split into separate words (preserving whitespace).
    for raw_word in settings.split_words(text):
        norm_word = settings.normalize_whitespace(raw_word)
        if not norm_word:
            continue

        if not settings.keep_whitespace:
            raw_word = norm_word

        if word_phonemes:
            node_kwargs = {**scope_kwargs, "phonemes": word_phonemes.pop()}
        else:
            node_kwargs = scope_kwargs

        # Determine if word is in a lexicon.
        # If so, it will not be interpreted as an initialism, split apart, etc.
        in_lexicon: typing.Optional[bool] = None
        if in_inline_lexicon is not None:
            # Check inline <lexicon> first
            in_lexicon = in_inline_lexicon(norm_word, scope_kwargs.get("word_role"))

        if not in_lexicon:
            # Check main language lexicon
            in_lexicon = self._is_word_in_lexicon(norm_word, settings)

        new_word = WordNode(
            node=len(graph),
            text=norm_word,
            text_with_ws=raw_word,
            implicit=True,
            in_lexicon=in_lexicon,
            **node_kwargs,
        )

        graph.add_node(new_word.node, data=new_word)
        graph.add_edge(parent_node.node, new_word.node)
# -------------------------------------------------------------------------
# Pipeline Splits
# -------------------------------------------------------------------------
def _split_spell_out(self, graph: GraphType, node: Node):
    """Expand spell-out (a-1 -> a dash one).

    This is a generator that only acts on ``WordNode`` objects whose
    ``interpret_as`` is ``InterpretAs.SPELL_OUT``. It yields one
    ``(WordNode, kwargs)`` pair per character of the word's text: the
    character's entry in ``settings.spell_out_words`` when there is one,
    otherwise the character itself (with role ``WordRole.LETTER`` if it is
    alphabetic). Characters that map to an empty string are skipped.

    ``graph`` is not used here.
    """
    if not isinstance(node, WordNode):
        return

    word = typing.cast(WordNode, node)
    if word.interpret_as != InterpretAs.SPELL_OUT:
        return

    settings = self.get_settings(word.lang)

    # Preserve whitespace
    first_ws, last_ws = settings.get_whitespace(word.text_with_ws)
    final_idx = len(word.text) - 1

    for char_idx, char in enumerate(word.text):
        # Look up in settings first ("." -> "dot")
        spoken = settings.spell_out_words.get(char)
        role = WordRole.DEFAULT

        if spoken is None:
            # Leave as is (expanded later in the pipeline if digit, etc.)
            spoken = char
            if char.isalpha():
                # Assume this is a letter
                role = WordRole.LETTER

        if not spoken:
            continue

        if settings.keep_whitespace:
            if char_idx == 0:
                spoken = first_ws + spoken

            spoken += last_ws if char_idx == final_idx else settings.join_str

        yield WordNode, {
            "text": settings.normalize_whitespace(spoken),
            "text_with_ws": spoken,
            "implicit": True,
            "lang": word.lang,
            "role": role,
        }
def _split_replacements(self, graph: GraphType, node: Node):
    """Do regex replacements on word text.

    This is a generator. Every ``(pattern, template)`` pair in
    ``settings.replacements`` is applied in order to the word's text (with
    whitespace), each one working on the output of the previous one. If at
    least one substitution happened, the resulting text is re-split into
    words and a ``(WordNode, kwargs)`` pair is yielded per non-empty word.
    Otherwise nothing is yielded.

    ``graph`` is not used here.
    """
    if not isinstance(node, WordNode):
        return

    word = typing.cast(WordNode, node)
    if word.interpret_as or word.in_lexicon:
        # Don't interpret words that are spoken for
        return

    settings = self.get_settings(word.lang)
    if not settings.replacements:
        # No replacements
        return

    replaced_text = word.text_with_ws
    total_subs = 0
    for pattern, template in settings.replacements:
        assert isinstance(pattern, REGEX_PATTERN)
        replaced_text, num_subs = pattern.subn(template, replaced_text)
        total_subs += num_subs

    if total_subs == 0:
        # Nothing matched
        return

    # Tokenize new text (whitespace is preserved by regex)
    for piece in settings.split_words(replaced_text):
        piece_norm = settings.normalize_whitespace(piece)
        if not piece_norm:
            # Ignore empty words
            continue

        yield WordNode, {
            "text": piece_norm,
            "text_with_ws": piece if settings.keep_whitespace else piece_norm,
            "implicit": True,
            "lang": word.lang,
            "in_lexicon": self._is_word_in_lexicon(piece_norm, settings),
        }
def _split_abbreviations(self, graph: GraphType, node: Node):
    """Expand abbreviations.

    This is a generator. The patterns in ``settings.abbreviations`` are tried
    in order against the word's text (with whitespace); the first one that
    matches at the start of the text is expanded with its template. The
    expansion is split into words and a ``(WordNode, kwargs)`` pair is
    yielded per non-empty word. Nothing is yielded if no pattern matches.

    ``graph`` is not used here.
    """
    if not isinstance(node, WordNode):
        return

    word = typing.cast(WordNode, node)
    if word.interpret_as or word.in_lexicon:
        # Don't interpret words that are spoken for
        return

    settings = self.get_settings(word.lang)
    if not settings.abbreviations:
        # No abbreviations
        return

    for pattern, template in settings.abbreviations.items():
        assert isinstance(pattern, REGEX_PATTERN), pattern
        match = pattern.match(word.text_with_ws)
        if match is not None:
            expanded = match.expand(template)
            break
    else:
        # No abbreviation pattern matched
        return

    # Tokenize new text (whitespace should be preserved by regex)
    for piece in settings.split_words(expanded):
        piece_norm = settings.normalize_whitespace(piece)
        if not piece_norm:
            continue

        yield WordNode, {
            "text": piece_norm,
            "text_with_ws": piece if settings.keep_whitespace else piece_norm,
            "implicit": True,
            "lang": word.lang,
            "in_lexicon": self._is_word_in_lexicon(piece_norm, settings),
        }
def _split_initialism(self, graph: GraphType, node: Node):
    """Split apart ABC or A.B.C.

    This is a generator. When the language's ``is_initialism`` function
    accepts the word's text, the text is split with ``split_initialism`` and
    a ``(WordNode, kwargs)`` pair with role ``WordRole.LETTER`` is yielded
    for every non-empty part. Nothing is yielded for non-word nodes, words
    that are spoken for, words shorter than two characters, or languages
    missing either function.

    ``graph`` is not used here.
    """
    if not isinstance(node, WordNode):
        return

    word = typing.cast(WordNode, node)
    if word.interpret_as or word.in_lexicon or (len(word.text) < 2):
        # Don't interpret words that are spoken for or are too short
        return

    settings = self.get_settings(word.lang)
    if (settings.is_initialism is None) or (settings.split_initialism is None):
        # Can't do anything without these functions
        return

    if not settings.is_initialism(word.text):
        # Not an initialism
        return

    first_ws, last_ws = settings.get_whitespace(word.text_with_ws)

    # Split according to language-specific function
    letters = settings.split_initialism(word.text)
    final_idx = len(letters) - 1

    for letter_idx, letter_text in enumerate(letters):
        letter_norm = settings.normalize_whitespace(letter_text)
        if not letter_norm:
            continue

        if settings.keep_whitespace:
            if letter_idx == 0:
                letter_text = first_ws + letter_text

            if letter_idx < final_idx:
                letter_text += settings.join_str
            else:
                # Last part gets the original trailing whitespace
                letter_text += last_ws

        yield WordNode, {
            "text": letter_norm,
            "text_with_ws": letter_text,
            "implicit": True,
            "lang": word.lang,
            "role": WordRole.LETTER,
        }
def _split_ignore_non_words(self, graph: GraphType, node: Node):
    """Mark non-words as ignored.

    This is a generator. It yields a single ``(IgnoreNode, {})`` pair when
    the language's ``is_non_word`` function returns a true value for the
    word's text, and nothing otherwise (including when the node is not a
    ``WordNode``, the word is spoken for, or the language has no such
    function).

    ``graph`` is not used here.
    """
    if not isinstance(node, WordNode):
        return

    word = typing.cast(WordNode, node)
    if word.interpret_as or word.in_lexicon:
        # Don't interpret words that are spoken for
        return

    is_non_word = self.get_settings(word.lang).is_non_word

    # A missing function means this language cannot detect non-words
    if (is_non_word is not None) and is_non_word(word.text):
        yield (IgnoreNode, {})
# -------------------------------------------------------------------------
# Pipeline Transformations
# -------------------------------------------------------------------------
def _transform_number(self, graph: GraphType, node: Node) -> bool:
    """Try to interpret a word as a number, updating the word in place.

    Returns ``False`` without parsing when the node is not a ``WordNode``,
    when the word is no longer a number candidate, or when it is already
    interpreted as something other than a number. It also returns ``False``
    after recognising an ordinal (``interpret_as``, ``format`` and ``number``
    are set in that case).

    Otherwise the text is parsed as a locale-aware decimal and ``True`` is
    returned whether or not parsing succeeded: on success ``interpret_as``,
    ``format`` and ``number`` are set, on failure ``is_maybe_number`` is
    cleared.

    ``graph`` is not used here.
    """
    if not isinstance(node, WordNode):
        return False

    word = typing.cast(WordNode, node)
    if not word.is_maybe_number:
        return False

    if word.interpret_as and (word.interpret_as != InterpretAs.NUMBER):
        return False

    settings = self.get_settings(word.lang)
    assert settings.babel_locale

    if settings.get_ordinal is not None:
        # Try to parse as an ordinal (e.g., 1st -> 1)
        ordinal_value = settings.get_ordinal(word.text)
        if ordinal_value is not None:
            word.interpret_as = InterpretAs.NUMBER
            word.format = InterpretAsFormat.NUMBER_ORDINAL
            word.number = Decimal(ordinal_value)
            return False

    try:
        # Try to parse as a number
        # This is important to handle thousand/decimal separators correctly.
        parsed = babel.numbers.parse_decimal(word.text, locale=settings.babel_locale)

        if not parsed.is_finite():
            raise ValueError("Not parsing nan or inf")
    except ValueError:
        # Probably not a number
        word.is_maybe_number = False
        return True

    word.interpret_as = InterpretAs.NUMBER
    if not word.format:
        # Retain ordinal, etc.
        word.format = InterpretAsFormat.NUMBER_CARDINAL

    word.number = parsed

    all_digits = re.match(r"^\d+$", word.text) is not None
    if (1000 < parsed < 3000) and all_digits:
        # Interpret numbers in this range as years by default, but only
        # if the text was entirely digits.
        #
        # So "2020" will become "twenty twenty", but "2,020" will become
        # "two thousand and twenty".
        word.format = InterpretAsFormat.NUMBER_YEAR

    return True
def _transform_currency(self, graph: GraphType, node: Node,) -> bool:
    """Try to interpret a word as a currency amount, updating it in place.

    Returns ``False`` when the node is not a ``WordNode``, when the word is
    not a currency candidate, when it is already interpreted as something
    else, or when the language's ``is_maybe_currency`` check rejects the text
    (``is_maybe_currency`` is cleared on the word in that last case).

    Otherwise each known currency symbol that prefixes the text is tried in
    order; the first one whose remainder parses as a locale-aware decimal
    sets ``interpret_as``, ``currency_symbol`` and ``number``. If none
    parses and the word was explicitly marked as currency, the whole text is
    parsed as a number using the language's default currency (if it has
    one). ``True`` is returned in all of these cases.

    ``graph`` is not used here.
    """
    if not isinstance(node, WordNode):
        return False

    word = typing.cast(WordNode, node)
    if not word.is_maybe_currency:
        return False

    if word.interpret_as and (word.interpret_as != InterpretAs.CURRENCY):
        return False

    settings = self.get_settings(word.lang)

    if (settings.is_maybe_currency is not None) and (
        not settings.is_maybe_currency(word.text)
    ):
        # Probably not currency
        word.is_maybe_currency = False
        return False

    assert settings.babel_locale

    # Try to parse with known currency symbols
    for symbol in settings.currency_symbols:
        if not word.text.startswith(symbol):
            continue

        amount_text = word.text[len(symbol) :]
        try:
            # Try to parse as a number
            # This is important to handle thousand/decimal separators correctly.
            amount = babel.numbers.parse_decimal(
                amount_text, locale=settings.babel_locale
            )
        except ValueError:
            # Remainder is not a number; try the next symbol
            continue

        word.interpret_as = InterpretAs.CURRENCY
        word.currency_symbol = symbol
        word.number = amount
        break
    else:
        # No symbol parsed.
        # If this *must* be a currency value, use the default currency
        if word.interpret_as == InterpretAs.CURRENCY:
            default_currency = settings.default_currency
            if default_currency:
                # Forced interpretation using default currency
                try:
                    amount = babel.numbers.parse_decimal(
                        word.text, locale=settings.babel_locale
                    )
                except ValueError:
                    pass
                else:
                    word.interpret_as = InterpretAs.CURRENCY
                    word.currency_name = default_currency
                    word.number = amount

    return True
def _transform_date(self, graph: GraphType, node: Node):
    """Try to interpret a word as a date, updating the word in place.

    Returns ``False`` when the node is not a ``WordNode``, when the word is
    not a date candidate, when it is already interpreted as something else,
    when the language's ``is_maybe_date`` check rejects the text, or when any
    exception is raised while parsing (the exception is logged and
    ``is_maybe_date`` is cleared).

    Otherwise the text is parsed with ``dateparser`` in strict mode. If that
    fails and the word was explicitly marked as a date, parsing is retried
    without strict mode. ``True`` is returned whether or not a date was
    found.

    ``graph`` is not used here.
    """
    if not isinstance(node, WordNode):
        return False

    word = typing.cast(WordNode, node)
    if not word.is_maybe_date:
        return False

    if word.interpret_as and (word.interpret_as != InterpretAs.DATE):
        return False

    settings = self.get_settings(word.lang)

    def parse_date(strict: bool):
        """Parse the word's text with dateparser in the settings' language."""
        return dateparser.parse(
            word.text,
            settings={"STRICT_PARSING": strict},
            languages=[settings.dateparser_lang],
        )

    try:
        if (settings.is_maybe_date is not None) and not settings.is_maybe_date(
            word.text
        ):
            # Probably not a date
            word.is_maybe_date = False
            return False

        assert settings.dateparser_lang

        parsed_date = parse_date(True)
        if parsed_date is not None:
            word.interpret_as = InterpretAs.DATE
            word.date = parsed_date
        elif word.interpret_as == InterpretAs.DATE:
            # Try again without strict parsing
            parsed_date = parse_date(False)
            if parsed_date is not None:
                word.date = parsed_date
    except Exception:
        _LOGGER.exception("transform_date")

        # Not a date
        word.is_maybe_date = False
        return False

    return True
def _transform_time(self, graph: GraphType, node: Node):
    """Try to interpret a word as a time, updating the word in place.

    Returns ``False`` when the node is not a ``WordNode``, when the word is
    not a time candidate, when it is already interpreted as something else,
    when the language has no ``parse_time`` function, when the language's
    ``is_maybe_time`` check rejects the text, or when any exception is raised
    while parsing (the exception is logged and ``is_maybe_time`` is cleared).

    Otherwise ``parse_time`` is called on the text; a non-``None`` result
    sets ``interpret_as`` and ``time`` on the word. ``True`` is returned
    whether or not a time was found.

    ``graph`` is not used here.
    """
    if not isinstance(node, WordNode):
        return False

    word = typing.cast(WordNode, node)
    if not word.is_maybe_time:
        return False

    if word.interpret_as and (word.interpret_as != InterpretAs.TIME):
        return False

    settings = self.get_settings(word.lang)
    if settings.parse_time is None:
        # Can't parse a time anyways
        return False

    try:
        if (settings.is_maybe_time is not None) and not settings.is_maybe_time(
            word.text
        ):
            # Probably not a time
            word.is_maybe_time = False
            return False

        parsed_time = settings.parse_time(word.text)
        if parsed_time is not None:
            word.interpret_as = InterpretAs.TIME
            word.time = parsed_time
    except Exception:
        _LOGGER.exception("transform_time")

        # Not a time
        word.is_maybe_time = False
        return False

    return True
def _is_word_in_lexicon(
self, word: str, settings: TextProcessorSettings
) -> typing.Optional[bool]:
"""True if word is in the lexicon"""
if settings.lookup_phonemes is None:
return None
return bool(settings.lookup_phonemes(word, do_transforms=False))
# -------------------------------------------------------------------------
# Verbalization
# -------------------------------------------------------------------------
def _verbalize_number(self, graph: GraphType, node: Node):
    """Split numbers into words.

    Acts only on ``WordNode`` objects interpreted as ``InterpretAs.NUMBER``
    that carry a parsed ``number``. The number is converted to words with
    ``num2words`` (cardinal, ordinal or year, depending on ``word.format``)
    and every resulting word is added to ``graph`` as a new ``WordNode``
    child of the original word.

    With ``InterpretAsFormat.NUMBER_DIGITS`` the number is rounded to an
    integer and each digit is verbalized on its own; the sign of a negative
    number is carried by the first digit.

    Returns ``None``. If ``num2words`` raises ``NotImplementedError`` the
    failure is logged and processing stops (words already added for earlier
    digits stay in the graph).
    """
    if not isinstance(node, WordNode):
        return

    word = typing.cast(WordNode, node)
    if (word.interpret_as != InterpretAs.NUMBER) or (word.number is None):
        return

    settings = self.get_settings(word.lang)
    if (settings.is_maybe_number is not None) and not settings.is_maybe_number(
        word.text
    ):
        # Probably not a number
        return

    assert settings.num2words_lang
    num2words_kwargs = {"lang": settings.num2words_lang}
    decimal_nums = [word.number]

    if word.format == InterpretAsFormat.NUMBER_CARDINAL:
        num2words_kwargs["to"] = "cardinal"
    elif word.format == InterpretAsFormat.NUMBER_ORDINAL:
        num2words_kwargs["to"] = "ordinal"
    elif word.format == InterpretAsFormat.NUMBER_YEAR:
        num2words_kwargs["to"] = "year"
    elif word.format == InterpretAsFormat.NUMBER_DIGITS:
        num2words_kwargs["to"] = "cardinal"

        # Fixed-point formatting of the magnitude yields digit characters
        # only. Iterating over str(number) instead would hit "-" for negative
        # numbers and "E"/"+" for exponent notation, none of which Decimal()
        # can parse.
        integral = word.number.to_integral_value()
        decimal_nums = [Decimal(d) for d in format(integral.copy_abs(), "f")]
        if integral < 0:
            # Keep the sign on the leading digit ("-12" -> minus one, two)
            decimal_nums[0] = -decimal_nums[0]

    # Original whitespace goes around the whole number: leading whitespace
    # before the first item, trailing whitespace after the last one, and
    # join_str between items when digits are verbalized separately.
    first_ws, last_ws = settings.get_whitespace(word.text_with_ws)
    last_num_idx = len(decimal_nums) - 1

    for num_idx, decimal_num in enumerate(decimal_nums):
        num_has_frac = (decimal_num % 1) != 0

        # num2words uses the number as an index sometimes, so it *has* to be
        # an integer, unless we're doing currency.
        if num_has_frac:
            final_num = float(decimal_num)
        else:
            final_num = int(decimal_num)

        try:
            # Convert to words (e.g., 100 -> one hundred)
            num_str = num2words(final_num, **num2words_kwargs)
        except NotImplementedError:
            _LOGGER.exception(
                "Failed to convert number %s to words for language %s",
                word.text,
                word.lang,
            )
            return

        # Add original whitespace back in
        prefix = first_ws if num_idx == 0 else ""
        suffix = last_ws if num_idx == last_num_idx else settings.join_str
        num_str = prefix + num_str + suffix

        # Split into separate words
        for number_word_text in settings.split_words(num_str):
            number_word_text_norm = settings.normalize_whitespace(number_word_text)
            if not number_word_text_norm:
                continue

            if not settings.keep_whitespace:
                number_word_text = number_word_text_norm

            number_word = WordNode(
                node=len(graph),
                implicit=True,
                lang=word.lang,
                text=number_word_text_norm,
                text_with_ws=number_word_text,
            )
            graph.add_node(number_word.node, data=number_word)
            graph.add_edge(word.node, number_word.node)
def _verbalize_date(self, graph: GraphType, node: Node):
    """Split dates into words.

    Acts only on ``WordNode`` objects interpreted as ``InterpretAs.DATE``
    that carry a parsed ``date``. The date is rendered with ``word.format``
    (or the language's ``default_date_format``) and every resulting word is
    added to ``graph`` as a new ``WordNode`` child of the original word.

    A format without ``{`` is treated as a sequence of field letters
    ("MDY" -> "{M} {D} {Y}" when ``join_str`` is a space); otherwise it is
    used as a Python format string. Supported fields, in either case: ``M``
    month name, ``D`` cardinal day, ``O`` ordinal day, ``Y`` year.

    Returns ``None``. Any failure while producing the words, including an
    unknown field in the format, is logged and leaves the graph unchanged.
    """
    if not isinstance(node, WordNode):
        return

    word = typing.cast(WordNode, node)
    if (word.interpret_as != InterpretAs.DATE) or (word.date is None):
        return

    # Only ``babel`` and ``babel.numbers`` are imported at module level, so
    # load the ``babel.dates`` submodule explicitly instead of relying on
    # another package having imported it as a side effect.
    import babel.dates  # pylint: disable=import-outside-toplevel

    settings = self.get_settings(word.lang)
    assert settings.babel_locale
    assert settings.num2words_lang

    date = word.date
    date_format = word.format or settings.default_date_format

    if "{" not in date_format:
        # Transform into Python format string
        date_format = date_format.strip().upper()

        # MDY -> {M} {D} {Y}
        date_format_str = settings.join_str.join(f"{{{c}}}" for c in date_format)
    else:
        # Assumed to be a Python format string already
        date_format_str = date_format

    day_card_str = ""
    day_ord_str = ""
    month_str = ""
    year_str = ""

    try:
        if ("{M}" in date_format_str) or ("{m}" in date_format_str):
            month_str = babel.dates.format_date(
                date, "MMMM", locale=settings.babel_locale
            )

        num2words_kwargs = {"lang": settings.num2words_lang}

        if ("{D}" in date_format_str) or ("{d}" in date_format_str):
            # Cardinal day (1 -> one)
            num2words_kwargs["to"] = "cardinal"
            day_card_str = num2words(date.day, **num2words_kwargs)

        if ("{O}" in date_format_str) or ("{o}" in date_format_str):
            # Ordinal day (1 -> first)
            num2words_kwargs["to"] = "ordinal"
            day_ord_str = num2words(date.day, **num2words_kwargs)

        if ("{Y}" in date_format_str) or ("{y}" in date_format_str):
            try:
                num2words_kwargs["to"] = "year"
                year_str = num2words(date.year, **num2words_kwargs)
            except Exception:
                # Fall back to use cardinal number for year
                num2words_kwargs["to"] = "cardinal"
                year_str = num2words(date.year, **num2words_kwargs)

        # Kept inside the try block: a format with an unknown field (e.g.
        # "MDX" -> "{X}") or unbalanced braces makes str.format raise, which
        # must be reported like any other formatting failure rather than
        # aborting text processing.
        date_str = date_format_str.format(
            **{
                "M": month_str,
                "m": month_str,
                "D": day_card_str,
                "d": day_card_str,
                "O": day_ord_str,
                "o": day_ord_str,
                "Y": year_str,
                "y": year_str,
            }
        )
    except Exception:
        _LOGGER.exception(
            "Failed to format date %s for language %s", word.text, word.lang
        )
        return

    first_ws, last_ws = settings.get_whitespace(word.text_with_ws)
    date_str = first_ws + date_str + last_ws

    # Split into separate words
    for date_word_text in settings.split_words(date_str):
        date_word_text_norm = settings.normalize_whitespace(date_word_text)
        if not date_word_text_norm:
            continue

        if not settings.keep_whitespace:
            date_word_text = date_word_text_norm

        if not date_word_text:
            continue

        date_word = WordNode(
            node=len(graph),
            implicit=True,
            lang=word.lang,
            text=date_word_text_norm,
            text_with_ws=date_word_text,
        )
        graph.add_node(date_word.node, data=date_word)
        graph.add_edge(word.node, date_word.node)
def _verbalize_time(self, graph: GraphType, node: Node):
    """Split times into words.

    Acts only on ``WordNode`` objects interpreted as ``InterpretAs.TIME``
    that carry a parsed ``time``, and only when the language provides a
    ``verbalize_time`` function. Each word produced by that function is added
    to ``graph`` as a new ``WordNode`` child of the original word. Every new
    word is then run through ``_transform_number`` and ``_split_initialism``;
    nodes produced by the latter are added as children of the new word.

    Returns ``None``.
    """
    if not isinstance(node, WordNode):
        return

    word = typing.cast(WordNode, node)
    if (word.interpret_as != InterpretAs.TIME) or (word.time is None):
        return

    settings = self.get_settings(word.lang)
    if settings.verbalize_time is None:
        # Can't verbalize
        return

    first_ws, last_ws = settings.get_whitespace(word.text_with_ws)
    spoken_words = list(settings.verbalize_time(word.time))
    final_idx = len(spoken_words) - 1

    # Split into words
    for idx, spoken in enumerate(spoken_words):
        # Original whitespace goes around the whole phrase, join_str between
        if idx == 0:
            spoken = first_ws + spoken

        spoken += last_ws if idx == final_idx else settings.join_str

        spoken_norm = settings.normalize_whitespace(spoken)
        if not spoken_norm:
            continue

        if not settings.keep_whitespace:
            spoken = spoken_norm

        if not spoken:
            continue

        time_word = WordNode(
            node=len(graph),
            implicit=True,
            lang=word.lang,
            text=spoken_norm,
            text_with_ws=spoken,
        )
        graph.add_node(time_word.node, data=time_word)
        graph.add_edge(word.node, time_word.node)

        # May contain numbers or initialisms
        self._transform_number(graph, time_word)

        for child_class, child_kwargs in self._split_initialism(graph, time_word):
            child = child_class(node=len(graph), **child_kwargs)
            graph.add_node(child.node, data=child)
            graph.add_edge(time_word.node, child.node)
def _verbalize_currency(
    self, graph: GraphType, node: Node,
):
    """Split currency amounts into words.

    Acts only on ``WordNode`` objects interpreted as ``InterpretAs.CURRENCY``
    that carry a parsed ``number`` and either a currency symbol or a currency
    name. When the word has no currency name yet, one is looked up from its
    symbol in ``settings.currencies`` (falling back to the default currency)
    and stored on the word. The amount is converted to words with
    ``num2words`` and every resulting word is added to ``graph`` as a new
    ``WordNode`` child of the original word.

    Returns ``None``. A failure inside ``num2words`` is logged and leaves the
    graph unchanged.
    """
    if not isinstance(node, WordNode):
        return

    word = typing.cast(WordNode, node)
    if word.interpret_as != InterpretAs.CURRENCY:
        return

    if (word.currency_symbol is None) and (word.currency_name is None):
        return

    if word.number is None:
        return

    settings = self.get_settings(word.lang)
    assert settings.num2words_lang

    amount = word.number

    # True if number has non-zero fractional part
    has_fraction = (amount % 1) != 0

    # Name of currency (e.g., USD)
    if not word.currency_name:
        if settings.currencies:
            # Look up currency in locale
            resolved_name = settings.currencies.get(
                word.currency_symbol or "", settings.default_currency
            )
        else:
            resolved_name = settings.default_currency

        word.currency_name = resolved_name

    num2words_kwargs = {
        "lang": settings.num2words_lang,
        "to": "currency",
        "currency": word.currency_name,
        # Custom separator so we can remove 'zero cents'
        "separator": "|",
    }

    try:
        spoken_amount = num2words(float(amount), **num2words_kwargs)
    except Exception:
        _LOGGER.exception(
            "Failed to verbalize currency %s for language %s", word, word.lang
        )
        return

    # Post-process currency words
    if has_fraction:
        # Discard num2words separator
        spoken_amount = spoken_amount.replace("|", "")
    else:
        # Remove 'zero cents' part
        spoken_amount = spoken_amount.split("|", maxsplit=1)[0]

    # Add original whitespace back in
    first_ws, last_ws = settings.get_whitespace(word.text_with_ws)
    spoken_amount = first_ws + spoken_amount + last_ws

    # Split into separate words
    for piece in settings.split_words(spoken_amount):
        piece_norm = settings.normalize_whitespace(piece)
        if not piece_norm:
            continue

        if not settings.keep_whitespace:
            piece = piece_norm

        currency_word = WordNode(
            node=len(graph),
            implicit=True,
            lang=word.lang,
            text=piece_norm,
            text_with_ws=piece,
        )
        graph.add_node(currency_word.node, data=currency_word)
        graph.add_edge(word.node, currency_word.node)