"""Shared classes, types, and enums"""
|
|||
|
import itertools
|
|||
|
import operator
|
|||
|
import re
|
|||
|
import typing
|
|||
|
import xml.etree.ElementTree as etree
|
|||
|
from dataclasses import dataclass, field
|
|||
|
from datetime import datetime
|
|||
|
from decimal import Decimal
|
|||
|
from enum import Enum
|
|||
|
|
|||
|
import babel
|
|||
|
import babel.core
|
|||
|
import babel.numbers
|
|||
|
|
|||
|
# alias -> full language name
|
|||
|
LANG_ALIASES = {
|
|||
|
"ar": "ar",
|
|||
|
"cs": "cs-cz",
|
|||
|
"de": "de-de",
|
|||
|
"en": "en-us",
|
|||
|
"en-gb": "en-us",
|
|||
|
"es": "es-es",
|
|||
|
"es-mx": "es-es",
|
|||
|
"fa": "fa",
|
|||
|
"fr": "fr-fr",
|
|||
|
"it": "it-it",
|
|||
|
"lb-lb": "lb",
|
|||
|
"nl": "nl",
|
|||
|
"nl-nl": "nl",
|
|||
|
"pt-br": "pt",
|
|||
|
"ru": "ru-ru",
|
|||
|
"sv": "sv-se",
|
|||
|
"sw": "sw",
|
|||
|
"zh": "zh-cn",
|
|||
|
}
|
|||
|
|
|||
|
ENGLISH_LANGS = {"en-us", "en-gb"}
|
|||
|
|
|||
|
# Languages that are expected to have a model directory
|
|||
|
KNOWN_LANGS = set(itertools.chain(ENGLISH_LANGS, LANG_ALIASES.values()))
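
# Illustrative lookups (not part of the module API); an unknown alias simply
# falls back to the original code:
#     LANG_ALIASES.get("en-gb", "en-gb")  # -> "en-us"
#     "en-us" in KNOWN_LANGS              # -> True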


try:
    # Python >= 3.7
    REGEX_PATTERN = re.Pattern  # type: ignore
    REGEX_MATCH = re.Match  # type: ignore
    REGEX_TYPE = typing.Union[str, re.Pattern]  # type: ignore
except AttributeError:
    # Python 3.6
    REGEX_PATTERN = typing.Pattern  # type: ignore
    REGEX_MATCH = typing.Match  # type: ignore
    REGEX_TYPE = typing.Union[str, typing.Pattern]  # type: ignore


# Phonemes for a single word
PHONEMES_TYPE = typing.Sequence[str]

# Type of nodes in a text graph
NODE_TYPE = int

# Property used to hold node data in text graph
DATA_PROP = "data"


class GraphType:
    """Type wrapper for networkx graph"""

    nodes: typing.Dict[NODE_TYPE, typing.Dict[typing.Any, typing.Any]]
    """Get node data for the graph"""

    def add_node(self, node: NODE_TYPE, **kwargs):
        """Add a new node to the graph"""
        pass

    def add_edge(self, src: NODE_TYPE, dst: NODE_TYPE):
        """Add a new edge to the graph"""
        pass

    def out_degree(self, node: NODE_TYPE) -> int:
        """Get number of outgoing edges from a node"""
        pass

    def successors(self, node: NODE_TYPE) -> typing.Iterable[NODE_TYPE]:
        """Yield nodes on outgoing edges"""
        pass

    def predecessors(self, node: NODE_TYPE) -> typing.Iterable[NODE_TYPE]:
        """Yield nodes from incoming edges"""
        pass

    def out_edges(
        self, node: NODE_TYPE
    ) -> typing.Iterable[typing.Tuple[NODE_TYPE, NODE_TYPE]]:
        """Yield outgoing edges from a node"""
        pass

    def add_edges_from(
        self, edges: typing.Iterable[typing.Tuple[NODE_TYPE, NODE_TYPE]]
    ):
        """Add edges from iterable"""
        pass

    def remove_edges_from(
        self, edges: typing.Iterable[typing.Tuple[NODE_TYPE, NODE_TYPE]]
    ):
        """Remove edges from iterable"""
        pass

    def __len__(self) -> int:
        """Get number of nodes in the graph"""
        pass
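
    # Note: this class only documents the subset of the graph API used by the
    # text processor; a networkx.DiGraph is expected to satisfy it. Sketch
    # (assumes networkx is installed):
    #     import networkx as nx
    #     graph: GraphType = nx.DiGraph()
    #     graph.add_node(0, data=None)
    #     len(graph)  # -> 1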


# -----------------------------------------------------------------------------

DEFAULT_SPLIT_PATTERN = re.compile(r"(\s+)")

NORMALIZE_WHITESPACE_PATTERN = re.compile(r"\s+")
SURROUNDING_WHITESPACE_PATTERN = re.compile(r"^(\s*)\S+(\s*)$")
HAS_DIGIT_PATTERN = re.compile(r"[0-9]")


@dataclass
class Time:
    """Parsed time from text"""

    hours: int
    minutes: int = 0

    period: typing.Optional[str] = None
    """A.M. or P.M."""


# -----------------------------------------------------------------------------


class InterpretAs(str, Enum):
    """Supported options for interpret-as attribute of <say-as>"""

    SPELL_OUT = "spell-out"
    """Word should be spelled out (abc = a b c)"""

    DATE = "date"
    """Word should be interpreted as a date"""

    NUMBER = "number"
    """Word should be interpreted as a number"""

    CURRENCY = "currency"
    """Word should be interpreted as an amount of currency"""

    TIME = "time"
    """Word should be interpreted as a time on the clock"""

    WORD = "word"
    """Interpret as regular word"""


class InterpretAsFormat(str, Enum):
    """Supported options for format attribute of <say-as>"""

    NUMBER_CARDINAL = "cardinal"
    """Cardinal version of number (1 = one)"""

    NUMBER_ORDINAL = "ordinal"
    """Ordinal version of number (1 = first)"""

    NUMBER_DIGITS = "digits"
    """Number as digits (12 = one two)"""

    NUMBER_YEAR = "year"
    """Number as a year (2021 = twenty twenty-one)"""

    # Date formats
    # d = day
    # m = month
    # y = year
    # o = ordinal day ("first" instead of "one")
    DATE_DMY = "dmy"
    DATE_MDY = "mdy"
    DATE_YMD = "ymd"
    DATE_DMY_ORDINAL = "omy"
    DATE_MDY_ORDINAL = "moy"
    DATE_YMD_ORDINAL = "ymo"
    DATE_YM = "ym"
    DATE_MY = "my"
    DATE_MD = "md"
    DATE_MD_ORDINAL = "mo"
    DATE_DM_ORDINAL = "om"
    DATE_Y = "y"


class BreakType(str, Enum):
    """Types of sentence breaks"""

    MINOR = "minor"
    """Break between phrases"""

    MAJOR = "major"
    """Break between sentences"""


class WordRole(str, Enum):
    """Role of a word. Used to disambiguate pronunciations."""

    DEFAULT = ""
    """Use default word pronunciation"""

    LETTER = "gruut:letter"
    """Word should be pronounced as a letter (a = /eɪ/ instead of /ə/)"""


class SSMLParsingState(int, Enum):
    """Current state of SSML parsing"""

    DEFAULT = 0

    IN_WORD = 1
    """Inside <w> or <token>"""

    IN_LEXICON = 2
    """Inside <lexicon>"""

    IN_LEXICON_GRAPHEME = 3
    """Inside <lexicon><grapheme>..."""

    IN_LEXICON_PHONEME = 4
    """Inside <lexicon><phoneme>..."""


@dataclass
class InlineLexicon:
    """SSML lexicon defined inline (not standards compliant)"""

    lexicon_id: str
    alphabet: str = ""

    # word -> role -> [phoneme]
    words: typing.Dict[str, typing.Dict[str, PHONEMES_TYPE]] = field(
        default_factory=dict
    )


@dataclass
class Lexeme:
    """Entry of an inline lexicon"""

    grapheme: str = ""
    phonemes: typing.Optional[PHONEMES_TYPE] = None
    roles: typing.Optional[typing.Set[str]] = None
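
# Sketch of how these fit together (phoneme strings are made up for
# illustration):
#     lexicon = InlineLexicon(
#         lexicon_id="names",
#         words={"gruut": {WordRole.DEFAULT.value: ["g", "ɹ", "u", "t"]}},
#     )
#     entry = Lexeme(grapheme="gruut", phonemes=["g", "ɹ", "u", "t"])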


@dataclass
class Node:
    """Base class of all text processing graph nodes"""

    node: NODE_TYPE
    element: typing.Optional[etree.Element] = None
    voice: str = ""
    lang: str = ""
    implicit: bool = False


@dataclass
class IgnoreNode(Node):
    """Node should be ignored"""

    pass


@dataclass
class BreakNode(Node):
    """Represents a user-specified break"""

    time: str = ""
    """Length of break in seconds (123s) or milliseconds (123ms)"""

    def get_milliseconds(self) -> int:
        """Get number of milliseconds from the time string"""
        if self.time.endswith("ms"):
            return int(self.time[:-2])

        if self.time.endswith("s"):
            return int(float(self.time[:-1]) * 1000)

        return 0
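
    # Illustrative values (doctest-style):
    #     BreakNode(node=0, time="250ms").get_milliseconds()  # -> 250
    #     BreakNode(node=0, time="1.5s").get_milliseconds()   # -> 1500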


@dataclass
class MarkNode(Node):
    """Represents a user-specified mark"""

    name: str = ""
    """Name of the mark"""


@dataclass
class WordNode(Node):
    """Represents a single word"""

    text: str = ""
    text_with_ws: str = ""
    interpret_as: typing.Union[str, InterpretAs] = ""
    format: typing.Union[str, InterpretAsFormat] = ""

    number: typing.Optional[Decimal] = None
    date: typing.Optional[datetime] = None
    currency_symbol: typing.Optional[str] = None
    currency_name: typing.Optional[str] = None
    time: typing.Optional[Time] = None

    role: typing.Union[str, WordRole] = WordRole.DEFAULT
    pos: typing.Optional[str] = None
    phonemes: typing.Optional[typing.Sequence[str]] = None

    in_lexicon: typing.Optional[bool] = None
    lexicon_ids: typing.Optional[typing.Sequence[str]] = None

    # Assume yes until proven otherwise
    is_maybe_number: bool = True
    is_maybe_date: bool = True
    is_maybe_currency: bool = True
    is_maybe_time: bool = True

    is_from_broken_word: bool = False


@dataclass
class BreakWordNode(Node):
    """Represents a major/minor break in the text"""

    break_type: typing.Union[str, BreakType] = ""
    text: str = ""
    text_with_ws: str = ""


@dataclass
class PunctuationWordNode(Node):
    """Represents a punctuation marker in the text"""

    text: str = ""
    text_with_ws: str = ""


@dataclass
class SentenceNode(Node):
    """Represents a sentence with WordNodes under it"""

    pass


@dataclass
class ParagraphNode(Node):
    """Represents a paragraph with SentenceNodes under it"""

    pass


@dataclass
class SpeakNode(Node):
    """Top-level node for SSML"""

    pass


# -----------------------------------------------------------------------------


@dataclass
class Word:
    """Processed word from a Sentence"""

    idx: int
    """Zero-based index of word in sentence"""

    text: str
    """Text with normalized whitespace"""

    text_with_ws: str
    """Text with original whitespace"""

    leading_ws: str = ""
    """Whitespace before text"""

    trailing_ws: str = ""
    """Whitespace after text"""

    sent_idx: int = 0
    """Zero-based index of sentence in paragraph"""

    par_idx: int = 0
    """Zero-based index of paragraph in document"""

    lang: str = ""
    """Language code"""

    voice: str = ""
    """Voice (from SSML)"""

    pos: typing.Optional[str] = None
    """Part of speech (None if not set)"""

    phonemes: typing.Optional[typing.Sequence[str]] = None
    """List of phonemes (None if not set)"""

    is_major_break: bool = False
    """True if word is a major break (separates sentences)"""

    is_minor_break: bool = False
    """True if word is a minor break (separates phrases)"""

    is_punctuation: bool = False
    """True if word is punctuation that surrounds a spoken word (quotes, etc.)"""

    is_break: typing.Optional[bool] = None
    """True if major or minor break"""

    is_spoken: typing.Optional[bool] = None
    """True if word is something that would be spoken during reading (not punctuation or break)"""

    pause_before_ms: int = 0
    """Milliseconds to pause before this word"""

    pause_after_ms: int = 0
    """Milliseconds to pause after this word"""

    marks_before: typing.Optional[typing.List[str]] = None
    """User-defined marks that occur before this word"""

    marks_after: typing.Optional[typing.List[str]] = None
    """User-defined marks that occur after this word"""

    def __post_init__(self):
        if self.is_break is None:
            self.is_break = self.is_major_break or self.is_minor_break

        if self.is_spoken is None:
            self.is_spoken = not (self.is_punctuation or self.is_break)

        self.leading_ws, self.trailing_ws = default_get_whitespace(self.text_with_ws)
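
    # Illustrative construction; derived fields are filled in by __post_init__:
    #     w = Word(idx=0, text="hello", text_with_ws="hello ")
    #     w.is_spoken    # -> True
    #     w.trailing_ws  # -> " "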


@dataclass
class Sentence:
    """Processed sentence from a document"""

    idx: int
    """Zero-based index of sentence in paragraph"""

    text: str
    """Text with normalized whitespace"""

    text_with_ws: str
    """Text with original whitespace"""

    text_spoken: str
    """Text with only spoken words and normalized whitespace"""

    par_idx: int = 0
    """Zero-based index of paragraph in document"""

    lang: str = ""
    """Language code"""

    voice: str = ""
    """Voice (from SSML)"""

    words: typing.List[Word] = field(default_factory=list)
    """Words in the sentence"""

    pause_before_ms: int = 0
    """Milliseconds to pause before this sentence"""

    pause_after_ms: int = 0
    """Milliseconds to pause after this sentence"""

    marks_before: typing.Optional[typing.List[str]] = None
    """User-defined marks that occur before this sentence"""

    marks_after: typing.Optional[typing.List[str]] = None
    """User-defined marks that occur after this sentence"""

    def __iter__(self):
        """Iterates over words"""
        return iter(self.words)

    def __len__(self):
        """Number of words"""
        return len(self.words)

    def __getitem__(self, key):
        """Gets word by index"""
        return self.words[key]
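
    # Illustrative: a Sentence acts like a read-only sequence of Word objects.
    #     s = Sentence(idx=0, text="hi", text_with_ws="hi ", text_spoken="hi")
    #     len(s)   # -> 0 (no words attached yet)
    #     list(s)  # -> []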


# -----------------------------------------------------------------------------


class LookupPhonemes:
    """Look up phonemes for word/role in a lexicon"""

    def __call__(
        self, word: str, role: typing.Optional[str] = None, do_transforms: bool = True
    ) -> typing.Optional[PHONEMES_TYPE]:
        pass


class GuessPhonemes:
    """Guess phonemes for word/role"""

    def __call__(
        self, word: str, role: typing.Optional[str] = None
    ) -> typing.Optional[PHONEMES_TYPE]:
        pass


class GetPartsOfSpeech:
    """Get part of speech tags for words"""

    def __call__(self, words: typing.Sequence[str]) -> typing.Sequence[str]:
        pass


class PostProcessSentence:
    """Post-process each sentence node after tokenization/phonemization"""

    def __call__(
        self, graph: GraphType, sentence_node: SentenceNode, settings: typing.Any,
    ):
        pass


@dataclass
class EndElement:
    """Wrapper for end of an XML element (used in TextProcessor)"""

    element: etree.Element


# -----------------------------------------------------------------------------


def has_digit(s: str) -> bool:
    """True if string contains at least one digit"""
    return HAS_DIGIT_PATTERN.search(s) is not None
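
# Illustrative values:
#     has_digit("123rd")  # -> True
#     has_digit("abc")    # -> False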


DEFAULT_WORD_PATTERN = re.compile(r"(\s*\S+(?:\s+|$))")


def default_split_words(s: str) -> typing.Iterable[str]:
    """Split text on whitespace"""
    yield from filter(None, DEFAULT_WORD_PATTERN.findall(s))
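
# Illustrative value; trailing whitespace stays attached to each word so the
# original text can be reconstructed later:
#     list(default_split_words("hello  world"))  # -> ["hello  ", "world"]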


def default_get_whitespace(s: str) -> typing.Tuple[str, str]:
    """Returns leading and trailing whitespace of a string"""
    leading_ws, trailing_ws = "", ""
    match = SURROUNDING_WHITESPACE_PATTERN.match(s)
    if match is not None:
        leading_ws, trailing_ws = match.groups()

    return leading_ws, trailing_ws
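
# Illustrative value:
#     default_get_whitespace("  hello ")  # -> ("  ", " ")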


def default_normalize_whitespace(s: str) -> str:
    """Replace multiple spaces with single space"""
    return NORMALIZE_WHITESPACE_PATTERN.sub(" ", s.strip())
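
# Illustrative value:
#     default_normalize_whitespace("  hello   world ")  # -> "hello world"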


def maybe_compile_regex(
    str_or_pattern: typing.Union[str, REGEX_PATTERN]
) -> REGEX_PATTERN:
    """Compile regex pattern if it's a string"""
    if isinstance(str_or_pattern, REGEX_PATTERN):
        return str_or_pattern

    assert isinstance(str_or_pattern, str)

    return re.compile(str_or_pattern)
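
# Illustrative values: strings are compiled, pre-compiled patterns pass through.
#     maybe_compile_regex(r"\d+").match("42") is not None  # -> True
#     pattern = re.compile(r"\d+")
#     maybe_compile_regex(pattern) is pattern              # -> True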


# -----------------------------------------------------------------------------


@dataclass
class TextProcessorSettings:
    """Language specific settings for text processing"""

    lang: str
    """Language code that these settings apply to (e.g., en_US)"""

    # Whitespace/tokenization
    split_words: typing.Callable[[str], typing.Iterable[str]] = default_split_words
    """Split text into words and separators"""

    join_str: str = " "
    """String used to combine text from words"""

    keep_whitespace: bool = True
    """True if original whitespace should be retained"""

    is_non_word: typing.Optional[typing.Callable[[str], bool]] = None
    """Returns true if text is not a word (and should be ignored in final output)"""

    get_whitespace: typing.Callable[
        [str], typing.Tuple[str, str]
    ] = default_get_whitespace
    """Returns leading, trailing whitespace from a string"""

    normalize_whitespace: typing.Callable[[str], str] = default_normalize_whitespace
    """Normalizes whitespace in a string"""

    # Punctuations
    begin_punctuations: typing.Optional[typing.Set[str]] = None
    """Strings that should be split off from the beginning of a word"""

    begin_punctuations_pattern: typing.Optional[REGEX_TYPE] = None
    """Regex that overrides begin_punctuations"""

    end_punctuations: typing.Optional[typing.Set[str]] = None
    """Strings that should be split off from the end of a word"""

    end_punctuations_pattern: typing.Optional[REGEX_TYPE] = None
    """Regex that overrides end_punctuations"""

    # Replacements/abbreviations
    replacements: typing.Sequence[typing.Tuple[REGEX_TYPE, str]] = field(
        default_factory=list
    )
    """Regex, replacement template pairs that are applied in order right after tokenization on each word"""

    abbreviations: typing.Dict[REGEX_TYPE, str] = field(default_factory=dict)
    """Regex, replacement template pairs that may expand words after minor breaks are matched"""

    spell_out_words: typing.Dict[str, str] = field(default_factory=dict)
    """Written form, spoken form pairs that are applied with interpret-as="spell-out" in <say-as>"""

    # Breaks
    major_breaks: typing.Set[str] = field(default_factory=set)
    """Set of strings that occur at the end of a word and should break apart sentences."""

    major_breaks_pattern: typing.Optional[REGEX_TYPE] = None
    """Regex that overrides major_breaks"""

    minor_breaks: typing.Set[str] = field(default_factory=set)
    """Set of strings that occur at the end of a word and should break apart phrases."""

    minor_breaks_pattern: typing.Optional[REGEX_TYPE] = None
    """Regex that overrides minor_breaks"""

    word_breaks: typing.Set[str] = field(default_factory=set)
    word_breaks_pattern: typing.Optional[REGEX_TYPE] = None
    """Regex that overrides word_breaks"""

    # Numbers
    is_maybe_number: typing.Optional[typing.Callable[[str], bool]] = has_digit
    """True if a word may be a number (parsing will be attempted)"""

    get_ordinal: typing.Optional[typing.Callable[[str], typing.Optional[int]]] = None
    """Returns integer value of an ordinal string (e.g., 1st -> 1) or None if not an ordinal"""

    babel_locale: typing.Optional[str] = None
    """Locale used to parse numbers/dates/currencies (defaults to lang)"""

    num2words_lang: typing.Optional[str] = None
    """Language used to verbalize numbers (defaults to lang)"""

    # Currency
    default_currency: str = "USD"
    """Currency name to use when interpret-as="currency" but no currency symbol is present"""

    currencies: typing.MutableMapping[str, str] = field(default_factory=dict)
    """Mapping from currency symbol ($) to currency name (USD)"""

    currency_symbols: typing.Sequence[str] = field(default_factory=list)
    """Ordered list of currency symbols (decreasing length)"""

    is_maybe_currency: typing.Optional[typing.Callable[[str], bool]] = has_digit
    """True if a word may be an amount of currency (parsing will be attempted)"""

    # Dates
    dateparser_lang: typing.Optional[str] = None
    """Language used to parse dates (defaults to lang)"""

    is_maybe_date: typing.Optional[typing.Callable[[str], bool]] = has_digit
    """True if a word may be a date (parsing will be attempted)"""

    default_date_format: typing.Union[
        str, InterpretAsFormat
    ] = InterpretAsFormat.DATE_MDY_ORDINAL
    """Format used to verbalize a date unless set with the format attribute of <say-as>"""

    # Times
    is_maybe_time: typing.Optional[typing.Callable[[str], bool]] = has_digit
    """True if a word may be a clock time (parsing will be attempted)"""

    parse_time: typing.Optional[typing.Callable[[str], typing.Optional[Time]]] = None
    """Parse word text into a Time object or None"""

    verbalize_time: typing.Optional[
        typing.Callable[[Time], typing.Iterable[str]]
    ] = None
    """Convert Time to words"""

    # Part of speech (pos) tagging
    get_parts_of_speech: typing.Optional[GetPartsOfSpeech] = None
    """Optional function to get part of speech for a word"""

    # Initialisms (e.g., TTS or T.T.S.)
    is_initialism: typing.Optional[typing.Callable[[str], bool]] = None
    """True if a word is an initialism (will be split with split_initialism)"""

    split_initialism: typing.Optional[
        typing.Callable[[str], typing.Sequence[str]]
    ] = None
    """Function to break apart an initialism into multiple words (called if is_initialism is True)"""

    # Phonemization
    lookup_phonemes: typing.Optional[LookupPhonemes] = None
    """Optional function to look up phonemes for a word/role (without guessing)"""

    guess_phonemes: typing.Optional[GuessPhonemes] = None
    """Optional function to guess phonemes for a word/role"""

    # Pre/post-processing
    pre_process_text: typing.Optional[typing.Callable[[str], str]] = None
    """Optional function to process text during tokenization"""

    post_process_sentence: typing.Optional[PostProcessSentence] = None
    """Optional function to post-process each sentence in the graph before post_process_graph"""

    def __post_init__(self):
        # Languages/locales
        if self.babel_locale is None:
            if "-" in self.lang:
                # en-us -> en_US
                lang_parts = self.lang.split("-", maxsplit=1)
                self.babel_locale = "_".join(
                    [lang_parts[0].lower(), lang_parts[1].upper()]
                )
            else:
                self.babel_locale = self.lang

        if self.num2words_lang is None:
            self.num2words_lang = self.babel_locale

        if self.dateparser_lang is None:
            # en_US -> en
            self.dateparser_lang = self.babel_locale.split("_")[0]

        # Pre-compiled regular expressions
        self.replacements = [
            (maybe_compile_regex(pattern), template)
            for pattern, template in self.replacements
        ]

        compiled_abbreviations = {}
        for pattern, template in self.abbreviations.items():
            if isinstance(pattern, str):
                if not pattern.endswith("$") and self.major_breaks:
                    # Automatically add optional major break at the end
                    break_pattern_str = "|".join(
                        re.escape(b) for b in self.major_breaks
                    )
                    pattern = (
                        f"{pattern}(?P<break>{break_pattern_str})?(?P<whitespace>\\s*)$"
                    )
                    template += r"\g<break>\g<whitespace>"

                pattern = re.compile(pattern)

            compiled_abbreviations[pattern] = template

        self.abbreviations = compiled_abbreviations
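
        # Illustrative effect of the loop above, assuming major_breaks={"."}:
        # a string key such as r"^dr" is compiled to roughly
        # r"^dr(?P<break>\.)?(?P<whitespace>\s*)$", and its template gains
        # r"\g<break>\g<whitespace>" so any sentence break and trailing
        # whitespace survive the expansion.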

        # Strings that should be separated from words, but do not cause any breaks
        if (self.begin_punctuations_pattern is None) and self.begin_punctuations:
            pattern_str = "|".join(re.escape(b) for b in self.begin_punctuations)

            # Match begin_punctuations only at the start of a word
            self.begin_punctuations_pattern = f"^({pattern_str})"

        if self.begin_punctuations_pattern is not None:
            self.begin_punctuations_pattern = maybe_compile_regex(
                self.begin_punctuations_pattern
            )

        if (self.end_punctuations_pattern is None) and self.end_punctuations:
            pattern_str = "|".join(re.escape(b) for b in self.end_punctuations)

            # Match end_punctuations only at the end of a word
            self.end_punctuations_pattern = f"({pattern_str})$"

        if self.end_punctuations_pattern is not None:
            self.end_punctuations_pattern = maybe_compile_regex(
                self.end_punctuations_pattern
            )

        # Major breaks (split sentences)
        if (self.major_breaks_pattern is None) and self.major_breaks:
            pattern_str = "|".join(re.escape(b) for b in self.major_breaks)

            # Match a major break followed by whitespace or the end of the text
            # Allow for multiple punctuation symbols (e.g., !?)
            self.major_breaks_pattern = f"((?:{pattern_str})+(?:\\s+|$))"

        if self.major_breaks_pattern is not None:
            self.major_breaks_pattern = maybe_compile_regex(self.major_breaks_pattern)

        # Minor breaks (don't split sentences)
        if (self.minor_breaks_pattern is None) and self.minor_breaks:
            pattern_str = "|".join(re.escape(b) for b in self.minor_breaks)

            # Match a minor break followed by whitespace or the end of the text
            self.minor_breaks_pattern = f"((?:{pattern_str})(?:\\s+|$))"

        if self.minor_breaks_pattern is not None:
            self.minor_breaks_pattern = maybe_compile_regex(self.minor_breaks_pattern)

        # Word breaks (break words apart into multiple words)
        if (self.word_breaks_pattern is None) and self.word_breaks:
            pattern_str = "|".join(re.escape(b) for b in self.word_breaks)
            self.word_breaks_pattern = f"(?:{pattern_str})"

        if self.word_breaks_pattern is not None:
            self.word_breaks_pattern = maybe_compile_regex(self.word_breaks_pattern)

        # Currency
        if not self.currencies:
            try:
                # Look up currencies for locale
                locale_obj = babel.Locale(self.babel_locale)

                # $ -> USD
                self.currencies = {
                    babel.numbers.get_currency_symbol(cn): cn
                    for cn in locale_obj.currency_symbols
                }
            except Exception:
                # No automatic currencies
                pass

        if not self.currency_symbols:
            # Currency symbols (e.g., "$") by decreasing length
            self.currency_symbols = sorted(
                self.currencies, key=operator.length_hint, reverse=True
            )
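
# Illustrative construction (values here are assumptions, not a shipped profile):
#     settings = TextProcessorSettings(
#         lang="en-us",
#         major_breaks={".", "!", "?"},
#         minor_breaks={",", ";"},
#     )
#     settings.babel_locale           # -> "en_US" (derived in __post_init__)
#     settings.dateparser_lang        # -> "en"
#     settings.major_breaks_pattern   # -> compiled regex built from major_breaks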


# -----------------------------------------------------------------------------