ai-content-maker/.venv/Lib/site-packages/gruut/const.py

853 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Shared classes, types, and enums"""
import itertools
import operator
import re
import typing
import xml.etree.ElementTree as etree
from dataclasses import dataclass, field
from datetime import datetime
from decimal import Decimal
from enum import Enum
import babel
import babel.core
import babel.numbers
# Short/alternate language code -> language name used for lookups
LANG_ALIASES = {
    "ar": "ar",
    "cs": "cs-cz",
    "de": "de-de",
    "en": "en-us",
    "en-gb": "en-us",
    "es": "es-es",
    "es-mx": "es-es",
    "fa": "fa",
    "fr": "fr-fr",
    "it": "it-it",
    "lb-lb": "lb",
    "nl": "nl",
    "nl-nl": "nl",
    "pt-br": "pt",
    "ru": "ru-ru",
    "sv": "sv-se",
    "sw": "sw",
    "zh": "zh-cn",
}

ENGLISH_LANGS = {"en-us", "en-gb"}

# Languages that are expected to have a model directory:
# every English variant plus every alias target.
KNOWN_LANGS = ENGLISH_LANGS.union(LANG_ALIASES.values())
try:
    # re.Pattern / re.Match are available starting with Python 3.7
    REGEX_PATTERN = re.Pattern  # type: ignore
    REGEX_MATCH = re.Match  # type: ignore
    REGEX_TYPE = typing.Union[str, re.Pattern]  # type: ignore
except AttributeError:
    # Python 3.6 fallback: use the aliases from the typing module instead
    REGEX_PATTERN = typing.Pattern  # type: ignore
    REGEX_MATCH = typing.Match  # type: ignore
    REGEX_TYPE = typing.Union[str, typing.Pattern]  # type: ignore
# Sequence of phoneme strings for one word
PHONEMES_TYPE = typing.Sequence[str]

# Identifier type of a node in a text graph
NODE_TYPE = int

# Name of the node property that holds node data in a text graph
DATA_PROP = "data"
class GraphType:
    """Typing stand-in for a networkx graph.

    Every method is an empty stub (returns None); the class only exists so
    that graph-using code can be annotated.
    """

    # Node id -> node data mapping of the graph
    nodes: typing.Dict[NODE_TYPE, typing.Dict[typing.Any, typing.Any]]

    def add_node(self, node: NODE_TYPE, **kwargs):
        """Insert a new node into the graph"""

    def add_edge(self, src: NODE_TYPE, dst: NODE_TYPE):
        """Insert a new edge from src to dst"""

    def out_degree(self, node: NODE_TYPE) -> int:
        """Count the outgoing edges of a node"""

    def successors(self, node: NODE_TYPE) -> typing.Iterable[NODE_TYPE]:
        """Iterate over nodes reached by outgoing edges"""

    def predecessors(self, node: NODE_TYPE) -> typing.Iterable[NODE_TYPE]:
        """Iterate over nodes that have an edge into this node"""

    def out_edges(
        self, node: NODE_TYPE
    ) -> typing.Iterable[typing.Tuple[NODE_TYPE, NODE_TYPE]]:
        """Iterate over the outgoing edges of a node"""

    def add_edges_from(
        self, edges: typing.Iterable[typing.Tuple[NODE_TYPE, NODE_TYPE]]
    ):
        """Insert every edge from an iterable"""

    def remove_edges_from(
        self, edges: typing.Iterable[typing.Tuple[NODE_TYPE, NODE_TYPE]]
    ):
        """Delete every edge from an iterable"""

    def __len__(self) -> int:
        """Count the nodes in the graph"""
# -----------------------------------------------------------------------------
# Runs of whitespace, captured so that re.split keeps the separators
DEFAULT_SPLIT_PATTERN = re.compile(r"(\s+)")

# Any run of whitespace
NORMALIZE_WHITESPACE_PATTERN = re.compile(r"\s+")

# One whitespace-free token with its leading/trailing whitespace captured
SURROUNDING_WHITESPACE_PATTERN = re.compile(r"^(\s*)\S+(\s*)$")

# A single ASCII digit
HAS_DIGIT_PATTERN = re.compile(r"[0-9]")
@dataclass
class Time:
    """Clock time that was parsed out of text"""

    hours: int

    minutes: int = 0

    period: typing.Optional[str] = None
    """A.M. or P.M. (None when not given)"""
# -----------------------------------------------------------------------------
class InterpretAs(str, Enum):
    """Values accepted by the interpret-as attribute of <say-as>"""

    SPELL_OUT = "spell-out"
    """Spell the word out letter by letter (abc = a b c)"""

    DATE = "date"
    """Treat the word as a date"""

    NUMBER = "number"
    """Treat the word as a number"""

    CURRENCY = "currency"
    """Treat the word as an amount of currency"""

    TIME = "time"
    """Treat the word as a time on the clock"""

    WORD = "word"
    """Treat the word as a regular word"""
class InterpretAsFormat(str, Enum):
    """Values accepted by the format attribute of <say-as>"""

    NUMBER_CARDINAL = "cardinal"
    """Number in cardinal form (1 = one)"""

    NUMBER_ORDINAL = "ordinal"
    """Number in ordinal form (1 = first)"""

    NUMBER_DIGITS = "digits"
    """Number read digit by digit (12 = one two)"""

    NUMBER_YEAR = "year"
    """Number read as a year (2021 = twenty twenty-one)"""

    # Date formats are spelled with one letter per component:
    #   d = day
    #   m = month
    #   y = year
    #   o = ordinal day ("first" instead of "one")
    DATE_DMY = "dmy"
    DATE_MDY = "mdy"
    DATE_YMD = "ymd"
    DATE_DMY_ORDINAL = "omy"
    DATE_MDY_ORDINAL = "moy"
    DATE_YMD_ORDINAL = "ymo"
    DATE_YM = "ym"
    DATE_MY = "my"
    DATE_MD = "md"
    DATE_MD_ORDINAL = "mo"
    DATE_DM_ORDINAL = "om"
    DATE_Y = "y"
class BreakType(str, Enum):
    """Kinds of breaks found in text"""

    MINOR = "minor"
    """Separates phrases"""

    MAJOR = "major"
    """Separates sentences"""
class WordRole(str, Enum):
    """Role of a word, used to choose between pronunciations"""

    DEFAULT = ""
    """Pronounce the word the default way"""

    LETTER = "gruut:letter"
    """Pronounce the word as a letter (a = /eɪ/ instead of /ə/)"""
class SSMLParsingState(int, Enum):
    """Where the SSML parser currently is"""

    DEFAULT = 0

    IN_WORD = 1
    """Within <w> or <token>"""

    IN_LEXICON = 2
    """Within <lexicon>"""

    IN_LEXICON_GRAPHEME = 3
    """Within <lexicon><grapheme>..."""

    IN_LEXICON_PHONEME = 4
    """Within <lexicon><phoneme>..."""
@dataclass
class InlineLexicon:
    """Lexicon defined inline within SSML (not standards compliant)"""

    lexicon_id: str

    alphabet: str = ""

    # Nested mapping: word -> role -> [phoneme]
    words: typing.Dict[str, typing.Dict[str, PHONEMES_TYPE]] = field(
        default_factory=dict
    )
@dataclass
class Lexeme:
    """Single entry of an inline lexicon"""

    grapheme: str = ""

    phonemes: typing.Optional[PHONEMES_TYPE] = None

    roles: typing.Optional[typing.Set[str]] = None
@dataclass
class Node:
    """Common base for every node of the text processing graph"""

    node: NODE_TYPE

    element: typing.Optional[etree.Element] = None

    voice: str = ""

    lang: str = ""

    implicit: bool = False
@dataclass
class IgnoreNode(Node):
    """Marker node type: this node should be ignored"""
@dataclass
class BreakNode(Node):
    """Represents a user-specified break"""

    time: str = ""
    """Length of break in seconds (123s) or milliseconds (123ms)"""

    def get_milliseconds(self) -> int:
        """Get number of milliseconds from the time string.

        The numeric part may be fractional for either unit ("1.5ms",
        "0.25s"). The result is rounded to the nearest millisecond rather
        than truncated, so float representation error cannot drop a
        millisecond (e.g. "1.001s" is 1001, not 1000).

        Returns:
            Break length in milliseconds, or 0 when the time string does
            not end in "ms" or "s".

        Raises:
            ValueError: if the text before the unit is not a number.
        """
        duration = self.time.strip()

        # "ms" must be tested first because it also ends with "s"
        if duration.endswith("ms"):
            return int(round(float(duration[:-2])))

        if duration.endswith("s"):
            return int(round(float(duration[:-1]) * 1000))

        return 0
@dataclass
class MarkNode(Node):
    """Mark placed in the text by the user"""

    name: str = ""
    """Mark name"""
@dataclass
class WordNode(Node):
    """Node for one word of text"""

    text: str = ""
    text_with_ws: str = ""

    # Values of the <say-as> attributes, when set
    interpret_as: typing.Union[str, InterpretAs] = ""
    format: typing.Union[str, InterpretAsFormat] = ""

    # Parsed values
    number: typing.Optional[Decimal] = None
    date: typing.Optional[datetime] = None
    currency_symbol: typing.Optional[str] = None
    currency_name: typing.Optional[str] = None
    time: typing.Optional[Time] = None

    # Pronunciation details
    role: typing.Union[str, WordRole] = WordRole.DEFAULT
    pos: typing.Optional[str] = None
    phonemes: typing.Optional[typing.Sequence[str]] = None

    in_lexicon: typing.Optional[bool] = None
    lexicon_ids: typing.Optional[typing.Sequence[str]] = None

    # Each "maybe" flag starts out True until proven otherwise
    is_maybe_number: bool = True
    is_maybe_date: bool = True
    is_maybe_currency: bool = True
    is_maybe_time: bool = True

    is_from_broken_word: bool = False
@dataclass
class BreakWordNode(Node):
    """Node for a major or minor break found in the text"""

    break_type: typing.Union[str, BreakType] = ""

    text: str = ""

    text_with_ws: str = ""
@dataclass
class PunctuationWordNode(Node):
    """Node for a punctuation marker found in the text"""

    text: str = ""

    text_with_ws: str = ""
@dataclass
class SentenceNode(Node):
    """Node for a sentence; WordNodes sit under it"""
@dataclass
class ParagraphNode(Node):
    """Node for a paragraph; SentenceNodes sit under it"""
@dataclass
class SpeakNode(Node):
    """Root node of an SSML document"""
# -----------------------------------------------------------------------------
@dataclass
class Word:
    """A processed word belonging to a Sentence"""

    idx: int
    """Position of the word within its sentence (zero-based)"""

    text: str
    """Word text, whitespace normalized"""

    text_with_ws: str
    """Word text, original whitespace kept"""

    leading_ws: str = ""
    """Whitespace in front of the text"""

    trailing_ws: str = ""
    """Whitespace behind the text"""

    sent_idx: int = 0
    """Position of the sentence within its paragraph (zero-based)"""

    par_idx: int = 0
    """Position of the paragraph within the document (zero-based)"""

    lang: str = ""
    """Language code"""

    voice: str = ""
    """Voice (from SSML)"""

    pos: typing.Optional[str] = None
    """Part of speech, or None when not set"""

    phonemes: typing.Optional[typing.Sequence[str]] = None
    """Phonemes of the word, or None when not set"""

    is_major_break: bool = False
    """True when the word is a major break (separates sentences)"""

    is_minor_break: bool = False
    """True when the word is a minor break (separates phrases)"""

    is_punctuation: bool = False
    """True when the word is punctuation surrounding a spoken word (quotes, etc.)"""

    is_break: typing.Optional[bool] = None
    """True when the word is a major or minor break"""

    is_spoken: typing.Optional[bool] = None
    """True when the word would be spoken during reading (not punctuation or break)"""

    pause_before_ms: int = 0
    """Pause ahead of this word, in milliseconds"""

    pause_after_ms: int = 0
    """Pause following this word, in milliseconds"""

    marks_before: typing.Optional[typing.List[str]] = None
    """User-defined marks located before this word"""

    marks_after: typing.Optional[typing.List[str]] = None
    """User-defined marks located after this word"""

    def __post_init__(self):
        """Fill in derived flags and the surrounding whitespace fields."""
        # Only derive the flags that the caller left unset
        if self.is_break is None:
            self.is_break = self.is_major_break or self.is_minor_break

        if self.is_spoken is None:
            self.is_spoken = (not self.is_punctuation) and (not self.is_break)

        # Whitespace fields are always recomputed from text_with_ws
        ws_before, ws_after = default_get_whitespace(self.text_with_ws)
        self.leading_ws = ws_before
        self.trailing_ws = ws_after
@dataclass
class Sentence:
    """A processed sentence belonging to a document"""

    idx: int
    """Position of the sentence within its paragraph (zero-based)"""

    text: str
    """Sentence text, whitespace normalized"""

    text_with_ws: str
    """Sentence text, original whitespace kept"""

    text_spoken: str
    """Sentence text restricted to spoken words, whitespace normalized"""

    par_idx: int = 0
    """Position of the paragraph within the document (zero-based)"""

    lang: str = ""
    """Language code"""

    voice: str = ""
    """Voice (from SSML)"""

    words: typing.List[Word] = field(default_factory=list)
    """Words that make up the sentence"""

    pause_before_ms: int = 0
    """Pause ahead of this sentence, in milliseconds"""

    pause_after_ms: int = 0
    """Pause following this sentence, in milliseconds"""

    marks_before: typing.Optional[typing.List[str]] = None
    """User-defined marks located before this sentence"""

    marks_after: typing.Optional[typing.List[str]] = None
    """User-defined marks located after this sentence"""

    def __iter__(self):
        """Iterate over the words of the sentence"""
        word_list = self.words
        return iter(word_list)

    def __len__(self):
        """Count the words of the sentence"""
        word_list = self.words
        return len(word_list)

    def __getitem__(self, key):
        """Look up a word by index (delegates to the words list)"""
        word_list = self.words
        return word_list[key]
# -----------------------------------------------------------------------------
class LookupPhonemes:
    """Callable signature: look up phonemes for a word/role in a lexicon"""

    def __call__(
        self, word: str, role: typing.Optional[str] = None, do_transforms: bool = True
    ) -> typing.Optional[PHONEMES_TYPE]:
        # Stub: this base implementation finds nothing
        return None
class GuessPhonemes:
    """Callable signature: guess phonemes for a word/role"""

    def __call__(
        self, word: str, role: typing.Optional[str] = None
    ) -> typing.Optional[PHONEMES_TYPE]:
        # Stub: this base implementation guesses nothing
        return None
class GetPartsOfSpeech:
    """Callable signature: get part of speech tags for words"""

    def __call__(self, words: typing.Sequence[str]) -> typing.Sequence[str]:
        # Stub: this base implementation produces no tags
        return None
class PostProcessSentence:
    """Callable signature: post-process a sentence node after tokenization/phonemization"""

    def __call__(
        self,
        graph: GraphType,
        sentence_node: SentenceNode,
        settings: typing.Any,
    ):
        # Stub: this base implementation does nothing
        return None
@dataclass
class EndElement:
    """Marks the end of an XML element (used in TextProcessor)"""

    element: etree.Element
# -----------------------------------------------------------------------------
def has_digit(s: str) -> bool:
    """Report whether the string holds one or more digits"""
    found = HAS_DIGIT_PATTERN.search(s)
    return found is not None
# A run of non-whitespace with optional leading whitespace, followed by
# whitespace or the end of the text
DEFAULT_WORD_PATTERN = re.compile(r"(\s*\S+(?:\s+|$))")


def default_split_words(s: str) -> typing.Iterable[str]:
    """Yield whitespace-delimited words, each keeping its adjacent whitespace"""
    for chunk in DEFAULT_WORD_PATTERN.findall(s):
        # Skip empty matches
        if chunk:
            yield chunk
def default_get_whitespace(s: str) -> typing.Tuple[str, str]:
    """Return the (leading, trailing) whitespace of a string.

    Both parts are empty when the string does not match
    SURROUNDING_WHITESPACE_PATTERN.
    """
    found = SURROUNDING_WHITESPACE_PATTERN.match(s)
    if found is None:
        return "", ""

    before, after = found.groups()
    return before, after
def default_normalize_whitespace(s: str) -> str:
    """Trim the string and collapse each whitespace run into one space"""
    trimmed = s.strip()
    return NORMALIZE_WHITESPACE_PATTERN.sub(" ", trimmed)
def maybe_compile_regex(
    str_or_pattern: typing.Union[str, REGEX_PATTERN]
) -> REGEX_PATTERN:
    """Return a compiled pattern, compiling the argument when it is a string"""
    if not isinstance(str_or_pattern, REGEX_PATTERN):
        # Anything that is not already compiled must be a pattern string
        assert isinstance(str_or_pattern, str)
        return re.compile(str_or_pattern)

    # Already compiled: hand it back untouched
    return str_or_pattern
# -----------------------------------------------------------------------------
@dataclass
class TextProcessorSettings:
    """Language specific settings for text processing.

    After construction, ``__post_init__`` fills in the locale-related
    defaults, compiles every regex-valued field, and derives the currency
    tables when they were not supplied.
    """

    lang: str
    """Language code that these settings apply to (e.g., en_US)"""

    # Whitespace/tokenization
    split_words: typing.Callable[[str], typing.Iterable[str]] = default_split_words
    """Split text into words and separators"""

    join_str: str = " "
    """String used to combine text from words"""

    keep_whitespace: bool = True
    """True if original whitespace should be retained"""

    is_non_word: typing.Optional[typing.Callable[[str], bool]] = None
    """Returns true if text is not a word (and should be ignored in final output)"""

    get_whitespace: typing.Callable[
        [str], typing.Tuple[str, str]
    ] = default_get_whitespace
    """Returns leading, trailing whitespace from a string"""

    normalize_whitespace: typing.Callable[[str], str] = default_normalize_whitespace
    """Normalizes whitespace in a string"""

    # Punctuations
    begin_punctuations: typing.Optional[typing.Set[str]] = None
    """Strings that should be split off from the beginning of a word"""

    begin_punctuations_pattern: typing.Optional[REGEX_TYPE] = None
    """Regex that overrides begin_punctuations"""

    end_punctuations: typing.Optional[typing.Set[str]] = None
    """Strings that should be split off from the end of a word"""

    end_punctuations_pattern: typing.Optional[REGEX_TYPE] = None
    """Regex that overrides end_punctuations"""

    # Replacements/abbreviations
    replacements: typing.Sequence[typing.Tuple[REGEX_TYPE, str]] = field(
        default_factory=list
    )
    """Regex, replacement template pairs that are applied in order right after tokenization on each word"""

    abbreviations: typing.Dict[REGEX_TYPE, str] = field(default_factory=dict)
    """Regex, replacement template pairs that may expand words after minor breaks are matched"""

    spell_out_words: typing.Dict[str, str] = field(default_factory=dict)
    """Written form, spoken form pairs that are applied with interpret-as="spell-out" in <say-as>"""

    # Breaks
    major_breaks: typing.Set[str] = field(default_factory=set)
    """Set of strings that occur at the end of a word and should break apart sentences."""

    major_breaks_pattern: typing.Optional[REGEX_TYPE] = None
    """Regex that overrides major_breaks"""

    minor_breaks: typing.Set[str] = field(default_factory=set)
    """Set of strings that occur at the end of a word and should break apart phrases."""

    minor_breaks_pattern: typing.Optional[REGEX_TYPE] = None
    """Regex that overrides minor_breaks"""

    word_breaks: typing.Set[str] = field(default_factory=set)
    """Set of strings that break a word apart into multiple words"""

    word_breaks_pattern: typing.Optional[REGEX_TYPE] = None
    """Regex that overrides word_breaks"""

    # Numbers
    is_maybe_number: typing.Optional[typing.Callable[[str], bool]] = has_digit
    """True if a word may be a number (parsing will be attempted)"""

    get_ordinal: typing.Optional[typing.Callable[[str], typing.Optional[int]]] = None
    """Returns integer value of an ordinal string (e.g., 1st -> 1) or None if not an ordinal"""

    babel_locale: typing.Optional[str] = None
    """Locale used to parse numbers/dates/currencies (defaults to lang)"""

    num2words_lang: typing.Optional[str] = None
    """Language used to verbalize numbers (defaults to lang)"""

    # Currency
    default_currency: str = "USD"
    """Currency name to use when interpret-as="currency" but no currency symbol is present"""

    currencies: typing.MutableMapping[str, str] = field(default_factory=dict)
    """Mapping from currency symbol ($) to currency name (USD)"""

    currency_symbols: typing.Sequence[str] = field(default_factory=list)
    """Ordered list of currency symbols (decreasing length)"""

    is_maybe_currency: typing.Optional[typing.Callable[[str], bool]] = has_digit
    """True if a word may be an amount of currency (parsing will be attempted)"""

    # Dates
    dateparser_lang: typing.Optional[str] = None
    """Language used to parse dates (defaults to lang)"""

    is_maybe_date: typing.Optional[typing.Callable[[str], bool]] = has_digit
    """True if a word may be a date (parsing will be attempted)"""

    default_date_format: typing.Union[
        str, InterpretAsFormat
    ] = InterpretAsFormat.DATE_MDY_ORDINAL
    """Format used to verbalize a date unless set with the format attribute of <say-as>"""

    # Times
    is_maybe_time: typing.Optional[typing.Callable[[str], bool]] = has_digit
    """True if a word may be a clock time (parsing will be attempted)"""

    parse_time: typing.Optional[typing.Callable[[str], typing.Optional[Time]]] = None
    """Parse word text into a Time object or None"""

    verbalize_time: typing.Optional[
        typing.Callable[[Time], typing.Iterable[str]]
    ] = None
    """Convert Time to words"""

    # Part of speech (pos) tagging
    get_parts_of_speech: typing.Optional[GetPartsOfSpeech] = None
    """Optional function to get part of speech for a word"""

    # Initialisms (e.g, TTS or T.T.S.)
    is_initialism: typing.Optional[typing.Callable[[str], bool]] = None
    """True if a word is an initialism (will be split with split_initialism)"""

    split_initialism: typing.Optional[
        typing.Callable[[str], typing.Sequence[str]]
    ] = None
    """Function to break apart an initialism into multiple words (called if is_initialism is True)"""

    # Phonemization
    lookup_phonemes: typing.Optional[LookupPhonemes] = None
    """Optional function to look up phonemes for a word/role (without guessing)"""

    guess_phonemes: typing.Optional[GuessPhonemes] = None
    """Optional function to guess phonemes for a word/role"""

    # Pre/post-processing
    pre_process_text: typing.Optional[typing.Callable[[str], str]] = None
    """Optional function to process text during tokenization"""

    post_process_sentence: typing.Optional[PostProcessSentence] = None
    """Optional function to post-process each sentence in the graph before post_process_graph"""

    @staticmethod
    def _alternation(strings: typing.Iterable[str]) -> str:
        """Join literal strings into a regex alternation, longest first.

        Regex alternation tries alternatives left to right, so a short
        string listed before a longer one that starts with it would shadow
        the longer one in unanchored patterns. The inputs are usually sets,
        whose iteration order is arbitrary; sorting (by decreasing length,
        then lexically) makes the resulting pattern deterministic and
        prefers the longest alternative.
        """
        ordered = sorted(strings, key=lambda text: (-len(text), text))
        return "|".join(re.escape(text) for text in ordered)

    def __post_init__(self):
        """Resolve defaults and pre-compile all regex-valued settings."""
        # Languages/locales
        if self.babel_locale is None:
            if "-" in self.lang:
                # en-us -> en_US
                lang_parts = self.lang.split("-", maxsplit=1)
                self.babel_locale = "_".join(
                    [lang_parts[0].lower(), lang_parts[1].upper()]
                )
            else:
                self.babel_locale = self.lang

        if self.num2words_lang is None:
            self.num2words_lang = self.babel_locale

        if self.dateparser_lang is None:
            # en_US -> en
            self.dateparser_lang = self.babel_locale.split("_")[0]

        # Pre-compiled regular expressions
        self.replacements = [
            (maybe_compile_regex(pattern), template)
            for pattern, template in self.replacements
        ]

        # The major-break alternation is the same for every abbreviation,
        # so build it once instead of once per pattern.
        has_major_breaks = bool(self.major_breaks)
        break_pattern_str = (
            self._alternation(self.major_breaks) if has_major_breaks else ""
        )

        compiled_abbreviations = {}
        for pattern, template in self.abbreviations.items():
            if isinstance(pattern, str):
                if not pattern.endswith("$") and has_major_breaks:
                    # Automatically add optional major break at the end
                    pattern = (
                        f"{pattern}(?P<break>{break_pattern_str})?(?P<whitespace>\\s*)$"
                    )
                    # Carry the matched break/whitespace over to the output
                    template += r"\g<break>\g<whitespace>"

                pattern = re.compile(pattern)

            compiled_abbreviations[pattern] = template

        self.abbreviations = compiled_abbreviations

        # Strings that should be separated from words, but do not cause any breaks
        if (self.begin_punctuations_pattern is None) and self.begin_punctuations:
            pattern_str = self._alternation(self.begin_punctuations)

            # Match begin_punctuations only at start a word
            self.begin_punctuations_pattern = f"^({pattern_str})"

        if self.begin_punctuations_pattern is not None:
            self.begin_punctuations_pattern = maybe_compile_regex(
                self.begin_punctuations_pattern
            )

        if (self.end_punctuations_pattern is None) and self.end_punctuations:
            pattern_str = self._alternation(self.end_punctuations)

            # Match end_punctuations only at end of a word
            self.end_punctuations_pattern = f"({pattern_str})$"

        if self.end_punctuations_pattern is not None:
            self.end_punctuations_pattern = maybe_compile_regex(
                self.end_punctuations_pattern
            )

        # Major breaks (split sentences)
        if (self.major_breaks_pattern is None) and self.major_breaks:
            pattern_str = self._alternation(self.major_breaks)

            # Match major break with either whitespace at the end or at the end of the text
            # Allow for multiple punctuation symbols (e.g., !?)
            self.major_breaks_pattern = f"((?:{pattern_str})+(?:\\s+|$))"

        if self.major_breaks_pattern is not None:
            self.major_breaks_pattern = maybe_compile_regex(self.major_breaks_pattern)

        # Minor breaks (don't split sentences)
        if (self.minor_breaks_pattern is None) and self.minor_breaks:
            pattern_str = self._alternation(self.minor_breaks)

            # Match minor break with either whitespace at the end or at the end of the text
            self.minor_breaks_pattern = f"((?:{pattern_str})(?:\\s+|$))"

        if self.minor_breaks_pattern is not None:
            self.minor_breaks_pattern = maybe_compile_regex(self.minor_breaks_pattern)

        # Word breaks (break words apart into multiple words)
        if (self.word_breaks_pattern is None) and self.word_breaks:
            pattern_str = self._alternation(self.word_breaks)
            self.word_breaks_pattern = f"(?:{pattern_str})"

        if self.word_breaks_pattern is not None:
            self.word_breaks_pattern = maybe_compile_regex(self.word_breaks_pattern)

        # Currency
        if not self.currencies:
            try:
                # Look up currencies for locale
                locale_obj = babel.Locale(self.babel_locale)

                # $ -> USD
                self.currencies = {
                    babel.numbers.get_currency_symbol(cn): cn
                    for cn in locale_obj.currency_symbols
                }
            except Exception:
                # Best effort: leave currencies empty when the locale
                # lookup fails (no automatic currencies)
                pass

        if not self.currency_symbols:
            # Currency symbols (e.g., "$") by decreasing length
            self.currency_symbols = sorted(self.currencies, key=len, reverse=True)
# -----------------------------------------------------------------------------