#!/usr/bin/env python3
"""Language-specific settings"""
import logging
import re
import sqlite3
import typing
from pathlib import Path
import networkx as nx
from gruut.const import PHONEMES_TYPE, GraphType, SentenceNode, Time
from gruut.g2p import GraphemesToPhonemes
from gruut.phonemize import SqlitePhonemizer
from gruut.pos import PartOfSpeechTagger
from gruut.text_processor import InterpretAsFormat, TextProcessorSettings
from gruut.utils import find_lang_dir, remove_non_word_chars, resolve_lang
_LOGGER = logging.getLogger("gruut")
# -----------------------------------------------------------------------------
def get_settings(
lang: str,
search_dirs: typing.Optional[typing.Iterable[typing.Union[str, Path]]] = None,
lang_dir: typing.Optional[typing.Union[str, Path]] = None,
model_prefix: typing.Optional[str] = None,
load_pos_tagger: bool = True,
load_phoneme_lexicon: bool = True,
load_g2p_guesser: bool = True,
**settings_args,
) -> TextProcessorSettings:
"""Get settings for a specific language"""
model_prefix = model_prefix or ""
# Resolve language
if model_prefix:
# espeak
lang_model_prefix = model_prefix
lang_only = lang
elif "/" in lang:
# en-us/espeak
lang_only, lang_model_prefix = lang.split("/", maxsplit=1)
else:
# en-us
lang_only = lang
lang_model_prefix = ""
# en_US -> en-us
lang_only = resolve_lang(lang_only)
if lang_dir is None:
# Search for language data files
lang_dir = find_lang_dir(lang_only, search_dirs=search_dirs)
if lang_dir is not None:
lang_dir = Path(lang_dir)
# Part of speech tagger
if load_pos_tagger and ("get_parts_of_speech" not in settings_args):
pos_model_path = lang_dir / "pos" / "model.crf"
if pos_model_path.is_file():
# POS tagger model will load on first use
settings_args["get_parts_of_speech"] = DelayedPartOfSpeechTagger(
pos_model_path
)
else:
_LOGGER.debug(
"(%s) no part of speech tagger found at %s", lang, pos_model_path,
)
# Phonemizer
if load_phoneme_lexicon and ("lookup_phonemes" not in settings_args):
lexicon_db_path = lang_dir / lang_model_prefix / "lexicon.db"
if lexicon_db_path.is_file():
# Transformations to apply to words when they can't be found in the lexicon
phonemizer_args = {
"word_transform_funcs": [
remove_non_word_chars,
lambda s: remove_non_word_chars(s.lower()),
],
"casing_func": str.lower,
}
settings_args["lookup_phonemes"] = DelayedSqlitePhonemizer(
lexicon_db_path, **phonemizer_args
)
else:
_LOGGER.debug(
"(%s) no phoneme lexicon database found at %s",
lang,
lexicon_db_path,
)
# Grapheme to phoneme model
if load_g2p_guesser and ("guess_phonemes" not in settings_args):
g2p_model_path = lang_dir / lang_model_prefix / "g2p" / "model.crf"
if g2p_model_path.is_file():
settings_args["guess_phonemes"] = DelayedGraphemesToPhonemes(
g2p_model_path, transform_func=str.lower
)
else:
_LOGGER.debug(
"(%s) no grapheme to phoneme CRF model found at %s",
lang,
g2p_model_path,
)
# ---------------------------------
# Create language-specific settings
# ---------------------------------
if lang_only == "ar":
# Arabic
return get_ar_settings(lang_dir, **settings_args)
if lang_only == "cs-cz":
# Czech
return get_cs_settings(lang_dir, **settings_args)
if lang_only in {"en-us", "en-gb"}:
# English
return get_en_us_settings(lang_dir, **settings_args)
if lang_only == "de-de":
# German
return get_de_settings(lang_dir, **settings_args)
if lang_only in {"es-es", "es-mx"}:
# Spanish
return get_es_settings(lang_dir, **settings_args)
if lang_only == "fa":
# Farsi
return get_fa_settings(lang_dir, **settings_args)
if lang_only == "fr-fr":
# French
return get_fr_settings(lang_dir, **settings_args)
if lang_only == "it-it":
# Italian
return get_it_settings(lang_dir, **settings_args)
if lang_only == "lb":
# Luxembourgish
return get_lb_settings(lang_dir, **settings_args)
if lang_only == "nl":
# Dutch
return get_nl_settings(lang_dir, **settings_args)
if lang_only == "pt":
# Portuguese
return get_pt_settings(lang_dir, **settings_args)
if lang_only == "ru-ru":
# Russian
return get_ru_settings(lang_dir, **settings_args)
if lang_only == "sv-se":
# Swedish
return get_sv_settings(lang_dir, **settings_args)
if lang_only == "sw":
# Swahili
return get_sw_settings(lang_dir, **settings_args)
if lang_only == "zh-cn":
# Chinese
return get_zh_settings(lang_dir, **settings_args)
# Default settings only
return TextProcessorSettings(lang=lang, **settings_args)
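# A minimal usage sketch (illustrative, not part of gruut): `get_settings`
# accepts a bare language code, an underscored variant, or a "lang/model"
# pair, and extra keyword arguments override the language defaults. Assumes
# TextProcessorSettings exposes its constructor arguments as attributes.
def _example_get_settings() -> None:
    """Illustrative only; the argument values below are hypothetical."""
    # en_US is normalized to en-us before dispatch
    settings = get_settings("en_US")
    assert settings.lang == "en_US"
    # Keyword arguments pass straight through to TextProcessorSettings
    settings = get_settings("en-us", default_currency="GBP")
    assert settings.default_currency == "GBP"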
# -----------------------------------------------------------------------------
# Arabic (ar, اَلْعَرَبِيَّةُ)
# -----------------------------------------------------------------------------
class ArabicPreProcessText:
"""Pre-processes text using mishkal"""
def __call__(self, text: str) -> str:
try:
import mishkal.tashkeel
# Load vocalizer
if not hasattr(self, "vocalizer"):
vocalizer = mishkal.tashkeel.TashkeelClass()
setattr(self, "vocalizer", vocalizer)
else:
vocalizer = getattr(self, "vocalizer")
assert vocalizer is not None
# Add diacritics
text = vocalizer.tashkeel(text)
except ImportError:
_LOGGER.warning("mishkal is highly recommended for language 'ar'")
_LOGGER.warning("pip install 'mishkal>=0.4.0'")
return text
def get_ar_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
"""Create settings for Arabic"""
settings_args = {
"major_breaks": {".", "؟", "!"},
"minor_breaks": {"،", ";", ":"},
"word_breaks": {"-", "_"},
"begin_punctuations": {'"', "", "«", "[", "(", "<", ""},
"end_punctuations": {'"', "", "»", "]", ")", ">"},
"default_date_format": InterpretAsFormat.DATE_DMY,
"replacements": [("", "'")], # normalize apostrophe
"pre_process_text": ArabicPreProcessText(),
**settings_args,
}
return TextProcessorSettings(lang="ar", **settings_args)
# -----------------------------------------------------------------------------
# Czech (cs-cz, čeština)
# -----------------------------------------------------------------------------
def get_cs_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
"""Create settings for Czech"""
settings_args = {
"major_breaks": {".", "?", "!"},
"minor_breaks": {",", ";", ":"},
"word_breaks": {"-", "_"},
"begin_punctuations": {'"', "", "«", "[", "(", "<", "", ""},
"end_punctuations": {'"', "", "»", "]", ")", ">", ""},
"default_currency": "EUR",
"default_date_format": InterpretAsFormat.DATE_DMY,
"replacements": [("", "'")], # normalize apostrophe
**settings_args,
}
return TextProcessorSettings(lang="cs_CZ", **settings_args)
# -----------------------------------------------------------------------------
# English (en-us, en-gb)
# -----------------------------------------------------------------------------
# TTS and T.T.S.
EN_INITIALISM_PATTERN = re.compile(r"^\s*[A-Z]{2,}\s*$")
EN_INITIALISM_DOTS_PATTERN = re.compile(r"^(?:\s*[a-zA-Z]\.){1,}\s*$")
EN_NON_WORD_PATTERN = re.compile(r"^(\W|_)+$")
EN_ORDINAL_PATTERN = re.compile(r"^(-?[0-9][0-9,]*)(?:st|nd|rd|th).*$")
EN_TIME_PATTERN = re.compile(
r"""^((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours
(?::
([0-5][0-9]))? # minutes
    \s*(a\.m\.|am|pm|p\.m\.|a\.m|p\.m)? # am/pm
$""",
re.IGNORECASE | re.X,
)
def en_is_initialism(text: str) -> bool:
"""True if text is of the form TTS or T.T.S."""
return (EN_INITIALISM_PATTERN.match(text) is not None) or (
EN_INITIALISM_DOTS_PATTERN.match(text) is not None
)
def en_get_ordinal(text: str) -> typing.Optional[int]:
"""Parse English ordinal string (e.g., 1st -> 1)"""
match = EN_ORDINAL_PATTERN.match(text)
if match is not None:
return int(re.sub(r"[^0-9]", "", match.group(1)))
return None
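# A quick sketch (illustrative, not part of gruut) of the two English
# classifiers above: `en_is_initialism` accepts both the TTS and T.T.S.
# forms, and `en_get_ordinal` strips the suffix and grouping commas.
def _example_en_initialisms_and_ordinals() -> None:
    """Illustrative only."""
    assert en_is_initialism("TTS")
    assert en_is_initialism("T.T.S.")
    assert not en_is_initialism("Tts")
    assert en_get_ordinal("1st") == 1
    assert en_get_ordinal("1,001st") == 1001
    assert en_get_ordinal("first") is None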
def en_parse_time(text: str) -> typing.Optional[Time]:
"""Parse English clock time (e.g. 4:01pm)"""
match = EN_TIME_PATTERN.match(text.strip().lower())
if match is None:
return None
hours = int(match.group(1))
maybe_minutes = match.group(6)
minutes = 0 if maybe_minutes is None else int(maybe_minutes)
period = match.group(7)
if period is not None:
# Normalize period
if "a" in period:
period = "A.M."
else:
period = "P.M."
else:
if ":" not in text:
# Require a colon if no period is specified to avoid parsing plain
# numbers like "1" into time expressions.
return None
return Time(hours=hours, minutes=minutes, period=period)
def en_verbalize_time(time: Time) -> typing.Iterable[str]:
"""Convert time into words"""
hour = time.hours
past_noon = hour >= 12
if hour > 12:
hour -= 12
elif hour == 0:
hour = 12
past_noon = True
yield str(hour)
minute = time.minutes
if minute > 0:
if minute < 10:
yield "oh"
yield str(minute)
if time.period is None:
if past_noon:
yield "P.M."
else:
yield "A.M."
else:
yield time.period
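# A short sketch (illustrative, not part of gruut) of the time pipeline
# above: `en_parse_time` produces a Time, and `en_verbalize_time` yields the
# spoken words one at a time.
def _example_en_time() -> None:
    """Illustrative only."""
    parsed = en_parse_time("4:01pm")
    assert parsed is not None
    assert (parsed.hours, parsed.minutes, parsed.period) == (4, 1, "P.M.")
    # Minutes below 10 are read with a leading "oh"
    assert list(en_verbalize_time(parsed)) == ["4", "oh", "1", "P.M."]
    # Without a colon or am/pm marker, plain numbers are not treated as times
    assert en_parse_time("1") is None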
def get_en_us_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
"""Create settings for English"""
settings_args = {
"major_breaks": {".", "?", "!"},
"minor_breaks": {",", ";", ":", "..."},
"word_breaks": {"-", "_"},
"begin_punctuations": {'"', "'", "", "«", "[", "(", "<", "*", "_"},
"end_punctuations": {'"', "'", "", "»", "]", ")", ">", "*", "_"},
"default_currency": "USD",
"default_date_format": "{m} {o}, {y}",
"is_initialism": en_is_initialism,
"split_initialism": lambda text: list(text.replace(".", "")),
"is_non_word": lambda text: EN_NON_WORD_PATTERN.match(text) is not None,
"get_ordinal": en_get_ordinal,
"parse_time": en_parse_time,
"verbalize_time": en_verbalize_time,
"replacements": [("", "'")], # normalize apostrophe
"abbreviations": {
r"^([cC])o\.": r"\1ompany", # co. -> company
r"^([dD])r\.": r"\1octor", # dr. -> doctor
r"^([dD])rs\.": r"\1octors", # drs. -> doctors
r"^([jJ])r\.('s)?": r"\1unior\2", # jr. -> junior
r"^([lL])td\.": r"\1imited", # -> ltd. -> limited
r"^([mM])r\.": r"\1ister", # -> mr. -> mister
r"^([mM])s\.": r"\1iss", # -> ms. -> miss
r"^([mM])rs\.": r"\1issus", # -> mrs. -> missus
r"^([sS])t\.": r"\1treet", # -> st. -> street
r"^([vV])s\.?": r"\1ersus", # -> vs. -> versus
r"(.*\d)%": r"\1 percent", # % -> percent
r"^&(\s*)$": r"and\1", # &-> and
r"^([mM])t\.": r"\1ount", # -> mt. -> mount
},
"spell_out_words": {
".": "dot",
"-": "dash",
"@": "at",
"*": "star",
"+": "plus",
"/": "slash",
},
**settings_args,
}
return TextProcessorSettings(lang="en_US", **settings_args)
# -----------------------------------------------------------------------------
# German (de-de)
# -----------------------------------------------------------------------------
def get_de_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
"""Create settings for German"""
settings_args = {
"major_breaks": {".", "?", "!"},
"minor_breaks": {",", ";", ":", "..."},
"word_breaks": {"-", "_"},
"begin_punctuations": {'"', "", "«", "[", "(", "<", "", ""},
"end_punctuations": {'"', "", "»", "]", ")", ">", ""},
"default_currency": "EUR",
"default_date_format": InterpretAsFormat.DATE_DMY_ORDINAL,
"replacements": [("", "'")], # normalize apostrophe
**settings_args,
}
return TextProcessorSettings(lang="de_DE", **settings_args)
# -----------------------------------------------------------------------------
# Spanish (es-es, Español)
# -----------------------------------------------------------------------------
def get_es_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
"""Create settings for Spanish"""
settings_args = {
"major_breaks": {".", "?", "!"},
"minor_breaks": {",", ";", ":", "..."},
"word_breaks": {"-", "_"},
"begin_punctuations": {'"', "", "«", "[", "(", "<", "¡", "¿"},
"end_punctuations": {'"', "", "»", "]", ")", ">"},
"default_currency": "EUR",
"default_date_format": InterpretAsFormat.DATE_DMY,
"replacements": [("", "'")], # normalize apostrophe
**settings_args,
}
return TextProcessorSettings(lang="es_ES", **settings_args)
# -----------------------------------------------------------------------------
# Farsi/Persian (fa, فارسی)
# -----------------------------------------------------------------------------
class FarsiPartOfSpeechTagger:
"""Add POS tags with hazm"""
def __init__(self, lang_dir: Path):
self.lang_dir = lang_dir
def __call__(self, words: typing.Sequence[str]) -> typing.Sequence[str]:
pos_tags = []
try:
import hazm
# Load normalizer
normalizer = getattr(self, "normalizer", None)
if normalizer is None:
normalizer = hazm.Normalizer()
setattr(self, "normalizer", normalizer)
# Load tagger
tagger = getattr(self, "tagger", None)
if tagger is None:
# Load part of speech tagger
model_path = self.lang_dir / "pos" / "postagger.model"
tagger = hazm.POSTagger(model=str(model_path))
setattr(self, "tagger", tagger)
text = " ".join(words)
for sentence in hazm.sent_tokenize(normalizer.normalize(text)):
for _word, pos in tagger.tag(hazm.word_tokenize(sentence)):
pos_tags.append(pos)
except ImportError:
_LOGGER.warning("hazm is highly recommended for language 'fa'")
_LOGGER.warning("pip install 'hazm>=0.7.0'")
return pos_tags
def fa_post_process_sentence(
graph: GraphType, sent_node: SentenceNode, settings: TextProcessorSettings
):
"""Add e̞ for genitive case"""
from gruut.text_processor import DATA_PROP, WordNode
for dfs_node in nx.dfs_preorder_nodes(graph, sent_node.node):
if not graph.out_degree(dfs_node) == 0:
            # Only process leaf nodes
continue
node = graph.nodes[dfs_node][DATA_PROP]
if isinstance(node, WordNode):
word = typing.cast(WordNode, node)
if word.phonemes and (word.pos == "Ne"):
if isinstance(word.phonemes, list):
                    word.phonemes.append("e̞")
else:
                    word.phonemes = list(word.phonemes) + ["e̞"]
def get_fa_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
"""Create settings for Farsi"""
settings_args = {
"major_breaks": {".", "؟", "!"},
"minor_breaks": {",", ";", ":"},
"word_breaks": {"-", "_"},
"begin_punctuations": {'"', "", "«", "[", "(", "<", "", ""},
"end_punctuations": {'"', "", "»", "]", ")", ">", ""},
"default_date_format": InterpretAsFormat.DATE_DMY,
"replacements": [("", "'")], # normalize apostrophe
"post_process_sentence": fa_post_process_sentence,
**settings_args,
}
if (lang_dir is not None) and ("get_parts_of_speech" not in settings_args):
settings_args["get_parts_of_speech"] = FarsiPartOfSpeechTagger(lang_dir)
return TextProcessorSettings(lang="fa", **settings_args)
# -----------------------------------------------------------------------------
# French (fr-fr, Français)
# -----------------------------------------------------------------------------
def fr_post_process_sentence(
graph: GraphType, sent_node: SentenceNode, settings: TextProcessorSettings
):
"""Add liasons to phonemes"""
from gruut.text_processor import DATA_PROP, WordNode
from gruut.utils import sliding_window
words = []
for dfs_node in nx.dfs_preorder_nodes(graph, sent_node.node):
if not graph.out_degree(dfs_node) == 0:
            # Only process leaf nodes
continue
node = graph.nodes[dfs_node][DATA_PROP]
if isinstance(node, WordNode):
word_node = typing.cast(WordNode, node)
words.append(word_node)
for word1, word2 in sliding_window(words, 2):
if word2 is None:
continue
if not (word1.text and word1.phonemes and word2.text and word2.phonemes):
continue
        liaison = False
        # Conditions to meet for liaison check:
        # 1) word 1 ends with a silent consonant
        # 2) word 2 starts with a vowel (phoneme)
        last_char1 = word1.text[-1]
        ends_silent_consonant = fr_has_silent_consonant(last_char1, word1.phonemes[-1])
        starts_vowel = fr_is_vowel(word2.phonemes[0])
        if ends_silent_consonant and starts_vowel:
            # Handle mandatory liaison cases
            # https://www.commeunefrancaise.com/blog/la-liaison
            if word1.text == "et":
                # No liaison
                pass
            elif word1.pos in {"DET", "NUM"}:
                # Determiner/adjective -> noun
                liaison = True
            elif (word1.pos == "PRON") and (word2.pos in {"AUX", "VERB"}):
                # Pronoun -> verb
                liaison = True
            elif (word1.pos == "ADP") or (word1.text == "très"):
                # Preposition
                liaison = True
            elif (word1.pos == "ADJ") and (word2.pos in {"NOUN", "PROPN"}):
                # Adjective -> noun
                liaison = True
            elif word1.pos in {"AUX", "VERB"}:
                # Verb -> vowel
                liaison = True
            if liaison:
                # Append the liaison phoneme:
                # s|x|z -> z
                # d -> t
                # t|p|n -> same as the final character
                liaison_pron = word1.phonemes
                if last_char1 in {"s", "x", "z"}:
                    liaison_pron.append("z")
                elif last_char1 == "d":
                    liaison_pron.append("t")
                elif last_char1 in {"t", "p", "n"}:
                    # Final phoneme is same as char
                    liaison_pron.append(last_char1)
def fr_has_silent_consonant(last_char: str, last_phoneme: str) -> bool:
"""True if last consonant is silent in French"""
# Credit: https://github.com/Remiphilius/PoemesProfonds/blob/master/lecture.py
if last_char in {"d", "p", "t"}:
return last_phoneme != last_char
if last_char == "r":
return last_phoneme != "ʁ"
if last_char in {"s", "x", "z"}:
return last_phoneme not in {"s", "z"}
if last_char == "n":
return last_phoneme not in {"n", "ŋ"}
return False
def fr_is_vowel(phoneme: str) -> bool:
"""True if phoneme is a French vowel"""
return phoneme in {
"i",
"y",
"u",
"e",
"ø",
"o",
"ə",
"ɛ",
"œ",
"ɔ",
"a",
"ɔ̃",
"ɛ̃",
"ɑ̃",
"œ̃",
}
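# A brief sketch (illustrative, not part of gruut) of the liaison predicates
# above, using the classic "petit ami" example.
def _example_fr_liaison_checks() -> None:
    """Illustrative only."""
    # The written "t" of "petit" is silent: its last phoneme is the vowel "i"
    assert fr_has_silent_consonant("t", "i")
    # The final "r" of "par" is pronounced, so it is not a liaison site
    assert not fr_has_silent_consonant("r", "ʁ")
    # "ami" begins with the vowel phoneme "a"
    assert fr_is_vowel("a")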
def get_fr_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
"""Create settings for French"""
settings_args = {
"major_breaks": {".", "?", "!"},
"minor_breaks": {",", ";", ":", "..."},
"word_breaks": {"-", "_"},
"begin_punctuations": {'"', "", "«", "[", "(", "<", ""},
"end_punctuations": {'"', "", "»", "]", ")", ">"},
"default_currency": "EUR",
"default_date_format": InterpretAsFormat.DATE_DMY_ORDINAL,
"replacements": [("", "'")], # normalize apostrophe
"post_process_sentence": fr_post_process_sentence,
**settings_args,
}
return TextProcessorSettings(lang="fr_FR", **settings_args)
# -----------------------------------------------------------------------------
# Italian (it-it, Italiano)
# -----------------------------------------------------------------------------
def get_it_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
"""Create settings for Italian"""
settings_args = {
"major_breaks": {".", "?", "!"},
"minor_breaks": {",", ";", ":", "..."},
"word_breaks": {"-", "_"},
"begin_punctuations": {'"', "", "«", "[", "(", "<", ""},
"end_punctuations": {'"', "", "»", "]", ")", ">"},
"default_currency": "EUR",
"default_date_format": InterpretAsFormat.DATE_DMY,
"replacements": [("", "'")], # normalize apostrophe
"post_process_sentence": fr_post_process_sentence,
**settings_args,
}
return TextProcessorSettings(lang="it_IT", **settings_args)
# -----------------------------------------------------------------------------
# Luxembourgish (lb, Lëtzebuergesch)
# -----------------------------------------------------------------------------
def get_lb_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
"""Create settings for Luxembourgish"""
settings_args = {
"major_breaks": {".", "?", "!"},
"minor_breaks": {",", ";", ":", "..."},
"word_breaks": {"-", "_"},
"begin_punctuations": {'"', "", "«", "[", "(", "<", ""},
"end_punctuations": {'"', "", "»", "]", ")", ">"},
"default_currency": "EUR",
"default_date_format": InterpretAsFormat.DATE_DMY,
"replacements": [("", "'")], # normalize apostrophe
"babel_locale": "lb",
**settings_args,
}
return TextProcessorSettings(lang="lb", **settings_args)
# -----------------------------------------------------------------------------
# Dutch (nl, Nederlands)
# -----------------------------------------------------------------------------
def get_nl_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
"""Create settings for Dutch"""
settings_args = {
"major_breaks": {".", "?", "!"},
"minor_breaks": {",", ";", ":", "..."},
"word_breaks": {"-", "_"},
"begin_punctuations": {'"', "", "«", "[", "(", "<", ""},
"end_punctuations": {'"', "", "»", "]", ")", ">"},
"default_currency": "EUR",
"default_date_format": InterpretAsFormat.DATE_DMY,
"replacements": [("", "'")], # normalize apostrophe
**settings_args,
}
return TextProcessorSettings(lang="nl", **settings_args)
# -----------------------------------------------------------------------------
# Portuguese (pt, Português)
# -----------------------------------------------------------------------------
def get_pt_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
"""Create default settings for Portuguese"""
settings_args = {
"major_breaks": {".", "?", "!"},
"minor_breaks": {",", ";", ":", "..."},
"word_breaks": {"-", "_"},
"begin_punctuations": {'"', "", "«", "[", "(", "<", ""},
"end_punctuations": {'"', "", "»", "]", ")", ">"},
"default_currency": "EUR",
"default_date_format": InterpretAsFormat.DATE_DMY,
"replacements": [("", "'")], # normalize apostrophe
**settings_args,
}
return TextProcessorSettings(lang="pt", **settings_args)
# -----------------------------------------------------------------------------
# Russian (ru, Русский)
# -----------------------------------------------------------------------------
def get_ru_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
"""Create settings for Russian"""
settings_args = {
"major_breaks": {".", "?", "!"},
"minor_breaks": {",", ";", ":"},
"word_breaks": {"-", "_"},
"begin_punctuations": {'"', "", "«", "[", "(", "<", ""},
"end_punctuations": {'"', "", "»", "]", ")", ">"},
"default_currency": "RUB",
"default_date_format": InterpretAsFormat.DATE_DMY,
"replacements": [("", "'")], # normalize apostrophe
**settings_args,
}
return TextProcessorSettings(lang="ru_RU", **settings_args)
# -----------------------------------------------------------------------------
# Swedish (sv-se, svenska)
# -----------------------------------------------------------------------------
def get_sv_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
"""Create settings for Swedish"""
settings_args = {
"major_breaks": {".", "?", "!"},
"minor_breaks": {",", ";", ":", "..."},
"word_breaks": {"-", "_"},
"begin_punctuations": {'"', "", "«", "[", "(", "<", ""},
"end_punctuations": {'"', "", "»", "]", ")", ">"},
"default_date_format": InterpretAsFormat.DATE_DMY,
"replacements": [("", "'")], # normalize apostrophe
**settings_args,
}
return TextProcessorSettings(lang="sv_SE", **settings_args)
# -----------------------------------------------------------------------------
# Swahili (sw, Kiswahili)
# -----------------------------------------------------------------------------
def get_sw_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
"""Create settings for Swahili"""
settings_args = {
"major_breaks": {".", "?", "!"},
"minor_breaks": {",", ";", ":"},
"word_breaks": {"-", "_"},
"begin_punctuations": {'"', "", "«", "[", "(", "<", ""},
"end_punctuations": {'"', "", "»", "]", ")", ">"},
"default_date_format": InterpretAsFormat.DATE_DMY,
"replacements": [("", "'")], # normalize apostrophe
**settings_args,
}
return TextProcessorSettings(lang="sw", **settings_args)
# -----------------------------------------------------------------------------
# Chinese (zh-cn, 汉语)
# -----------------------------------------------------------------------------
def get_zh_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
"""Create settings for Chinese"""
# https://en.wikipedia.org/wiki/Chinese_punctuation
settings_args = {
"major_breaks": {"", "", ""},
"minor_breaks": {"", "", "", "", "……"},
"begin_punctuations": {"", "", "", "", "", '"', "", ""},
"end_punctuations": {"", "", "", "", "", '"', "", ""},
"word_breaks": {""},
"split_words": list,
"join_str": "",
**settings_args,
}
return TextProcessorSettings(lang="zh_CN", **settings_args)
# -----------------------------------------------------------------------------
class DelayedGraphemesToPhonemes:
"""Grapheme to phoneme guesser that loads on first use"""
def __init__(
self,
model_path: typing.Union[str, Path],
transform_func: typing.Optional[typing.Callable[[str], str]] = None,
**g2p_args,
):
self.model_path = model_path
self.g2p: typing.Optional[GraphemesToPhonemes] = None
self.transform_func = transform_func
self.g2p_args = g2p_args
def __call__(
self, word: str, role: typing.Optional[str] = None
) -> typing.Optional[PHONEMES_TYPE]:
if self.g2p is None:
_LOGGER.debug(
"Loading grapheme to phoneme CRF model from %s", self.model_path
)
self.g2p = GraphemesToPhonemes(self.model_path, **self.g2p_args)
assert self.g2p is not None
if self.transform_func is not None:
word = self.transform_func(word)
return self.g2p(word)
class DelayedPartOfSpeechTagger:
"""POS tagger that loads on first use"""
def __init__(self, model_path: typing.Union[str, Path], **tagger_args):
self.model_path = Path(model_path)
self.tagger: typing.Optional[PartOfSpeechTagger] = None
self.tagger_args = tagger_args
def __call__(self, words: typing.Sequence[str]) -> typing.Sequence[str]:
if self.tagger is None:
_LOGGER.debug("Loading part of speech tagger from %s", self.model_path)
self.tagger = PartOfSpeechTagger(self.model_path, **self.tagger_args)
assert self.tagger is not None
return self.tagger(words)
class DelayedSqlitePhonemizer:
"""Phonemizer that loads on first use"""
def __init__(self, db_path: typing.Union[str, Path], **phonemizer_args):
self.db_path = Path(db_path)
self.phonemizer: typing.Optional[SqlitePhonemizer] = None
self.phonemizer_args = phonemizer_args
def __call__(
self, word: str, role: typing.Optional[str] = None, do_transforms: bool = True
) -> typing.Optional[PHONEMES_TYPE]:
if self.phonemizer is None:
_LOGGER.debug("Connecting to lexicon database at %s", self.db_path)
db_conn = sqlite3.connect(str(self.db_path))
self.phonemizer = SqlitePhonemizer(db_conn=db_conn, **self.phonemizer_args)
assert self.phonemizer is not None
return self.phonemizer(word, role=role, do_transforms=do_transforms)
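# Usage sketch (illustrative; the model path below is hypothetical): the
# Delayed* wrappers above are plain callables, so the heavy resources behind
# them are only loaded on the first lookup.
#
#     guesser = DelayedGraphemesToPhonemes("/path/to/g2p/model.crf")
#     phonemes = guesser("hello")  # CRF model is loaded here, on first call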