ai-content-maker/.venv/Lib/site-packages/gruut/lang.py

883 lines
30 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Language-specific settings"""
import logging
import re
import sqlite3
import typing
from pathlib import Path
import networkx as nx
from gruut.const import PHONEMES_TYPE, GraphType, SentenceNode, Time
from gruut.g2p import GraphemesToPhonemes
from gruut.phonemize import SqlitePhonemizer
from gruut.pos import PartOfSpeechTagger
from gruut.text_processor import InterpretAsFormat, TextProcessorSettings
from gruut.utils import find_lang_dir, remove_non_word_chars, resolve_lang
_LOGGER = logging.getLogger("gruut")
# -----------------------------------------------------------------------------
def get_settings(
    lang: str,
    search_dirs: typing.Optional[typing.Iterable[typing.Union[str, Path]]] = None,
    lang_dir: typing.Optional[typing.Union[str, Path]] = None,
    model_prefix: typing.Optional[str] = None,
    load_pos_tagger: bool = True,
    load_phoneme_lexicon: bool = True,
    load_g2p_guesser: bool = True,
    **settings_args,
) -> TextProcessorSettings:
    """Build the text processor settings for a language.

    Args:
        lang: Language name (e.g. "en-us" or "en_US"). May carry a model
            prefix after a slash, as in "en-us/espeak".
        search_dirs: Forwarded to find_lang_dir when lang_dir is not given.
        lang_dir: Directory with the language's data files. Looked up with
            find_lang_dir when None.
        model_prefix: Sub-directory of lang_dir holding the lexicon and g2p
            model. When non-empty, lang is not split on "/".
        load_pos_tagger: Attach a lazily loaded tagger when
            "pos/model.crf" exists under lang_dir.
        load_phoneme_lexicon: Attach a lazily loaded lexicon when
            "lexicon.db" exists under the model directory.
        load_g2p_guesser: Attach a lazily loaded guesser when
            "g2p/model.crf" exists under the model directory.
        **settings_args: Extra settings. Supplying "get_parts_of_speech",
            "lookup_phonemes" or "guess_phonemes" here suppresses the
            corresponding default above.

    Returns:
        Settings from the language-specific factory, or a plain
        TextProcessorSettings when the resolved language has none.
    """
    # An explicit model prefix wins; otherwise it may be embedded in lang
    lang_only = lang
    lang_model_prefix = model_prefix or ""
    if (not lang_model_prefix) and ("/" in lang):
        # en-us/espeak
        lang_only, lang_model_prefix = lang.split("/", maxsplit=1)

    # en_US -> en-us
    lang_only = resolve_lang(lang_only)

    if lang_dir is None:
        # Search for language data files
        lang_dir = find_lang_dir(lang_only, search_dirs=search_dirs)

    if lang_dir is not None:
        lang_dir = Path(lang_dir)
        model_dir = lang_dir / lang_model_prefix

        # Part of speech tagger (model loads on first use)
        if load_pos_tagger and ("get_parts_of_speech" not in settings_args):
            pos_model_path = lang_dir / "pos" / "model.crf"
            if not pos_model_path.is_file():
                _LOGGER.debug(
                    "(%s) no part of speech tagger found at %s", lang, pos_model_path,
                )
            else:
                settings_args["get_parts_of_speech"] = DelayedPartOfSpeechTagger(
                    pos_model_path
                )

        # Phonemizer
        if load_phoneme_lexicon and ("lookup_phonemes" not in settings_args):
            lexicon_db_path = model_dir / "lexicon.db"
            if not lexicon_db_path.is_file():
                _LOGGER.debug(
                    "(%s) no phoneme lexicon database found at %s",
                    lang,
                    lexicon_db_path,
                )
            else:
                # Transformations tried on a word that is not in the lexicon
                settings_args["lookup_phonemes"] = DelayedSqlitePhonemizer(
                    lexicon_db_path,
                    word_transform_funcs=[
                        remove_non_word_chars,
                        lambda s: remove_non_word_chars(s.lower()),
                    ],
                    casing_func=str.lower,
                )

        # Grapheme to phoneme model
        if load_g2p_guesser and ("guess_phonemes" not in settings_args):
            g2p_model_path = model_dir / "g2p" / "model.crf"
            if not g2p_model_path.is_file():
                _LOGGER.debug(
                    "(%s) no grapheme to phoneme CRF model found at %s",
                    lang,
                    g2p_model_path,
                )
            else:
                settings_args["guess_phonemes"] = DelayedGraphemesToPhonemes(
                    g2p_model_path, transform_func=str.lower
                )

    # ---------------------------------
    # Create language-specific settings
    # ---------------------------------
    settings_factories = {
        "ar": get_ar_settings,  # Arabic
        "cs-cz": get_cs_settings,  # Czech
        "en-us": get_en_us_settings,  # English
        "en-gb": get_en_us_settings,
        "de-de": get_de_settings,  # German
        "es-es": get_es_settings,  # Spanish
        "es-mx": get_es_settings,
        "fa": get_fa_settings,  # Farsi
        "fr-fr": get_fr_settings,  # French
        "it-it": get_it_settings,  # Italian
        "lb": get_lb_settings,  # Luxembourgish
        "nl": get_nl_settings,  # Dutch
        "pt": get_pt_settings,  # Portuguese
        "ru-ru": get_ru_settings,  # Russian
        "sv-se": get_sv_settings,  # Swedish
        "sw": get_sw_settings,  # Swahili
        "zh-cn": get_zh_settings,  # Chinese
    }

    factory = settings_factories.get(lang_only)
    if factory is None:
        # Default settings only
        return TextProcessorSettings(lang=lang, **settings_args)

    return factory(lang_dir, **settings_args)
# -----------------------------------------------------------------------------
# Arabic (ar, اَلْعَرَبِيَّةُ)
# -----------------------------------------------------------------------------
class ArabicPreProcessText:
    """Text pre-processor for Arabic that adds diacritics with mishkal.

    mishkal is imported inside the call; the vocalizer is created on the
    first successful call and cached on the instance.
    """

    def __call__(self, text: str) -> str:
        """Return text with diacritics added, or unchanged without mishkal."""
        try:
            import mishkal.tashkeel

            # Create the vocalizer once and keep it on the instance
            if not hasattr(self, "vocalizer"):
                self.vocalizer = mishkal.tashkeel.TashkeelClass()

            vocalizer = self.vocalizer
            assert vocalizer is not None

            # Add diacritics
            return vocalizer.tashkeel(text)
        except ImportError:
            _LOGGER.warning("mishkal is highly recommended for language 'ar'")
            _LOGGER.warning("pip install 'mishkal>=0.4.0'")

        return text
def get_ar_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create settings for Arabic.

    Args:
        lang_dir: Language data directory; accepted so all language
            factories share one signature, not used here.
        **settings_args: Overrides that take precedence over the defaults
            below.

    Returns:
        TextProcessorSettings built with lang="ar".
    """
    # NOTE(review): the typographic quote/apostrophe literals were empty
    # strings in the reviewed copy (an empty replacement pattern matches
    # everywhere). Restored as escapes; confirm the exact characters upstream.
    settings_args = {
        "major_breaks": {".", "؟", "!"},
        "minor_breaks": {"،", ";", ":"},
        "word_breaks": {"-", "_"},
        # U+201C left double quote, U+201E double low-9 quote
        "begin_punctuations": {'"', "\u201c", "«", "[", "(", "<", "\u201e"},
        # U+201D right double quote
        "end_punctuations": {'"', "\u201d", "»", "]", ")", ">"},
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Normalize typographic apostrophe (U+2019) to ASCII
        "replacements": [("\u2019", "'")],
        "pre_process_text": ArabicPreProcessText(),
        **settings_args,
    }
    return TextProcessorSettings(lang="ar", **settings_args)
# -----------------------------------------------------------------------------
# Czech (cs-cz, čeština)
# -----------------------------------------------------------------------------
def get_cs_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create settings for Czech.

    Args:
        lang_dir: Language data directory; accepted so all language
            factories share one signature, not used here.
        **settings_args: Overrides that take precedence over the defaults
            below.

    Returns:
        TextProcessorSettings built with lang="cs_CZ".
    """
    # NOTE(review): the typographic quote/apostrophe literals were empty
    # strings in the reviewed copy. Restored as escapes; U+2019 in the
    # punctuation sets is a best guess -- confirm all of them upstream.
    settings_args = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":"},
        "word_breaks": {"-", "_"},
        # U+201C left double quote, U+2019 right single quote,
        # U+201E double low-9 quote
        "begin_punctuations": {
            '"', "\u201c", "«", "[", "(", "<", "\u2019", "\u201e",
        },
        # U+201D right double quote, U+2019 right single quote
        "end_punctuations": {'"', "\u201d", "»", "]", ")", ">", "\u2019"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Normalize typographic apostrophe (U+2019) to ASCII
        "replacements": [("\u2019", "'")],
        **settings_args,
    }
    return TextProcessorSettings(lang="cs_CZ", **settings_args)
# -----------------------------------------------------------------------------
# English (en-us, en-gb)
# -----------------------------------------------------------------------------
# Initialism without dots (TTS)
EN_INITIALISM_PATTERN = re.compile(r"^\s*[A-Z]{2,}\s*$")
# Initialism with a dot after every letter (T.T.S.)
EN_INITIALISM_DOTS_PATTERN = re.compile(r"^(?:\s*[a-zA-Z]\.){1,}\s*$")
# Text made up only of non-word characters and underscores
EN_NON_WORD_PATTERN = re.compile(r"^(\W|_)+$")
# Number followed by an ordinal suffix; group 1 is the number (1st, 1,000th)
EN_ORDINAL_PATTERN = re.compile(r"^(-?[0-9][0-9,]*)(?:st|nd|rd|th).*$")
# Clock time: group 1 = hours, group 6 = minutes, group 7 = am/pm.
# The dots are escaped with a single backslash: in a raw string "\\." would
# demand a literal backslash and "a.m."/"p.m." could never match.
EN_TIME_PATTERN = re.compile(
    r"""^((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3]))  # hours
    (?::
    ([0-5][0-9]))?                               # minutes
    \s*(a\.m\.|am|pm|p\.m\.|a\.m|p\.m)?          # am/pm
    $""",
    re.IGNORECASE | re.X,
)
def en_is_initialism(text: str) -> bool:
    """Return True when text looks like TTS or T.T.S."""
    # Form without dots first, then the dotted form
    if EN_INITIALISM_PATTERN.match(text) is not None:
        return True

    return EN_INITIALISM_DOTS_PATTERN.match(text) is not None
def en_get_ordinal(text: str) -> typing.Optional[int]:
    """Return the number of an English ordinal string (1st -> 1), else None."""
    ordinal_match = EN_ORDINAL_PATTERN.match(text)
    if ordinal_match is None:
        return None

    # Keep the digits only: drops thousands separators and a leading minus
    digits = re.sub(r"[^0-9]", "", ordinal_match.group(1))
    return int(digits)
def en_parse_time(text: str) -> typing.Optional[Time]:
    """Parse an English clock time such as 4:01pm.

    Returns None when text does not match EN_TIME_PATTERN, or when it has
    neither an am/pm marker nor a colon.
    """
    time_match = EN_TIME_PATTERN.match(text.strip().lower())
    if time_match is None:
        return None

    period = time_match.group(7)
    if period is None:
        if ":" not in text:
            # Require a colon if no period is specified to avoid parsing plain
            # numbers like "1" into time expressions.
            return None
    else:
        # Normalize period
        period = "A.M." if "a" in period else "P.M."

    minutes_text = time_match.group(6)

    return Time(
        hours=int(time_match.group(1)),
        minutes=int(minutes_text) if minutes_text is not None else 0,
        period=period,
    )
def en_verbalize_time(time: Time) -> typing.Iterable[str]:
    """Convert a time into the words used to speak it.

    Yields the hour on a 12-hour clock, then the minutes when non-zero
    (preceded by "oh" below ten), then the period. When time.period is None
    the period is derived from the 24-hour value of time.hours.
    """
    hour = time.hours
    past_noon = hour >= 12

    if hour > 12:
        hour -= 12
    elif hour == 0:
        # Midnight hour is spoken as twelve but is still before noon
        hour = 12

    yield str(hour)

    minute = time.minutes
    if minute > 0:
        if minute < 10:
            yield "oh"

        yield str(minute)

    if time.period is None:
        yield "P.M." if past_noon else "A.M."
    else:
        yield time.period
def get_en_us_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create settings for English (en-us, en-gb).

    Args:
        lang_dir: Language data directory; accepted so all language
            factories share one signature, not used here.
        **settings_args: Overrides that take precedence over the defaults
            below.

    Returns:
        TextProcessorSettings built with lang="en_US".
    """
    # NOTE(review): the typographic quote/apostrophe literals were empty
    # strings in the reviewed copy (an empty replacement pattern matches
    # everywhere). Restored as escapes; confirm the exact characters upstream.
    settings_args = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        # U+201C left double quote
        "begin_punctuations": {'"', "'", "\u201c", "«", "[", "(", "<", "*", "_"},
        # U+201D right double quote
        "end_punctuations": {'"', "'", "\u201d", "»", "]", ")", ">", "*", "_"},
        "default_currency": "USD",
        "default_date_format": "{m} {o}, {y}",
        "is_initialism": en_is_initialism,
        "split_initialism": lambda text: list(text.replace(".", "")),
        "is_non_word": lambda text: EN_NON_WORD_PATTERN.match(text) is not None,
        "get_ordinal": en_get_ordinal,
        "parse_time": en_parse_time,
        "verbalize_time": en_verbalize_time,
        # Normalize typographic apostrophe (U+2019) to ASCII
        "replacements": [("\u2019", "'")],
        # Pattern -> replacement template; the first letter's case is kept
        "abbreviations": {
            r"^([cC])o\.": r"\1ompany",  # co. -> company
            r"^([dD])r\.": r"\1octor",  # dr. -> doctor
            r"^([dD])rs\.": r"\1octors",  # drs. -> doctors
            r"^([jJ])r\.('s)?": r"\1unior\2",  # jr. -> junior
            r"^([lL])td\.": r"\1imited",  # ltd. -> limited
            r"^([mM])r\.": r"\1ister",  # mr. -> mister
            r"^([mM])s\.": r"\1iss",  # ms. -> miss
            r"^([mM])rs\.": r"\1issus",  # mrs. -> missus
            r"^([sS])t\.": r"\1treet",  # st. -> street
            r"^([vV])s\.?": r"\1ersus",  # vs. -> versus
            r"(.*\d)%": r"\1 percent",  # % -> percent
            r"^&(\s*)$": r"and\1",  # & -> and
            r"^([mM])t\.": r"\1ount",  # mt. -> mount
        },
        "spell_out_words": {
            ".": "dot",
            "-": "dash",
            "@": "at",
            "*": "star",
            "+": "plus",
            "/": "slash",
        },
        **settings_args,
    }
    return TextProcessorSettings(lang="en_US", **settings_args)
# -----------------------------------------------------------------------------
# German (de-de)
# -----------------------------------------------------------------------------
def get_de_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create settings for German.

    Args:
        lang_dir: Language data directory; accepted so all language
            factories share one signature, not used here.
        **settings_args: Overrides that take precedence over the defaults
            below.

    Returns:
        TextProcessorSettings built with lang="de_DE".
    """
    # NOTE(review): the typographic quote/apostrophe literals were empty
    # strings in the reviewed copy. Restored as escapes; U+2019 in the
    # punctuation sets is a best guess -- confirm all of them upstream.
    settings_args = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        # U+201C left double quote, U+2019 right single quote,
        # U+201E double low-9 quote
        "begin_punctuations": {
            '"', "\u201c", "«", "[", "(", "<", "\u2019", "\u201e",
        },
        # U+201D right double quote, U+2019 right single quote
        "end_punctuations": {'"', "\u201d", "»", "]", ")", ">", "\u2019"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY_ORDINAL,
        # Normalize typographic apostrophe (U+2019) to ASCII
        "replacements": [("\u2019", "'")],
        **settings_args,
    }
    return TextProcessorSettings(lang="de_DE", **settings_args)
# -----------------------------------------------------------------------------
# Spanish (es-es, Español)
# -----------------------------------------------------------------------------
def get_es_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create settings for Spanish (es-es, es-mx).

    Args:
        lang_dir: Language data directory; accepted so all language
            factories share one signature, not used here.
        **settings_args: Overrides that take precedence over the defaults
            below.

    Returns:
        TextProcessorSettings built with lang="es_ES".
    """
    # NOTE(review): the typographic quote/apostrophe literals were empty
    # strings in the reviewed copy (an empty replacement pattern matches
    # everywhere). Restored as escapes; confirm the exact characters upstream.
    settings_args = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        # U+201C left double quote
        "begin_punctuations": {'"', "\u201c", "«", "[", "(", "<", "¡", "¿"},
        # U+201D right double quote
        "end_punctuations": {'"', "\u201d", "»", "]", ")", ">"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Normalize typographic apostrophe (U+2019) to ASCII
        "replacements": [("\u2019", "'")],
        **settings_args,
    }
    return TextProcessorSettings(lang="es_ES", **settings_args)
# -----------------------------------------------------------------------------
# Farsi/Persian (fa, فارسی)
# -----------------------------------------------------------------------------
class FarsiPartOfSpeechTagger:
    """Part of speech tagger for Farsi built on hazm.

    hazm is imported inside the call; the normalizer and tagger are created
    on the first successful call and cached on the instance.
    """

    def __init__(self, lang_dir: Path):
        # The tagger model is read from <lang_dir>/pos/postagger.model
        self.lang_dir = lang_dir

    def __call__(self, words: typing.Sequence[str]) -> typing.Sequence[str]:
        """Return the tags hazm assigns, or an empty list without hazm."""
        tags = []

        try:
            import hazm

            # Load normalizer
            if getattr(self, "normalizer", None) is None:
                self.normalizer = hazm.Normalizer()
            normalizer = self.normalizer

            # Load part of speech tagger
            if getattr(self, "tagger", None) is None:
                model_file = self.lang_dir / "pos" / "postagger.model"
                self.tagger = hazm.POSTagger(model=str(model_file))
            tagger = self.tagger

            # hazm re-tokenizes the joined text sentence by sentence
            normalized = normalizer.normalize(" ".join(words))
            for sentence in hazm.sent_tokenize(normalized):
                tagged = tagger.tag(hazm.word_tokenize(sentence))
                tags.extend(pos for _word, pos in tagged)
        except ImportError:
            _LOGGER.warning("hazm is highly recommended for language 'fa'")
            _LOGGER.warning("pip install 'hazm>=0.7.0'")

        return tags
def fa_post_process_sentence(
    graph: GraphType, sent_node: SentenceNode, settings: TextProcessorSettings
):
    """Add e̞ for genitive case.

    Walks the leaves below sent_node and appends the phoneme to every word
    node that already has phonemes and is tagged "Ne". Word nodes are
    modified in place; nothing is returned.

    Args:
        graph: Graph holding the sentence's nodes.
        sent_node: Sentence whose words are processed.
        settings: Unused; part of the post-processor signature.
    """
    from gruut.text_processor import DATA_PROP, WordNode

    # "e" + U+031E (combining down tack), the phoneme named in the docstring.
    # NOTE(review): this literal was an empty string in the reviewed copy;
    # confirm the exact code points upstream.
    genitive_phoneme = "e\u031e"

    for dfs_node in nx.dfs_preorder_nodes(graph, sent_node.node):
        if graph.out_degree(dfs_node) != 0:
            # Only leaves
            continue

        node = graph.nodes[dfs_node][DATA_PROP]
        if not isinstance(node, WordNode):
            continue

        word = typing.cast(WordNode, node)
        if word.phonemes and (word.pos == "Ne"):
            if isinstance(word.phonemes, list):
                word.phonemes.append(genitive_phoneme)
            else:
                # Phonemes may be an immutable sequence; replace it
                word.phonemes = list(word.phonemes) + [genitive_phoneme]
def get_fa_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create settings for Farsi.

    Args:
        lang_dir: Language data directory. When given, and no
            "get_parts_of_speech" setting was supplied, a
            FarsiPartOfSpeechTagger reading from it is installed.
        **settings_args: Overrides that take precedence over the defaults
            below.

    Returns:
        TextProcessorSettings built with lang="fa".
    """
    # NOTE(review): the typographic quote/apostrophe literals were empty
    # strings in the reviewed copy. Restored as escapes; U+2019 in the
    # punctuation sets is a best guess -- confirm all of them upstream.
    settings_args = {
        "major_breaks": {".", "؟", "!"},
        "minor_breaks": {",", ";", ":"},
        "word_breaks": {"-", "_"},
        # U+201C left double quote, U+2019 right single quote,
        # U+201E double low-9 quote
        "begin_punctuations": {
            '"', "\u201c", "«", "[", "(", "<", "\u2019", "\u201e",
        },
        # U+201D right double quote, U+2019 right single quote
        "end_punctuations": {'"', "\u201d", "»", "]", ")", ">", "\u2019"},
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Normalize typographic apostrophe (U+2019) to ASCII
        "replacements": [("\u2019", "'")],
        "post_process_sentence": fa_post_process_sentence,
        **settings_args,
    }

    if (lang_dir is not None) and ("get_parts_of_speech" not in settings_args):
        settings_args["get_parts_of_speech"] = FarsiPartOfSpeechTagger(lang_dir)

    return TextProcessorSettings(lang="fa", **settings_args)
# -----------------------------------------------------------------------------
# French (fr-fr, Français)
# -----------------------------------------------------------------------------
def fr_post_process_sentence(
    graph: GraphType, sent_node: SentenceNode, settings: TextProcessorSettings
):
    """Add liaisons to phonemes.

    For each pair of adjacent words in the sentence, a liaison phoneme is
    appended to the first word when it ends in a silent consonant, the
    second word starts with a vowel phoneme, and the parts of speech fit
    one of the mandatory liaison cases. Word nodes are modified in place;
    nothing is returned.

    Args:
        graph: Graph holding the sentence's nodes.
        sent_node: Sentence whose words are processed.
        settings: Unused; part of the post-processor signature.
    """
    from gruut.text_processor import DATA_PROP, WordNode
    from gruut.utils import sliding_window

    # Collect the sentence's word leaves in depth-first order
    words = []
    for dfs_node in nx.dfs_preorder_nodes(graph, sent_node.node):
        if graph.out_degree(dfs_node) != 0:
            # Only leaves
            continue

        node = graph.nodes[dfs_node][DATA_PROP]
        if isinstance(node, WordNode):
            words.append(typing.cast(WordNode, node))

    for word1, word2 in sliding_window(words, 2):
        if word2 is None:
            # The window may be padded with None past the last word
            continue

        if not (word1.text and word1.phonemes and word2.text and word2.phonemes):
            continue

        # Conditions to meet for liaison check:
        # 1) word 1 ends with a silent consonant
        # 2) word 2 starts with a vowel (phoneme)
        last_char1 = word1.text[-1]
        ends_silent_consonant = fr_has_silent_consonant(last_char1, word1.phonemes[-1])
        starts_vowel = fr_is_vowel(word2.phonemes[0])

        if not (ends_silent_consonant and starts_vowel):
            continue

        # Handle mandatory liaison cases
        # https://www.commeunefrancaise.com/blog/la-liaison
        liaison = False
        if word1.text == "et":
            # No liaison
            pass
        elif word1.pos in {"DET", "NUM"}:
            # Determiner/number -> following word
            liaison = True
        elif (word1.pos == "PRON") and (word2.pos in {"AUX", "VERB"}):
            # Pronoun -> verb
            liaison = True
        elif (word1.pos == "ADP") or (word1.text == "très"):
            # Preposition
            liaison = True
        elif (word1.pos == "ADJ") and (word2.pos in {"NOUN", "PROPN"}):
            # Adjective -> noun
            liaison = True
        elif word1.pos in {"AUX", "VERB"}:
            # Verb -> vowel
            liaison = True

        if not liaison:
            continue

        # Pick the liaison phoneme from the final letter:
        # s|x|z -> z
        # d -> t
        # t|p|n -> same as the letter
        liaison_phoneme = None
        if last_char1 in {"s", "x", "z"}:
            liaison_phoneme = "z"
        elif last_char1 == "d":
            liaison_phoneme = "t"
        elif last_char1 in {"t", "p", "n"}:
            liaison_phoneme = last_char1

        if liaison_phoneme is None:
            # e.g. a silent final "r": no liaison phoneme is defined
            continue

        if isinstance(word1.phonemes, list):
            word1.phonemes.append(liaison_phoneme)
        else:
            # Phonemes may be an immutable sequence; replace it
            word1.phonemes = list(word1.phonemes) + [liaison_phoneme]
def fr_has_silent_consonant(last_char: str, last_phoneme: str) -> bool:
    """Return True when a French word's final consonant letter is silent.

    The letter counts as silent when the word's last phoneme is not one of
    the sounds that letter stands for. Letters outside the table are never
    reported as silent.
    """
    # Credit: https://github.com/Remiphilius/PoemesProfonds/blob/master/lecture.py
    sibilants = ("s", "z")

    # Final letter -> phonemes that mean the letter is pronounced
    pronounced_as = {
        "d": ("d",),
        "p": ("p",),
        "t": ("t",),
        "r": ("ʁ",),
        "s": sibilants,
        "x": sibilants,
        "z": sibilants,
        "n": ("n", "ŋ"),
    }

    sounds = pronounced_as.get(last_char)
    if sounds is None:
        return False

    return last_phoneme not in sounds
def fr_is_vowel(phoneme: str) -> bool:
    """Return True when phoneme is one of the French vowel phonemes."""
    oral_vowels = ("i", "y", "u", "e", "ø", "o", "ə", "ɛ", "œ", "ɔ", "a")
    nasal_vowels = ("ɔ̃", "ɛ̃", "ɑ̃", "œ̃")

    vowels = frozenset(oral_vowels + nasal_vowels)
    return phoneme in vowels
def get_fr_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create settings for French.

    Args:
        lang_dir: Language data directory; accepted so all language
            factories share one signature, not used here.
        **settings_args: Overrides that take precedence over the defaults
            below.

    Returns:
        TextProcessorSettings built with lang="fr_FR".
    """
    # NOTE(review): the typographic quote/apostrophe literals were empty
    # strings in the reviewed copy (an empty replacement pattern matches
    # everywhere). Restored as escapes; confirm the exact characters upstream.
    settings_args = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        # U+201C left double quote, U+201E double low-9 quote
        "begin_punctuations": {'"', "\u201c", "«", "[", "(", "<", "\u201e"},
        # U+201D right double quote
        "end_punctuations": {'"', "\u201d", "»", "]", ")", ">"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY_ORDINAL,
        # Normalize typographic apostrophe (U+2019) to ASCII
        "replacements": [("\u2019", "'")],
        "post_process_sentence": fr_post_process_sentence,
        **settings_args,
    }
    return TextProcessorSettings(lang="fr_FR", **settings_args)
# -----------------------------------------------------------------------------
# Italian (it-it, Italiano)
# -----------------------------------------------------------------------------
def get_it_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create settings for Italian.

    Args:
        lang_dir: Language data directory; accepted so all language
            factories share one signature, not used here.
        **settings_args: Overrides that take precedence over the defaults
            below. A "post_process_sentence" hook can still be supplied
            here.

    Returns:
        TextProcessorSettings built with lang="it_IT".
    """
    # NOTE(review): the typographic quote/apostrophe literals were empty
    # strings in the reviewed copy (an empty replacement pattern matches
    # everywhere). Restored as escapes; confirm the exact characters upstream.
    #
    # The French liaison post-processor is deliberately not installed: its
    # rules (the words "et" and "très", French silent consonants) do not
    # apply to Italian.
    settings_args = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        # U+201C left double quote, U+201E double low-9 quote
        "begin_punctuations": {'"', "\u201c", "«", "[", "(", "<", "\u201e"},
        # U+201D right double quote
        "end_punctuations": {'"', "\u201d", "»", "]", ")", ">"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Normalize typographic apostrophe (U+2019) to ASCII
        "replacements": [("\u2019", "'")],
        **settings_args,
    }
    return TextProcessorSettings(lang="it_IT", **settings_args)
# -----------------------------------------------------------------------------
# Luxembourgish (lb, Lëtzebuergesch)
# -----------------------------------------------------------------------------
def get_lb_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create settings for Luxembourgish.

    Args:
        lang_dir: Language data directory; accepted so all language
            factories share one signature, not used here.
        **settings_args: Overrides that take precedence over the defaults
            below.

    Returns:
        TextProcessorSettings built with lang="lb".
    """
    # NOTE(review): the typographic quote/apostrophe literals were empty
    # strings in the reviewed copy (an empty replacement pattern matches
    # everywhere). Restored as escapes; confirm the exact characters upstream.
    settings_args = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        # U+201C left double quote, U+201E double low-9 quote
        "begin_punctuations": {'"', "\u201c", "«", "[", "(", "<", "\u201e"},
        # U+201D right double quote
        "end_punctuations": {'"', "\u201d", "»", "]", ")", ">"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Normalize typographic apostrophe (U+2019) to ASCII
        "replacements": [("\u2019", "'")],
        "babel_locale": "lb",
        **settings_args,
    }
    return TextProcessorSettings(lang="lb", **settings_args)
# -----------------------------------------------------------------------------
# Dutch (nl, Nederlands)
# -----------------------------------------------------------------------------
def get_nl_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create settings for Dutch.

    Args:
        lang_dir: Language data directory; accepted so all language
            factories share one signature, not used here.
        **settings_args: Overrides that take precedence over the defaults
            below.

    Returns:
        TextProcessorSettings built with lang="nl".
    """
    # NOTE(review): the typographic quote/apostrophe literals were empty
    # strings in the reviewed copy (an empty replacement pattern matches
    # everywhere). Restored as escapes; confirm the exact characters upstream.
    settings_args = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        # U+201C left double quote, U+201E double low-9 quote
        "begin_punctuations": {'"', "\u201c", "«", "[", "(", "<", "\u201e"},
        # U+201D right double quote
        "end_punctuations": {'"', "\u201d", "»", "]", ")", ">"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Normalize typographic apostrophe (U+2019) to ASCII
        "replacements": [("\u2019", "'")],
        **settings_args,
    }
    return TextProcessorSettings(lang="nl", **settings_args)
# -----------------------------------------------------------------------------
# Portuguese (pt, Português)
# -----------------------------------------------------------------------------
def get_pt_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create default settings for Portuguese.

    Args:
        lang_dir: Language data directory; accepted so all language
            factories share one signature, not used here.
        **settings_args: Overrides that take precedence over the defaults
            below.

    Returns:
        TextProcessorSettings built with lang="pt".
    """
    # NOTE(review): the typographic quote/apostrophe literals were empty
    # strings in the reviewed copy (an empty replacement pattern matches
    # everywhere). Restored as escapes; confirm the exact characters upstream.
    settings_args = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        # U+201C left double quote, U+201E double low-9 quote
        "begin_punctuations": {'"', "\u201c", "«", "[", "(", "<", "\u201e"},
        # U+201D right double quote
        "end_punctuations": {'"', "\u201d", "»", "]", ")", ">"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Normalize typographic apostrophe (U+2019) to ASCII
        "replacements": [("\u2019", "'")],
        **settings_args,
    }
    return TextProcessorSettings(lang="pt", **settings_args)
# -----------------------------------------------------------------------------
# Russian (ru, Русский)
# -----------------------------------------------------------------------------
def get_ru_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create settings for Russian.

    Args:
        lang_dir: Language data directory; accepted so all language
            factories share one signature, not used here.
        **settings_args: Overrides that take precedence over the defaults
            below.

    Returns:
        TextProcessorSettings built with lang="ru_RU".
    """
    # NOTE(review): the typographic quote/apostrophe literals were empty
    # strings in the reviewed copy (an empty replacement pattern matches
    # everywhere). Restored as escapes; confirm the exact characters upstream.
    settings_args = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":"},
        "word_breaks": {"-", "_"},
        # U+201C left double quote, U+201E double low-9 quote
        "begin_punctuations": {'"', "\u201c", "«", "[", "(", "<", "\u201e"},
        # U+201D right double quote
        "end_punctuations": {'"', "\u201d", "»", "]", ")", ">"},
        "default_currency": "RUB",
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Normalize typographic apostrophe (U+2019) to ASCII
        "replacements": [("\u2019", "'")],
        **settings_args,
    }
    return TextProcessorSettings(lang="ru_RU", **settings_args)
# -----------------------------------------------------------------------------
# Swedish (sv-se, svenska)
# -----------------------------------------------------------------------------
def get_sv_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create settings for Swedish.

    Args:
        lang_dir: Language data directory; accepted so all language
            factories share one signature, not used here.
        **settings_args: Overrides that take precedence over the defaults
            below.

    Returns:
        TextProcessorSettings built with lang="sv_SE".
    """
    # NOTE(review): the typographic quote/apostrophe literals were empty
    # strings in the reviewed copy (an empty replacement pattern matches
    # everywhere). Restored as escapes; confirm the exact characters upstream.
    settings_args = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        # U+201C left double quote, U+201E double low-9 quote
        "begin_punctuations": {'"', "\u201c", "«", "[", "(", "<", "\u201e"},
        # U+201D right double quote
        "end_punctuations": {'"', "\u201d", "»", "]", ")", ">"},
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Normalize typographic apostrophe (U+2019) to ASCII
        "replacements": [("\u2019", "'")],
        **settings_args,
    }
    return TextProcessorSettings(lang="sv_SE", **settings_args)
# -----------------------------------------------------------------------------
# Swahili (sw, Kiswahili)
# -----------------------------------------------------------------------------
def get_sw_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create settings for Swahili.

    Args:
        lang_dir: Language data directory; accepted so all language
            factories share one signature, not used here.
        **settings_args: Overrides that take precedence over the defaults
            below.

    Returns:
        TextProcessorSettings built with lang="sw".
    """
    # NOTE(review): the typographic quote/apostrophe literals were empty
    # strings in the reviewed copy (an empty replacement pattern matches
    # everywhere). Restored as escapes; confirm the exact characters upstream.
    settings_args = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":"},
        "word_breaks": {"-", "_"},
        # U+201C left double quote, U+201E double low-9 quote
        "begin_punctuations": {'"', "\u201c", "«", "[", "(", "<", "\u201e"},
        # U+201D right double quote
        "end_punctuations": {'"', "\u201d", "»", "]", ")", ">"},
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Normalize typographic apostrophe (U+2019) to ASCII
        "replacements": [("\u2019", "'")],
        **settings_args,
    }
    return TextProcessorSettings(lang="sw", **settings_args)
# -----------------------------------------------------------------------------
# Chinese (zh-cn, 汉语)
# -----------------------------------------------------------------------------
def get_zh_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create settings for Chinese.

    Text is split into single characters (split_words=list) and re-joined
    without a separator (join_str="").

    Args:
        lang_dir: Language data directory; accepted so all language
            factories share one signature, not used here.
        **settings_args: Overrides that take precedence over the defaults
            below.

    Returns:
        TextProcessorSettings built with lang="zh_CN".
    """
    # https://en.wikipedia.org/wiki/Chinese_punctuation
    #
    # NOTE(review): every punctuation literal except the double ellipsis was
    # an empty string in the reviewed copy, collapsing each set to {""}.
    # Restored as escapes from standard Chinese punctuation, keeping the
    # original number of entries -- confirm each one upstream.
    settings_args = {
        # Ideographic full stop, fullwidth exclamation mark / question mark
        "major_breaks": {"\u3002", "\uff01", "\uff1f"},
        # Fullwidth semicolon / colon / comma, ideographic comma, ellipsis
        "minor_breaks": {"\uff1b", "\uff1a", "\uff0c", "\u3001", "……"},
        # Fullwidth ( and [, black lenticular bracket, corner brackets,
        # ASCII quote, double and single angle brackets (opening forms)
        "begin_punctuations": {
            "\uff08",
            "\uff3b",
            "\u3010",
            "\u300c",
            "\ufe41",
            '"',
            "\u300a",
            "\u3008",
        },
        # Closing counterparts of the set above
        "end_punctuations": {
            "\uff09",
            "\uff3d",
            "\u3011",
            "\u300d",
            "\ufe42",
            '"',
            "\u300b",
            "\u3009",
        },
        # Hyphenation point
        "word_breaks": {"\u2027"},
        "split_words": list,
        "join_str": "",
        **settings_args,
    }
    return TextProcessorSettings(lang="zh_CN", **settings_args)
# -----------------------------------------------------------------------------
class DelayedGraphemesToPhonemes:
    """Grapheme to phoneme guesser whose model is loaded on the first call."""

    def __init__(
        self,
        model_path: typing.Union[str, Path],
        transform_func: typing.Optional[typing.Callable[[str], str]] = None,
        **g2p_args,
    ):
        # Extra keyword arguments handed to GraphemesToPhonemes on load
        self.g2p_args = g2p_args
        # Optional function applied to each word before guessing
        self.transform_func = transform_func
        # Stays None until the first call
        self.g2p: typing.Optional[GraphemesToPhonemes] = None
        self.model_path = model_path

    def __call__(
        self, word: str, role: typing.Optional[str] = None
    ) -> typing.Optional[PHONEMES_TYPE]:
        """Guess phonemes for word; role is accepted but not used."""
        guesser = self.g2p
        if guesser is None:
            _LOGGER.debug(
                "Loading grapheme to phoneme CRF model from %s", self.model_path
            )
            guesser = GraphemesToPhonemes(self.model_path, **self.g2p_args)
            self.g2p = guesser

        assert guesser is not None

        transform = self.transform_func
        if transform is None:
            return guesser(word)

        return guesser(transform(word))
class DelayedPartOfSpeechTagger:
    """Part of speech tagger whose model is loaded on the first call."""

    def __init__(self, model_path: typing.Union[str, Path], **tagger_args):
        # Extra keyword arguments handed to PartOfSpeechTagger on load
        self.tagger_args = tagger_args
        # Stays None until the first call
        self.tagger: typing.Optional[PartOfSpeechTagger] = None
        self.model_path = Path(model_path)

    def __call__(self, words: typing.Sequence[str]) -> typing.Sequence[str]:
        """Return the tagger's output for words, loading the model if needed."""
        tagger = self.tagger
        if tagger is None:
            _LOGGER.debug("Loading part of speech tagger from %s", self.model_path)
            tagger = PartOfSpeechTagger(self.model_path, **self.tagger_args)
            self.tagger = tagger

        assert tagger is not None

        return tagger(words)
class DelayedSqlitePhonemizer:
    """Lexicon phonemizer that connects to its database on the first call."""

    def __init__(self, db_path: typing.Union[str, Path], **phonemizer_args):
        # Extra keyword arguments handed to SqlitePhonemizer on load
        self.phonemizer_args = phonemizer_args
        # Stays None until the first call
        self.phonemizer: typing.Optional[SqlitePhonemizer] = None
        self.db_path = Path(db_path)

    def __call__(
        self, word: str, role: typing.Optional[str] = None, do_transforms: bool = True
    ) -> typing.Optional[PHONEMES_TYPE]:
        """Look word up, forwarding role and do_transforms to the phonemizer."""
        phonemizer = self.phonemizer
        if phonemizer is None:
            _LOGGER.debug("Connecting to lexicon database at %s", self.db_path)
            # The connection is kept open for the phonemizer's lifetime
            connection = sqlite3.connect(str(self.db_path))
            phonemizer = SqlitePhonemizer(db_conn=connection, **self.phonemizer_args)
            self.phonemizer = phonemizer

        assert phonemizer is not None

        return phonemizer(word, role=role, do_transforms=do_transforms)