883 lines
30 KiB
Python
883 lines
30 KiB
Python
|
#!/usr/bin/env python3
|
|||
|
"""Language-specific settings"""
|
|||
|
import logging
|
|||
|
import re
|
|||
|
import sqlite3
|
|||
|
import typing
|
|||
|
from pathlib import Path
|
|||
|
|
|||
|
import networkx as nx
|
|||
|
|
|||
|
from gruut.const import PHONEMES_TYPE, GraphType, SentenceNode, Time
|
|||
|
from gruut.g2p import GraphemesToPhonemes
|
|||
|
from gruut.phonemize import SqlitePhonemizer
|
|||
|
from gruut.pos import PartOfSpeechTagger
|
|||
|
from gruut.text_processor import InterpretAsFormat, TextProcessorSettings
|
|||
|
from gruut.utils import find_lang_dir, remove_non_word_chars, resolve_lang
|
|||
|
|
|||
|
_LOGGER = logging.getLogger("gruut")
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
def get_settings(
    lang: str,
    search_dirs: typing.Optional[typing.Iterable[typing.Union[str, Path]]] = None,
    lang_dir: typing.Optional[typing.Union[str, Path]] = None,
    model_prefix: typing.Optional[str] = None,
    load_pos_tagger: bool = True,
    load_phoneme_lexicon: bool = True,
    load_g2p_guesser: bool = True,
    **settings_args,
) -> TextProcessorSettings:
    """Build the text processor settings for a language.

    ``lang`` is either a plain language name ("en-us") or a name followed by
    a model prefix ("en-us/espeak").  A non-empty ``model_prefix`` takes
    precedence; in that case ``lang`` is used as the language name as is.
    The language name is normalised with ``resolve_lang``.

    When ``lang_dir`` is not given it is looked up with ``find_lang_dir``
    (using ``search_dirs``).  If a language directory is available, lazily
    loading helpers are added to ``settings_args`` for each model file that
    exists, unless the matching ``load_*`` flag is false or the caller
    already supplied that setting:

    * ``get_parts_of_speech`` from ``pos/model.crf``
    * ``lookup_phonemes`` from ``<prefix>/lexicon.db``
    * ``guess_phonemes`` from ``<prefix>/g2p/model.crf``

    The result comes from the language-specific factory when one exists for
    the resolved language, otherwise from plain ``TextProcessorSettings``.
    """
    model_prefix = model_prefix or ""

    # Split the requested language into (language name, model prefix)
    if model_prefix:
        # Explicit prefix, e.g. "espeak"
        lang_only, lang_model_prefix = lang, model_prefix
    elif "/" in lang:
        # Combined form, e.g. "en-us/espeak"
        lang_only, lang_model_prefix = lang.split("/", maxsplit=1)
    else:
        # Plain form, e.g. "en-us"
        lang_only, lang_model_prefix = lang, ""

    # Normalise the name (en_US -> en-us)
    lang_only = resolve_lang(lang_only)

    if lang_dir is None:
        # No directory supplied: search for the language data files
        lang_dir = find_lang_dir(lang_only, search_dirs=search_dirs)

    if lang_dir is not None:
        lang_dir = Path(lang_dir)
        model_dir = lang_dir / lang_model_prefix

        # Part of speech tagger
        wants_pos_tagger = load_pos_tagger and (
            "get_parts_of_speech" not in settings_args
        )
        if wants_pos_tagger:
            pos_model_path = lang_dir / "pos" / "model.crf"
            if not pos_model_path.is_file():
                _LOGGER.debug(
                    "(%s) no part of speech tagger found at %s", lang, pos_model_path
                )
            else:
                # The model itself is only loaded on first use
                settings_args["get_parts_of_speech"] = DelayedPartOfSpeechTagger(
                    pos_model_path
                )

        # Phoneme lexicon
        wants_lexicon = load_phoneme_lexicon and (
            "lookup_phonemes" not in settings_args
        )
        if wants_lexicon:
            lexicon_db_path = model_dir / "lexicon.db"
            if not lexicon_db_path.is_file():
                _LOGGER.debug(
                    "(%s) no phoneme lexicon database found at %s",
                    lang,
                    lexicon_db_path,
                )
            else:
                settings_args["lookup_phonemes"] = DelayedSqlitePhonemizer(
                    lexicon_db_path,
                    # Fallback transformations tried when a word is not found
                    # in the lexicon
                    word_transform_funcs=[
                        remove_non_word_chars,
                        lambda s: remove_non_word_chars(s.lower()),
                    ],
                    casing_func=str.lower,
                )

        # Grapheme to phoneme model
        wants_g2p = load_g2p_guesser and ("guess_phonemes" not in settings_args)
        if wants_g2p:
            g2p_model_path = model_dir / "g2p" / "model.crf"
            if not g2p_model_path.is_file():
                _LOGGER.debug(
                    "(%s) no grapheme to phoneme CRF model found at %s",
                    lang,
                    g2p_model_path,
                )
            else:
                settings_args["guess_phonemes"] = DelayedGraphemesToPhonemes(
                    g2p_model_path, transform_func=str.lower
                )

    # ---------------------------------
    # Create language-specific settings
    # ---------------------------------
    settings_factories = {
        "ar": get_ar_settings,  # Arabic
        "cs-cz": get_cs_settings,  # Czech
        "en-us": get_en_us_settings,  # English
        "en-gb": get_en_us_settings,
        "de-de": get_de_settings,  # German
        "es-es": get_es_settings,  # Spanish
        "es-mx": get_es_settings,
        "fa": get_fa_settings,  # Farsi
        "fr-fr": get_fr_settings,  # French
        "it-it": get_it_settings,  # Italian
        "lb": get_lb_settings,  # Luxembourgish
        "nl": get_nl_settings,  # Dutch
        "pt": get_pt_settings,  # Portuguese
        "ru-ru": get_ru_settings,  # Russian
        "sv-se": get_sv_settings,  # Swedish
        "sw": get_sw_settings,  # Swahili
        "zh-cn": get_zh_settings,  # Chinese
    }

    make_settings = settings_factories.get(lang_only)
    if make_settings is not None:
        return make_settings(lang_dir, **settings_args)

    # No language-specific factory: default settings only
    return TextProcessorSettings(lang=lang, **settings_args)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
# Arabic (ar, اَلْعَرَبِيَّةُ)
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
class ArabicPreProcessText:
    """Callable text pre-processor that adds diacritics with mishkal.

    The mishkal vocalizer is created on the first call and cached on the
    instance as ``vocalizer``.  If mishkal cannot be imported, two warnings
    are logged and the text is returned unchanged.
    """

    def __call__(self, text: str) -> str:
        try:
            import mishkal.tashkeel

            # Reuse the cached vocalizer, creating it on first use
            if hasattr(self, "vocalizer"):
                tashkeel_engine = getattr(self, "vocalizer")
            else:
                tashkeel_engine = mishkal.tashkeel.TashkeelClass()
                setattr(self, "vocalizer", tashkeel_engine)

            assert tashkeel_engine is not None

            # Add diacritics
            text = tashkeel_engine.tashkeel(text)
        except ImportError:
            _LOGGER.warning("mishkal is highly recommended for language 'ar'")
            _LOGGER.warning("pip install 'mishkal>=0.4.0'")

        return text
|
|||
|
|
|||
|
|
|||
|
def get_ar_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Build the text processor settings used for Arabic.

    Values passed in ``settings_args`` override the defaults listed here.
    ``lang_dir`` is accepted but not used by this function.
    """
    arabic_settings = {
        "major_breaks": {".", "؟", "!"},
        "minor_breaks": {"،", ";", ":"},
        "word_breaks": {"-", "_"},
        "begin_punctuations": {'"', "“", "«", "[", "(", "<", "„"},
        "end_punctuations": {'"', "”", "»", "]", ")", ">"},
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Map the typographic apostrophe to the ASCII one
        "replacements": [("’", "'")],
        "pre_process_text": ArabicPreProcessText(),
    }

    # Caller-provided settings win over the defaults
    arabic_settings.update(settings_args)

    return TextProcessorSettings(lang="ar", **arabic_settings)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
# Czech (cs-cz, čeština)
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
def get_cs_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Build the text processor settings used for Czech.

    Values passed in ``settings_args`` override the defaults listed here.
    ``lang_dir`` is accepted but not used by this function.
    """
    czech_settings = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":"},
        "word_breaks": {"-", "_"},
        "begin_punctuations": {'"', "“", "«", "[", "(", "<", "’", "„"},
        "end_punctuations": {'"', "”", "»", "]", ")", ">", "’"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Map the typographic apostrophe to the ASCII one
        "replacements": [("’", "'")],
    }

    # Caller-provided settings win over the defaults
    czech_settings.update(settings_args)

    return TextProcessorSettings(lang="cs_CZ", **czech_settings)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
# English (en-us, en-gb)
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
# Initialisms: all capitals (TTS) or single letters followed by dots (T.T.S.)
EN_INITIALISM_PATTERN = re.compile(r"^\s*[A-Z]{2,}\s*$")
EN_INITIALISM_DOTS_PATTERN = re.compile(r"^(?:\s*[a-zA-Z]\.){1,}\s*$")

# Text made up only of non-word characters and/or underscores
EN_NON_WORD_PATTERN = re.compile(r"^(\W|_)+$")

# Number (optionally signed, commas allowed) followed by st/nd/rd/th
EN_ORDINAL_PATTERN = re.compile(r"^(-?[0-9][0-9,]*)(?:st|nd|rd|th).*$")

# Clock time such as "4", "4:01", "16:30", "4:01pm" or "4:01 p.m.".
#
# Group numbers are relied upon by en_parse_time:
#   1 = hours, 6 = minutes, 7 = am/pm marker
#
# The dots in the am/pm alternatives are escaped with a single backslash.
# Inside a raw string "\\." would instead mean "a literal backslash followed
# by any character", which never matches real input such as "p.m.".
EN_TIME_PATTERN = re.compile(
    r"""^((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3]))  # hours
        (?::
        ([0-5][0-9]))?                           # minutes
        \s*(a\.m\.|am|pm|p\.m\.|a\.m|p\.m)?      # am/pm
        $""",
    re.IGNORECASE | re.X,
)
|
|||
|
|
|||
|
|
|||
|
def en_is_initialism(text: str) -> bool:
    """Return True when text looks like an initialism (TTS or T.T.S.)."""
    # The all-capitals form is tried first, then the dotted form
    initialism_patterns = (EN_INITIALISM_PATTERN, EN_INITIALISM_DOTS_PATTERN)
    return any(pattern.match(text) is not None for pattern in initialism_patterns)
|
|||
|
|
|||
|
|
|||
|
def en_get_ordinal(text: str) -> typing.Optional[int]:
    """Return the number of an English ordinal string (e.g., 1st -> 1).

    Returns None when text does not match EN_ORDINAL_PATTERN.  Only the
    decimal digits of the numeric part are kept, so thousands separators
    (and a leading minus sign) are dropped before conversion.
    """
    ordinal_match = EN_ORDINAL_PATTERN.match(text)
    if ordinal_match is None:
        return None

    digits_only = "".join(
        char for char in ordinal_match.group(1) if char in "0123456789"
    )
    return int(digits_only)
|
|||
|
|
|||
|
|
|||
|
def en_parse_time(text: str) -> typing.Optional[Time]:
    """Parse an English clock time (e.g. 4:01pm) into a Time.

    The text is stripped and lower-cased before it is matched against
    EN_TIME_PATTERN.  Returns None when it does not match, or when it has
    neither an am/pm marker nor a colon (so that plain numbers like "1" are
    not treated as times).  A marker is normalised to "A.M." or "P.M.";
    missing minutes become 0.
    """
    time_match = EN_TIME_PATTERN.match(text.strip().lower())
    if time_match is None:
        return None

    hours_text, minutes_text, period_text = time_match.group(1, 6, 7)

    if period_text is None:
        if ":" not in text:
            # Without a marker a colon is required; otherwise any bare
            # number would be read as a time expression.
            return None

        period = None
    elif "a" in period_text:
        period = "A.M."
    else:
        period = "P.M."

    return Time(
        hours=int(hours_text),
        minutes=int(minutes_text) if minutes_text is not None else 0,
        period=period,
    )
|
|||
|
|
|||
|
|
|||
|
def en_verbalize_time(time: Time) -> typing.Iterable[str]:
    """Yield the words used to speak a clock time.

    Yields, in order:

    * the hour on a 12-hour clock, as a decimal string (hour 0 becomes "12",
      hours above 12 have 12 subtracted)
    * for non-zero minutes: "oh" when the minute is below 10, then the
      minute as a decimal string
    * the period: ``time.period`` when it is set, otherwise "A.M." for
      hours 0-11 and "P.M." for hours 12 and above

    ``time`` must provide ``hours``, ``minutes`` and ``period`` attributes.
    """
    hour = time.hours

    # Decided on the original hour value: 12 and above is after noon,
    # 0-11 (midnight included) is before noon.
    past_noon = hour >= 12

    if hour > 12:
        hour -= 12
    elif hour == 0:
        # The midnight hour is spoken as "12" but remains A.M.
        hour = 12

    yield str(hour)

    minute = time.minutes
    if minute > 0:
        if minute < 10:
            # 4:05 -> "4 oh 5"
            yield "oh"

        yield str(minute)

    if time.period is None:
        yield "P.M." if past_noon else "A.M."
    else:
        yield time.period
|
|||
|
|
|||
|
|
|||
|
def get_en_us_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Build the text processor settings used for English.

    Values passed in ``settings_args`` override the defaults listed here.
    ``lang_dir`` is accepted but not used by this function.
    """
    # Regex pattern -> replacement template for abbreviation expansion
    abbreviations = {
        r"^([cC])o\.": r"\1ompany",  # co. -> company
        r"^([dD])r\.": r"\1octor",  # dr. -> doctor
        r"^([dD])rs\.": r"\1octors",  # drs. -> doctors
        r"^([jJ])r\.('s)?": r"\1unior\2",  # jr. -> junior
        r"^([lL])td\.": r"\1imited",  # ltd. -> limited
        r"^([mM])r\.": r"\1ister",  # mr. -> mister
        r"^([mM])s\.": r"\1iss",  # ms. -> miss
        r"^([mM])rs\.": r"\1issus",  # mrs. -> missus
        r"^([sS])t\.": r"\1treet",  # st. -> street
        r"^([vV])s\.?": r"\1ersus",  # vs. -> versus
        r"(.*\d)%": r"\1 percent",  # % -> percent
        r"^&(\s*)$": r"and\1",  # & -> and
        r"^([mM])t\.": r"\1ount",  # mt. -> mount
    }

    # Spoken names of symbols, used when spelling text out
    spell_out_words = {
        ".": "dot",
        "-": "dash",
        "@": "at",
        "*": "star",
        "+": "plus",
        "/": "slash",
    }

    english_settings = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        "begin_punctuations": {'"', "'", "“", "«", "[", "(", "<", "*", "_"},
        "end_punctuations": {'"', "'", "”", "»", "]", ")", ">", "*", "_"},
        "default_currency": "USD",
        "default_date_format": "{m} {o}, {y}",
        "is_initialism": en_is_initialism,
        # T.T.S. -> ["T", "T", "S"]
        "split_initialism": lambda text: list(text.replace(".", "")),
        "is_non_word": lambda text: EN_NON_WORD_PATTERN.match(text) is not None,
        "get_ordinal": en_get_ordinal,
        "parse_time": en_parse_time,
        "verbalize_time": en_verbalize_time,
        # Map the typographic apostrophe to the ASCII one
        "replacements": [("’", "'")],
        "abbreviations": abbreviations,
        "spell_out_words": spell_out_words,
    }

    # Caller-provided settings win over the defaults
    english_settings.update(settings_args)

    return TextProcessorSettings(lang="en_US", **english_settings)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
# German (de-de)
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
def get_de_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Build the text processor settings used for German.

    Values passed in ``settings_args`` override the defaults listed here.
    ``lang_dir`` is accepted but not used by this function.
    """
    german_settings = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        "begin_punctuations": {'"', "“", "«", "[", "(", "<", "’", "„"},
        "end_punctuations": {'"', "”", "»", "]", ")", ">", "’"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY_ORDINAL,
        # Map the typographic apostrophe to the ASCII one
        "replacements": [("’", "'")],
    }

    # Caller-provided settings win over the defaults
    german_settings.update(settings_args)

    return TextProcessorSettings(lang="de_DE", **german_settings)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
# Spanish (es-es, Español)
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
def get_es_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Build the text processor settings used for Spanish.

    Values passed in ``settings_args`` override the defaults listed here.
    ``lang_dir`` is accepted but not used by this function.
    """
    spanish_settings = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        # Includes the inverted marks that open Spanish questions/exclamations
        "begin_punctuations": {'"', "“", "«", "[", "(", "<", "¡", "¿"},
        "end_punctuations": {'"', "”", "»", "]", ")", ">"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Map the typographic apostrophe to the ASCII one
        "replacements": [("’", "'")],
    }

    # Caller-provided settings win over the defaults
    spanish_settings.update(settings_args)

    return TextProcessorSettings(lang="es_ES", **spanish_settings)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
# Farsi/Persian (fa, فارسی)
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
class FarsiPartOfSpeechTagger:
    """Callable that produces part of speech tags with hazm.

    The hazm normalizer and POS tagger are created on the first call and
    cached on the instance as ``normalizer`` and ``tagger``.  The tagger
    model is read from ``<lang_dir>/pos/postagger.model``.
    """

    def __init__(self, lang_dir: typing.Union[str, Path]):
        # Accept plain strings as well: the model path is built with the
        # "/" operator later on, which requires a Path.
        self.lang_dir = Path(lang_dir)

    def __call__(self, words: typing.Sequence[str]) -> typing.Sequence[str]:
        """Return the POS tags hazm assigns to the given words.

        The words are joined with spaces, normalized and re-tokenized by
        hazm; one tag is collected per hazm token.  If hazm cannot be
        imported, two warnings are logged and an empty list is returned.
        """
        pos_tags: typing.List[str] = []

        try:
            import hazm

            # Load normalizer
            normalizer = getattr(self, "normalizer", None)
            if normalizer is None:
                normalizer = hazm.Normalizer()
                setattr(self, "normalizer", normalizer)

            # Load part of speech tagger
            tagger = getattr(self, "tagger", None)
            if tagger is None:
                model_path = self.lang_dir / "pos" / "postagger.model"
                tagger = hazm.POSTagger(model=str(model_path))
                setattr(self, "tagger", tagger)

            text = " ".join(words)
            for sentence in hazm.sent_tokenize(normalizer.normalize(text)):
                pos_tags.extend(
                    pos for _word, pos in tagger.tag(hazm.word_tokenize(sentence))
                )
        except ImportError:
            _LOGGER.warning("hazm is highly recommended for language 'fa'")
            _LOGGER.warning("pip install 'hazm>=0.7.0'")

        return pos_tags
|
|||
|
|
|||
|
|
|||
|
def fa_post_process_sentence(
    graph: GraphType, sent_node: SentenceNode, settings: TextProcessorSettings
):
    """Append e̞ to the phonemes of words tagged "Ne" (genitive case).

    Walks the leaf nodes below ``sent_node`` in depth-first order.  Words
    without phonemes are left alone.  ``settings`` is not used.
    """
    from gruut.text_processor import DATA_PROP, WordNode

    for node_id in nx.dfs_preorder_nodes(graph, sent_node.node):
        if graph.out_degree(node_id) != 0:
            # Not a leaf
            continue

        word = graph.nodes[node_id][DATA_PROP]
        if not isinstance(word, WordNode):
            continue

        if not (word.phonemes and (word.pos == "Ne")):
            continue

        if isinstance(word.phonemes, list):
            # Lists are extended in place
            word.phonemes.append("e̞")
        else:
            # Any other sequence is replaced by a new list
            word.phonemes = list(word.phonemes) + ["e̞"]
|
|||
|
|
|||
|
|
|||
|
def get_fa_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Build the text processor settings used for Farsi.

    Values passed in ``settings_args`` override the defaults listed here.
    When ``lang_dir`` is given and no ``get_parts_of_speech`` setting was
    supplied, a FarsiPartOfSpeechTagger for that directory is installed.
    """
    farsi_settings = {
        "major_breaks": {".", "؟", "!"},
        "minor_breaks": {",", ";", ":"},
        "word_breaks": {"-", "_"},
        "begin_punctuations": {'"', "“", "«", "[", "(", "<", "’", "„"},
        "end_punctuations": {'"', "”", "»", "]", ")", ">", "’"},
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Map the typographic apostrophe to the ASCII one
        "replacements": [("’", "'")],
        "post_process_sentence": fa_post_process_sentence,
    }

    # Caller-provided settings win over the defaults
    farsi_settings.update(settings_args)

    needs_pos_tagger = "get_parts_of_speech" not in farsi_settings
    if needs_pos_tagger and (lang_dir is not None):
        farsi_settings["get_parts_of_speech"] = FarsiPartOfSpeechTagger(lang_dir)

    return TextProcessorSettings(lang="fa", **farsi_settings)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
# French (fr-fr, Français)
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
def fr_post_process_sentence(
    graph: GraphType, sent_node: SentenceNode, settings: TextProcessorSettings
):
    """Add liaisons to the phonemes of the words in a sentence.

    Collects the word leaf nodes below ``sent_node`` in depth-first order
    and looks at each pair of neighbouring words.  A liaison is considered
    when the first word ends in a silent consonant and the second word
    starts with a vowel phoneme; it is applied for the part of speech
    combinations listed below by appending one phoneme to the first word.

    Phonemes stored as a list are extended in place; any other sequence is
    replaced by a new list.  ``settings`` is not used.
    """
    from gruut.text_processor import DATA_PROP, WordNode
    from gruut.utils import sliding_window

    words = []
    for dfs_node in nx.dfs_preorder_nodes(graph, sent_node.node):
        if not graph.out_degree(dfs_node) == 0:
            # Only leaves
            continue

        node = graph.nodes[dfs_node][DATA_PROP]
        if isinstance(node, WordNode):
            words.append(typing.cast(WordNode, node))

    for word1, word2 in sliding_window(words, 2):
        if word2 is None:
            # Final window has no second word
            continue

        if not (word1.text and word1.phonemes and word2.text and word2.phonemes):
            continue

        # Conditions to meet for liaison check:
        # 1) word 1 ends with a silent consonant
        # 2) word 2 starts with a vowel (phoneme)
        last_char1 = word1.text[-1]
        ends_silent_consonant = fr_has_silent_consonant(last_char1, word1.phonemes[-1])
        starts_vowel = fr_is_vowel(word2.phonemes[0])

        if not (ends_silent_consonant and starts_vowel):
            continue

        # Handle mandatory liaison cases
        # https://www.commeunefrancaise.com/blog/la-liaison
        liaison = False

        if word1.text == "et":
            # No liaison
            pass
        elif word1.pos in {"DET", "NUM"}:
            # Determiner/number -> following word
            liaison = True
        elif (word1.pos == "PRON") and (word2.pos in {"AUX", "VERB"}):
            # Pronoun -> verb
            liaison = True
        elif (word1.pos == "ADP") or (word1.text == "très"):
            # Preposition
            liaison = True
        elif (word1.pos == "ADJ") and (word2.pos in {"NOUN", "PROPN"}):
            # Adjective -> noun
            liaison = True
        elif word1.pos in {"AUX", "VERB"}:
            # Verb -> vowel
            liaison = True

        if not liaison:
            continue

        # Phoneme added for the liaison, chosen from the final letter:
        #   s, x, z -> z
        #   d       -> t
        #   t, p, n -> same as the letter
        if last_char1 in {"s", "x", "z"}:
            liaison_phoneme = "z"
        elif last_char1 == "d":
            liaison_phoneme = "t"
        elif last_char1 in {"t", "p", "n"}:
            liaison_phoneme = last_char1
        else:
            # No liaison phoneme for other final letters (e.g. "r")
            continue

        if isinstance(word1.phonemes, list):
            word1.phonemes.append(liaison_phoneme)
        else:
            # Phonemes may be any sequence (e.g. a tuple), which cannot be
            # appended to in place.
            word1.phonemes = list(word1.phonemes) + [liaison_phoneme]
|
|||
|
|
|||
|
|
|||
|
def fr_has_silent_consonant(last_char: str, last_phoneme: str) -> bool:
    """Return True when a word's final consonant letter is not pronounced.

    ``last_char`` is the final letter of the word and ``last_phoneme`` its
    final phoneme.  Only the letters d, p, t, r, s, x, z and n are
    considered; every other letter gives False.
    """
    # Credit: https://github.com/Remiphilius/PoemesProfonds/blob/master/lecture.py

    if last_char in {"d", "p", "t"}:
        # Pronounced when the phoneme is the letter itself
        is_pronounced = last_phoneme == last_char
    elif last_char == "r":
        is_pronounced = last_phoneme == "ʁ"
    elif last_char in {"s", "x", "z"}:
        is_pronounced = last_phoneme in {"s", "z"}
    elif last_char == "n":
        is_pronounced = last_phoneme in {"n", "ŋ"}
    else:
        return False

    return not is_pronounced
|
|||
|
|
|||
|
|
|||
|
def fr_is_vowel(phoneme: str) -> bool:
    """Return True when phoneme is one of the French vowel phonemes."""
    oral_vowels = {"i", "y", "u", "e", "ø", "o", "ə", "ɛ", "œ", "ɔ", "a"}
    nasal_vowels = {"ɔ̃", "ɛ̃", "ɑ̃", "œ̃"}

    return (phoneme in oral_vowels) or (phoneme in nasal_vowels)
|
|||
|
|
|||
|
|
|||
|
def get_fr_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Build the text processor settings used for French.

    Values passed in ``settings_args`` override the defaults listed here.
    ``lang_dir`` is accepted but not used by this function.
    """
    french_settings = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        "begin_punctuations": {'"', "“", "«", "[", "(", "<", "„"},
        "end_punctuations": {'"', "”", "»", "]", ")", ">"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY_ORDINAL,
        # Map the typographic apostrophe to the ASCII one
        "replacements": [("’", "'")],
        # Adds liaison phonemes between neighbouring words
        "post_process_sentence": fr_post_process_sentence,
    }

    # Caller-provided settings win over the defaults
    french_settings.update(settings_args)

    return TextProcessorSettings(lang="fr_FR", **french_settings)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
# Italian (it-it, Italiano)
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
def get_it_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Build the text processor settings used for Italian.

    Values passed in ``settings_args`` override the defaults listed here.
    ``lang_dir`` is accepted but not used by this function.
    """
    italian_settings = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        "begin_punctuations": {'"', "“", "«", "[", "(", "<", "„"},
        "end_punctuations": {'"', "”", "»", "]", ")", ">"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Map the typographic apostrophe to the ASCII one
        "replacements": [("’", "'")],
        # NOTE(review): this is the French sentence post-processor; confirm
        # that reusing it for Italian is intended.
        "post_process_sentence": fr_post_process_sentence,
    }

    # Caller-provided settings win over the defaults
    italian_settings.update(settings_args)

    return TextProcessorSettings(lang="it_IT", **italian_settings)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
# Luxembourgish (lb, Lëtzebuergesch)
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
def get_lb_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Build the text processor settings used for Luxembourgish.

    Values passed in ``settings_args`` override the defaults listed here.
    ``lang_dir`` is accepted but not used by this function.
    """
    luxembourgish_settings = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        "begin_punctuations": {'"', "“", "«", "[", "(", "<", "„"},
        "end_punctuations": {'"', "”", "»", "]", ")", ">"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Map the typographic apostrophe to the ASCII one
        "replacements": [("’", "'")],
        "babel_locale": "lb",
    }

    # Caller-provided settings win over the defaults
    luxembourgish_settings.update(settings_args)

    return TextProcessorSettings(lang="lb", **luxembourgish_settings)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
# Dutch (nl, Nederlands)
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
def get_nl_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Build the text processor settings used for Dutch.

    Values passed in ``settings_args`` override the defaults listed here.
    ``lang_dir`` is accepted but not used by this function.
    """
    dutch_settings = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        "begin_punctuations": {'"', "“", "«", "[", "(", "<", "„"},
        "end_punctuations": {'"', "”", "»", "]", ")", ">"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Map the typographic apostrophe to the ASCII one
        "replacements": [("’", "'")],
    }

    # Caller-provided settings win over the defaults
    dutch_settings.update(settings_args)

    return TextProcessorSettings(lang="nl", **dutch_settings)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
# Portuguese (pt, Português)
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
def get_pt_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Build the text processor settings used for Portuguese.

    Values passed in ``settings_args`` override the defaults listed here.
    ``lang_dir`` is accepted but not used by this function.
    """
    portuguese_settings = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        "begin_punctuations": {'"', "“", "«", "[", "(", "<", "„"},
        "end_punctuations": {'"', "”", "»", "]", ")", ">"},
        "default_currency": "EUR",
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Map the typographic apostrophe to the ASCII one
        "replacements": [("’", "'")],
    }

    # Caller-provided settings win over the defaults
    portuguese_settings.update(settings_args)

    return TextProcessorSettings(lang="pt", **portuguese_settings)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
# Russian (ru, Русский)
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
def get_ru_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Build the text processor settings used for Russian.

    Values passed in ``settings_args`` override the defaults listed here.
    ``lang_dir`` is accepted but not used by this function.
    """
    russian_settings = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":"},
        "word_breaks": {"-", "_"},
        "begin_punctuations": {'"', "“", "«", "[", "(", "<", "„"},
        "end_punctuations": {'"', "”", "»", "]", ")", ">"},
        "default_currency": "RUB",
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Map the typographic apostrophe to the ASCII one
        "replacements": [("’", "'")],
    }

    # Caller-provided settings win over the defaults
    russian_settings.update(settings_args)

    return TextProcessorSettings(lang="ru_RU", **russian_settings)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
# Swedish (sv-se, svenska)
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
def get_sv_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Build the text processor settings used for Swedish.

    Values passed in ``settings_args`` override the defaults listed here.
    ``lang_dir`` is accepted but not used by this function.
    """
    swedish_settings = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":", "..."},
        "word_breaks": {"-", "_"},
        "begin_punctuations": {'"', "“", "«", "[", "(", "<", "„"},
        "end_punctuations": {'"', "”", "»", "]", ")", ">"},
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Map the typographic apostrophe to the ASCII one
        "replacements": [("’", "'")],
    }

    # Caller-provided settings win over the defaults
    swedish_settings.update(settings_args)

    return TextProcessorSettings(lang="sv_SE", **swedish_settings)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
# Swahili (sw, Kiswahili)
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
def get_sw_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Build the text processor settings used for Swahili.

    Values passed in ``settings_args`` override the defaults listed here.
    ``lang_dir`` is accepted but not used by this function.
    """
    swahili_settings = {
        "major_breaks": {".", "?", "!"},
        "minor_breaks": {",", ";", ":"},
        "word_breaks": {"-", "_"},
        "begin_punctuations": {'"', "“", "«", "[", "(", "<", "„"},
        "end_punctuations": {'"', "”", "»", "]", ")", ">"},
        "default_date_format": InterpretAsFormat.DATE_DMY,
        # Map the typographic apostrophe to the ASCII one
        "replacements": [("’", "'")],
    }

    # Caller-provided settings win over the defaults
    swahili_settings.update(settings_args)

    return TextProcessorSettings(lang="sw", **swahili_settings)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
# Chinese (zh-cn, 汉语)
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
def get_zh_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
    """Create settings for Chinese.

    Values passed in ``settings_args`` override the defaults listed here.
    ``lang_dir`` is accepted but not used by this function.

    Text is split into single characters (``split_words`` is ``list``) and
    joined back with an empty string.
    """

    # https://en.wikipedia.org/wiki/Chinese_punctuation
    #
    # Chinese text normally uses the full-width forms of ! ? ; : , ( ) [ ],
    # so those are listed alongside the ASCII forms that were already
    # accepted.
    settings_args = {
        "major_breaks": {"。", "！", "？", "!", "?"},
        "minor_breaks": {"；", "：", "，", ";", ":", ",", "、", "……"},
        "begin_punctuations": {
            "（",
            "［",
            "(",
            "[",
            "【",
            "「",
            "﹁",
            '"',
            "《",
            "〈",
        },
        # "】" must not carry a leading space, otherwise it never pairs with
        # the "【" in begin_punctuations.
        "end_punctuations": {
            "）",
            "］",
            ")",
            "]",
            "】",
            "」",
            "﹂",
            '"',
            "》",
            "〉",
        },
        "word_breaks": {"‧"},
        "split_words": list,
        "join_str": "",
        **settings_args,
    }
    return TextProcessorSettings(lang="zh_CN", **settings_args)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
class DelayedGraphemesToPhonemes:
    """Callable phoneme guesser whose CRF model is loaded on the first call.

    ``transform_func``, when given, is applied to each word before it is
    passed to the model.  Extra keyword arguments are forwarded to
    GraphemesToPhonemes when the model is loaded.
    """

    def __init__(
        self,
        model_path: typing.Union[str, Path],
        transform_func: typing.Optional[typing.Callable[[str], str]] = None,
        **g2p_args,
    ):
        self.model_path = model_path
        # Stays None until the first call
        self.g2p: typing.Optional[GraphemesToPhonemes] = None
        self.transform_func = transform_func
        self.g2p_args = g2p_args

    def __call__(
        self, word: str, role: typing.Optional[str] = None
    ) -> typing.Optional[PHONEMES_TYPE]:
        """Guess the phonemes of word (``role`` is accepted but not used)."""
        guesser = self.g2p
        if guesser is None:
            _LOGGER.debug(
                "Loading grapheme to phoneme CRF model from %s", self.model_path
            )
            guesser = GraphemesToPhonemes(self.model_path, **self.g2p_args)
            self.g2p = guesser

        assert guesser is not None

        transform = self.transform_func
        if transform is not None:
            word = transform(word)

        return guesser(word)
|
|||
|
|
|||
|
|
|||
|
class DelayedPartOfSpeechTagger:
    """Callable POS tagger whose model is loaded on the first call.

    Extra keyword arguments are forwarded to PartOfSpeechTagger when the
    model is loaded.
    """

    def __init__(self, model_path: typing.Union[str, Path], **tagger_args):
        self.model_path = Path(model_path)
        # Stays None until the first call
        self.tagger: typing.Optional[PartOfSpeechTagger] = None
        self.tagger_args = tagger_args

    def __call__(self, words: typing.Sequence[str]) -> typing.Sequence[str]:
        """Return the tags produced by the underlying tagger for words."""
        loaded_tagger = self.tagger
        if loaded_tagger is None:
            _LOGGER.debug("Loading part of speech tagger from %s", self.model_path)
            loaded_tagger = PartOfSpeechTagger(self.model_path, **self.tagger_args)
            self.tagger = loaded_tagger

        assert loaded_tagger is not None
        return loaded_tagger(words)
|
|||
|
|
|||
|
|
|||
|
class DelayedSqlitePhonemizer:
    """Callable lexicon lookup that connects to its database on the first call.

    Extra keyword arguments are forwarded to SqlitePhonemizer when the
    connection is made.
    """

    def __init__(self, db_path: typing.Union[str, Path], **phonemizer_args):
        self.db_path = Path(db_path)
        # Stays None until the first call
        self.phonemizer: typing.Optional[SqlitePhonemizer] = None
        self.phonemizer_args = phonemizer_args

    def __call__(
        self, word: str, role: typing.Optional[str] = None, do_transforms: bool = True
    ) -> typing.Optional[PHONEMES_TYPE]:
        """Look up word with the underlying phonemizer.

        ``role`` and ``do_transforms`` are passed through unchanged.
        """
        lexicon = self.phonemizer
        if lexicon is None:
            _LOGGER.debug("Connecting to lexicon database at %s", self.db_path)
            connection = sqlite3.connect(str(self.db_path))
            lexicon = SqlitePhonemizer(db_conn=connection, **self.phonemizer_args)
            self.phonemizer = lexicon

        assert lexicon is not None
        return lexicon(word, role=role, do_transforms=do_transforms)
|