105 lines
2.9 KiB
Python
105 lines
2.9 KiB
Python
|
"""gruut module"""
|
||
|
import itertools
|
||
|
import logging
|
||
|
import re
|
||
|
import sqlite3
|
||
|
import threading
|
||
|
import typing
|
||
|
from enum import Enum
|
||
|
from pathlib import Path
|
||
|
|
||
|
from gruut.const import KNOWN_LANGS, TextProcessorSettings
|
||
|
from gruut.resources import _DIR, _PACKAGE
|
||
|
from gruut.text_processor import Sentence, TextProcessor
|
||
|
from gruut.utils import resolve_lang
|
||
|
|
||
|
# -----------------------------------------------------------------------------
|
||
|
|
||
|
# Package-level logger, named after the _PACKAGE constant from gruut.resources.
_LOGGER = logging.getLogger(_PACKAGE)

# Version string, read at import time from the "VERSION" file under _DIR
# (decoded as UTF-8, surrounding whitespace stripped).
__version__ = (_DIR / "VERSION").read_text(encoding="utf-8").strip()
__author__ = "Michael Hansen (synesthesiam)"

# Public names exported by a star-import of this module.
__all__ = [
    "sentences",
    "is_language_supported",
    "get_supported_languages",
    "TextProcessor",
    "TextProcessorSettings",
]
|
||
|
|
||
|
# -----------------------------------------------------------------------------
|
||
|
|
||
|
# Thread-local storage; sentences() lazily attaches a `processors` dict to it,
# so each thread ends up with its own TextProcessor instances.
_LOCAL = threading.local()

# Re-entrant lock held by sentences() while it looks up or creates a processor.
_PROCESSORS_LOCK = threading.RLock()
|
||
|
|
||
|
|
||
|
def sentences(
    text: str,
    lang: str = "en_US",
    ssml: bool = False,
    espeak: bool = False,
    major_breaks: bool = True,
    minor_breaks: bool = True,
    punctuations: bool = True,
    explicit_lang: bool = True,
    phonemes: bool = True,
    break_phonemes: bool = True,
    pos: bool = True,
    **process_args,
) -> typing.Iterable[Sentence]:
    """
    Run text through a cached TextProcessor and yield the resulting sentences

    This is a generator function: no processing happens until the returned
    iterable is consumed.

    A TextProcessor is kept per thread (in _LOCAL.processors) for each model
    prefix ("" or "espeak"). It is constructed on first use with the `lang`
    of that call as its default language; later calls reuse it and pass their
    own `lang` to the processing call.

    Args:
        text: input text or SSML (ssml=True)
        lang: default language of input text
        ssml: True if input text is SSML
        espeak: True if eSpeak phonemes should be used
        major_breaks: False if no sentence-breaking symbols in output
        minor_breaks: False if no phrase-breaking symbols in output
        punctuations: False if no word-surrounding symbols in output
        explicit_lang: forwarded unchanged to TextProcessor.sentences
        phonemes: forwarded unchanged to TextProcessor.sentences
        break_phonemes: forwarded unchanged to TextProcessor.sentences
        pos: forwarded unchanged to TextProcessor.sentences
        **process_args: keyword arguments passed to TextProcessor.process

    Returns:
        sentences: iterable of Sentence objects

    """
    prefix = "espeak" if espeak else ""

    with _PROCESSORS_LOCK:
        # Fetch this thread's processor cache, creating it on first use.
        try:
            cache = _LOCAL.processors
        except AttributeError:
            cache = _LOCAL.processors = {}

        processor = cache.get(prefix)
        if processor is None:
            # First request for this prefix on this thread: build and remember it.
            processor = cache[prefix] = TextProcessor(
                default_lang=lang, model_prefix=prefix
            )

    assert processor is not None
    graph, root = processor(text, lang=lang, ssml=ssml, **process_args)

    # Output-shaping flags are handed to TextProcessor.sentences as-is.
    output_options = {
        "major_breaks": major_breaks,
        "minor_breaks": minor_breaks,
        "punctuations": punctuations,
        "explicit_lang": explicit_lang,
        "phonemes": phonemes,
        "break_phonemes": break_phonemes,
        "pos": pos,
    }

    yield from processor.sentences(graph, root, **output_options)
|
||
|
|
||
|
|
||
|
# -----------------------------------------------------------------------------
|
||
|
|
||
|
|
||
|
def is_language_supported(lang: str) -> bool:
    """Return True if resolve_lang(lang) is one of KNOWN_LANGS"""
    resolved_lang = resolve_lang(lang)
    return resolved_lang in KNOWN_LANGS
|
||
|
|
||
|
|
||
|
def get_supported_languages() -> typing.Set[str]:
    """
    Return the supported gruut languages as a set

    A new set is built from KNOWN_LANGS on every call, so modifying the
    result does not modify KNOWN_LANGS.
    """
    return {*KNOWN_LANGS}
|