926 lines
30 KiB
Python
926 lines
30 KiB
Python
#!/usr/bin/env python3
|
||
"""Functions for manipulating phones/phonemes"""
|
||
import logging
|
||
import re
|
||
import typing
|
||
import unicodedata
|
||
from collections import defaultdict
|
||
|
||
from gruut_ipa.constants import ( # noqa: F401
|
||
_DATA_DIR,
|
||
_DIR,
|
||
CONSONANTS,
|
||
FEATURE_COLUMNS,
|
||
FEATURE_EMPTY,
|
||
FEATURE_KEYS,
|
||
FEATURE_ORDINAL_COLUMNS,
|
||
IPA,
|
||
LANG_ALIASES,
|
||
SCHWAS,
|
||
VOWELS,
|
||
Accent,
|
||
Break,
|
||
BreakType,
|
||
Consonant,
|
||
ConsonantPlace,
|
||
ConsonantType,
|
||
Dipthong,
|
||
Intonation,
|
||
PhonemeLength,
|
||
Schwa,
|
||
Stress,
|
||
Vowel,
|
||
VowelHeight,
|
||
VowelPlacement,
|
||
)
|
||
from gruut_ipa.utils import resolve_lang
|
||
|
||
_LOGGER = logging.getLogger("gruut_ipa")
|
||
|
||
# -----------------------------------------------------------------------------
|
||
|
||
|
||
class Phone:
    """Single IPA phone with diacritics and suprasegmentals.

    A phone is made of base ``letters`` plus optional stress, accents,
    elongation, tone, and per-letter diacritics.  Per-letter information
    (``nasal``, ``raised``, ``diacritics``) is keyed by the index of the
    letter inside ``letters``.
    """

    def __init__(
        self,
        letters: str,
        stress: typing.Optional[Stress] = None,
        accents: typing.Optional[typing.Iterable[Accent]] = None,
        is_long: bool = False,
        nasal: typing.Optional[typing.Set[int]] = None,
        raised: typing.Optional[typing.Set[int]] = None,
        diacritics: typing.Optional[typing.Dict[int, typing.Set[str]]] = None,
        suprasegmentals: typing.Optional[typing.Set[str]] = None,
        tone: str = "",
    ):
        """Create a phone from its decomposed parts.

        Args:
            letters: Base letters of the phone (stored NFC normalized).
            stress: Stress of the phone, if any.
            accents: Accents that are written before the letters.
            is_long: True if the phone is elongated.
            nasal: Indexes of letters carrying the nasal diacritic.
            raised: Indexes of letters carrying the raised diacritic.
            diacritics: Additional combining marks keyed by letter index.
            suprasegmentals: Additional suprasegmental symbols.
            tone: Tone symbols written after the letters.

        The container arguments are copied, so objects owned by the caller
        are never modified by this constructor.
        """
        self.letters: str = unicodedata.normalize("NFC", letters)
        self.stress = stress
        self.accents: typing.List[Accent] = list(accents or [])
        self.is_long: bool = is_long

        self.nasal: typing.Set[int] = set(nasal or ())
        self.is_nasal = bool(self.nasal)

        self.raised: typing.Set[int] = set(raised or ())
        self.is_raised = bool(self.raised)

        self.tone: str = tone

        # Copy (rather than alias) the incoming containers: the decomposition
        # below adds entries to them, which previously leaked into the
        # caller's dict/set.
        self.diacritics: typing.Dict[int, typing.Set[str]] = defaultdict(set)
        for letter_index, marks in (diacritics or {}).items():
            self.diacritics[letter_index] = set(marks)

        self.suprasegmentals: typing.Set[str] = set(suprasegmentals or ())

        # Decompose suprasegmentals and diacritics
        if self.stress == Stress.PRIMARY:
            self.suprasegmentals.add(IPA.STRESS_PRIMARY)
        elif self.stress == Stress.SECONDARY:
            self.suprasegmentals.add(IPA.STRESS_SECONDARY)

        if Accent.ACUTE in self.accents:
            self.suprasegmentals.add(IPA.ACCENT_ACUTE)

        if Accent.GRAVE in self.accents:
            self.suprasegmentals.add(IPA.ACCENT_GRAVE)

        if self.is_long:
            self.suprasegmentals.add(IPA.LONG)

        # Nasal
        for letter_index in self.nasal:
            self.diacritics[letter_index].add(IPA.NASAL)

        # Raised
        for letter_index in self.raised:
            self.diacritics[letter_index].add(IPA.RAISED)

        # Lazily computed by the ``text`` property
        self._text: str = ""

        self.vowel: typing.Optional[Vowel] = VOWELS.get(self.letters)
        self.consonant: typing.Optional[Consonant] = CONSONANTS.get(self.letters)
        self.schwa: typing.Optional[Schwa] = SCHWAS.get(self.letters)

    @property
    def text(self) -> str:
        """Get textual representation of phone (NFC normalized).

        The result is cached after the first non-empty computation.
        """
        if self._text:
            return self._text

        parts: typing.List[str] = []

        # Pre-letter suprasegmentals
        for accent in self.accents:
            if accent == Accent.ACUTE:
                parts.append(IPA.ACCENT_ACUTE)
            elif accent == Accent.GRAVE:
                parts.append(IPA.ACCENT_GRAVE)

        if self.stress == Stress.PRIMARY:
            parts.append(IPA.STRESS_PRIMARY)
        elif self.stress == Stress.SECONDARY:
            parts.append(IPA.STRESS_SECONDARY)

        # Letters and diacritics
        for letter_index, letter in enumerate(self.letters):
            parts.append(letter)

            # Diacritics are stored in a set; sort them so the text does not
            # depend on set iteration order (which can differ between runs).
            parts.extend(sorted(self.diacritics.get(letter_index, ())))

        # Tone
        if self.tone:
            parts.append(self.tone)

        # Post-letter suprasegmentals
        if self.is_long:
            parts.append(IPA.LONG)

        # Re-normalize and combine
        self._text = unicodedata.normalize("NFC", "".join(parts))

        return self._text

    @property
    def is_vowel(self) -> bool:
        """True if phone is a vowel"""
        return self.vowel is not None

    @property
    def is_consonant(self) -> bool:
        """True if phone is a consonant"""
        return self.consonant is not None

    @property
    def is_schwa(self) -> bool:
        """True if phone is a schwa"""
        return self.schwa is not None

    def __repr__(self) -> str:
        return self.text

    @staticmethod
    def from_string(phone_str: str) -> "Phone":
        """Parse phone from string.

        The string is NFD-decomposed and every codepoint is classified as an
        accent, stress, tone, length, nasal/raised mark, tie, other combining
        diacritic, or base letter.  Brackets and breaks are dropped.

        Args:
            phone_str: Text of a single phone.

        Returns:
            A new ``Phone`` built from the classified codepoints.
        """
        # Decompose into base and combining characters
        codepoints = unicodedata.normalize("NFD", phone_str)
        kwargs: typing.Dict[str, typing.Any] = {
            "letters": "",
            "diacritics": defaultdict(set),
            "tone": "",
            "accents": [],
            "nasal": set(),
            "raised": set(),
        }

        # True once a tone symbol has been seen
        in_tone = False

        # True once the first base letter has been seen
        new_letter = False

        # Index (within "letters") that diacritics currently attach to
        letter_index = 0

        for c in codepoints:
            # Check for stress
            if (c == IPA.ACCENT_ACUTE) and not in_tone:
                kwargs["accents"].append(Accent.ACUTE)
            elif (c == IPA.ACCENT_GRAVE) and not in_tone:
                kwargs["accents"].append(Accent.GRAVE)
            elif c == IPA.STRESS_PRIMARY:
                kwargs["stress"] = Stress.PRIMARY
            elif c == IPA.STRESS_SECONDARY:
                kwargs["stress"] = Stress.SECONDARY
            elif in_tone and (c in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}):
                # Interpret as part of tone
                kwargs["tone"] += c
            elif IPA.is_long(c):
                # Check for elongation
                kwargs["is_long"] = True
            elif IPA.is_nasal(c):
                # Check for nasalation
                kwargs["nasal"].add(letter_index)
            elif IPA.is_raised(c):
                # Check for raised articulation
                kwargs["raised"].add(letter_index)
            elif IPA.is_bracket(c) or IPA.is_break(c):
                # Skip brackets/syllable breaks
                pass
            elif IPA.is_tie(c):
                # Keep ties in letters; the tie occupies its own index
                kwargs["letters"] += c
                letter_index += 1
            elif IPA.is_tone(c):
                # Tone numbers/letters
                kwargs["tone"] += c
                in_tone = True
            elif unicodedata.combining(c) > 0:
                # Stow some diacritics that we don't do anything with
                kwargs["diacritics"][letter_index].add(c)
            else:
                # Include all other characters in letters
                kwargs["letters"] += c
                if new_letter:
                    letter_index += 1

                new_letter = True

        return Phone(**kwargs)
|
||
|
||
|
||
# -----------------------------------------------------------------------------
|
||
|
||
|
||
class Pronunciation:
    """Collection of phones and breaks for some unit of text (word, sentence, etc.)

    The original sequence is kept in ``phones_and_others``; ``phones``,
    ``breaks`` and ``intonations`` hold the items of each type in their
    original relative order.
    """

    def __init__(
        self, phones_and_others: typing.List[typing.Union[Phone, Break, Intonation]]
    ):
        """Store the sequence and sort its items into per-type lists.

        Items that are not a Phone, Break, or Intonation stay in
        ``phones_and_others`` but are not added to any per-type list.
        """
        self.phones_and_others = phones_and_others

        self.phones: typing.List[Phone] = []
        self.breaks: typing.List[Break] = []
        self.intonations: typing.List[Intonation] = []

        # Decompose into phones, breaks, and intonations
        for p in self.phones_and_others:
            if isinstance(p, Phone):
                self.phones.append(p)
            elif isinstance(p, Break):
                self.breaks.append(p)
            elif isinstance(p, Intonation):
                self.intonations.append(p)

        # Lazily computed by the ``text`` property
        self._text = ""

    @property
    def text(self) -> str:
        """Get text representation of pronunciation.

        This is the concatenation of the ``text`` of every item, cached
        after the first non-empty computation.
        """
        if not self._text:
            self._text = "".join(p.text for p in self.phones_and_others)

        return self._text

    def __repr__(self) -> str:
        return self.text

    def __iter__(self):
        """Iterate over phones, breaks, and intonations in original order."""
        return iter(self.phones_and_others)

    def __getitem__(self, idx):
        """Index into the original sequence of phones, breaks, and intonations."""
        return self.phones_and_others[idx]

    @staticmethod
    def from_string(
        pron_str: str,
        keep_stress: bool = True,
        keep_accents: typing.Optional[bool] = None,
        drop_tones: bool = False,
        keep_ties: bool = True,
    ) -> "Pronunciation":
        """Split an IPA pronunciation into phones.

        Stress/accent markers bind to the next non-combining codepoint (e.g., ˈa).
        Elongation markers bind to the previous non-combining codepoint (e.g., aː).
        Ties join two non-combining sequences (e.g. t͡ʃ).

        Whitespace, brackets, and syllable breaks are skipped.

        Args:
            pron_str: Pronunciation text to split.
            keep_stress: If False, stress markers are dropped.
            keep_accents: If False, accent markers are dropped. Defaults to
                the value of ``keep_stress`` when None.
            drop_tones: If True, tone symbols are dropped.
            keep_ties: If False, tie characters are dropped, so the tied
                letters are split into separate clusters.

        Returns:
            A Pronunciation holding one Phone, Break, or Intonation per
            cluster, in order.
        """
        if keep_accents is None:
            keep_accents = keep_stress

        # Finished cluster strings, each laid out as accents + stress + letters + tone
        clusters = []

        # Pieces of the cluster currently being built
        cluster = ""
        stress = ""
        is_stress = False
        accents = ""
        is_accent = False
        tone = ""

        # NOTE(review): in_tone is set when a tone symbol is seen and is not
        # reset anywhere in this function, so it stays True for the rest of
        # the string -- confirm this is intended.
        in_tone = False

        # When True, the next non-combining codepoint joins the current
        # cluster instead of starting a new one.
        skip_next_cluster = False

        codepoints = unicodedata.normalize("NFD", pron_str)

        for codepoint in codepoints:
            new_cluster = False
            is_stress = False
            is_accent = False

            if (
                codepoint.isspace()
                or IPA.is_bracket(codepoint)
                or (codepoint in {IPA.BREAK_SYLLABLE})
            ):
                # Skip whitespace, brackets, and syllable breaks
                continue

            if IPA.is_break(codepoint) or IPA.is_intonation(codepoint):
                # Keep minor/major/word breaks and intonation markers
                new_cluster = True

            if IPA.is_accent(codepoint) and not in_tone:
                is_accent = True
                if cluster:
                    # Accent belongs to the following cluster: close the
                    # current one first.
                    new_cluster = True
                    skip_next_cluster = True
            elif IPA.is_stress(codepoint):
                is_stress = True
                if cluster:
                    # Stress belongs to the following cluster: close the
                    # current one first.
                    new_cluster = True
                    skip_next_cluster = True
            elif in_tone and (codepoint in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}):
                # Interpret as part of tone
                if not drop_tones:
                    tone += codepoint

                continue
            elif IPA.is_long(codepoint):
                # Add to current cluster
                pass
            elif IPA.is_tie(codepoint):
                if keep_ties:
                    # Add next non-combining to current cluster
                    skip_next_cluster = True
                else:
                    # Ignore ties
                    continue
            elif IPA.is_tone(codepoint):
                # Add to end of current cluster
                if not drop_tones:
                    tone += codepoint

                in_tone = True
                continue
            elif unicodedata.combining(codepoint) == 0:
                # Non-combining character
                if skip_next_cluster:
                    # Add to current cluster
                    skip_next_cluster = False
                elif cluster:
                    # Start a new cluster
                    new_cluster = True

            if new_cluster and cluster:
                # Flush the finished cluster and reset the accumulators
                clusters.append(accents + stress + cluster + tone)
                accents = ""
                stress = ""
                cluster = ""
                tone = ""

            # Route the codepoint to the accumulator it belongs to; dropped
            # accents/stress are simply not stored.
            if is_accent:
                if keep_accents:
                    accents += codepoint
            elif is_stress:
                if keep_stress:
                    stress += codepoint
            else:
                cluster += codepoint

        # Flush the final cluster
        if cluster:
            clusters.append(accents + stress + cluster + tone)

        # Convert each cluster string into its object form
        phones_and_others: typing.List[typing.Union[Phone, Break, Intonation]] = []
        for cluster in clusters:
            if IPA.is_break(cluster):
                phones_and_others.append(Break.from_string(cluster))
            elif IPA.is_intonation(cluster):
                phones_and_others.append(Intonation.from_string(cluster))
            else:
                phones_and_others.append(Phone.from_string(cluster))

        return Pronunciation(phones_and_others)
|
||
|
||
|
||
# -----------------------------------------------------------------------------
|
||
|
||
|
||
class Phoneme:
    """Phoneme composed of international phonetic alphabet symbols.

    The phoneme text is split into base ``letters`` plus stress, accents,
    tone, elongation, and per-letter nasal/raised/extra combining marks.
    """

    def __init__(
        self,
        text: str,
        example: str = "",
        unknown: bool = False,
        tones: typing.Optional[typing.Iterable[str]] = None,
        is_ipa: bool = True,
    ):
        """Parse a phoneme from its text.

        Args:
            text: Text of the phoneme.
            example: Optional example for the phoneme.
            unknown: True if the phoneme is not part of a known phoneme set.
            tones: Allowable tones for this phoneme.
            is_ipa: If True, ``text`` is parsed as IPA. If False, the whole
                text is used as the letters without any parsing.
        """
        self._text = ""
        self._text_compare = ""
        self.example = example
        self.unknown = unknown

        # Remembered so that copy() re-parses the text the same way
        self._is_ipa = is_ipa

        # List of allowable tones for phoneme
        self.tones = list(tones or [])

        self.stress: typing.Optional[Stress] = None
        self.accents: typing.List[Accent] = []
        self.elongated: bool = False
        self.nasalated: typing.Set[int] = set()
        self.raised: typing.Set[int] = set()
        self._extra_combining: typing.Dict[int, typing.List[str]] = defaultdict(list)

        # Decompose into base and combining characters
        codepoints = unicodedata.normalize("NFD", text)
        self.letters = ""
        self.tone = ""

        if is_ipa:
            # True once a tone symbol has been seen
            in_tone = False

            # Index that per-letter marks currently attach to
            letter_index = 0

            # True once the first base letter has been seen
            new_letter = False

            for c in codepoints:
                # Check for stress
                if (c == IPA.ACCENT_ACUTE) and (not in_tone):
                    self.accents.append(Accent.ACUTE)
                elif (c == IPA.ACCENT_GRAVE) and (not in_tone):
                    self.accents.append(Accent.GRAVE)
                elif c == IPA.STRESS_PRIMARY:
                    self.stress = Stress.PRIMARY
                elif c == IPA.STRESS_SECONDARY:
                    self.stress = Stress.SECONDARY
                elif in_tone and (c in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}):
                    # Interpret as part of tone
                    self.tone += c
                elif IPA.is_long(c):
                    # Check for elongation
                    self.elongated = True
                elif IPA.is_nasal(c):
                    # Check for nasalation
                    self.nasalated.add(letter_index)
                elif IPA.is_raised(c):
                    # Check for raised articulation
                    self.raised.add(letter_index)
                elif IPA.is_bracket(c) or IPA.is_break(c):
                    # Skip brackets/syllable breaks
                    pass
                elif IPA.is_tone(c):
                    # Keep tone separate
                    self.tone += c
                    in_tone = True
                elif c in {IPA.SYLLABIC, IPA.NON_SYLLABIC, IPA.EXTRA_SHORT}:
                    # Stow some diacritics that we don't do anything with
                    self._extra_combining[letter_index].append(c)
                else:
                    # Include all other characters in base
                    self.letters += c

                    if new_letter:
                        letter_index += 1

                    new_letter = True
        else:
            self.letters = text

        # Re-normalize and combine letters
        self.letters = unicodedata.normalize("NFC", self.letters)
        self.letters_graphemes = IPA.graphemes(self.letters)

        # Categorize
        self.vowel: typing.Optional[Vowel] = VOWELS.get(self.letters)
        self.consonant: typing.Optional[Consonant] = CONSONANTS.get(self.letters)
        self.schwa: typing.Optional[Schwa] = SCHWAS.get(self.letters)
        self.dipthong: typing.Optional[Dipthong] = None

        if (
            (not self.vowel)
            and (not self.consonant)
            and (not self.schwa)
            and (len(self.letters) == 2)
        ):
            # Check if dipthong (two vowels)
            vowel1 = VOWELS.get(self.letters[0])
            vowel2 = VOWELS.get(self.letters[1])
            if vowel1 and vowel2:
                self.dipthong = Dipthong(vowel1, vowel2)

    def _letters_with_marks(self) -> str:
        """Return letters followed by their nasal/raised/extra combining marks."""
        parts: typing.List[str] = []
        for letter_index, letter in enumerate(self.letters):
            parts.append(letter)

            if letter_index in self.nasalated:
                parts.append(IPA.NASAL)

            if letter_index in self.raised:
                parts.append(IPA.RAISED)

            # .get() avoids inserting empty entries into the defaultdict
            parts.extend(self._extra_combining.get(letter_index, ()))

        return "".join(parts)

    @property
    def text(self) -> str:
        """Return letters with stress and elongation (NFC normalized).

        Layout is accents, stress, letters with marks, tone, elongation.
        The result is cached after the first non-empty computation.
        """
        if self._text:
            return self._text

        parts: typing.List[str] = []

        for accent in self.accents:
            if accent == Accent.ACUTE:
                parts.append(IPA.ACCENT_ACUTE)
            elif accent == Accent.GRAVE:
                parts.append(IPA.ACCENT_GRAVE)

        if self.stress == Stress.PRIMARY:
            parts.append(IPA.STRESS_PRIMARY)
        elif self.stress == Stress.SECONDARY:
            parts.append(IPA.STRESS_SECONDARY)

        parts.append(self._letters_with_marks())

        if self.tone:
            parts.append(self.tone)

        if self.elongated:
            parts.append(IPA.LONG)

        # Re-normalize and combine
        self._text = unicodedata.normalize("NFC", "".join(parts))

        return self._text

    @property
    def text_compare(self) -> str:
        """Return letters and elongation with no stress/tones (NFC normalized).

        The result is cached after the first non-empty computation.
        """
        if self._text_compare:
            return self._text_compare

        compare_text = self._letters_with_marks()

        if self.elongated:
            compare_text += IPA.LONG

        # Re-normalize and combine
        self._text_compare = unicodedata.normalize("NFC", compare_text)

        return self._text_compare

    def copy(self) -> "Phoneme":
        """Create a copy of this phoneme.

        The copy is re-parsed from ``text`` and keeps the example, unknown
        flag, allowable tones, and IPA/non-IPA parsing mode of the original.
        """
        return Phoneme(
            text=self.text,
            example=self.example,
            unknown=self.unknown,
            tones=self.tones,
            is_ipa=self._is_ipa,
        )

    def __repr__(self) -> str:
        """Return symbol with stress and elongation."""
        return self.text

    def to_dict(self) -> typing.Dict[str, typing.Any]:
        """Return properties of phoneme as a dict.

        The "type" entry holds the category name ("Vowel", "Consonant",
        "Dipthong", "Schwa", or "Phoneme").
        """
        type_name = "Phoneme"
        props: typing.Dict[str, typing.Any] = {
            "text": repr(self),
            "letters": self.letters,
            "tone": self.tone,
            "tones": self.tones,
        }

        if self.unknown:
            props["unknown"] = True

        if self.example:
            props["example"] = self.example

        props["accents"] = [a.value for a in self.accents]
        props["stress"] = self.stress.value if self.stress is not None else ""

        if self.vowel:
            type_name = "Vowel"
            props["height"] = self.vowel.height.value
            props["placement"] = self.vowel.placement.value
            props["rounded"] = self.vowel.rounded
        elif self.consonant:
            type_name = "Consonant"
            # NOTE(review): this value is replaced by the category name
            # below, so the consonant type never reaches the caller. The
            # assignment is kept because it fixes the position of the "type"
            # key in the dict (and so in to_string()).
            props["type"] = self.consonant.type.value
            props["place"] = self.consonant.place.value
            props["voiced"] = self.consonant.voiced
        elif self.dipthong:
            type_name = "Dipthong"
        elif self.schwa:
            type_name = "Schwa"
            props["r_coloured"] = self.schwa.r_coloured

        props["type"] = type_name

        props["nasalated"] = list(self.nasalated)
        props["raised"] = list(self.raised)
        props["elongated"] = self.elongated

        return props

    def to_string(self) -> str:
        """Return descriptive string of phoneme.

        The format is ``Type(key=value, ...)`` using the entries of to_dict().
        """
        props = self.to_dict()
        type_name = props.get("type", "Phoneme")

        prop_strs = [f"{k}={v}" for k, v in props.items()]

        return f"{type_name}(" + ", ".join(prop_strs) + ")"
|
||
|
||
|
||
# -----------------------------------------------------------------------------
|
||
|
||
|
||
class Phonemes:
|
||
"""Set of phonemes and allophones for a language"""
|
||
|
||
COMMENT_STR = "#"
|
||
|
||
def __init__(self, phonemes=None, ipa_map=None):
    """Create a phoneme set.

    Args:
        phonemes: Initial list of phonemes (defaults to an empty list).
        ipa_map: Map from text to replace to the phoneme text that replaces
            it (defaults to an empty dict).
    """
    self.phonemes = phonemes or []
    self.ipa_map = ipa_map or {}

    # Regex for replacing IPA
    self._ipa_map_regex = None

    # Phonemes sorted by decreasing length
    self._phonemes_sorted = None

    # Map from original phoneme to gruut IPA
    self.gruut_ipa_map: typing.Dict[str, str] = {}

    # Was "{}" (an empty dict) despite the Set[str] annotation
    self.phoneme_texts: typing.Set[str] = set()

    # Builds the regex, the sorted phonemes, and phoneme_texts
    self.update()
|
||
|
||
def __iter__(self):
    """Iterate over ``self.phonemes`` in list order."""
    return iter(self.phonemes)
|
||
|
||
def __len__(self):
    """Return the number of items in ``self.phonemes``."""
    return len(self.phonemes)
|
||
|
||
def __getitem__(self, key):
    """Return ``self.phonemes[key]``."""
    return self.phonemes[key]
|
||
|
||
def __contains__(self, item):
    """Membership test.

    Strings are looked up among the phoneme texts; anything else is looked
    up in the list of phonemes itself.
    """
    searched = self.phoneme_texts if isinstance(item, str) else self.phonemes
    return item in searched
|
||
|
||
@staticmethod
def from_language(language: str) -> "Phonemes":
    """Load phonemes for a given language.

    Reads ``phonemes.txt`` from the language's data directory. If an
    ``ipa_map.txt`` file sits next to it, each non-empty line is split on
    the first run of whitespace into ``<original phoneme> <gruut IPA>`` and
    the resulting non-empty map is stored as ``gruut_ipa_map``.

    Args:
        language: Language name; passed through ``resolve_lang`` first.

    Returns:
        The loaded Phonemes.
    """
    language = resolve_lang(language)
    language_dir = _DATA_DIR / language

    # Load phonemes themselves
    with open(language_dir / "phonemes.txt", "r", encoding="utf-8") as phonemes_file:
        phonemes = Phonemes.from_text(phonemes_file)

    # Try to load optional map from original phoneme to gruut IPA
    map_path = language_dir / "ipa_map.txt"
    if map_path.is_file():
        loaded_map: typing.Dict[str, str] = {}
        with open(map_path, "r", encoding="utf-8") as map_file:
            for raw_line in map_file:
                entry = raw_line.strip()
                if entry:
                    from_phoneme, to_ipa = entry.split(maxsplit=1)
                    loaded_map[from_phoneme] = to_ipa

        # An empty map file leaves the default map untouched
        if loaded_map:
            phonemes.gruut_ipa_map = loaded_map

    return phonemes
|
||
|
||
@staticmethod
def from_text(text_file) -> "Phonemes":
    """Load text file with phonemes, examples, and allophones.

    Each non-empty line (after removing ``#`` comments) has the form::

        phoneme [example] [allophone] [allophone] ! [tone] [tone]...

    Fields after the example and before ``!`` are mapped back to the
    phoneme in ``ipa_map``; fields after ``!`` are the phoneme's tones.

    Args:
        text_file: Iterable of lines.

    Returns:
        The loaded Phonemes, with ``update()`` already applied.
    """
    loaded = Phonemes()

    for raw_line in text_file:
        # Remove comments
        content = raw_line.split(Phonemes.COMMENT_STR, maxsplit=1)[0].strip()
        if not content:
            continue

        fields = content.split()
        phoneme_ipa = fields[0]
        example = fields[1] if len(fields) > 1 else ""

        allowed_tones = []
        after_marker = False
        for field in fields[2:]:
            if field == "!":
                # Begin possible tones for this phoneme
                after_marker = True
            elif after_marker:
                allowed_tones.append(field)
            else:
                # Map allophone back to phoneme
                loaded.ipa_map[field] = phoneme_ipa

        loaded.phonemes.append(
            Phoneme(text=phoneme_ipa, example=example, tones=allowed_tones)
        )

    loaded.update()

    return loaded
|
||
|
||
def update(self):
    """Call after modifying phonemes or IPA map to re-sort.

    Rebuilds the IPA replacement regex, the list of phonemes sorted by
    decreasing number of pieces, and the set of phoneme texts.
    """
    # Create single regex that will be used to replace IPA.
    # The final regex is of the form (AAA|BB|C) where each case is in
    # decreasing length order.
    #
    # If the replacement is not a substring of any phonemes, then the
    # replacement is straightforward.
    #
    # If it is a prefix of some phoneme, however, we need to be careful.
    # For example, naively replacing "e" with "eɪ" in the string "beɪ" will
    # produce "beeɪ" when we want it to be "beɪ".
    #
    # So the prefix case becomes "e(?!ɪ)" which uses a negative lookahead
    # to avoid the problem.
    cases = []
    for match_text in sorted(self.ipa_map.keys(), key=len, reverse=True):
        if match_text.startswith(","):
            # Raw regex
            cases.append(match_text[1:])
            continue

        # Remainder of every phoneme that extends match_text.
        #
        # The slices used to be match_text[:num_extra] and
        # phoneme.text[num_extra:], which was only correct when match_text
        # and the remainder happened to have the same length. Only the first
        # extending phoneme used to be protected as well.
        prefix_len = len(match_text)
        remainders = [
            re.escape(phoneme.text[prefix_len:])
            for phoneme in self.phonemes
            if (len(phoneme.text) > prefix_len)
            and phoneme.text.startswith(match_text)
        ]

        if remainders:
            # Use negative lookahead to avoid replacing part of a valid
            # phoneme.
            cases.append(
                "{}(?!{})".format(re.escape(match_text), "|".join(remainders))
            )
        else:
            # No substring problem
            cases.append(re.escape(match_text))

    ipa_map_regex_str = "({})".format("|".join(cases))
    self._ipa_map_regex = re.compile(ipa_map_regex_str)

    # Split phonemes and sort by reverse length
    split_phonemes = [
        ([pb.text for pb in Pronunciation.from_string(p.text)], p)
        for p in self.phonemes
    ]

    self._phonemes_sorted = sorted(
        split_phonemes, key=lambda kp: len(kp[0]), reverse=True
    )

    # Update IPA texts set for phonemes
    self.phoneme_texts = {p.text for p in self.phonemes}
|
||
|
||
def split(
    self,
    pron_str: typing.Union[str, Pronunciation],
    keep_stress: bool = True,
    keep_accents: typing.Optional[bool] = None,
    drop_tones: bool = False,
    is_ipa: bool = True,
) -> typing.List[Phoneme]:
    """Split an IPA pronunciation into phonemes.

    Args:
        pron_str: Pronunciation as text or as a Pronunciation object.
        keep_stress: If False, stress markers are dropped.
        keep_accents: If False, accent markers are dropped. Defaults to the
            value of ``keep_stress`` when None.
        drop_tones: If True, tone symbols are dropped.
        is_ipa: If False, the text is split into graphemes and no
            stress/tone handling is done.

    Returns:
        Phonemes in order. Pieces that match no phoneme of this set are
        returned as phonemes with ``unknown=True``.
    """
    if not self._ipa_map_regex:
        self.update()

    if keep_accents is None:
        keep_accents = keep_stress

    if self.ipa_map:
        # Replacements work on text, so flatten a Pronunciation first
        if isinstance(pron_str, Pronunciation):
            pron_str = "".join(item.text for item in pron_str)

        pron_str = self._ipa_map_regex.sub(
            lambda found: self.ipa_map.get(found.group(1), found.group(1)),
            pron_str,
        )

    # Get text for IPA phones
    if isinstance(pron_str, Pronunciation):
        # Use supplied pronunciation
        ipas = [item.text for item in pron_str]
    elif is_ipa:
        # Split string into pronunciation
        ipas = [
            item.text
            for item in Pronunciation.from_string(
                pron_str,
                keep_stress=keep_stress,
                keep_accents=keep_accents,
                drop_tones=drop_tones,
            )
        ]
    else:
        ipas = IPA.graphemes(pron_str)

    # Keep stress and tones separate to make phoneme comparisons easier
    ipa_stress: typing.Dict[int, str] = defaultdict(str)
    ipa_tones: typing.Dict[int, str] = defaultdict(str)

    if is_ipa:
        tone_marks = {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}
        in_tone = False
        for position, piece in enumerate(ipas):
            if not piece:
                continue

            bare_piece = ""
            for codepoint in piece:
                if IPA.is_accent(codepoint) and (not in_tone):
                    if keep_accents:
                        ipa_stress[position] += codepoint
                elif IPA.is_stress(codepoint):
                    if keep_stress:
                        ipa_stress[position] += codepoint
                elif in_tone and (codepoint in tone_marks):
                    # Interpret as part of tone
                    if not drop_tones:
                        ipa_tones[position] += codepoint
                elif IPA.is_tone(codepoint):
                    if not drop_tones:
                        ipa_tones[position] += codepoint

                    in_tone = True
                else:
                    bare_piece += codepoint

            ipas[position] = bare_piece

    num_ipas: int = len(ipas)
    word_phonemes: typing.List[Phoneme] = []

    for start in range(num_ipas):
        piece = ipas[start]
        if piece is None:
            # Skip piece consumed by an earlier multi-piece match
            continue

        matched = False
        for candidate_pieces, candidate in self._phonemes_sorted:
            width = len(candidate_pieces)
            if start > (num_ipas - width):
                # Candidate is longer than what is left
                continue

            # Look forward into sequence
            span = range(start, start + width)
            matched = all(candidate_pieces[i - start] == ipas[i] for i in span)
            if not matched:
                continue

            # Successful match
            stress_text = "".join(ipa_stress[i] for i in span)
            tone_text = "".join(ipa_tones[i] for i in span)
            if stress_text or tone_text:
                # Create a copy of the phoneme with applied stress/tones
                candidate = Phoneme(
                    text=(stress_text + candidate.text + tone_text),
                    example=candidate.example,
                )

            word_phonemes.append(candidate)

            # Patch ipas to skip replaced pieces
            for consumed in range(start + 1, start + width):
                ipas[consumed] = None

            break

        if not matched:
            # Add unknown phoneme
            word_phonemes.append(Phoneme(text=piece, unknown=True))

    return word_phonemes
|