926 lines
30 KiB
Python
926 lines
30 KiB
Python
|
#!/usr/bin/env python3
|
|||
|
"""Functions for manipulating phones/phonemes"""
|
|||
|
import logging
|
|||
|
import re
|
|||
|
import typing
|
|||
|
import unicodedata
|
|||
|
from collections import defaultdict
|
|||
|
|
|||
|
from gruut_ipa.constants import ( # noqa: F401
|
|||
|
_DATA_DIR,
|
|||
|
_DIR,
|
|||
|
CONSONANTS,
|
|||
|
FEATURE_COLUMNS,
|
|||
|
FEATURE_EMPTY,
|
|||
|
FEATURE_KEYS,
|
|||
|
FEATURE_ORDINAL_COLUMNS,
|
|||
|
IPA,
|
|||
|
LANG_ALIASES,
|
|||
|
SCHWAS,
|
|||
|
VOWELS,
|
|||
|
Accent,
|
|||
|
Break,
|
|||
|
BreakType,
|
|||
|
Consonant,
|
|||
|
ConsonantPlace,
|
|||
|
ConsonantType,
|
|||
|
Dipthong,
|
|||
|
Intonation,
|
|||
|
PhonemeLength,
|
|||
|
Schwa,
|
|||
|
Stress,
|
|||
|
Vowel,
|
|||
|
VowelHeight,
|
|||
|
VowelPlacement,
|
|||
|
)
|
|||
|
from gruut_ipa.utils import resolve_lang
|
|||
|
|
|||
|
_LOGGER = logging.getLogger("gruut_ipa")
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
class Phone:
    """Single IPA phone with diacritics and suprasegmentals.

    A phone is a run of base letters plus the marks attached to it: stress,
    accents, elongation, tone, and per-letter combining diacritics.
    """

    def __init__(
        self,
        letters: str,
        stress: typing.Optional[Stress] = None,
        accents: typing.Optional[typing.Iterable[Accent]] = None,
        is_long: bool = False,
        nasal: typing.Optional[typing.Set[int]] = None,
        raised: typing.Optional[typing.Set[int]] = None,
        diacritics: typing.Optional[typing.Dict[int, typing.Set[str]]] = None,
        suprasegmentals: typing.Optional[typing.Set[str]] = None,
        tone: str = "",
    ):
        """Create a phone.

        Args:
            letters: Base letters of the phone (stored NFC normalized).
            stress: Optional primary/secondary stress.
            accents: Optional accents, emitted before the stress marker.
            is_long: True if the phone is elongated.
            nasal: Indexes into ``letters`` that carry the nasal diacritic.
            raised: Indexes into ``letters`` that carry the raised diacritic.
            diacritics: Extra combining characters keyed by letter index.
            suprasegmentals: Extra suprasegmental symbols for this phone.
            tone: Tone string emitted after the letters.

        The collection arguments are copied, so the caller's objects are
        never modified by the decomposition below.
        """
        self.letters: str = unicodedata.normalize("NFC", letters)
        self.stress = stress
        self.accents: typing.List[Accent] = list(accents or [])
        self.is_long: bool = is_long

        # Copy instead of aliasing: the sets/dict below are mutated while
        # decomposing stress, length, nasalization, and raising.
        self.nasal: typing.Set[int] = set(nasal or ())
        self.is_nasal = bool(self.nasal)

        self.raised: typing.Set[int] = set(raised or ())
        self.is_raised = bool(self.raised)

        self.tone: str = tone

        self.diacritics: typing.Dict[int, typing.Set[str]] = defaultdict(set)
        for letter_index, letter_diacritics in (diacritics or {}).items():
            self.diacritics[letter_index] = set(letter_diacritics)

        self.suprasegmentals: typing.Set[str] = set(suprasegmentals or ())

        # Decompose suprasegmentals and diacritics
        if self.stress == Stress.PRIMARY:
            self.suprasegmentals.add(IPA.STRESS_PRIMARY)
        elif self.stress == Stress.SECONDARY:
            self.suprasegmentals.add(IPA.STRESS_SECONDARY)

        if Accent.ACUTE in self.accents:
            self.suprasegmentals.add(IPA.ACCENT_ACUTE)

        if Accent.GRAVE in self.accents:
            self.suprasegmentals.add(IPA.ACCENT_GRAVE)

        if self.is_long:
            self.suprasegmentals.add(IPA.LONG)

        # Nasal
        for letter_index in self.nasal:
            self.diacritics[letter_index].add(IPA.NASAL)

        # Raised
        for letter_index in self.raised:
            self.diacritics[letter_index].add(IPA.RAISED)

        # Lazily computed by the ``text`` property
        self._text: str = ""

        # Categorize by exact match of the base letters
        self.vowel: typing.Optional[Vowel] = VOWELS.get(self.letters)
        self.consonant: typing.Optional[Consonant] = CONSONANTS.get(self.letters)
        self.schwa: typing.Optional[Schwa] = SCHWAS.get(self.letters)

    @property
    def text(self) -> str:
        """Get textual representation of phone (NFC normalized).

        Order is: accents, stress, each letter followed by its diacritics,
        tone, then the elongation marker. The result is cached once it is
        non-empty.
        """
        if self._text:
            return self._text

        parts: typing.List[str] = []

        # Pre-letter suprasegmentals
        for accent in self.accents:
            if accent == Accent.ACUTE:
                parts.append(IPA.ACCENT_ACUTE)
            elif accent == Accent.GRAVE:
                parts.append(IPA.ACCENT_GRAVE)

        if self.stress == Stress.PRIMARY:
            parts.append(IPA.STRESS_PRIMARY)
        elif self.stress == Stress.SECONDARY:
            parts.append(IPA.STRESS_SECONDARY)

        # Letters and diacritics
        for letter_index, letter in enumerate(self.letters):
            parts.append(letter)

            # Diacritics are stored in a set; sort them so the same phone
            # always renders to the same string regardless of set ordering.
            parts.extend(sorted(self.diacritics.get(letter_index, ())))

        # Tone
        if self.tone:
            parts.append(self.tone)

        # Post-letter suprasegmentals
        if self.is_long:
            parts.append(IPA.LONG)

        # Re-normalize and combine
        self._text = unicodedata.normalize("NFC", "".join(parts))

        return self._text

    @property
    def is_vowel(self) -> bool:
        """True if phone is a vowel"""
        return self.vowel is not None

    @property
    def is_consonant(self) -> bool:
        """True if phone is a consonant"""
        return self.consonant is not None

    @property
    def is_schwa(self) -> bool:
        """True if phone is a schwa"""
        return self.schwa is not None

    def __repr__(self) -> str:
        """Return the textual representation of the phone."""
        return self.text

    @staticmethod
    def from_string(phone_str: str) -> "Phone":
        """Parse phone from string.

        The string is NFD-decomposed and each codepoint is classified as an
        accent, stress, tone, elongation, nasal/raised diacritic, tie, other
        combining diacritic, or base letter. Brackets and breaks are dropped.
        """
        # Decompose into base and combining characters
        codepoints = unicodedata.normalize("NFD", phone_str)
        kwargs: typing.Dict[str, typing.Any] = {
            "letters": "",
            "diacritics": defaultdict(set),
            "tone": "",
            "accents": [],
            "nasal": set(),
            "raised": set(),
        }

        in_tone = False
        new_letter = False

        # Position in kwargs["letters"] that diacritics currently attach to
        letter_index = 0

        for c in codepoints:
            # Check for stress
            if (c == IPA.ACCENT_ACUTE) and not in_tone:
                kwargs["accents"].append(Accent.ACUTE)
            elif (c == IPA.ACCENT_GRAVE) and not in_tone:
                kwargs["accents"].append(Accent.GRAVE)
            elif c == IPA.STRESS_PRIMARY:
                kwargs["stress"] = Stress.PRIMARY
            elif c == IPA.STRESS_SECONDARY:
                kwargs["stress"] = Stress.SECONDARY
            elif in_tone and (c in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}):
                # Interpret as part of tone
                kwargs["tone"] += c
            elif IPA.is_long(c):
                # Check for elongation
                kwargs["is_long"] = True
            elif IPA.is_nasal(c):
                # Check for nasalization
                kwargs["nasal"].add(letter_index)
            elif IPA.is_raised(c):
                # Check for raised articulation
                kwargs["raised"].add(letter_index)
            elif IPA.is_bracket(c) or IPA.is_break(c):
                # Skip brackets/syllable breaks
                pass
            elif IPA.is_tie(c):
                # Keep ties in letters; the tie occupies a position of its
                # own, so advance the index past it.
                kwargs["letters"] += c
                letter_index += 1
            elif IPA.is_tone(c):
                # Tone numbers/letters
                kwargs["tone"] += c
                in_tone = True
            elif unicodedata.combining(c) > 0:
                # Stow some diacritics that we don't do anything with
                kwargs["diacritics"][letter_index].add(c)
            else:
                # Include all other characters in letters
                kwargs["letters"] += c
                if new_letter:
                    # Every letter after the first moves the index forward
                    letter_index += 1

                new_letter = True

        return Phone(**kwargs)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
class Pronunciation:
    """Collection of phones and breaks for some unit of text (word, sentence, etc.)

    Keeps the original ordered sequence in ``phones_and_others`` and also
    exposes the phones, breaks, and intonation markers as separate lists.
    """

    def __init__(
        self, phones_and_others: typing.List[typing.Union[Phone, Break, Intonation]]
    ):
        """Store the sequence and sort its items into per-type lists.

        Items that are not a Phone, Break, or Intonation stay in
        ``phones_and_others`` but are not added to any per-type list.
        """
        self.phones_and_others = phones_and_others

        self.phones: typing.List[Phone] = []
        self.breaks: typing.List[Break] = []
        self.intonations: typing.List[Intonation] = []

        # Decompose into phones, breaks, and intonations
        for p in self.phones_and_others:
            if isinstance(p, Phone):
                self.phones.append(p)
            elif isinstance(p, Break):
                self.breaks.append(p)
            elif isinstance(p, Intonation):
                self.intonations.append(p)

        # Lazily computed by the ``text`` property
        self._text = ""

    @property
    def text(self) -> str:
        """Get text representation of pronunciation.

        This is the concatenation of ``.text`` of every item, in order. The
        result is cached once it is non-empty.
        """
        if not self._text:
            self._text = "".join(p.text for p in self.phones_and_others)

        return self._text

    def __repr__(self) -> str:
        """Return the text representation of the pronunciation."""
        return self.text

    def __iter__(self):
        """Iterate over phones, breaks, and intonations in original order."""
        return iter(self.phones_and_others)

    def __getitem__(self, idx):
        """Index (or slice) into the original ordered sequence."""
        return self.phones_and_others[idx]

    @staticmethod
    def from_string(
        pron_str: str,
        keep_stress: bool = True,
        keep_accents: typing.Optional[bool] = None,
        drop_tones: bool = False,
        keep_ties: bool = True,
    ) -> "Pronunciation":
        """Split an IPA pronunciation into phones.

        Stress/accent markers bind to the next non-combining codepoint (e.g., ˈa).
        Elongation markers bind to the previous non-combining codepoint (e.g., aː).
        Ties join two non-combining sequences (e.g. t͡ʃ).

        Whitespace, brackets, and syllable breaks are skipped.

        Args:
            pron_str: IPA string to split.
            keep_stress: If False, stress markers are dropped.
            keep_accents: If False, accent markers are dropped. Defaults to
                the value of ``keep_stress`` when None.
            drop_tones: If True, tone characters are dropped.
            keep_ties: If False, tie characters are ignored, so the tied
                letters become separate phones.

        Returns a Pronunciation holding the phones, breaks, and intonation
        markers in order.
        """
        if keep_accents is None:
            keep_accents = keep_stress

        # Finished clusters; each one is later parsed into a single item
        clusters = []

        # Pieces of the cluster currently being built
        cluster = ""
        stress = ""
        is_stress = False
        accents = ""
        is_accent = False
        tone = ""

        # Note: once a tone character has been seen this stays True for the
        # rest of the string (it is never reset below).
        in_tone = False

        # When True, the next non-combining codepoint joins the current
        # cluster instead of starting a new one.
        skip_next_cluster = False

        codepoints = unicodedata.normalize("NFD", pron_str)

        for codepoint in codepoints:
            new_cluster = False
            is_stress = False
            is_accent = False

            if (
                codepoint.isspace()
                or IPA.is_bracket(codepoint)
                or (codepoint in {IPA.BREAK_SYLLABLE})
            ):
                # Skip whitespace, brackets, and syllable breaks
                continue

            if IPA.is_break(codepoint) or IPA.is_intonation(codepoint):
                # Keep minor/major/word breaks and intonation markers
                new_cluster = True

            if IPA.is_accent(codepoint) and not in_tone:
                is_accent = True
                if cluster:
                    # Accent belongs to the next phone: close the current one
                    new_cluster = True
                    skip_next_cluster = True
            elif IPA.is_stress(codepoint):
                is_stress = True
                if cluster:
                    # Stress belongs to the next phone: close the current one
                    new_cluster = True
                    skip_next_cluster = True
            elif in_tone and (codepoint in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}):
                # Interpret as part of tone
                if not drop_tones:
                    tone += codepoint

                continue
            elif IPA.is_long(codepoint):
                # Add to current cluster
                pass
            elif IPA.is_tie(codepoint):
                if keep_ties:
                    # Add next non-combining to current cluster
                    skip_next_cluster = True
                else:
                    # Ignore ties
                    continue
            elif IPA.is_tone(codepoint):
                # Add to end of current cluster
                if not drop_tones:
                    tone += codepoint

                in_tone = True
                continue
            elif unicodedata.combining(codepoint) == 0:
                # Non-combining character
                if skip_next_cluster:
                    # Add to current cluster
                    skip_next_cluster = False
                elif cluster:
                    # Start a new cluster
                    new_cluster = True

            if new_cluster and cluster:
                # Flush the finished cluster and reset the accumulators
                clusters.append(accents + stress + cluster + tone)
                accents = ""
                stress = ""
                cluster = ""
                tone = ""

            # Route the codepoint to the accumulator it belongs to; dropped
            # accents/stress are simply not recorded.
            if is_accent:
                if keep_accents:
                    accents += codepoint
            elif is_stress:
                if keep_stress:
                    stress += codepoint
            else:
                cluster += codepoint

        # Flush the final cluster
        if cluster:
            clusters.append(accents + stress + cluster + tone)

        # Parse each cluster into a break, intonation marker, or phone
        phones_and_others: typing.List[typing.Union[Phone, Break, Intonation]] = []
        for cluster in clusters:
            if IPA.is_break(cluster):
                phones_and_others.append(Break.from_string(cluster))
            elif IPA.is_intonation(cluster):
                phones_and_others.append(Intonation.from_string(cluster))
            else:
                phones_and_others.append(Phone.from_string(cluster))

        return Pronunciation(phones_and_others)
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
class Phoneme:
    """Phoneme composed of international phonetic alphabet symbols"""

    def __init__(
        self,
        text: str,
        example: str = "",
        unknown: bool = False,
        tones: typing.Optional[typing.Iterable[str]] = None,
        is_ipa: bool = True,
    ):
        """Create a phoneme from its text.

        Args:
            text: Text of the phoneme.
            example: Optional example for the phoneme.
            unknown: True if the phoneme is not part of a known inventory.
            tones: Allowable tones for the phoneme.
            is_ipa: If True, ``text`` is parsed as IPA and stress, accents,
                tone, elongation, and nasal/raised marks are separated from
                the base letters. If False, ``text`` is used verbatim as the
                letters.
        """
        self._text = ""
        self._text_compare = ""
        self.example = example
        self.unknown = unknown

        # Remembered so that copy() can rebuild the phoneme the same way
        self.is_ipa = is_ipa

        # List of allowable tones for phoneme
        self.tones = list(tones or [])

        self.stress: typing.Optional[Stress] = None
        self.accents: typing.List[Accent] = []
        self.elongated: bool = False
        self.nasalated: typing.Set[int] = set()
        self.raised: typing.Set[int] = set()
        self._extra_combining: typing.Dict[int, typing.List[str]] = defaultdict(list)

        # Decompose into base and combining characters
        codepoints = unicodedata.normalize("NFD", text)
        self.letters = ""
        self.tone = ""

        if is_ipa:
            in_tone = False
            letter_index = 0
            new_letter = False

            for c in codepoints:
                # Check for stress
                if (c == IPA.ACCENT_ACUTE) and (not in_tone):
                    self.accents.append(Accent.ACUTE)
                elif (c == IPA.ACCENT_GRAVE) and (not in_tone):
                    self.accents.append(Accent.GRAVE)
                elif c == IPA.STRESS_PRIMARY:
                    self.stress = Stress.PRIMARY
                elif c == IPA.STRESS_SECONDARY:
                    self.stress = Stress.SECONDARY
                elif in_tone and (c in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}):
                    # Interpret as part of tone
                    self.tone += c
                elif IPA.is_long(c):
                    # Check for elongation
                    self.elongated = True
                elif IPA.is_nasal(c):
                    # Check for nasalization
                    self.nasalated.add(letter_index)
                elif IPA.is_raised(c):
                    # Check for raised articulation
                    self.raised.add(letter_index)
                elif IPA.is_bracket(c) or IPA.is_break(c):
                    # Skip brackets/syllable breaks
                    pass
                elif IPA.is_tone(c):
                    # Keep tone separate
                    self.tone += c
                    in_tone = True
                elif c in {IPA.SYLLABIC, IPA.NON_SYLLABIC, IPA.EXTRA_SHORT}:
                    # Stow some diacritics that we don't do anything with
                    self._extra_combining[letter_index].append(c)
                else:
                    # Include all other characters in base
                    self.letters += c

                    if new_letter:
                        # Every letter after the first moves the index forward
                        letter_index += 1

                    new_letter = True
        else:
            self.letters = text

        # Re-normalize and combine letters
        self.letters = unicodedata.normalize("NFC", self.letters)
        self.letters_graphemes = IPA.graphemes(self.letters)

        # Categorize
        self.vowel: typing.Optional[Vowel] = VOWELS.get(self.letters)
        self.consonant: typing.Optional[Consonant] = CONSONANTS.get(self.letters)
        self.schwa: typing.Optional[Schwa] = SCHWAS.get(self.letters)
        self.dipthong: typing.Optional[Dipthong] = None

        if (
            (not self.vowel)
            and (not self.consonant)
            and (not self.schwa)
            and (len(self.letters) == 2)
        ):
            # Check if dipthong (two vowels)
            vowel1 = VOWELS.get(self.letters[0])
            vowel2 = VOWELS.get(self.letters[1])
            if vowel1 and vowel2:
                self.dipthong = Dipthong(vowel1, vowel2)

    def _marked_letters(self) -> typing.List[str]:
        """Return each letter followed by its nasal/raised/extra marks."""
        parts: typing.List[str] = []
        for letter_index, letter in enumerate(self.letters):
            parts.append(letter)

            if letter_index in self.nasalated:
                parts.append(IPA.NASAL)

            if letter_index in self.raised:
                parts.append(IPA.RAISED)

            # .get avoids inserting empty entries into the defaultdict
            parts.extend(self._extra_combining.get(letter_index, ()))

        return parts

    @property
    def text(self) -> str:
        """Return letters with stress and elongation (NFC normalized).

        Order is: accents, stress, marked letters, tone, elongation. The
        result is cached once it is non-empty.
        """
        if self._text:
            return self._text

        parts: typing.List[str] = []

        for accent in self.accents:
            if accent == Accent.ACUTE:
                parts.append(IPA.ACCENT_ACUTE)
            elif accent == Accent.GRAVE:
                parts.append(IPA.ACCENT_GRAVE)

        if self.stress == Stress.PRIMARY:
            parts.append(IPA.STRESS_PRIMARY)
        elif self.stress == Stress.SECONDARY:
            parts.append(IPA.STRESS_SECONDARY)

        parts.extend(self._marked_letters())

        if self.tone:
            parts.append(self.tone)

        if self.elongated:
            parts.append(IPA.LONG)

        # Re-normalize and combine
        self._text = unicodedata.normalize("NFC", "".join(parts))

        return self._text

    @property
    def text_compare(self) -> str:
        """Return letters and elongation with no stress/tones (NFC normalized)"""
        if self._text_compare:
            return self._text_compare

        parts = self._marked_letters()

        if self.elongated:
            parts.append(IPA.LONG)

        # Re-normalize and combine
        self._text_compare = unicodedata.normalize("NFC", "".join(parts))

        return self._text_compare

    def copy(self) -> "Phoneme":
        """Create a copy of this phoneme.

        The copy keeps the allowable tones and the IPA/non-IPA mode of the
        original, so it is rebuilt from ``text`` the same way.
        """
        return Phoneme(
            text=self.text,
            example=self.example,
            unknown=self.unknown,
            tones=self.tones,
            is_ipa=self.is_ipa,
        )

    def __repr__(self) -> str:
        """Return symbol with stress and elongation."""
        return self.text

    def to_dict(self) -> typing.Dict[str, typing.Any]:
        """Return properties of phoneme as a dict.

        ``type`` is always the category name ("Phoneme", "Vowel",
        "Consonant", "Dipthong", or "Schwa"). For consonants, the consonant
        type value is reported separately under ``consonant_type``.
        """
        type_name = "Phoneme"
        props: typing.Dict[str, typing.Any] = {
            "text": repr(self),
            "letters": self.letters,
            "tone": self.tone,
            "tones": self.tones,
        }

        if self.unknown:
            props["unknown"] = True

        if self.example:
            props["example"] = self.example

        props["accents"] = [a.value for a in self.accents]
        props["stress"] = self.stress.value if self.stress is not None else ""

        if self.vowel:
            type_name = "Vowel"
            props["height"] = self.vowel.height.value
            props["placement"] = self.vowel.placement.value
            props["rounded"] = self.vowel.rounded
        elif self.consonant:
            type_name = "Consonant"

            # "type" is reserved for the category name assigned below. It is
            # set here first only to keep its historical position in the
            # dict; the consonant's own type goes under a separate key so it
            # is no longer overwritten.
            props["type"] = type_name
            props["consonant_type"] = self.consonant.type.value
            props["place"] = self.consonant.place.value
            props["voiced"] = self.consonant.voiced
        elif self.dipthong:
            type_name = "Dipthong"
        elif self.schwa:
            type_name = "Schwa"
            props["r_coloured"] = self.schwa.r_coloured

        props["type"] = type_name

        props["nasalated"] = list(self.nasalated)
        props["raised"] = list(self.raised)
        props["elongated"] = self.elongated

        return props

    def to_string(self) -> str:
        """Return descriptive string of phoneme"""
        props = self.to_dict()
        type_name = props.get("type", "Phoneme")

        prop_strs = [f"{k}={v}" for k, v in props.items()]

        return f"{type_name}(" + ", ".join(prop_strs) + ")"
|
|||
|
|
|||
|
|
|||
|
# -----------------------------------------------------------------------------
|
|||
|
|
|||
|
|
|||
|
class Phonemes:
    """Set of phonemes and allophones for a language"""

    COMMENT_STR = "#"

    def __init__(self, phonemes=None, ipa_map=None):
        """Create a phoneme set.

        Args:
            phonemes: Optional list of Phoneme objects.
            ipa_map: Optional map from allophone text to phoneme text.
        """
        self.phonemes = phonemes or []
        self.ipa_map = ipa_map or {}

        # Regex for replacing IPA
        self._ipa_map_regex = None

        # Phonemes sorted by decreasing length
        self._phonemes_sorted = None

        # Map from original phoneme to gruut IPA
        self.gruut_ipa_map: typing.Dict[str, str] = {}

        self.phoneme_texts: typing.Set[str] = set()
        self.update()

    def __iter__(self):
        """Iterate over the phonemes."""
        return iter(self.phonemes)

    def __len__(self):
        """Return the number of phonemes."""
        return len(self.phonemes)

    def __getitem__(self, key):
        """Index (or slice) into the list of phonemes."""
        return self.phonemes[key]

    def __contains__(self, item):
        """Test membership by phoneme text (for str) or by phoneme object."""
        if isinstance(item, str):
            # Compare IPA text
            return item in self.phoneme_texts

        return item in self.phonemes

    @staticmethod
    def from_language(language: str) -> "Phonemes":
        """Load phonemes for a given language.

        Reads ``phonemes.txt`` from the language's data directory and, when
        present, an optional ``ipa_map.txt`` whose non-empty lines are
        "<from_phoneme> <to_ipa>" pairs.
        """
        language = resolve_lang(language)

        # Load phonemes themselves
        phonemes_path = _DATA_DIR / language / "phonemes.txt"
        with open(phonemes_path, "r", encoding="utf-8") as phonemes_file:
            phonemes = Phonemes.from_text(phonemes_file)

        # Try to load optional map from original phoneme to gruut IPA
        gruut_ipa_map: typing.Optional[typing.Dict[str, str]] = None
        map_path = _DATA_DIR / language / "ipa_map.txt"
        if map_path.is_file():
            gruut_ipa_map = {}
            with open(map_path, "r", encoding="utf-8") as map_file:
                for line in map_file:
                    line = line.strip()
                    if not line:
                        continue

                    from_phoneme, to_ipa = line.split(maxsplit=1)
                    gruut_ipa_map[from_phoneme] = to_ipa

        if gruut_ipa_map:
            phonemes.gruut_ipa_map = gruut_ipa_map

        return phonemes

    @staticmethod
    def from_text(text_file) -> "Phonemes":
        """Load text file with phonemes, examples, and allophones.

        Each non-empty line (after removing ``#`` comments) has the form::

            phoneme [example] [allophone] [allophone] ! [tone] [tone]...

        Allophones are recorded in ``ipa_map`` as allophone -> phoneme, and
        everything after ``!`` is an allowable tone for the phoneme.
        """
        lang = Phonemes()

        for line in text_file:
            # Remove comments
            line, *_ = line.split(Phonemes.COMMENT_STR, maxsplit=1)
            line = line.strip()
            if line:
                # phoneme [example] [allophone] [allophone] ! [tone] [tone]...
                parts = line.split()
                phoneme_ipa = parts[0]
                example = ""

                if len(parts) > 1:
                    example = parts[1]

                tones = []
                if len(parts) > 2:
                    in_tone = False

                    # Map allophone back to phoneme
                    for part in parts[2:]:
                        if part == "!":
                            # Begin possible tones for this phoneme
                            in_tone = True
                        elif in_tone:
                            tones.append(part)
                        else:
                            lang.ipa_map[part] = phoneme_ipa

                lang.phonemes.append(
                    Phoneme(text=phoneme_ipa, example=example, tones=tones)
                )

        lang.update()

        return lang

    def update(self):
        """Call after modifying phonemes or IPA map to re-sort"""
        # Create single regex that will be used to replace IPA.
        # The final regex is of the form (AAA|BB|C) where each case is in
        # decreasing length order.
        #
        # If the replacement is not a substring of any phonemes, then the
        # replacement is straightforward.
        #
        # If it is a prefix of some phoneme, however, we need to be careful.
        # For example, naively replacing "e" with "eɪ" in the string "beɪ" will
        # produce "beeɪ" when we want it to be "beɪ".
        #
        # So the prefix case becomes "e(?!ɪ)" which uses a negative lookahead
        # to avoid the problem. When several phonemes share the prefix, every
        # one of their remainders is excluded: "a(?!ɪ|ʊ)".
        phoneme_texts = [p.text for p in self.phonemes]

        cases = []
        for match_text in sorted(self.ipa_map.keys(), key=len, reverse=True):
            if match_text.startswith(","):
                # Raw regex
                cases.append(match_text[1:])
                continue

            # Remainders of every phoneme that strictly extends match_text
            match_len = len(match_text)
            suffixes: typing.List[str] = []
            for phoneme_text in phoneme_texts:
                if (len(phoneme_text) > match_len) and phoneme_text.startswith(
                    match_text
                ):
                    suffix = phoneme_text[match_len:]
                    if suffix not in suffixes:
                        suffixes.append(suffix)

            if suffixes:
                # Use negative lookahead to avoid replacing part of a valid
                # phoneme. The whole match_text is kept; only what follows it
                # in the phoneme goes into the lookahead.
                cases.append(
                    "{}(?!{})".format(
                        re.escape(match_text),
                        "|".join(re.escape(suffix) for suffix in suffixes),
                    )
                )
            else:
                # No prefix problem
                cases.append(re.escape(match_text))

        ipa_map_regex_str = "({})".format("|".join(cases))
        self._ipa_map_regex = re.compile(ipa_map_regex_str)

        # Split phonemes and sort by reverse length
        split_phonemes = [
            ([pb.text for pb in Pronunciation.from_string(p.text)], p)
            for p in self.phonemes
        ]

        self._phonemes_sorted = sorted(
            split_phonemes, key=lambda kp: len(kp[0]), reverse=True
        )

        # Update IPA texts set for phonemes
        self.phoneme_texts = set(phoneme_texts)

    def split(
        self,
        pron_str: typing.Union[str, Pronunciation],
        keep_stress: bool = True,
        keep_accents: typing.Optional[bool] = None,
        drop_tones: bool = False,
        is_ipa: bool = True,
    ) -> typing.List[Phoneme]:
        """Split an IPA pronunciation into phonemes.

        Args:
            pron_str: Pronunciation as a string or Pronunciation object.
            keep_stress: If False, stress markers are dropped.
            keep_accents: If False, accent markers are dropped. Defaults to
                the value of ``keep_stress`` when None.
            drop_tones: If True, tone characters are dropped.
            is_ipa: If False, the string is split into graphemes and no
                stress/tone processing is done.

        Returns:
            List of phonemes. Pieces that match no phoneme in this set are
            returned as phonemes with ``unknown=True``.
        """
        if not self._ipa_map_regex:
            self.update()

        if keep_accents is None:
            keep_accents = keep_stress

        word_phonemes: typing.List[Phoneme] = []

        if self.ipa_map:
            # Allophone replacement works on text, so flatten first
            if isinstance(pron_str, Pronunciation):
                pron_str = "".join(p.text for p in pron_str)

            def handle_replace(match):
                text = match.group(1)
                return self.ipa_map.get(text, text)

            pron_str = self._ipa_map_regex.sub(handle_replace, pron_str)

        # Get text for IPA phones
        if isinstance(pron_str, Pronunciation):
            # Use supplied pronunciation
            ipas = [pb.text for pb in pron_str]
        elif is_ipa:
            # Split string into pronunciation
            pron = Pronunciation.from_string(
                pron_str,
                keep_stress=keep_stress,
                keep_accents=keep_accents,
                drop_tones=drop_tones,
            )
            ipas = [pb.text for pb in pron]
        else:
            ipas = IPA.graphemes(pron_str)

        # Keep stress and tones separate to make phoneme comparisons easier
        ipa_stress: typing.Dict[int, str] = defaultdict(str)
        ipa_tones: typing.Dict[int, str] = defaultdict(str)

        if is_ipa:
            in_tone = False
            for ipa_idx, ipa in enumerate(ipas):
                if ipa:
                    keep_ipa = ""
                    for codepoint in ipa:
                        if IPA.is_accent(codepoint) and (not in_tone):
                            if keep_accents:
                                ipa_stress[ipa_idx] += codepoint
                        elif IPA.is_stress(codepoint):
                            if keep_stress:
                                ipa_stress[ipa_idx] += codepoint
                        elif in_tone and (
                            codepoint in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}
                        ):
                            # Interpret as part of tone
                            if not drop_tones:
                                ipa_tones[ipa_idx] += codepoint
                        elif IPA.is_tone(codepoint):
                            if not drop_tones:
                                ipa_tones[ipa_idx] += codepoint

                            in_tone = True
                        else:
                            keep_ipa += codepoint

                    ipas[ipa_idx] = keep_ipa

        num_ipas: int = len(ipas)

        # ---------------------------------------------------------------------

        # Entries of ipas are set to None further down once they have been
        # consumed by a multi-piece phoneme, hence the index-based loop.
        # pylint: disable=consider-using-enumerate
        for ipa_idx in range(len(ipas)):
            ipa = ipas[ipa_idx]
            if ipa is None:
                # Skip replaced piece
                continue

            phoneme_match = False
            for phoneme_ipas, phoneme in self._phonemes_sorted:
                # Longest phonemes come first; only try those that still fit
                if ipa_idx <= (num_ipas - len(phoneme_ipas)):
                    phoneme_match = True
                    phoneme_stress = ""
                    phoneme_tones = ""

                    # Look forward into sequence
                    for phoneme_idx in range(len(phoneme_ipas)):
                        phoneme_stress += ipa_stress[ipa_idx + phoneme_idx]
                        phoneme_tones += ipa_tones[ipa_idx + phoneme_idx]

                        if phoneme_ipas[phoneme_idx] != ipas[ipa_idx + phoneme_idx]:
                            phoneme_match = False
                            break

                    if phoneme_match:
                        # Successful match
                        if phoneme_stress or phoneme_tones:
                            # Create a copy of the phoneme with applied
                            # stress/tones, keeping its allowable tones
                            phoneme = Phoneme(
                                text=(phoneme_stress + phoneme.text + phoneme_tones),
                                example=phoneme.example,
                                tones=phoneme.tones,
                            )

                        word_phonemes.append(phoneme)

                        # Patch ipas to skip replaced pieces
                        for phoneme_idx in range(1, len(phoneme_ipas)):
                            ipas[ipa_idx + phoneme_idx] = None

                        break

            if not phoneme_match:
                # Add unknown phoneme
                word_phonemes.append(Phoneme(text=ipa, unknown=True))

        return word_phonemes
|