#!/usr/bin/env python3
"""Functions for manipulating phones/phonemes"""
import logging
import re
import typing
import unicodedata
from collections import defaultdict

from gruut_ipa.constants import (  # noqa: F401
    _DATA_DIR,
    _DIR,
    CONSONANTS,
    FEATURE_COLUMNS,
    FEATURE_EMPTY,
    FEATURE_KEYS,
    FEATURE_ORDINAL_COLUMNS,
    IPA,
    LANG_ALIASES,
    SCHWAS,
    VOWELS,
    Accent,
    Break,
    BreakType,
    Consonant,
    ConsonantPlace,
    ConsonantType,
    Dipthong,
    Intonation,
    PhonemeLength,
    Schwa,
    Stress,
    Vowel,
    VowelHeight,
    VowelPlacement,
)
from gruut_ipa.utils import resolve_lang

_LOGGER = logging.getLogger("gruut_ipa")

# -----------------------------------------------------------------------------


class Phone:
    """Single IPA phone with diacritics and suprasegmentals"""

    def __init__(
        self,
        letters: str,
        stress: typing.Optional[Stress] = None,
        accents: typing.Optional[typing.Iterable[Accent]] = None,
        is_long: bool = False,
        nasal: typing.Optional[typing.Set[int]] = None,
        raised: typing.Optional[typing.Set[int]] = None,
        diacritics: typing.Optional[typing.Dict[int, typing.Set[str]]] = None,
        suprasegmentals: typing.Optional[typing.Set[str]] = None,
        tone: str = "",
    ):
        """Create a phone.

        Args:
            letters: Base letters of the phone (stored NFC normalized).
            stress: Optional stress level.
            accents: Optional accents; stored as a list.
            is_long: True if the phone is elongated.
            nasal: Indexes into ``letters`` that carry a nasal diacritic.
            raised: Indexes into ``letters`` that carry a raised diacritic.
            diacritics: Extra diacritics keyed by index into ``letters``.
            suprasegmentals: Extra suprasegmental symbols.
            tone: Tone string written after the letters.
        """
        self.letters: str = unicodedata.normalize("NFC", letters)
        self.stress = stress
        self.accents: typing.List[Accent] = list(accents or [])
        self.is_long: bool = is_long

        self.nasal: typing.Set[int] = nasal or set()
        self.is_nasal = bool(self.nasal)

        self.raised: typing.Set[int] = raised or set()
        self.is_raised = bool(self.raised)

        self.tone: str = tone

        # Copy the caller's containers: markers are added to them below, and
        # that must not leak back into objects the caller still owns.
        self.diacritics: typing.Dict[int, typing.Set[str]] = defaultdict(set)
        if diacritics:
            for letter_index, letter_diacritics in diacritics.items():
                self.diacritics[letter_index] = set(letter_diacritics)

        self.suprasegmentals: typing.Set[str] = set(suprasegmentals or ())

        # Decompose suprasegmentals and diacritics
        if self.stress == Stress.PRIMARY:
            self.suprasegmentals.add(IPA.STRESS_PRIMARY)
        elif self.stress == Stress.SECONDARY:
            self.suprasegmentals.add(IPA.STRESS_SECONDARY)

        if Accent.ACUTE in self.accents:
            self.suprasegmentals.add(IPA.ACCENT_ACUTE)

        if Accent.GRAVE in self.accents:
            self.suprasegmentals.add(IPA.ACCENT_GRAVE)

        if self.is_long:
            self.suprasegmentals.add(IPA.LONG)

        # Nasal
        for letter_index in self.nasal:
            self.diacritics[letter_index].add(IPA.NASAL)

        # Raised
        for letter_index in self.raised:
            self.diacritics[letter_index].add(IPA.RAISED)

        # Cache for the text property
        self._text: str = ""

        # Category lookups are keyed on the normalized letters
        self.vowel: typing.Optional[Vowel] = VOWELS.get(self.letters)
        self.consonant: typing.Optional[Consonant] = CONSONANTS.get(self.letters)
        self.schwa: typing.Optional[Schwa] = SCHWAS.get(self.letters)

    @property
    def text(self) -> str:
        """Get textual representation of phone (NFC normalized)"""
        if self._text:
            return self._text

        parts: typing.List[str] = []

        # Pre-letter suprasegmentals
        for accent in self.accents:
            if accent == Accent.ACUTE:
                parts.append(IPA.ACCENT_ACUTE)
            elif accent == Accent.GRAVE:
                parts.append(IPA.ACCENT_GRAVE)

        if self.stress == Stress.PRIMARY:
            parts.append(IPA.STRESS_PRIMARY)
        elif self.stress == Stress.SECONDARY:
            parts.append(IPA.STRESS_SECONDARY)

        # Letters and diacritics
        for letter_index, letter in enumerate(self.letters):
            parts.append(letter)

            # Diacritics are kept in a set; sort them so the text does not
            # depend on hash ordering.
            parts.extend(sorted(self.diacritics.get(letter_index, ())))

        # Tone
        if self.tone:
            parts.append(self.tone)

        # Post-letter suprasegmentals
        if self.is_long:
            parts.append(IPA.LONG)

        # Re-normalize and combine
        self._text = unicodedata.normalize("NFC", "".join(parts))

        return self._text

    @property
    def is_vowel(self) -> bool:
        """True if phone is a vowel"""
        return self.vowel is not None

    @property
    def is_consonant(self) -> bool:
        """True if phone is a consonant"""
        return self.consonant is not None

    @property
    def is_schwa(self) -> bool:
        """True if phone is a schwa"""
        return self.schwa is not None

    def __repr__(self) -> str:
        return self.text

    @staticmethod
    def from_string(phone_str: str) -> "Phone":
        """Parse phone from string.

        The string is NFD decomposed and each codepoint is classified as an
        accent, stress, tone, length/nasal/raised marker, tie, other combining
        diacritic, or letter. Brackets and breaks are dropped.
        """
        # Decompose into base and combining characters
        codepoints = unicodedata.normalize("NFD", phone_str)
        kwargs: typing.Dict[str, typing.Any] = {
            "letters": "",
            "diacritics": defaultdict(set),
            "tone": "",
            "accents": [],
            "nasal": set(),
            "raised": set(),
        }

        in_tone = False
        new_letter = False

        # Index (into letters) of the most recently added letter
        letter_index = 0

        for c in codepoints:
            # Check for stress
            if (c == IPA.ACCENT_ACUTE) and not in_tone:
                kwargs["accents"].append(Accent.ACUTE)
            elif (c == IPA.ACCENT_GRAVE) and not in_tone:
                kwargs["accents"].append(Accent.GRAVE)
            elif c == IPA.STRESS_PRIMARY:
                kwargs["stress"] = Stress.PRIMARY
            elif c == IPA.STRESS_SECONDARY:
                kwargs["stress"] = Stress.SECONDARY
            elif in_tone and (c in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}):
                # Interpret as part of tone
                kwargs["tone"] += c
            elif IPA.is_long(c):
                # Check for elongation
                kwargs["is_long"] = True
            elif IPA.is_nasal(c):
                # Check for nasalation
                kwargs["nasal"].add(letter_index)
            elif IPA.is_raised(c):
                # Check for raised articulation
                kwargs["raised"].add(letter_index)
            elif IPA.is_bracket(c) or IPA.is_break(c):
                # Skip brackets/syllable breaks
                pass
            elif IPA.is_tie(c):
                # Keep ties in letters; the tie occupies a position of its own
                kwargs["letters"] += c
                letter_index += 1
            elif IPA.is_tone(c):
                # Tone numbers/letters
                kwargs["tone"] += c
                in_tone = True
            elif unicodedata.combining(c) > 0:
                # Stow some diacritics that we don't do anything with
                kwargs["diacritics"][letter_index].add(c)
            else:
                # Include all other characters in letters
                kwargs["letters"] += c

                # The first letter keeps index 0; later letters advance it
                if new_letter:
                    letter_index += 1

                new_letter = True

        return Phone(**kwargs)


# -----------------------------------------------------------------------------


class Pronunciation:
    """Collection of phones and breaks for some unit of text (word, sentence, etc.)"""

    def __init__(
        self, phones_and_others: typing.List[typing.Union[Phone, Break, Intonation]]
    ):
        """Store the sequence and index its phones, breaks, and intonations."""
        self.phones_and_others = phones_and_others

        self.phones: typing.List[Phone] = []
        self.breaks: typing.List[Break] = []
        self.intonations: typing.List[Intonation] = []

        # Decompose into phones, breaks, and intonations
        for p in self.phones_and_others:
            if isinstance(p, Phone):
                self.phones.append(p)
            elif isinstance(p, Break):
                self.breaks.append(p)
            elif isinstance(p, Intonation):
                self.intonations.append(p)

        # Cache for the text property
        self._text = ""

    @property
    def text(self) -> str:
        """Get text representation of pronunciation (NFC normalized)"""
        if not self._text:
            self._text = "".join(p.text for p in self.phones_and_others)

        return self._text

    def __repr__(self) -> str:
        return self.text

    def __iter__(self):
        return iter(self.phones_and_others)

    def __getitem__(self, idx):
        return self.phones_and_others[idx]

    @staticmethod
    def from_string(
        pron_str: str,
        keep_stress: bool = True,
        keep_accents: typing.Optional[bool] = None,
        drop_tones: bool = False,
        keep_ties: bool = True,
    ) -> "Pronunciation":
        """Split an IPA pronunciation into phones.

        Stress/accent markers bind to the next non-combining codepoint (e.g., ˈa).
        Elongation markers bind to the previous non-combining codepoint (e.g., aː).
        Ties join two non-combining sequences (e.g. t͡ʃ).

        Whitespace, brackets, and syllable breaks are skipped.

        Args:
            pron_str: IPA string to split.
            keep_stress: Keep stress markers on the resulting phones.
            keep_accents: Keep accent markers; defaults to ``keep_stress``.
            drop_tones: Discard tone markers instead of attaching them.
            keep_ties: Keep ties so tied letters stay in one phone.

        Returns:
            Pronunciation holding the phones, breaks, and intonations in order.
        """
        if keep_accents is None:
            keep_accents = keep_stress

        clusters = []

        # Pieces of the cluster currently being assembled
        cluster = ""
        stress = ""
        accents = ""
        tone = ""

        in_tone = False

        # When True, the next non-combining codepoint joins the current
        # cluster instead of starting a new one.
        skip_next_cluster = False

        codepoints = unicodedata.normalize("NFD", pron_str)

        for codepoint in codepoints:
            new_cluster = False
            is_stress = False
            is_accent = False

            if (
                codepoint.isspace()
                or IPA.is_bracket(codepoint)
                or (codepoint in {IPA.BREAK_SYLLABLE})
            ):
                # Skip whitespace, brackets, and syllable breaks
                continue

            if IPA.is_break(codepoint) or IPA.is_intonation(codepoint):
                # Keep minor/major/word breaks and intonation markers
                new_cluster = True

            if IPA.is_accent(codepoint) and not in_tone:
                is_accent = True
                if cluster:
                    new_cluster = True
                    skip_next_cluster = True
            elif IPA.is_stress(codepoint):
                is_stress = True
                if cluster:
                    new_cluster = True
                    skip_next_cluster = True
            elif in_tone and (codepoint in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}):
                # Interpret as part of tone
                if not drop_tones:
                    tone += codepoint

                continue
            elif IPA.is_long(codepoint):
                # Add to current cluster
                pass
            elif IPA.is_tie(codepoint):
                if keep_ties:
                    # Add next non-combining to current cluster
                    skip_next_cluster = True
                else:
                    # Ignore ties
                    continue
            elif IPA.is_tone(codepoint):
                # Add to end of current cluster
                if not drop_tones:
                    tone += codepoint

                in_tone = True
                continue
            elif unicodedata.combining(codepoint) == 0:
                # Non-combining character
                if skip_next_cluster:
                    # Add to current cluster
                    skip_next_cluster = False
                elif cluster:
                    # Start a new cluster
                    new_cluster = True

            if new_cluster and cluster:
                # Flush the finished cluster before consuming this codepoint
                clusters.append(accents + stress + cluster + tone)
                accents = ""
                stress = ""
                cluster = ""
                tone = ""

            if is_accent:
                if keep_accents:
                    accents += codepoint
            elif is_stress:
                if keep_stress:
                    stress += codepoint
            else:
                cluster += codepoint

        if cluster:
            clusters.append(accents + stress + cluster + tone)

        phones_and_others: typing.List[typing.Union[Phone, Break, Intonation]] = []
        for cluster in clusters:
            if IPA.is_break(cluster):
                phones_and_others.append(Break.from_string(cluster))
            elif IPA.is_intonation(cluster):
                phones_and_others.append(Intonation.from_string(cluster))
            else:
                phones_and_others.append(Phone.from_string(cluster))

        return Pronunciation(phones_and_others)


# -----------------------------------------------------------------------------


class Phoneme:
    """Phoneme composed of international phonetic alphabet symbols"""

    def __init__(
        self,
        text: str,
        example: str = "",
        unknown: bool = False,
        tones: typing.Optional[typing.Iterable[str]] = None,
        is_ipa: bool = True,
    ):
        """Create a phoneme.

        Args:
            text: Phoneme text. Parsed into letters, stress, accents, tone and
                diacritics when ``is_ipa`` is True; otherwise used verbatim as
                the letters.
            example: Example for the phoneme.
            unknown: True if the phoneme is not part of a known phoneme set.
            tones: Allowable tones for the phoneme.
            is_ipa: True if ``text`` should be parsed as IPA.
        """
        self._text = ""
        self._text_compare = ""
        self.example = example
        self.unknown = unknown

        # Remembered so copy() can rebuild the phoneme the same way
        self.is_ipa = is_ipa

        # List of allowable tones for phoneme
        self.tones = list(tones or [])

        self.stress: typing.Optional[Stress] = None
        self.accents: typing.List[Accent] = []
        self.elongated: bool = False
        self.nasalated: typing.Set[int] = set()
        self.raised: typing.Set[int] = set()
        self._extra_combining: typing.Dict[int, typing.List[str]] = defaultdict(list)

        # Decompose into base and combining characters
        codepoints = unicodedata.normalize("NFD", text)
        self.letters = ""
        self.tone = ""

        if is_ipa:
            in_tone = False

            # Index (into letters) of the most recently added letter
            letter_index = 0
            new_letter = False

            for c in codepoints:
                # Check for stress
                if (c == IPA.ACCENT_ACUTE) and (not in_tone):
                    self.accents.append(Accent.ACUTE)
                elif (c == IPA.ACCENT_GRAVE) and (not in_tone):
                    self.accents.append(Accent.GRAVE)
                elif c == IPA.STRESS_PRIMARY:
                    self.stress = Stress.PRIMARY
                elif c == IPA.STRESS_SECONDARY:
                    self.stress = Stress.SECONDARY
                elif in_tone and (c in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}):
                    # Interpret as part of tone
                    self.tone += c
                elif IPA.is_long(c):
                    # Check for elongation
                    self.elongated = True
                elif IPA.is_nasal(c):
                    # Check for nasalation
                    self.nasalated.add(letter_index)
                elif IPA.is_raised(c):
                    # Check for raised articulation
                    self.raised.add(letter_index)
                elif IPA.is_bracket(c) or IPA.is_break(c):
                    # Skip brackets/syllable breaks
                    pass
                elif IPA.is_tone(c):
                    # Keep tone separate
                    self.tone += c
                    in_tone = True
                elif c in {IPA.SYLLABIC, IPA.NON_SYLLABIC, IPA.EXTRA_SHORT}:
                    # Stow some diacritics that we don't do anything with
                    self._extra_combining[letter_index].append(c)
                else:
                    # Include all other characters in base
                    self.letters += c

                    # The first letter keeps index 0; later letters advance it
                    if new_letter:
                        letter_index += 1

                    new_letter = True
        else:
            self.letters = text

        # Re-normalize and combine letters
        self.letters = unicodedata.normalize("NFC", self.letters)
        self.letters_graphemes = IPA.graphemes(self.letters)

        # Categorize
        self.vowel: typing.Optional[Vowel] = VOWELS.get(self.letters)
        self.consonant: typing.Optional[Consonant] = CONSONANTS.get(self.letters)
        self.schwa: typing.Optional[Schwa] = SCHWAS.get(self.letters)
        self.dipthong: typing.Optional[Dipthong] = None

        if (
            (not self.vowel)
            and (not self.consonant)
            and (not self.schwa)
            and (len(self.letters) == 2)
        ):
            # Check if dipthong (two vowels)
            vowel1 = VOWELS.get(self.letters[0])
            vowel2 = VOWELS.get(self.letters[1])
            if vowel1 and vowel2:
                self.dipthong = Dipthong(vowel1, vowel2)

    def _letters_with_diacritics(self) -> typing.List[str]:
        """Return letters interleaved with nasal/raised/extra diacritics."""
        parts: typing.List[str] = []
        for letter_index, letter in enumerate(self.letters):
            parts.append(letter)

            if letter_index in self.nasalated:
                parts.append(IPA.NASAL)

            if letter_index in self.raised:
                parts.append(IPA.RAISED)

            # .get() avoids inserting empty lists into the defaultdict on read
            parts.extend(self._extra_combining.get(letter_index, ()))

        return parts

    @property
    def text(self) -> str:
        """Return letters with stress and elongation (NFC normalized)"""
        if self._text:
            return self._text

        parts: typing.List[str] = []

        for accent in self.accents:
            if accent == Accent.ACUTE:
                parts.append(IPA.ACCENT_ACUTE)
            elif accent == Accent.GRAVE:
                parts.append(IPA.ACCENT_GRAVE)

        if self.stress == Stress.PRIMARY:
            parts.append(IPA.STRESS_PRIMARY)
        elif self.stress == Stress.SECONDARY:
            parts.append(IPA.STRESS_SECONDARY)

        parts.extend(self._letters_with_diacritics())

        if self.tone:
            parts.append(self.tone)

        if self.elongated:
            parts.append(IPA.LONG)

        # Re-normalize and combine
        self._text = unicodedata.normalize("NFC", "".join(parts))

        return self._text

    @property
    def text_compare(self) -> str:
        """Return letters and elongation with no stress/tones (NFC normalized)"""
        if self._text_compare:
            return self._text_compare

        parts = self._letters_with_diacritics()

        if self.elongated:
            parts.append(IPA.LONG)

        # Re-normalize and combine
        self._text_compare = unicodedata.normalize("NFC", "".join(parts))

        return self._text_compare

    def copy(self) -> "Phoneme":
        """Create a copy of this phoneme.

        The allowable tones and the IPA-parsing mode are carried over so the
        copy is built the same way as the original.
        """
        return Phoneme(
            text=self.text,
            example=self.example,
            unknown=self.unknown,
            tones=self.tones,
            is_ipa=self.is_ipa,
        )

    def __repr__(self) -> str:
        """Return symbol with stress and elongation."""
        return self.text

    def to_dict(self) -> typing.Dict[str, typing.Any]:
        """Return properties of phoneme as a dict"""
        type_name = "Phoneme"
        props: typing.Dict[str, typing.Any] = {
            "text": repr(self),
            "letters": self.letters,
            "tone": self.tone,
            "tones": self.tones,
        }

        if self.unknown:
            props["unknown"] = True

        if self.example:
            props["example"] = self.example

        props["accents"] = [a.value for a in self.accents]
        props["stress"] = self.stress.value if self.stress is not None else ""

        if self.vowel:
            type_name = "Vowel"
            props["height"] = self.vowel.height.value
            props["placement"] = self.vowel.placement.value
            props["rounded"] = self.vowel.rounded
        elif self.consonant:
            type_name = "Consonant"
            # NOTE(review): this value is overwritten by the category name
            # assigned to props["type"] below, so the consonant type never
            # reaches the caller. Left as-is to keep the dict's keys stable.
            props["type"] = self.consonant.type.value
            props["place"] = self.consonant.place.value
            props["voiced"] = self.consonant.voiced
        elif self.dipthong:
            type_name = "Dipthong"
        elif self.schwa:
            type_name = "Schwa"
            props["r_coloured"] = self.schwa.r_coloured

        props["type"] = type_name

        props["nasalated"] = list(self.nasalated)
        props["raised"] = list(self.raised)
        props["elongated"] = self.elongated

        return props

    def to_string(self) -> str:
        """Return descriptive string of phoneme"""
        props = self.to_dict()
        type_name = props.get("type", "Phoneme")

        prop_strs = [f"{k}={v}" for k, v in props.items()]

        return f"{type_name}(" + ", ".join(prop_strs) + ")"


# -----------------------------------------------------------------------------


class Phonemes:
    """Set of phonemes and allophones for a language"""

    COMMENT_STR = "#"

    def __init__(self, phonemes=None, ipa_map=None):
        """Create a phoneme set.

        Args:
            phonemes: Optional list of Phoneme objects.
            ipa_map: Optional map from allophone text to phoneme text, applied
                to pronunciations before they are split.
        """
        self.phonemes = phonemes or []
        self.ipa_map = ipa_map or {}

        # Regex for replacing IPA
        self._ipa_map_regex = None

        # Phonemes sorted by decreasing length
        self._phonemes_sorted = None

        # Map from original phoneme to gruut IPA
        self.gruut_ipa_map: typing.Dict[str, str] = {}

        # Texts of all phonemes; rebuilt by update()
        self.phoneme_texts: typing.Set[str] = set()

        self.update()

    def __iter__(self):
        return iter(self.phonemes)

    def __len__(self):
        return len(self.phonemes)

    def __getitem__(self, key):
        return self.phonemes[key]

    def __contains__(self, item):
        if isinstance(item, str):
            # Compare IPA text
            return item in self.phoneme_texts

        return item in self.phonemes

    @staticmethod
    def from_language(language: str) -> "Phonemes":
        """Load phonemes for a given language.

        Reads ``phonemes.txt`` from the language's data directory and, when an
        ``ipa_map.txt`` file is present there, the map from original phoneme
        to gruut IPA.
        """
        language = resolve_lang(language)

        # Load phonemes themselves
        phonemes_path = _DATA_DIR / language / "phonemes.txt"
        with open(phonemes_path, "r", encoding="utf-8") as phonemes_file:
            phonemes = Phonemes.from_text(phonemes_file)

        # Try to load optional map from original phoneme to gruut IPA
        gruut_ipa_map: typing.Optional[typing.Dict[str, str]] = None
        map_path = _DATA_DIR / language / "ipa_map.txt"
        if map_path.is_file():
            gruut_ipa_map = {}
            with open(map_path, "r", encoding="utf-8") as map_file:
                for line in map_file:
                    line = line.strip()
                    if not line:
                        continue

                    from_phoneme, to_ipa = line.split(maxsplit=1)
                    gruut_ipa_map[from_phoneme] = to_ipa

        if gruut_ipa_map:
            phonemes.gruut_ipa_map = gruut_ipa_map

        return phonemes

    @staticmethod
    def from_text(text_file) -> "Phonemes":
        """Load text file with phonemes, examples, and allophones.

        ``text_file`` is iterated line by line. Anything after ``#`` on a line
        is a comment, and blank lines are skipped.
        """
        lang = Phonemes()

        for line in text_file:
            # Remove comments
            line, *_ = line.split(Phonemes.COMMENT_STR, maxsplit=1)
            line = line.strip()
            if line:
                # phoneme [example] [allophone] [allophone] ! [tone] [tone]...
                parts = line.split()
                phoneme_ipa = parts[0]
                example = ""

                if len(parts) > 1:
                    example = parts[1]

                tones = []

                if len(parts) > 2:
                    in_tone = False

                    # Map allophone back to phoneme
                    for part in parts[2:]:
                        if part == "!":
                            # Begin possible tones for this phoneme
                            in_tone = True
                        elif in_tone:
                            tones.append(part)
                        else:
                            lang.ipa_map[part] = phoneme_ipa

                lang.phonemes.append(
                    Phoneme(text=phoneme_ipa, example=example, tones=tones)
                )

        lang.update()

        return lang

    def update(self):
        """Call after modifying phonemes or IPA map to re-sort"""
        # Create single regex that will be used to replace IPA.
        # The final regex is of the form (AAA|BB|C) where each case is in
        # decreasing length order.
        #
        # If the replacement is not a substring of any phonemes, then the
        # replacement is straightforward.
        #
        # If it is a substring of some phoneme, however, we need to be careful.
        # For example, naively replacing "e" with "eɪ" in the string "beɪ" will
        # produce "beeɪ" when we want it to be "beɪ".
        #
        # So the substring case becomes "e(?!ɪ)" which uses a negative lookahead
        # to avoid the problem.
        cases = []
        for match_text in sorted(self.ipa_map.keys(), key=len, reverse=True):
            if match_text.startswith(","):
                # Raw regex
                cases.append(match_text[1:])
                continue

            # Check against all of the phonemes
            case_added = False
            match_len = len(match_text)
            for phoneme in self.phonemes:
                phoneme_text = phoneme.text
                if (len(phoneme_text) > match_len) and phoneme_text.startswith(
                    match_text
                ):
                    # Use negative lookahead to avoid replacing part of a valid
                    # phoneme: match the whole allophone, but only when it is
                    # not followed by the rest of the longer phoneme.
                    # NOTE(review): only the first phoneme that extends the
                    # allophone is guarded against.
                    cases.append(
                        "{}(?!{})".format(
                            re.escape(match_text),
                            re.escape(phoneme_text[match_len:]),
                        )
                    )
                    case_added = True
                    break

            if not case_added:
                # No substring problem
                cases.append(re.escape(match_text))

        ipa_map_regex_str = "({})".format("|".join(cases))
        self._ipa_map_regex = re.compile(ipa_map_regex_str)

        # Split phonemes and sort by reverse length
        split_phonemes = [
            ([pb.text for pb in Pronunciation.from_string(p.text)], p)
            for p in self.phonemes
        ]

        self._phonemes_sorted = sorted(
            split_phonemes, key=lambda kp: len(kp[0]), reverse=True
        )

        # Update IPA texts set for phonemes
        self.phoneme_texts = set(p.text for p in self.phonemes)

    def split(
        self,
        pron_str: typing.Union[str, Pronunciation],
        keep_stress: bool = True,
        keep_accents: typing.Optional[bool] = None,
        drop_tones: bool = False,
        is_ipa: bool = True,
    ) -> typing.List[Phoneme]:
        """Split an IPA pronunciation into phonemes.

        Args:
            pron_str: Pronunciation as a string or a Pronunciation object.
            keep_stress: Keep stress markers on the returned phonemes.
            keep_accents: Keep accent markers; defaults to ``keep_stress``.
            drop_tones: Discard tone markers.
            is_ipa: If False, a string is split with ``IPA.graphemes`` and no
                stress/tone separation is done.

        Returns:
            List of phonemes. Pieces that match no known phoneme are returned
            as phonemes with ``unknown=True``.
        """
        if not self._ipa_map_regex:
            self.update()

        if keep_accents is None:
            keep_accents = keep_stress

        word_phonemes: typing.List[Phoneme] = []

        if self.ipa_map:
            if isinstance(pron_str, Pronunciation):
                # The allophone map works on text, so flatten first
                pron_str = "".join(p.text for p in pron_str)

            def handle_replace(match):
                text = match.group(1)
                return self.ipa_map.get(text, text)

            pron_str = self._ipa_map_regex.sub(handle_replace, pron_str)

        # Get text for IPA phones
        if isinstance(pron_str, Pronunciation):
            # Use supplied pronunciation
            ipas = [pb.text for pb in pron_str]
        elif is_ipa:
            # Split string into pronunciation
            pron = Pronunciation.from_string(
                pron_str,
                keep_stress=keep_stress,
                keep_accents=keep_accents,
                drop_tones=drop_tones,
            )
            ipas = [pb.text for pb in pron]
        else:
            ipas = IPA.graphemes(pron_str)

        # Keep stress and tones separate to make phoneme comparisons easier
        ipa_stress: typing.Dict[int, str] = defaultdict(str)
        ipa_tones: typing.Dict[int, str] = defaultdict(str)

        if is_ipa:
            in_tone = False
            for ipa_idx, ipa in enumerate(ipas):
                if ipa:
                    keep_ipa = ""
                    for codepoint in ipa:
                        if IPA.is_accent(codepoint) and (not in_tone):
                            if keep_accents:
                                ipa_stress[ipa_idx] += codepoint
                        elif IPA.is_stress(codepoint):
                            if keep_stress:
                                ipa_stress[ipa_idx] += codepoint
                        elif in_tone and (
                            codepoint in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}
                        ):
                            # Interpret as part of tone
                            if not drop_tones:
                                ipa_tones[ipa_idx] += codepoint
                        elif IPA.is_tone(codepoint):
                            if not drop_tones:
                                ipa_tones[ipa_idx] += codepoint

                            in_tone = True
                        else:
                            keep_ipa += codepoint

                    ipas[ipa_idx] = keep_ipa

        num_ipas: int = len(ipas)

        # ---------------------------------------------------------------------

        # pylint: disable=consider-using-enumerate
        for ipa_idx in range(len(ipas)):
            ipa = ipas[ipa_idx]
            if ipa is None:
                # Skip replaced piece
                continue

            phoneme_match = False
            for phoneme_ipas, phoneme in self._phonemes_sorted:
                # Only try phonemes that fit in the remaining pieces
                if ipa_idx <= (num_ipas - len(phoneme_ipas)):
                    phoneme_match = True
                    phoneme_stress = ""
                    phoneme_tones = ""

                    # Look forward into sequence
                    for phoneme_idx in range(len(phoneme_ipas)):
                        phoneme_stress += ipa_stress[ipa_idx + phoneme_idx]
                        phoneme_tones += ipa_tones[ipa_idx + phoneme_idx]

                        if phoneme_ipas[phoneme_idx] != ipas[ipa_idx + phoneme_idx]:
                            phoneme_match = False
                            break

                    if phoneme_match:
                        # Successful match
                        if phoneme_stress or phoneme_tones:
                            # Create a copy of the phoneme with applied stress/tones
                            phoneme = Phoneme(
                                text=(phoneme_stress + phoneme.text + phoneme_tones),
                                example=phoneme.example,
                            )

                        word_phonemes.append(phoneme)

                        # Patch ipas to skip replaced pieces
                        for phoneme_idx in range(1, len(phoneme_ipas)):
                            ipas[ipa_idx + phoneme_idx] = None

                        break

            if not phoneme_match:
                # Add unknown phoneme
                word_phonemes.append(Phoneme(text=ipa, unknown=True))

        return word_phonemes