ai-content-maker/.venv/Lib/site-packages/gruut_ipa/phonemes.py

926 lines
30 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Functions for manipulating phones/phonemes"""
import logging
import re
import typing
import unicodedata
from collections import defaultdict
from gruut_ipa.constants import ( # noqa: F401
_DATA_DIR,
_DIR,
CONSONANTS,
FEATURE_COLUMNS,
FEATURE_EMPTY,
FEATURE_KEYS,
FEATURE_ORDINAL_COLUMNS,
IPA,
LANG_ALIASES,
SCHWAS,
VOWELS,
Accent,
Break,
BreakType,
Consonant,
ConsonantPlace,
ConsonantType,
Dipthong,
Intonation,
PhonemeLength,
Schwa,
Stress,
Vowel,
VowelHeight,
VowelPlacement,
)
from gruut_ipa.utils import resolve_lang
_LOGGER = logging.getLogger("gruut_ipa")
# -----------------------------------------------------------------------------
class Phone:
    """Single IPA phone with diacritics and suprasegmentals.

    ``letters`` is stored NFC-normalized.  ``diacritics`` maps an index into
    ``letters`` to the set of combining marks attached to that letter, and
    ``suprasegmentals`` collects the stress/accent/length markers implied by
    the ``stress``, ``accents`` and ``is_long`` arguments.
    """

    def __init__(
        self,
        letters: str,
        stress: typing.Optional[Stress] = None,
        accents: typing.Optional[typing.Iterable[Accent]] = None,
        is_long: bool = False,
        nasal: typing.Optional[typing.Set[int]] = None,
        raised: typing.Optional[typing.Set[int]] = None,
        diacritics: typing.Optional[typing.Dict[int, typing.Set[str]]] = None,
        suprasegmentals: typing.Optional[typing.Set[str]] = None,
        tone: str = "",
    ):
        """Create a phone.

        The ``nasal``, ``raised``, ``diacritics`` and ``suprasegmentals``
        containers are copied, so adding the implied markers below never
        modifies objects owned by the caller.
        """
        self.letters: str = unicodedata.normalize("NFC", letters)
        self.stress = stress
        self.accents: typing.List[Accent] = list(accents or [])
        self.is_long: bool = is_long

        # Indexes (into letters) of nasalized / raised letters
        self.nasal: typing.Set[int] = set(nasal or ())
        self.is_nasal = bool(self.nasal)

        self.raised: typing.Set[int] = set(raised or ())
        self.is_raised = bool(self.raised)

        self.tone: str = tone

        # Copy both the mapping and each per-letter set
        self.diacritics: typing.Dict[int, typing.Set[str]] = defaultdict(set)
        for letter_index, marks in (diacritics or {}).items():
            self.diacritics[letter_index] = set(marks)

        self.suprasegmentals: typing.Set[str] = set(suprasegmentals or ())

        # Decompose suprasegmentals and diacritics
        if self.stress == Stress.PRIMARY:
            self.suprasegmentals.add(IPA.STRESS_PRIMARY)
        elif self.stress == Stress.SECONDARY:
            self.suprasegmentals.add(IPA.STRESS_SECONDARY)

        if Accent.ACUTE in self.accents:
            self.suprasegmentals.add(IPA.ACCENT_ACUTE)

        if Accent.GRAVE in self.accents:
            self.suprasegmentals.add(IPA.ACCENT_GRAVE)

        if self.is_long:
            self.suprasegmentals.add(IPA.LONG)

        # Nasal
        for letter_index in self.nasal:
            self.diacritics[letter_index].add(IPA.NASAL)

        # Raised
        for letter_index in self.raised:
            self.diacritics[letter_index].add(IPA.RAISED)

        # Lazily built by the text property
        self._text: str = ""

        self.vowel: typing.Optional[Vowel] = VOWELS.get(self.letters)
        self.consonant: typing.Optional[Consonant] = CONSONANTS.get(self.letters)
        self.schwa: typing.Optional[Schwa] = SCHWAS.get(self.letters)

    @property
    def text(self) -> str:
        """Get textual representation of phone (NFC normalized).

        Order: accents, stress, each letter followed by its diacritics, tone,
        then the length marker.  The result is cached after the first call.
        """
        if self._text:
            return self._text

        pieces: typing.List[str] = []

        # Pre-letter suprasegmentals
        for accent in self.accents:
            if accent == Accent.ACUTE:
                pieces.append(IPA.ACCENT_ACUTE)
            elif accent == Accent.GRAVE:
                pieces.append(IPA.ACCENT_GRAVE)

        if self.stress == Stress.PRIMARY:
            pieces.append(IPA.STRESS_PRIMARY)
        elif self.stress == Stress.SECONDARY:
            pieces.append(IPA.STRESS_SECONDARY)

        # Letters and diacritics
        for letter_index, letter in enumerate(self.letters):
            pieces.append(letter)

            # Diacritics are stored in a set; sort them so the emitted order
            # (and therefore the text) does not depend on hash ordering.
            pieces.extend(sorted(self.diacritics.get(letter_index, ())))

        # Tone
        if self.tone:
            pieces.append(self.tone)

        # Post-letter suprasegmentals
        if self.is_long:
            pieces.append(IPA.LONG)

        # Re-normalize and combine
        self._text = unicodedata.normalize("NFC", "".join(pieces))

        return self._text

    @property
    def is_vowel(self) -> bool:
        """True if phone is a vowel"""
        return self.vowel is not None

    @property
    def is_consonant(self) -> bool:
        """True if phone is a consonant"""
        return self.consonant is not None

    @property
    def is_schwa(self) -> bool:
        """True if phone is a schwa"""
        return self.schwa is not None

    def __repr__(self) -> str:
        return self.text

    @staticmethod
    def from_string(phone_str: str) -> "Phone":
        """Parse phone from string.

        The string is NFD-decomposed and each codepoint is classified as an
        accent, stress, tone, length, nasal or raised marker, a tie, another
        combining diacritic, or a letter.  Brackets and breaks are skipped.
        """
        # Decompose into base and combining characters
        codepoints = unicodedata.normalize("NFD", phone_str)

        letters = ""
        tone = ""
        stress: typing.Optional[Stress] = None
        is_long = False
        accents: typing.List[Accent] = []
        nasal: typing.Set[int] = set()
        raised: typing.Set[int] = set()
        diacritics: typing.Dict[int, typing.Set[str]] = defaultdict(set)

        in_tone = False
        new_letter = False

        # Index into letters of the most recently added character.  Ties are
        # kept in letters, so they occupy an index of their own.
        letter_index = 0

        for c in codepoints:
            # Check for stress
            if (c == IPA.ACCENT_ACUTE) and not in_tone:
                accents.append(Accent.ACUTE)
            elif (c == IPA.ACCENT_GRAVE) and not in_tone:
                accents.append(Accent.GRAVE)
            elif c == IPA.STRESS_PRIMARY:
                stress = Stress.PRIMARY
            elif c == IPA.STRESS_SECONDARY:
                stress = Stress.SECONDARY
            elif in_tone and (c in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}):
                # Interpret as part of tone
                tone += c
            elif IPA.is_long(c):
                # Check for elongation
                is_long = True
            elif IPA.is_nasal(c):
                # Check for nasalization
                nasal.add(letter_index)
            elif IPA.is_raised(c):
                # Check for raised articulation
                raised.add(letter_index)
            elif IPA.is_bracket(c) or IPA.is_break(c):
                # Skip brackets/syllable breaks
                pass
            elif IPA.is_tie(c):
                # Keep ties in letters
                letters += c
                letter_index += 1
            elif IPA.is_tone(c):
                # Tone numbers/letters
                tone += c
                in_tone = True
            elif unicodedata.combining(c) > 0:
                # Stow some diacritics that we don't do anything with
                diacritics[letter_index].add(c)
            else:
                # Include all other characters in letters
                letters += c

                # The first letter keeps index 0; later letters advance it
                if new_letter:
                    letter_index += 1

                new_letter = True

        return Phone(
            letters=letters,
            stress=stress,
            accents=accents,
            is_long=is_long,
            nasal=nasal,
            raised=raised,
            diacritics=diacritics,
            tone=tone,
        )
# -----------------------------------------------------------------------------
class Pronunciation:
    """Ordered phones, breaks, and intonation markers for a unit of text."""

    def __init__(
        self, phones_and_others: typing.List[typing.Union[Phone, Break, Intonation]]
    ):
        self.phones_and_others = phones_and_others
        self._text = ""

        self.phones: typing.List[Phone] = []
        self.breaks: typing.List[Break] = []
        self.intonations: typing.List[Intonation] = []

        # Sort every item into the first bucket whose type it matches
        buckets = (
            (Phone, self.phones),
            (Break, self.breaks),
            (Intonation, self.intonations),
        )

        for item in self.phones_and_others:
            for item_type, bucket in buckets:
                if isinstance(item, item_type):
                    bucket.append(item)
                    break

    @property
    def text(self) -> str:
        """Concatenated text of all items (cached once non-empty)"""
        if self._text:
            return self._text

        self._text = "".join(item.text for item in self.phones_and_others)
        return self._text

    def __repr__(self) -> str:
        return self.text

    def __iter__(self):
        return iter(self.phones_and_others)

    def __getitem__(self, idx):
        return self.phones_and_others[idx]

    @staticmethod
    def from_string(
        pron_str: str,
        keep_stress: bool = True,
        keep_accents: typing.Optional[bool] = None,
        drop_tones: bool = False,
        keep_ties: bool = True,
    ) -> "Pronunciation":
        """Build a Pronunciation from an IPA string.

        The string is NFD-decomposed and grouped into clusters:

        * stress/accent markers attach to the following non-combining
          codepoint (e.g., ˈa)
        * elongation markers attach to the preceding cluster (e.g., aː)
        * a tie glues the next non-combining codepoint onto the current
          cluster (e.g. t͡ʃ) unless keep_ties is False
        * tone codepoints are appended to the end of the current cluster
          unless drop_tones is True
        * whitespace, brackets, and syllable breaks are dropped

        keep_accents defaults to the value of keep_stress.  Each cluster is
        then parsed as a Break, an Intonation, or a Phone.
        """
        if keep_accents is None:
            keep_accents = keep_stress

        pieces: typing.List[str] = []

        # State of the cluster being assembled
        current = ""
        pending_accents = ""
        pending_stress = ""
        pending_tone = ""

        # Set by the first tone codepoint and never cleared afterwards
        seen_tone = False

        # When set, the next non-combining codepoint joins the current cluster
        join_next = False

        for char in unicodedata.normalize("NFD", pron_str):
            if (
                char.isspace()
                or IPA.is_bracket(char)
                or (char in {IPA.BREAK_SYLLABLE})
            ):
                # Nothing to keep for whitespace, brackets, syllable breaks
                continue

            # Minor/major/word breaks and intonation markers always close the
            # running cluster and become clusters of their own.
            flush = bool(IPA.is_break(char) or IPA.is_intonation(char))

            marker_is_accent = False
            marker_is_stress = False

            if IPA.is_accent(char) and not seen_tone:
                marker_is_accent = True
                if current:
                    flush = True
                    join_next = True
            elif IPA.is_stress(char):
                marker_is_stress = True
                if current:
                    flush = True
                    join_next = True
            elif seen_tone and (char in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}):
                # Continuation of a tone
                if not drop_tones:
                    pending_tone += char
                continue
            elif IPA.is_long(char):
                # Length marker stays with the current cluster
                pass
            elif IPA.is_tie(char):
                if not keep_ties:
                    # Tie is discarded entirely
                    continue
                # The following non-combining codepoint joins this cluster
                join_next = True
            elif IPA.is_tone(char):
                # Tone goes on the end of the current cluster
                if not drop_tones:
                    pending_tone += char
                seen_tone = True
                continue
            elif unicodedata.combining(char) == 0:
                # Base (non-combining) character
                if join_next:
                    join_next = False
                elif current:
                    flush = True

            if flush and current:
                pieces.append(
                    pending_accents + pending_stress + current + pending_tone
                )
                pending_accents = ""
                pending_stress = ""
                current = ""
                pending_tone = ""

            if marker_is_accent:
                if keep_accents:
                    pending_accents += char
            elif marker_is_stress:
                if keep_stress:
                    pending_stress += char
            else:
                current += char

        # Whatever is left over forms the last cluster
        if current:
            pieces.append(pending_accents + pending_stress + current + pending_tone)

        def parse_piece(piece: str) -> typing.Union[Phone, Break, Intonation]:
            if IPA.is_break(piece):
                return Break.from_string(piece)

            if IPA.is_intonation(piece):
                return Intonation.from_string(piece)

            return Phone.from_string(piece)

        return Pronunciation([parse_piece(piece) for piece in pieces])
# -----------------------------------------------------------------------------
class Phoneme:
    """Phoneme composed of international phonetic alphabet symbols.

    When ``is_ipa`` is True the text is NFD-decomposed and stress, accents,
    tone, elongation, nasalization and raising are separated from the base
    letters.  Otherwise the text is used as the letters unchanged.
    """

    def __init__(
        self,
        text: str,
        example: str = "",
        unknown: bool = False,
        tones: typing.Optional[typing.Iterable[str]] = None,
        is_ipa: bool = True,
    ):
        # Lazily built by the text / text_compare properties
        self._text = ""
        self._text_compare = ""

        self.example = example
        self.unknown = unknown

        # Remembered so that copy() can rebuild an equivalent phoneme
        self.is_ipa = is_ipa

        # List of allowable tones for phoneme
        self.tones = list(tones or [])

        self.stress: typing.Optional[Stress] = None
        self.accents: typing.List[Accent] = []
        self.elongated: bool = False
        self.nasalated: typing.Set[int] = set()
        self.raised: typing.Set[int] = set()
        self._extra_combining: typing.Dict[int, typing.List[str]] = defaultdict(list)

        # Decompose into base and combining characters
        codepoints = unicodedata.normalize("NFD", text)
        self.letters = ""
        self.tone = ""

        if is_ipa:
            in_tone = False
            letter_index = 0
            new_letter = False

            for c in codepoints:
                # Check for stress
                if (c == IPA.ACCENT_ACUTE) and (not in_tone):
                    self.accents.append(Accent.ACUTE)
                elif (c == IPA.ACCENT_GRAVE) and (not in_tone):
                    self.accents.append(Accent.GRAVE)
                elif c == IPA.STRESS_PRIMARY:
                    self.stress = Stress.PRIMARY
                elif c == IPA.STRESS_SECONDARY:
                    self.stress = Stress.SECONDARY
                elif in_tone and (c in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}):
                    # Interpret as part of tone
                    self.tone += c
                elif IPA.is_long(c):
                    # Check for elongation
                    self.elongated = True
                elif IPA.is_nasal(c):
                    # Check for nasalization
                    self.nasalated.add(letter_index)
                elif IPA.is_raised(c):
                    # Check for raised articulation
                    self.raised.add(letter_index)
                elif IPA.is_bracket(c) or IPA.is_break(c):
                    # Skip brackets/syllable breaks
                    pass
                elif IPA.is_tone(c):
                    # Keep tone separate
                    self.tone += c
                    in_tone = True
                elif c in {IPA.SYLLABIC, IPA.NON_SYLLABIC, IPA.EXTRA_SHORT}:
                    # Stow some diacritics that we don't do anything with
                    self._extra_combining[letter_index].append(c)
                else:
                    # Include all other characters in base
                    self.letters += c

                    # The first letter keeps index 0; later letters advance it
                    if new_letter:
                        letter_index += 1

                    new_letter = True
        else:
            self.letters = text

        # Re-normalize and combine letters
        self.letters = unicodedata.normalize("NFC", self.letters)
        self.letters_graphemes = IPA.graphemes(self.letters)

        # Categorize
        self.vowel: typing.Optional[Vowel] = VOWELS.get(self.letters)
        self.consonant: typing.Optional[Consonant] = CONSONANTS.get(self.letters)
        self.schwa: typing.Optional[Schwa] = SCHWAS.get(self.letters)
        self.dipthong: typing.Optional[Dipthong] = None

        if (
            (not self.vowel)
            and (not self.consonant)
            and (not self.schwa)
            and (len(self.letters) == 2)
        ):
            # Check if dipthong (two vowels)
            vowel1 = VOWELS.get(self.letters[0])
            vowel2 = VOWELS.get(self.letters[1])
            if vowel1 and vowel2:
                self.dipthong = Dipthong(vowel1, vowel2)

    def _letters_with_marks(self) -> typing.List[str]:
        """Return each letter followed by its nasal/raised/extra marks"""
        pieces: typing.List[str] = []

        for letter_index, letter in enumerate(self.letters):
            pieces.append(letter)

            if letter_index in self.nasalated:
                pieces.append(IPA.NASAL)

            if letter_index in self.raised:
                pieces.append(IPA.RAISED)

            # .get avoids inserting empty lists into the defaultdict on read
            pieces.extend(self._extra_combining.get(letter_index, ()))

        return pieces

    @property
    def text(self) -> str:
        """Return letters with stress and elongation (NFC normalized).

        Order: accents, stress, letters with their marks, tone, length
        marker.  The result is cached after the first call.
        """
        if self._text:
            return self._text

        pieces: typing.List[str] = []

        for accent in self.accents:
            if accent == Accent.ACUTE:
                pieces.append(IPA.ACCENT_ACUTE)
            elif accent == Accent.GRAVE:
                pieces.append(IPA.ACCENT_GRAVE)

        if self.stress == Stress.PRIMARY:
            pieces.append(IPA.STRESS_PRIMARY)
        elif self.stress == Stress.SECONDARY:
            pieces.append(IPA.STRESS_SECONDARY)

        pieces.extend(self._letters_with_marks())

        if self.tone:
            pieces.append(self.tone)

        if self.elongated:
            pieces.append(IPA.LONG)

        # Re-normalize and combine
        self._text = unicodedata.normalize("NFC", "".join(pieces))

        return self._text

    @property
    def text_compare(self) -> str:
        """Return letters and elongation with no stress/tones (NFC normalized)"""
        if self._text_compare:
            return self._text_compare

        pieces = self._letters_with_marks()

        if self.elongated:
            pieces.append(IPA.LONG)

        # Re-normalize and combine
        self._text_compare = unicodedata.normalize("NFC", "".join(pieces))

        return self._text_compare

    def copy(self) -> "Phoneme":
        """Create a copy of this phoneme.

        The allowable tones and the is_ipa flag are carried over so the copy
        is parsed the same way as the original.
        """
        return Phoneme(
            text=self.text,
            example=self.example,
            unknown=self.unknown,
            tones=self.tones,
            is_ipa=self.is_ipa,
        )

    def __repr__(self) -> str:
        """Return symbol with stress and elongation."""
        return self.text

    def to_dict(self) -> typing.Dict[str, typing.Any]:
        """Return properties of phoneme as a dict.

        "type" is the category name ("Phoneme", "Vowel", "Consonant",
        "Dipthong" or "Schwa").  For consonants the manner of articulation is
        reported separately under "consonant_type".
        """
        type_name = "Phoneme"

        props: typing.Dict[str, typing.Any] = {
            "text": repr(self),
            "letters": self.letters,
            "tone": self.tone,
            "tones": self.tones,
        }

        if self.unknown:
            props["unknown"] = True

        if self.example:
            props["example"] = self.example

        props["accents"] = [a.value for a in self.accents]
        props["stress"] = self.stress.value if self.stress is not None else ""

        if self.vowel:
            type_name = "Vowel"
            props["height"] = self.vowel.height.value
            props["placement"] = self.vowel.placement.value
            props["rounded"] = self.vowel.rounded
        elif self.consonant:
            type_name = "Consonant"

            # Set here as well so "type" keeps its position in the dict
            props["type"] = type_name

            # Kept under its own key; storing it in "type" would be
            # overwritten by the category name below.
            props["consonant_type"] = self.consonant.type.value
            props["place"] = self.consonant.place.value
            props["voiced"] = self.consonant.voiced
        elif self.dipthong:
            type_name = "Dipthong"
        elif self.schwa:
            type_name = "Schwa"
            props["r_coloured"] = self.schwa.r_coloured

        props["type"] = type_name

        props["nasalated"] = list(self.nasalated)
        props["raised"] = list(self.raised)
        props["elongated"] = self.elongated

        return props

    def to_string(self) -> str:
        """Return descriptive string of phoneme"""
        props = self.to_dict()
        type_name = props.get("type", "Phoneme")

        prop_strs = [f"{k}={v}" for k, v in props.items()]

        return f"{type_name}(" + ", ".join(prop_strs) + ")"
# -----------------------------------------------------------------------------
class Phonemes:
    """Set of phonemes and allophones for a language"""

    COMMENT_STR = "#"

    def __init__(self, phonemes=None, ipa_map=None):
        self.phonemes = phonemes or []
        self.ipa_map = ipa_map or {}

        # Regex for replacing IPA
        self._ipa_map_regex = None

        # Phonemes sorted by decreasing length
        self._phonemes_sorted = None

        # Map from original phoneme to gruut IPA
        self.gruut_ipa_map: typing.Dict[str, str] = {}

        # Texts of all phonemes (rebuilt by update)
        self.phoneme_texts: typing.Set[str] = set()

        self.update()

    def __iter__(self):
        return iter(self.phonemes)

    def __len__(self):
        return len(self.phonemes)

    def __getitem__(self, key):
        return self.phonemes[key]

    def __contains__(self, item):
        if isinstance(item, str):
            # Compare IPA text
            return item in self.phoneme_texts

        return item in self.phonemes

    @staticmethod
    def from_language(language: str) -> "Phonemes":
        """Load phonemes for a given language.

        Reads ``phonemes.txt`` from the language's data directory and, if an
        ``ipa_map.txt`` file exists next to it, loads it into
        ``gruut_ipa_map``.  Every non-empty map line must hold at least two
        whitespace-separated fields; otherwise the unpacking raises
        ValueError.
        """
        language = resolve_lang(language)

        # Load phonemes themselves
        phonemes_path = _DATA_DIR / language / "phonemes.txt"
        with open(phonemes_path, "r", encoding="utf-8") as phonemes_file:
            phonemes = Phonemes.from_text(phonemes_file)

        # Try to load optional map from original phoneme to gruut IPA
        gruut_ipa_map: typing.Optional[typing.Dict[str, str]] = None
        map_path = _DATA_DIR / language / "ipa_map.txt"
        if map_path.is_file():
            gruut_ipa_map = {}
            with open(map_path, "r", encoding="utf-8") as map_file:
                for line in map_file:
                    line = line.strip()
                    if not line:
                        continue

                    from_phoneme, to_ipa = line.split(maxsplit=1)
                    gruut_ipa_map[from_phoneme] = to_ipa

        if gruut_ipa_map:
            phonemes.gruut_ipa_map = gruut_ipa_map

        return phonemes

    @staticmethod
    def from_text(text_file) -> "Phonemes":
        """Load text file with phonemes, examples, and allophones.

        ``text_file`` is any iterable of lines.  Text after COMMENT_STR is
        ignored.  Each remaining line has the form::

            phoneme [example] [allophone] [allophone] ! [tone] [tone]...
        """
        lang = Phonemes()

        for line in text_file:
            # Remove comments
            line, *_ = line.split(Phonemes.COMMENT_STR, maxsplit=1)
            line = line.strip()
            if not line:
                continue

            parts = line.split()
            phoneme_ipa = parts[0]
            example = parts[1] if len(parts) > 1 else ""

            tones: typing.List[str] = []
            in_tone = False

            for part in parts[2:]:
                if part == "!":
                    # Begin possible tones for this phoneme
                    in_tone = True
                elif in_tone:
                    tones.append(part)
                else:
                    # Map allophone back to phoneme
                    lang.ipa_map[part] = phoneme_ipa

            lang.phonemes.append(
                Phoneme(text=phoneme_ipa, example=example, tones=tones)
            )

        lang.update()

        return lang

    def update(self):
        """Call after modifying phonemes or IPA map to re-sort"""
        # Create single regex that will be used to replace IPA.
        # The final regex is of the form (AAA|BB|C) where each case is in
        # decreasing length order.
        #
        # If the replacement is not a substring of any phonemes, then the
        # replacement is straightforward.
        #
        # If it is a prefix of some phoneme, however, we need to be careful.
        # For example, naively replacing "e" with "eɪ" in the string "beɪ" will
        # produce "beeɪ" when we want it to be "beɪ".
        #
        # So the prefix case becomes "e(?!ɪ)" which uses a negative lookahead
        # to avoid the problem.  When several phonemes start with the same
        # key, all of their remainders go into the lookahead: "e(?!ɪ|ə)".
        cases = []
        for match_text in sorted(self.ipa_map.keys(), key=len, reverse=True):
            if match_text.startswith(","):
                # Raw regex
                cases.append(match_text[1:])
                continue

            # Remainders of every phoneme that strictly extends match_text
            suffixes: typing.List[str] = []
            for phoneme in self.phonemes:
                phoneme_text = phoneme.text
                if (len(phoneme_text) > len(match_text)) and phoneme_text.startswith(
                    match_text
                ):
                    suffix = re.escape(phoneme_text[len(match_text) :])
                    if suffix not in suffixes:
                        suffixes.append(suffix)

            if suffixes:
                # The case must match the whole key (it is looked up in
                # ipa_map after matching) and reject only the part of the
                # phoneme that follows the key.
                cases.append(
                    "{}(?!{})".format(re.escape(match_text), "|".join(suffixes))
                )
            else:
                # No prefix problem
                cases.append(re.escape(match_text))

        ipa_map_regex_str = "({})".format("|".join(cases))
        self._ipa_map_regex = re.compile(ipa_map_regex_str)

        # Split phonemes and sort by reverse length
        split_phonemes = [
            ([pb.text for pb in Pronunciation.from_string(p.text)], p)
            for p in self.phonemes
        ]

        self._phonemes_sorted = sorted(
            split_phonemes, key=lambda kp: len(kp[0]), reverse=True
        )

        # Update IPA texts set for phonemes
        self.phoneme_texts = set(p.text for p in self.phonemes)

    def split(
        self,
        pron_str: typing.Union[str, Pronunciation],
        keep_stress: bool = True,
        keep_accents: typing.Optional[bool] = None,
        drop_tones: bool = False,
        is_ipa: bool = True,
    ) -> typing.List[Phoneme]:
        """Split an IPA pronunciation into phonemes.

        Allophones in ``ipa_map`` are replaced first.  The pronunciation is
        then broken into pieces, and at each position the candidate phoneme
        made of the most pieces is tried first.  Pieces that match no phoneme
        are returned as Phoneme objects with ``unknown=True``.

        keep_accents defaults to the value of keep_stress.  When is_ipa is
        False the string is split with IPA.graphemes and stress/tone markers
        are not separated.
        """
        if not self._ipa_map_regex:
            self.update()

        if keep_accents is None:
            keep_accents = keep_stress

        word_phonemes: typing.List[Phoneme] = []

        if self.ipa_map:
            if isinstance(pron_str, Pronunciation):
                pron_str = "".join(p.text for p in pron_str)

            def handle_replace(match):
                text = match.group(1)
                return self.ipa_map.get(text, text)

            pron_str = self._ipa_map_regex.sub(handle_replace, pron_str)

        # Get text for IPA phones.  Entries are set to None further down once
        # they have been consumed by a multi-piece phoneme.
        ipas: typing.List[typing.Optional[str]]
        if isinstance(pron_str, Pronunciation):
            # Use supplied pronunciation
            ipas = [pb.text for pb in pron_str]
        elif is_ipa:
            # Split string into pronunciation
            pron = Pronunciation.from_string(
                pron_str,
                keep_stress=keep_stress,
                keep_accents=keep_accents,
                drop_tones=drop_tones,
            )
            ipas = [pb.text for pb in pron]
        else:
            ipas = IPA.graphemes(pron_str)

        # Keep stress and tones separate to make phoneme comparisons easier
        ipa_stress: typing.Dict[int, str] = defaultdict(str)
        ipa_tones: typing.Dict[int, str] = defaultdict(str)

        if is_ipa:
            # Not reset between pieces: once a tone has been seen, later
            # accent codepoints are no longer treated as accents.
            in_tone = False

            for ipa_idx, ipa in enumerate(ipas):
                if ipa:
                    keep_ipa = ""
                    for codepoint in ipa:
                        if IPA.is_accent(codepoint) and (not in_tone):
                            if keep_accents:
                                ipa_stress[ipa_idx] += codepoint
                        elif IPA.is_stress(codepoint):
                            if keep_stress:
                                ipa_stress[ipa_idx] += codepoint
                        elif in_tone and (
                            codepoint in {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}
                        ):
                            # Interpret as part of tone
                            if not drop_tones:
                                ipa_tones[ipa_idx] += codepoint
                        elif IPA.is_tone(codepoint):
                            if not drop_tones:
                                ipa_tones[ipa_idx] += codepoint

                            in_tone = True
                        else:
                            keep_ipa += codepoint

                    ipas[ipa_idx] = keep_ipa

        num_ipas: int = len(ipas)

        # ---------------------------------------------------------------------

        # Later entries are overwritten with None while iterating; the list
        # is never resized, so enumerate sees the updated values.
        for ipa_idx, ipa in enumerate(ipas):
            if ipa is None:
                # Skip replaced piece
                continue

            phoneme_match = False
            for phoneme_ipas, phoneme in self._phonemes_sorted:
                if ipa_idx > (num_ipas - len(phoneme_ipas)):
                    # Not enough pieces left for this candidate
                    continue

                phoneme_match = True
                phoneme_stress = ""
                phoneme_tones = ""

                # Look forward into sequence
                for phoneme_idx, phoneme_ipa in enumerate(phoneme_ipas):
                    phoneme_stress += ipa_stress[ipa_idx + phoneme_idx]
                    phoneme_tones += ipa_tones[ipa_idx + phoneme_idx]

                    if phoneme_ipa != ipas[ipa_idx + phoneme_idx]:
                        phoneme_match = False
                        break

                if phoneme_match:
                    # Successful match
                    if phoneme_stress or phoneme_tones:
                        # Create a copy of the phoneme with applied
                        # stress/tones, keeping its allowable tones
                        phoneme = Phoneme(
                            text=(phoneme_stress + phoneme.text + phoneme_tones),
                            example=phoneme.example,
                            tones=phoneme.tones,
                        )

                    word_phonemes.append(phoneme)

                    # Patch ipas to skip replaced pieces
                    for phoneme_idx in range(1, len(phoneme_ipas)):
                        ipas[ipa_idx + phoneme_idx] = None

                    break

            if not phoneme_match:
                # Add unknown phoneme
                word_phonemes.append(Phoneme(text=ipa, unknown=True))

        return word_phonemes