ai-content-maker/.venv/Lib/site-packages/gruut_ipa/phonemes.py

926 lines
30 KiB
Python
Raw Normal View History

2024-05-03 04:18:51 +03:00
#!/usr/bin/env python3
"""Functions for manipulating phones/phonemes"""
import logging
import re
import typing
import unicodedata
from collections import defaultdict
from gruut_ipa.constants import ( # noqa: F401
_DATA_DIR,
_DIR,
CONSONANTS,
FEATURE_COLUMNS,
FEATURE_EMPTY,
FEATURE_KEYS,
FEATURE_ORDINAL_COLUMNS,
IPA,
LANG_ALIASES,
SCHWAS,
VOWELS,
Accent,
Break,
BreakType,
Consonant,
ConsonantPlace,
ConsonantType,
Dipthong,
Intonation,
PhonemeLength,
Schwa,
Stress,
Vowel,
VowelHeight,
VowelPlacement,
)
from gruut_ipa.utils import resolve_lang
_LOGGER = logging.getLogger("gruut_ipa")
# -----------------------------------------------------------------------------
class Phone:
    """Single IPA phone with diacritics and suprasegmentals.

    A phone is described by its base ``letters`` plus optional stress,
    accents, length, tone, and per-letter diacritics.  ``nasal`` and ``raised``
    hold the indexes (into ``letters``) of the letters that carry those marks;
    the corresponding IPA marks are also recorded in ``diacritics``.
    """

    def __init__(
        self,
        letters: str,
        stress: typing.Optional[Stress] = None,
        accents: typing.Optional[typing.Iterable[Accent]] = None,
        is_long: bool = False,
        nasal: typing.Optional[typing.Set[int]] = None,
        raised: typing.Optional[typing.Set[int]] = None,
        diacritics: typing.Optional[typing.Dict[int, typing.Set[str]]] = None,
        suprasegmentals: typing.Optional[typing.Set[str]] = None,
        tone: str = "",
    ):
        """Create a phone.

        Args:
            letters: Base letters of the phone (stored NFC normalized).
            stress: Primary/secondary stress, or None for no stress.
            accents: Accents placed before the letters.
            is_long: True if the phone carries the elongation marker.
            nasal: Indexes of letters that are nasalized.
            raised: Indexes of letters with raised articulation.
            diacritics: Extra diacritics, keyed by letter index.
            suprasegmentals: Extra suprasegmental symbols.
            tone: Tone letters/numbers appended after the letters.
        """
        self.letters: str = unicodedata.normalize("NFC", letters)
        self.stress = stress
        self.accents: typing.List[Accent] = list(accents or [])
        self.is_long: bool = is_long

        # Containers are copied so that the additions made below never leak
        # back into objects owned by the caller.
        self.nasal: typing.Set[int] = set(nasal or ())
        self.is_nasal = bool(self.nasal)

        self.raised: typing.Set[int] = set(raised or ())
        self.is_raised = bool(self.raised)

        self.tone: str = tone

        self.diacritics: typing.Dict[int, typing.Set[str]] = defaultdict(set)
        for letter_index, marks in (diacritics or {}).items():
            self.diacritics[letter_index] = set(marks)

        self.suprasegmentals: typing.Set[str] = set(suprasegmentals or ())

        # Decompose suprasegmentals and diacritics
        if self.stress == Stress.PRIMARY:
            self.suprasegmentals.add(IPA.STRESS_PRIMARY)
        elif self.stress == Stress.SECONDARY:
            self.suprasegmentals.add(IPA.STRESS_SECONDARY)

        if Accent.ACUTE in self.accents:
            self.suprasegmentals.add(IPA.ACCENT_ACUTE)

        if Accent.GRAVE in self.accents:
            self.suprasegmentals.add(IPA.ACCENT_GRAVE)

        if self.is_long:
            self.suprasegmentals.add(IPA.LONG)

        # Nasal
        for letter_index in self.nasal:
            self.diacritics[letter_index].add(IPA.NASAL)

        # Raised
        for letter_index in self.raised:
            self.diacritics[letter_index].add(IPA.RAISED)

        # Lazily computed by the ``text`` property
        self._text: str = ""

        # Categorize by exact match of the letters
        self.vowel: typing.Optional[Vowel] = VOWELS.get(self.letters)
        self.consonant: typing.Optional[Consonant] = CONSONANTS.get(self.letters)
        self.schwa: typing.Optional[Schwa] = SCHWAS.get(self.letters)

    @property
    def text(self) -> str:
        """Get textual representation of phone (NFC normalized).

        Order is: accents, stress, letters (each followed by its diacritics),
        tone, elongation.  The result is cached after the first call.
        """
        if self._text:
            return self._text

        parts: typing.List[str] = []

        # Pre-letter suprasegmentals
        for accent in self.accents:
            if accent == Accent.ACUTE:
                parts.append(IPA.ACCENT_ACUTE)
            elif accent == Accent.GRAVE:
                parts.append(IPA.ACCENT_GRAVE)

        if self.stress == Stress.PRIMARY:
            parts.append(IPA.STRESS_PRIMARY)
        elif self.stress == Stress.SECONDARY:
            parts.append(IPA.STRESS_SECONDARY)

        # Letters and diacritics
        for letter_index, letter in enumerate(self.letters):
            parts.append(letter)

            # Diacritics are stored in a set; sort them so the text does not
            # depend on hash ordering, which can vary between interpreter runs.
            parts.extend(sorted(self.diacritics.get(letter_index, ())))

        # Tone
        if self.tone:
            parts.append(self.tone)

        # Post-letter suprasegmentals
        if self.is_long:
            parts.append(IPA.LONG)

        # Re-normalize and combine
        self._text = unicodedata.normalize("NFC", "".join(parts))

        return self._text

    @property
    def is_vowel(self) -> bool:
        """True if phone is a vowel"""
        return self.vowel is not None

    @property
    def is_consonant(self) -> bool:
        """True if phone is a consonant"""
        return self.consonant is not None

    @property
    def is_schwa(self) -> bool:
        """True if phone is a schwa"""
        return self.schwa is not None

    def __repr__(self) -> str:
        return self.text

    @staticmethod
    def from_string(phone_str: str) -> "Phone":
        """Parse a single phone from a string.

        The string is NFD normalized and classified codepoint by codepoint
        into accents, stress, tone, length, nasal/raised marks, other
        diacritics, and letters.  Brackets and breaks are dropped.
        """
        # Decompose into base and combining characters
        codepoints = unicodedata.normalize("NFD", phone_str)

        letters = ""
        tone = ""
        stress: typing.Optional[Stress] = None
        is_long = False
        accents: typing.List[Accent] = []
        nasal: typing.Set[int] = set()
        raised: typing.Set[int] = set()
        diacritics: typing.Dict[int, typing.Set[str]] = defaultdict(set)

        tone_modifiers = {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}

        in_tone = False
        new_letter = False

        # Index (into letters) of the letter that marks currently attach to
        letter_index = 0

        for c in codepoints:
            if (c == IPA.ACCENT_ACUTE) and not in_tone:
                # Accent symbols only count as accents before any tone
                accents.append(Accent.ACUTE)
            elif (c == IPA.ACCENT_GRAVE) and not in_tone:
                accents.append(Accent.GRAVE)
            elif c == IPA.STRESS_PRIMARY:
                stress = Stress.PRIMARY
            elif c == IPA.STRESS_SECONDARY:
                stress = Stress.SECONDARY
            elif in_tone and (c in tone_modifiers):
                # Interpret as part of tone
                tone += c
            elif IPA.is_long(c):
                # Check for elongation
                is_long = True
            elif IPA.is_nasal(c):
                # Check for nasalation
                nasal.add(letter_index)
            elif IPA.is_raised(c):
                # Check for raised articulation
                raised.add(letter_index)
            elif IPA.is_bracket(c) or IPA.is_break(c):
                # Skip brackets/syllable breaks
                pass
            elif IPA.is_tie(c):
                # Keep ties in letters; they occupy a letter position
                letters += c
                letter_index += 1
            elif IPA.is_tone(c):
                # Tone numbers/letters
                tone += c
                in_tone = True
            elif unicodedata.combining(c) > 0:
                # Stow some diacritics that we don't do anything with
                diacritics[letter_index].add(c)
            else:
                # Include all other characters in letters
                letters += c

                # The first letter stays at index 0
                if new_letter:
                    letter_index += 1

                new_letter = True

        return Phone(
            letters=letters,
            stress=stress,
            accents=accents,
            is_long=is_long,
            nasal=nasal,
            raised=raised,
            diacritics=diacritics,
            tone=tone,
        )
# -----------------------------------------------------------------------------
class Pronunciation:
    """Ordered phones, breaks, and intonation markers for a unit of text."""

    def __init__(
        self, phones_and_others: typing.List[typing.Union[Phone, Break, Intonation]]
    ):
        self.phones_and_others = phones_and_others

        self.phones: typing.List[Phone] = []
        self.breaks: typing.List[Break] = []
        self.intonations: typing.List[Intonation] = []

        # Sort every item into the list for its type; anything else is
        # only kept in phones_and_others.
        for item in self.phones_and_others:
            if isinstance(item, Phone):
                bucket = self.phones
            elif isinstance(item, Break):
                bucket = self.breaks
            elif isinstance(item, Intonation):
                bucket = self.intonations
            else:
                continue

            bucket.append(item)

        # Filled in on first access of ``text``
        self._text = ""

    @property
    def text(self) -> str:
        """Concatenated text of all items, computed once and then cached"""
        if self._text:
            return self._text

        self._text = "".join(item.text for item in self.phones_and_others)
        return self._text

    def __repr__(self) -> str:
        return self.text

    def __iter__(self):
        return iter(self.phones_and_others)

    def __getitem__(self, idx):
        return self.phones_and_others[idx]

    @staticmethod
    def from_string(
        pron_str: str,
        keep_stress: bool = True,
        keep_accents: typing.Optional[bool] = None,
        drop_tones: bool = False,
        keep_ties: bool = True,
    ) -> "Pronunciation":
        """Parse an IPA string into a Pronunciation.

        The string is NFD normalized and grouped into clusters, one per phone,
        break, or intonation marker:

        * stress/accent markers attach to the following letter (e.g., ˈa)
        * elongation markers attach to the preceding letter (e.g., aː)
        * a tie joins the letters on either side of it (e.g. t͡ʃ)
        * tones are appended to the end of the current cluster
        * whitespace, brackets, and syllable breaks are discarded

        Args:
            pron_str: IPA text to parse.
            keep_stress: Keep stress markers on phones.
            keep_accents: Keep accent markers; defaults to keep_stress.
            drop_tones: Discard tone symbols instead of keeping them.
            keep_ties: Keep ties (joining both sides into one phone).
        """
        if keep_accents is None:
            keep_accents = keep_stress

        finished: typing.List[str] = []

        # Pieces of the cluster under construction
        current = ""
        pending_accents = ""
        pending_stress = ""
        pending_tone = ""

        tone_seen = False

        # When set, the next letter joins the current cluster
        join_next = False

        tone_modifiers = {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}

        for char in unicodedata.normalize("NFD", pron_str):
            if (
                char.isspace()
                or IPA.is_bracket(char)
                or (char in {IPA.BREAK_SYLLABLE})
            ):
                # Nothing to keep for whitespace, brackets, syllable breaks
                continue

            # Minor/major/word breaks and intonation markers always close
            # the cluster before them.
            flush = bool(IPA.is_break(char) or IPA.is_intonation(char))

            char_is_accent = False
            char_is_stress = False

            if IPA.is_accent(char) and not tone_seen:
                char_is_accent = True
                if current:
                    flush = True
                    join_next = True
            elif IPA.is_stress(char):
                char_is_stress = True
                if current:
                    flush = True
                    join_next = True
            elif tone_seen and (char in tone_modifiers):
                # Belongs to the tone
                if not drop_tones:
                    pending_tone += char

                continue
            elif IPA.is_long(char):
                # Falls through and is appended to the current cluster
                pass
            elif IPA.is_tie(char):
                if not keep_ties:
                    # Tie is dropped entirely
                    continue

                # Letter after the tie stays in this cluster
                join_next = True
            elif IPA.is_tone(char):
                # Goes at the end of the current cluster
                if not drop_tones:
                    pending_tone += char

                tone_seen = True
                continue
            elif unicodedata.combining(char) == 0:
                # Base (non-combining) character
                if join_next:
                    join_next = False
                elif current:
                    flush = True

            if flush and current:
                finished.append(
                    pending_accents + pending_stress + current + pending_tone
                )
                pending_accents = ""
                pending_stress = ""
                current = ""
                pending_tone = ""

            if char_is_accent:
                if keep_accents:
                    pending_accents += char
            elif char_is_stress:
                if keep_stress:
                    pending_stress += char
            else:
                current += char

        # Whatever is left forms the last cluster
        if current:
            finished.append(pending_accents + pending_stress + current + pending_tone)

        items: typing.List[typing.Union[Phone, Break, Intonation]] = []
        for cluster_text in finished:
            if IPA.is_break(cluster_text):
                items.append(Break.from_string(cluster_text))
            elif IPA.is_intonation(cluster_text):
                items.append(Intonation.from_string(cluster_text))
            else:
                items.append(Phone.from_string(cluster_text))

        return Pronunciation(items)
# -----------------------------------------------------------------------------
class Phoneme:
    """Phoneme composed of international phonetic alphabet symbols.

    When ``is_ipa`` is true, the text is decomposed into letters, stress,
    accents, tone, elongation, and nasal/raised marks.  Otherwise the text is
    used verbatim as the letters.
    """

    def __init__(
        self,
        text: str,
        example: str = "",
        unknown: bool = False,
        tones: typing.Optional[typing.Iterable[str]] = None,
        is_ipa: bool = True,
    ):
        """Create a phoneme.

        Args:
            text: Text of the phoneme.
            example: Example for the phoneme (stored as given).
            unknown: True if the phoneme is not part of a known phoneme set.
            tones: Allowable tones for this phoneme.
            is_ipa: Parse ``text`` as IPA; if False, use it as-is for letters.
        """
        self._text = ""
        self._text_compare = ""

        # Remembered so that copy() can rebuild the phoneme the same way
        self._is_ipa = is_ipa

        self.example = example
        self.unknown = unknown

        # List of allowable tones for phoneme
        self.tones = list(tones or [])

        self.stress: typing.Optional[Stress] = None
        self.accents: typing.List[Accent] = []
        self.elongated: bool = False
        self.nasalated: typing.Set[int] = set()
        self.raised: typing.Set[int] = set()
        self._extra_combining: typing.Dict[int, typing.List[str]] = defaultdict(list)

        # Decompose into base and combining characters
        codepoints = unicodedata.normalize("NFD", text)
        self.letters = ""
        self.tone = ""

        if is_ipa:
            tone_modifiers = {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}
            kept_combining = {IPA.SYLLABIC, IPA.NON_SYLLABIC, IPA.EXTRA_SHORT}

            in_tone = False
            letter_index = 0
            new_letter = False

            for c in codepoints:
                if (c == IPA.ACCENT_ACUTE) and (not in_tone):
                    # Accent symbols only count as accents before any tone
                    self.accents.append(Accent.ACUTE)
                elif (c == IPA.ACCENT_GRAVE) and (not in_tone):
                    self.accents.append(Accent.GRAVE)
                elif c == IPA.STRESS_PRIMARY:
                    self.stress = Stress.PRIMARY
                elif c == IPA.STRESS_SECONDARY:
                    self.stress = Stress.SECONDARY
                elif in_tone and (c in tone_modifiers):
                    # Interpret as part of tone
                    self.tone += c
                elif IPA.is_long(c):
                    # Check for elongation
                    self.elongated = True
                elif IPA.is_nasal(c):
                    # Check for nasalation
                    self.nasalated.add(letter_index)
                elif IPA.is_raised(c):
                    # Check for raised articulation
                    self.raised.add(letter_index)
                elif IPA.is_bracket(c) or IPA.is_break(c):
                    # Skip brackets/syllable breaks
                    pass
                elif IPA.is_tone(c):
                    # Keep tone separate
                    self.tone += c
                    in_tone = True
                elif c in kept_combining:
                    # Stow some diacritics that we don't do anything with
                    self._extra_combining[letter_index].append(c)
                else:
                    # Include all other characters in base
                    self.letters += c

                    # The first letter stays at index 0
                    if new_letter:
                        letter_index += 1

                    new_letter = True
        else:
            self.letters = text

        # Re-normalize and combine letters
        self.letters = unicodedata.normalize("NFC", self.letters)
        self.letters_graphemes = IPA.graphemes(self.letters)

        # Categorize
        self.vowel: typing.Optional[Vowel] = VOWELS.get(self.letters)
        self.consonant: typing.Optional[Consonant] = CONSONANTS.get(self.letters)
        self.schwa: typing.Optional[Schwa] = SCHWAS.get(self.letters)
        self.dipthong: typing.Optional[Dipthong] = None

        if (
            (not self.vowel)
            and (not self.consonant)
            and (not self.schwa)
            and (len(self.letters) == 2)
        ):
            # Check if dipthong (two vowels)
            vowel1 = VOWELS.get(self.letters[0])
            vowel2 = VOWELS.get(self.letters[1])
            if vowel1 and vowel2:
                self.dipthong = Dipthong(vowel1, vowel2)

    def _letters_with_marks(self) -> str:
        """Return letters, each followed by its nasal/raised/extra marks"""
        parts: typing.List[str] = []
        for letter_index, letter in enumerate(self.letters):
            parts.append(letter)

            if letter_index in self.nasalated:
                parts.append(IPA.NASAL)

            if letter_index in self.raised:
                parts.append(IPA.RAISED)

            # .get() so that reading never adds keys to the defaultdict
            parts.extend(self._extra_combining.get(letter_index, ()))

        return "".join(parts)

    @property
    def text(self) -> str:
        """Return letters with stress and elongation (NFC normalized).

        Order is: accents, stress, letters with marks, tone, elongation.
        The result is cached after the first call.
        """
        if self._text:
            return self._text

        parts: typing.List[str] = []

        for accent in self.accents:
            if accent == Accent.ACUTE:
                parts.append(IPA.ACCENT_ACUTE)
            elif accent == Accent.GRAVE:
                parts.append(IPA.ACCENT_GRAVE)

        if self.stress == Stress.PRIMARY:
            parts.append(IPA.STRESS_PRIMARY)
        elif self.stress == Stress.SECONDARY:
            parts.append(IPA.STRESS_SECONDARY)

        parts.append(self._letters_with_marks())

        if self.tone:
            parts.append(self.tone)

        if self.elongated:
            parts.append(IPA.LONG)

        # Re-normalize and combine
        self._text = unicodedata.normalize("NFC", "".join(parts))

        return self._text

    @property
    def text_compare(self) -> str:
        """Return letters and elongation with no stress/tones (NFC normalized)"""
        if self._text_compare:
            return self._text_compare

        compare_text = self._letters_with_marks()

        if self.elongated:
            compare_text += IPA.LONG

        # Re-normalize and combine
        self._text_compare = unicodedata.normalize("NFC", compare_text)

        return self._text_compare

    def copy(self) -> "Phoneme":
        """Create a copy of this phoneme.

        The copy is rebuilt from ``text`` and keeps the example, unknown flag,
        allowable tones, and IPA/non-IPA interpretation of the original.
        """
        return Phoneme(
            text=self.text,
            example=self.example,
            unknown=self.unknown,
            tones=self.tones,
            is_ipa=self._is_ipa,
        )

    def __repr__(self) -> str:
        """Return symbol with stress and elongation."""
        return self.text

    def to_dict(self) -> typing.Dict[str, typing.Any]:
        """Return properties of phoneme as a dict"""
        type_name = "Phoneme"
        props: typing.Dict[str, typing.Any] = {
            "text": repr(self),
            "letters": self.letters,
            "tone": self.tone,
            "tones": self.tones,
        }

        if self.unknown:
            props["unknown"] = True

        if self.example:
            props["example"] = self.example

        props["accents"] = [a.value for a in self.accents]
        props["stress"] = self.stress.value if self.stress is not None else ""

        if self.vowel:
            type_name = "Vowel"
            props["height"] = self.vowel.height.value
            props["placement"] = self.vowel.placement.value
            props["rounded"] = self.vowel.rounded
        elif self.consonant:
            type_name = "Consonant"
            # NOTE(review): this value is replaced by type_name further down,
            # so the consonant's own type never reaches the caller. Kept as-is
            # to avoid changing the shape of the returned dict.
            props["type"] = self.consonant.type.value
            props["place"] = self.consonant.place.value
            props["voiced"] = self.consonant.voiced
        elif self.dipthong:
            type_name = "Dipthong"
        elif self.schwa:
            type_name = "Schwa"
            props["r_coloured"] = self.schwa.r_coloured

        props["type"] = type_name

        props["nasalated"] = list(self.nasalated)
        props["raised"] = list(self.raised)
        props["elongated"] = self.elongated

        return props

    def to_string(self) -> str:
        """Return descriptive string of phoneme, e.g. ``Vowel(text=..., ...)``"""
        props = self.to_dict()
        type_name = props.get("type", "Phoneme")

        prop_strs = [f"{k}={v}" for k, v in props.items()]

        return f"{type_name}(" + ", ".join(prop_strs) + ")"
# -----------------------------------------------------------------------------
class Phonemes:
    """Set of phonemes and allophones for a language.

    ``phonemes`` is the list of Phoneme objects and ``ipa_map`` maps
    alternative spellings (allophones) to the text of a phoneme.  Call
    ``update()`` after changing either one.
    """

    # Everything after this string on a line of a phonemes file is ignored
    COMMENT_STR = "#"

    def __init__(self, phonemes=None, ipa_map=None):
        self.phonemes = phonemes or []
        self.ipa_map = ipa_map or {}

        # Regex for replacing IPA
        self._ipa_map_regex = None

        # Phonemes sorted by decreasing length
        self._phonemes_sorted = None

        # Map from original phoneme to gruut IPA
        self.gruut_ipa_map: typing.Dict[str, str] = {}

        # IPA text of every phoneme (rebuilt by update)
        self.phoneme_texts: typing.Set[str] = set()

        self.update()

    def __iter__(self):
        return iter(self.phonemes)

    def __len__(self):
        return len(self.phonemes)

    def __getitem__(self, key):
        return self.phonemes[key]

    def __contains__(self, item):
        if isinstance(item, str):
            # Compare IPA text
            return item in self.phoneme_texts

        return item in self.phonemes

    @staticmethod
    def from_language(language: str) -> "Phonemes":
        """Load phonemes for a given language.

        Reads ``phonemes.txt`` from the language's data directory and, if
        present, ``ipa_map.txt`` (one ``<phoneme> <ipa>`` pair per line).
        """
        language = resolve_lang(language)

        # Load phonemes themselves
        phonemes_path = _DATA_DIR / language / "phonemes.txt"
        with open(phonemes_path, "r", encoding="utf-8") as phonemes_file:
            phonemes = Phonemes.from_text(phonemes_file)

        # Try to load optional map from original phoneme to gruut IPA
        gruut_ipa_map: typing.Optional[typing.Dict[str, str]] = None
        map_path = _DATA_DIR / language / "ipa_map.txt"
        if map_path.is_file():
            gruut_ipa_map = {}
            with open(map_path, "r", encoding="utf-8") as map_file:
                for line in map_file:
                    line = line.strip()
                    if not line:
                        continue

                    from_phoneme, to_ipa = line.split(maxsplit=1)
                    gruut_ipa_map[from_phoneme] = to_ipa

        if gruut_ipa_map:
            phonemes.gruut_ipa_map = gruut_ipa_map

        return phonemes

    @staticmethod
    def from_text(text_file) -> "Phonemes":
        """Load text file with phonemes, examples, and allophones.

        Each non-empty line (after removing comments) has the form::

            phoneme [example] [allophone] [allophone] ! [tone] [tone]...

        ``text_file`` is any iterable of lines.
        """
        lang = Phonemes()

        for line in text_file:
            # Remove comments
            line = line.split(Phonemes.COMMENT_STR, maxsplit=1)[0].strip()
            if not line:
                continue

            parts = line.split()
            phoneme_ipa = parts[0]
            example = parts[1] if len(parts) > 1 else ""

            tones = []
            in_tone = False
            for part in parts[2:]:
                if part == "!":
                    # Begin possible tones for this phoneme
                    in_tone = True
                elif in_tone:
                    tones.append(part)
                else:
                    # Map allophone back to phoneme
                    lang.ipa_map[part] = phoneme_ipa

            lang.phonemes.append(
                Phoneme(text=phoneme_ipa, example=example, tones=tones)
            )

        lang.update()

        return lang

    def update(self):
        """Call after modifying phonemes or IPA map to re-sort"""
        # Create single regex that will be used to replace IPA.
        # The final regex is of the form (AAA|BB|C) where each case is in
        # decreasing length order.
        #
        # If the replacement is not a substring of any phonemes, then the
        # replacement is straightforward.
        #
        # If it is a prefix of some phoneme, however, we need to be careful.
        # For example, naively replacing "e" with "eɪ" in the string "beɪ" will
        # produce "beeɪ" when we want it to be "beɪ".
        #
        # So the prefix case becomes "e(?!ɪ)" which uses a negative lookahead
        # to avoid the problem.
        cases = []
        for match_text in sorted(self.ipa_map.keys(), key=len, reverse=True):
            if match_text.startswith(","):
                # Raw regex
                cases.append(match_text[1:])
                continue

            # Check against all of the phonemes
            case_added = False
            for phoneme in self.phonemes:
                num_extra = len(phoneme.text) - len(match_text)
                if (num_extra > 0) and phoneme.text.startswith(match_text):
                    # Use negative lookahead to avoid replacing part of a valid
                    # phoneme: match the whole allophone, but only when it is
                    # not followed by the rest of the phoneme.
                    cases.append(
                        "{}(?!{})".format(
                            re.escape(match_text),
                            re.escape(phoneme.text[len(match_text) :]),
                        )
                    )
                    case_added = True
                    break

            if not case_added:
                # No substring problem
                cases.append(re.escape(match_text))

        ipa_map_regex_str = "({})".format("|".join(cases))
        self._ipa_map_regex = re.compile(ipa_map_regex_str)

        # Split phonemes into their phones and sort by decreasing phone count,
        # so that split() tries the longest phonemes first.
        split_phonemes = [
            ([pb.text for pb in Pronunciation.from_string(p.text)], p)
            for p in self.phonemes
        ]

        self._phonemes_sorted = sorted(
            split_phonemes, key=lambda kp: len(kp[0]), reverse=True
        )

        # Update IPA texts set for phonemes
        self.phoneme_texts = {p.text for p in self.phonemes}

    def split(
        self,
        pron_str: typing.Union[str, Pronunciation],
        keep_stress: bool = True,
        keep_accents: typing.Optional[bool] = None,
        drop_tones: bool = False,
        is_ipa: bool = True,
    ) -> typing.List[Phoneme]:
        """Split an IPA pronunciation into phonemes.

        Args:
            pron_str: Pronunciation as a string or Pronunciation object.
            keep_stress: Keep stress markers on the returned phonemes.
            keep_accents: Keep accent markers; defaults to keep_stress.
            drop_tones: Discard tones instead of attaching them to phonemes.
            is_ipa: Treat the string as IPA; if False it is split with
                ``IPA.graphemes`` and no stress/tone handling is done.

        Returns:
            One Phoneme per matched piece.  Pieces that match no phoneme of
            this set are returned as phonemes with ``unknown=True``.
        """
        if not self._ipa_map_regex:
            self.update()

        if keep_accents is None:
            keep_accents = keep_stress

        word_phonemes: typing.List[Phoneme] = []

        if self.ipa_map:
            # Replace allophones/alternative spellings first
            if isinstance(pron_str, Pronunciation):
                pron_str = "".join(p.text for p in pron_str)

            def handle_replace(match):
                text = match.group(1)
                return self.ipa_map.get(text, text)

            pron_str = self._ipa_map_regex.sub(handle_replace, pron_str)

        # Get text for IPA phones
        if isinstance(pron_str, Pronunciation):
            # Use supplied pronunciation
            ipas = [pb.text for pb in pron_str]
        elif is_ipa:
            # Split string into pronunciation
            pron = Pronunciation.from_string(
                pron_str,
                keep_stress=keep_stress,
                keep_accents=keep_accents,
                drop_tones=drop_tones,
            )
            ipas = [pb.text for pb in pron]
        else:
            ipas = IPA.graphemes(pron_str)

        # Keep stress and tones separate to make phoneme comparisons easier
        ipa_stress: typing.Dict[int, str] = defaultdict(str)
        ipa_tones: typing.Dict[int, str] = defaultdict(str)

        if is_ipa:
            tone_modifiers = {IPA.TONE_GLOTTALIZED, IPA.TONE_SHORT}
            in_tone = False
            for ipa_idx, ipa in enumerate(ipas):
                if ipa:
                    keep_ipa = ""
                    for codepoint in ipa:
                        if IPA.is_accent(codepoint) and (not in_tone):
                            if keep_accents:
                                ipa_stress[ipa_idx] += codepoint
                        elif IPA.is_stress(codepoint):
                            if keep_stress:
                                ipa_stress[ipa_idx] += codepoint
                        elif in_tone and (codepoint in tone_modifiers):
                            # Interpret as part of tone
                            if not drop_tones:
                                ipa_tones[ipa_idx] += codepoint
                        elif IPA.is_tone(codepoint):
                            if not drop_tones:
                                ipa_tones[ipa_idx] += codepoint

                            in_tone = True
                        else:
                            keep_ipa += codepoint

                    ipas[ipa_idx] = keep_ipa

        num_ipas: int = len(ipas)

        # ---------------------------------------------------------------------

        # Entries of ipas are set to None below once a longer phoneme has
        # consumed them; the list iterator sees those updates.
        for ipa_idx, ipa in enumerate(ipas):
            if ipa is None:
                # Skip replaced piece
                continue

            phoneme_match = False
            for phoneme_ipas, phoneme in self._phonemes_sorted:
                if ipa_idx <= (num_ipas - len(phoneme_ipas)):
                    phoneme_match = True
                    phoneme_stress = ""
                    phoneme_tones = ""

                    # Look forward into sequence
                    for phoneme_idx, phoneme_ipa in enumerate(phoneme_ipas):
                        phoneme_stress += ipa_stress[ipa_idx + phoneme_idx]
                        phoneme_tones += ipa_tones[ipa_idx + phoneme_idx]

                        if phoneme_ipa != ipas[ipa_idx + phoneme_idx]:
                            phoneme_match = False
                            break

                    if phoneme_match:
                        # Successful match
                        if phoneme_stress or phoneme_tones:
                            # Create a copy of the phoneme with applied
                            # stress/tones, keeping its allowable tones
                            phoneme = Phoneme(
                                text=(phoneme_stress + phoneme.text + phoneme_tones),
                                example=phoneme.example,
                                tones=phoneme.tones,
                            )

                        word_phonemes.append(phoneme)

                        # Patch ipas to skip replaced pieces
                        for phoneme_idx in range(1, len(phoneme_ipas)):
                            ipas[ipa_idx + phoneme_idx] = None

                        break

            if not phoneme_match:
                # Add unknown phoneme
                word_phonemes.append(Phoneme(text=ipa, unknown=True))

        return word_phonemes