"""Methods for mapping phonemes from one language to another"""
|
||
import typing
|
||
import unicodedata
|
||
from copy import copy
|
||
from dataclasses import dataclass
|
||
|
||
from gruut_ipa.constants import Vowel, VowelHeight, VowelPlacement
|
||
from gruut_ipa.distances import get_closest
|
||
from gruut_ipa.phonemes import Phoneme, Phonemes, Pronunciation
|
||
|
||
# ---------------------------------------------------------------------
|
||
|
||
# Candidate target phonemes, in preference order, tried by guess_phonemes
# for r-like source phonemes and r-coloured schwas.
R_LIKE = [
    "ɹ",
    "ʁ",
    "r",
    "ʀ",
    "ɻ",
    "ɚ",
]

# Candidate target phonemes, in preference order, tried for a source schwa.
SCHWA_PREFERRED = [
    "ə",
    "ɐ",
]

# The two forms of "g"; guess_phonemes treats them as interchangeable.
GS = [
    "ɡ",
    "g",
]

# Phonemes that guess_phonemes maps to one another, or drops entirely when
# the target has neither.
PHARY_GLOTTAL = [
    "ʡ",
    "ʔ",
]
|
||
|
||
# Type alias for a list of matched phonemes (used for guess results below)
MATCHING_PHONEMES = typing.List[Phoneme]
|
||
|
||
|
||
@dataclass
class GuessedPhonemes:
    """Outcome of guess_phonemes: the chosen phonemes and the match distance"""

    # Target phonemes selected for the source phoneme; empty when the source
    # phoneme is dropped or nothing matched.
    phonemes: MATCHING_PHONEMES

    # Distance of the match (guess_phonemes prefers smaller values);
    # None when no distance was computed.
    distance: typing.Optional[float] = None
|
||
|
||
|
||
def _first_available(
    candidates: typing.Iterable[str], to_phonemes: Phonemes
) -> typing.Optional[Phoneme]:
    """Return a Phoneme for the first candidate contained in to_phonemes, or None"""
    for candidate in candidates:
        if candidate in to_phonemes:
            return Phoneme(candidate)

    return None


def guess_phonemes(
    from_phoneme: typing.Union[str, Phoneme], to_phonemes: Phonemes,
) -> GuessedPhonemes:
    """Get best matching phonemes for a single phoneme

    Strategies are tried in this order:

    1. Special cases, all with distance 0.0: the two forms of "g" (GS),
       schwas (first R_LIKE target if r-coloured, then first SCHWA_PREFERRED
       target), r-like phonemes (R_LIKE) and PHARY_GLOTTAL phonemes. A
       PHARY_GLOTTAL phoneme with no counterpart in the target is dropped
       (empty result, distance None).
    2. A target phoneme with identical text (distance 0.0).
    3. A target phoneme with identical letters; the distance is the difference
       in the number of distinct NFD codepoints, divided by 10.
    4. For phonemes with more than one letter, each split phoneme is guessed
       recursively (distance 1.0 plus the sum of the parts) and used if it
       beats the best match so far.
    5. Otherwise, if nothing matched yet, the first get_closest candidate that
       is a target phoneme (distance 0.5).
    6. Last resort: the first target phoneme sharing the first letter
       (distance 10.0).

    Args:
        from_phoneme: source phoneme, either as text or an already parsed Phoneme
        to_phonemes: phonemes of the target language

    Returns:
        GuessedPhonemes with the matched target phonemes and their distance.
        phonemes is empty and distance is None when the phoneme is dropped or
        nothing matched at all.
    """
    if isinstance(from_phoneme, str):
        # Parse phoneme
        from_phoneme = Phoneme(from_phoneme)

    # -- Special cases: direct substitutions with distance 0.0 --
    special: typing.Optional[Phoneme] = None

    if from_phoneme.text in GS:
        # Correctly map two forms of "g"
        special = _first_available(GS, to_phonemes)

    if (special is None) and from_phoneme.schwa:
        if from_phoneme.schwa.r_coloured:
            # Try r-like
            special = _first_available(R_LIKE, to_phonemes)

        if special is None:
            # Try known schwa preferences
            special = _first_available(SCHWA_PREFERRED, to_phonemes)

        if special is None:
            # Treat as a mid-central vowel. Work on a copy so the caller's
            # Phoneme is not modified.
            # NOTE(review): no code in this function currently reads `vowel`;
            # it was consumed by the vowel-distance matching that is disabled.
            from_phoneme = copy(from_phoneme)
            setattr(
                from_phoneme,
                "vowel",
                Vowel(
                    ipa="ə",
                    height=VowelHeight.MID,
                    placement=VowelPlacement.CENTRAL,
                    rounded=False,
                ),
            )

    if (special is None) and (from_phoneme.text in R_LIKE):
        # Map r-like consonant
        special = _first_available(R_LIKE, to_phonemes)

    if (special is None) and (from_phoneme.text in PHARY_GLOTTAL):
        # Map or drop
        special = _first_available(PHARY_GLOTTAL, to_phonemes)

        if special is None:
            # Drop
            return GuessedPhonemes(phonemes=[])

    if special is not None:
        # Always a float distance, matching the declared Optional[float]
        return GuessedPhonemes(phonemes=[special], distance=0.0)

    # -- Search through target phonemes --
    best_phonemes: MATCHING_PHONEMES = []
    best_dist: typing.Optional[float] = None
    from_codepoints: typing.Optional[typing.Set[str]] = None

    for to_phoneme in to_phonemes:
        if from_phoneme.text == to_phoneme.text:
            # Easy case: exact match. Nothing below can beat distance 0.0
            # (every other strategy yields a distance >= 0), so stop here.
            return GuessedPhonemes(phonemes=[to_phoneme], distance=0.0)

        if from_phoneme.letters == to_phoneme.letters:
            # Match except for elongation, accent
            if from_codepoints is None:
                # Computed lazily, once per call
                from_codepoints = set(unicodedata.normalize("NFD", from_phoneme.text))

            # Compute a "distance" based on how many codepoints differ between
            # the two phonemes. Dividing by 10 keeps this usually < 1, so it
            # is preferred over the split/closest/last-resort distances.
            to_codepoints = set(unicodedata.normalize("NFD", to_phoneme.text))
            dist = abs(len(from_codepoints) - len(to_codepoints)) / 10.0

            # Strict "<" keeps the earliest target phoneme on ties
            if (best_dist is None) or (dist < best_dist):
                best_phonemes = [to_phoneme]
                best_dist = dist

    # Feature-based vowel/consonant distances are currently disabled; phonemes
    # without a letter match fall through to the strategies below.

    if len(from_phoneme.letters) > 1:
        # Split apart and match each letter separately
        best_split: MATCHING_PHONEMES = []
        split_dist = 1.0

        for split_phoneme in Pronunciation.from_string(
            from_phoneme.text, keep_ties=False
        ):
            guessed = guess_phonemes(split_phoneme.text, to_phonemes)

            if (not guessed.phonemes) or (guessed.distance is None):
                # NOTE(review): matching stops at the first part that cannot
                # be guessed, and the parts matched so far are still used.
                # Confirm this partial result is intended.
                break

            split_dist += guessed.distance
            best_split.extend(guessed.phonemes)

        if (best_dist is None) or (split_dist < best_dist):
            best_phonemes = best_split
            best_dist = split_dist
    elif best_dist is None:
        closest = get_closest(from_phoneme.text)

        if closest:
            # Take the first suggested candidate that exists in the target
            for candidate_str in closest:
                if any(candidate_str == to_phoneme.text for to_phoneme in to_phonemes):
                    best_phonemes = [Phoneme(candidate_str)]
                    best_dist = 0.5
                    break

    if best_dist is None:
        # Last resort: any target phoneme starting with the same letter.
        # Slicing instead of indexing avoids an IndexError when a phoneme has
        # no letters; such phonemes simply never match here.
        first_letter = from_phoneme.letters[:1]

        if first_letter:
            for to_phoneme in to_phonemes:
                if to_phoneme.letters[:1] == first_letter:
                    best_phonemes = [to_phoneme]
                    best_dist = 10.0
                    break

    return GuessedPhonemes(phonemes=best_phonemes, distance=best_dist)
|
||
|
||
|
||
# ---------------------------------------------------------------------
|
||
|
||
# VOWEL_HEIGHT_NUM = {h: i for i, h in enumerate(VowelHeight)}
|
||
# VOWEL_PLACE_NUM = {p: i for i, p in enumerate(VowelPlacement)}
|
||
|
||
|
||
# def vowel_distance(vowel_1: Vowel, vowel_2: Vowel) -> float:
|
||
# """Return a distance measure between two vowels"""
|
||
# dist_height = (
|
||
# abs(VOWEL_HEIGHT_NUM[vowel_1.height] - VOWEL_HEIGHT_NUM[vowel_2.height]) * 2
|
||
# )
|
||
# dist_place = abs(
|
||
# VOWEL_PLACE_NUM[vowel_1.placement] - VOWEL_PLACE_NUM[vowel_2.placement]
|
||
# )
|
||
# dist_rounded = 1 if vowel_1.rounded != vowel_2.rounded else 0
|
||
|
||
# return dist_height + dist_place + dist_rounded
|
||
|
||
|
||
# CONSONANT_TYPE_NUM = {t: i for i, t in enumerate(ConsonantType)}
|
||
# CONSONANT_PLACE_NUM = {p: i for i, p in enumerate(ConsonantPlace)}
|
||
|
||
|
||
# def consonant_distance(consonant_1: Consonant, consonant_2: Consonant) -> float:
|
||
# """Return a distance measure between two consonants"""
|
||
# dist_type = abs(
|
||
# CONSONANT_TYPE_NUM[consonant_1.type] - CONSONANT_TYPE_NUM[consonant_2.type]
|
||
# )
|
||
# # dist_type = 1 if consonant_1.type != consonant_2.type else 0
|
||
# dist_place = abs(
|
||
# CONSONANT_PLACE_NUM[consonant_1.place] - CONSONANT_PLACE_NUM[consonant_2.place]
|
||
# )
|
||
# dist_voiced = 1 if consonant_1.voiced != consonant_2.voiced else 0
|
||
|
||
# return dist_type + dist_place + dist_voiced
|