ai-content-maker/.venv/Lib/site-packages/gruut_ipa/accent.py

231 lines
7.8 KiB
Python
Raw Normal View History

2024-05-03 04:18:51 +03:00
"""Methods for mapping phonemes from one language to another"""
import typing
import unicodedata
from copy import copy
from dataclasses import dataclass
from gruut_ipa.constants import Vowel, VowelHeight, VowelPlacement
from gruut_ipa.distances import get_closest
from gruut_ipa.phonemes import Phoneme, Phonemes, Pronunciation
# ---------------------------------------------------------------------
R_LIKE = ["ɹ", "ʁ", "r", "ʀ", "ɻ", "ɚ"]
SCHWA_PREFERRED = ["ə", "ɐ"]
GS = ["ɡ", "g"]
PHARY_GLOTTAL = ["ʡ", "ʔ"]
MATCHING_PHONEMES = typing.List[Phoneme]
@dataclass
class GuessedPhonemes:
"""Result from guess_phonemes"""
phonemes: MATCHING_PHONEMES
distance: typing.Optional[float] = None
def guess_phonemes(
from_phoneme: typing.Union[str, Phoneme], to_phonemes: Phonemes,
) -> GuessedPhonemes:
"""Get best matching phonemes for a single phoneme"""
best_phonemes: MATCHING_PHONEMES = []
best_dist: typing.Optional[float] = None
from_codepoints: typing.Optional[typing.Set[str]] = None
if isinstance(from_phoneme, str):
# Parse phoneme
from_phoneme = Phoneme(from_phoneme)
if from_phoneme.text in GS:
# Correctly map two forms of "g"
for maybe_g in GS:
if maybe_g in to_phonemes:
best_phonemes = [Phoneme(maybe_g)]
best_dist = 0.0
break
if (not best_phonemes) and from_phoneme.schwa:
if from_phoneme.schwa.r_coloured:
# Try r-like
for maybe_r_like in R_LIKE:
if maybe_r_like in to_phonemes:
best_phonemes = [Phoneme(maybe_r_like)]
best_dist = 0.0
break
if not best_phonemes:
for maybe_schwa in SCHWA_PREFERRED:
# Try known schwa preferences
if maybe_schwa in to_phonemes:
best_phonemes = [Phoneme(maybe_schwa)]
best_dist = 0.0
break
if not best_phonemes:
# Treat as a mid-central vowel
from_phoneme = copy(from_phoneme)
setattr(
from_phoneme,
"vowel",
Vowel(
ipa="ə",
height=VowelHeight.MID,
placement=VowelPlacement.CENTRAL,
rounded=False,
),
)
if (not best_phonemes) and (from_phoneme.text in R_LIKE):
# Map r-like consonant
for maybe_r in R_LIKE:
if maybe_r in to_phonemes:
best_phonemes = [Phoneme(maybe_r)]
best_dist = 0.0
break
if (not best_phonemes) and (from_phoneme.text in PHARY_GLOTTAL):
# Map or drop
for maybe_pg in PHARY_GLOTTAL:
if maybe_pg in to_phonemes:
best_phonemes = [Phoneme(maybe_pg)]
best_dist = 0
break
if not best_phonemes:
# Drop
return GuessedPhonemes(phonemes=[])
if best_phonemes:
return GuessedPhonemes(phonemes=best_phonemes, distance=best_dist)
# Search through target phonemes
for to_phoneme in to_phonemes:
if from_phoneme.text == to_phoneme.text:
# Easy case: exact match
best_phonemes = [to_phoneme]
best_dist = 0.0
break
if from_phoneme.letters == to_phoneme.letters:
# Match except for elongation, accent
if from_codepoints is None:
from_codepoints = set(unicodedata.normalize("NFD", from_phoneme.text))
# Compute a "distance" based on how many codepoints different between the two phonemes.
# This should usually be < 1 so that it can be a better match than the vowel/consonant distances.
to_codepoints = set(unicodedata.normalize("NFD", to_phoneme.text))
# Divide by 10 to ensure this is usually < 1
dist = abs(len(from_codepoints) - len(to_codepoints)) / 10.0
if (best_dist is None) or (dist < best_dist):
best_phonemes = [to_phoneme]
best_dist = dist
continue
# if from_phoneme.vowel and to_phoneme.vowel:
# # Vowel distance
# dist = vowel_distance(from_phoneme.vowel, to_phoneme.vowel)
# # Extra penalty for not matching elongation
# dist += 0.5 if from_phoneme.elongated != to_phoneme.elongated else 0
# if (best_dist is None) or (dist < best_dist):
# best_dist = dist
# best_phonemes = [to_phoneme]
# continue
# if from_phoneme.consonant and to_phoneme.consonant:
# # Consonant distance
# dist = consonant_distance(from_phoneme.consonant, to_phoneme.consonant)
# # Extra penalty for not matching elongation
# dist += 0.5 if from_phoneme.elongated != to_phoneme.elongated else 0
# if (best_dist is None) or (dist < best_dist):
# best_dist = dist
# best_phonemes = [to_phoneme]
# continue
if len(from_phoneme.letters) > 1:
# Split apart and match each letter separately
best_split: MATCHING_PHONEMES = []
split_phonemes = Pronunciation.from_string(from_phoneme.text, keep_ties=False)
dist = 1.0
for split_phoneme in split_phonemes:
guessed = guess_phonemes(split_phoneme.text, to_phonemes)
if (not guessed.phonemes) or (guessed.distance is None):
break
dist += guessed.distance
best_split.extend(guessed.phonemes)
if (best_dist is None) or (dist < best_dist):
best_phonemes = best_split
best_dist = dist
elif best_dist is None:
closest = get_closest(from_phoneme.text)
if closest:
for candidate_str in closest:
for to_phoneme in to_phonemes:
if candidate_str == to_phoneme.text:
best_phonemes = [Phoneme(candidate_str)]
best_dist = 0.5
break
if best_dist is not None:
break
if best_dist is None:
# Last resort
for to_phoneme in to_phonemes:
if from_phoneme.letters[0] == to_phoneme.letters[0]:
best_phonemes = [to_phoneme]
best_dist = 10.0
break
return GuessedPhonemes(phonemes=best_phonemes, distance=best_dist)
# ---------------------------------------------------------------------
# VOWEL_HEIGHT_NUM = {h: i for i, h in enumerate(VowelHeight)}
# VOWEL_PLACE_NUM = {p: i for i, p in enumerate(VowelPlacement)}
# def vowel_distance(vowel_1: Vowel, vowel_2: Vowel) -> float:
# """Return a distance measure between two vowels"""
# dist_height = (
# abs(VOWEL_HEIGHT_NUM[vowel_1.height] - VOWEL_HEIGHT_NUM[vowel_2.height]) * 2
# )
# dist_place = abs(
# VOWEL_PLACE_NUM[vowel_1.placement] - VOWEL_PLACE_NUM[vowel_2.placement]
# )
# dist_rounded = 1 if vowel_1.rounded != vowel_2.rounded else 0
# return dist_height + dist_place + dist_rounded
# CONSONANT_TYPE_NUM = {t: i for i, t in enumerate(ConsonantType)}
# CONSONANT_PLACE_NUM = {p: i for i, p in enumerate(ConsonantPlace)}
# def consonant_distance(consonant_1: Consonant, consonant_2: Consonant) -> float:
# """Return a distance measure between two consonants"""
# dist_type = abs(
# CONSONANT_TYPE_NUM[consonant_1.type] - CONSONANT_TYPE_NUM[consonant_2.type]
# )
# # dist_type = 1 if consonant_1.type != consonant_2.type else 0
# dist_place = abs(
# CONSONANT_PLACE_NUM[consonant_1.place] - CONSONANT_PLACE_NUM[consonant_2.place]
# )
# dist_voiced = 1 if consonant_1.voiced != consonant_2.voiced else 0
# return dist_type + dist_place + dist_voiced