ai-content-maker/.venv/Lib/site-packages/gruut_ipa/features.py

252 lines
8.3 KiB
Python
Raw Normal View History

2024-05-03 04:18:51 +03:00
#!/usr/bin/env python3
"""Functions for converting IPA symbols to and from feature vectors."""
import dataclasses
import typing
from gruut_ipa.constants import (
CONSONANTS,
FEATURE_COLUMNS,
FEATURE_EMPTY,
FEATURE_KEYS,
FEATURE_ORDINAL_COLUMNS,
IPA,
SCHWAS,
VOWELS,
Break,
BreakType,
Consonant,
ConsonantPlace,
ConsonantType,
PhonemeLength,
Schwa,
Stress,
Vowel,
VowelHeight,
VowelPlacement,
)
def to_vector(
    symbol: typing.Union[Vowel, Consonant, Schwa, Break]
) -> typing.Sequence[float]:
    """Encode a gruut IPA symbol as a numeric feature vector.

    The symbol's attributes are gathered into a feature-name -> feature-value
    mapping, which is then passed to ``features_to_vector`` for encoding.
    Features that are not set here are left for ``features_to_vector`` to
    fill in with its default.

    Raises:
        ValueError: if ``symbol`` is not a Vowel, Consonant, Schwa or Break.
    """
    features: typing.Dict[str, str]

    if isinstance(symbol, Vowel):
        features = {
            "symbol_type": "phoneme",
            "phoneme_type": "vowel",
            "vowel_height": symbol.height.value,
            "vowel_place": symbol.placement.value,
            "vowel_rounded": "rounded" if symbol.rounded else "unrounded",
            "phoneme_length": symbol.length.value,
        }

        # Optional vowel features are only recorded when present
        if symbol.nasalated:
            features["diacritic"] = "nasalated"

        if symbol.stress is not None:
            features["vowel_stress"] = symbol.stress.value
    elif isinstance(symbol, Consonant):
        features = {
            "symbol_type": "phoneme",
            "phoneme_type": "consonant",
            "consonant_voiced": "voiced" if symbol.voiced else "unvoiced",
            "consonant_type": symbol.type.value,
            "consonant_place": symbol.place.value,
            "consonant_sounds_like": symbol.sounds_like.value,
            "phoneme_length": symbol.length.value,
        }

        if symbol.velarized:
            features["diacritic"] = "velarized"
    elif isinstance(symbol, Schwa):
        features = {
            "symbol_type": "phoneme",
            "phoneme_type": "schwa",
            "phoneme_length": symbol.length.value,
        }

        # An r-coloured schwa is marked through the "sounds like" feature
        if symbol.r_coloured:
            features["consonant_sounds_like"] = "r"
    elif isinstance(symbol, Break):
        features = {
            "symbol_type": "break",
            "break_type": symbol.type.value,
        }
    else:
        # Unsupported symbol type
        raise ValueError(symbol)

    return features_to_vector(features)
def from_vector(
    vector: typing.Sequence[float],
) -> typing.Union[Vowel, Consonant, Schwa, Break]:
    """Converts a feature vector back into a symbol.

    The vector is decoded with ``vector_to_features`` and the resulting
    features are matched against the VOWELS, CONSONANTS or SCHWAS tables.
    The first table entry whose attributes match is used:

    * vowels match on height, placement, rounding and nasalation
    * consonants match on type, place, voicing and velarization (the
      "consonant_sounds_like" feature is not consulted)
    * schwas match on r-colouring

    Returns:
        A new Break for break vectors. For phonemes, the table entry itself
        when no stress/length has to be applied, otherwise a copy of it
        carrying the decoded stress (vowels only) and length.

    Raises:
        ValueError: if the symbol type or phoneme type is unknown, or if no
            table entry matches the decoded features.
    """
    features = vector_to_features(vector)
    symbol_type = features["symbol_type"]

    if symbol_type == "break":
        return Break(BreakType(features["break_type"]))

    if symbol_type != "phoneme":
        # Unsupported symbol type
        raise ValueError(f"Unknown symbol type: {features}")

    phoneme_type = features["phoneme_type"]

    if phoneme_type == "vowel":
        height = VowelHeight(features["vowel_height"])
        placement = VowelPlacement(features["vowel_place"])
        rounded = features["vowel_rounded"] == "rounded"
        nasalated = features["diacritic"] == "nasalated"
        length = PhonemeLength(features["phoneme_length"])

        # An empty stress feature means the vowel carries no stress
        stress: typing.Optional[Stress] = None
        stress_val = features["vowel_stress"]
        if stress_val != FEATURE_EMPTY:
            stress = Stress(stress_val)

        for vowel in VOWELS.values():
            if (
                (vowel.height == height)
                and (vowel.placement == placement)
                and (vowel.rounded == rounded)
                and (vowel.nasalated == nasalated)
            ):
                if (stress is None) and (length == PhonemeLength.NORMAL):
                    # Don't need to make a copy
                    return vowel

                # The copy must carry the decoded length as well as the
                # stress; otherwise a long/short vowel would silently come
                # back with the table entry's length.
                return dataclasses.replace(vowel, stress=stress, length=length)

        raise ValueError(f"Unknown vowel: {features}")

    if phoneme_type == "consonant":
        c_type = ConsonantType(features["consonant_type"])
        place = ConsonantPlace(features["consonant_place"])
        voiced = features["consonant_voiced"] == "voiced"
        velarized = features["diacritic"] == "velarized"
        length = PhonemeLength(features["phoneme_length"])

        for consonant in CONSONANTS.values():
            if (
                (consonant.type == c_type)
                and (consonant.place == place)
                and (consonant.voiced == voiced)
                and (consonant.velarized == velarized)
            ):
                if length == PhonemeLength.NORMAL:
                    # Don't need to make a copy
                    return consonant

                return dataclasses.replace(consonant, length=length)

        raise ValueError(f"Unknown consonant: {features}")

    if phoneme_type == "schwa":
        r_coloured = features["consonant_sounds_like"] == "r"
        length = PhonemeLength(features["phoneme_length"])

        for schwa in SCHWAS.values():
            if schwa.r_coloured == r_coloured:
                if length == PhonemeLength.NORMAL:
                    # Don't need to make a copy
                    return schwa

                return dataclasses.replace(schwa, length=length)

        raise ValueError(f"Unknown schwa: {features}")

    # Unsupported phoneme type
    raise ValueError(f"Unknown phoneme type: {features}")
def string_to_symbol(symbol_str: str) -> typing.Union[Vowel, Consonant, Schwa, Break]:
    """Look up the gruut IPA object described by an IPA symbol string.

    The whole string is first compared against the break symbols. Otherwise
    a leading stress mark and a trailing length mark are stripped, and the
    remaining text is looked up in VOWELS, then CONSONANTS, then SCHWAS.
    The returned phoneme is a copy carrying the parsed length; the parsed
    stress is applied to vowels only.

    Raises:
        ValueError: if the string is empty, if nothing is left after
            stripping stress/length marks, or if the remaining text is not
            found in any table.
    """
    if not symbol_str:
        raise ValueError("Empty symbol")

    # Breaks are recognised on the complete, unstripped string
    break_symbols = (
        (IPA.BREAK_WORD, BreakType.WORD),
        (IPA.BREAK_MINOR, BreakType.MINOR),
        (IPA.BREAK_MAJOR, BreakType.MAJOR),
    )
    for break_ipa, break_type in break_symbols:
        if symbol_str == break_ipa:
            return Break(break_type)

    # Leading stress mark (at most one is removed)
    stress: typing.Optional[Stress] = None
    leading = symbol_str[0]
    if leading == IPA.STRESS_PRIMARY:
        stress = Stress.PRIMARY
        symbol_str = symbol_str[1:]
    elif leading == IPA.STRESS_SECONDARY:
        stress = Stress.SECONDARY
        symbol_str = symbol_str[1:]

    if not symbol_str:
        raise ValueError("No letters")

    # Trailing length mark (at most one is removed)
    length = PhonemeLength.NORMAL
    trailing = symbol_str[-1]
    if trailing == IPA.HALF_LONG:
        length = PhonemeLength.SHORT
        symbol_str = symbol_str[:-1]
    elif trailing == IPA.LONG:
        length = PhonemeLength.LONG
        symbol_str = symbol_str[:-1]

    if not symbol_str:
        raise ValueError("No letters")

    # Vowels are the only phonemes that receive the parsed stress
    vowel = VOWELS.get(symbol_str)
    if vowel is not None:
        return dataclasses.replace(vowel, stress=stress, length=length)

    # Consonants take priority over schwas
    for table in (CONSONANTS, SCHWAS):
        entry = table.get(symbol_str)
        if entry is not None:
            return dataclasses.replace(entry, length=length)

    raise ValueError(f"Unsupported symbol type: {symbol_str}")
def features_to_vector(features: typing.Mapping[str, str]) -> typing.Sequence[float]:
    """Encode a feature mapping as a flat list of floats.

    Columns are visited in FEATURE_COLUMNS order. A feature missing from
    ``features`` is treated as FEATURE_EMPTY. Columns listed in
    FEATURE_ORDINAL_COLUMNS contribute one number (the value's index divided
    by the number of possible values); every other column contributes a
    one-hot run with one entry per possible value. A one-hot column whose
    value is not among the possible values yields all zeros.
    """
    encoded: typing.List[float] = []

    for column, choices in FEATURE_COLUMNS.items():
        chosen = features.get(column, FEATURE_EMPTY)

        if column not in FEATURE_ORDINAL_COLUMNS:
            # One-hot: 1.0 at the chosen value, 0.0 everywhere else
            encoded.extend(1.0 if (choice == chosen) else 0.0 for choice in choices)
            continue

        # Ordinal: position of the value, scaled by the number of choices
        encoded.append(choices.index(chosen) / len(choices))

    return encoded
def vector_to_features(vector: typing.Sequence[float]) -> typing.Mapping[str, str]:
    """Create mapping from phoneme feature vector.

    For every column in FEATURE_COLUMNS, FEATURE_KEYS gives the location of
    that column inside ``vector``: a single int index for columns listed in
    FEATURE_ORDINAL_COLUMNS, or a slice for one-hot columns.

    Returns:
        A dict from column name to the decoded feature value.

    Raises:
        ValueError: if a one-hot column contains no entry equal to 1.0.
    """
    features: typing.Dict[str, str] = {}

    for col_name, values in FEATURE_COLUMNS.items():
        col_key = FEATURE_KEYS[col_name]

        if col_name in FEATURE_ORDINAL_COLUMNS:
            # Single value normalized by number of possible values
            assert isinstance(col_key, int)  # type narrowing only

            # Round to the nearest index instead of truncating: (i / n) * n
            # can evaluate to slightly less than i in floating point (e.g.
            # 1 / 49 * 49), which truncation would turn into i - 1.
            val_idx = int(round(vector[col_key] * len(values)))
        else:
            # One-hot vector
            assert isinstance(col_key, slice)  # type narrowing only

            one_hot = vector[col_key]
            if 1.0 not in one_hot:
                # Raised explicitly (not via assert) so the check survives
                # running Python with -O.
                raise ValueError(
                    f"No 1.0 entry in one-hot column {col_name!r}: "
                    f"key={col_key}, values={one_hot}"
                )

            val_idx = one_hot.index(1.0)

        features[col_name] = values[val_idx]

    return features