252 lines
8.3 KiB
Python
252 lines
8.3 KiB
Python
|
#!/usr/bin/env python3
|
||
|
"""Functions for converting IPA symbols to and from feature vectors."""
|
||
|
import dataclasses
|
||
|
import typing
|
||
|
|
||
|
from gruut_ipa.constants import (
|
||
|
CONSONANTS,
|
||
|
FEATURE_COLUMNS,
|
||
|
FEATURE_EMPTY,
|
||
|
FEATURE_KEYS,
|
||
|
FEATURE_ORDINAL_COLUMNS,
|
||
|
IPA,
|
||
|
SCHWAS,
|
||
|
VOWELS,
|
||
|
Break,
|
||
|
BreakType,
|
||
|
Consonant,
|
||
|
ConsonantPlace,
|
||
|
ConsonantType,
|
||
|
PhonemeLength,
|
||
|
Schwa,
|
||
|
Stress,
|
||
|
Vowel,
|
||
|
VowelHeight,
|
||
|
VowelPlacement,
|
||
|
)
|
||
|
|
||
|
|
||
|
def to_vector(
    symbol: typing.Union[Vowel, Consonant, Schwa, Break]
) -> typing.Sequence[float]:
    """Encode an IPA symbol object as a numeric feature vector.

    A name -> value feature mapping is assembled according to the concrete
    type of ``symbol`` and then handed to ``features_to_vector`` for the
    actual numeric encoding.

    Raises:
        ValueError: if ``symbol`` is not a Vowel, Consonant, Schwa or Break.
    """
    feature_map: typing.Dict[str, str]

    if isinstance(symbol, Vowel):
        feature_map = {
            "symbol_type": "phoneme",
            "phoneme_type": "vowel",
            "vowel_height": symbol.height.value,
            "vowel_place": symbol.placement.value,
            "vowel_rounded": "rounded" if symbol.rounded else "unrounded",
            "phoneme_length": symbol.length.value,
        }

        # Optional vowel features are only recorded when present
        if symbol.nasalated:
            feature_map["diacritic"] = "nasalated"

        if symbol.stress is not None:
            feature_map["vowel_stress"] = symbol.stress.value
    elif isinstance(symbol, Consonant):
        feature_map = {
            "symbol_type": "phoneme",
            "phoneme_type": "consonant",
            "consonant_voiced": "voiced" if symbol.voiced else "unvoiced",
            "consonant_type": symbol.type.value,
            "consonant_place": symbol.place.value,
            "consonant_sounds_like": symbol.sounds_like.value,
            "phoneme_length": symbol.length.value,
        }

        if symbol.velarized:
            feature_map["diacritic"] = "velarized"
    elif isinstance(symbol, Schwa):
        feature_map = {
            "symbol_type": "phoneme",
            "phoneme_type": "schwa",
            "phoneme_length": symbol.length.value,
        }

        # An r-coloured schwa is stored in the "sounds like" column
        if symbol.r_coloured:
            feature_map["consonant_sounds_like"] = "r"
    elif isinstance(symbol, Break):
        feature_map = {
            "symbol_type": "break",
            "break_type": symbol.type.value,
        }
    else:
        # Unsupported symbol type
        raise ValueError(symbol)

    return features_to_vector(feature_map)
|
||
|
|
||
|
|
||
|
def from_vector(
    vector: typing.Sequence[float],
) -> typing.Union[Vowel, Consonant, Schwa, Break]:
    """Convert a feature vector back into a symbol.

    The vector is decoded with ``vector_to_features`` and the decoded
    features are matched against the ``VOWELS``, ``CONSONANTS`` and
    ``SCHWAS`` tables.  The first matching table entry wins.

    The table entry itself is returned when no stress is decoded and the
    decoded length is ``PhonemeLength.NORMAL``; otherwise a copy carrying
    the decoded stress (vowels only) and length is returned.

    Raises:
        ValueError: if the symbol type or phoneme type is not recognised,
            or if no table entry matches the decoded features.
    """
    features = vector_to_features(vector)
    symbol_type = features["symbol_type"]

    if symbol_type == "break":
        return Break(BreakType(features["break_type"]))

    if symbol_type != "phoneme":
        # Unsupported symbol type
        raise ValueError(f"Unknown symbol type: {features}")

    phoneme_type = features["phoneme_type"]

    if phoneme_type == "vowel":
        height = VowelHeight(features["vowel_height"])
        placement = VowelPlacement(features["vowel_place"])
        rounded = features["vowel_rounded"] == "rounded"
        nasalated = features["diacritic"] == "nasalated"
        length = PhonemeLength(features["phoneme_length"])

        stress: typing.Optional[Stress] = None
        stress_val = features["vowel_stress"]
        if stress_val != FEATURE_EMPTY:
            stress = Stress(stress_val)

        for vowel in VOWELS.values():
            if (
                (vowel.height == height)
                and (vowel.placement == placement)
                and (vowel.rounded == rounded)
                and (vowel.nasalated == nasalated)
            ):
                if (stress is None) and (length == PhonemeLength.NORMAL):
                    # Don't need to make a copy
                    return vowel

                # Carry over BOTH stress and length so that long/short
                # vowels survive a to_vector/from_vector round trip.
                return dataclasses.replace(vowel, stress=stress, length=length)

        raise ValueError(f"Unknown vowel: {features}")

    if phoneme_type == "consonant":
        c_type = ConsonantType(features["consonant_type"])
        place = ConsonantPlace(features["consonant_place"])
        voiced = features["consonant_voiced"] == "voiced"
        velarized = features["diacritic"] == "velarized"
        length = PhonemeLength(features["phoneme_length"])

        for consonant in CONSONANTS.values():
            if (
                (consonant.type == c_type)
                and (consonant.place == place)
                and (consonant.voiced == voiced)
                and (consonant.velarized == velarized)
            ):
                if length == PhonemeLength.NORMAL:
                    # Don't need to make a copy
                    return consonant

                return dataclasses.replace(consonant, length=length)

        raise ValueError(f"Unknown consonant: {features}")

    if phoneme_type == "schwa":
        r_coloured = features["consonant_sounds_like"] == "r"
        length = PhonemeLength(features["phoneme_length"])

        for schwa in SCHWAS.values():
            if schwa.r_coloured == r_coloured:
                if length == PhonemeLength.NORMAL:
                    # Don't need to make a copy
                    return schwa

                return dataclasses.replace(schwa, length=length)

        raise ValueError(f"Unknown schwa: {features}")

    # Unsupported phoneme type
    raise ValueError(f"Unknown phoneme type: {features}")
|
||
|
|
||
|
|
||
|
def string_to_symbol(symbol_str: str) -> typing.Union[Vowel, Consonant, Schwa, Break]:
    """Look up the gruut IPA object for an IPA symbol string.

    Break symbols are recognised first.  Otherwise an optional leading
    stress mark and an optional trailing length mark are stripped, and the
    remaining text is looked up among the vowels, consonants and schwas (in
    that order).  Stress is applied to vowels only; length is applied to
    every phoneme kind.

    Raises:
        ValueError: if the string is empty, consists only of stress/length
            marks, or does not name a known symbol.
    """
    if not symbol_str:
        raise ValueError("Empty symbol")

    # Breaks are whole-string matches, so test them before stripping anything
    break_markers = (
        (IPA.BREAK_WORD, BreakType.WORD),
        (IPA.BREAK_MINOR, BreakType.MINOR),
        (IPA.BREAK_MAJOR, BreakType.MAJOR),
    )
    for break_marker, break_type in break_markers:
        if symbol_str == break_marker:
            return Break(break_type)

    # Leading stress mark (at most one is consumed)
    stress: typing.Optional[Stress] = None
    stress_markers = (
        (IPA.STRESS_PRIMARY, Stress.PRIMARY),
        (IPA.STRESS_SECONDARY, Stress.SECONDARY),
    )
    for stress_marker, stress_level in stress_markers:
        if symbol_str[0] == stress_marker:
            stress = stress_level
            symbol_str = symbol_str[1:]
            break

    if not symbol_str:
        raise ValueError("No letters")

    # Trailing length mark (at most one is consumed); the half-long mark
    # maps to PhonemeLength.SHORT
    phoneme_length = PhonemeLength.NORMAL
    length_markers = (
        (IPA.HALF_LONG, PhonemeLength.SHORT),
        (IPA.LONG, PhonemeLength.LONG),
    )
    for length_marker, marked_length in length_markers:
        if symbol_str[-1] == length_marker:
            phoneme_length = marked_length
            symbol_str = symbol_str[:-1]
            break

    if not symbol_str:
        raise ValueError("No letters")

    # Table look-ups: vowels, then consonants, then schwas
    vowel = VOWELS.get(symbol_str)
    if vowel is not None:
        return dataclasses.replace(vowel, stress=stress, length=phoneme_length)

    consonant = CONSONANTS.get(symbol_str)
    if consonant is not None:
        return dataclasses.replace(consonant, length=phoneme_length)

    schwa = SCHWAS.get(symbol_str)
    if schwa is not None:
        return dataclasses.replace(schwa, length=phoneme_length)

    raise ValueError(f"Unsupported symbol type: {symbol_str}")
|
||
|
|
||
|
|
||
|
def features_to_vector(features: typing.Mapping[str, str]) -> typing.Sequence[float]:
    """Encode a feature mapping as a flat list of floats.

    Columns are visited in the iteration order of ``FEATURE_COLUMNS``.  A
    column missing from ``features`` is treated as ``FEATURE_EMPTY``.
    Columns listed in ``FEATURE_ORDINAL_COLUMNS`` contribute one number
    (value index divided by the number of possible values); every other
    column contributes a one-hot run with one entry per possible value.
    """
    encoded: typing.List[float] = []

    for column, choices in FEATURE_COLUMNS.items():
        chosen = features.get(column, FEATURE_EMPTY)

        if column not in FEATURE_ORDINAL_COLUMNS:
            # One-hot run
            encoded.extend(1.0 if (choice == chosen) else 0.0 for choice in choices)
            continue

        # Ordinal: position of the value, normalized by the number of values
        encoded.append(choices.index(chosen) / len(choices))

    return encoded
|
||
|
|
||
|
|
||
|
def vector_to_features(vector: typing.Sequence[float]) -> typing.Mapping[str, str]:
    """Decode a phoneme feature vector into a feature mapping.

    For every column in ``FEATURE_COLUMNS`` the position(s) given by
    ``FEATURE_KEYS`` are read from ``vector``:

    * columns in ``FEATURE_ORDINAL_COLUMNS`` hold a single number that is
      multiplied by the number of possible values to recover the index;
    * all other columns hold a one-hot run whose ``1.0`` entry marks the
      index of the value.

    Raises:
        ValueError: if a one-hot run contains no ``1.0`` entry.
    """
    features: typing.Dict[str, str] = {}

    for col_name, values in FEATURE_COLUMNS.items():
        col_key = FEATURE_KEYS[col_name]
        if col_name in FEATURE_ORDINAL_COLUMNS:
            # Single value normalized by number of possible values
            assert isinstance(col_key, int)

            # Round instead of truncating: (k / n) * n is not guaranteed to
            # be exactly k in floating point and may land just below it.
            val_idx = int(round(vector[col_key] * len(values)))
        else:
            # One-hot vector
            assert isinstance(col_key, slice)

            # Slice once; list() gives us .index() for any sequence type
            one_hot = list(vector[col_key])
            try:
                val_idx = one_hot.index(1.0)
            except ValueError:
                # Explicit error rather than `assert`, which is stripped
                # when Python runs with -O
                raise ValueError(
                    f"No active value in one-hot column: {(col_name, col_key, one_hot)}"
                ) from None

        features[col_name] = values[val_idx]

    return features
|