ai-content-maker/.venv/Lib/site-packages/gruut_ipa/features.py

#!/usr/bin/env python3
"""Functions for converting IPA symbols to and from feature vectors."""
import dataclasses
import typing

from gruut_ipa.constants import (
    CONSONANTS,
    FEATURE_COLUMNS,
    FEATURE_EMPTY,
    FEATURE_KEYS,
    FEATURE_ORDINAL_COLUMNS,
    IPA,
    SCHWAS,
    VOWELS,
    Break,
    BreakType,
    Consonant,
    ConsonantPlace,
    ConsonantType,
    PhonemeLength,
    Schwa,
    Stress,
    Vowel,
    VowelHeight,
    VowelPlacement,
)


def to_vector(
    symbol: typing.Union[Vowel, Consonant, Schwa, Break]
) -> typing.Sequence[float]:
    """Encode a vowel, consonant, schwa or break as a feature vector.

    The symbol's attributes are first collected into a name -> value mapping,
    which is then handed to features_to_vector for the numeric encoding.

    Raises:
        ValueError: if ``symbol`` is not one of the four supported types.
    """
    attrs: typing.Dict[str, str]

    if isinstance(symbol, Vowel):
        attrs = {
            "symbol_type": "phoneme",
            "phoneme_type": "vowel",
            "vowel_height": symbol.height.value,
            "vowel_place": symbol.placement.value,
            "vowel_rounded": "rounded" if symbol.rounded else "unrounded",
            "phoneme_length": symbol.length.value,
        }

        # Optional vowel attributes are only recorded when present
        if symbol.nasalated:
            attrs["diacritic"] = "nasalated"

        if symbol.stress is not None:
            attrs["vowel_stress"] = symbol.stress.value

        return features_to_vector(attrs)

    if isinstance(symbol, Consonant):
        attrs = {
            "symbol_type": "phoneme",
            "phoneme_type": "consonant",
            "consonant_voiced": "voiced" if symbol.voiced else "unvoiced",
            "consonant_type": symbol.type.value,
            "consonant_place": symbol.place.value,
            "consonant_sounds_like": symbol.sounds_like.value,
            "phoneme_length": symbol.length.value,
        }

        if symbol.velarized:
            attrs["diacritic"] = "velarized"

        return features_to_vector(attrs)

    if isinstance(symbol, Schwa):
        attrs = {
            "symbol_type": "phoneme",
            "phoneme_type": "schwa",
            "phoneme_length": symbol.length.value,
        }

        # An r-coloured schwa is recorded through the "sounds like" feature
        if symbol.r_coloured:
            attrs["consonant_sounds_like"] = "r"

        return features_to_vector(attrs)

    if isinstance(symbol, Break):
        attrs = {"symbol_type": "break", "break_type": symbol.type.value}
        return features_to_vector(attrs)

    # Unsupported symbol type
    raise ValueError(symbol)


def from_vector(
    vector: typing.Sequence[float],
) -> typing.Union[Vowel, Consonant, Schwa, Break]:
    """Converts a feature vector back into a symbol.

    The vector is decoded with vector_to_features. For phonemes, the first
    entry of VOWELS, CONSONANTS or SCHWAS whose attributes match the decoded
    features is used:

    * vowels are matched on height, placement, rounding and nasalation;
    * consonants are matched on type, place, voicing and velarization
      (the "consonant_sounds_like" feature is not consulted);
    * schwas are matched on r-colouring.

    The table entry itself is returned when no stress/length adjustment is
    needed; otherwise a copy carrying the decoded stress (vowels only) and
    length is returned.

    Raises:
        ValueError: if the symbol or phoneme type is not recognised, or if no
            table entry matches the decoded features.
    """
    features = vector_to_features(vector)
    symbol_type = features["symbol_type"]

    if symbol_type == "break":
        return Break(BreakType(features["break_type"]))

    if symbol_type != "phoneme":
        # Unsupported symbol type
        raise ValueError(f"Unknown symbol type: {features}")

    phoneme_type = features["phoneme_type"]

    if phoneme_type == "vowel":
        height = VowelHeight(features["vowel_height"])
        placement = VowelPlacement(features["vowel_place"])
        rounded = features["vowel_rounded"] == "rounded"
        nasalated = features["diacritic"] == "nasalated"
        length = PhonemeLength(features["phoneme_length"])

        # Stress is optional: the empty marker means "no stress"
        stress: typing.Optional[Stress] = None
        stress_val = features["vowel_stress"]
        if stress_val != FEATURE_EMPTY:
            stress = Stress(stress_val)

        for vowel in VOWELS.values():
            if (
                (vowel.height == height)
                and (vowel.placement == placement)
                and (vowel.rounded == rounded)
                and (vowel.nasalated == nasalated)
            ):
                if (stress is None) and (length == PhonemeLength.NORMAL):
                    # Don't need to make a copy
                    return vowel

                # Carry over both stress and length; passing only the stress
                # would silently drop a non-normal length.
                return dataclasses.replace(vowel, stress=stress, length=length)

        raise ValueError(f"Unknown vowel: {features}")

    if phoneme_type == "consonant":
        c_type = ConsonantType(features["consonant_type"])
        place = ConsonantPlace(features["consonant_place"])
        voiced = features["consonant_voiced"] == "voiced"
        velarized = features["diacritic"] == "velarized"
        length = PhonemeLength(features["phoneme_length"])

        for consonant in CONSONANTS.values():
            if (
                (consonant.type == c_type)
                and (consonant.place == place)
                and (consonant.voiced == voiced)
                and (consonant.velarized == velarized)
            ):
                if length == PhonemeLength.NORMAL:
                    # Don't need to make a copy
                    return consonant

                return dataclasses.replace(consonant, length=length)

        raise ValueError(f"Unknown consonant: {features}")

    if phoneme_type == "schwa":
        r_coloured = features["consonant_sounds_like"] == "r"
        length = PhonemeLength(features["phoneme_length"])

        for schwa in SCHWAS.values():
            if schwa.r_coloured == r_coloured:
                if length == PhonemeLength.NORMAL:
                    # Don't need to make a copy
                    return schwa

                return dataclasses.replace(schwa, length=length)

        raise ValueError(f"Unknown schwa: {features}")

    # Unsupported phoneme type
    raise ValueError(f"Unknown phoneme type: {features}")


def string_to_symbol(symbol_str: str) -> typing.Union[Vowel, Consonant, Schwa, Break]:
    """Parse an IPA symbol string into the matching gruut IPA object.

    Break markers are recognised first. Otherwise an optional leading stress
    mark and an optional trailing length mark are removed, and the remaining
    text is looked up in VOWELS, then CONSONANTS, then SCHWAS. The parsed
    length is applied to whatever is found; the parsed stress is applied to
    vowels only.

    Raises:
        ValueError: if the string is empty, contains only stress/length
            marks, or is not found in any of the three tables.
    """
    if not symbol_str:
        raise ValueError("Empty symbol")

    # Breaks are whole-string matches, so test them before stripping anything
    for marker, break_type in (
        (IPA.BREAK_WORD, BreakType.WORD),
        (IPA.BREAK_MINOR, BreakType.MINOR),
        (IPA.BREAK_MAJOR, BreakType.MAJOR),
    ):
        if symbol_str == marker:
            return Break(break_type)

    # Leading stress mark
    parsed_stress: typing.Optional[Stress] = None
    head = symbol_str[0]
    if head == IPA.STRESS_PRIMARY:
        parsed_stress, symbol_str = Stress.PRIMARY, symbol_str[1:]
    elif head == IPA.STRESS_SECONDARY:
        parsed_stress, symbol_str = Stress.SECONDARY, symbol_str[1:]

    if not symbol_str:
        raise ValueError("No letters")

    # Trailing length mark
    # NOTE(review): the half-long mark is mapped to PhonemeLength.SHORT here;
    # confirm that this is the intended length category.
    parsed_length = PhonemeLength.NORMAL
    tail = symbol_str[-1]
    if tail == IPA.HALF_LONG:
        parsed_length, symbol_str = PhonemeLength.SHORT, symbol_str[:-1]
    elif tail == IPA.LONG:
        parsed_length, symbol_str = PhonemeLength.LONG, symbol_str[:-1]

    if not symbol_str:
        raise ValueError("No letters")

    # Vowels are the only phonemes that carry stress
    found_vowel = VOWELS.get(symbol_str)
    if found_vowel is not None:
        return dataclasses.replace(
            found_vowel, stress=parsed_stress, length=parsed_length
        )

    # Consonants and schwas only take the length
    for table in (CONSONANTS, SCHWAS):
        found = table.get(symbol_str)
        if found is not None:
            return dataclasses.replace(found, length=parsed_length)

    raise ValueError(f"Unsupported symbol type: {symbol_str}")


def features_to_vector(features: typing.Mapping[str, str]) -> typing.Sequence[float]:
    """Encode a feature mapping as a flat list of floats.

    Columns are emitted in FEATURE_COLUMNS order. A column missing from
    ``features`` is treated as holding FEATURE_EMPTY. Columns listed in
    FEATURE_ORDINAL_COLUMNS contribute one number (the position of the value
    among the column's possible values, divided by how many there are); every
    other column contributes one 0.0/1.0 entry per possible value.
    """
    encoded: typing.List[float] = []

    for column, choices in FEATURE_COLUMNS.items():
        chosen = features.get(column, FEATURE_EMPTY)

        if column not in FEATURE_ORDINAL_COLUMNS:
            # One-hot: a slot per possible value, 1.0 only where it matches
            encoded.extend(1.0 if (choice == chosen) else 0.0 for choice in choices)
            continue

        # Ordinal: position scaled by the number of possible values
        encoded.append(choices.index(chosen) / len(choices))

    return encoded


def vector_to_features(vector: typing.Sequence[float]) -> typing.Mapping[str, str]:
    """Create mapping from phoneme feature vector.

    For every column in FEATURE_COLUMNS, FEATURE_KEYS gives the position of
    that column inside ``vector``: a single index for columns listed in
    FEATURE_ORDINAL_COLUMNS, a slice for one-hot columns. The decoded value
    for each column is stored under the column name.

    Raises:
        ValueError: if an ordinal entry does not decode to a valid value
            index, or if a one-hot segment contains no 1.0 entry.
    """
    features: typing.Dict[str, str] = {}

    for col_name, values in FEATURE_COLUMNS.items():
        col_key = FEATURE_KEYS[col_name]
        if col_name in FEATURE_ORDINAL_COLUMNS:
            # Single value normalized by number of possible values
            assert isinstance(col_key, int)

            # Round instead of truncating: (i / n) * n may come out a hair
            # below i in floating point, and int() would then yield i - 1.
            val_idx = int(round(vector[col_key] * len(values)))

            # Reject indexes outside the value list; a negative index would
            # otherwise silently select a value counted from the end.
            if not 0 <= val_idx < len(values):
                raise ValueError(
                    f"Ordinal value out of range: {(col_name, col_key, vector[col_key])}"
                )
        else:
            # One-hot vector
            assert isinstance(col_key, slice)
            one_hot = list(vector[col_key])

            # Explicit exception (not an assert) so the check survives -O
            if 1.0 not in one_hot:
                raise ValueError(
                    f"No value set in one-hot column: {(col_name, col_key, one_hot)}"
                )

            val_idx = one_hot.index(1.0)

        features[col_name] = values[val_idx]

    return features