ai-content-maker/.venv/Lib/site-packages/gruut_ipa/features.py

#!/usr/bin/env python3
"""Functions for converting IPA symbols to and from feature vectors."""
import dataclasses
import typing

from gruut_ipa.constants import (
    CONSONANTS,
    FEATURE_COLUMNS,
    FEATURE_EMPTY,
    FEATURE_KEYS,
    FEATURE_ORDINAL_COLUMNS,
    IPA,
    SCHWAS,
    VOWELS,
    Break,
    BreakType,
    Consonant,
    ConsonantPlace,
    ConsonantType,
    PhonemeLength,
    Schwa,
    Stress,
    Vowel,
    VowelHeight,
    VowelPlacement,
)


def to_vector(
    symbol: typing.Union[Vowel, Consonant, Schwa, Break]
) -> typing.Sequence[float]:
    """Encode a gruut IPA symbol as a numeric feature vector.

    The symbol's attributes are gathered into a column-name -> value mapping,
    which is then passed to ``features_to_vector``. Columns that are not set
    here are left for ``features_to_vector`` to fill with its default.

    Args:
        symbol: A Vowel, Consonant, Schwa or Break instance.

    Returns:
        The feature vector produced by ``features_to_vector``.

    Raises:
        ValueError: If ``symbol`` is none of the supported types.
    """
    features: typing.Dict[str, str]

    if isinstance(symbol, Vowel):
        features = {
            "symbol_type": "phoneme",
            "phoneme_type": "vowel",
            "vowel_height": symbol.height.value,
            "vowel_place": symbol.placement.value,
            "vowel_rounded": "rounded" if symbol.rounded else "unrounded",
            "phoneme_length": symbol.length.value,
        }

        # Optional vowel attributes are only recorded when present
        if symbol.nasalated:
            features["diacritic"] = "nasalated"

        if symbol.stress is not None:
            features["vowel_stress"] = symbol.stress.value

    elif isinstance(symbol, Consonant):
        features = {
            "symbol_type": "phoneme",
            "phoneme_type": "consonant",
            "consonant_voiced": "voiced" if symbol.voiced else "unvoiced",
            "consonant_type": symbol.type.value,
            "consonant_place": symbol.place.value,
            "consonant_sounds_like": symbol.sounds_like.value,
            "phoneme_length": symbol.length.value,
        }

        if symbol.velarized:
            features["diacritic"] = "velarized"

    elif isinstance(symbol, Schwa):
        features = {
            "symbol_type": "phoneme",
            "phoneme_type": "schwa",
            "phoneme_length": symbol.length.value,
        }

        # An r-coloured schwa reuses the consonant "sounds like" column
        if symbol.r_coloured:
            features["consonant_sounds_like"] = "r"

    elif isinstance(symbol, Break):
        features = {
            "symbol_type": "break",
            "break_type": symbol.type.value,
        }

    else:
        # Unsupported symbol type
        raise ValueError(symbol)

    return features_to_vector(features)


def from_vector(
    vector: typing.Sequence[float],
) -> typing.Union[Vowel, Consonant, Schwa, Break]:
    """Converts a feature vector back into a symbol.

    The vector is decoded with ``vector_to_features`` and the resulting
    features are matched against the known VOWELS, CONSONANTS and SCHWAS
    tables. The first table entry whose attributes match is used.

    Args:
        vector: Feature vector, laid out as expected by
            ``vector_to_features``.

    Returns:
        A Break, or the matching Vowel / Consonant / Schwa. The shared table
        entry itself is returned when no stress or non-normal length has to
        be applied; otherwise a copy carrying the decoded stress/length is
        returned.

    Raises:
        ValueError: If the symbol type or phoneme type is not recognised, or
            if no known vowel, consonant or schwa matches the features.
    """
    features = vector_to_features(vector)
    symbol_type = features["symbol_type"]

    if symbol_type == "break":
        return Break(BreakType(features["break_type"]))

    if symbol_type != "phoneme":
        # Unsupported symbol type
        raise ValueError(f"Unknown symbol type: {features}")

    phoneme_type = features["phoneme_type"]

    if phoneme_type == "vowel":
        height = VowelHeight(features["vowel_height"])
        placement = VowelPlacement(features["vowel_place"])
        rounded = features["vowel_rounded"] == "rounded"
        nasalated = features["diacritic"] == "nasalated"
        length = PhonemeLength(features["phoneme_length"])

        # Stress is optional: the empty marker means "no stress"
        stress: typing.Optional[Stress] = None
        stress_val = features["vowel_stress"]
        if stress_val != FEATURE_EMPTY:
            stress = Stress(stress_val)

        for vowel in VOWELS.values():
            if (
                (vowel.height == height)
                and (vowel.placement == placement)
                and (vowel.rounded == rounded)
                and (vowel.nasalated == nasalated)
            ):
                if (stress is None) and (length == PhonemeLength.NORMAL):
                    # Don't need to make a copy
                    return vowel

                # Carry over BOTH stress and length; passing only stress
                # would silently reset a long/short vowel to the table
                # entry's length.
                return dataclasses.replace(vowel, stress=stress, length=length)

        raise ValueError(f"Unknown vowel: {features}")

    if phoneme_type == "consonant":
        c_type = ConsonantType(features["consonant_type"])
        place = ConsonantPlace(features["consonant_place"])
        voiced = features["consonant_voiced"] == "voiced"
        velarized = features["diacritic"] == "velarized"
        length = PhonemeLength(features["phoneme_length"])

        # NOTE: "consonant_sounds_like" is not part of the match, so the
        # first consonant agreeing on type/place/voicing/velarization wins.
        for consonant in CONSONANTS.values():
            if (
                (consonant.type == c_type)
                and (consonant.place == place)
                and (consonant.voiced == voiced)
                and (consonant.velarized == velarized)
            ):
                if length == PhonemeLength.NORMAL:
                    # Don't need to make a copy
                    return consonant

                return dataclasses.replace(consonant, length=length)

        raise ValueError(f"Unknown consonant: {features}")

    if phoneme_type == "schwa":
        r_coloured = features["consonant_sounds_like"] == "r"
        length = PhonemeLength(features["phoneme_length"])

        for schwa in SCHWAS.values():
            if schwa.r_coloured == r_coloured:
                if length == PhonemeLength.NORMAL:
                    # Don't need to make a copy
                    return schwa

                return dataclasses.replace(schwa, length=length)

        raise ValueError(f"Unknown schwa: {features}")

    # Unsupported phoneme type
    raise ValueError(f"Unknown phoneme type: {features}")


def string_to_symbol(symbol_str: str) -> typing.Union[Vowel, Consonant, Schwa, Break]:
    """Parse a single IPA symbol string into its gruut IPA object.

    Break symbols are recognised first. Otherwise an optional leading stress
    mark and an optional trailing length mark are stripped, and the remaining
    text is looked up in VOWELS, then CONSONANTS, then SCHWAS.

    A trailing ``IPA.HALF_LONG`` yields ``PhonemeLength.SHORT`` and a trailing
    ``IPA.LONG`` yields ``PhonemeLength.LONG``. Stress is only applied to
    vowels; for consonants and schwas the stress mark is stripped and dropped.

    Args:
        symbol_str: IPA text for one symbol.

    Returns:
        A Break, or a copy of the matching table entry with the parsed
        length (and, for vowels, stress) applied.

    Raises:
        ValueError: If the string is empty, contains only stress/length
            marks, or does not name a known symbol.
    """
    if not symbol_str:
        raise ValueError("Empty symbol")

    # Breaks take priority over everything else
    known_breaks = (
        (IPA.BREAK_WORD, BreakType.WORD),
        (IPA.BREAK_MINOR, BreakType.MINOR),
        (IPA.BREAK_MAJOR, BreakType.MAJOR),
    )
    for break_text, break_type in known_breaks:
        if symbol_str == break_text:
            return Break(break_type)

    # Leading stress mark (at most one)
    stress: typing.Optional[Stress] = None
    first_char = symbol_str[0]
    if first_char == IPA.STRESS_PRIMARY:
        stress = Stress.PRIMARY
    elif first_char == IPA.STRESS_SECONDARY:
        stress = Stress.SECONDARY

    if stress is not None:
        symbol_str = symbol_str[1:]

    if not symbol_str:
        raise ValueError("No letters")

    # Trailing length mark (at most one)
    length = PhonemeLength.NORMAL
    last_char = symbol_str[-1]
    if last_char == IPA.HALF_LONG:
        length = PhonemeLength.SHORT
    elif last_char == IPA.LONG:
        length = PhonemeLength.LONG

    if length != PhonemeLength.NORMAL:
        symbol_str = symbol_str[:-1]

    if not symbol_str:
        raise ValueError("No letters")

    # Table lookups: vowels, then consonants, then schwas
    vowel = VOWELS.get(symbol_str)
    if vowel is not None:
        return dataclasses.replace(vowel, stress=stress, length=length)

    consonant = CONSONANTS.get(symbol_str)
    if consonant is not None:
        return dataclasses.replace(consonant, length=length)

    schwa = SCHWAS.get(symbol_str)
    if schwa is not None:
        return dataclasses.replace(schwa, length=length)

    raise ValueError(f"Unsupported symbol type: {symbol_str}")


def features_to_vector(features: typing.Mapping[str, str]) -> typing.Sequence[float]:
    """Build a phoneme feature vector from a column-name -> value mapping.

    Columns are emitted in the iteration order of FEATURE_COLUMNS. A column
    missing from ``features`` is treated as FEATURE_EMPTY.

    * Ordinal columns (those in FEATURE_ORDINAL_COLUMNS) contribute a single
      number: the value's index divided by the number of possible values.
    * Every other column contributes a one-hot run, one entry per possible
      value.

    Args:
        features: Mapping of feature column names to their string values.

    Returns:
        A list of floats.

    Raises:
        ValueError: If an ordinal column holds a value that is not among
            that column's possible values.
    """
    result: typing.List[float] = []

    for col_name, choices in FEATURE_COLUMNS.items():
        current = features.get(col_name, FEATURE_EMPTY)

        if col_name in FEATURE_ORDINAL_COLUMNS:
            # Position of the value, scaled by the number of choices
            result.append(choices.index(current) / len(choices))
            continue

        # One-hot: 1.0 at the matching choice, 0.0 elsewhere
        result.extend(1.0 if (choice == current) else 0.0 for choice in choices)

    return result


def vector_to_features(vector: typing.Sequence[float]) -> typing.Mapping[str, str]:
    """Decode a phoneme feature vector into a column-name -> value mapping.

    For each column in FEATURE_COLUMNS, FEATURE_KEYS gives the position of
    that column inside ``vector``: an int index for ordinal columns, a slice
    for one-hot columns.

    Args:
        vector: Feature vector; one-hot sections must contain an exact 1.0.

    Returns:
        A dict with one entry per column in FEATURE_COLUMNS.

    Raises:
        AssertionError: If a column's key has the wrong kind, or a one-hot
            section contains no 1.0 (only while assertions are enabled).
    """
    decoded: typing.Dict[str, str] = {}

    for col_name, values in FEATURE_COLUMNS.items():
        col_key = FEATURE_KEYS[col_name]

        if col_name not in FEATURE_ORDINAL_COLUMNS:
            # One-hot section: the position of the 1.0 selects the value
            assert isinstance(col_key, slice)
            one_hot = vector[col_key]
            assert 1.0 in one_hot, (col_name, col_key, one_hot)
            val_idx = one_hot.index(1.0)
        else:
            # Ordinal: undo the division by the number of possible values
            assert isinstance(col_key, int)
            val_idx = int(vector[col_key] * len(values))

        decoded[col_name] = values[val_idx]

    return decoded
first commit 2024-05-03 04:18:51 +03:00			`#!/usr/bin/env python3`
			`"""Functions for converting IPA symbols to and from feature vectors."""`
			`import dataclasses`
			`import typing`

			`from gruut_ipa.constants import (`
			`CONSONANTS,`
			`FEATURE_COLUMNS,`
			`FEATURE_EMPTY,`
			`FEATURE_KEYS,`
			`FEATURE_ORDINAL_COLUMNS,`
			`IPA,`
			`SCHWAS,`
			`VOWELS,`
			`Break,`
			`BreakType,`
			`Consonant,`
			`ConsonantPlace,`
			`ConsonantType,`
			`PhonemeLength,`
			`Schwa,`
			`Stress,`
			`Vowel,`
			`VowelHeight,`
			`VowelPlacement,`
			`)`


			`def to_vector(`
			`symbol: typing.Union[Vowel, Consonant, Schwa, Break]`
			`) -> typing.Sequence[float]:`
			`"""Converts a symbol into a feature vector"""`
			`features: typing.Dict[str, str] = {}`

			`if isinstance(symbol, Vowel):`
			`features["symbol_type"] = "phoneme"`
			`features["phoneme_type"] = "vowel"`
			`features["vowel_height"] = symbol.height.value`
			`features["vowel_place"] = symbol.placement.value`
			`features["vowel_rounded"] = "rounded" if symbol.rounded else "unrounded"`
			`features["phoneme_length"] = symbol.length.value`

			`if symbol.nasalated:`
			`features["diacritic"] = "nasalated"`

			`if symbol.stress is not None:`
			`features["vowel_stress"] = symbol.stress.value`

			`elif isinstance(symbol, Consonant):`
			`features["symbol_type"] = "phoneme"`
			`features["phoneme_type"] = "consonant"`
			`features["consonant_voiced"] = "voiced" if symbol.voiced else "unvoiced"`
			`features["consonant_type"] = symbol.type.value`
			`features["consonant_place"] = symbol.place.value`
			`features["consonant_sounds_like"] = symbol.sounds_like.value`
			`features["phoneme_length"] = symbol.length.value`

			`if symbol.velarized:`
			`features["diacritic"] = "velarized"`

			`elif isinstance(symbol, Schwa):`
			`features["symbol_type"] = "phoneme"`
			`features["phoneme_type"] = "schwa"`
			`features["phoneme_length"] = symbol.length.value`

			`if symbol.r_coloured:`
			`features["consonant_sounds_like"] = "r"`

			`elif isinstance(symbol, Break):`
			`features["symbol_type"] = "break"`
			`features["break_type"] = symbol.type.value`
			`else:`
			`# Unsupported symbol type`
			`raise ValueError(symbol)`

			`return features_to_vector(features)`


			`def from_vector(`
			`vector: typing.Sequence[float],`
			`) -> typing.Union[Vowel, Consonant, Schwa, Break]:`
			`"""Converts a feature vector back into a symbol"""`
			`features = vector_to_features(vector)`
			`if features["symbol_type"] == "break":`
			`break_type = BreakType(features["break_type"])`
			`return Break(break_type)`

			`if features["symbol_type"] == "phoneme":`
			`if features["phoneme_type"] == "vowel":`
			`height = VowelHeight(features["vowel_height"])`
			`placement = VowelPlacement(features["vowel_place"])`
			`rounded = features["vowel_rounded"] == "rounded"`
			`nasalated = features["diacritic"] == "nasalated"`
			`length = PhonemeLength(features["phoneme_length"])`

			`stress: typing.Optional[Stress] = None`
			`stress_val = features["vowel_stress"]`
			`if stress_val != FEATURE_EMPTY:`
			`stress = Stress(stress_val)`

			`for vowel in VOWELS.values():`
			`if (`
			`(vowel.height == height)`
			`and (vowel.placement == placement)`
			`and (vowel.rounded == rounded)`
			`and (vowel.nasalated == nasalated)`
			`):`
			`if (stress is None) and (length == PhonemeLength.NORMAL):`
			`# Don't need to make a copy`
			`return vowel`

			`return dataclasses.replace(vowel, stress=stress)`

			`raise ValueError(f"Unknown vowel: {features}")`

			`if features["phoneme_type"] == "consonant":`
			`c_type = ConsonantType(features["consonant_type"])`
			`place = ConsonantPlace(features["consonant_place"])`
			`voiced = features["consonant_voiced"] == "voiced"`
			`velarized = features["diacritic"] == "velarized"`
			`length = PhonemeLength(features["phoneme_length"])`

			`for consonant in CONSONANTS.values():`
			`if (`
			`(consonant.type == c_type)`
			`and (consonant.place == place)`
			`and (consonant.voiced == voiced)`
			`and (consonant.velarized == velarized)`
			`):`
			`if length == PhonemeLength.NORMAL:`
			`# Don't need to make a copy`
			`return consonant`

			`return dataclasses.replace(consonant, length=length)`

			`raise ValueError(f"Unknown vowel: {features}")`

			`if features["phoneme_type"] == "schwa":`
			`r_coloured = features["consonant_sounds_like"] == "r"`
			`length = PhonemeLength(features["phoneme_length"])`

			`for schwa in SCHWAS.values():`
			`if schwa.r_coloured == r_coloured:`
			`if length == PhonemeLength.NORMAL:`
			`# Don't need to make a copy`
			`return schwa`

			`return dataclasses.replace(schwa, length=length)`

			`raise ValueError(f"Unknown vowel: {features}")`

			`# Unsupported phoneme type`
			`raise ValueError(f"Unknown phoneme type: {features}")`

			`# Unsupported symbol type`
			`raise ValueError(f"Unknown symbol type: {features}")`


			`def string_to_symbol(symbol_str: str) -> typing.Union[Vowel, Consonant, Schwa, Break]:`
			`"""Get gruut IPA object for IPA symbol"""`
			`if not symbol_str:`
			`raise ValueError("Empty symbol")`

			`# Check break first`
			`if symbol_str == IPA.BREAK_WORD:`
			`return Break(BreakType.WORD)`

			`if symbol_str == IPA.BREAK_MINOR:`
			`return Break(BreakType.MINOR)`

			`if symbol_str == IPA.BREAK_MAJOR:`
			`return Break(BreakType.MAJOR)`

			`# Strip stress`
			`maybe_stress: typing.Optional[Stress] = None`
			`if symbol_str[0] == IPA.STRESS_PRIMARY:`
			`maybe_stress = Stress.PRIMARY`
			`symbol_str = symbol_str[1:]`
			`elif symbol_str[0] == IPA.STRESS_SECONDARY:`
			`maybe_stress = Stress.SECONDARY`
			`symbol_str = symbol_str[1:]`

			`if not symbol_str:`
			`raise ValueError("No letters")`

			`# Strip length`
			`length = PhonemeLength.NORMAL`
			`if symbol_str[-1] == IPA.HALF_LONG:`
			`length = PhonemeLength.SHORT`
			`symbol_str = symbol_str[:-1]`
			`elif symbol_str[-1] == IPA.LONG:`
			`length = PhonemeLength.LONG`
			`symbol_str = symbol_str[:-1]`

			`if not symbol_str:`
			`raise ValueError("No letters")`

			`# Look up`
			`maybe_vowel = VOWELS.get(symbol_str)`
			`if maybe_vowel is not None:`
			`return dataclasses.replace(maybe_vowel, stress=maybe_stress, length=length)`

			`maybe_consonant = CONSONANTS.get(symbol_str)`
			`if maybe_consonant is not None:`
			`return dataclasses.replace(maybe_consonant, length=length)`

			`maybe_schwa = SCHWAS.get(symbol_str)`
			`if maybe_schwa is not None:`
			`return dataclasses.replace(maybe_schwa, length=length)`

			`raise ValueError(f"Unsupported symbol type: {symbol_str}")`


			`def features_to_vector(features: typing.Mapping[str, str]) -> typing.Sequence[float]:`
			`"""Create phoneme feature vector from mapping"""`
			`vector: typing.List[float] = []`

			`for col, values in FEATURE_COLUMNS.items():`
			`value = features.get(col, FEATURE_EMPTY)`

			`if col in FEATURE_ORDINAL_COLUMNS:`
			`# Single value normalized by number of possible values`
			`vector.append(values.index(value) / len(values))`
			`else:`
			`# One-hot vector`
			`for v in values:`
			`vector.append(1.0 if (v == value) else 0.0)`

			`return vector`


			`def vector_to_features(vector: typing.Sequence[float]) -> typing.Mapping[str, str]:`
			`"""Create mapping from phoneme feature vector"""`
			`features: typing.Dict[str, str] = {}`

			`for col_name, values in FEATURE_COLUMNS.items():`
			`col_key = FEATURE_KEYS[col_name]`
			`if col_name in FEATURE_ORDINAL_COLUMNS:`
			`# Single value normalized by number of possible values`
			`assert isinstance(col_key, int)`
			`val_idx = int(vector[col_key] * len(values))`
			`else:`
			`# One-hot vector`
			`assert isinstance(col_key, slice)`
			`if 1.0 not in vector[col_key]:`
			`assert False, (col_name, col_key, vector[col_key])`
			`val_idx = vector[col_key].index(1.0)`

			`features[col_name] = values[val_idx]`

			`return features`