ai-content-maker/.venv/Lib/site-packages/TTS/tts/configs/neuralhmm_tts_config.py

from dataclasses import dataclass, field
from typing import List

from TTS.tts.configs.shared_configs import BaseTTSConfig


@dataclass
class NeuralhmmTTSConfig(BaseTTSConfig):
    """
    Define parameters for Neural HMM TTS model.

    Example:

        >>> from TTS.tts.configs.overflow_config import OverflowConfig
        >>> config = OverflowConfig()

    Args:
        model (str):
            Model name used to select the right model class to initilize. Defaults to `Overflow`.
        run_eval_steps (int):
            Run evalulation epoch after N steps. If None, waits until training epoch is completed. Defaults to None.
        save_step (int):
            Save local checkpoint every save_step steps. Defaults to 500.
        plot_step (int):
            Plot training stats on the logger every plot_step steps. Defaults to 1.
        model_param_stats (bool):
            Log model parameters stats on the logger dashboard. Defaults to False.
        force_generate_statistics (bool):
            Force generate mel normalization statistics. Defaults to False.
        mel_statistics_parameter_path (str):
            Path to the mel normalization statistics.If the model doesn't finds a file there it will generate statistics.
            Defaults to None.
        num_chars (int):
            Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
        state_per_phone (int):
            Generates N states per phone. Similar, to `add_blank` parameter in GlowTTS but in Overflow it is upsampled by model's encoder. Defaults to 2.
        encoder_in_out_features (int):
            Channels of encoder input and character embedding tensors. Defaults to 512.
        encoder_n_convolutions (int):
            Number of convolution layers in the encoder. Defaults to 3.
        out_channels (int):
            Channels of the final model output. It must match the spectragram size. Defaults to 80.
        ar_order (int):
            Autoregressive order of the model. Defaults to 1. In ablations of Neural HMM it was found that more autoregression while giving more variation hurts naturalness of the synthesised audio.
        sampling_temp (float):
            Variation added to the sample from the latent space of neural HMM. Defaults to 0.334.
        deterministic_transition (bool):
            deterministic duration generation based on duration quantiles as defiend in "S. Ronanki, O. Watts, S. King, and G. E. Henter, “Medianbased generation of synthetic speech durations using a nonparametric approach,” in Proc. SLT, 2016.". Defaults to True.
        duration_threshold (float):
            Threshold for duration quantiles. Defaults to 0.55. Tune this to change the speaking rate of the synthesis, where lower values defines a slower speaking rate and higher values defines a faster speaking rate.
        use_grad_checkpointing (bool):
            Use gradient checkpointing to save memory. In a multi-GPU setting currently pytorch does not supports gradient checkpoint inside a loop so we will have to turn it off then.Adjust depending on whatever get more batch size either by using a single GPU or multi-GPU. Defaults to True.
        max_sampling_time (int):
            Maximum sampling time while synthesising latents from neural HMM. Defaults to 1000.
        prenet_type (str):
            `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
            Prenet. Defaults to `original`.
        prenet_dim (int):
            Dimension of the Prenet. Defaults to 256.
        prenet_n_layers (int):
            Number of layers in the Prenet. Defaults to 2.
        prenet_dropout (float):
            Dropout rate of the Prenet. Defaults to 0.5.
        prenet_dropout_at_inference (bool):
            Use dropout at inference time. Defaults to False.
        memory_rnn_dim (int):
            Dimension of the memory LSTM to process the prenet output. Defaults to 1024.
        outputnet_size (list[int]):
            Size of the output network inside the neural HMM. Defaults to [1024].
        flat_start_params (dict):
            Parameters for the flat start initialization of the neural HMM. Defaults to `{"mean": 0.0, "std": 1.0, "transition_p": 0.14}`.
            It will be recomputed when you pass the dataset.
        std_floor (float):
            Floor value for the standard deviation of the neural HMM. Prevents model cheating by putting point mass and getting infinite likelihood at any datapoint. Defaults to 0.01.
            It is called `variance flooring` in standard HMM literature.
        optimizer (str):
            Optimizer to use for training. Defaults to `adam`.
        optimizer_params (dict):
            Parameters for the optimizer. Defaults to `{"weight_decay": 1e-6}`.
        grad_clip (float):
            Gradient clipping threshold. Defaults to 40_000.
        lr (float):
            Learning rate. Defaults to 1e-3.
        lr_scheduler (str):
            Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
            `TTS.utils.training`. Defaults to `None`.
        min_seq_len (int):
            Minimum input sequence length to be used at training.
        max_seq_len (int):
            Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
    """

    model: str = "NeuralHMM_TTS"

    # Training and Checkpoint configs
    run_eval_steps: int = 100
    save_step: int = 500
    plot_step: int = 1
    model_param_stats: bool = False

    # data parameters
    force_generate_statistics: bool = False
    mel_statistics_parameter_path: str = None

    # Encoder parameters
    num_chars: int = None
    state_per_phone: int = 2
    encoder_in_out_features: int = 512
    encoder_n_convolutions: int = 3

    # HMM parameters
    out_channels: int = 80
    ar_order: int = 1
    sampling_temp: float = 0
    deterministic_transition: bool = True
    duration_threshold: float = 0.43
    use_grad_checkpointing: bool = True
    max_sampling_time: int = 1000

    ## Prenet parameters
    prenet_type: str = "original"
    prenet_dim: int = 256
    prenet_n_layers: int = 2
    prenet_dropout: float = 0.5
    prenet_dropout_at_inference: bool = True
    memory_rnn_dim: int = 1024

    ## Outputnet parameters
    outputnet_size: List[int] = field(default_factory=lambda: [1024])
    flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14})
    std_floor: float = 0.001

    # optimizer parameters
    optimizer: str = "Adam"
    optimizer_params: dict = field(default_factory=lambda: {"weight_decay": 1e-6})
    grad_clip: float = 40000.0
    lr: float = 1e-3
    lr_scheduler: str = None

    # overrides
    min_text_len: int = 10
    max_text_len: int = 500
    min_audio_len: int = 512

    # testing
    test_sentences: List[str] = field(
        default_factory=lambda: [
            "Be a voice, not an echo.",
        ]
    )

    # Extra needed config
    r: int = 1
    use_d_vector_file: bool = False
    use_speaker_embedding: bool = False

    def check_values(self):
        """Validate the hyperparameters.

        Raises:
            AssertionError: when the parameters network is not defined
            AssertionError: transition probability is not between 0 and 1
        """
        assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model."
        assert (
            len(self.outputnet_size) >= 1
        ), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}"
        assert (
            0 < self.flat_start_params["transition_p"] < 1
        ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"
first commit 2024-05-03 04:18:51 +03:00			`from dataclasses import dataclass, field`
			`from typing import List`

			`from TTS.tts.configs.shared_configs import BaseTTSConfig`


			`@dataclass`
			`class NeuralhmmTTSConfig(BaseTTSConfig):`
			`"""`
			`Define parameters for Neural HMM TTS model.`

			`Example:`

			`>>> from TTS.tts.configs.overflow_config import OverflowConfig`
			`>>> config = OverflowConfig()`

			`Args:`
			`model (str):`
			Model name used to select the right model class to initilize. Defaults to `Overflow`.
			`run_eval_steps (int):`
			`Run evalulation epoch after N steps. If None, waits until training epoch is completed. Defaults to None.`
			`save_step (int):`
			`Save local checkpoint every save_step steps. Defaults to 500.`
			`plot_step (int):`
			`Plot training stats on the logger every plot_step steps. Defaults to 1.`
			`model_param_stats (bool):`
			`Log model parameters stats on the logger dashboard. Defaults to False.`
			`force_generate_statistics (bool):`
			`Force generate mel normalization statistics. Defaults to False.`
			`mel_statistics_parameter_path (str):`
			`Path to the mel normalization statistics.If the model doesn't finds a file there it will generate statistics.`
			`Defaults to None.`
			`num_chars (int):`
			`Number of characters used by the model. It must be defined before initializing the model. Defaults to None.`
			`state_per_phone (int):`
			Generates N states per phone. Similar, to `add_blank` parameter in GlowTTS but in Overflow it is upsampled by model's encoder. Defaults to 2.
			`encoder_in_out_features (int):`
			`Channels of encoder input and character embedding tensors. Defaults to 512.`
			`encoder_n_convolutions (int):`
			`Number of convolution layers in the encoder. Defaults to 3.`
			`out_channels (int):`
			`Channels of the final model output. It must match the spectragram size. Defaults to 80.`
			`ar_order (int):`
			`Autoregressive order of the model. Defaults to 1. In ablations of Neural HMM it was found that more autoregression while giving more variation hurts naturalness of the synthesised audio.`
			`sampling_temp (float):`
			`Variation added to the sample from the latent space of neural HMM. Defaults to 0.334.`
			`deterministic_transition (bool):`
			`deterministic duration generation based on duration quantiles as defiend in "S. Ronanki, O. Watts, S. King, and G. E. Henter, “Medianbased generation of synthetic speech durations using a nonparametric approach,” in Proc. SLT, 2016.". Defaults to True.`
			`duration_threshold (float):`
			`Threshold for duration quantiles. Defaults to 0.55. Tune this to change the speaking rate of the synthesis, where lower values defines a slower speaking rate and higher values defines a faster speaking rate.`
			`use_grad_checkpointing (bool):`
			`Use gradient checkpointing to save memory. In a multi-GPU setting currently pytorch does not supports gradient checkpoint inside a loop so we will have to turn it off then.Adjust depending on whatever get more batch size either by using a single GPU or multi-GPU. Defaults to True.`
			`max_sampling_time (int):`
			`Maximum sampling time while synthesising latents from neural HMM. Defaults to 1000.`
			`prenet_type (str):`
			`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
			Prenet. Defaults to `original`.
			`prenet_dim (int):`
			`Dimension of the Prenet. Defaults to 256.`
			`prenet_n_layers (int):`
			`Number of layers in the Prenet. Defaults to 2.`
			`prenet_dropout (float):`
			`Dropout rate of the Prenet. Defaults to 0.5.`
			`prenet_dropout_at_inference (bool):`
			`Use dropout at inference time. Defaults to False.`
			`memory_rnn_dim (int):`
			`Dimension of the memory LSTM to process the prenet output. Defaults to 1024.`
			`outputnet_size (list[int]):`
			`Size of the output network inside the neural HMM. Defaults to [1024].`
			`flat_start_params (dict):`
			Parameters for the flat start initialization of the neural HMM. Defaults to `{"mean": 0.0, "std": 1.0, "transition_p": 0.14}`.
			`It will be recomputed when you pass the dataset.`
			`std_floor (float):`
			`Floor value for the standard deviation of the neural HMM. Prevents model cheating by putting point mass and getting infinite likelihood at any datapoint. Defaults to 0.01.`
			It is called `variance flooring` in standard HMM literature.
			`optimizer (str):`
			Optimizer to use for training. Defaults to `adam`.
			`optimizer_params (dict):`
			Parameters for the optimizer. Defaults to `{"weight_decay": 1e-6}`.
			`grad_clip (float):`
			`Gradient clipping threshold. Defaults to 40_000.`
			`lr (float):`
			`Learning rate. Defaults to 1e-3.`
			`lr_scheduler (str):`
			Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
			`TTS.utils.training`. Defaults to `None`.
			`min_seq_len (int):`
			`Minimum input sequence length to be used at training.`
			`max_seq_len (int):`
			`Maximum input sequence length to be used at training. Larger values result in more VRAM usage.`
			`"""`

			`model: str = "NeuralHMM_TTS"`

			`# Training and Checkpoint configs`
			`run_eval_steps: int = 100`
			`save_step: int = 500`
			`plot_step: int = 1`
			`model_param_stats: bool = False`

			`# data parameters`
			`force_generate_statistics: bool = False`
			`mel_statistics_parameter_path: str = None`

			`# Encoder parameters`
			`num_chars: int = None`
			`state_per_phone: int = 2`
			`encoder_in_out_features: int = 512`
			`encoder_n_convolutions: int = 3`

			`# HMM parameters`
			`out_channels: int = 80`
			`ar_order: int = 1`
			`sampling_temp: float = 0`
			`deterministic_transition: bool = True`
			`duration_threshold: float = 0.43`
			`use_grad_checkpointing: bool = True`
			`max_sampling_time: int = 1000`

			`## Prenet parameters`
			`prenet_type: str = "original"`
			`prenet_dim: int = 256`
			`prenet_n_layers: int = 2`
			`prenet_dropout: float = 0.5`
			`prenet_dropout_at_inference: bool = True`
			`memory_rnn_dim: int = 1024`

			`## Outputnet parameters`
			`outputnet_size: List[int] = field(default_factory=lambda: [1024])`
			`flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14})`
			`std_floor: float = 0.001`

			`# optimizer parameters`
			`optimizer: str = "Adam"`
			`optimizer_params: dict = field(default_factory=lambda: {"weight_decay": 1e-6})`
			`grad_clip: float = 40000.0`
			`lr: float = 1e-3`
			`lr_scheduler: str = None`

			`# overrides`
			`min_text_len: int = 10`
			`max_text_len: int = 500`
			`min_audio_len: int = 512`

			`# testing`
			`test_sentences: List[str] = field(`
			`default_factory=lambda: [`
			`"Be a voice, not an echo.",`
			`]`
			`)`

			`# Extra needed config`
			`r: int = 1`
			`use_d_vector_file: bool = False`
			`use_speaker_embedding: bool = False`

			`def check_values(self):`
			`"""Validate the hyperparameters.`

			`Raises:`
			`AssertionError: when the parameters network is not defined`
			`AssertionError: transition probability is not between 0 and 1`
			`"""`
			`assert self.ar_order > 0, "AR order must be greater than 0 it is an autoregressive model."`
			`assert (`
			`len(self.outputnet_size) >= 1`
			`), f"Parameter Network must have atleast one layer check the config file for parameter network. Provided: {self.parameternetwork}"`
			`assert (`
			`0 < self.flat_start_params["transition_p"] < 1`
			`), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"`