from dataclasses import dataclass, field

from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
from TTS.vocoder.models.wavernn import WavernnArgs


@dataclass
class WavernnConfig(BaseVocoderConfig):
    """Defines parameters for the WaveRNN vocoder.

    Example:

        >>> from TTS.vocoder.configs import WavernnConfig
        >>> config = WavernnConfig()
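        Overriding a nested model argument (a sketch; `num_res_blocks` is taken from the
        `wavernn_model_params` defaults documented below):

        >>> from TTS.vocoder.models.wavernn import WavernnArgs
        >>> config = WavernnConfig(model_args=WavernnArgs(num_res_blocks=12))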
    Args:
        model (str):
            Model name used for selecting the right model at initialization. Defaults to `wavernn`.
        mode (str):
            Output mode of the WaveRNN vocoder. `mold` for a Mixture of Logistics distribution, `gauss` for a
            single Gaussian distribution, and `bits` for quantized bits as the model's output.
        mulaw (bool):
            enable / disable the use of Mulaw quantization for training. Only applicable if `mode == 'bits'`.
            Defaults to `True`.
        generator_model (str):
            One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is also
            considered a generator. Defaults to `WaveRNN`.
        wavernn_model_params (dict):
            kwargs for the WaveRNN model. Defaults to
            `{
                "rnn_dims": 512,
                "fc_dims": 512,
                "compute_dims": 128,
                "res_out_dims": 128,
                "num_res_blocks": 10,
                "use_aux_net": True,
                "use_upsample_net": True,
                "upsample_factors": [4, 8, 8]
            }`
        batched (bool):
            enable / disable batched inference. It speeds up inference by splitting the input into segments,
            processing the segments in a batch, and then merging the outputs with a certain overlap and smoothing.
            If disabled, inference without CUDA is too slow to be practical. Defaults to True.
        target_samples (int):
            Size of the segments in batched mode. Defaults to 11000.
        overlap_samples (int):
            Size of the overlap between consecutive segments. Defaults to 550.
        batch_size (int):
            Batch size used at training. Larger values use more memory. Defaults to 256.
        seq_len (int):
            Audio segment length used at training. Larger values use more memory. Defaults to 1280.
        use_noise_augment (bool):
            enable / disable random noise added to the input waveform. The noise is added after computing the
            features. Defaults to False.
        use_cache (bool):
            enable / disable in-memory caching of the computed features. It can cause an OOM error if the system
            RAM is not large enough. Defaults to True.
        mixed_precision (bool):
            enable / disable mixed precision training. Defaults to True.
        eval_split_size (int):
            Number of samples used for evaluation. Defaults to 50.
        num_epochs_before_test (int):
            Number of epochs waited to run the next evaluation. Since inference takes some time, it is better to
            wait some number of epochs so as not to waste training time. Defaults to 10.
        grad_clip (float):
            Gradient clipping threshold. If <= 0.0, no clipping is applied. Defaults to 4.0.
        lr (float):
            Initial learning rate. Defaults to 1e-4.
        lr_scheduler (str):
            One of the learning rate schedulers from `torch.optim.lr_scheduler.*`. Defaults to `MultiStepLR`.
        lr_scheduler_params (dict):
            kwargs for the scheduler. Defaults to `{"gamma": 0.5, "milestones": [200000, 400000, 600000]}`.
    """

    model: str = "wavernn"

    # Model specific params
    model_args: WavernnArgs = field(default_factory=WavernnArgs)
    target_loss: str = "loss"

    # Inference
    batched: bool = True
    target_samples: int = 11000
    overlap_samples: int = 550
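    # Note (paraphrasing the docstring above): in batched inference the input is split into
    # segments of `target_samples`, the segments are generated in one batch, and the outputs
    # are merged back together with `overlap_samples` of overlap and smoothing at the seams.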

    # Training - overrides
    epochs: int = 10000
    batch_size: int = 256
    seq_len: int = 1280
    use_noise_augment: bool = False
    use_cache: bool = True
    mixed_precision: bool = True
    eval_split_size: int = 50
    num_epochs_before_test: int = (
        10  # number of epochs to wait until the next test run (synthesizing a full audio clip).
    )

    # optimizer overrides
    grad_clip: float = 4.0
    lr: float = 1e-4  # Initial learning rate.
    lr_scheduler: str = "MultiStepLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
    lr_scheduler_params: dict = field(default_factory=lambda: {"gamma": 0.5, "milestones": [200000, 400000, 600000]})
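    # Illustrative note (an assumption about how the trainer consumes these fields, not
    # something this config executes): with the defaults above the learning rate is decayed
    # by `gamma=0.5` at each milestone (200k, 400k, 600k steps), roughly equivalent to
    # `torch.optim.lr_scheduler.MultiStepLR(optimizer, **lr_scheduler_params)`.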
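
# A minimal usage sketch (illustrative only; the overrides below are hypothetical values
# picked to show the config surface, and nothing here runs on import):
if __name__ == "__main__":
    config = WavernnConfig(
        batch_size=64,  # smaller training batch to reduce memory use
        target_samples=8000,  # shorter segments for batched inference
        overlap_samples=400,  # overlap used when merging the segments
    )
    print(config.model, config.lr, config.lr_scheduler_params)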