import os
import random
from typing import Dict, List, Tuple, Union

import torch
import torch.distributed as dist
from coqpit import Coqpit
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.sampler import WeightedRandomSampler
from trainer.torch import DistributedSampler, DistributedSamplerWrapper

from TTS.model import BaseTrainerModel
from TTS.tts.datasets.dataset import TTSDataset
from TTS.tts.utils.data import get_length_balancer_weights
from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram

# pylint: skip-file

class BaseVC(BaseTrainerModel):
    """Base `vc` class. Every new `vc` model must inherit this.

    It defines common `vc` specific functions on top of the `Model` implementation.
    """

    MODEL_TYPE = "vc"

    def __init__(
        self,
        config: Coqpit,
        ap: "AudioProcessor",
        speaker_manager: SpeakerManager = None,
        language_manager: LanguageManager = None,
    ):
        super().__init__()
        self.config = config
        self.ap = ap
        self.speaker_manager = speaker_manager
        self.language_manager = language_manager
        self._set_model_args(config)

    def _set_model_args(self, config: Coqpit):
        """Set up model args based on the config type (`ModelConfig` or `ModelArgs`).

        `ModelArgs` has all the fields required to initialize the model architecture.

        `ModelConfig` has all the fields required for training and inference, and contains `ModelArgs`.

        If the config is for training, with a name like "*Config", the model args are embedded in
        `config.model_args`.

        If the config is for the model itself, with a name like "*Args", it is assigned directly.
        """
        # don't use `isinstance` to avoid a recursive import
        if "Config" in config.__class__.__name__:
            self.config = config
            self.args = config.model_args
        elif "Args" in config.__class__.__name__:
            self.args = config
        else:
            raise ValueError("config must be either a *Config or *Args")
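
    # Illustrative sketch of the two accepted config flavors (`MyVCConfig` and
    # `MyVCArgs` are hypothetical names, not part of this module):
    #
    #     model._set_model_args(MyVCConfig(...))  # -> self.args = config.model_args
    #     model._set_model_args(MyVCArgs(...))    # -> self.args = config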

    def init_multispeaker(self, config: Coqpit, data: List = None):
        """Initialize a speaker embedding layer if needed and define the expected embedding channel size for
        setting the `in_channels` size of the connected layers.

        This implementation yields 3 possible outcomes:

        1. If `config.use_speaker_embedding` and `config.use_d_vector_file` are False, do nothing.
        2. If `config.use_d_vector_file` is True, set the expected embedding channel size to `config.d_vector_dim` or 512.
        3. If `config.use_speaker_embedding` is True, initialize a speaker embedding layer with a channel size of
        `config.d_vector_dim` or 512.

        You can override this function for new models.

        Args:
            config (Coqpit): Model configuration.
            data (List, optional): Data samples. Unused in this implementation. Defaults to None.
        """
        # set number of speakers
        if self.speaker_manager is not None:
            self.num_speakers = self.speaker_manager.num_speakers
        elif hasattr(config, "num_speakers"):
            self.num_speakers = config.num_speakers

        # set ultimate speaker embedding size
        if config.use_speaker_embedding or config.use_d_vector_file:
            self.embedded_speaker_dim = (
                config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512
            )
        # init speaker embedding layer
        if config.use_speaker_embedding and not config.use_d_vector_file:
            print(" > Init speaker_embedding layer.")
            self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
            self.speaker_embedding.weight.data.normal_(0, 0.3)
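
    # Sketch of how `embedded_speaker_dim` is typically consumed downstream
    # (the layer names here are hypothetical, not part of this class): the
    # speaker embedding is usually concatenated onto the encoder output, so it
    # widens the `in_channels` of the first connected layer.
    #
    #     in_channels = encoder_out_dim + self.embedded_speaker_dim
    #     self.decoder_in_proj = nn.Linear(in_channels, decoder_in_dim)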

    def get_aux_input(self, **kwargs) -> Dict:
        """Prepare and return `aux_input` used by `forward()`."""
        return {"speaker_id": None, "style_wav": None, "d_vector": None, "language_id": None}

    def get_aux_input_from_test_sentences(self, sentence_info):
        if hasattr(self.config, "model_args"):
            config = self.config.model_args
        else:
            config = self.config

        # extract speaker and language info
        text, speaker_name, style_wav, language_name = None, None, None, None

        if isinstance(sentence_info, list):
            if len(sentence_info) == 1:
                text = sentence_info[0]
            elif len(sentence_info) == 2:
                text, speaker_name = sentence_info
            elif len(sentence_info) == 3:
                text, speaker_name, style_wav = sentence_info
            elif len(sentence_info) == 4:
                text, speaker_name, style_wav, language_name = sentence_info
        else:
            text = sentence_info

        # get speaker id/d_vector
        speaker_id, d_vector, language_id = None, None, None
        if self.speaker_manager is not None:
            if config.use_d_vector_file:
                if speaker_name is None:
                    d_vector = self.speaker_manager.get_random_embedding()
                else:
                    d_vector = self.speaker_manager.get_d_vector_by_name(speaker_name)
            elif config.use_speaker_embedding:
                if speaker_name is None:
                    speaker_id = self.speaker_manager.get_random_id()
                else:
                    speaker_id = self.speaker_manager.name_to_id[speaker_name]

        # get language id
        if self.language_manager is not None and config.use_language_embedding and language_name is not None:
            language_id = self.language_manager.name_to_id[language_name]

        return {
            "text": text,
            "speaker_id": speaker_id,
            "style_wav": style_wav,
            "d_vector": d_vector,
            "language_id": language_id,
        }
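
    # Accepted `sentence_info` layouts, matching the parsing above (values are
    # illustrative only):
    #
    #     "Hello."                                  -> text only
    #     ["Hello."]                                -> text only
    #     ["Hello.", "speaker_a"]                   -> + speaker name
    #     ["Hello.", "speaker_a", "ref.wav"]        -> + style wav
    #     ["Hello.", "speaker_a", "ref.wav", "en"]  -> + language name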

    def format_batch(self, batch: Dict) -> Dict:
        """Generic batch formatting for `VCDataset`.

        You must override this if you use a custom dataset.

        Args:
            batch (Dict): A raw batch as produced by the dataset's `collate_fn`.

        Returns:
            Dict: The formatted batch, ready to be consumed by the model.
        """
        # setup input batch
        text_input = batch["token_id"]
        text_lengths = batch["token_id_lengths"]
        speaker_names = batch["speaker_names"]
        linear_input = batch["linear"]
        mel_input = batch["mel"]
        mel_lengths = batch["mel_lengths"]
        stop_targets = batch["stop_targets"]
        item_idx = batch["item_idxs"]
        d_vectors = batch["d_vectors"]
        speaker_ids = batch["speaker_ids"]
        attn_mask = batch["attns"]
        waveform = batch["waveform"]
        pitch = batch["pitch"]
        energy = batch["energy"]
        language_ids = batch["language_ids"]
        max_text_length = torch.max(text_lengths.float())
        max_spec_length = torch.max(mel_lengths.float())

        # compute durations from attention masks
        durations = None
        if attn_mask is not None:
            durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2])
            for idx, am in enumerate(attn_mask):
                # compute raw durations
                c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1]
                # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True)
                c_idxs, counts = torch.unique(c_idxs, return_counts=True)
                dur = torch.ones([text_lengths[idx]]).to(counts.dtype)
                dur[c_idxs] = counts
                # smooth the durations and set any 0 duration to 1
                # by cutting off from the largest duration indices.
                extra_frames = dur.sum() - mel_lengths[idx]
                largest_idxs = torch.argsort(-dur)[:extra_frames]
                dur[largest_idxs] -= 1
                assert (
                    dur.sum() == mel_lengths[idx]
                ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}"
                durations[idx, : text_lengths[idx]] = dur
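
        # In short: for each spectrogram frame, `max(1)[1]` picks the most
        # attended text token; counting frames per token gives per-token
        # durations, and any surplus frames are shaved off the longest tokens
        # so the durations sum exactly to the spectrogram length.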

        # set stop targets wrt reduction factor
        stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1)
        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
        stop_target_lengths = torch.divide(mel_lengths, self.config.r).ceil_()
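
        # With a reduction factor `r`, the decoder emits `r` frames per step,
        # so stop flags are pooled over windows of `r` frames: a window counts
        # as a stop as soon as any frame inside it does.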

        return {
            "text_input": text_input,
            "text_lengths": text_lengths,
            "speaker_names": speaker_names,
            "mel_input": mel_input,
            "mel_lengths": mel_lengths,
            "linear_input": linear_input,
            "stop_targets": stop_targets,
            "stop_target_lengths": stop_target_lengths,
            "attn_mask": attn_mask,
            "durations": durations,
            "speaker_ids": speaker_ids,
            "d_vectors": d_vectors,
            "max_text_length": float(max_text_length),
            "max_spec_length": float(max_spec_length),
            "item_idx": item_idx,
            "waveform": waveform,
            "pitch": pitch,
            "energy": energy,
            "language_ids": language_ids,
            "audio_unique_names": batch["audio_unique_names"],
        }

    def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1):
        weights = None
        data_items = dataset.samples

        if getattr(config, "use_language_weighted_sampler", False):
            alpha = getattr(config, "language_weighted_sampler_alpha", 1.0)
            print(" > Using Language weighted sampler with alpha:", alpha)
            weights = get_language_balancer_weights(data_items) * alpha

        if getattr(config, "use_speaker_weighted_sampler", False):
            alpha = getattr(config, "speaker_weighted_sampler_alpha", 1.0)
            print(" > Using Speaker weighted sampler with alpha:", alpha)
            if weights is not None:
                weights += get_speaker_balancer_weights(data_items) * alpha
            else:
                weights = get_speaker_balancer_weights(data_items) * alpha

        if getattr(config, "use_length_weighted_sampler", False):
            alpha = getattr(config, "length_weighted_sampler_alpha", 1.0)
            print(" > Using Length weighted sampler with alpha:", alpha)
            if weights is not None:
                weights += get_length_balancer_weights(data_items) * alpha
            else:
                weights = get_length_balancer_weights(data_items) * alpha

        if weights is not None:
            sampler = WeightedRandomSampler(weights, len(weights))
        else:
            sampler = None

        # sampler for DDP
        if sampler is None:
            sampler = DistributedSampler(dataset) if num_gpus > 1 else None
        else:  # if a sampler is already defined, combine it with the DDP sampler wrapper
            sampler = DistributedSamplerWrapper(sampler) if num_gpus > 1 else sampler

        return sampler
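
    # Example of how the balancer weights combine (config values hypothetical):
    # with both `use_language_weighted_sampler` and
    # `use_speaker_weighted_sampler` enabled, the per-sample weights are summed,
    #
    #     weights = lang_weights * lang_alpha + speaker_weights * speaker_alpha
    #
    # so a sample's draw probability reflects both balancing terms at once.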

    def get_data_loader(
        self,
        config: Coqpit,
        assets: Dict,
        is_eval: bool,
        samples: Union[List[Dict], List[List]],
        verbose: bool,
        num_gpus: int,
        rank: int = None,
    ) -> "DataLoader":
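        # NOTE: `assets` and `rank` appear to be accepted only for trainer API
        # compatibility; they are not used directly here, and the DDP-aware
        # samplers from `get_sampler` handle per-process sharding themselves.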
        if is_eval and not config.run_eval:
            loader = None
        else:
            # setup multi-speaker attributes
            if self.speaker_manager is not None:
                if hasattr(config, "model_args"):
                    speaker_id_mapping = (
                        self.speaker_manager.name_to_id if config.model_args.use_speaker_embedding else None
                    )
                    d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None
                    config.use_d_vector_file = config.model_args.use_d_vector_file
                else:
                    speaker_id_mapping = self.speaker_manager.name_to_id if config.use_speaker_embedding else None
                    d_vector_mapping = self.speaker_manager.embeddings if config.use_d_vector_file else None
            else:
                speaker_id_mapping = None
                d_vector_mapping = None

            # setup multi-lingual attributes
            if self.language_manager is not None:
                language_id_mapping = self.language_manager.name_to_id if self.args.use_language_embedding else None
            else:
                language_id_mapping = None

            # init dataloader
            dataset = TTSDataset(
                outputs_per_step=config.r if "r" in config else 1,
                compute_linear_spec=config.model.lower() == "tacotron" or config.compute_linear_spec,
                compute_f0=config.get("compute_f0", False),
                f0_cache_path=config.get("f0_cache_path", None),
                compute_energy=config.get("compute_energy", False),
                energy_cache_path=config.get("energy_cache_path", None),
                samples=samples,
                ap=self.ap,
                return_wav=config.return_wav if "return_wav" in config else False,
                batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size,
                min_text_len=config.min_text_len,
                max_text_len=config.max_text_len,
                min_audio_len=config.min_audio_len,
                max_audio_len=config.max_audio_len,
                phoneme_cache_path=config.phoneme_cache_path,
                precompute_num_workers=config.precompute_num_workers,
                use_noise_augment=False if is_eval else config.use_noise_augment,
                verbose=verbose,
                speaker_id_mapping=speaker_id_mapping,
                d_vector_mapping=d_vector_mapping if config.use_d_vector_file else None,
                tokenizer=None,
                start_by_longest=config.start_by_longest,
                language_id_mapping=language_id_mapping,
            )

            # wait for all the DDP processes to be ready
            if num_gpus > 1:
                dist.barrier()

            # sort input sequences from short to long
            dataset.preprocess_samples()

            # get samplers
            sampler = self.get_sampler(config, dataset, num_gpus)

            loader = DataLoader(
                dataset,
                batch_size=config.eval_batch_size if is_eval else config.batch_size,
                shuffle=config.shuffle if sampler is None else False,  # only shuffle if there is no other sampler
                collate_fn=dataset.collate_fn,
                drop_last=config.drop_last,  # setting this to False might cause issues in AMP training.
                sampler=sampler,
                num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers,
                pin_memory=False,
            )
        return loader

    def _get_test_aux_input(
        self,
    ) -> Dict:
        d_vector = None
        if self.config.use_d_vector_file:
            d_vector = [self.speaker_manager.embeddings[name]["embedding"] for name in self.speaker_manager.embeddings]
            d_vector = (random.sample(sorted(d_vector), 1),)

        aux_inputs = {
            "speaker_id": None
            if not self.config.use_speaker_embedding
            else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1),
            "d_vector": d_vector,
            "style_wav": None,  # TODO: handle GST style input
        }
        return aux_inputs
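
    # The randomly sampled `speaker_id` / `d_vector` pick one voice for test
    # synthesis; `test_run` below replaces them per sentence whenever the test
    # sentence itself names a speaker.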

    def test_run(self, assets: Dict) -> Tuple[Dict, Dict]:
        """Generic test run for `vc` models used by `Trainer`.

        You can override this for a different behaviour.

        Args:
            assets (dict): A dict of training assets. For `vc` models, it must include `{'audio_processor': ap}`.

        Returns:
            Tuple[Dict, Dict]: Test figures and audios to be logged to Tensorboard.
        """
        print(" | > Synthesizing test sentences.")
        test_audios = {}
        test_figures = {}
        test_sentences = self.config.test_sentences
        aux_inputs = self._get_test_aux_input()
        for idx, sen in enumerate(test_sentences):
            if isinstance(sen, list):
                aux_inputs = self.get_aux_input_from_test_sentences(sen)
                sen = aux_inputs["text"]
            outputs_dict = synthesis(
                self,
                sen,
                self.config,
                "cuda" in str(next(self.parameters()).device),
                speaker_id=aux_inputs["speaker_id"],
                d_vector=aux_inputs["d_vector"],
                style_wav=aux_inputs["style_wav"],
                use_griffin_lim=True,
                do_trim_silence=False,
            )
            test_audios["{}-audio".format(idx)] = outputs_dict["wav"]
            test_figures["{}-prediction".format(idx)] = plot_spectrogram(
                outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False
            )
            test_figures["{}-alignment".format(idx)] = plot_alignment(
                outputs_dict["outputs"]["alignments"], output_fig=False
            )
        return test_figures, test_audios

    def on_init_start(self, trainer):
        """Save `speakers.pth` and `language_ids.json` at the beginning of training and update both paths in the config."""
        if self.speaker_manager is not None:
            output_path = os.path.join(trainer.output_path, "speakers.pth")
            self.speaker_manager.save_ids_to_file(output_path)
            trainer.config.speakers_file = output_path
            # some models don't have `model_args` set
            if hasattr(trainer.config, "model_args"):
                trainer.config.model_args.speakers_file = output_path
            trainer.config.save_json(os.path.join(trainer.output_path, "config.json"))
            print(f" > `speakers.pth` is saved to {output_path}.")
            print(" > `speakers_file` is updated in the config.json.")

        if self.language_manager is not None:
            output_path = os.path.join(trainer.output_path, "language_ids.json")
            self.language_manager.save_ids_to_file(output_path)
            trainer.config.language_ids_file = output_path
            if hasattr(trainer.config, "model_args"):
                trainer.config.model_args.language_ids_file = output_path
            trainer.config.save_json(os.path.join(trainer.output_path, "config.json"))
            print(f" > `language_ids.json` is saved to {output_path}.")
            print(" > `language_ids_file` is updated in the config.json.")
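

# A minimal subclassing sketch (illustrative only; `MyVC` and the bodies of its
# methods are hypothetical and not part of this module):
#
#     class MyVC(BaseVC):
#         def forward(self, batch, aux_input=None):
#             ...
#
#         def inference(self, x, aux_input=None):
#             ...
#
#     model = MyVC(config, ap, speaker_manager=SpeakerManager())
#     loader = model.get_data_loader(
#         config, assets={}, is_eval=False, samples=samples, verbose=True, num_gpus=1
#     )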