223 lines
9.4 KiB
Python
223 lines
9.4 KiB
Python
|
import json
|
||
|
import os
|
||
|
from typing import Any, Dict, List, Union
|
||
|
|
||
|
import fsspec
|
||
|
import numpy as np
|
||
|
import torch
|
||
|
from coqpit import Coqpit
|
||
|
|
||
|
from TTS.config import get_from_config_or_model_args_with_default
|
||
|
from TTS.tts.utils.managers import EmbeddingManager
|
||
|
|
||
|
|
||
|
class SpeakerManager(EmbeddingManager):
    """Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information
    in a way that can be queried by speaker or clip.

    There are 3 different scenarios considered:

    1. Models using speaker embedding layers. The datafile only maps speaker names to ids used by the embedding layer.
    2. Models using d-vectors. The datafile includes a dictionary in the following format.

    ::

        {
            'clip_name.wav': {
                'name': 'speakerA',
                'embedding': [<d_vector_values>]
            },
            ...
        }

    3. Computing the d-vectors by the speaker encoder. It loads the speaker encoder model and
       computes the d-vectors for a given clip or speaker.

    Args:
        data_items (List[List[Any]], optional): Data samples to parse the speaker names from. When non-empty,
            they are passed to ``set_ids_from_data`` with ``parse_key="speaker_name"``. Defaults to None.
        d_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "".
        speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by
            TTS models. Defaults to "".
        encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
        encoder_config_path (str, optional): Path to the speaker encoder config file. Defaults to "".
        use_cuda (bool, optional): Forwarded unchanged to ``EmbeddingManager``. Defaults to False.

    Examples:
        >>> # load audio processor and speaker encoder
        >>> ap = AudioProcessor(**config.audio)
        >>> manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
        >>> # load a sample audio and compute embedding
        >>> waveform = ap.load_wav(sample_wav_path)
        >>> mel = ap.melspectrogram(waveform)
        >>> d_vector = manager.compute_embeddings(mel.T)
    """

    def __init__(
        self,
        data_items: List[List[Any]] = None,
        d_vectors_file_path: str = "",
        speaker_id_file_path: str = "",
        encoder_model_path: str = "",
        encoder_config_path: str = "",
        use_cuda: bool = False,
    ):
        # The generic embedding manager uses "embedding"/"id" naming; map the
        # speaker-specific argument names onto it.
        super().__init__(
            embedding_file_path=d_vectors_file_path,
            id_file_path=speaker_id_file_path,
            encoder_model_path=encoder_model_path,
            encoder_config_path=encoder_config_path,
            use_cuda=use_cuda,
        )

        # `None` and an empty list are both skipped here.
        if data_items:
            self.set_ids_from_data(data_items, parse_key="speaker_name")

    @property
    def num_speakers(self):
        """Number of entries in ``name_to_id``."""
        return len(self.name_to_id)

    @property
    def speaker_names(self):
        """List of the keys of ``name_to_id``."""
        return list(self.name_to_id.keys())

    def get_speakers(self) -> Dict:
        """Return the ``name_to_id`` mapping itself (not a copy)."""
        return self.name_to_id

    @staticmethod
    def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager":
        """Initialize a speaker manager from config

        Args:
            config (Coqpit): Config object.
            samples (Union[List[List], List[Dict]], optional): List of data samples to parse out the speaker names.
                Defaults to None.

        Returns:
            SpeakerManager: Speaker manager object, or None when neither `use_speaker_embedding` (with samples
                or a speaker file) nor `use_d_vector_file` yields a manager.
        """
        lookup = get_from_config_or_model_args_with_default
        speaker_manager = None

        if lookup(config, "use_speaker_embedding", False):
            if samples:
                speaker_manager = SpeakerManager(data_items=samples)
            # Both spellings of the option are honoured; when both are set the
            # later one ("speakers_file") wins, exactly as before.
            for option in ("speaker_file", "speakers_file"):
                ids_file = lookup(config, option, None)
                if ids_file:
                    speaker_manager = SpeakerManager(speaker_id_file_path=ids_file)

        # The d-vector setting replaces any manager built above.
        if lookup(config, "use_d_vector_file", False):
            speaker_manager = SpeakerManager()
            d_vector_file = lookup(config, "d_vector_file", None)
            if d_vector_file:
                speaker_manager = SpeakerManager(d_vectors_file_path=d_vector_file)
        return speaker_manager
|
||
|
|
||
|
|
||
|
def _set_file_path(path):
    """Return the location of an existing `speakers.json` for the given path.

    Two candidates are probed in order: the file next to ``path`` (in its parent
    directory) and the file inside ``path``. This band-aids the different paths
    returned in restored and continued training.

    Raises:
        FileNotFoundError: If neither candidate exists.
    """
    candidates = (
        os.path.join(os.path.dirname(path), "speakers.json"),  # restored training
        os.path.join(path, "speakers.json"),  # continued training
    )
    filesystem = fsspec.get_mapper(path).fs
    for candidate in candidates:
        if filesystem.exists(candidate):
            return candidate
    raise FileNotFoundError(f" [!] `speakers.json` not found in {path}")
|
||
|
|
||
|
|
||
|
def load_speaker_mapping(out_path):
    """Read and return the speaker mapping stored as JSON.

    A path ending in ``.json`` is opened directly; any other path is resolved
    to a `speakers.json` file through ``_set_file_path``.
    """
    _, extension = os.path.splitext(out_path)
    json_file = out_path if extension == ".json" else _set_file_path(out_path)
    with fsspec.open(json_file, "r") as handle:
        return json.load(handle)
|
||
|
|
||
|
|
||
|
def save_speaker_mapping(out_path, speaker_mapping):
    """Save the speaker mapping as JSON to a `speakers.json` file.

    Args:
        out_path: Path used to locate the target file. Nothing is written when it is None.
        speaker_mapping: JSON-serializable object to dump.

    An existing `speakers.json` found by ``_set_file_path`` is overwritten. When
    none exists yet, the mapping is written to ``<out_path>/speakers.json``
    instead of failing with ``FileNotFoundError``.
    """
    if out_path is None:
        return

    try:
        speakers_json_path = _set_file_path(out_path)
    except FileNotFoundError:
        # First save: there is no file to find yet, so create it inside `out_path`.
        speakers_json_path = os.path.join(out_path, "speakers.json")

    with fsspec.open(speakers_json_path, "w") as f:
        json.dump(speaker_mapping, f, indent=4)
|
||
|
|
||
|
|
||
|
def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager:
    """Initiate a `SpeakerManager` instance by the provided config.

    Args:
        c (Coqpit): Model configuration.
        data (List): Data samples used in training to infer speakers from. It must be provided if speaker embedding
            layers is used. Defaults to None.
        restore_path (str): Path to a previous training folder.
        out_path (str, optional): Save the generated speaker IDs to a output path. Defaults to None.

    Returns:
        SpeakerManager: initialized and ready to use instance.

    Raises:
        RuntimeError: If `use_d_vector_file` is set but no usable embedding file is available.
        FileNotFoundError: Propagated from `_set_file_path` when `restore_path` holds no `speakers.json`.
    """
    speaker_manager = SpeakerManager()
    if c.use_speaker_embedding:
        if data is not None:
            speaker_manager.set_ids_from_data(data, parse_key="speaker_name")

        if restore_path:
            # restoring speaker manager from a previous run.
            speakers_file = _set_file_path(restore_path)
            if c.use_d_vector_file:
                # restore speaker manager with the embedding file
                if not os.path.exists(speakers_file):
                    print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.d_vector_file")
                    if not os.path.exists(c.d_vector_file):
                        raise RuntimeError(
                            "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.d_vector_file"
                        )
                    speaker_manager.load_embeddings_from_file(c.d_vector_file)
                # NOTE(review): the restored file is still loaded after the fallback above, as in the
                # original flow; confirm whether it should be skipped when `speakers_file` is missing locally.
                speaker_manager.load_embeddings_from_file(speakers_file)
            else:
                # restore speaker manager with speaker ID file.
                speaker_ids_from_data = speaker_manager.name_to_id
                speaker_manager.load_ids_from_file(speakers_file)
                assert all(
                    speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data
                ), " [!] You cannot introduce new speakers to a pre-trained model."
        elif c.use_d_vector_file and c.d_vector_file:
            # new speaker manager with external speaker embeddings.
            speaker_manager.load_embeddings_from_file(c.d_vector_file)
        elif c.use_d_vector_file and not c.d_vector_file:
            # Raising a plain string is itself a TypeError in Python 3; raise a real exception.
            raise RuntimeError("use_d_vector_file is True, so you need pass a external speaker embedding file.")
        elif c.use_speaker_embedding and "speakers_file" in c and c.speakers_file:
            # new speaker manager with speaker IDs file.
            speaker_manager.load_ids_from_file(c.speakers_file)

        if speaker_manager.num_speakers > 0:
            print(
                " > Speaker manager is loaded with {} speakers: {}".format(
                    speaker_manager.num_speakers, ", ".join(speaker_manager.name_to_id)
                )
            )

        # save file if path is defined
        if out_path:
            out_file_path = os.path.join(out_path, "speakers.json")
            print(f" > Saving `speakers.json` to {out_file_path}.")
            if c.use_d_vector_file and c.d_vector_file:
                speaker_manager.save_embeddings_to_file(out_file_path)
            else:
                speaker_manager.save_ids_to_file(out_file_path)
    return speaker_manager
|
||
|
|
||
|
|
||
|
def get_speaker_balancer_weights(items: list):
    """Compute per-sample weights that are inversely proportional to speaker frequency.

    Args:
        items (list): Samples; each one is indexed with the key ``"speaker_name"``.

    Returns:
        torch.Tensor: Float tensor with one weight per item, in input order. Each raw weight is
            ``1 / (number of items sharing that speaker name)`` and the whole vector is divided
            by its L2 norm.
    """
    speaker_names = np.array([item["speaker_name"] for item in items])
    # One pass gives, for every sample, the index of its speaker among the
    # unique names, plus the number of samples per unique speaker.
    _, speaker_ids, speaker_count = np.unique(speaker_names, return_inverse=True, return_counts=True)
    weight_speaker = 1.0 / speaker_count
    dataset_samples_weight = weight_speaker[speaker_ids]
    # normalize
    dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
    return torch.from_numpy(dataset_samples_weight).float()
|