ai-content-maker/.venv/Lib/site-packages/TTS/tts/models/base_tacotron.py

import copy
from abc import abstractmethod
from typing import Dict, Tuple

import torch
from coqpit import Coqpit
from torch import nn

from TTS.tts.layers.losses import TacotronLoss
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import sequence_mask
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.generic_utils import format_aux_input
from TTS.utils.io import load_fsspec
from TTS.utils.training import gradual_training_scheduler


class BaseTacotron(BaseTTS):
    """Base class shared by Tacotron and Tacotron2"""

    def __init__(
        self,
        config: "TacotronConfig",
        ap: "AudioProcessor",
        tokenizer: "TTSTokenizer",
        speaker_manager: SpeakerManager = None,
    ):
        """Initialise the state shared by the Tacotron model variants.

        Every field of `config` is copied onto the instance as an attribute, the
        layer and tensor slots are created as `None` placeholders, and
        `decoder_in_features` is widened when GST and/or Capacitron are enabled.

        Args:
            config (TacotronConfig): model configuration; iterated key by key.
            ap (AudioProcessor): forwarded unchanged to the parent constructor.
            tokenizer (TTSTokenizer): forwarded unchanged to the parent constructor.
            speaker_manager (SpeakerManager, optional): forwarded unchanged to the
                parent constructor. Defaults to None.
        """
        super().__init__(config, ap, tokenizer, speaker_manager)

        # pass all config fields as class attributes
        # NOTE: this must run before the checks below, which read `self.gst`,
        # `self.use_gst`, `self.capacitron_vae`, ... copied from the config here.
        for key in config:
            setattr(self, key, config[key])

        # layers
        # placeholders only; nothing in this class assigns real layers to them
        # (presumably done by the concrete model classes - verify in subclasses)
        self.embedding = None
        self.encoder = None
        self.decoder = None
        self.postnet = None

        # init tensors
        self.embedded_speakers = None
        self.embedded_speakers_projected = None

        # global style token
        # `gst_layer` is only defined as an attribute when GST is enabled
        if self.gst and self.use_gst:
            self.decoder_in_features += self.gst.gst_embedding_dim  # add gst embedding dim
            self.gst_layer = None

        # Capacitron
        # `capacitron_vae_layer` is only defined as an attribute when Capacitron is enabled
        if self.capacitron_vae and self.use_capacitron_vae:
            self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim  # add capacitron embedding dim
            self.capacitron_vae_layer = None

        # additional layers
        # filled in by `_init_backward_decoder` / `_init_coarse_decoder`
        self.decoder_backward = None
        self.coarse_decoder = None

    @staticmethod
    def _format_aux_input(aux_input: Dict) -> Dict:
        """Fill in the default auxiliary-input fields.

        Args:
            aux_input (Dict): auxiliary inputs supplied by the caller; may be
                empty or None.

        Returns:
            Dict: result of `format_aux_input` with `d_vectors` and `speaker_ids`
            defaulting to None, or None when `aux_input` is falsy.
        """
        if not aux_input:
            return None
        defaults = {"d_vectors": None, "speaker_ids": None}
        return format_aux_input(defaults, aux_input)

    #############################
    # INIT FUNCTIONS
    #############################

    def _init_backward_decoder(self):
        """Init the backward decoder for Forward-Backward decoding."""
        self.decoder_backward = copy.deepcopy(self.decoder)

    def _init_coarse_decoder(self):
        """Init the coarse decoder for Double-Decoder Consistency."""
        self.coarse_decoder = copy.deepcopy(self.decoder)
        self.coarse_decoder.r_init = self.ddc_r
        self.coarse_decoder.set_r(self.ddc_r)

    #############################
    # CORE FUNCTIONS
    #############################

    @abstractmethod
    def forward(self):
        """Forward pass placeholder; abstract here, so it has no implementation in this class."""
        pass

    @abstractmethod
    def inference(self):
        """Inference placeholder; abstract here, so it has no implementation in this class."""
        pass

    def load_checkpoint(
        self, config, checkpoint_path, eval=False, cache=False
    ):  # pylint: disable=unused-argument, redefined-builtin
        """Restore the model weights and the decoder reduction rate from a checkpoint.

        The reduction rate `r` is taken from the first source that is present:
        the checkpoint's own `r` entry, the `r` of the config stored in the
        checkpoint, and finally the `r` of the `config` argument.

        Args:
            config (Coqpit): model configuration; only its `r` is read, and only
                when the checkpoint carries neither `r` nor `config`.
            checkpoint_path (str): path to checkpoint file.
            eval (bool, optional): if True, switch the model to evaluation mode
                after loading and print the active reduction rate. Defaults to False.
            cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
        """
        checkpoint = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
        self.load_state_dict(checkpoint["model"])

        # TODO: set r in run-time by taking it from the new config
        if "r" in checkpoint:
            # older checkpoints store r directly in the state
            reduction_rate = checkpoint["r"]
        elif "config" in checkpoint:
            # r of the config used at training time (for inference)
            reduction_rate = checkpoint["config"]["r"]
        else:
            # fall back to the new config (for new models)
            reduction_rate = config.r
        self.decoder.set_r(reduction_rate)

        if not eval:
            return
        self.eval()
        print(f" > Model's reduction rate `r` is set to: {self.decoder.r}")
        assert not self.training

    def get_criterion(self) -> nn.Module:
        """Build the training criterion.

        Returns:
            nn.Module: a `TacotronLoss` constructed from `self.config`.
        """
        criterion = TacotronLoss(self.config)
        return criterion

    @staticmethod
    def init_from_config(config: Coqpit):
        """Build a model together with its helper objects from a config.

        The audio processor, tokenizer and speaker manager are each created via
        their own `init_from_config` and handed to the `BaseTacotron` constructor.

        Args:
            config (Coqpit): model configuration.
        """
        from TTS.utils.audio import AudioProcessor

        return BaseTacotron(
            config,
            AudioProcessor.init_from_config(config),
            TTSTokenizer.init_from_config(config),
            SpeakerManager.init_from_config(config),
        )

    ##########################
    # TEST AND LOG FUNCTIONS #
    ##########################

    def test_run(self, assets: Dict) -> Dict:
        """Generic test run for `tts` models used by `Trainer`.

        Synthesizes every sentence in `self.config.test_sentences` with
        Griffin-Lim and collects the waveform, the predicted spectrogram plot
        and the alignment plot of each one.

        You can override this for a different behaviour.

        Args:
            assets (dict): A dict of training assets. It is not read by this
                implementation.

        Returns:
            Dict: `{"figures": test_figures, "audios": test_audios}`. Entries are
            keyed `"<idx>-prediction"` / `"<idx>-alignment"` in `figures` and
            `"<idx>-audio"` in `audios`, where `<idx>` is the sentence index.
        """
        print(" | > Synthesizing test sentences.")
        test_audios = {}
        test_figures = {}
        test_sentences = self.config.test_sentences
        aux_inputs = self._get_test_aux_input()
        # The device does not change between sentences, so resolve the flag once.
        use_cuda = "cuda" in str(next(self.parameters()).device)
        for idx, sen in enumerate(test_sentences):
            outputs_dict = synthesis(
                self,
                sen,
                self.config,
                use_cuda,
                speaker_id=aux_inputs["speaker_id"],
                d_vector=aux_inputs["d_vector"],
                style_wav=aux_inputs["style_wav"],
                use_griffin_lim=True,
                do_trim_silence=False,
            )
            model_outputs = outputs_dict["outputs"]
            test_audios[f"{idx}-audio"] = outputs_dict["wav"]
            test_figures[f"{idx}-prediction"] = plot_spectrogram(
                model_outputs["model_outputs"], self.ap, output_fig=False
            )
            test_figures[f"{idx}-alignment"] = plot_alignment(model_outputs["alignments"], output_fig=False)
        return {"figures": test_figures, "audios": test_audios}

    def test_log(
        self, outputs: dict, logger: "Logger", assets: dict, steps: int  # pylint: disable=unused-argument
    ) -> None:
        logger.test_audios(steps, outputs["audios"], self.ap.sample_rate)
        logger.test_figures(steps, outputs["figures"])

    #############################
    # COMMON COMPUTE FUNCTIONS
    #############################

    def compute_masks(self, text_lengths, mel_lengths):
        """Build the padding masks for the inputs and, optionally, the targets.

        When `mel_lengths` is given, the target mask length is the largest mel
        length rounded up to a multiple of the decoder reduction rate
        `self.decoder.r`.

        Returns:
            tuple: `(input_mask, output_mask)`; `output_mask` is None when
            `mel_lengths` is None.
        """
        # B x T_in_max (boolean)
        text_mask = sequence_mask(text_lengths)
        if mel_lengths is None:
            return text_mask, None

        padded_len = mel_lengths.max()
        reduction = self.decoder.r
        remainder = padded_len % reduction
        if remainder > 0:
            # round up to the next multiple of the reduction rate
            padded_len = padded_len + (reduction - remainder)
        mel_mask = sequence_mask(mel_lengths, max_len=padded_len)
        return text_mask, mel_mask

    def _backward_pass(self, mel_specs, encoder_outputs, mask):
        """Run backwards decoder"""
        decoder_outputs_b, alignments_b, _ = self.decoder_backward(
            encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask
        )
        decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous()
        return decoder_outputs_b, alignments_b

    def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments, input_mask):
        """Double Decoder Consistency"""
        T = mel_specs.shape[1]
        if T % self.coarse_decoder.r > 0:
            padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r)
            mel_specs = torch.nn.functional.pad(mel_specs, (0, 0, 0, padding_size, 0, 0))
        decoder_outputs_backward, alignments_backward, _ = self.coarse_decoder(
            encoder_outputs.detach(), mel_specs, input_mask
        )
        # scale_factor = self.decoder.r_init / self.decoder.r
        alignments_backward = torch.nn.functional.interpolate(
            alignments_backward.transpose(1, 2),
            size=alignments.shape[1],
            mode="nearest",
        ).transpose(1, 2)
        decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2)
        decoder_outputs_backward = decoder_outputs_backward[:, :T, :]
        return decoder_outputs_backward, alignments_backward

    #############################
    # EMBEDDING FUNCTIONS
    #############################

    def compute_gst(self, inputs, style_input, speaker_embedding=None):
        """Compute global style token"""
        if isinstance(style_input, dict):
            # multiply each style token with a weight
            query = torch.zeros(1, 1, self.gst.gst_embedding_dim // 2).type_as(inputs)
            if speaker_embedding is not None:
                query = torch.cat([query, speaker_embedding.reshape(1, 1, -1)], dim=-1)

            _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens)
            gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs)
            for k_token, v_amplifier in style_input.items():
                key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1)
                gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key)
                gst_outputs = gst_outputs + gst_outputs_att * v_amplifier
        elif style_input is None:
            # ignore style token and return zero tensor
            gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs)
        else:
            # compute style tokens
            gst_outputs = self.gst_layer(style_input, speaker_embedding)  # pylint: disable=not-callable
        inputs = self._concat_speaker_embedding(inputs, gst_outputs)
        return inputs

    def compute_capacitron_VAE_embedding(self, inputs, reference_mel_info, text_info=None, speaker_embedding=None):
        """Capacitron Variational Autoencoder"""
        (
            VAE_outputs,
            posterior_distribution,
            prior_distribution,
            capacitron_beta,
        ) = self.capacitron_vae_layer(
            reference_mel_info,
            text_info,
            speaker_embedding,  # pylint: disable=not-callable
        )

        VAE_outputs = VAE_outputs.to(inputs.device)
        encoder_output = self._concat_speaker_embedding(
            inputs, VAE_outputs
        )  # concatenate to the output of the basic tacotron encoder
        return (
            encoder_output,
            posterior_distribution,
            prior_distribution,
            capacitron_beta,
        )

    @staticmethod
    def _add_speaker_embedding(outputs, embedded_speakers):
        embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1)
        outputs = outputs + embedded_speakers_
        return outputs

    @staticmethod
    def _concat_speaker_embedding(outputs, embedded_speakers):
        embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1)
        outputs = torch.cat([outputs, embedded_speakers_], dim=-1)
        return outputs

    #############################
    # CALLBACKS
    #############################

    def on_epoch_start(self, trainer):
        """Apply the gradual training schedule at the start of an epoch.

        Does nothing unless `self.gradual_training` is set. Otherwise the
        scheduler result for `trainer.total_steps_done` is written to
        `trainer.config.r` and `trainer.config.batch_size`, and the new `r` is
        applied to the decoder (and to the trainer model's backward decoder
        when `bidirectional_decoder` is enabled in the trainer config).

        Args:
            trainer (TrainerTTS): TTS trainer object that is used to train this model.
        """
        if not self.gradual_training:
            return

        new_r, new_batch_size = gradual_training_scheduler(trainer.total_steps_done, trainer.config)
        trainer.config.batch_size = new_batch_size
        trainer.config.r = new_r
        self.decoder.set_r(new_r)
        if trainer.config.bidirectional_decoder:
            trainer.model.decoder_backward.set_r(new_r)
        print(f"\n > Number of output frames: {self.decoder.r}")
first commit 2024-05-03 04:18:51 +03:00			`import copy`
			`from abc import abstractmethod`
			`from typing import Dict, Tuple`

			`import torch`
			`from coqpit import Coqpit`
			`from torch import nn`

			`from TTS.tts.layers.losses import TacotronLoss`
			`from TTS.tts.models.base_tts import BaseTTS`
			`from TTS.tts.utils.helpers import sequence_mask`
			`from TTS.tts.utils.speakers import SpeakerManager`
			`from TTS.tts.utils.synthesis import synthesis`
			`from TTS.tts.utils.text.tokenizer import TTSTokenizer`
			`from TTS.tts.utils.visual import plot_alignment, plot_spectrogram`
			`from TTS.utils.generic_utils import format_aux_input`
			`from TTS.utils.io import load_fsspec`
			`from TTS.utils.training import gradual_training_scheduler`


			`class BaseTacotron(BaseTTS):`
			`"""Base class shared by Tacotron and Tacotron2"""`

			`def __init__(`
			`self,`
			`config: "TacotronConfig",`
			`ap: "AudioProcessor",`
			`tokenizer: "TTSTokenizer",`
			`speaker_manager: SpeakerManager = None,`
			`):`
			`super().__init__(config, ap, tokenizer, speaker_manager)`

			`# pass all config fields as class attributes`
			`for key in config:`
			`setattr(self, key, config[key])`

			`# layers`
			`self.embedding = None`
			`self.encoder = None`
			`self.decoder = None`
			`self.postnet = None`

			`# init tensors`
			`self.embedded_speakers = None`
			`self.embedded_speakers_projected = None`

			`# global style token`
			`if self.gst and self.use_gst:`
			`self.decoder_in_features += self.gst.gst_embedding_dim # add gst embedding dim`
			`self.gst_layer = None`

			`# Capacitron`
			`if self.capacitron_vae and self.use_capacitron_vae:`
			`self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim # add capacitron embedding dim`
			`self.capacitron_vae_layer = None`

			`# additional layers`
			`self.decoder_backward = None`
			`self.coarse_decoder = None`

			`@staticmethod`
			`def _format_aux_input(aux_input: Dict) -> Dict:`
			`"""Set missing fields to their default values"""`
			`if aux_input:`
			`return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input)`
			`return None`

			`#############################`
			`# INIT FUNCTIONS`
			`#############################`

			`def _init_backward_decoder(self):`
			`"""Init the backward decoder for Forward-Backward decoding."""`
			`self.decoder_backward = copy.deepcopy(self.decoder)`

			`def _init_coarse_decoder(self):`
			`"""Init the coarse decoder for Double-Decoder Consistency."""`
			`self.coarse_decoder = copy.deepcopy(self.decoder)`
			`self.coarse_decoder.r_init = self.ddc_r`
			`self.coarse_decoder.set_r(self.ddc_r)`

			`#############################`
			`# CORE FUNCTIONS`
			`#############################`

			`@abstractmethod`
			`def forward(self):`
			`pass`

			`@abstractmethod`
			`def inference(self):`
			`pass`

			`def load_checkpoint(`
			`self, config, checkpoint_path, eval=False, cache=False`
			`): # pylint: disable=unused-argument, redefined-builtin`
			`"""Load model checkpoint and set up internals.`

			`Args:`
			`config (Coqpi): model configuration.`
			`checkpoint_path (str): path to checkpoint file.`
			`eval (bool, optional): whether to load model for evaluation.`
			cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
			`"""`
			`state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)`
			`self.load_state_dict(state["model"])`
			`# TODO: set r in run-time by taking it from the new config`
			`if "r" in state:`
			`# set r from the state (for compatibility with older checkpoints)`
			`self.decoder.set_r(state["r"])`
			`elif "config" in state:`
			`# set r from config used at training time (for inference)`
			`self.decoder.set_r(state["config"]["r"])`
			`else:`
			`# set r from the new config (for new-models)`
			`self.decoder.set_r(config.r)`
			`if eval:`
			`self.eval()`
			print(f" > Model's reduction rate `r` is set to: {self.decoder.r}")
			`assert not self.training`

			`def get_criterion(self) -> nn.Module:`
			`"""Get the model criterion used in training."""`
			`return TacotronLoss(self.config)`

			`@staticmethod`
			`def init_from_config(config: Coqpit):`
			`"""Initialize model from config."""`
			`from TTS.utils.audio import AudioProcessor`

			`ap = AudioProcessor.init_from_config(config)`
			`tokenizer = TTSTokenizer.init_from_config(config)`
			`speaker_manager = SpeakerManager.init_from_config(config)`
			`return BaseTacotron(config, ap, tokenizer, speaker_manager)`

			`##########################`
			`# TEST AND LOG FUNCTIONS #`
			`##########################`

			`def test_run(self, assets: Dict) -> Tuple[Dict, Dict]:`
			"""Generic test run for `tts` models used by `Trainer`.

			`You can override this for a different behaviour.`

			`Args:`
			assets (dict): A dict of training assets. For `tts` models, it must include `{'audio_processor': ap}`.

			`Returns:`
			`Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.`
			`"""`
			`print(" \| > Synthesizing test sentences.")`
			`test_audios = {}`
			`test_figures = {}`
			`test_sentences = self.config.test_sentences`
			`aux_inputs = self._get_test_aux_input()`
			`for idx, sen in enumerate(test_sentences):`
			`outputs_dict = synthesis(`
			`self,`
			`sen,`
			`self.config,`
			`"cuda" in str(next(self.parameters()).device),`
			`speaker_id=aux_inputs["speaker_id"],`
			`d_vector=aux_inputs["d_vector"],`
			`style_wav=aux_inputs["style_wav"],`
			`use_griffin_lim=True,`
			`do_trim_silence=False,`
			`)`
			`test_audios["{}-audio".format(idx)] = outputs_dict["wav"]`
			`test_figures["{}-prediction".format(idx)] = plot_spectrogram(`
			`outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False`
			`)`
			`test_figures["{}-alignment".format(idx)] = plot_alignment(`
			`outputs_dict["outputs"]["alignments"], output_fig=False`
			`)`
			`return {"figures": test_figures, "audios": test_audios}`

			`def test_log(`
			`self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument`
			`) -> None:`
			`logger.test_audios(steps, outputs["audios"], self.ap.sample_rate)`
			`logger.test_figures(steps, outputs["figures"])`

			`#############################`
			`# COMMON COMPUTE FUNCTIONS`
			`#############################`

			`def compute_masks(self, text_lengths, mel_lengths):`
			`"""Compute masks against sequence paddings."""`
			`# B x T_in_max (boolean)`
			`input_mask = sequence_mask(text_lengths)`
			`output_mask = None`
			`if mel_lengths is not None:`
			`max_len = mel_lengths.max()`
			`r = self.decoder.r`
			`max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len`
			`output_mask = sequence_mask(mel_lengths, max_len=max_len)`
			`return input_mask, output_mask`

			`def _backward_pass(self, mel_specs, encoder_outputs, mask):`
			`"""Run backwards decoder"""`
			`decoder_outputs_b, alignments_b, _ = self.decoder_backward(`
			`encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask`
			`)`
			`decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous()`
			`return decoder_outputs_b, alignments_b`

			`def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments, input_mask):`
			`"""Double Decoder Consistency"""`
			`T = mel_specs.shape[1]`
			`if T % self.coarse_decoder.r > 0:`
			`padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r)`
			`mel_specs = torch.nn.functional.pad(mel_specs, (0, 0, 0, padding_size, 0, 0))`
			`decoder_outputs_backward, alignments_backward, _ = self.coarse_decoder(`
			`encoder_outputs.detach(), mel_specs, input_mask`
			`)`
			`# scale_factor = self.decoder.r_init / self.decoder.r`
			`alignments_backward = torch.nn.functional.interpolate(`
			`alignments_backward.transpose(1, 2),`
			`size=alignments.shape[1],`
			`mode="nearest",`
			`).transpose(1, 2)`
			`decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2)`
			`decoder_outputs_backward = decoder_outputs_backward[:, :T, :]`
			`return decoder_outputs_backward, alignments_backward`

			`#############################`
			`# EMBEDDING FUNCTIONS`
			`#############################`

			`def compute_gst(self, inputs, style_input, speaker_embedding=None):`
			`"""Compute global style token"""`
			`if isinstance(style_input, dict):`
			`# multiply each style token with a weight`
			`query = torch.zeros(1, 1, self.gst.gst_embedding_dim // 2).type_as(inputs)`
			`if speaker_embedding is not None:`
			`query = torch.cat([query, speaker_embedding.reshape(1, 1, -1)], dim=-1)`

			`_GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens)`
			`gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs)`
			`for k_token, v_amplifier in style_input.items():`
			`key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1)`
			`gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key)`
			`gst_outputs = gst_outputs + gst_outputs_att * v_amplifier`
			`elif style_input is None:`
			`# ignore style token and return zero tensor`
			`gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs)`
			`else:`
			`# compute style tokens`
			`gst_outputs = self.gst_layer(style_input, speaker_embedding) # pylint: disable=not-callable`
			`inputs = self._concat_speaker_embedding(inputs, gst_outputs)`
			`return inputs`

			`def compute_capacitron_VAE_embedding(self, inputs, reference_mel_info, text_info=None, speaker_embedding=None):`
			`"""Capacitron Variational Autoencoder"""`
			`(`
			`VAE_outputs,`
			`posterior_distribution,`
			`prior_distribution,`
			`capacitron_beta,`
			`) = self.capacitron_vae_layer(`
			`reference_mel_info,`
			`text_info,`
			`speaker_embedding, # pylint: disable=not-callable`
			`)`

			`VAE_outputs = VAE_outputs.to(inputs.device)`
			`encoder_output = self._concat_speaker_embedding(`
			`inputs, VAE_outputs`
			`) # concatenate to the output of the basic tacotron encoder`
			`return (`
			`encoder_output,`
			`posterior_distribution,`
			`prior_distribution,`
			`capacitron_beta,`
			`)`

			`@staticmethod`
			`def _add_speaker_embedding(outputs, embedded_speakers):`
			`embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1)`
			`outputs = outputs + embedded_speakers_`
			`return outputs`

			`@staticmethod`
			`def _concat_speaker_embedding(outputs, embedded_speakers):`
			`embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1)`
			`outputs = torch.cat([outputs, embedded_speakers_], dim=-1)`
			`return outputs`

			`#############################`
			`# CALLBACKS`
			`#############################`

			`def on_epoch_start(self, trainer):`
			`"""Callback for setting values wrt gradual training schedule.`

			`Args:`
			`trainer (TrainerTTS): TTS trainer object that is used to train this model.`
			`"""`
			`if self.gradual_training:`
			`r, trainer.config.batch_size = gradual_training_scheduler(trainer.total_steps_done, trainer.config)`
			`trainer.config.r = r`
			`self.decoder.set_r(r)`
			`if trainer.config.bidirectional_decoder:`
			`trainer.model.decoder_backward.set_r(r)`
			`print(f"\n > Number of output frames: {self.decoder.r}")`