import numpy as np
import torch
import torchaudio
from coqpit import Coqpit
from torch import nn

from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
from TTS.utils.generic_utils import set_init_dict
from TTS.utils.io import load_fsspec


class PreEmphasis(nn.Module):
    """First-order high-pass filter: y[t] = x[t] - coefficient * x[t-1]."""

    def __init__(self, coefficient=0.97):
        super().__init__()
        self.coefficient = coefficient
        self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0))

    def forward(self, x):
        assert len(x.size()) == 2

        # Reflect-pad one sample on the left so the output length matches the input.
        x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
        return torch.nn.functional.conv1d(x, self.filter).squeeze(1)

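
# Usage sketch (illustrative, not part of the original module): `PreEmphasis`
# expects a 2-D batch of raw waveforms and returns a tensor of the same shape.
#
#     wav = torch.randn(4, 16000)          # (batch, samples)
#     emphasized = PreEmphasis(0.97)(wav)  # (4, 16000), high frequencies boosted
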

class BaseEncoder(nn.Module):
    """Base `encoder` class. Every new `encoder` model must inherit this.

    It defines common `encoder` specific functions.
    """

    # pylint: disable=W0102
    def __init__(self):
        super().__init__()

    def get_torch_mel_spectrogram_class(self, audio_config):
        # Despite the name, this returns an instantiated module: a pre-emphasis
        # filter followed by a torchaudio mel-spectrogram transform.
        return torch.nn.Sequential(
            PreEmphasis(audio_config["preemphasis"]),
            # TorchSTFT(
            #     n_fft=audio_config["fft_size"],
            #     hop_length=audio_config["hop_length"],
            #     win_length=audio_config["win_length"],
            #     sample_rate=audio_config["sample_rate"],
            #     window="hamming_window",
            #     mel_fmin=0.0,
            #     mel_fmax=None,
            #     use_htk=True,
            #     do_amp_to_db=False,
            #     n_mels=audio_config["num_mels"],
            #     power=2.0,
            #     use_mel=True,
            #     mel_norm=None,
            # )
            torchaudio.transforms.MelSpectrogram(
                sample_rate=audio_config["sample_rate"],
                n_fft=audio_config["fft_size"],
                win_length=audio_config["win_length"],
                hop_length=audio_config["hop_length"],
                window_fn=torch.hamming_window,
                n_mels=audio_config["num_mels"],
            ),
        )

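    # Illustrative `audio_config` (the values below are assumptions, not
    # defaults from the codebase; the keys mirror the ones read above):
    #
    #     audio_config = {
    #         "preemphasis": 0.97,
    #         "sample_rate": 16000,
    #         "fft_size": 512,
    #         "win_length": 400,
    #         "hop_length": 160,
    #         "num_mels": 64,
    #     }
    #     spec = encoder.get_torch_mel_spectrogram_class(audio_config)
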
    @torch.no_grad()
    def inference(self, x, l2_norm=True):
        return self.forward(x, l2_norm)

    @torch.no_grad()
    def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True):
        """
        Generate embeddings for a batch of utterances
        x: 1xTxD
        """
        # map to the waveform size
        if self.use_torch_spec:
            num_frames = num_frames * self.audio_config["hop_length"]

        max_len = x.shape[1]

        if max_len < num_frames:
            num_frames = max_len

        # Take `num_eval` evenly spaced windows of `num_frames` across the input.
        offsets = np.linspace(0, max_len - num_frames, num=num_eval)

        frames_batch = []
        for offset in offsets:
            offset = int(offset)
            end_offset = int(offset + num_frames)
            frames = x[:, offset:end_offset]
            frames_batch.append(frames)

        # Stack the windows into one batch and embed them in a single pass.
        frames_batch = torch.cat(frames_batch, dim=0)
        embeddings = self.inference(frames_batch, l2_norm=l2_norm)

        if return_mean:
            embeddings = torch.mean(embeddings, dim=0, keepdim=True)
        return embeddings

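    # Sketch of inference-time use (assumes a concrete subclass that sets the
    # `use_torch_spec` and `audio_config` attributes read above):
    #
    #     wav = torch.randn(1, 32000)  # (1, T) raw waveform
    #     emb = encoder.compute_embedding(wav, num_frames=250, num_eval=10)
    #     # -> (1, proj_dim) utterance embedding, averaged over 10 windows
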
    def get_criterion(self, c: Coqpit, num_classes=None):
        if c.loss == "ge2e":
            criterion = GE2ELoss(loss_method="softmax")
        elif c.loss == "angleproto":
            criterion = AngleProtoLoss()
        elif c.loss == "softmaxproto":
            criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes)
        else:
            raise ValueError("%s is not a supported loss" % c.loss)
        return criterion

    def load_checkpoint(
        self,
        config: Coqpit,
        checkpoint_path: str,
        eval: bool = False,
        use_cuda: bool = False,
        criterion=None,
        cache=False,
    ):
        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
        try:
            self.load_state_dict(state["model"])
            print(" > Model fully restored. ")
        except (KeyError, RuntimeError) as error:
            # In eval mode, a mismatched checkpoint is fatal.
            if eval:
                raise error

            # Otherwise fall back to a partial initialization from the
            # overlapping weights.
            print(" > Partial model initialization.")
            model_dict = self.state_dict()
            model_dict = set_init_dict(model_dict, state["model"], config)
            self.load_state_dict(model_dict)
            del model_dict

        # load the criterion for restore_path
        if criterion is not None and "criterion" in state:
            try:
                criterion.load_state_dict(state["criterion"])
            except (KeyError, RuntimeError) as error:
                print(" > Criterion load ignored because of:", error)

        # instantiate and load the criterion for the encoder classifier at inference time
        if (
            eval
            and criterion is None
            and "criterion" in state
            and getattr(config, "map_classid_to_classname", None) is not None
        ):
            criterion = self.get_criterion(config, len(config.map_classid_to_classname))
            criterion.load_state_dict(state["criterion"])

        if use_cuda:
            self.cuda()
            if criterion is not None:
                criterion = criterion.cuda()

        if eval:
            self.eval()
            assert not self.training

        if not eval:
            return criterion, state["step"]
        return criterion
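

if __name__ == "__main__":
    # Minimal smoke test (illustrative only; `DummyEncoder` is a hypothetical
    # subclass, not part of the library). It shows the contract BaseEncoder
    # expects from subclasses: a `forward(x, l2_norm)` method plus the
    # `use_torch_spec` / `audio_config` attributes used by `compute_embedding`.
    class DummyEncoder(BaseEncoder):
        def __init__(self, audio_config, proj_dim=256):
            super().__init__()
            self.audio_config = audio_config
            self.use_torch_spec = True
            self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
            self.proj = nn.Linear(audio_config["num_mels"], proj_dim)

        def forward(self, x, l2_norm=True):
            x = self.torch_spec(x)  # (B, n_mels, frames)
            x = self.proj(x.mean(dim=2))  # average over time -> (B, proj_dim)
            if l2_norm:
                x = torch.nn.functional.normalize(x, p=2, dim=1)
            return x

    # Assumed audio settings for the demo; adjust to your data.
    audio_config = {
        "preemphasis": 0.97,
        "sample_rate": 16000,
        "fft_size": 512,
        "win_length": 400,
        "hop_length": 160,
        "num_mels": 64,
    }
    encoder = DummyEncoder(audio_config)
    emb = encoder.compute_embedding(torch.randn(1, 32000))
    print(emb.shape)  # torch.Size([1, 256])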