ai-content-maker/.venv/Lib/site-packages/TTS/tts/models/bark.py

import os
from dataclasses import dataclass
from typing import Optional

import numpy as np
from coqpit import Coqpit
from encodec import EncodecModel
from transformers import BertTokenizer

from TTS.tts.layers.bark.inference_funcs import (
    codec_decode,
    generate_coarse,
    generate_fine,
    generate_text_semantic,
    generate_voice,
    load_voice,
)
from TTS.tts.layers.bark.load_model import load_model
from TTS.tts.layers.bark.model import GPT
from TTS.tts.layers.bark.model_fine import FineGPT
from TTS.tts.models.base_tts import BaseTTS


@dataclass
class BarkAudioConfig(Coqpit):
    """Audio sample-rate settings for the Bark model (both default to 24000 Hz)."""

    # NOTE(review): presumably the sample rate of the audio the model works with — confirm against callers.
    sample_rate: int = 24000
    # NOTE(review): presumably the sample rate of the synthesized waveform — confirm against callers.
    output_sample_rate: int = 24000


class Bark(BaseTTS):
    """Bark text-to-speech model.

    Holds the three generation stages (semantic GPT, coarse GPT, fine GPT) together with an
    EnCodec 24 kHz codec model, and exposes text -> audio generation through the `BaseTTS`
    interface.
    """

    def __init__(
        self,
        config: Coqpit,
        tokenizer: Optional[BertTokenizer] = None,
    ) -> None:
        """Build the (untrained) sub-models described by `config`.

        Args:
            config: Model config providing `semantic_config`, `coarse_config` and `fine_config`.
            tokenizer: Text tokenizer. When None, the `bert-base-multilingual-cased` tokenizer is
                loaded here, at construction time. It is deliberately NOT loaded in the default
                argument, because default arguments are evaluated when the class is defined, which
                would trigger a tokenizer load/download on a plain module import.
        """
        super().__init__(config=config, ap=None, tokenizer=None, speaker_manager=None, language_manager=None)
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
        self.config.num_chars = len(tokenizer)
        self.tokenizer = tokenizer
        self.semantic_model = GPT(config.semantic_config)
        self.coarse_model = GPT(config.coarse_config)
        self.fine_model = FineGPT(config.fine_config)
        self.encodec = EncodecModel.encodec_model_24khz()
        self.encodec.set_target_bandwidth(6.0)

    @property
    def device(self):
        """Device of the first registered parameter of this model."""
        return next(self.parameters()).device

    def load_bark_models(self):
        """Load the text (semantic), coarse and fine models from `config.LOCAL_MODEL_PATHS`.

        Each `load_model` call returns the loaded model together with a config object, which
        replaces `self.config` before the next stage is loaded.
        """
        self.semantic_model, self.config = load_model(
            ckpt_path=self.config.LOCAL_MODEL_PATHS["text"], device=self.device, config=self.config, model_type="text"
        )
        self.coarse_model, self.config = load_model(
            ckpt_path=self.config.LOCAL_MODEL_PATHS["coarse"],
            device=self.device,
            config=self.config,
            model_type="coarse",
        )
        self.fine_model, self.config = load_model(
            ckpt_path=self.config.LOCAL_MODEL_PATHS["fine"], device=self.device, config=self.config, model_type="fine"
        )

    def train_step(
        self,
    ):
        """Not implemented; this method is a no-op."""
        pass

    def text_to_semantic(
        self,
        text: str,
        history_prompt: Optional[str] = None,
        temp: float = 0.7,
        base=None,
        allow_early_stop=True,
        **kwargs,
    ):
        """Generate semantic array from text.

        Args:
            text: text to be turned into audio
            history_prompt: history choice for audio cloning
            temp: generation temperature (1.0 more diverse, 0.0 more conservative)
            base: forwarded unchanged to `generate_text_semantic`
            allow_early_stop: forwarded unchanged to `generate_text_semantic`
            **kwargs: extra keyword arguments forwarded to `generate_text_semantic`

        Returns:
            numpy semantic array to be fed into `semantic_to_waveform`
        """
        x_semantic = generate_text_semantic(
            text,
            self,
            history_prompt=history_prompt,
            temp=temp,
            base=base,
            allow_early_stop=allow_early_stop,
            **kwargs,
        )
        return x_semantic

    def semantic_to_waveform(
        self,
        semantic_tokens: np.ndarray,
        history_prompt: Optional[str] = None,
        temp: float = 0.7,
        base=None,
    ):
        """Generate audio array from semantic input.

        Args:
            semantic_tokens: semantic token output from `text_to_semantic`
            history_prompt: history choice for audio cloning
            temp: generation temperature of the coarse stage (1.0 more diverse, 0.0 more conservative)
            base: forwarded unchanged to `generate_coarse` and `generate_fine`

        Returns:
            Tuple `(audio_arr, x_coarse_gen, x_fine_gen)`: the decoded audio array followed by the
            coarse and fine stage outputs it was produced from.
        """
        x_coarse_gen = generate_coarse(
            semantic_tokens,
            self,
            history_prompt=history_prompt,
            temp=temp,
            base=base,
        )
        # The fine stage always runs at a fixed temperature of 0.5; `temp` only affects the coarse stage.
        x_fine_gen = generate_fine(
            x_coarse_gen,
            self,
            history_prompt=history_prompt,
            temp=0.5,
            base=base,
        )
        audio_arr = codec_decode(x_fine_gen, self)
        return audio_arr, x_coarse_gen, x_fine_gen

    def generate_audio(
        self,
        text: str,
        history_prompt: Optional[str] = None,
        text_temp: float = 0.7,
        waveform_temp: float = 0.7,
        base=None,
        allow_early_stop=True,
        **kwargs,
    ):
        """Generate audio array from input text.

        Args:
            text: text to be turned into audio
            history_prompt: history choice for audio cloning
            text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
            waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
            base: forwarded unchanged to both generation steps
            allow_early_stop: forwarded unchanged to `text_to_semantic`
            **kwargs: extra keyword arguments forwarded to `text_to_semantic`

        Returns:
            Tuple `(audio_arr, [x_semantic, x_coarse, x_fine])`: the audio array and the list of
            intermediate outputs of the three generation stages.
        """
        x_semantic = self.text_to_semantic(
            text,
            history_prompt=history_prompt,
            temp=text_temp,
            base=base,
            allow_early_stop=allow_early_stop,
            **kwargs,
        )
        audio_arr, c, f = self.semantic_to_waveform(
            x_semantic, history_prompt=history_prompt, temp=waveform_temp, base=base
        )
        return audio_arr, [x_semantic, c, f]

    def generate_voice(self, audio, speaker_id, voice_dir):
        """Generate a voice file for `speaker_id` from the given audio, unless one can be loaded.

        Nothing happens when `voice_dir` is None. Otherwise the voice is looked up in `voice_dir`;
        only when that lookup raises `KeyError` or `FileNotFoundError` is a new voice generated
        and written to `<voice_dir>/<speaker_id>.npz`.

        Args:
            audio (str): Path to the audio file.
            speaker_id (str): Speaker name.
            voice_dir (str): Path to the directory to save the generated voice.
        """
        if voice_dir is None:
            return
        try:
            # `load_voice` takes the model as its first argument (same call shape as in `synthesize`).
            load_voice(self, speaker_id, [voice_dir])
        except (KeyError, FileNotFoundError):
            output_path = os.path.join(voice_dir, speaker_id + ".npz")
            os.makedirs(voice_dir, exist_ok=True)
            # Inside a method body this name resolves to the module-level `generate_voice` helper,
            # not to this method.
            generate_voice(audio, self, output_path)

    def _set_voice_dirs(self, voice_dirs):
        """Build the list of directories to search for voices.

        Args:
            voice_dirs: None, a single directory path, or a sequence of directory paths.

        Returns:
            A new list with the user-supplied directories first, followed by
            `config.DEF_SPEAKER_DIR` when that is a string (the directory is created if missing).
        """
        def_voice_dir = None
        if isinstance(self.config.DEF_SPEAKER_DIR, str):
            os.makedirs(self.config.DEF_SPEAKER_DIR, exist_ok=True)
            if os.path.isdir(self.config.DEF_SPEAKER_DIR):
                def_voice_dir = self.config.DEF_SPEAKER_DIR
        _voice_dirs = [def_voice_dir] if def_voice_dir is not None else []
        if voice_dirs is not None:
            if isinstance(voice_dirs, str):
                voice_dirs = [voice_dirs]
            # `list(...)` lets tuples and other sequences be passed, not only lists.
            _voice_dirs = list(voice_dirs) + _voice_dirs
        return _voice_dirs

    # TODO: remove config from synthesize
    def synthesize(
        self, text, config, speaker_id="random", voice_dirs=None, **kwargs
    ):  # pylint: disable=unused-argument
        """Synthesize speech with the given input text.

        Args:
            text (str): Input text.
            config (BarkConfig): Unused; kept for interface compatibility.
            speaker_id (str): Name of the voice to load. None is treated as `"random"`.
            voice_dirs (str | List[str]): Extra directory or directories searched for voices, in
                addition to `config.DEF_SPEAKER_DIR`. Defaults to None.
            **kwargs: Model specific inference settings forwarded to `generate_audio()` and from there to
                `TTS.tts.layers.bark.inference_funcs.generate_text_semantic()`.

        Returns:
            A dictionary with `wav` as the output waveform and `text_inputs` as the input text.
        """
        speaker_id = "random" if speaker_id is None else speaker_id
        voice_dirs = self._set_voice_dirs(voice_dirs)
        history_prompt = load_voice(self, speaker_id, voice_dirs)
        outputs = self.generate_audio(text, history_prompt=history_prompt, **kwargs)
        return_dict = {
            "wav": outputs[0],
            "text_inputs": text,
        }

        return return_dict

    def eval_step(self):
        """Not implemented."""
        ...

    def forward(self):
        """Not implemented."""
        ...

    def inference(self):
        """Not implemented."""
        ...

    @staticmethod
    def init_from_config(config: "BarkConfig", **kwargs):  # pylint: disable=unused-argument
        """Create a `Bark` instance from `config`; extra keyword arguments are ignored."""
        return Bark(config)

    # pylint: disable=unused-argument, redefined-builtin
    def load_checkpoint(
        self,
        config,
        checkpoint_dir,
        text_model_path=None,
        coarse_model_path=None,
        fine_model_path=None,
        hubert_model_path=None,
        hubert_tokenizer_path=None,
        eval=False,
        strict=True,
        **kwargs,
    ):
        """Load the model checkpoints from a directory. This model uses multiple checkpoint files and,
        unless explicit paths are given, expects all of them under `checkpoint_dir` with the right names.
        If eval is True, set the model to eval mode.

        The resolved paths are written to `self.config.LOCAL_MODEL_PATHS`; the text, coarse and fine
        models are then loaded via `load_bark_models()`. The hubert paths are only recorded in the
        config here, not loaded.

        Args:
            config (BarkConfig): Unused; `self.config` is used instead.
            checkpoint_dir (str): The directory where the checkpoints are stored.
            text_model_path (str, optional): Path to the text (semantic) model checkpoint.
                Defaults to `<checkpoint_dir>/text_2.pt`.
            coarse_model_path (str, optional): Path to the coarse model checkpoint.
                Defaults to `<checkpoint_dir>/coarse_2.pt`.
            fine_model_path (str, optional): Path to the fine model checkpoint.
                Defaults to `<checkpoint_dir>/fine_2.pt`.
            hubert_model_path (str, optional): Path to the hubert model checkpoint.
                Defaults to `<checkpoint_dir>/hubert.pt`.
            hubert_tokenizer_path (str, optional): Path to the hubert tokenizer checkpoint.
                Defaults to `<checkpoint_dir>/tokenizer.pth`.
            eval (bool, optional): Whether to set the model to eval mode. Defaults to False.
            strict (bool, optional): Unused. Defaults to True.
        """
        text_model_path = text_model_path or os.path.join(checkpoint_dir, "text_2.pt")
        coarse_model_path = coarse_model_path or os.path.join(checkpoint_dir, "coarse_2.pt")
        fine_model_path = fine_model_path or os.path.join(checkpoint_dir, "fine_2.pt")
        hubert_model_path = hubert_model_path or os.path.join(checkpoint_dir, "hubert.pt")
        hubert_tokenizer_path = hubert_tokenizer_path or os.path.join(checkpoint_dir, "tokenizer.pth")

        self.config.LOCAL_MODEL_PATHS["text"] = text_model_path
        self.config.LOCAL_MODEL_PATHS["coarse"] = coarse_model_path
        self.config.LOCAL_MODEL_PATHS["fine"] = fine_model_path
        self.config.LOCAL_MODEL_PATHS["hubert"] = hubert_model_path
        self.config.LOCAL_MODEL_PATHS["hubert_tokenizer"] = hubert_tokenizer_path

        self.load_bark_models()

        if eval:
            self.eval()
first commit 2024-05-03 04:18:51 +03:00			`import os`
			`from dataclasses import dataclass`
			`from typing import Optional`

			`import numpy as np`
			`from coqpit import Coqpit`
			`from encodec import EncodecModel`
			`from transformers import BertTokenizer`

			`from TTS.tts.layers.bark.inference_funcs import (`
			`codec_decode,`
			`generate_coarse,`
			`generate_fine,`
			`generate_text_semantic,`
			`generate_voice,`
			`load_voice,`
			`)`
			`from TTS.tts.layers.bark.load_model import load_model`
			`from TTS.tts.layers.bark.model import GPT`
			`from TTS.tts.layers.bark.model_fine import FineGPT`
			`from TTS.tts.models.base_tts import BaseTTS`


			`@dataclass`
			`class BarkAudioConfig(Coqpit):`
			`sample_rate: int = 24000`
			`output_sample_rate: int = 24000`


			`class Bark(BaseTTS):`
			`def __init__(`
			`self,`
			`config: Coqpit,`
			`tokenizer: BertTokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased"),`
			`) -> None:`
			`super().__init__(config=config, ap=None, tokenizer=None, speaker_manager=None, language_manager=None)`
			`self.config.num_chars = len(tokenizer)`
			`self.tokenizer = tokenizer`
			`self.semantic_model = GPT(config.semantic_config)`
			`self.coarse_model = GPT(config.coarse_config)`
			`self.fine_model = FineGPT(config.fine_config)`
			`self.encodec = EncodecModel.encodec_model_24khz()`
			`self.encodec.set_target_bandwidth(6.0)`

			`@property`
			`def device(self):`
			`return next(self.parameters()).device`

			`def load_bark_models(self):`
			`self.semantic_model, self.config = load_model(`
			`ckpt_path=self.config.LOCAL_MODEL_PATHS["text"], device=self.device, config=self.config, model_type="text"`
			`)`
			`self.coarse_model, self.config = load_model(`
			`ckpt_path=self.config.LOCAL_MODEL_PATHS["coarse"],`
			`device=self.device,`
			`config=self.config,`
			`model_type="coarse",`
			`)`
			`self.fine_model, self.config = load_model(`
			`ckpt_path=self.config.LOCAL_MODEL_PATHS["fine"], device=self.device, config=self.config, model_type="fine"`
			`)`

			`def train_step(`
			`self,`
			`):`
			`pass`

			`def text_to_semantic(`
			`self,`
			`text: str,`
			`history_prompt: Optional[str] = None,`
			`temp: float = 0.7,`
			`base=None,`
			`allow_early_stop=True,`
			`**kwargs,`
			`):`
			`"""Generate semantic array from text.`

			`Args:`
			`text: text to be turned into audio`
			`history_prompt: history choice for audio cloning`
			`temp: generation temperature (1.0 more diverse, 0.0 more conservative)`

			`Returns:`
			numpy semantic array to be fed into `semantic_to_waveform`
			`"""`
			`x_semantic = generate_text_semantic(`
			`text,`
			`self,`
			`history_prompt=history_prompt,`
			`temp=temp,`
			`base=base,`
			`allow_early_stop=allow_early_stop,`
			`**kwargs,`
			`)`
			`return x_semantic`

			`def semantic_to_waveform(`
			`self,`
			`semantic_tokens: np.ndarray,`
			`history_prompt: Optional[str] = None,`
			`temp: float = 0.7,`
			`base=None,`
			`):`
			`"""Generate audio array from semantic input.`

			`Args:`
			semantic_tokens: semantic token output from `text_to_semantic`
			`history_prompt: history choice for audio cloning`
			`temp: generation temperature (1.0 more diverse, 0.0 more conservative)`

			`Returns:`
			`numpy audio array at sample frequency 24khz`
			`"""`
			`x_coarse_gen = generate_coarse(`
			`semantic_tokens,`
			`self,`
			`history_prompt=history_prompt,`
			`temp=temp,`
			`base=base,`
			`)`
			`x_fine_gen = generate_fine(`
			`x_coarse_gen,`
			`self,`
			`history_prompt=history_prompt,`
			`temp=0.5,`
			`base=base,`
			`)`
			`audio_arr = codec_decode(x_fine_gen, self)`
			`return audio_arr, x_coarse_gen, x_fine_gen`

			`def generate_audio(`
			`self,`
			`text: str,`
			`history_prompt: Optional[str] = None,`
			`text_temp: float = 0.7,`
			`waveform_temp: float = 0.7,`
			`base=None,`
			`allow_early_stop=True,`
			`**kwargs,`
			`):`
			`"""Generate audio array from input text.`

			`Args:`
			`text: text to be turned into audio`
			`history_prompt: history choice for audio cloning`
			`text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)`
			`waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)`

			`Returns:`
			`numpy audio array at sample frequency 24khz`
			`"""`
			`x_semantic = self.text_to_semantic(`
			`text,`
			`history_prompt=history_prompt,`
			`temp=text_temp,`
			`base=base,`
			`allow_early_stop=allow_early_stop,`
			`**kwargs,`
			`)`
			`audio_arr, c, f = self.semantic_to_waveform(`
			`x_semantic, history_prompt=history_prompt, temp=waveform_temp, base=base`
			`)`
			`return audio_arr, [x_semantic, c, f]`

			`def generate_voice(self, audio, speaker_id, voice_dir):`
			`"""Generate a voice from the given audio and text.`

			`Args:`
			`audio (str): Path to the audio file.`
			`speaker_id (str): Speaker name.`
			`voice_dir (str): Path to the directory to save the generate voice.`
			`"""`
			`if voice_dir is not None:`
			`voice_dirs = [voice_dir]`
			`try:`
			`_ = load_voice(speaker_id, voice_dirs)`
			`except (KeyError, FileNotFoundError):`
			`output_path = os.path.join(voice_dir, speaker_id + ".npz")`
			`os.makedirs(voice_dir, exist_ok=True)`
			`generate_voice(audio, self, output_path)`

			`def _set_voice_dirs(self, voice_dirs):`
			`def_voice_dir = None`
			`if isinstance(self.config.DEF_SPEAKER_DIR, str):`
			`os.makedirs(self.config.DEF_SPEAKER_DIR, exist_ok=True)`
			`if os.path.isdir(self.config.DEF_SPEAKER_DIR):`
			`def_voice_dir = self.config.DEF_SPEAKER_DIR`
			`_voice_dirs = [def_voice_dir] if def_voice_dir is not None else []`
			`if voice_dirs is not None:`
			`if isinstance(voice_dirs, str):`
			`voice_dirs = [voice_dirs]`
			`_voice_dirs = voice_dirs + _voice_dirs`
			`return _voice_dirs`

			`# TODO: remove config from synthesize`
			`def synthesize(`
			`self, text, config, speaker_id="random", voice_dirs=None, **kwargs`
			`): # pylint: disable=unused-argument`
			`"""Synthesize speech with the given input text.`

			`Args:`
			`text (str): Input text.`
			`config (BarkConfig): Config with inference parameters.`
			speaker_id (str): One of the available speaker names. If `random`, it generates a random speaker.
			`speaker_wav (str): Path to the speaker audio file for cloning a new voice. It is cloned and saved in`
			`voice_dirs` with the name `speaker_id`. Defaults to None.
			`voice_dirs (List[str]): List of paths that host reference audio files for speakers. Defaults to None.`
			**kwargs: Model specific inference settings used by `generate_audio()` and `TTS.tts.layers.bark.inference_funcs.generate_text_semantic().

			`Returns:`
			A dictionary of the output values with `wav` as output waveform, `deterministic_seed` as seed used at inference,
			`text_input` as text token IDs after tokenizer, `voice_samples` as samples used for cloning, `conditioning_latents`
			`as latents used at inference.`

			`"""`
			`speaker_id = "random" if speaker_id is None else speaker_id`
			`voice_dirs = self._set_voice_dirs(voice_dirs)`
			`history_prompt = load_voice(self, speaker_id, voice_dirs)`
			`outputs = self.generate_audio(text, history_prompt=history_prompt, **kwargs)`
			`return_dict = {`
			`"wav": outputs[0],`
			`"text_inputs": text,`
			`}`

			`return return_dict`

			`def eval_step(self):`
			`...`

			`def forward(self):`
			`...`

			`def inference(self):`
			`...`

			`@staticmethod`
			`def init_from_config(config: "BarkConfig", **kwargs): # pylint: disable=unused-argument`
			`return Bark(config)`

			`# pylint: disable=unused-argument, redefined-builtin`
			`def load_checkpoint(`
			`self,`
			`config,`
			`checkpoint_dir,`
			`text_model_path=None,`
			`coarse_model_path=None,`
			`fine_model_path=None,`
			`hubert_model_path=None,`
			`hubert_tokenizer_path=None,`
			`eval=False,`
			`strict=True,`
			`**kwargs,`
			`):`
			`"""Load a model checkpoints from a directory. This model is with multiple checkpoint files and it`
			expects to have all the files to be under the given `checkpoint_dir` with the rigth names.
			`If eval is True, set the model to eval mode.`

			`Args:`
			`config (TortoiseConfig): The model config.`
			`checkpoint_dir (str): The directory where the checkpoints are stored.`
			`ar_checkpoint_path (str, optional): The path to the autoregressive checkpoint. Defaults to None.`
			`diff_checkpoint_path (str, optional): The path to the diffusion checkpoint. Defaults to None.`
			`clvp_checkpoint_path (str, optional): The path to the CLVP checkpoint. Defaults to None.`
			`vocoder_checkpoint_path (str, optional): The path to the vocoder checkpoint. Defaults to None.`
			`eval (bool, optional): Whether to set the model to eval mode. Defaults to False.`
			`strict (bool, optional): Whether to load the model strictly. Defaults to True.`
			`"""`
			`text_model_path = text_model_path or os.path.join(checkpoint_dir, "text_2.pt")`
			`coarse_model_path = coarse_model_path or os.path.join(checkpoint_dir, "coarse_2.pt")`
			`fine_model_path = fine_model_path or os.path.join(checkpoint_dir, "fine_2.pt")`
			`hubert_model_path = hubert_model_path or os.path.join(checkpoint_dir, "hubert.pt")`
			`hubert_tokenizer_path = hubert_tokenizer_path or os.path.join(checkpoint_dir, "tokenizer.pth")`

			`self.config.LOCAL_MODEL_PATHS["text"] = text_model_path`
			`self.config.LOCAL_MODEL_PATHS["coarse"] = coarse_model_path`
			`self.config.LOCAL_MODEL_PATHS["fine"] = fine_model_path`
			`self.config.LOCAL_MODEL_PATHS["hubert"] = hubert_model_path`
			`self.config.LOCAL_MODEL_PATHS["hubert_tokenizer"] = hubert_tokenizer_path`

			`self.load_bark_models()`

			`if eval:`
			`self.eval()`