import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
from torch import Tensor
from torchaudio._internal import load_state_dict_from_url
from torchaudio.functional import mu_law_decoding
from torchaudio.models import Tacotron2, WaveRNN
from torchaudio.transforms import GriffinLim, InverseMelScale
from . import utils
from .interface import Tacotron2TTSBundle
# Nothing is exported through ``from <module> import *``.
__all__ = []
# URL prefix; each mixin appends its checkpoint path to this when downloading weights.
_BASE_URL = "https://download.pytorch.org/torchaudio/models"
# Pipeline implementation - Text Processor
class _EnglishCharProcessor(Tacotron2TTSBundle.TextProcessor):
    """Text processor that encodes English text one character at a time.

    The symbol table comes from ``utils._get_chars()``; a symbol's position in
    that table is the integer ID emitted for it.
    """

    def __init__(self):
        super().__init__()
        self._tokens = utils._get_chars()
        self._mapping = {s: i for i, s in enumerate(self._tokens)}

    @property
    def tokens(self):
        """The symbol table; index ``i`` holds the symbol encoded as ID ``i``."""
        return self._tokens

    def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]:
        """Encode one text or a batch of texts into integer IDs.

        Args:
            texts: A single string, or a list of strings. A single string is
                handled as a batch of one.

        Returns:
            Whatever ``utils._to_tensor`` produces for the per-text ID lists.
            NOTE(review): presumably a padded ID tensor plus a lengths tensor —
            confirm against ``utils._to_tensor``.
        """
        if isinstance(texts, str):
            texts = [texts]
        # Text is lower-cased before lookup; characters that are not in the
        # symbol table are dropped silently rather than raising.
        indices = [[self._mapping[c] for c in t.lower() if c in self._mapping] for t in texts]
        return utils._to_tensor(indices)
class _EnglishPhoneProcessor(Tacotron2TTSBundle.TextProcessor):
    """Text processor that converts English text to phonemes and encodes them.

    Grapheme-to-phoneme conversion is delegated to the phonemizer returned by
    ``utils._load_phonemizer``; the phoneme table comes from
    ``utils._get_phones()``.
    """

    def __init__(self, *, dl_kwargs=None):
        """
        Args:
            dl_kwargs: Forwarded unchanged to ``utils._load_phonemizer``.
        """
        super().__init__()
        self._tokens = utils._get_phones()
        self._mapping = {p: i for i, p in enumerate(self._tokens)}
        self._phonemizer = utils._load_phonemizer("en_us_cmudict_forward.pt", dl_kwargs=dl_kwargs)
        # Matches either one bracketed upper-case phoneme or one punctuation/space symbol.
        self._pattern = r"(\[[A-Z]+?\]|[_!'(),.:;? -])"

    @property
    def tokens(self):
        """The phoneme table; index ``i`` holds the symbol encoded as ID ``i``."""
        return self._tokens

    def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]:
        """Phonemize and encode one text or a batch of texts.

        Args:
            texts: A single string, or a list of strings. A single string is
                handled as a batch of one.

        Returns:
            Whatever ``utils._to_tensor`` produces for the per-text ID lists.
            NOTE(review): presumably a padded ID tensor plus a lengths tensor —
            confirm against ``utils._to_tensor``.

        Raises:
            KeyError: If the phonemizer emits a symbol that is not in the
                phoneme table.
        """
        if isinstance(texts, str):
            texts = [texts]

        indices = []
        for phones in self._phonemizer(texts, lang="en_us"):
            # '[F][UW][B][AA][R]!' -> ['F', 'UW', 'B', 'AA', 'R', '!']
            ret = [re.sub(r"[\[\]]", "", r) for r in re.findall(self._pattern, phones)]
            indices.append([self._mapping[p] for p in ret])
        return utils._to_tensor(indices)
# Pipeline implementation - Vocoder
class _WaveRNNVocoder(torch.nn.Module, Tacotron2TTSBundle.Vocoder):
    """Vocoder that turns mel spectrograms into waveforms with a WaveRNN model."""

    def __init__(self, model: WaveRNN, min_level_db: Optional[float] = -100):
        """
        Args:
            model: The WaveRNN model used for inference.
            min_level_db: Decibel floor used to normalise the spectrogram.
                ``None`` skips the normalisation step.
        """
        # ``Module.__init__`` has to run before any sub-module is assigned;
        # otherwise ``self._model = model`` raises AttributeError.
        super().__init__()
        self._sample_rate = 22050
        self._model = model
        self._min_level_db = min_level_db

    @property
    def sample_rate(self):
        """Sample rate reported for the generated waveform (fixed at 22050)."""
        return self._sample_rate

    def forward(self, mel_spec, lengths=None):
        """Generate waveforms from mel spectrograms.

        Args:
            mel_spec: Mel spectrogram tensor.
                NOTE(review): assumed to be in natural-log scale since it is
                exponentiated first — confirm against the Tacotron2 output.
            lengths: Optional lengths, forwarded to ``WaveRNN.infer``.

        Returns:
            ``(waveform, lengths)`` where ``lengths`` is the value returned by
            ``WaveRNN.infer``.
        """
        mel_spec = torch.exp(mel_spec)
        # Convert to decibels; the 1e-5 floor corresponds to -100 dB.
        mel_spec = 20 * torch.log10(torch.clamp(mel_spec, min=1e-5))
        if self._min_level_db is not None:
            # Linear rescale: ``min_level_db`` maps to 0 and 0 dB maps to 1.
            mel_spec = (self._min_level_db - mel_spec) / self._min_level_db
            mel_spec = torch.clamp(mel_spec, min=0, max=1)
        waveform, lengths = self._model.infer(mel_spec, lengths)
        # NOTE(review): presumably maps the model output back to integer
        # mu-law class indices — confirm against ``utils._unnormalize_waveform``.
        waveform = utils._unnormalize_waveform(waveform, self._model.n_bits)
        waveform = mu_law_decoding(waveform, self._model.n_classes)
        # Drop dimension 1 when it has size one.
        waveform = waveform.squeeze(1)
        return waveform, lengths
class _GriffinLimVocoder(torch.nn.Module, Tacotron2TTSBundle.Vocoder):
    """Vocoder that inverts mel spectrograms and reconstructs audio with Griffin-Lim."""

    def __init__(self):
        # ``Module.__init__`` has to run before any sub-module is assigned;
        # otherwise assigning the transforms below raises AttributeError.
        super().__init__()
        self._sample_rate = 22050
        # NOTE(review): in the reviewed source both constructor calls were cut
        # off after ``n_stft``. ``n_fft=1024`` follows from
        # ``n_stft = n_fft // 2 + 1``; every other argument below was restored
        # from the Tacotron2 LJSpeech training configuration and must be
        # confirmed against the training script.
        self._inv_mel = InverseMelScale(
            n_stft=(1024 // 2 + 1),
            n_mels=80,
            sample_rate=self._sample_rate,
            f_min=0.0,
            f_max=8000.0,
            mel_scale="slaney",
            norm="slaney",
        )
        self._griffin_lim = GriffinLim(
            n_fft=1024,
            power=1,
            hop_length=256,
            win_length=1024,
        )

    @property
    def sample_rate(self):
        """Sample rate reported for the generated waveform (fixed at 22050)."""
        return self._sample_rate

    def forward(self, mel_spec, lengths=None):
        """Generate waveforms from mel spectrograms.

        Args:
            mel_spec: Mel spectrogram tensor.
                NOTE(review): assumed to be in natural-log scale since it is
                exponentiated first — confirm against the Tacotron2 output.
            lengths: Optional lengths; returned unchanged.

        Returns:
            ``(waveforms, lengths)``.
        """
        mel_spec = torch.exp(mel_spec)
        # Work on a detached copy with gradients enabled.
        # NOTE(review): presumably needed because ``InverseMelScale`` estimates
        # the spectrogram with autograd in some torchaudio releases — confirm.
        mel_spec = mel_spec.clone().detach().requires_grad_(True)
        spec = self._inv_mel(mel_spec)
        # Cut the result off from the autograd graph before Griffin-Lim.
        spec = spec.detach().requires_grad_(False)
        waveforms = self._griffin_lim(spec)
        return waveforms, lengths
# Bundle classes mixins
class _CharMixin:
    """Mixin that supplies the character-level text processor."""

    def get_text_processor(self) -> Tacotron2TTSBundle.TextProcessor:
        """Create and return a new ``_EnglishCharProcessor``."""
        processor = _EnglishCharProcessor()
        return processor
class _PhoneMixin:
    """Mixin that supplies the phoneme-level text processor."""

    def get_text_processor(self, *, dl_kwargs=None) -> Tacotron2TTSBundle.TextProcessor:
        """Create and return a new ``_EnglishPhoneProcessor``.

        Args:
            dl_kwargs: Passed through to ``_EnglishPhoneProcessor`` unchanged.
        """
        processor = _EnglishPhoneProcessor(dl_kwargs=dl_kwargs)
        return processor
@dataclass
class _Tacotron2Mixin:
    """Mixin that builds a Tacotron2 model and loads its pretrained weights.

    Declared as a dataclass so the two fields below are set through the
    constructor; dataclass inheritance only collects fields from bases that
    are themselves dataclasses.
    """

    # Checkpoint path appended to ``_BASE_URL`` to form the download URL.
    _tacotron2_path: str
    # Keyword arguments passed to the ``Tacotron2`` constructor.
    _tacotron2_params: Dict[str, Any]

    def get_tacotron2(self, *, dl_kwargs=None) -> Tacotron2:
        """Construct Tacotron2 and load the pretrained checkpoint into it.

        Args:
            dl_kwargs: Optional keyword arguments forwarded to
                ``load_state_dict_from_url``. ``None`` means no extra arguments.

        Returns:
            The Tacotron2 model with the downloaded weights loaded, in
            evaluation mode.
        """
        model = Tacotron2(**self._tacotron2_params)
        url = f"{_BASE_URL}/{self._tacotron2_path}"
        dl_kwargs = {} if dl_kwargs is None else dl_kwargs
        state_dict = load_state_dict_from_url(url, **dl_kwargs)
        # Apply the downloaded weights; without this the model would be
        # returned with its random initialisation.
        model.load_state_dict(state_dict)
        # The pipeline is inference-only, so switch off training-time behaviour.
        model.eval()
        return model
@dataclass
class _WaveRNNMixin:
    """Mixin that builds a WaveRNN-based vocoder with pretrained weights.

    Declared as a dataclass so the two fields below are set through the
    constructor; dataclass inheritance only collects fields from bases that
    are themselves dataclasses.
    """

    # Checkpoint path appended to ``_BASE_URL`` to form the download URL.
    _wavernn_path: Optional[str]
    # Keyword arguments passed to the ``WaveRNN`` constructor.
    _wavernn_params: Optional[Dict[str, Any]]

    def get_vocoder(self, *, dl_kwargs=None):
        """Return a ``_WaveRNNVocoder`` wrapping the pretrained WaveRNN model.

        Args:
            dl_kwargs: Optional keyword arguments forwarded to
                ``load_state_dict_from_url``.
        """
        wavernn = self._get_wavernn(dl_kwargs=dl_kwargs)
        return _WaveRNNVocoder(wavernn)

    def _get_wavernn(self, *, dl_kwargs=None):
        """Construct WaveRNN and load the pretrained checkpoint into it.

        Args:
            dl_kwargs: Optional keyword arguments forwarded to
                ``load_state_dict_from_url``. ``None`` means no extra arguments.

        Returns:
            The WaveRNN model with the downloaded weights loaded, in
            evaluation mode.
        """
        model = WaveRNN(**self._wavernn_params)
        url = f"{_BASE_URL}/{self._wavernn_path}"
        dl_kwargs = {} if dl_kwargs is None else dl_kwargs
        state_dict = load_state_dict_from_url(url, **dl_kwargs)
        # Apply the downloaded weights; without this the model would be
        # returned with its random initialisation.
        model.load_state_dict(state_dict)
        # The pipeline is inference-only, so switch off training-time behaviour.
        model.eval()
        return model
class _GriffinLimMixin:
    """Mixin that supplies the Griffin-Lim vocoder."""

    def get_vocoder(self, **_):
        """Create and return a new ``_GriffinLimVocoder``.

        Any keyword arguments are accepted and ignored.
        """
        vocoder = _GriffinLimVocoder()
        return vocoder
# Bundle classes
@dataclass
class _Tacotron2WaveRNNCharBundle(_WaveRNNMixin, _Tacotron2Mixin, _CharMixin, Tacotron2TTSBundle):
    """TTS bundle: character text processor, Tacotron2, and WaveRNN vocoder.

    All behaviour comes from the mixins; the class only combines them.
    """
@dataclass
class _Tacotron2WaveRNNPhoneBundle(_WaveRNNMixin, _Tacotron2Mixin, _PhoneMixin, Tacotron2TTSBundle):
    """TTS bundle: phoneme text processor, Tacotron2, and WaveRNN vocoder.

    All behaviour comes from the mixins; the class only combines them.
    """
@dataclass
class _Tacotron2GriffinLimCharBundle(_GriffinLimMixin, _Tacotron2Mixin, _CharMixin, Tacotron2TTSBundle):
    """TTS bundle: character text processor, Tacotron2, and Griffin-Lim vocoder.

    All behaviour comes from the mixins; the class only combines them.
    """
@dataclass
class _Tacotron2GriffinLimPhoneBundle(_GriffinLimMixin, _Tacotron2Mixin, _PhoneMixin, Tacotron2TTSBundle):
    """TTS bundle: phoneme text processor, Tacotron2, and Griffin-Lim vocoder.

    All behaviour comes from the mixins; the class only combines them.
    """
# Instantiate bundle objects
# Documentation for the bundle object, written in reStructuredText and attached via ``__doc__``.
TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.__doc__ = """Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and
:py:class:`~torchaudio.transforms.GriffinLim` as vocoder.

The text processor encodes the input texts character-by-character.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The default parameters were used.

Please refer to :py:class:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
"""  # noqa: E501
# Documentation for the bundle object, written in reStructuredText and attached via ``__doc__``.
TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.__doc__ = """Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and
:py:class:`~torchaudio.transforms.GriffinLim` as vocoder.

The text processor encodes the input texts based on phoneme.
It uses `DeepPhonemizer <https://github.com/as-ideas/DeepPhonemizer>`__ to convert
graphemes to phonemes.
The model (*en_us_cmudict_forward*) was trained on
`CMUDict <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>`__.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The text processor was set to *"english_phonemes"*.

Please refer to :py:class:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
"""  # noqa: E501
# Documentation for the bundle object, written in reStructuredText and attached via ``__doc__``.
TACOTRON2_WAVERNN_CHAR_LJSPEECH.__doc__ = """Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and
:py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8-bit depth waveform of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs.

The text processor encodes the input texts character-by-character.

You can find the training script for Tacotron2 `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The following parameters were used: ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,
``mel_fmin=40``, and ``mel_fmax=11025``.

You can find the training script for WaveRNN `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_wavernn>`__.

Please refer to :py:class:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
"""  # noqa: E501
# Documentation for the bundle object, written in reStructuredText and attached via ``__doc__``.
TACOTRON2_WAVERNN_PHONE_LJSPEECH.__doc__ = """Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and
:py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8-bit depth waveform of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs.

The text processor encodes the input texts based on phoneme.
It uses `DeepPhonemizer <https://github.com/as-ideas/DeepPhonemizer>`__ to convert
graphemes to phonemes.
The model (*en_us_cmudict_forward*) was trained on
`CMUDict <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>`__.

You can find the training script for Tacotron2 `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The following parameters were used: ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,
``mel_fmin=40``, and ``mel_fmax=11025``.

You can find the training script for WaveRNN `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_wavernn>`__.

Please refer to :py:class:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
"""  # noqa: E501