239 lines
6.5 KiB
Python
239 lines
6.5 KiB
Python
|
import librosa
|
||
|
import matplotlib
|
||
|
import matplotlib.pyplot as plt
|
||
|
import numpy as np
|
||
|
import torch
|
||
|
from matplotlib.colors import LogNorm
|
||
|
|
||
|
# Select matplotlib's non-interactive "Agg" backend so the plotting helpers in
# this module can render figures off-screen (no GUI/display required).
matplotlib.use("Agg")
|
||
|
|
||
|
|
||
|
def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None, output_fig=False, plot_log=False):
    """Render an alignment matrix as a heat map.

    Args:
        alignment (torch.Tensor or np.ndarray): Alignment values. A tensor is detached,
            moved to the CPU, converted to NumPy and squeezed; any other input is used
            as given. The matrix is transposed for display, so its first axis runs
            along the x-axis ("Decoder timestep") and its second along the y-axis
            ("Encoder timestep").
        info (str, optional): Extra text appended below the x-axis label.
        fig_size (tuple): Figure size forwarded to ``plt.subplots`` as ``figsize``.
        title (str, optional): Title drawn above the plot.
        output_fig (bool): If False, the figure is closed (released from pyplot's
            figure registry) before it is returned. If True, it is left open.
        plot_log (bool): If True, colours are mapped with ``LogNorm``.

    Returns:
        matplotlib.figure.Figure: The figure holding the heat map.
    """
    if isinstance(alignment, torch.Tensor):
        alignment_ = alignment.detach().cpu().numpy().squeeze()
    else:
        alignment_ = alignment
    # Half precision input is promoted to float32 before plotting
    # (presumably because float16 images are poorly supported by the backend).
    if alignment_.dtype == np.float16:
        alignment_ = alignment_.astype(np.float32)

    fig, ax = plt.subplots(figsize=fig_size)
    im = ax.imshow(
        alignment_.T,
        aspect="auto",
        origin="lower",
        interpolation="none",
        norm=LogNorm() if plot_log else None,
    )
    fig.colorbar(im, ax=ax)

    xlabel = "Decoder timestep"
    if info is not None:
        xlabel += "\n\n" + info
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Encoder timestep")

    # Add the title *before* computing the layout so that tight_layout reserves
    # room for it instead of letting it be clipped or overlap the axes.
    if title is not None:
        ax.set_title(title)
    fig.tight_layout()

    if not output_fig:
        # Close this exact figure rather than whatever figure happens to be current.
        plt.close(fig)
    return fig
|
||
|
|
||
|
|
||
|
def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10), output_fig=False):
    """Draw a spectrogram as an image with a colour bar.

    Args:
        spectrogram (torch.Tensor or np.ndarray): Spectrogram values. A tensor is
            detached, moved to the CPU, converted to NumPy and squeezed. The data
            is transposed before being displayed.
        ap (optional): Object whose ``denormalize`` method is applied to the
            (transposed) spectrogram when given.
        fig_size (tuple): Figure size forwarded to ``plt.figure`` as ``figsize``.
        output_fig (bool): If False, the current figure is closed before returning.

    Returns:
        matplotlib.figure.Figure: The figure holding the spectrogram image.
    """
    image = spectrogram
    if isinstance(image, torch.Tensor):
        image = image.detach().cpu().numpy().squeeze()
    image = image.T

    # Promote half precision data to float32 before handing it to matplotlib.
    if image.dtype == np.float16:
        image = image.astype(np.float32)

    if ap is not None:
        image = ap.denormalize(image)

    figure = plt.figure(figsize=fig_size)
    plt.imshow(image, aspect="auto", origin="lower")
    plt.colorbar()
    plt.tight_layout()

    if output_fig:
        return figure
    plt.close()
    return figure
|
||
|
|
||
|
|
||
|
def plot_pitch(pitch, spectrogram, ap=None, fig_size=(30, 10), output_fig=False):
    """Plot pitch curves on top of the spectrogram.

    Args:
        pitch (np.array): Pitch values.
        spectrogram (np.array or torch.Tensor): Spectrogram values. A tensor is
            detached, moved to the CPU, converted to NumPy and squeezed.
        ap (optional): Object whose ``denormalize`` method is applied to the
            (transposed) spectrogram when given.
        fig_size (tuple, optional): Figure size. ``None`` falls back to matplotlib's
            configured default size.
        output_fig (bool): If False, the figure is closed before it is returned.

    Returns:
        matplotlib.figure.Figure: Figure with the spectrogram image and the pitch
        curve drawn on a twin y-axis.

    Shapes:
        pitch: :math:`(T,)`
        spectrogram: transposed before display; since the x-axis is labelled
            "time" the input is presumably :math:`(T, C)`.
            NOTE(review): the earlier docstring stated :math:`(C, T)` - confirm
            against callers.
    """
    if isinstance(spectrogram, torch.Tensor):
        spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T
    else:
        spectrogram_ = spectrogram.T
    # Promote half precision data to float32 before plotting.
    if spectrogram_.dtype == np.float16:
        spectrogram_ = spectrogram_.astype(np.float32)
    if ap is not None:
        spectrogram_ = ap.denormalize(spectrogram_)

    # Pass the size straight to the figure instead of temporarily overwriting the
    # global rcParams, which would stay modified if plotting raised an exception.
    fig, ax = plt.subplots(figsize=fig_size)

    ax.imshow(spectrogram_, aspect="auto", origin="lower")
    ax.set_xlabel("time")
    ax.set_ylabel("spec_freq")

    # Second y-axis sharing the same x-axis for the pitch contour.
    ax2 = ax.twinx()
    ax2.plot(pitch, linewidth=5.0, color="red")
    ax2.set_ylabel("F0")

    if not output_fig:
        # Close this exact figure rather than whatever figure happens to be current.
        plt.close(fig)
    return fig
|
||
|
|
||
|
|
||
|
def plot_avg_pitch(pitch, chars, fig_size=(30, 10), output_fig=False):
    """Plot pitch curves on top of the input characters.

    Args:
        pitch (np.array): Pitch values.
        chars (str): Characters to place to the x-axis. One tick is created per
            character, at positions ``0 .. len(chars) - 1``.
        fig_size (tuple, optional): Figure size. ``None`` falls back to matplotlib's
            configured default size.
        output_fig (bool): If False, the figure is closed before it is returned.

    Returns:
        matplotlib.figure.Figure: Figure with the pitch curve drawn on a twin y-axis.

    Shapes:
        pitch: :math:`(T,)`
    """
    # Pass the size straight to the figure instead of temporarily overwriting the
    # global rcParams, which would stay modified if plotting raised an exception.
    fig, ax = plt.subplots(figsize=fig_size)

    # ``ax`` is the current axes right after creation, so the ticks land on it.
    x = np.arange(len(chars))
    plt.xticks(x, chars)

    ax.set_xlabel("characters")
    ax.set_ylabel("freq")

    # Second y-axis sharing the same x-axis for the pitch contour.
    ax2 = ax.twinx()
    ax2.plot(pitch, linewidth=5.0, color="red")
    ax2.set_ylabel("F0")

    if not output_fig:
        # Close this exact figure rather than whatever figure happens to be current.
        plt.close(fig)
    return fig
|
||
|
|
||
|
|
||
|
def plot_avg_energy(energy, chars, fig_size=(30, 10), output_fig=False):
    """Plot energy curves on top of the input characters.

    Args:
        energy (np.array): energy values.
        chars (str): Characters to place to the x-axis. One tick is created per
            character, at positions ``0 .. len(chars) - 1``.
        fig_size (tuple, optional): Figure size. ``None`` falls back to matplotlib's
            configured default size.
        output_fig (bool): If False, the figure is closed before it is returned.

    Returns:
        matplotlib.figure.Figure: Figure with the energy curve drawn on a twin y-axis.

    Shapes:
        energy: :math:`(T,)`
    """
    # Pass the size straight to the figure instead of temporarily overwriting the
    # global rcParams, which would stay modified if plotting raised an exception.
    fig, ax = plt.subplots(figsize=fig_size)

    # ``ax`` is the current axes right after creation, so the ticks land on it.
    x = np.arange(len(chars))
    plt.xticks(x, chars)

    ax.set_xlabel("characters")
    # NOTE(review): nothing is plotted on this left axis; the "freq" label is kept
    # unchanged for backward compatibility - confirm whether it is intended.
    ax.set_ylabel("freq")

    # Second y-axis sharing the same x-axis for the energy contour.
    ax2 = ax.twinx()
    ax2.plot(energy, linewidth=5.0, color="red")
    ax2.set_ylabel("energy")

    if not output_fig:
        # Close this exact figure rather than whatever figure happens to be current.
        plt.close(fig)
    return fig
|
||
|
|
||
|
|
||
|
def _visualize_spectrogram_panel(spec, hop_length, CONFIG, label_fontsize):
    """Draw one spectrogram into the current subplot with ``librosa.display.specshow``."""
    # A bare ``import librosa`` does not expose ``librosa.display`` on librosa
    # releases without lazy submodule loading, so import the submodule explicitly.
    import librosa.display  # pylint: disable=import-outside-toplevel

    librosa.display.specshow(
        spec.T,
        sr=CONFIG.audio["sample_rate"],
        hop_length=hop_length,
        x_axis="time",
        y_axis="linear",
        fmin=CONFIG.audio["mel_fmin"],
        fmax=CONFIG.audio["mel_fmax"],
    )
    plt.xlabel("Time", fontsize=label_fontsize)
    plt.ylabel("Hz", fontsize=label_fontsize)
    plt.tight_layout()
    plt.colorbar()


def visualize(
    alignment,
    postnet_output,
    text,
    hop_length,
    CONFIG,
    tokenizer,
    stop_tokens=None,
    decoder_output=None,
    output_path=None,
    figsize=(8, 24),
    output_fig=False,
):
    """Intended to be used in Notebooks.

    Stacks up to four panels in one figure: the alignment heat map, the stop-token
    curve (only if ``stop_tokens`` is given), the postnet spectrogram and the decoder
    spectrogram (only if ``decoder_output`` is given).

    Args:
        alignment: Alignment matrix; its transpose is shown in the first panel.
        postnet_output: Spectrogram whose transpose is shown in the third panel.
        text (str): Input text; its characters label the y ticks of the alignment
            panel. If ``CONFIG.use_phonemes`` is set, it is first passed through
            ``tokenizer.text_to_ids`` / ``tokenizer.ids_to_text`` and printed.
        hop_length (int): Hop length forwarded to ``specshow``.
        CONFIG: Configuration object; ``use_phonemes`` and ``audio["sample_rate"]``,
            ``audio["mel_fmin"]``, ``audio["mel_fmax"]`` are read from it.
        tokenizer: Object providing ``text_to_ids`` and ``ids_to_text``.
        stop_tokens (optional): Stop-token values plotted in the second panel.
        decoder_output (optional): Spectrogram whose transpose is shown in a fourth panel.
        output_path (str, optional): If set, the path is printed, the figure is
            saved there and then closed.
        figsize (tuple): Figure size forwarded to ``plt.figure``.
        output_fig (bool): If False, the figure is closed before returning.

    Returns:
        None
    """
    num_plot = 4 if decoder_output is not None else 3

    label_fontsize = 16
    fig = plt.figure(figsize=figsize)

    plt.subplot(num_plot, 1, 1)
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
    # compute phoneme representation and back
    if CONFIG.use_phonemes:
        seq = tokenizer.text_to_ids(text)
        text = tokenizer.ids_to_text(seq)
        print(text)
    plt.yticks(range(len(text)), list(text))
    plt.colorbar()

    if stop_tokens is not None:
        # plot stopnet predictions
        plt.subplot(num_plot, 1, 2)
        plt.plot(range(len(stop_tokens)), list(stop_tokens))

    # plot postnet spectrogram (always in slot 3, even when slot 2 is unused)
    plt.subplot(num_plot, 1, 3)
    _visualize_spectrogram_panel(postnet_output, hop_length, CONFIG, label_fontsize)

    if decoder_output is not None:
        plt.subplot(num_plot, 1, 4)
        _visualize_spectrogram_panel(decoder_output, hop_length, CONFIG, label_fontsize)

    if output_path:
        print(output_path)
        fig.savefig(output_path)

    # Close this exact figure once. Calling a bare ``plt.close()`` twice (after
    # saving and again for ``output_fig=False``) could close an unrelated figure
    # that became current after the first close.
    if output_path or not output_fig:
        plt.close(fig)
|