#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Feature inversion"""

import warnings
import numpy as np
import scipy.fftpack

from ..util.exceptions import ParameterError
from ..core.spectrum import griffinlim
from ..core.spectrum import db_to_power
from ..util.utils import tiny
from .. import filters
from ..util import nnls, expand_to
from numpy.typing import DTypeLike
from typing import Any, Callable, Optional, Union
from .._typing import _WindowSpec, _PadModeSTFT

__all__ = ["mel_to_stft", "mel_to_audio", "mfcc_to_mel", "mfcc_to_audio"]


def mel_to_stft(
    M: np.ndarray,
    *,
    sr: float = 22050,
    n_fft: int = 2048,
    power: float = 2.0,
    **kwargs: Any,
) -> np.ndarray:
    """Approximate STFT magnitude from a Mel power spectrogram.

    Parameters
    ----------
    M : np.ndarray [shape=(..., n_mels, n), non-negative]
        The spectrogram as produced by `feature.melspectrogram`
    sr : number > 0 [scalar]
        sampling rate of the underlying signal
    n_fft : int > 0 [scalar]
        number of FFT components in the resulting STFT
    power : float > 0 [scalar]
        Exponent for the magnitude melspectrogram
    **kwargs : additional keyword arguments for Mel filter bank parameters
    fmin : float >= 0 [scalar]
        lowest frequency (in Hz)
    fmax : float >= 0 [scalar]
        highest frequency (in Hz).
        If `None`, use ``fmax = sr / 2.0``
    htk : bool [scalar]
        use HTK formula instead of Slaney
    norm : {None, 'slaney', or number} [scalar]
        If 'slaney', divide the triangular mel weights by the width of
        the mel band (area normalization).
        If numeric, use `librosa.util.normalize` to normalize each filter
        to unit l_p norm. See `librosa.util.normalize` for a full
        description of supported norm values (including `+-np.inf`).
        Otherwise, leave all the triangles aiming for a peak value of 1.0
    dtype : np.dtype
        The data type of the Mel basis.
        By default, the basis is constructed with the same dtype as ``M``.

    Returns
    -------
    S : np.ndarray [shape=(..., 1 + n_fft // 2, t), non-negative]
        An approximate linear magnitude spectrogram, with one row per
        frequency bin of the Mel filter bank built by `librosa.filters.mel`.

    See Also
    --------
    librosa.feature.melspectrogram
    librosa.stft
    librosa.filters.mel
    librosa.util.nnls

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> S = librosa.util.abs2(librosa.stft(y))
    >>> mel_spec = librosa.feature.melspectrogram(S=S, sr=sr)
    >>> S_inv = librosa.feature.inverse.mel_to_stft(mel_spec, sr=sr)

    Compare the results visually

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max, top_db=None),
    ...                                y_axis='log', x_axis='time', ax=ax[0])
    >>> ax[0].set(title='Original STFT')
    >>> ax[0].label_outer()
    >>> librosa.display.specshow(librosa.amplitude_to_db(S_inv, ref=np.max, top_db=None),
    ...                          y_axis='log', x_axis='time', ax=ax[1])
    >>> ax[1].set(title='Reconstructed STFT')
    >>> ax[1].label_outer()
    >>> librosa.display.specshow(librosa.amplitude_to_db(np.abs(S_inv - S),
    ...                                                  ref=S.max(), top_db=None),
    ...                          vmax=0, y_axis='log', x_axis='time', cmap='magma', ax=ax[2])
    >>> ax[2].set(title='Residual error (dB)')
    >>> fig.colorbar(img, ax=ax, format="%+2.f dB")
    """
    # Construct a mel basis with dtype matching the input data, unless the
    # caller explicitly requested a different dtype.  Supplying the default
    # through the dict (rather than as a separate keyword) avoids a
    # "multiple values for keyword argument 'dtype'" TypeError.
    mel_kwargs = {"dtype": M.dtype, **kwargs}
    mel_basis = filters.mel(sr=sr, n_fft=n_fft, n_mels=M.shape[-2], **mel_kwargs)

    # Find the non-negative least squares solution, and apply
    # the inverse exponent.
    # We'll do the exponentiation in-place.
    inverse = nnls(mel_basis, M)
    return np.power(inverse, 1.0 / power, out=inverse)


def mel_to_audio(
    M: np.ndarray,
    *,
    sr: float = 22050,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    pad_mode: _PadModeSTFT = "constant",
    power: float = 2.0,
    n_iter: int = 32,
    length: Optional[int] = None,
    dtype: DTypeLike = np.float32,
    **kwargs: Any,
) -> np.ndarray:
    """Invert a mel power spectrogram to audio using Griffin-Lim.

    This is primarily a convenience wrapper for:

        >>> S = librosa.feature.inverse.mel_to_stft(M)
        >>> y = librosa.griffinlim(S)

    Parameters
    ----------
    M : np.ndarray [shape=(..., n_mels, n), non-negative]
        The spectrogram as produced by `feature.melspectrogram`
    sr : number > 0 [scalar]
        sampling rate of the underlying signal
    n_fft : int > 0 [scalar]
        number of FFT components in the resulting STFT
    hop_length : None or int > 0
        The hop length of the STFT.  If not provided, it will default to ``n_fft // 4``
    win_length : None or int > 0
        The window length of the STFT.  By default, it will equal ``n_fft``
    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        A window specification as supported by `stft` or `istft`
    center : boolean
        If `True`, the STFT is assumed to use centered frames.
        If `False`, the STFT is assumed to use left-aligned frames.
    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.
    power : float > 0 [scalar]
        Exponent for the magnitude melspectrogram
    n_iter : int > 0
        The number of iterations for Griffin-Lim
    length : None or int > 0
        If provided, the output ``y`` is zero-padded or clipped to exactly ``length``
        samples.
    dtype : np.dtype
        Real numeric type for the time-domain signal.  Default is 32-bit float.
    **kwargs : additional keyword arguments for Mel filter bank parameters
    fmin : float >= 0 [scalar]
        lowest frequency (in Hz)
    fmax : float >= 0 [scalar]
        highest frequency (in Hz).
        If `None`, use ``fmax = sr / 2.0``
    htk : bool [scalar]
        use HTK formula instead of Slaney
    norm : {None, 'slaney', or number} [scalar]
        If 'slaney', divide the triangular mel weights by the width of
        the mel band (area normalization).
        If numeric, use `librosa.util.normalize` to normalize each filter
        to unit l_p norm. See `librosa.util.normalize` for a full
        description of supported norm values (including `+-np.inf`).
        Otherwise, leave all the triangles aiming for a peak value of 1.0

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        time-domain signal reconstructed from ``M``

    See Also
    --------
    librosa.griffinlim
    librosa.feature.melspectrogram
    librosa.filters.mel
    librosa.feature.inverse.mel_to_stft
    """
    # Step 1: mel power spectrogram -> approximate linear magnitude STFT.
    stft = mel_to_stft(M, sr=sr, n_fft=n_fft, power=power, **kwargs)

    # Step 2: estimate phase and synthesize audio from the magnitudes.
    return griffinlim(
        stft,
        n_iter=n_iter,
        hop_length=hop_length,
        win_length=win_length,
        n_fft=n_fft,
        window=window,
        center=center,
        dtype=dtype,
        length=length,
        pad_mode=pad_mode,
    )


def mfcc_to_mel(
    mfcc: np.ndarray,
    *,
    n_mels: int = 128,
    dct_type: int = 2,
    norm: Optional[str] = "ortho",
    ref: float = 1.0,
    lifter: float = 0,
) -> np.ndarray:
    """Invert Mel-frequency cepstral coefficients to approximate a Mel power
    spectrogram.

    This inversion proceeds in two steps:

        1. The inverse DCT is applied to the MFCCs
        2. `librosa.db_to_power` is applied to map the dB-scaled result to a power spectrogram

    Parameters
    ----------
    mfcc : np.ndarray [shape=(..., n_mfcc, n)]
        The Mel-frequency cepstral coefficients
    n_mels : int > 0
        The number of Mel frequencies
    dct_type : {1, 2, 3}
        Discrete cosine transform (DCT) type
        By default, DCT type-2 is used.
    norm : None or 'ortho'
        If ``dct_type`` is `2 or 3`, setting ``norm='ortho'`` uses an orthonormal
        DCT basis.
        Normalization is not supported for `dct_type=1`.
    ref : float
        Reference power for (inverse) decibel calculation
    lifter : number >= 0
        If ``lifter>0``, apply inverse liftering (inverse cepstral filtering)::

            M[n, :] <- M[n, :] / (1 + sin(pi * (n + 1) / lifter) * lifter / 2)

    Returns
    -------
    M : np.ndarray [shape=(..., n_mels, n)]
        An approximate Mel power spectrum recovered from ``mfcc``

    Raises
    ------
    ParameterError
        If ``lifter`` is not a non-negative number.

    Warns
    -----
    UserWarning
        due to critical values in lifter array that invokes underflow.

    See Also
    --------
    librosa.feature.mfcc
    librosa.feature.melspectrogram
    scipy.fftpack.dct
    """
    if lifter > 0:
        n_mfcc = mfcc.shape[-2]
        # 1-based coefficient index, shaped to broadcast along the
        # coefficient axis (second from last) of ``mfcc``.
        idx = np.arange(1, 1 + n_mfcc, dtype=mfcc.dtype)
        idx = expand_to(idx, ndim=mfcc.ndim, axes=-2)
        lifter_sine = 1 + lifter * 0.5 * np.sin(np.pi * idx / lifter)

        # raise a UserWarning if lifter array includes critical values
        if np.any(np.abs(lifter_sine) < np.finfo(lifter_sine.dtype).eps):
            warnings.warn(
                message="lifter array includes critical values that may invoke underflow.",
                category=UserWarning,
                stacklevel=2,
            )

        # lifter mfcc values; the ``tiny`` offset keeps the divisor away
        # from an exact zero
        mfcc = mfcc / (lifter_sine + tiny(mfcc))

    elif lifter != 0:
        raise ParameterError("MFCC to mel lifter must be a non-negative number.")

    # Inverse DCT along the coefficient axis, producing ``n_mels`` rows
    logmel = scipy.fftpack.idct(mfcc, axis=-2, type=dct_type, norm=norm, n=n_mels)
    return db_to_power(logmel, ref=ref)


def mfcc_to_audio(
    mfcc: np.ndarray,
    *,
    n_mels: int = 128,
    dct_type: int = 2,
    norm: Optional[str] = "ortho",
    ref: float = 1.0,
    lifter: float = 0,
    **kwargs: Any,
) -> np.ndarray:
    """Convert Mel-frequency cepstral coefficients to a time-domain audio signal

    This function is primarily a convenience wrapper for the following steps:

        1. Convert mfcc to Mel power spectrum (`mfcc_to_mel`)
        2. Convert Mel power spectrum to time-domain audio (`mel_to_audio`)

    Parameters
    ----------
    mfcc : np.ndarray [shape=(..., n_mfcc, n)]
        The Mel-frequency cepstral coefficients
    n_mels : int > 0
        The number of Mel frequencies
    dct_type : {1, 2, 3}
        Discrete cosine transform (DCT) type
        By default, DCT type-2 is used.
    norm : None or 'ortho'
        If ``dct_type`` is `2 or 3`, setting ``norm='ortho'`` uses an orthonormal
        DCT basis.
        Normalization is not supported for ``dct_type=1``.
    ref : float
        Reference power for (inverse) decibel calculation
    lifter : number >= 0
        If ``lifter>0``, apply inverse liftering (inverse cepstral filtering)::

            M[n, :] <- M[n, :] / (1 + sin(pi * (n + 1) / lifter) * lifter / 2)

    **kwargs : additional keyword arguments to pass through to `mel_to_audio`
    sr : number > 0 [scalar]
        sampling rate of the underlying signal
    n_fft : int > 0 [scalar]
        number of FFT components in the resulting STFT
    hop_length : None or int > 0
        The hop length of the STFT.  If not provided, it will default to ``n_fft // 4``
    win_length : None or int > 0
        The window length of the STFT.  By default, it will equal ``n_fft``
    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        A window specification as supported by `stft` or `istft`
    center : boolean
        If `True`, the STFT is assumed to use centered frames.
        If `False`, the STFT is assumed to use left-aligned frames.
    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.
    power : float > 0 [scalar]
        Exponent for the magnitude melspectrogram
    n_iter : int > 0
        The number of iterations for Griffin-Lim
    length : None or int > 0
        If provided, the output ``y`` is zero-padded or clipped to exactly ``length``
        samples.
    dtype : np.dtype
        Real numeric type for the time-domain signal.  Default is 32-bit float.
    fmin : float >= 0 [scalar]
        lowest frequency (in Hz)
    fmax : float >= 0 [scalar]
        highest frequency (in Hz).
        If `None`, use ``fmax = sr / 2.0``
    htk : bool [scalar]
        use HTK formula instead of Slaney

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        A time-domain signal reconstructed from `mfcc`

    See Also
    --------
    mfcc_to_mel
    mel_to_audio
    librosa.feature.mfcc
    librosa.griffinlim
    scipy.fftpack.dct
    """
    # The Mel spectrogram is passed positionally to `mel_to_audio`, so it
    # must not also be supplied through ``kwargs``.
    mel_spec = mfcc_to_mel(
        mfcc, n_mels=n_mels, dct_type=dct_type, norm=norm, ref=ref, lifter=lifter
    )

    return mel_to_audio(mel_spec, **kwargs)