ai-content-maker/.venv/Lib/site-packages/librosa/core/spectrum.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Utilities for spectral processing"""
import warnings

import numpy as np
import scipy
import scipy.ndimage
import scipy.signal
import scipy.interpolate

from numba import jit

from . import convert
from .fft import get_fftlib
from .audio import resample
from .._cache import cache
from .. import util
from ..util.exceptions import ParameterError
from ..filters import get_window, semitone_filterbank
from ..filters import window_sumsquare
from numpy.typing import DTypeLike
from typing import Any, Callable, Optional, Tuple, List, Union, overload
from typing_extensions import Literal
from .._typing import _WindowSpec, _PadMode, _PadModeSTFT

__all__ = [
    "stft",
    "istft",
    "magphase",
    "iirt",
    "reassigned_spectrogram",
    "phase_vocoder",
    "perceptual_weighting",
    "power_to_db",
    "db_to_power",
    "amplitude_to_db",
    "db_to_amplitude",
    "fmt",
    "pcen",
    "griffinlim",
]


@cache(level=20)
def stft(
    y: np.ndarray,
    *,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    dtype: Optional[DTypeLike] = None,
    pad_mode: _PadModeSTFT = "constant",
    out: Optional[np.ndarray] = None,
) -> np.ndarray:
    """Short-time Fourier transform (STFT).

    The STFT represents a signal in the time-frequency domain by
    computing discrete Fourier transforms (DFT) over short overlapping
    windows.

    This function returns a complex-valued matrix D such that

    - ``np.abs(D[..., f, t])`` is the magnitude of frequency bin ``f``
      at frame ``t``, and

    - ``np.angle(D[..., f, t])`` is the phase of frequency bin ``f``
      at frame ``t``.

    The integers ``t`` and ``f`` can be converted to physical units by means
    of the utility functions `frames_to_samples` and `fft_frequencies`.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)], real-valued
        input signal. Multi-channel is supported.

    n_fft : int > 0 [scalar]
        length of the windowed signal after padding with zeros.
        The number of rows in the STFT matrix ``D`` is ``(1 + n_fft/2)``.
        The default value, ``n_fft=2048`` samples, corresponds to a physical
        duration of 93 milliseconds at a sample rate of 22050 Hz, i.e. the
        default sample rate in librosa. This value is well adapted for music
        signals. However, in speech processing, the recommended value is 512,
        corresponding to 23 milliseconds at a sample rate of 22050 Hz.
        In any case, we recommend setting ``n_fft`` to a power of two for
        optimizing the speed of the fast Fourier transform (FFT) algorithm.

    hop_length : int > 0 [scalar]
        number of audio samples between adjacent STFT columns.

        Smaller values increase the number of columns in ``D`` without
        affecting the frequency resolution of the STFT.

        If unspecified, defaults to ``win_length // 4`` (see below).

    win_length : int <= n_fft [scalar]
        Each frame of audio is windowed by ``window`` of length ``win_length``
        and then padded with zeros to match ``n_fft``.  Padding is added on
        both the left- and the right-side of the window so that the window
        is centered within the frame.

        Smaller values improve the temporal resolution of the STFT (i.e. the
        ability to discriminate impulses that are closely spaced in time)
        at the expense of frequency resolution (i.e. the ability to discriminate
        pure tones that are closely spaced in frequency). This effect is known
        as the time-frequency localization trade-off and needs to be adjusted
        according to the properties of the input signal ``y``.

        If unspecified, defaults to ``win_length = n_fft``.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        Either:

        - a window specification (string, tuple, or number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a vector or array of length ``n_fft``

        Defaults to a raised cosine window (`'hann'`), which is adequate for
        most applications in audio signal processing.

        .. see also:: `filters.get_window`

    center : boolean
        If ``True``, the signal ``y`` is padded so that frame
        ``D[:, t]`` is centered at ``y[t * hop_length]``.

        If ``False``, then ``D[:, t]`` begins at ``y[t * hop_length]``.

        Defaults to ``True``,  which simplifies the alignment of ``D`` onto a
        time grid by means of `librosa.frames_to_samples`.
        Note, however, that ``center`` must be set to `False` when analyzing
        signals with `librosa.stream`.

        .. see also:: `librosa.stream`

    dtype : np.dtype, optional
        Complex numeric type for ``D``.  Default is inferred to match the
        precision of the input signal.

    pad_mode : string or function
        If ``center=True``, this argument is passed to `np.pad` for padding
        the edges of the signal ``y``. By default (``pad_mode="constant"``),
        ``y`` is padded on both sides with zeros.

        .. note:: Not all padding modes supported by `numpy.pad` are supported here.
            `wrap`, `mean`, `maximum`, `median`, and `minimum` are not supported.

            Other modes that depend at most on input values at the edges of the
            signal (e.g., `constant`, `edge`, `linear_ramp`) are supported.

        If ``center=False``,  this argument is ignored.

        .. see also:: `numpy.pad`

    out : np.ndarray or None
        A pre-allocated, complex-valued array to store the STFT results.
        This must be of compatible shape and dtype for the given input parameters.

        If `out` is larger than necessary for the provided input signal, then only
        a prefix slice of `out` will be used.

        If not provided, a new array is allocated and returned.

    Returns
    -------
    D : np.ndarray [shape=(..., 1 + n_fft/2, n_frames), dtype=dtype]
        Complex-valued matrix of short-term Fourier transform
        coefficients.

        If a pre-allocated `out` array is provided, then `D` will be
        a reference to `out`.

        If `out` is larger than necessary, then `D` will be a sliced
        view: `D = out[..., :n_frames]`.

    See Also
    --------
    istft : Inverse STFT
    reassigned_spectrogram : Time-frequency reassigned spectrogram

    Notes
    -----
    This function caches at level 20.

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> S = np.abs(librosa.stft(y))
    >>> S
    array([[5.395e-03, 3.332e-03, ..., 9.862e-07, 1.201e-05],
           [3.244e-03, 2.690e-03, ..., 9.536e-07, 1.201e-05],
           ...,
           [7.523e-05, 3.722e-05, ..., 1.188e-04, 1.031e-03],
           [7.640e-05, 3.944e-05, ..., 5.180e-04, 1.346e-03]],
          dtype=float32)

    Use left-aligned frames, instead of centered frames

    >>> S_left = librosa.stft(y, center=False)

    Use a shorter hop length

    >>> D_short = librosa.stft(y, hop_length=64)

    Display a spectrogram

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots()
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(S,
    ...                                                        ref=np.max),
    ...                                y_axis='log', x_axis='time', ax=ax)
    >>> ax.set_title('Power spectrogram')
    >>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
    """
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)
    elif not util.is_positive_int(hop_length):
        raise ParameterError(f"hop_length={hop_length} must be a positive integer")

    # Check audio is valid
    util.valid_audio(y, mono=False)

    fft_window = get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, size=n_fft)

    # Reshape so that the window can be broadcast
    fft_window = util.expand_to(fft_window, ndim=1 + y.ndim, axes=-2)

    # Pad the time series so that frames are centered
    if center:
        if pad_mode in ("wrap", "maximum", "mean", "median", "minimum"):
            # Note: padding with a user-provided function "works", but
            # use at your own risk.
            # Since we don't pass-through kwargs here, any arguments
            # to a user-provided pad function should be encapsulated
            # by using functools.partial:
            #
            # >>> my_pad_func = functools.partial(pad_func, foo=x, bar=y)
            # >>> librosa.stft(..., pad_mode=my_pad_func)

            raise ParameterError(
                f"pad_mode='{pad_mode}' is not supported by librosa.stft"
            )

        if n_fft > y.shape[-1]:
            warnings.warn(
                f"n_fft={n_fft} is too large for input signal of length={y.shape[-1]}"
            )

        # Set up the padding array to be empty, and we'll fix the target dimension later
        padding = [(0, 0) for _ in range(y.ndim)]

        # How many frames depend on left padding?
        start_k = int(np.ceil(n_fft // 2 / hop_length))

        # What's the first frame that depends on extra right-padding?
        tail_k = (y.shape[-1] + n_fft // 2 - n_fft) // hop_length + 1

        if tail_k <= start_k:
            # If tail and head overlap, then just copy-pad the signal and carry on
            start = 0
            extra = 0
            padding[-1] = (n_fft // 2, n_fft // 2)
            y = np.pad(y, padding, mode=pad_mode)
        else:
            # If tail and head do not overlap, then we can implement padding on each part separately
            # and avoid a full copy-pad

            # "Middle" of the signal starts here, and does not depend on head padding
            start = start_k * hop_length - n_fft // 2
            padding[-1] = (n_fft // 2, 0)

            # +1 here is to ensure enough samples to fill the window
            # fixes bug #1567
            y_pre = np.pad(
                y[..., : (start_k - 1) * hop_length - n_fft // 2 + n_fft + 1],
                padding,
                mode=pad_mode,
            )
            y_frames_pre = util.frame(y_pre, frame_length=n_fft, hop_length=hop_length)
            # Trim this down to the exact number of frames we should have
            y_frames_pre = y_frames_pre[..., :start_k]

            # How many extra frames do we have from the head?
            extra = y_frames_pre.shape[-1]

            # Determine if we have any frames that will fit inside the tail pad
            if tail_k * hop_length - n_fft // 2 + n_fft <= y.shape[-1] + n_fft // 2:
                padding[-1] = (0, n_fft // 2)
                y_post = np.pad(
                    y[..., (tail_k) * hop_length - n_fft // 2 :], padding, mode=pad_mode
                )
                y_frames_post = util.frame(
                    y_post, frame_length=n_fft, hop_length=hop_length
                )
                # How many extra frames do we have from the tail?
                extra += y_frames_post.shape[-1]
            else:
                # In this event, the first frame that touches tail padding would run off
                # the end of the padded array
                # We'll circumvent this by allocating an empty frame buffer for the tail
                # this keeps the subsequent logic simple
                post_shape = list(y_frames_pre.shape)
                post_shape[-1] = 0
                y_frames_post = np.empty_like(y_frames_pre, shape=post_shape)
    else:
        if n_fft > y.shape[-1]:
            raise ParameterError(
                f"n_fft={n_fft} is too large for uncentered analysis of input signal of length={y.shape[-1]}"
            )

        # "Middle" of the signal starts at sample 0
        start = 0
        # We have no extra frames
        extra = 0

    fft = get_fftlib()

    if dtype is None:
        dtype = util.dtype_r2c(y.dtype)

    # Window the time series.
    y_frames = util.frame(y[..., start:], frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    shape = list(y_frames.shape)

    # This is our frequency dimension
    shape[-2] = 1 + n_fft // 2

    # If there's padding, there will be extra head and tail frames
    shape[-1] += extra

    if out is None:
        stft_matrix = np.zeros(shape, dtype=dtype, order="F")
    elif not (np.allclose(out.shape[:-1], shape[:-1]) and out.shape[-1] >= shape[-1]):
        raise ParameterError(
            f"Shape mismatch for provided output array out.shape={out.shape} and target shape={shape}"
        )
    elif not np.iscomplexobj(out):
        raise ParameterError(f"output with dtype={out.dtype} is not of complex type")
    else:
        if np.allclose(shape, out.shape):
            stft_matrix = out
        else:
            stft_matrix = out[..., : shape[-1]]

    # Fill in the warm-up
    if center and extra > 0:
        off_start = y_frames_pre.shape[-1]
        stft_matrix[..., :off_start] = fft.rfft(fft_window * y_frames_pre, axis=-2)

        off_end = y_frames_post.shape[-1]
        if off_end > 0:
            stft_matrix[..., -off_end:] = fft.rfft(fft_window * y_frames_post, axis=-2)
    else:
        off_start = 0

    n_columns = int(
        util.MAX_MEM_BLOCK // (np.prod(y_frames.shape[:-1]) * y_frames.itemsize)
    )
    n_columns = max(n_columns, 1)

    for bl_s in range(0, y_frames.shape[-1], n_columns):
        bl_t = min(bl_s + n_columns, y_frames.shape[-1])

        stft_matrix[..., bl_s + off_start : bl_t + off_start] = fft.rfft(
            fft_window * y_frames[..., bl_s:bl_t], axis=-2
        )
    return stft_matrix


@cache(level=30)
def istft(
    stft_matrix: np.ndarray,
    *,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    n_fft: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    dtype: Optional[DTypeLike] = None,
    length: Optional[int] = None,
    out: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    Inverse short-time Fourier transform (ISTFT).

    Converts a complex-valued spectrogram ``stft_matrix`` to time-series ``y``
    by minimizing the mean squared error between ``stft_matrix`` and STFT of
    ``y`` as described in [#]_ up to Section 2 (reconstruction from MSTFT).

    In general, window function, hop length and other parameters should be same
    as in stft, which mostly leads to perfect reconstruction of a signal from
    unmodified ``stft_matrix``.

    .. [#] D. W. Griffin and J. S. Lim,
        "Signal estimation from modified short-time Fourier transform,"
        IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.

    Parameters
    ----------
    stft_matrix : np.ndarray [shape=(..., 1 + n_fft//2, t)]
        STFT matrix from ``stft``

    hop_length : int > 0 [scalar]
        Number of frames between STFT columns.
        If unspecified, defaults to ``win_length // 4``.

    win_length : int <= n_fft = 2 * (stft_matrix.shape[0] - 1)
        When reconstructing the time series, each frame is windowed
        and each sample is normalized by the sum of squared window
        according to the ``window`` function (see below).

        If unspecified, defaults to ``n_fft``.

    n_fft : int > 0 or None
        The number of samples per frame in the input spectrogram.
        By default, this will be inferred from the shape of ``stft_matrix``.
        However, if an odd frame length was used, you can specify the correct
        length by setting ``n_fft``.

    window : string, tuple, number, function, np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, or number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a user-specified window vector of length ``n_fft``

        .. see also:: `filters.get_window`

    center : boolean
        - If ``True``, ``D`` is assumed to have centered frames.
        - If ``False``, ``D`` is assumed to have left-aligned frames.

    dtype : numeric type
        Real numeric type for ``y``.  Default is to match the numerical
        precision of the input spectrogram.

    length : int > 0, optional
        If provided, the output ``y`` is zero-padded or clipped to exactly
        ``length`` samples.

    out : np.ndarray or None
        A pre-allocated, complex-valued array to store the reconstructed signal
        ``y``.  This must be of the correct shape for the given input parameters.

        If not provided, a new array is allocated and returned.

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        time domain signal reconstructed from ``stft_matrix``.
        If ``stft_matrix`` contains more than two axes
        (e.g., from a stereo input signal), then ``y`` will match shape on the leading dimensions.

    See Also
    --------
    stft : Short-time Fourier Transform

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> D = librosa.stft(y)
    >>> y_hat = librosa.istft(D)
    >>> y_hat
    array([-1.407e-03, -4.461e-04, ...,  5.131e-06, -1.417e-05],
          dtype=float32)

    Exactly preserving length of the input signal requires explicit padding.
    Otherwise, a partial frame at the end of ``y`` will not be represented.

    >>> n = len(y)
    >>> n_fft = 2048
    >>> y_pad = librosa.util.fix_length(y, size=n + n_fft // 2)
    >>> D = librosa.stft(y_pad, n_fft=n_fft)
    >>> y_out = librosa.istft(D, length=n)
    >>> np.max(np.abs(y - y_out))
    8.940697e-08
    """
    if n_fft is None:
        n_fft = 2 * (stft_matrix.shape[-2] - 1)

    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    ifft_window = get_window(window, win_length, fftbins=True)

    # Pad out to match n_fft, and add broadcasting axes
    ifft_window = util.pad_center(ifft_window, size=n_fft)
    ifft_window = util.expand_to(ifft_window, ndim=stft_matrix.ndim, axes=-2)

    # For efficiency, trim STFT frames according to signal length if available
    if length:
        if center:
            padded_length = length + 2 * (n_fft // 2)
        else:
            padded_length = length
        n_frames = min(stft_matrix.shape[-1], int(np.ceil(padded_length / hop_length)))
    else:
        n_frames = stft_matrix.shape[-1]

    if dtype is None:
        dtype = util.dtype_c2r(stft_matrix.dtype)

    shape = list(stft_matrix.shape[:-2])
    expected_signal_len = n_fft + hop_length * (n_frames - 1)

    if length:
        expected_signal_len = length
    elif center:
        expected_signal_len -= 2 * (n_fft // 2)

    shape.append(expected_signal_len)

    if out is None:
        y = np.zeros(shape, dtype=dtype)
    elif not np.allclose(out.shape, shape):
        raise ParameterError(
            f"Shape mismatch for provided output array out.shape={out.shape} != {shape}"
        )
    else:
        y = out
        # Since we'll be doing overlap-add here, this needs to be initialized to zero.
        y.fill(0.0)

    fft = get_fftlib()

    if center:
        # First frame that does not depend on padding
        #  k * hop_length - n_fft//2 >= 0
        # k * hop_length >= n_fft // 2
        # k >= (n_fft//2 / hop_length)

        start_frame = int(np.ceil((n_fft // 2) / hop_length))

        # Do overlap-add on the head block
        ytmp = ifft_window * fft.irfft(stft_matrix[..., :start_frame], n=n_fft, axis=-2)

        shape[-1] = n_fft + hop_length * (start_frame - 1)
        head_buffer = np.zeros(shape, dtype=dtype)

        __overlap_add(head_buffer, ytmp, hop_length)

        # If y is smaller than the head buffer, take everything
        if y.shape[-1] < shape[-1] - n_fft // 2:
            y[..., :] = head_buffer[..., n_fft // 2 : y.shape[-1] + n_fft // 2]
        else:
            # Trim off the first n_fft//2 samples from the head and copy into target buffer
            y[..., : shape[-1] - n_fft // 2] = head_buffer[..., n_fft // 2 :]

        # This offset compensates for any differences between frame alignment
        # and padding truncation
        offset = start_frame * hop_length - n_fft // 2

    else:
        start_frame = 0
        offset = 0

    n_columns = int(
        util.MAX_MEM_BLOCK // (np.prod(stft_matrix.shape[:-1]) * stft_matrix.itemsize)
    )
    n_columns = max(n_columns, 1)

    frame = 0
    for bl_s in range(start_frame, n_frames, n_columns):
        bl_t = min(bl_s + n_columns, n_frames)

        # invert the block and apply the window function
        ytmp = ifft_window * fft.irfft(stft_matrix[..., bl_s:bl_t], n=n_fft, axis=-2)

        # Overlap-add the istft block starting at the i'th frame
        __overlap_add(y[..., frame * hop_length + offset :], ytmp, hop_length)

        frame += bl_t - bl_s

    # Normalize by sum of squared window
    ifft_window_sum = window_sumsquare(
        window=window,
        n_frames=n_frames,
        win_length=win_length,
        n_fft=n_fft,
        hop_length=hop_length,
        dtype=dtype,
    )

    if center:
        start = n_fft // 2
    else:
        start = 0

    ifft_window_sum = util.fix_length(ifft_window_sum[..., start:], size=y.shape[-1])

    approx_nonzero_indices = ifft_window_sum > util.tiny(ifft_window_sum)

    y[..., approx_nonzero_indices] /= ifft_window_sum[approx_nonzero_indices]

    return y


@jit(nopython=True, cache=True)
def __overlap_add(y, ytmp, hop_length):
    # numba-accelerated overlap add for inverse stft
    # y is the pre-allocated output buffer
    # ytmp is the windowed inverse-stft frames
    # hop_length is the hop-length of the STFT analysis

    n_fft = ytmp.shape[-2]
    N = n_fft
    for frame in range(ytmp.shape[-1]):
        sample = frame * hop_length
        if N > y.shape[-1] - sample:
            N = y.shape[-1] - sample

        y[..., sample : (sample + N)] += ytmp[..., :N, frame]


def __reassign_frequencies(
    y: np.ndarray,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    dtype: Optional[DTypeLike] = None,
    pad_mode: _PadModeSTFT = "constant",
) -> Tuple[np.ndarray, np.ndarray]:
    """Instantaneous frequencies based on a spectrogram representation.

    The reassignment vector is calculated using equation 5.20 in Flandrin,
    Auger, & Chassande-Mottin 2002::

        omega_reassigned = omega - np.imag(S_dh/S_h)

    where ``S_h`` is the complex STFT calculated using the original window, and
    ``S_dh`` is the complex STFT calculated using the derivative of the original
    window.

    See `reassigned_spectrogram` for references.

    It is recommended to use ``pad_mode="wrap"`` or else ``center=False``, rather
    than the defaults. Frequency reassignment assumes that the energy in each
    FFT bin is associated with exactly one signal component. Reflection padding
    at the edges of the signal may invalidate the reassigned estimates in the
    boundary frames.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n,)], real-valued
        audio time series. Multi-channel is supported.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    S : np.ndarray [shape=(..., d, t)] or None
        (optional) complex STFT calculated using the other arguments provided
        to `__reassign_frequencies`

    n_fft : int > 0 [scalar]
        FFT window size. Defaults to 2048.

    hop_length : int > 0 [scalar]
        hop length, number samples between subsequent frames.
        If not supplied, defaults to ``win_length // 4``.

    win_length : int > 0, <= n_fft
        Window length. Defaults to ``n_fft``.
        See ``stft`` for details.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a user-specified window vector of length ``n_fft``

        See `stft` for details.

        .. see also:: `filters.get_window`

    center : boolean
        - If ``True``, the signal ``y`` is padded so that frame
          ``S[:, t]`` is centered at ``y[t * hop_length]``.
        - If ``False``, then ``S[:, t]`` begins at ``y[t * hop_length]``.

    dtype : numeric type
        Complex numeric type for ``S``. Default is inferred to match
        the numerical precision of the input signal.

    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.

    Returns
    -------
    freqs : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=real]
        Instantaneous frequencies:
        ``freqs[f, t]`` is the frequency for bin ``f``, frame ``t``.
    S : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=complex]
        Short-time Fourier transform

    Warns
    -----
    RuntimeWarning
        Frequencies with zero support will produce a divide-by-zero warning and
        will be returned as `np.nan`.

    See Also
    --------
    stft : Short-time Fourier Transform
    reassigned_spectrogram : Time-frequency reassigned spectrogram

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> frequencies, S = librosa.core.spectrum.__reassign_frequencies(y, sr=sr)
    >>> frequencies
    array([[0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00],
           [3.628e+00, 4.698e+00, ..., 1.239e+01, 1.072e+01],
           ...,
           [1.101e+04, 1.102e+04, ..., 1.105e+04, 1.102e+04],
           [1.102e+04, 1.102e+04, ..., 1.102e+04, 1.102e+04]])
    """
    # retrieve window samples if needed so that the window derivative can be
    # calculated
    if win_length is None:
        win_length = n_fft

    window = get_window(window, win_length, fftbins=True)
    window = util.pad_center(window, size=n_fft)

    if S is None:
        if dtype is None:
            dtype = util.dtype_r2c(y.dtype)

        S_h = stft(
            y=y,
            n_fft=n_fft,
            hop_length=hop_length,
            window=window,
            center=center,
            dtype=dtype,
            pad_mode=pad_mode,
        )

    else:
        if dtype is None:
            dtype = S.dtype

        S_h = S

    # cyclic gradient to correctly handle edges of a periodic window
    window_derivative = util.cyclic_gradient(window)

    S_dh = stft(
        y=y,
        n_fft=n_fft,
        hop_length=hop_length,
        window=window_derivative,
        center=center,
        dtype=dtype,
        pad_mode=pad_mode,
    )

    # equation 5.20 of Flandrin, Auger, & Chassande-Mottin 2002
    # the sign of the correction is reversed in some papers - see Plante,
    # Meyer, & Ainsworth 1998 pp. 283-284
    correction = -np.imag(S_dh / S_h)

    freqs = convert.fft_frequencies(sr=sr, n_fft=n_fft)
    freqs = util.expand_to(freqs, ndim=correction.ndim, axes=-2) + correction * (
        0.5 * sr / np.pi
    )

    return freqs, S_h


def __reassign_times(
    y: np.ndarray,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    dtype: Optional[DTypeLike] = None,
    pad_mode: _PadModeSTFT = "constant",
) -> Tuple[np.ndarray, np.ndarray]:
    """Time reassignments based on a spectrogram representation.

    The reassignment vector is calculated using equation 5.23 in Flandrin,
    Auger, & Chassande-Mottin 2002::

        t_reassigned = t + np.real(S_th/S_h)

    where ``S_h`` is the complex STFT calculated using the original window, and
    ``S_th`` is the complex STFT calculated using the original window multiplied
    by the time offset from the window center.

    See `reassigned_spectrogram` for references.

    It is recommended to use ``pad_mode="constant"`` (zero padding) or else
    ``center=False``, rather than the defaults. Time reassignment assumes that
    the energy in each FFT bin is associated with exactly one impulse event.
    Reflection padding at the edges of the signal may invalidate the reassigned
    estimates in the boundary frames.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n,)], real-valued
        audio time series. Multi-channel is supported.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    S : np.ndarray [shape=(..., d, t)] or None
        (optional) complex STFT calculated using the other arguments provided
        to `__reassign_times`

    n_fft : int > 0 [scalar]
        FFT window size. Defaults to 2048.

    hop_length : int > 0 [scalar]
        hop length, number samples between subsequent frames.
        If not supplied, defaults to ``win_length // 4``.

    win_length : int > 0, <= n_fft
        Window length. Defaults to ``n_fft``.
        See `stft` for details.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a user-specified window vector of length ``n_fft``

        See `stft` for details.

        .. see also:: `filters.get_window`

    center : boolean
        - If ``True``, the signal ``y`` is padded so that frame
          ``S[:, t]`` is centered at ``y[t * hop_length]``.
        - If ``False``, then ``S[:, t]`` begins at ``y[t * hop_length]``.

    dtype : numeric type
        Complex numeric type for ``S``. Default is inferred to match
        the precision of the input signal.

    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.

    Returns
    -------
    times : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=real]
        Reassigned times:
        ``times[f, t]`` is the time for bin ``f``, frame ``t``.
    S : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=complex]
        Short-time Fourier transform

    Warns
    -----
    RuntimeWarning
        Time estimates with zero support will produce a divide-by-zero warning
        and will be returned as `np.nan`.

    See Also
    --------
    stft : Short-time Fourier Transform
    reassigned_spectrogram : Time-frequency reassigned spectrogram

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> times, S = librosa.core.spectrum.__reassign_times(y, sr=sr)
    >>> times
    array([[ 2.268e-05,  1.144e-02, ...,  5.332e+00,  5.333e+00],
           [ 2.268e-05,  1.451e-02, ...,  5.334e+00,  5.333e+00],
           ...,
           [ 2.268e-05, -6.177e-04, ...,  5.368e+00,  5.327e+00],
           [ 2.268e-05,  1.420e-03, ...,  5.307e+00,  5.328e+00]])
    """
    # retrieve window samples if needed so that the time-weighted window can be
    # calculated
    if win_length is None:
        win_length = n_fft

    window = get_window(window, win_length, fftbins=True)
    window = util.pad_center(window, size=n_fft)

    # retrieve hop length if needed so that the frame times can be calculated
    if hop_length is None:
        hop_length = int(win_length // 4)

    if S is None:
        if dtype is None:
            dtype = util.dtype_r2c(y.dtype)
        S_h = stft(
            y=y,
            n_fft=n_fft,
            hop_length=hop_length,
            window=window,
            center=center,
            dtype=dtype,
            pad_mode=pad_mode,
        )

    else:
        if dtype is None:
            dtype = S.dtype
        S_h = S

    # calculate window weighted by time
    half_width = n_fft // 2

    window_times: np.ndarray
    if n_fft % 2:
        window_times = np.arange(-half_width, half_width + 1)

    else:
        window_times = np.arange(0.5 - half_width, half_width)

    window_time_weighted = window * window_times

    S_th = stft(
        y=y,
        n_fft=n_fft,
        hop_length=hop_length,
        window=window_time_weighted,
        center=center,
        dtype=dtype,
        pad_mode=pad_mode,
    )

    # equation 5.23 of Flandrin, Auger, & Chassande-Mottin 2002
    # the sign of the correction is reversed in some papers - see Plante,
    # Meyer, & Ainsworth 1998 pp. 283-284
    correction = np.real(S_th / S_h)

    if center:
        pad_length = None

    else:
        pad_length = n_fft

    times = convert.frames_to_time(
        np.arange(S_h.shape[-1]), sr=sr, hop_length=hop_length, n_fft=pad_length
    )

    times = util.expand_to(times, ndim=correction.ndim, axes=-1) + correction / sr

    return times, S_h


def reassigned_spectrogram(
    y: np.ndarray,
    *,
    sr: float = 22050,
    S: Optional[np.ndarray] = None,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    reassign_frequencies: bool = True,
    reassign_times: bool = True,
    ref_power: Union[float, Callable] = 1e-6,
    fill_nan: bool = False,
    clip: bool = True,
    dtype: Optional[DTypeLike] = None,
    pad_mode: _PadModeSTFT = "constant",
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    r"""Time-frequency reassigned spectrogram.

    The reassignment vectors are calculated using equations 5.20 and 5.23 in
    [#]_::

        t_reassigned = t + np.real(S_th/S_h)
        omega_reassigned = omega - np.imag(S_dh/S_h)

    where ``S_h`` is the complex STFT calculated using the original window,
    ``S_dh`` is the complex STFT calculated using the derivative of the original
    window, and ``S_th`` is the complex STFT calculated using the original window
    multiplied by the time offset from the window center. See [#]_ for
    additional algorithms, and [#]_ and [#]_ for history and discussion of the
    method.

    .. [#] Flandrin, P., Auger, F., & Chassande-Mottin, E. (2002).
        Time-Frequency reassignment: From principles to algorithms. In
        Applications in Time-Frequency Signal Processing (Vol. 10, pp.
        179-204). CRC Press.

    .. [#] Fulop, S. A., & Fitz, K. (2006). Algorithms for computing the
        time-corrected instantaneous frequency (reassigned) spectrogram, with
        applications. The Journal of the Acoustical Society of America, 119(1),
        360. doi:10.1121/1.2133000

    .. [#] Auger, F., Flandrin, P., Lin, Y.-T., McLaughlin, S., Meignen, S.,
        Oberlin, T., & Wu, H.-T. (2013). Time-Frequency Reassignment and
        Synchrosqueezing: An Overview. IEEE Signal Processing Magazine, 30(6),
        32-41. doi:10.1109/MSP.2013.2265316

    .. [#] Hainsworth, S., Macleod, M. (2003). Time-frequency reassignment: a
        review and analysis. Tech. Rep. CUED/FINFENG/TR.459, Cambridge
        University Engineering Department

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)], real-valued
        audio time series. Multi-channel is supported.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    S : np.ndarray [shape=(..., d, t)] or None
        (optional) complex STFT calculated using the other arguments provided
        to ``reassigned_spectrogram``

    n_fft : int > 0 [scalar]
        FFT window size. Defaults to 2048.

    hop_length : int > 0 [scalar]
        hop length, number samples between subsequent frames.
        If not supplied, defaults to ``win_length // 4``.

    win_length : int > 0, <= n_fft
        Window length. Defaults to ``n_fft``.
        See `stft` for details.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a user-specified window vector of length ``n_fft``

        See `stft` for details.

        .. see also:: `filters.get_window`

    center : boolean
        - If ``True`` (default), the signal ``y`` is padded so that frame
          ``S[:, t]`` is centered at ``y[t * hop_length]``. See `Notes` for
          recommended usage in this function.
        - If ``False``, then ``S[:, t]`` begins at ``y[t * hop_length]``.

    reassign_frequencies : boolean
        - If ``True`` (default), the returned frequencies will be instantaneous
          frequency estimates.
        - If ``False``, the returned frequencies will be a read-only view of the
          STFT bin frequencies for all frames.

    reassign_times : boolean
        - If ``True`` (default), the returned times will be corrected
          (reassigned) time estimates for each bin.
        - If ``False``, the returned times will be a read-only view of the STFT
          frame times for all bins.

    ref_power : float >= 0 or callable
        Minimum power threshold for estimating time-frequency reassignments.
        Any bin with ``np.abs(S[f, t])**2 < ref_power`` will be returned as
        `np.nan` in both frequency and time, unless ``fill_nan`` is ``True``. If 0
        is provided, then only bins with zero power will be returned as
        `np.nan` (unless ``fill_nan=True``).

    fill_nan : boolean
        - If ``False`` (default), the frequency and time reassignments for bins
          below the power threshold provided in ``ref_power`` will be returned as
          `np.nan`.
        - If ``True``, the frequency and time reassignments for these bins will
          be returned as the bin center frequencies and frame times.

    clip : boolean
        - If ``True`` (default), estimated frequencies outside the range
          `[0, 0.5 * sr]` or times outside the range `[0, len(y) / sr]` will be
          clipped to those ranges.
        - If ``False``, estimated frequencies and times beyond the bounds of the
          spectrogram may be returned.

    dtype : numeric type
        Complex numeric type for STFT calculation. Default is inferred to match
        the precision of the input signal.

    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.

    Returns
    -------
    freqs, times, mags : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=real]
        Instantaneous frequencies:
            ``freqs[..., f, t]`` is the frequency for bin ``f``, frame ``t``.
            If ``reassign_frequencies=False``, this will instead be a read-only array
            of the same shape containing the bin center frequencies for all frames.

        Reassigned times:
            ``times[..., f, t]`` is the time for bin ``f``, frame ``t``.
            If ``reassign_times=False``, this will instead be a read-only array of
            the same shape containing the frame times for all bins.

        Magnitudes from short-time Fourier transform:
            ``mags[..., f, t]`` is the magnitude for bin ``f``, frame ``t``.

    Warns
    -----
    RuntimeWarning
        Frequency or time estimates with zero support will produce a
        divide-by-zero warning, and will be returned as `np.nan` unless
        ``fill_nan=True``.

    See Also
    --------
    stft : Short-time Fourier Transform

    Notes
    -----
    It is recommended to use ``center=False`` with this function rather than the
    librosa default ``True``. Unlike ``stft``, reassigned times are not aligned to
    the left or center of each frame, so padding the signal does not affect the
    meaning of the reassigned times. However, reassignment assumes that the
    energy in each FFT bin is associated with exactly one signal component and
    impulse event.

    If ``reassign_times`` is ``False``, the frame times that are returned will be
    aligned to the left or center of the frame, depending on the value of
    ``center``. In this case, if ``center`` is ``True``, then ``pad_mode="wrap"`` is
    recommended for valid estimation of the instantaneous frequencies in the
    boundary frames.

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> amin = 1e-10
    >>> n_fft = 64
    >>> sr = 4000
    >>> y = 1e-3 * librosa.clicks(times=[0.3], sr=sr, click_duration=1.0,
    ...                           click_freq=1200.0, length=8000) +\
    ...     1e-3 * librosa.clicks(times=[1.5], sr=sr, click_duration=0.5,
    ...                           click_freq=400.0, length=8000) +\
    ...     1e-3 * librosa.chirp(fmin=200, fmax=1600, sr=sr, duration=2.0) +\
    ...     1e-6 * np.random.randn(2*sr)
    >>> freqs, times, mags = librosa.reassigned_spectrogram(y=y, sr=sr,
    ...                                                     n_fft=n_fft)
    >>> mags_db = librosa.amplitude_to_db(mags, ref=np.max)

    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> img = librosa.display.specshow(mags_db, x_axis="s", y_axis="linear", sr=sr,
    ...                          hop_length=n_fft//4, ax=ax[0])
    >>> ax[0].set(title="Spectrogram", xlabel=None)
    >>> ax[0].label_outer()
    >>> ax[1].scatter(times, freqs, c=mags_db, cmap="magma", alpha=0.1, s=5)
    >>> ax[1].set_title("Reassigned spectrogram")
    >>> fig.colorbar(img, ax=ax, format="%+2.f dB")
    """
    if not callable(ref_power) and ref_power < 0:
        raise ParameterError("ref_power must be non-negative or callable.")

    if not reassign_frequencies and not reassign_times:
        raise ParameterError("reassign_frequencies or reassign_times must be True.")

    if win_length is None:
        win_length = n_fft

    if hop_length is None:
        hop_length = int(win_length // 4)

    # frequency and time reassignment if requested
    if reassign_frequencies:
        freqs, S = __reassign_frequencies(
            y=y,
            sr=sr,
            S=S,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            dtype=dtype,
            pad_mode=pad_mode,
        )

    if reassign_times:
        times, S = __reassign_times(
            y=y,
            sr=sr,
            S=S,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            dtype=dtype,
            pad_mode=pad_mode,
        )

    assert S is not None

    mags: np.ndarray = np.abs(S)

    # clean up reassignment issues: divide-by-zero, bins with near-zero power,
    # and estimates outside the spectrogram bounds

    # retrieve bin frequencies and frame times to replace missing estimates
    if fill_nan or not reassign_frequencies or not reassign_times:
        if center:
            pad_length = None

        else:
            pad_length = n_fft

        bin_freqs = convert.fft_frequencies(sr=sr, n_fft=n_fft)

        frame_times = convert.frames_to_time(
            frames=np.arange(S.shape[-1]),
            sr=sr,
            hop_length=hop_length,
            n_fft=pad_length,
        )

    # find bins below the power threshold
    # reassigned bins with zero power will already be NaN
    if callable(ref_power):
        ref_p = ref_power(mags**2)
    else:
        ref_p = ref_power
    mags_low = np.less(mags, ref_p**0.5, where=~np.isnan(mags))

    # for reassigned estimates, optionally set thresholded bins to NaN, return
    # bin frequencies and frame times in place of NaN generated by
    # divide-by-zero and power threshold, and clip to spectrogram bounds
    if reassign_frequencies:
        if ref_p > 0:
            freqs[mags_low] = np.nan

        if fill_nan:
            freqs = np.where(np.isnan(freqs), bin_freqs[:, np.newaxis], freqs)

        if clip:
            np.clip(freqs, 0, sr / 2.0, out=freqs)

    # or if reassignment was not requested, return bin frequencies and frame
    # times for every cell is the spectrogram
    else:
        freqs = np.broadcast_to(bin_freqs[:, np.newaxis], S.shape)

    if reassign_times:
        if ref_p > 0:
            times[mags_low] = np.nan

        if fill_nan:
            times = np.where(np.isnan(times), frame_times[np.newaxis, :], times)

        if clip:
            np.clip(times, 0, y.shape[-1] / float(sr), out=times)

    else:
        times = np.broadcast_to(frame_times[np.newaxis, :], S.shape)

    return freqs, times, mags


def magphase(D: np.ndarray, *, power: float = 1) -> Tuple[np.ndarray, np.ndarray]:
    """Separate a complex-valued spectrogram D into its magnitude (S)
    and phase (P) components, so that ``D = S * P``.

    Parameters
    ----------
    D : np.ndarray [shape=(..., d, t), dtype=complex]
        complex-valued spectrogram
    power : float > 0
        Exponent for the magnitude spectrogram,
        e.g., 1 for energy, 2 for power, etc.

    Returns
    -------
    D_mag : np.ndarray [shape=(..., d, t), dtype=real]
        magnitude of ``D``, raised to ``power``
    D_phase : np.ndarray [shape=(..., d, t), dtype=complex]
        ``exp(1.j * phi)`` where ``phi`` is the phase of ``D``

    Examples
    --------
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> D = librosa.stft(y)
    >>> magnitude, phase = librosa.magphase(D)
    >>> magnitude
    array([[5.395e-03, 3.332e-03, ..., 9.862e-07, 1.201e-05],
           [3.244e-03, 2.690e-03, ..., 9.536e-07, 1.201e-05],
           ...,
           [7.523e-05, 3.722e-05, ..., 1.188e-04, 1.031e-03],
           [7.640e-05, 3.944e-05, ..., 5.180e-04, 1.346e-03]],
          dtype=float32)
    >>> phase
    array([[ 1.   +0.000e+00j,  1.   +0.000e+00j, ...,
            -1.   -8.742e-08j, -1.   -8.742e-08j],
           [-1.   -8.742e-08j, -0.775-6.317e-01j, ...,
            -0.885-4.648e-01j,  0.472-8.815e-01j],
           ...,
           [ 1.   -4.342e-12j,  0.028-9.996e-01j, ...,
            -0.222-9.751e-01j, -0.75 -6.610e-01j],
           [-1.   -8.742e-08j, -1.   -8.742e-08j, ...,
             1.   +0.000e+00j,  1.   +0.000e+00j]], dtype=complex64)

    Or get the phase angle (in radians)

    >>> np.angle(phase)
    array([[ 0.000e+00,  0.000e+00, ..., -3.142e+00, -3.142e+00],
           [-3.142e+00, -2.458e+00, ..., -2.658e+00, -1.079e+00],
           ...,
           [-4.342e-12, -1.543e+00, ..., -1.794e+00, -2.419e+00],
           [-3.142e+00, -3.142e+00, ...,  0.000e+00,  0.000e+00]],
          dtype=float32)
    """
    mag = np.abs(D)

    # Prevent NaNs and return magnitude 0, phase 1+0j for zero
    zeros_to_ones = mag == 0
    mag_nonzero = mag + zeros_to_ones
    # Compute real and imaginary separately, because complex division can
    # produce NaNs when denormalized numbers are involved (< ~2e-39 for
    # complex64, ~5e-309 for complex128)
    phase = np.empty_like(D, dtype=util.dtype_r2c(D.dtype))
    phase.real = D.real / mag_nonzero + zeros_to_ones
    phase.imag = D.imag / mag_nonzero

    mag **= power

    return mag, phase


def phase_vocoder(
    D: np.ndarray,
    *,
    rate: float,
    hop_length: Optional[int] = None,
    n_fft: Optional[int] = None,
) -> np.ndarray:
    """Phase vocoder.  Given an STFT matrix D, speed up by a factor of ``rate``

    Based on the implementation provided by [#]_.

    This is a simplified implementation, intended primarily for
    reference and pedagogical purposes.  It makes no attempt to
    handle transients, and is likely to produce many audible
    artifacts.  For a higher quality implementation, we recommend
    the RubberBand library [#]_ and its Python wrapper `pyrubberband`.

    .. [#] Ellis, D. P. W. "A phase vocoder in Matlab."
        Columbia University, 2002.
        https://www.ee.columbia.edu/~dpwe/resources/matlab/pvoc/

    .. [#] https://breakfastquay.com/rubberband/

    Examples
    --------
    >>> # Play at double speed
    >>> y, sr   = librosa.load(librosa.ex('trumpet'))
    >>> D       = librosa.stft(y, n_fft=2048, hop_length=512)
    >>> D_fast  = librosa.phase_vocoder(D, rate=2.0, hop_length=512)
    >>> y_fast  = librosa.istft(D_fast, hop_length=512)

    >>> # Or play at 1/3 speed
    >>> y, sr   = librosa.load(librosa.ex('trumpet'))
    >>> D       = librosa.stft(y, n_fft=2048, hop_length=512)
    >>> D_slow  = librosa.phase_vocoder(D, rate=1./3, hop_length=512)
    >>> y_slow  = librosa.istft(D_slow, hop_length=512)

    Parameters
    ----------
    D : np.ndarray [shape=(..., d, t), dtype=complex]
        STFT matrix

    rate : float > 0 [scalar]
        Speed-up factor: ``rate > 1`` is faster, ``rate < 1`` is slower.

    hop_length : int > 0 [scalar] or None
        The number of samples between successive columns of ``D``.

        If None, defaults to ``n_fft//4 = (D.shape[0]-1)//2``

    n_fft : int > 0 or None
        The number of samples per frame in D.
        By default (None), this will be inferred from the shape of D.
        However, if D was constructed using an odd-length window, the correct
        frame length can be specified here.

    Returns
    -------
    D_stretched : np.ndarray [shape=(..., d, t / rate), dtype=complex]
        time-stretched STFT

    See Also
    --------
    pyrubberband
    """
    if n_fft is None:
        n_fft = 2 * (D.shape[-2] - 1)

    if hop_length is None:
        hop_length = int(n_fft // 4)

    time_steps = np.arange(0, D.shape[-1], rate, dtype=np.float64)

    # Create an empty output array
    shape = list(D.shape)
    shape[-1] = len(time_steps)
    d_stretch = np.zeros_like(D, shape=shape)

    # Expected phase advance in each bin
    phi_advance = np.linspace(0, np.pi * hop_length, D.shape[-2])

    # Phase accumulator; initialize to the first sample
    phase_acc = np.angle(D[..., 0])

    # Pad 0 columns to simplify boundary logic
    padding = [(0, 0) for _ in D.shape]
    padding[-1] = (0, 2)
    D = np.pad(D, padding, mode="constant")

    for t, step in enumerate(time_steps):
        columns = D[..., int(step) : int(step + 2)]

        # Weighting for linear magnitude interpolation
        alpha = np.mod(step, 1.0)
        mag = (1.0 - alpha) * np.abs(columns[..., 0]) + alpha * np.abs(columns[..., 1])

        # Store to output array
        d_stretch[..., t] = util.phasor(phase_acc, mag=mag)

        # Compute phase advance
        dphase = np.angle(columns[..., 1]) - np.angle(columns[..., 0]) - phi_advance

        # Wrap to -pi:pi range
        dphase = dphase - 2.0 * np.pi * np.round(dphase / (2.0 * np.pi))

        # Accumulate phase
        phase_acc += phi_advance + dphase

    return d_stretch


@cache(level=20)
def iirt(
    y: np.ndarray,
    *,
    sr: float = 22050,
    win_length: int = 2048,
    hop_length: Optional[int] = None,
    center: bool = True,
    tuning: float = 0.0,
    pad_mode: _PadMode = "constant",
    flayout: str = "sos",
    res_type: str = "soxr_hq",
    **kwargs: Any,
) -> np.ndarray:
    r"""Time-frequency representation using IIR filters

    This function will return a time-frequency representation
    using a multirate filter bank consisting of IIR filters. [#]_

    First, ``y`` is resampled as needed according to the provided ``sample_rates``.

    Then, a filterbank with with ``n`` band-pass filters is designed.

    The resampled input signals are processed by the filterbank as a whole.
    (`scipy.signal.filtfilt` resp. `sosfiltfilt` is used to make the phase linear.)
    The output of the filterbank is cut into frames.
    For each band, the short-time mean-square power (STMSP) is calculated by
    summing ``win_length`` subsequent filtered time samples.

    When called with the default set of parameters, it will generate the TF-representation
    (pitch filterbank):

        * 85 filters with MIDI pitches [24, 108] as ``center_freqs``.
        * each filter having a bandwidth of one semitone.

    .. [#] Müller, Meinard.
           "Information Retrieval for Music and Motion."
           Springer Verlag. 2007.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    win_length : int > 0, <= n_fft
        Window length.
    hop_length : int > 0 [scalar]
        Hop length, number samples between subsequent frames.
        If not supplied, defaults to ``win_length // 4``.
    center : boolean
        - If ``True``, the signal ``y`` is padded so that frame
          ``D[..., :, t]`` is centered at ``y[t * hop_length]``.
        - If ``False``, then `D[..., :, t]`` begins at ``y[t * hop_length]``
    tuning : float [scalar]
        Tuning deviation from A440 in fractions of a bin.
    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, this function uses zero padding.
    flayout : string
        - If `sos` (default), a series of second-order filters is used for filtering with `scipy.signal.sosfiltfilt`.
          Minimizes numerical precision errors for high-order filters, but is slower.
        - If `ba`, the standard difference equation is used for filtering with `scipy.signal.filtfilt`.
          Can be unstable for high-order filters.
    res_type : string
        The resampling mode.  See `librosa.resample` for details.
    **kwargs : additional keyword arguments
        Additional arguments for `librosa.filters.semitone_filterbank`
        (e.g., could be used to provide another set of ``center_freqs`` and ``sample_rates``).

    Returns
    -------
    bands_power : np.ndarray [shape=(..., n, t), dtype=dtype]
        Short-time mean-square power for the input signal.

    Raises
    ------
    ParameterError
        If ``flayout`` is not None, `ba`, or `sos`.

    See Also
    --------
    librosa.filters.semitone_filterbank
    librosa.filters.mr_frequencies
    librosa.cqt
    scipy.signal.filtfilt
    scipy.signal.sosfiltfilt

    Examples
    --------
    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('trumpet'), duration=3)
    >>> D = np.abs(librosa.iirt(y))
    >>> C = np.abs(librosa.cqt(y=y, sr=sr))
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
    ...                                y_axis='cqt_hz', x_axis='time', ax=ax[0])
    >>> ax[0].set(title='Constant-Q transform')
    >>> ax[0].label_outer()
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(D, ref=np.max),
    ...                                y_axis='cqt_hz', x_axis='time', ax=ax[1])
    >>> ax[1].set_title('Semitone spectrogram (iirt)')
    >>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
    """
    if flayout not in ("ba", "sos"):
        raise ParameterError(f"Unsupported flayout={flayout}")

    # check audio input
    util.valid_audio(y, mono=False)

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = win_length // 4

    # Pad the time series so that frames are centered
    if center:
        padding = [(0, 0) for _ in y.shape]
        padding[-1] = (win_length // 2, win_length // 2)
        y = np.pad(y, padding, mode=pad_mode)

    # get the semitone filterbank
    filterbank_ct, sample_rates = semitone_filterbank(
        tuning=tuning, flayout=flayout, **kwargs
    )

    # create three downsampled versions of the audio signal
    y_resampled = []

    y_srs = np.unique(sample_rates)

    for cur_sr in y_srs:
        y_resampled.append(resample(y, orig_sr=sr, target_sr=cur_sr, res_type=res_type))

    # Compute the number of frames that will fit. The end may get truncated.
    n_frames = int(1 + (y.shape[-1] - win_length) // hop_length)

    # Pre-allocate the output array
    shape = list(y.shape)
    # Time dimension reduces to n_frames
    shape[-1] = n_frames
    # Insert a new axis at position -2 for filter response
    shape.insert(-1, len(filterbank_ct))

    bands_power = np.empty_like(y, shape=shape)

    slices: List[Union[int, slice]] = [slice(None) for _ in bands_power.shape]
    for i, (cur_sr, cur_filter) in enumerate(zip(sample_rates, filterbank_ct)):
        slices[-2] = i

        # filter the signal
        cur_sr_idx = np.flatnonzero(y_srs == cur_sr)[0]

        if flayout == "ba":
            cur_filter_output = scipy.signal.filtfilt(
                cur_filter[0], cur_filter[1], y_resampled[cur_sr_idx], axis=-1
            )
        elif flayout == "sos":
            cur_filter_output = scipy.signal.sosfiltfilt(
                cur_filter, y_resampled[cur_sr_idx], axis=-1
            )

        factor = sr / cur_sr
        hop_length_STMSP = hop_length / factor
        win_length_STMSP_round = int(round(win_length / factor))

        # hop_length_STMSP is used here as a floating-point number.
        # The discretization happens at the end to avoid accumulated rounding errors.
        start_idx = np.arange(
            0, cur_filter_output.shape[-1] - win_length_STMSP_round, hop_length_STMSP
        )
        if len(start_idx) < n_frames:
            min_length = (
                int(np.ceil(n_frames * hop_length_STMSP)) + win_length_STMSP_round
            )
            cur_filter_output = util.fix_length(cur_filter_output, size=min_length)
            start_idx = np.arange(
                0,
                cur_filter_output.shape[-1] - win_length_STMSP_round,
                hop_length_STMSP,
            )
        start_idx = np.round(start_idx).astype(int)[:n_frames]

        idx = np.add.outer(start_idx, np.arange(win_length_STMSP_round))

        bands_power[tuple(slices)] = factor * np.sum(
            cur_filter_output[..., idx] ** 2, axis=-1
        )

    return bands_power


@cache(level=30)
def power_to_db(
    S: np.ndarray,
    *,
    ref: Union[float, Callable] = 1.0,
    amin: float = 1e-10,
    top_db: Optional[float] = 80.0,
) -> np.ndarray:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units

    This computes the scaling ``10 * log10(S / ref)`` in a numerically
    stable way.

    Parameters
    ----------
    S : np.ndarray
        input power

    ref : scalar or callable
        If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``::

            10 * log10(S / ref)

        Zeros in the output correspond to positions where ``S == ref``.

        If callable, the reference value is computed as ``ref(S)``.

    amin : float > 0 [scalar]
        minimum threshold for ``abs(S)`` and ``ref``

    top_db : float >= 0 [scalar]
        threshold the output at ``top_db`` below the peak:
        ``max(10 * log10(S/ref)) - top_db``

    Returns
    -------
    S_db : np.ndarray
        ``S_db ~= 10 * log10(S) - 10 * log10(ref)``

    See Also
    --------
    perceptual_weighting
    db_to_power
    amplitude_to_db
    db_to_amplitude

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    Get a power spectrogram from a waveform ``y``

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> S = np.abs(librosa.stft(y))
    >>> librosa.power_to_db(S**2)
    array([[-41.809, -41.809, ..., -41.809, -41.809],
           [-41.809, -41.809, ..., -41.809, -41.809],
           ...,
           [-41.809, -41.809, ..., -41.809, -41.809],
           [-41.809, -41.809, ..., -41.809, -41.809]], dtype=float32)

    Compute dB relative to peak power

    >>> librosa.power_to_db(S**2, ref=np.max)
    array([[-80., -80., ..., -80., -80.],
           [-80., -80., ..., -80., -80.],
           ...,
           [-80., -80., ..., -80., -80.],
           [-80., -80., ..., -80., -80.]], dtype=float32)

    Or compare to median power

    >>> librosa.power_to_db(S**2, ref=np.median)
    array([[16.578, 16.578, ..., 16.578, 16.578],
           [16.578, 16.578, ..., 16.578, 16.578],
           ...,
           [16.578, 16.578, ..., 16.578, 16.578],
           [16.578, 16.578, ..., 16.578, 16.578]], dtype=float32)

    And plot the results

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> imgpow = librosa.display.specshow(S**2, sr=sr, y_axis='log', x_axis='time',
    ...                                   ax=ax[0])
    >>> ax[0].set(title='Power spectrogram')
    >>> ax[0].label_outer()
    >>> imgdb = librosa.display.specshow(librosa.power_to_db(S**2, ref=np.max),
    ...                                  sr=sr, y_axis='log', x_axis='time', ax=ax[1])
    >>> ax[1].set(title='Log-Power spectrogram')
    >>> fig.colorbar(imgpow, ax=ax[0])
    >>> fig.colorbar(imgdb, ax=ax[1], format="%+2.0f dB")
    """
    S = np.asarray(S)

    if amin <= 0:
        raise ParameterError("amin must be strictly positive")

    if np.issubdtype(S.dtype, np.complexfloating):
        warnings.warn(
            "power_to_db was called on complex input so phase "
            "information will be discarded. To suppress this warning, "
            "call power_to_db(np.abs(D)**2) instead.",
            stacklevel=2,
        )
        magnitude = np.abs(S)
    else:
        magnitude = S

    if callable(ref):
        # User supplied a function to calculate reference power
        ref_value = ref(magnitude)
    else:
        ref_value = np.abs(ref)

    log_spec: np.ndarray = 10.0 * np.log10(np.maximum(amin, magnitude))
    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))

    if top_db is not None:
        if top_db < 0:
            raise ParameterError("top_db must be non-negative")
        log_spec = np.maximum(log_spec, log_spec.max() - top_db)

    return log_spec


@cache(level=30)
def db_to_power(S_db: np.ndarray, *, ref: float = 1.0) -> np.ndarray:
    """Convert a dB-scale spectrogram to a power spectrogram.

    This effectively inverts ``power_to_db``::

        db_to_power(S_db) ~= ref * 10.0**(S_db / 10)

    Parameters
    ----------
    S_db : np.ndarray
        dB-scaled spectrogram
    ref : number > 0
        Reference power: output will be scaled by this value

    Returns
    -------
    S : np.ndarray
        Power spectrogram

    Notes
    -----
    This function caches at level 30.
    """
    return ref * np.power(10.0, 0.1 * S_db)


@cache(level=30)
def amplitude_to_db(
    S: np.ndarray,
    *,
    ref: Union[float, Callable] = 1.0,
    amin: float = 1e-5,
    top_db: Optional[float] = 80.0,
) -> np.ndarray:
    """Convert an amplitude spectrogram to dB-scaled spectrogram.

    This is equivalent to ``power_to_db(S**2, ref=ref**2, amin=amin**2, top_db=top_db)``,
    but is provided for convenience.

    Parameters
    ----------
    S : np.ndarray
        input amplitude

    ref : scalar or callable
        If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``:
        ``20 * log10(S / ref)``.
        Zeros in the output correspond to positions where ``S == ref``.

        If callable, the reference value is computed as ``ref(S)``.

    amin : float > 0 [scalar]
        minimum threshold for ``S`` and ``ref``

    top_db : float >= 0 [scalar]
        threshold the output at ``top_db`` below the peak:
        ``max(20 * log10(S/ref)) - top_db``

    Returns
    -------
    S_db : np.ndarray
        ``S`` measured in dB

    See Also
    --------
    power_to_db, db_to_amplitude

    Notes
    -----
    This function caches at level 30.
    """
    S = np.asarray(S)

    if np.issubdtype(S.dtype, np.complexfloating):
        warnings.warn(
            "amplitude_to_db was called on complex input so phase "
            "information will be discarded. To suppress this warning, "
            "call amplitude_to_db(np.abs(S)) instead.",
            stacklevel=2,
        )

    magnitude = np.abs(S)

    if callable(ref):
        # User supplied a function to calculate reference power
        ref_value = ref(magnitude)
    else:
        ref_value = np.abs(ref)

    power = np.square(magnitude, out=magnitude)

    return power_to_db(power, ref=ref_value**2, amin=amin**2, top_db=top_db)


@cache(level=30)
def db_to_amplitude(S_db: np.ndarray, *, ref: float = 1.0) -> np.ndarray:
    """Convert a dB-scaled spectrogram to an amplitude spectrogram.

    This effectively inverts `amplitude_to_db`::

        db_to_amplitude(S_db) ~= 10.0**(0.5 * S_db/10 + log10(ref))

    Parameters
    ----------
    S_db : np.ndarray
        dB-scaled spectrogram
    ref : number > 0
        Optional reference power.

    Returns
    -------
    S : np.ndarray
        Linear magnitude spectrogram

    Notes
    -----
    This function caches at level 30.
    """
    return db_to_power(S_db, ref=ref**2) ** 0.5


@cache(level=30)
def perceptual_weighting(
    S: np.ndarray, frequencies: np.ndarray, *, kind: str = "A", **kwargs: Any
) -> np.ndarray:
    """Perceptual weighting of a power spectrogram::

        S_p[..., f, :] = frequency_weighting(f, 'A') + 10*log(S[..., f, :] / ref)

    Parameters
    ----------
    S : np.ndarray [shape=(..., d, t)]
        Power spectrogram
    frequencies : np.ndarray [shape=(d,)]
        Center frequency for each row of` `S``
    kind : str
        The frequency weighting curve to use.
        e.g. `'A'`, `'B'`, `'C'`, `'D'`, `None or 'Z'`
    **kwargs : additional keyword arguments
        Additional keyword arguments to `power_to_db`.

    Returns
    -------
    S_p : np.ndarray [shape=(..., d, t)]
        perceptually weighted version of ``S``

    See Also
    --------
    power_to_db

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    Re-weight a CQT power spectrum, using peak power as reference

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('A1')))
    >>> freqs = librosa.cqt_frequencies(C.shape[0],
    ...                                 fmin=librosa.note_to_hz('A1'))
    >>> perceptual_CQT = librosa.perceptual_weighting(C**2,
    ...                                               freqs,
    ...                                               ref=np.max)
    >>> perceptual_CQT
    array([[ -96.528,  -97.101, ..., -108.561, -108.561],
           [ -95.88 ,  -96.479, ..., -107.551, -107.551],
           ...,
           [ -65.142,  -53.256, ...,  -80.098,  -80.098],
           [ -71.542,  -53.197, ...,  -80.311,  -80.311]])

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(C,
    ...                                                        ref=np.max),
    ...                                fmin=librosa.note_to_hz('A1'),
    ...                                y_axis='cqt_hz', x_axis='time',
    ...                                ax=ax[0])
    >>> ax[0].set(title='Log CQT power')
    >>> ax[0].label_outer()
    >>> imgp = librosa.display.specshow(perceptual_CQT, y_axis='cqt_hz',
    ...                                 fmin=librosa.note_to_hz('A1'),
    ...                                 x_axis='time', ax=ax[1])
    >>> ax[1].set(title='Perceptually weighted log CQT')
    >>> fig.colorbar(img, ax=ax[0], format="%+2.0f dB")
    >>> fig.colorbar(imgp, ax=ax[1], format="%+2.0f dB")
    """
    offset = convert.frequency_weighting(frequencies, kind=kind).reshape((-1, 1))

    result: np.ndarray = offset + power_to_db(S, **kwargs)
    return result


@cache(level=30)
def fmt(
    y: np.ndarray,
    *,
    t_min: float = 0.5,
    n_fmt: Optional[int] = None,
    kind: str = "cubic",
    beta: float = 0.5,
    over_sample: float = 1,
    axis: int = -1,
) -> np.ndarray:
    """Fast Mellin transform (FMT)

    The Mellin of a signal `y` is performed by interpolating `y` on an exponential time
    axis, applying a polynomial window, and then taking the discrete Fourier transform.

    When the Mellin parameter (beta) is 1/2, it is also known as the scale transform. [#]_
    The scale transform can be useful for audio analysis because its magnitude is invariant
    to scaling of the domain (e.g., time stretching or compression).  This is analogous
    to the magnitude of the Fourier transform being invariant to shifts in the input domain.

    .. [#] De Sena, Antonio, and Davide Rocchesso.
        "A fast Mellin and scale transform."
        EURASIP Journal on Applied Signal Processing 2007.1 (2007): 75-75.

    .. [#] Cohen, L.
        "The scale representation."
        IEEE Transactions on Signal Processing 41, no. 12 (1993): 3275-3292.

    Parameters
    ----------
    y : np.ndarray, real-valued
        The input signal(s).  Can be multidimensional.
        The target axis must contain at least 3 samples.

    t_min : float > 0
        The minimum time spacing (in samples).
        This value should generally be less than 1 to preserve as much information as
        possible.

    n_fmt : int > 2 or None
        The number of scale transform bins to use.
        If None, then ``n_bins = over_sample * ceil(n * log((n-1)/t_min))`` is taken,
        where ``n = y.shape[axis]``

    kind : str
        The type of interpolation to use when re-sampling the input.
        See `scipy.interpolate.interp1d` for possible values.

        Note that the default is to use high-precision (cubic) interpolation.
        This can be slow in practice; if speed is preferred over accuracy,
        then consider using ``kind='linear'``.

    beta : float
        The Mellin parameter.  ``beta=0.5`` provides the scale transform.

    over_sample : float >= 1
        Over-sampling factor for exponential resampling.

    axis : int
        The axis along which to transform ``y``

    Returns
    -------
    x_scale : np.ndarray [dtype=complex]
        The scale transform of ``y`` along the ``axis`` dimension.

    Raises
    ------
    ParameterError
        if ``n_fmt < 2`` or ``t_min <= 0``
        or if ``y`` is not finite
        or if ``y.shape[axis] < 3``.

    Notes
    -----
    This function caches at level 30.

    Examples
    --------
    >>> # Generate a signal and time-stretch it (with energy normalization)
    >>> scale = 1.25
    >>> freq = 3.0
    >>> x1 = np.linspace(0, 1, num=1024, endpoint=False)
    >>> x2 = np.linspace(0, 1, num=int(scale * len(x1)), endpoint=False)
    >>> y1 = np.sin(2 * np.pi * freq * x1)
    >>> y2 = np.sin(2 * np.pi * freq * x2) / np.sqrt(scale)
    >>> # Verify that the two signals have the same energy
    >>> np.sum(np.abs(y1)**2), np.sum(np.abs(y2)**2)
        (255.99999999999997, 255.99999999999969)
    >>> scale1 = librosa.fmt(y1, n_fmt=512)
    >>> scale2 = librosa.fmt(y2, n_fmt=512)

    >>> # And plot the results
    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=2)
    >>> ax[0].plot(y1, label='Original')
    >>> ax[0].plot(y2, linestyle='--', label='Stretched')
    >>> ax[0].set(xlabel='time (samples)', title='Input signals')
    >>> ax[0].legend()
    >>> ax[1].semilogy(np.abs(scale1), label='Original')
    >>> ax[1].semilogy(np.abs(scale2), linestyle='--', label='Stretched')
    >>> ax[1].set(xlabel='scale coefficients', title='Scale transform magnitude')
    >>> ax[1].legend()

    >>> # Plot the scale transform of an onset strength autocorrelation
    >>> y, sr = librosa.load(librosa.ex('choice'))
    >>> odf = librosa.onset.onset_strength(y=y, sr=sr)
    >>> # Auto-correlate with up to 10 seconds lag
    >>> odf_ac = librosa.autocorrelate(odf, max_size=10 * sr // 512)
    >>> # Normalize
    >>> odf_ac = librosa.util.normalize(odf_ac, norm=np.inf)
    >>> # Compute the scale transform
    >>> odf_ac_scale = librosa.fmt(librosa.util.normalize(odf_ac), n_fmt=512)
    >>> # Plot the results
    >>> fig, ax = plt.subplots(nrows=3)
    >>> ax[0].plot(odf, label='Onset strength')
    >>> ax[0].set(xlabel='Time (frames)', title='Onset strength')
    >>> ax[1].plot(odf_ac, label='Onset autocorrelation')
    >>> ax[1].set(xlabel='Lag (frames)', title='Onset autocorrelation')
    >>> ax[2].semilogy(np.abs(odf_ac_scale), label='Scale transform magnitude')
    >>> ax[2].set(xlabel='scale coefficients')
    """
    n = y.shape[axis]

    if n < 3:
        raise ParameterError(f"y.shape[{axis}]=={n} < 3")

    if t_min <= 0:
        raise ParameterError(f"t_min={t_min} must be a positive number")

    if n_fmt is None:
        if over_sample < 1:
            raise ParameterError(f"over_sample={over_sample} must be >= 1")

        # The base is the maximum ratio between adjacent samples
        # Since the sample spacing is increasing, this is simply the
        # ratio between the positions of the last two samples: (n-1)/(n-2)
        log_base = np.log(n - 1) - np.log(n - 2)

        n_fmt = int(np.ceil(over_sample * (np.log(n - 1) - np.log(t_min)) / log_base))

    elif n_fmt < 3:
        raise ParameterError(f"n_fmt=={n_fmt} < 3")
    else:
        log_base = (np.log(n_fmt - 1) - np.log(n_fmt - 2)) / over_sample

    if not np.all(np.isfinite(y)):
        raise ParameterError("y must be finite everywhere")

    base = np.exp(log_base)
    # original grid: signal covers [0, 1).  This range is arbitrary, but convenient.
    # The final sample is positioned at (n-1)/n, so we omit the endpoint
    x = np.linspace(0, 1, num=n, endpoint=False)

    # build the interpolator
    f_interp = scipy.interpolate.interp1d(x, y, kind=kind, axis=axis)

    # build the new sampling grid
    # exponentially spaced between t_min/n and 1 (exclusive)
    # we'll go one past where we need, and drop the last sample
    # When over-sampling, the last input sample contributions n_over samples.
    # To keep the spacing consistent, we over-sample by n_over, and then
    # trim the final samples.
    n_over = int(np.ceil(over_sample))
    x_exp = np.logspace(
        (np.log(t_min) - np.log(n)) / log_base,
        0,
        num=n_fmt + n_over,
        endpoint=False,
        base=base,
    )[:-n_over]

    # Clean up any rounding errors at the boundaries of the interpolation
    # The interpolator gets angry if we try to extrapolate, so clipping is necessary here.
    if x_exp[0] < t_min or x_exp[-1] > float(n - 1.0) / n:
        x_exp = np.clip(x_exp, float(t_min) / n, x[-1])

    # Make sure that all sample points are unique
    # This should never happen!
    if len(np.unique(x_exp)) != len(x_exp):
        raise ParameterError("Redundant sample positions in Mellin transform")

    # Resample the signal
    y_res = f_interp(x_exp)

    # Broadcast the window correctly
    shape = [1] * y_res.ndim
    shape[axis] = -1

    # Apply the window and fft
    # Normalization is absorbed into the window here for expedience
    fft = get_fftlib()
    result: np.ndarray = fft.rfft(
        y_res * ((x_exp**beta).reshape(shape) * np.sqrt(n) / n_fmt), axis=axis
    )
    return result


@overload
def pcen(
    S: np.ndarray,
    *,
    sr: float = ...,
    hop_length: int = ...,
    gain: float = ...,
    bias: float = ...,
    power: float = ...,
    time_constant: float = ...,
    eps: float = ...,
    b: Optional[float] = ...,
    max_size: int = ...,
    ref: Optional[np.ndarray] = ...,
    axis: int = ...,
    max_axis: Optional[int] = ...,
    zi: Optional[np.ndarray] = ...,
    return_zf: Literal[False] = ...,
) -> np.ndarray:
    ...


@overload
def pcen(
    S: np.ndarray,
    *,
    sr: float = ...,
    hop_length: int = ...,
    gain: float = ...,
    bias: float = ...,
    power: float = ...,
    time_constant: float = ...,
    eps: float = ...,
    b: Optional[float] = ...,
    max_size: int = ...,
    ref: Optional[np.ndarray] = ...,
    axis: int = ...,
    max_axis: Optional[int] = ...,
    zi: Optional[np.ndarray] = ...,
    return_zf: Literal[True],
) -> Tuple[np.ndarray, np.ndarray]:
    ...


@overload
def pcen(
    S: np.ndarray,
    *,
    sr: float = ...,
    hop_length: int = ...,
    gain: float = ...,
    bias: float = ...,
    power: float = ...,
    time_constant: float = ...,
    eps: float = ...,
    b: Optional[float] = ...,
    max_size: int = ...,
    ref: Optional[np.ndarray] = ...,
    axis: int = ...,
    max_axis: Optional[int] = ...,
    zi: Optional[np.ndarray] = ...,
    return_zf: bool = ...,
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    ...


@cache(level=30)
def pcen(
    S: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    gain: float = 0.98,
    bias: float = 2,
    power: float = 0.5,
    time_constant: float = 0.400,
    eps: float = 1e-6,
    b: Optional[float] = None,
    max_size: int = 1,
    ref: Optional[np.ndarray] = None,
    axis: int = -1,
    max_axis: Optional[int] = None,
    zi: Optional[np.ndarray] = None,
    return_zf: bool = False,
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """Per-channel energy normalization (PCEN)

    This function normalizes a time-frequency representation ``S`` by
    performing automatic gain control, followed by nonlinear compression [#]_ ::

        P[f, t] = (S / (eps + M[f, t])**gain + bias)**power - bias**power

    IMPORTANT: the default values of eps, gain, bias, and power match the
    original publication, in which ``S`` is a 40-band mel-frequency
    spectrogram with 25 ms windowing, 10 ms frame shift, and raw audio values
    in the interval [-2**31; 2**31-1[. If you use these default values, we
    recommend to make sure that the raw audio is properly scaled to this
    interval, and not to [-1, 1[ as is most often the case.

    The matrix ``M`` is the result of applying a low-pass, temporal IIR filter
    to ``S``::

        M[f, t] = (1 - b) * M[f, t - 1] + b * S[f, t]

    If ``b`` is not provided, it is calculated as::

        b = (sqrt(1 + 4* T**2) - 1) / (2 * T**2)

    where ``T = time_constant * sr / hop_length``. [#]_

    This normalization is designed to suppress background noise and
    emphasize foreground signals, and can be used as an alternative to
    decibel scaling (`amplitude_to_db`).

    This implementation also supports smoothing across frequency bins
    by specifying ``max_size > 1``.  If this option is used, the filtered
    spectrogram ``M`` is computed as::

        M[f, t] = (1 - b) * M[f, t - 1] + b * R[f, t]

    where ``R`` has been max-filtered along the frequency axis, similar to
    the SuperFlux algorithm implemented in `onset.onset_strength`::

        R[f, t] = max(S[f - max_size//2: f + max_size//2, t])

    This can be used to perform automatic gain control on signals that cross
    or span multiple frequency bans, which may be desirable for spectrograms
    with high frequency resolution.

    .. [#] Wang, Y., Getreuer, P., Hughes, T., Lyon, R. F., & Saurous, R. A.
       (2017, March). Trainable frontend for robust and far-field keyword spotting.
       In Acoustics, Speech and Signal Processing (ICASSP), 2017
       IEEE International Conference on (pp. 5670-5674). IEEE.

    .. [#] Lostanlen, V., Salamon, J., McFee, B., Cartwright, M., Farnsworth, A.,
       Kelling, S., and Bello, J. P. Per-Channel Energy Normalization: Why and How.
       IEEE Signal Processing Letters, 26(1), 39-43.

    Parameters
    ----------
    S : np.ndarray (non-negative)
        The input (magnitude) spectrogram

    sr : number > 0 [scalar]
        The audio sampling rate

    hop_length : int > 0 [scalar]
        The hop length of ``S``, expressed in samples

    gain : number >= 0 [scalar]
        The gain factor.  Typical values should be slightly less than 1.

    bias : number >= 0 [scalar]
        The bias point of the nonlinear compression (default: 2)

    power : number >= 0 [scalar]
        The compression exponent.  Typical values should be between 0 and 0.5.
        Smaller values of ``power`` result in stronger compression.
        At the limit ``power=0``, polynomial compression becomes logarithmic.

    time_constant : number > 0 [scalar]
        The time constant for IIR filtering, measured in seconds.

    eps : number > 0 [scalar]
        A small constant used to ensure numerical stability of the filter.

    b : number in [0, 1]  [scalar]
        The filter coefficient for the low-pass filter.
        If not provided, it will be inferred from ``time_constant``.

    max_size : int > 0 [scalar]
        The width of the max filter applied to the frequency axis.
        If left as `1`, no filtering is performed.

    ref : None or np.ndarray (shape=S.shape)
        An optional pre-computed reference spectrum (``R`` in the above).
        If not provided it will be computed from ``S``.

    axis : int [scalar]
        The (time) axis of the input spectrogram.

    max_axis : None or int [scalar]
        The frequency axis of the input spectrogram.
        If `None`, and ``S`` is two-dimensional, it will be inferred
        as the opposite from ``axis``.
        If ``S`` is not two-dimensional, and ``max_size > 1``, an error
        will be raised.

    zi : np.ndarray
        The initial filter delay values.

        This may be the ``zf`` (final delay values) of a previous call to ``pcen``, or
        computed by `scipy.signal.lfilter_zi`.

    return_zf : bool
        If ``True``, return the final filter delay values along with the PCEN output ``P``.
        This is primarily useful in streaming contexts, where the final state of one
        block of processing should be used to initialize the next block.

        If ``False`` (default) only the PCEN values ``P`` are returned.

    Returns
    -------
    P : np.ndarray, non-negative [shape=(n, m)]
        The per-channel energy normalized version of ``S``.
    zf : np.ndarray (optional)
        The final filter delay values.  Only returned if ``return_zf=True``.

    See Also
    --------
    amplitude_to_db
    librosa.onset.onset_strength

    Examples
    --------
    Compare PCEN to log amplitude (dB) scaling on Mel spectra

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('robin'))

    >>> # We recommend scaling y to the range [-2**31, 2**31[ before applying
    >>> # PCEN's default parameters. Furthermore, we use power=1 to get a
    >>> # magnitude spectrum instead of a power spectrum.
    >>> S = librosa.feature.melspectrogram(y=y, sr=sr, power=1)
    >>> log_S = librosa.amplitude_to_db(S, ref=np.max)
    >>> pcen_S = librosa.pcen(S * (2**31))
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> img = librosa.display.specshow(log_S, x_axis='time', y_axis='mel', ax=ax[0])
    >>> ax[0].set(title='log amplitude (dB)', xlabel=None)
    >>> ax[0].label_outer()
    >>> imgpcen = librosa.display.specshow(pcen_S, x_axis='time', y_axis='mel', ax=ax[1])
    >>> ax[1].set(title='Per-channel energy normalization')
    >>> fig.colorbar(img, ax=ax[0], format="%+2.0f dB")
    >>> fig.colorbar(imgpcen, ax=ax[1])

    Compare PCEN with and without max-filtering

    >>> pcen_max = librosa.pcen(S * (2**31), max_size=3)
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> librosa.display.specshow(pcen_S, x_axis='time', y_axis='mel', ax=ax[0])
    >>> ax[0].set(title='Per-channel energy normalization (no max-filter)')
    >>> ax[0].label_outer()
    >>> img = librosa.display.specshow(pcen_max, x_axis='time', y_axis='mel', ax=ax[1])
    >>> ax[1].set(title='Per-channel energy normalization (max_size=3)')
    >>> fig.colorbar(img, ax=ax)
    """
    if power < 0:
        raise ParameterError(f"power={power} must be nonnegative")

    if gain < 0:
        raise ParameterError(f"gain={gain} must be non-negative")

    if bias < 0:
        raise ParameterError(f"bias={bias} must be non-negative")

    if eps <= 0:
        raise ParameterError(f"eps={eps} must be strictly positive")

    if time_constant <= 0:
        raise ParameterError(f"time_constant={time_constant} must be strictly positive")

    if not util.is_positive_int(max_size):
        raise ParameterError(f"max_size={max_size} must be a positive integer")

    if b is None:
        t_frames = time_constant * sr / float(hop_length)
        # By default, this solves the equation for b:
        #   b**2  + (1 - b) / t_frames  - 2 = 0
        # which approximates the full-width half-max of the
        # squared frequency response of the IIR low-pass filter

        b = (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2)

    if not 0 <= b <= 1:
        raise ParameterError(f"b={b} must be between 0 and 1")

    if np.issubdtype(S.dtype, np.complexfloating):
        warnings.warn(
            "pcen was called on complex input so phase "
            "information will be discarded. To suppress this warning, "
            "call pcen(np.abs(D)) instead.",
            stacklevel=2,
        )
        S = np.abs(S)

    if ref is None:
        if max_size == 1:
            ref = S
        elif S.ndim == 1:
            raise ParameterError(
                "Max-filtering cannot be applied to 1-dimensional input"
            )
        else:
            if max_axis is None:
                if S.ndim != 2:
                    raise ParameterError(
                        f"Max-filtering a {S.ndim:d}-dimensional spectrogram "
                        "requires you to specify max_axis"
                    )
                # if axis = 0, max_axis=1
                # if axis = +- 1, max_axis = 0
                max_axis = np.mod(1 - axis, 2)

            ref = scipy.ndimage.maximum_filter1d(S, max_size, axis=max_axis)

    if zi is None:
        # Make sure zi matches dimension to input
        shape = tuple([1] * ref.ndim)
        zi = np.empty(shape)
        zi[:] = scipy.signal.lfilter_zi([b], [1, b - 1])[:]

    # Temporal integration
    S_smooth: np.ndarray
    zf: np.ndarray
    S_smooth, zf = scipy.signal.lfilter([b], [1, b - 1], ref, zi=zi, axis=axis)

    # Adaptive gain control
    # Working in log-space gives us some stability, and a slight speedup
    smooth = np.exp(-gain * (np.log(eps) + np.log1p(S_smooth / eps)))

    # Dynamic range compression
    S_out: np.ndarray
    if power == 0:
        S_out = np.log1p(S * smooth)
    elif bias == 0:
        S_out = np.exp(power * (np.log(S) + np.log(smooth)))
    else:
        S_out = (bias**power) * np.expm1(power * np.log1p(S * smooth / bias))

    if return_zf:
        return S_out, zf
    else:
        return S_out


def griffinlim(
    S: np.ndarray,
    *,
    n_iter: int = 32,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    n_fft: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    dtype: Optional[DTypeLike] = None,
    length: Optional[int] = None,
    pad_mode: _PadModeSTFT = "constant",
    momentum: float = 0.99,
    init: Optional[str] = "random",
    random_state: Optional[
        Union[int, np.random.RandomState, np.random.Generator]
    ] = None,
) -> np.ndarray:
    """Approximate magnitude spectrogram inversion using the "fast" Griffin-Lim algorithm.

    Given a short-time Fourier transform magnitude matrix (``S``), the algorithm randomly
    initializes phase estimates, and then alternates forward- and inverse-STFT
    operations. [#]_

    Note that this assumes reconstruction of a real-valued time-domain signal, and
    that ``S`` contains only the non-negative frequencies (as computed by
    `stft`).

    The "fast" GL method [#]_ uses a momentum parameter to accelerate convergence.

    .. [#] D. W. Griffin and J. S. Lim,
        "Signal estimation from modified short-time Fourier transform,"
        IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.

    .. [#] Perraudin, N., Balazs, P., & Søndergaard, P. L.
        "A fast Griffin-Lim algorithm,"
        IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4),
        Oct. 2013.

    Parameters
    ----------
    S : np.ndarray [shape=(..., n_fft // 2 + 1, t), non-negative]
        An array of short-time Fourier transform magnitudes as produced by
        `stft`.

    n_iter : int > 0
        The number of iterations to run

    hop_length : None or int > 0
        The hop length of the STFT.  If not provided, it will default to ``n_fft // 4``

    win_length : None or int > 0
        The window length of the STFT.  By default, it will equal ``n_fft``

    n_fft : None or int > 0
        The number of samples per frame.
        By default, this will be inferred from the shape of ``S`` as an even number.
        However, if an odd frame length was used, you can explicitly set ``n_fft``.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        A window specification as supported by `stft` or `istft`

    center : boolean
        If ``True``, the STFT is assumed to use centered frames.
        If ``False``, the STFT is assumed to use left-aligned frames.

    dtype : np.dtype
        Real numeric type for the time-domain signal.  Default is inferred
        to match the precision of the input spectrogram.

    length : None or int > 0
        If provided, the output ``y`` is zero-padded or clipped to exactly ``length``
        samples.

    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.

    momentum : number >= 0
        The momentum parameter for fast Griffin-Lim.
        Setting this to 0 recovers the original Griffin-Lim method [1]_.
        Values near 1 can lead to faster convergence, but above 1 may not converge.

    init : None or 'random' [default]
        If 'random' (the default), then phase values are initialized randomly
        according to ``random_state``.  This is recommended when the input ``S`` is
        a magnitude spectrogram with no initial phase estimates.

        If `None`, then the phase is initialized from ``S``.  This is useful when
        an initial guess for phase can be provided, or when you want to resume
        Griffin-Lim from a previous output.

    random_state : None, int, np.random.RandomState, or np.random.Generator
        If int, random_state is the seed used by the random number generator
        for phase initialization.

        If `np.random.RandomState` or `np.random.Generator` instance, the random number
        generator itself.

        If `None`, defaults to the `np.random.default_rng()` object.

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        time-domain signal reconstructed from ``S``

    See Also
    --------
    stft
    istft
    magphase
    filters.get_window

    Examples
    --------
    A basic STFT inverse example

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> # Get the magnitude spectrogram
    >>> S = np.abs(librosa.stft(y))
    >>> # Invert using Griffin-Lim
    >>> y_inv = librosa.griffinlim(S)
    >>> # Invert without estimating phase
    >>> y_istft = librosa.istft(S)

    Wave-plot the results

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
    >>> librosa.display.waveshow(y, sr=sr, color='b', ax=ax[0])
    >>> ax[0].set(title='Original', xlabel=None)
    >>> ax[0].label_outer()
    >>> librosa.display.waveshow(y_inv, sr=sr, color='g', ax=ax[1])
    >>> ax[1].set(title='Griffin-Lim reconstruction', xlabel=None)
    >>> ax[1].label_outer()
    >>> librosa.display.waveshow(y_istft, sr=sr, color='r', ax=ax[2])
    >>> ax[2].set_title('Magnitude-only istft reconstruction')
    """
    if random_state is None:
        rng = np.random.default_rng()
    elif isinstance(random_state, int):
        rng = np.random.RandomState(seed=random_state)  # type: ignore
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state  # type: ignore
    else:
        raise ParameterError(f"Unsupported random_state={random_state!r}")

    if momentum > 1:
        warnings.warn(
            f"Griffin-Lim with momentum={momentum} > 1 can be unstable. "
            "Proceed with caution!",
            stacklevel=2,
        )
    elif momentum < 0:
        raise ParameterError(f"griffinlim() called with momentum={momentum} < 0")

    # Infer n_fft from the spectrogram shape
    if n_fft is None:
        n_fft = 2 * (S.shape[-2] - 1)

    # Infer the dtype from S
    angles = np.empty(S.shape, dtype=util.dtype_r2c(S.dtype))
    eps = util.tiny(angles)

    if init == "random":
        # randomly initialize the phase
        angles[:] = util.phasor((2 * np.pi * rng.random(size=S.shape)))
    elif init is None:
        # Initialize an all ones complex matrix
        angles[:] = 1.0
    else:
        raise ParameterError(f"init={init} must either None or 'random'")

    # Place-holders for temporary data and reconstructed buffer
    rebuilt = None
    tprev = None
    inverse = None

    # Absorb magnitudes into angles
    angles *= S
    for _ in range(n_iter):
        # Invert with our current estimate of the phases
        inverse = istft(
            angles,
            hop_length=hop_length,
            win_length=win_length,
            n_fft=n_fft,
            window=window,
            center=center,
            dtype=dtype,
            length=length,
            out=inverse,
        )

        # Rebuild the spectrogram
        rebuilt = stft(
            inverse,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            out=rebuilt,
        )

        # Update our phase estimates
        angles[:] = rebuilt
        if tprev is not None:
            angles -= (momentum / (1 + momentum)) * tprev
        angles /= np.abs(angles) + eps
        angles *= S
        # Store
        rebuilt, tprev = tprev, rebuilt

    # Return the final phase estimates
    return istft(
        angles,
        hop_length=hop_length,
        win_length=win_length,
        n_fft=n_fft,
        window=window,
        center=center,
        dtype=dtype,
        length=length,
        out=inverse,
    )


def _spectrogram(
    *,
    y: Optional[np.ndarray] = None,
    S: Optional[np.ndarray] = None,
    n_fft: Optional[int] = 2048,
    hop_length: Optional[int] = 512,
    power: float = 1,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    pad_mode: _PadModeSTFT = "constant",
) -> Tuple[np.ndarray, int]:
    """Retrieve a magnitude spectrogram.

    This is primarily used in feature extraction functions that can operate on
    either audio time-series or spectrogram input.

    Parameters
    ----------
    y : None or np.ndarray
        If provided, an audio time series

    S : None or np.ndarray
        Spectrogram input, optional

    n_fft : int > 0
        STFT window size

    hop_length : int > 0
        STFT hop length

    power : float > 0
        Exponent for the magnitude spectrogram,
        e.g., 1 for energy, 2 for power, etc.

    win_length : int <= n_fft [scalar]
        Each frame of audio is windowed by ``window``.
        The window will be of length ``win_length`` and then padded
        with zeros to match ``n_fft``.

        If unspecified, defaults to ``win_length = n_fft``.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, or number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.windows.hann`
        - a vector or array of length ``n_fft``

        .. see also:: `filters.get_window`

    center : boolean
        - If ``True``, the signal ``y`` is padded so that frame
          ``t`` is centered at ``y[t * hop_length]``.
        - If ``False``, then frame ``t`` begins at ``y[t * hop_length]``

    pad_mode : string
        If ``center=True``, the padding mode to use at the edges of the signal.
        By default, STFT uses zero padding.

    Returns
    -------
    S_out : np.ndarray [dtype=np.float]
        - If ``S`` is provided as input, then ``S_out == S``
        - Else, ``S_out = |stft(y, ...)|**power``
    n_fft : int > 0
        - If ``S`` is provided, then ``n_fft`` is inferred from ``S``
        - Else, copied from input
    """
    if S is not None:
        # Infer n_fft from spectrogram shape, but only if it mismatches
        if n_fft is None or n_fft // 2 + 1 != S.shape[-2]:
            n_fft = 2 * (S.shape[-2] - 1)
    else:
        # Otherwise, compute a magnitude spectrogram from input
        if n_fft is None:
            raise ParameterError(f"Unable to compute spectrogram with n_fft={n_fft}")
        if y is None:
            raise ParameterError(
                "Input signal must be provided to compute a spectrogram"
            )
        S = (
            np.abs(
                stft(
                    y,
                    n_fft=n_fft,
                    hop_length=hop_length,
                    win_length=win_length,
                    center=center,
                    window=window,
                    pad_mode=pad_mode,
                )
            )
            ** power
        )

    return S, n_fft