2836 lines
95 KiB
Python
2836 lines
95 KiB
Python
#!/usr/bin/env python
|
||
# -*- coding: utf-8 -*-
|
||
"""Utilities for spectral processing"""
|
||
import warnings
|
||
|
||
import numpy as np
|
||
import scipy
|
||
import scipy.ndimage
|
||
import scipy.signal
|
||
import scipy.interpolate
|
||
|
||
from numba import jit
|
||
|
||
from . import convert
|
||
from .fft import get_fftlib
|
||
from .audio import resample
|
||
from .._cache import cache
|
||
from .. import util
|
||
from ..util.exceptions import ParameterError
|
||
from ..filters import get_window, semitone_filterbank
|
||
from ..filters import window_sumsquare
|
||
from numpy.typing import DTypeLike
|
||
from typing import Any, Callable, Optional, Tuple, List, Union, overload
|
||
from typing_extensions import Literal
|
||
from .._typing import _WindowSpec, _PadMode, _PadModeSTFT
|
||
|
||
__all__ = [
|
||
"stft",
|
||
"istft",
|
||
"magphase",
|
||
"iirt",
|
||
"reassigned_spectrogram",
|
||
"phase_vocoder",
|
||
"perceptual_weighting",
|
||
"power_to_db",
|
||
"db_to_power",
|
||
"amplitude_to_db",
|
||
"db_to_amplitude",
|
||
"fmt",
|
||
"pcen",
|
||
"griffinlim",
|
||
]
|
||
|
||
|
||
@cache(level=20)
|
||
def stft(
|
||
y: np.ndarray,
|
||
*,
|
||
n_fft: int = 2048,
|
||
hop_length: Optional[int] = None,
|
||
win_length: Optional[int] = None,
|
||
window: _WindowSpec = "hann",
|
||
center: bool = True,
|
||
dtype: Optional[DTypeLike] = None,
|
||
pad_mode: _PadModeSTFT = "constant",
|
||
out: Optional[np.ndarray] = None,
|
||
) -> np.ndarray:
|
||
"""Short-time Fourier transform (STFT).
|
||
|
||
The STFT represents a signal in the time-frequency domain by
|
||
computing discrete Fourier transforms (DFT) over short overlapping
|
||
windows.
|
||
|
||
This function returns a complex-valued matrix D such that
|
||
|
||
- ``np.abs(D[..., f, t])`` is the magnitude of frequency bin ``f``
|
||
at frame ``t``, and
|
||
|
||
- ``np.angle(D[..., f, t])`` is the phase of frequency bin ``f``
|
||
at frame ``t``.
|
||
|
||
The integers ``t`` and ``f`` can be converted to physical units by means
|
||
of the utility functions `frames_to_samples` and `fft_frequencies`.
|
||
|
||
Parameters
|
||
----------
|
||
y : np.ndarray [shape=(..., n)], real-valued
|
||
input signal. Multi-channel is supported.
|
||
|
||
n_fft : int > 0 [scalar]
|
||
length of the windowed signal after padding with zeros.
|
||
The number of rows in the STFT matrix ``D`` is ``(1 + n_fft/2)``.
|
||
The default value, ``n_fft=2048`` samples, corresponds to a physical
|
||
duration of 93 milliseconds at a sample rate of 22050 Hz, i.e. the
|
||
default sample rate in librosa. This value is well adapted for music
|
||
signals. However, in speech processing, the recommended value is 512,
|
||
corresponding to 23 milliseconds at a sample rate of 22050 Hz.
|
||
In any case, we recommend setting ``n_fft`` to a power of two for
|
||
optimizing the speed of the fast Fourier transform (FFT) algorithm.
|
||
|
||
hop_length : int > 0 [scalar]
|
||
number of audio samples between adjacent STFT columns.
|
||
|
||
Smaller values increase the number of columns in ``D`` without
|
||
affecting the frequency resolution of the STFT.
|
||
|
||
If unspecified, defaults to ``win_length // 4`` (see below).
|
||
|
||
win_length : int <= n_fft [scalar]
|
||
Each frame of audio is windowed by ``window`` of length ``win_length``
|
||
and then padded with zeros to match ``n_fft``. Padding is added on
|
||
both the left- and the right-side of the window so that the window
|
||
is centered within the frame.
|
||
|
||
Smaller values improve the temporal resolution of the STFT (i.e. the
|
||
ability to discriminate impulses that are closely spaced in time)
|
||
at the expense of frequency resolution (i.e. the ability to discriminate
|
||
pure tones that are closely spaced in frequency). This effect is known
|
||
as the time-frequency localization trade-off and needs to be adjusted
|
||
according to the properties of the input signal ``y``.
|
||
|
||
If unspecified, defaults to ``win_length = n_fft``.
|
||
|
||
window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
|
||
Either:
|
||
|
||
- a window specification (string, tuple, or number);
|
||
see `scipy.signal.get_window`
|
||
- a window function, such as `scipy.signal.windows.hann`
|
||
- a vector or array of length ``n_fft``
|
||
|
||
Defaults to a raised cosine window (`'hann'`), which is adequate for
|
||
most applications in audio signal processing.
|
||
|
||
.. see also:: `filters.get_window`
|
||
|
||
center : boolean
|
||
If ``True``, the signal ``y`` is padded so that frame
|
||
``D[:, t]`` is centered at ``y[t * hop_length]``.
|
||
|
||
If ``False``, then ``D[:, t]`` begins at ``y[t * hop_length]``.
|
||
|
||
Defaults to ``True``, which simplifies the alignment of ``D`` onto a
|
||
time grid by means of `librosa.frames_to_samples`.
|
||
Note, however, that ``center`` must be set to `False` when analyzing
|
||
signals with `librosa.stream`.
|
||
|
||
.. see also:: `librosa.stream`
|
||
|
||
dtype : np.dtype, optional
|
||
Complex numeric type for ``D``. Default is inferred to match the
|
||
precision of the input signal.
|
||
|
||
pad_mode : string or function
|
||
If ``center=True``, this argument is passed to `np.pad` for padding
|
||
the edges of the signal ``y``. By default (``pad_mode="constant"``),
|
||
``y`` is padded on both sides with zeros.
|
||
|
||
.. note:: Not all padding modes supported by `numpy.pad` are supported here.
|
||
`wrap`, `mean`, `maximum`, `median`, and `minimum` are not supported.
|
||
|
||
Other modes that depend at most on input values at the edges of the
|
||
signal (e.g., `constant`, `edge`, `linear_ramp`) are supported.
|
||
|
||
If ``center=False``, this argument is ignored.
|
||
|
||
.. see also:: `numpy.pad`
|
||
|
||
out : np.ndarray or None
|
||
A pre-allocated, complex-valued array to store the STFT results.
|
||
This must be of compatible shape and dtype for the given input parameters.
|
||
|
||
If `out` is larger than necessary for the provided input signal, then only
|
||
a prefix slice of `out` will be used.
|
||
|
||
If not provided, a new array is allocated and returned.
|
||
|
||
Returns
|
||
-------
|
||
D : np.ndarray [shape=(..., 1 + n_fft/2, n_frames), dtype=dtype]
|
||
Complex-valued matrix of short-term Fourier transform
|
||
coefficients.
|
||
|
||
If a pre-allocated `out` array is provided, then `D` will be
|
||
a reference to `out`.
|
||
|
||
If `out` is larger than necessary, then `D` will be a sliced
|
||
view: `D = out[..., :n_frames]`.
|
||
|
||
See Also
|
||
--------
|
||
istft : Inverse STFT
|
||
reassigned_spectrogram : Time-frequency reassigned spectrogram
|
||
|
||
Notes
|
||
-----
|
||
This function caches at level 20.
|
||
|
||
Examples
|
||
--------
|
||
>>> y, sr = librosa.load(librosa.ex('trumpet'))
|
||
>>> S = np.abs(librosa.stft(y))
|
||
>>> S
|
||
array([[5.395e-03, 3.332e-03, ..., 9.862e-07, 1.201e-05],
|
||
[3.244e-03, 2.690e-03, ..., 9.536e-07, 1.201e-05],
|
||
...,
|
||
[7.523e-05, 3.722e-05, ..., 1.188e-04, 1.031e-03],
|
||
[7.640e-05, 3.944e-05, ..., 5.180e-04, 1.346e-03]],
|
||
dtype=float32)
|
||
|
||
Use left-aligned frames, instead of centered frames
|
||
|
||
>>> S_left = librosa.stft(y, center=False)
|
||
|
||
Use a shorter hop length
|
||
|
||
>>> D_short = librosa.stft(y, hop_length=64)
|
||
|
||
Display a spectrogram
|
||
|
||
>>> import matplotlib.pyplot as plt
|
||
>>> fig, ax = plt.subplots()
|
||
>>> img = librosa.display.specshow(librosa.amplitude_to_db(S,
|
||
... ref=np.max),
|
||
... y_axis='log', x_axis='time', ax=ax)
|
||
>>> ax.set_title('Power spectrogram')
|
||
>>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
|
||
"""
|
||
# By default, use the entire frame
|
||
if win_length is None:
|
||
win_length = n_fft
|
||
|
||
# Set the default hop, if it's not already specified
|
||
if hop_length is None:
|
||
hop_length = int(win_length // 4)
|
||
elif not util.is_positive_int(hop_length):
|
||
raise ParameterError(f"hop_length={hop_length} must be a positive integer")
|
||
|
||
# Check audio is valid
|
||
util.valid_audio(y, mono=False)
|
||
|
||
fft_window = get_window(window, win_length, fftbins=True)
|
||
|
||
# Pad the window out to n_fft size
|
||
fft_window = util.pad_center(fft_window, size=n_fft)
|
||
|
||
# Reshape so that the window can be broadcast
|
||
fft_window = util.expand_to(fft_window, ndim=1 + y.ndim, axes=-2)
|
||
|
||
# Pad the time series so that frames are centered
|
||
if center:
|
||
if pad_mode in ("wrap", "maximum", "mean", "median", "minimum"):
|
||
# Note: padding with a user-provided function "works", but
|
||
# use at your own risk.
|
||
# Since we don't pass-through kwargs here, any arguments
|
||
# to a user-provided pad function should be encapsulated
|
||
# by using functools.partial:
|
||
#
|
||
# >>> my_pad_func = functools.partial(pad_func, foo=x, bar=y)
|
||
# >>> librosa.stft(..., pad_mode=my_pad_func)
|
||
|
||
raise ParameterError(
|
||
f"pad_mode='{pad_mode}' is not supported by librosa.stft"
|
||
)
|
||
|
||
if n_fft > y.shape[-1]:
|
||
warnings.warn(
|
||
f"n_fft={n_fft} is too large for input signal of length={y.shape[-1]}"
|
||
)
|
||
|
||
# Set up the padding array to be empty, and we'll fix the target dimension later
|
||
padding = [(0, 0) for _ in range(y.ndim)]
|
||
|
||
# How many frames depend on left padding?
|
||
start_k = int(np.ceil(n_fft // 2 / hop_length))
|
||
|
||
# What's the first frame that depends on extra right-padding?
|
||
tail_k = (y.shape[-1] + n_fft // 2 - n_fft) // hop_length + 1
|
||
|
||
if tail_k <= start_k:
|
||
# If tail and head overlap, then just copy-pad the signal and carry on
|
||
start = 0
|
||
extra = 0
|
||
padding[-1] = (n_fft // 2, n_fft // 2)
|
||
y = np.pad(y, padding, mode=pad_mode)
|
||
else:
|
||
# If tail and head do not overlap, then we can implement padding on each part separately
|
||
# and avoid a full copy-pad
|
||
|
||
# "Middle" of the signal starts here, and does not depend on head padding
|
||
start = start_k * hop_length - n_fft // 2
|
||
padding[-1] = (n_fft // 2, 0)
|
||
|
||
# +1 here is to ensure enough samples to fill the window
|
||
# fixes bug #1567
|
||
y_pre = np.pad(
|
||
y[..., : (start_k - 1) * hop_length - n_fft // 2 + n_fft + 1],
|
||
padding,
|
||
mode=pad_mode,
|
||
)
|
||
y_frames_pre = util.frame(y_pre, frame_length=n_fft, hop_length=hop_length)
|
||
# Trim this down to the exact number of frames we should have
|
||
y_frames_pre = y_frames_pre[..., :start_k]
|
||
|
||
# How many extra frames do we have from the head?
|
||
extra = y_frames_pre.shape[-1]
|
||
|
||
# Determine if we have any frames that will fit inside the tail pad
|
||
if tail_k * hop_length - n_fft // 2 + n_fft <= y.shape[-1] + n_fft // 2:
|
||
padding[-1] = (0, n_fft // 2)
|
||
y_post = np.pad(
|
||
y[..., (tail_k) * hop_length - n_fft // 2 :], padding, mode=pad_mode
|
||
)
|
||
y_frames_post = util.frame(
|
||
y_post, frame_length=n_fft, hop_length=hop_length
|
||
)
|
||
# How many extra frames do we have from the tail?
|
||
extra += y_frames_post.shape[-1]
|
||
else:
|
||
# In this event, the first frame that touches tail padding would run off
|
||
# the end of the padded array
|
||
# We'll circumvent this by allocating an empty frame buffer for the tail
|
||
# this keeps the subsequent logic simple
|
||
post_shape = list(y_frames_pre.shape)
|
||
post_shape[-1] = 0
|
||
y_frames_post = np.empty_like(y_frames_pre, shape=post_shape)
|
||
else:
|
||
if n_fft > y.shape[-1]:
|
||
raise ParameterError(
|
||
f"n_fft={n_fft} is too large for uncentered analysis of input signal of length={y.shape[-1]}"
|
||
)
|
||
|
||
# "Middle" of the signal starts at sample 0
|
||
start = 0
|
||
# We have no extra frames
|
||
extra = 0
|
||
|
||
fft = get_fftlib()
|
||
|
||
if dtype is None:
|
||
dtype = util.dtype_r2c(y.dtype)
|
||
|
||
# Window the time series.
|
||
y_frames = util.frame(y[..., start:], frame_length=n_fft, hop_length=hop_length)
|
||
|
||
# Pre-allocate the STFT matrix
|
||
shape = list(y_frames.shape)
|
||
|
||
# This is our frequency dimension
|
||
shape[-2] = 1 + n_fft // 2
|
||
|
||
# If there's padding, there will be extra head and tail frames
|
||
shape[-1] += extra
|
||
|
||
if out is None:
|
||
stft_matrix = np.zeros(shape, dtype=dtype, order="F")
|
||
elif not (np.allclose(out.shape[:-1], shape[:-1]) and out.shape[-1] >= shape[-1]):
|
||
raise ParameterError(
|
||
f"Shape mismatch for provided output array out.shape={out.shape} and target shape={shape}"
|
||
)
|
||
elif not np.iscomplexobj(out):
|
||
raise ParameterError(f"output with dtype={out.dtype} is not of complex type")
|
||
else:
|
||
if np.allclose(shape, out.shape):
|
||
stft_matrix = out
|
||
else:
|
||
stft_matrix = out[..., : shape[-1]]
|
||
|
||
# Fill in the warm-up
|
||
if center and extra > 0:
|
||
off_start = y_frames_pre.shape[-1]
|
||
stft_matrix[..., :off_start] = fft.rfft(fft_window * y_frames_pre, axis=-2)
|
||
|
||
off_end = y_frames_post.shape[-1]
|
||
if off_end > 0:
|
||
stft_matrix[..., -off_end:] = fft.rfft(fft_window * y_frames_post, axis=-2)
|
||
else:
|
||
off_start = 0
|
||
|
||
n_columns = int(
|
||
util.MAX_MEM_BLOCK // (np.prod(y_frames.shape[:-1]) * y_frames.itemsize)
|
||
)
|
||
n_columns = max(n_columns, 1)
|
||
|
||
for bl_s in range(0, y_frames.shape[-1], n_columns):
|
||
bl_t = min(bl_s + n_columns, y_frames.shape[-1])
|
||
|
||
stft_matrix[..., bl_s + off_start : bl_t + off_start] = fft.rfft(
|
||
fft_window * y_frames[..., bl_s:bl_t], axis=-2
|
||
)
|
||
return stft_matrix
|
||
|
||
|
||
@cache(level=30)
|
||
def istft(
|
||
stft_matrix: np.ndarray,
|
||
*,
|
||
hop_length: Optional[int] = None,
|
||
win_length: Optional[int] = None,
|
||
n_fft: Optional[int] = None,
|
||
window: _WindowSpec = "hann",
|
||
center: bool = True,
|
||
dtype: Optional[DTypeLike] = None,
|
||
length: Optional[int] = None,
|
||
out: Optional[np.ndarray] = None,
|
||
) -> np.ndarray:
|
||
"""
|
||
Inverse short-time Fourier transform (ISTFT).
|
||
|
||
Converts a complex-valued spectrogram ``stft_matrix`` to time-series ``y``
|
||
by minimizing the mean squared error between ``stft_matrix`` and STFT of
|
||
``y`` as described in [#]_ up to Section 2 (reconstruction from MSTFT).
|
||
|
||
In general, window function, hop length and other parameters should be same
|
||
as in stft, which mostly leads to perfect reconstruction of a signal from
|
||
unmodified ``stft_matrix``.
|
||
|
||
.. [#] D. W. Griffin and J. S. Lim,
|
||
"Signal estimation from modified short-time Fourier transform,"
|
||
IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.
|
||
|
||
Parameters
|
||
----------
|
||
stft_matrix : np.ndarray [shape=(..., 1 + n_fft//2, t)]
|
||
STFT matrix from ``stft``
|
||
|
||
hop_length : int > 0 [scalar]
|
||
Number of frames between STFT columns.
|
||
If unspecified, defaults to ``win_length // 4``.
|
||
|
||
win_length : int <= n_fft = 2 * (stft_matrix.shape[0] - 1)
|
||
When reconstructing the time series, each frame is windowed
|
||
and each sample is normalized by the sum of squared window
|
||
according to the ``window`` function (see below).
|
||
|
||
If unspecified, defaults to ``n_fft``.
|
||
|
||
n_fft : int > 0 or None
|
||
The number of samples per frame in the input spectrogram.
|
||
By default, this will be inferred from the shape of ``stft_matrix``.
|
||
However, if an odd frame length was used, you can specify the correct
|
||
length by setting ``n_fft``.
|
||
|
||
window : string, tuple, number, function, np.ndarray [shape=(n_fft,)]
|
||
- a window specification (string, tuple, or number);
|
||
see `scipy.signal.get_window`
|
||
- a window function, such as `scipy.signal.windows.hann`
|
||
- a user-specified window vector of length ``n_fft``
|
||
|
||
.. see also:: `filters.get_window`
|
||
|
||
center : boolean
|
||
- If ``True``, ``D`` is assumed to have centered frames.
|
||
- If ``False``, ``D`` is assumed to have left-aligned frames.
|
||
|
||
dtype : numeric type
|
||
Real numeric type for ``y``. Default is to match the numerical
|
||
precision of the input spectrogram.
|
||
|
||
length : int > 0, optional
|
||
If provided, the output ``y`` is zero-padded or clipped to exactly
|
||
``length`` samples.
|
||
|
||
out : np.ndarray or None
|
||
A pre-allocated, complex-valued array to store the reconstructed signal
|
||
``y``. This must be of the correct shape for the given input parameters.
|
||
|
||
If not provided, a new array is allocated and returned.
|
||
|
||
Returns
|
||
-------
|
||
y : np.ndarray [shape=(..., n)]
|
||
time domain signal reconstructed from ``stft_matrix``.
|
||
If ``stft_matrix`` contains more than two axes
|
||
(e.g., from a stereo input signal), then ``y`` will match shape on the leading dimensions.
|
||
|
||
See Also
|
||
--------
|
||
stft : Short-time Fourier Transform
|
||
|
||
Notes
|
||
-----
|
||
This function caches at level 30.
|
||
|
||
Examples
|
||
--------
|
||
>>> y, sr = librosa.load(librosa.ex('trumpet'))
|
||
>>> D = librosa.stft(y)
|
||
>>> y_hat = librosa.istft(D)
|
||
>>> y_hat
|
||
array([-1.407e-03, -4.461e-04, ..., 5.131e-06, -1.417e-05],
|
||
dtype=float32)
|
||
|
||
Exactly preserving length of the input signal requires explicit padding.
|
||
Otherwise, a partial frame at the end of ``y`` will not be represented.
|
||
|
||
>>> n = len(y)
|
||
>>> n_fft = 2048
|
||
>>> y_pad = librosa.util.fix_length(y, size=n + n_fft // 2)
|
||
>>> D = librosa.stft(y_pad, n_fft=n_fft)
|
||
>>> y_out = librosa.istft(D, length=n)
|
||
>>> np.max(np.abs(y - y_out))
|
||
8.940697e-08
|
||
"""
|
||
if n_fft is None:
|
||
n_fft = 2 * (stft_matrix.shape[-2] - 1)
|
||
|
||
# By default, use the entire frame
|
||
if win_length is None:
|
||
win_length = n_fft
|
||
|
||
# Set the default hop, if it's not already specified
|
||
if hop_length is None:
|
||
hop_length = int(win_length // 4)
|
||
|
||
ifft_window = get_window(window, win_length, fftbins=True)
|
||
|
||
# Pad out to match n_fft, and add broadcasting axes
|
||
ifft_window = util.pad_center(ifft_window, size=n_fft)
|
||
ifft_window = util.expand_to(ifft_window, ndim=stft_matrix.ndim, axes=-2)
|
||
|
||
# For efficiency, trim STFT frames according to signal length if available
|
||
if length:
|
||
if center:
|
||
padded_length = length + 2 * (n_fft // 2)
|
||
else:
|
||
padded_length = length
|
||
n_frames = min(stft_matrix.shape[-1], int(np.ceil(padded_length / hop_length)))
|
||
else:
|
||
n_frames = stft_matrix.shape[-1]
|
||
|
||
if dtype is None:
|
||
dtype = util.dtype_c2r(stft_matrix.dtype)
|
||
|
||
shape = list(stft_matrix.shape[:-2])
|
||
expected_signal_len = n_fft + hop_length * (n_frames - 1)
|
||
|
||
if length:
|
||
expected_signal_len = length
|
||
elif center:
|
||
expected_signal_len -= 2 * (n_fft // 2)
|
||
|
||
shape.append(expected_signal_len)
|
||
|
||
if out is None:
|
||
y = np.zeros(shape, dtype=dtype)
|
||
elif not np.allclose(out.shape, shape):
|
||
raise ParameterError(
|
||
f"Shape mismatch for provided output array out.shape={out.shape} != {shape}"
|
||
)
|
||
else:
|
||
y = out
|
||
# Since we'll be doing overlap-add here, this needs to be initialized to zero.
|
||
y.fill(0.0)
|
||
|
||
fft = get_fftlib()
|
||
|
||
if center:
|
||
# First frame that does not depend on padding
|
||
# k * hop_length - n_fft//2 >= 0
|
||
# k * hop_length >= n_fft // 2
|
||
# k >= (n_fft//2 / hop_length)
|
||
|
||
start_frame = int(np.ceil((n_fft // 2) / hop_length))
|
||
|
||
# Do overlap-add on the head block
|
||
ytmp = ifft_window * fft.irfft(stft_matrix[..., :start_frame], n=n_fft, axis=-2)
|
||
|
||
shape[-1] = n_fft + hop_length * (start_frame - 1)
|
||
head_buffer = np.zeros(shape, dtype=dtype)
|
||
|
||
__overlap_add(head_buffer, ytmp, hop_length)
|
||
|
||
# If y is smaller than the head buffer, take everything
|
||
if y.shape[-1] < shape[-1] - n_fft // 2:
|
||
y[..., :] = head_buffer[..., n_fft // 2 : y.shape[-1] + n_fft // 2]
|
||
else:
|
||
# Trim off the first n_fft//2 samples from the head and copy into target buffer
|
||
y[..., : shape[-1] - n_fft // 2] = head_buffer[..., n_fft // 2 :]
|
||
|
||
# This offset compensates for any differences between frame alignment
|
||
# and padding truncation
|
||
offset = start_frame * hop_length - n_fft // 2
|
||
|
||
else:
|
||
start_frame = 0
|
||
offset = 0
|
||
|
||
n_columns = int(
|
||
util.MAX_MEM_BLOCK // (np.prod(stft_matrix.shape[:-1]) * stft_matrix.itemsize)
|
||
)
|
||
n_columns = max(n_columns, 1)
|
||
|
||
frame = 0
|
||
for bl_s in range(start_frame, n_frames, n_columns):
|
||
bl_t = min(bl_s + n_columns, n_frames)
|
||
|
||
# invert the block and apply the window function
|
||
ytmp = ifft_window * fft.irfft(stft_matrix[..., bl_s:bl_t], n=n_fft, axis=-2)
|
||
|
||
# Overlap-add the istft block starting at the i'th frame
|
||
__overlap_add(y[..., frame * hop_length + offset :], ytmp, hop_length)
|
||
|
||
frame += bl_t - bl_s
|
||
|
||
# Normalize by sum of squared window
|
||
ifft_window_sum = window_sumsquare(
|
||
window=window,
|
||
n_frames=n_frames,
|
||
win_length=win_length,
|
||
n_fft=n_fft,
|
||
hop_length=hop_length,
|
||
dtype=dtype,
|
||
)
|
||
|
||
if center:
|
||
start = n_fft // 2
|
||
else:
|
||
start = 0
|
||
|
||
ifft_window_sum = util.fix_length(ifft_window_sum[..., start:], size=y.shape[-1])
|
||
|
||
approx_nonzero_indices = ifft_window_sum > util.tiny(ifft_window_sum)
|
||
|
||
y[..., approx_nonzero_indices] /= ifft_window_sum[approx_nonzero_indices]
|
||
|
||
return y
|
||
|
||
|
||
@jit(nopython=True, cache=True)
|
||
def __overlap_add(y, ytmp, hop_length):
|
||
# numba-accelerated overlap add for inverse stft
|
||
# y is the pre-allocated output buffer
|
||
# ytmp is the windowed inverse-stft frames
|
||
# hop_length is the hop-length of the STFT analysis
|
||
|
||
n_fft = ytmp.shape[-2]
|
||
N = n_fft
|
||
for frame in range(ytmp.shape[-1]):
|
||
sample = frame * hop_length
|
||
if N > y.shape[-1] - sample:
|
||
N = y.shape[-1] - sample
|
||
|
||
y[..., sample : (sample + N)] += ytmp[..., :N, frame]
|
||
|
||
|
||
def __reassign_frequencies(
|
||
y: np.ndarray,
|
||
sr: float = 22050,
|
||
S: Optional[np.ndarray] = None,
|
||
n_fft: int = 2048,
|
||
hop_length: Optional[int] = None,
|
||
win_length: Optional[int] = None,
|
||
window: _WindowSpec = "hann",
|
||
center: bool = True,
|
||
dtype: Optional[DTypeLike] = None,
|
||
pad_mode: _PadModeSTFT = "constant",
|
||
) -> Tuple[np.ndarray, np.ndarray]:
|
||
"""Instantaneous frequencies based on a spectrogram representation.
|
||
|
||
The reassignment vector is calculated using equation 5.20 in Flandrin,
|
||
Auger, & Chassande-Mottin 2002::
|
||
|
||
omega_reassigned = omega - np.imag(S_dh/S_h)
|
||
|
||
where ``S_h`` is the complex STFT calculated using the original window, and
|
||
``S_dh`` is the complex STFT calculated using the derivative of the original
|
||
window.
|
||
|
||
See `reassigned_spectrogram` for references.
|
||
|
||
It is recommended to use ``pad_mode="wrap"`` or else ``center=False``, rather
|
||
than the defaults. Frequency reassignment assumes that the energy in each
|
||
FFT bin is associated with exactly one signal component. Reflection padding
|
||
at the edges of the signal may invalidate the reassigned estimates in the
|
||
boundary frames.
|
||
|
||
Parameters
|
||
----------
|
||
y : np.ndarray [shape=(..., n,)], real-valued
|
||
audio time series. Multi-channel is supported.
|
||
|
||
sr : number > 0 [scalar]
|
||
sampling rate of ``y``
|
||
|
||
S : np.ndarray [shape=(..., d, t)] or None
|
||
(optional) complex STFT calculated using the other arguments provided
|
||
to `__reassign_frequencies`
|
||
|
||
n_fft : int > 0 [scalar]
|
||
FFT window size. Defaults to 2048.
|
||
|
||
hop_length : int > 0 [scalar]
|
||
hop length, number samples between subsequent frames.
|
||
If not supplied, defaults to ``win_length // 4``.
|
||
|
||
win_length : int > 0, <= n_fft
|
||
Window length. Defaults to ``n_fft``.
|
||
See ``stft`` for details.
|
||
|
||
window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
|
||
- a window specification (string, tuple, number);
|
||
see `scipy.signal.get_window`
|
||
- a window function, such as `scipy.signal.windows.hann`
|
||
- a user-specified window vector of length ``n_fft``
|
||
|
||
See `stft` for details.
|
||
|
||
.. see also:: `filters.get_window`
|
||
|
||
center : boolean
|
||
- If ``True``, the signal ``y`` is padded so that frame
|
||
``S[:, t]`` is centered at ``y[t * hop_length]``.
|
||
- If ``False``, then ``S[:, t]`` begins at ``y[t * hop_length]``.
|
||
|
||
dtype : numeric type
|
||
Complex numeric type for ``S``. Default is inferred to match
|
||
the numerical precision of the input signal.
|
||
|
||
pad_mode : string
|
||
If ``center=True``, the padding mode to use at the edges of the signal.
|
||
By default, STFT uses zero padding.
|
||
|
||
Returns
|
||
-------
|
||
freqs : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=real]
|
||
Instantaneous frequencies:
|
||
``freqs[f, t]`` is the frequency for bin ``f``, frame ``t``.
|
||
S : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=complex]
|
||
Short-time Fourier transform
|
||
|
||
Warns
|
||
-----
|
||
RuntimeWarning
|
||
Frequencies with zero support will produce a divide-by-zero warning and
|
||
will be returned as `np.nan`.
|
||
|
||
See Also
|
||
--------
|
||
stft : Short-time Fourier Transform
|
||
reassigned_spectrogram : Time-frequency reassigned spectrogram
|
||
|
||
Examples
|
||
--------
|
||
>>> y, sr = librosa.load(librosa.ex('trumpet'))
|
||
>>> frequencies, S = librosa.core.spectrum.__reassign_frequencies(y, sr=sr)
|
||
>>> frequencies
|
||
array([[0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00],
|
||
[3.628e+00, 4.698e+00, ..., 1.239e+01, 1.072e+01],
|
||
...,
|
||
[1.101e+04, 1.102e+04, ..., 1.105e+04, 1.102e+04],
|
||
[1.102e+04, 1.102e+04, ..., 1.102e+04, 1.102e+04]])
|
||
"""
|
||
# retrieve window samples if needed so that the window derivative can be
|
||
# calculated
|
||
if win_length is None:
|
||
win_length = n_fft
|
||
|
||
window = get_window(window, win_length, fftbins=True)
|
||
window = util.pad_center(window, size=n_fft)
|
||
|
||
if S is None:
|
||
if dtype is None:
|
||
dtype = util.dtype_r2c(y.dtype)
|
||
|
||
S_h = stft(
|
||
y=y,
|
||
n_fft=n_fft,
|
||
hop_length=hop_length,
|
||
window=window,
|
||
center=center,
|
||
dtype=dtype,
|
||
pad_mode=pad_mode,
|
||
)
|
||
|
||
else:
|
||
if dtype is None:
|
||
dtype = S.dtype
|
||
|
||
S_h = S
|
||
|
||
# cyclic gradient to correctly handle edges of a periodic window
|
||
window_derivative = util.cyclic_gradient(window)
|
||
|
||
S_dh = stft(
|
||
y=y,
|
||
n_fft=n_fft,
|
||
hop_length=hop_length,
|
||
window=window_derivative,
|
||
center=center,
|
||
dtype=dtype,
|
||
pad_mode=pad_mode,
|
||
)
|
||
|
||
# equation 5.20 of Flandrin, Auger, & Chassande-Mottin 2002
|
||
# the sign of the correction is reversed in some papers - see Plante,
|
||
# Meyer, & Ainsworth 1998 pp. 283-284
|
||
correction = -np.imag(S_dh / S_h)
|
||
|
||
freqs = convert.fft_frequencies(sr=sr, n_fft=n_fft)
|
||
freqs = util.expand_to(freqs, ndim=correction.ndim, axes=-2) + correction * (
|
||
0.5 * sr / np.pi
|
||
)
|
||
|
||
return freqs, S_h
|
||
|
||
|
||
def __reassign_times(
|
||
y: np.ndarray,
|
||
sr: float = 22050,
|
||
S: Optional[np.ndarray] = None,
|
||
n_fft: int = 2048,
|
||
hop_length: Optional[int] = None,
|
||
win_length: Optional[int] = None,
|
||
window: _WindowSpec = "hann",
|
||
center: bool = True,
|
||
dtype: Optional[DTypeLike] = None,
|
||
pad_mode: _PadModeSTFT = "constant",
|
||
) -> Tuple[np.ndarray, np.ndarray]:
|
||
"""Time reassignments based on a spectrogram representation.
|
||
|
||
The reassignment vector is calculated using equation 5.23 in Flandrin,
|
||
Auger, & Chassande-Mottin 2002::
|
||
|
||
t_reassigned = t + np.real(S_th/S_h)
|
||
|
||
where ``S_h`` is the complex STFT calculated using the original window, and
|
||
``S_th`` is the complex STFT calculated using the original window multiplied
|
||
by the time offset from the window center.
|
||
|
||
See `reassigned_spectrogram` for references.
|
||
|
||
It is recommended to use ``pad_mode="constant"`` (zero padding) or else
|
||
``center=False``, rather than the defaults. Time reassignment assumes that
|
||
the energy in each FFT bin is associated with exactly one impulse event.
|
||
Reflection padding at the edges of the signal may invalidate the reassigned
|
||
estimates in the boundary frames.
|
||
|
||
Parameters
|
||
----------
|
||
y : np.ndarray [shape=(..., n,)], real-valued
|
||
audio time series. Multi-channel is supported.
|
||
|
||
sr : number > 0 [scalar]
|
||
sampling rate of ``y``
|
||
|
||
S : np.ndarray [shape=(..., d, t)] or None
|
||
(optional) complex STFT calculated using the other arguments provided
|
||
to `__reassign_times`
|
||
|
||
n_fft : int > 0 [scalar]
|
||
FFT window size. Defaults to 2048.
|
||
|
||
hop_length : int > 0 [scalar]
|
||
hop length, number samples between subsequent frames.
|
||
If not supplied, defaults to ``win_length // 4``.
|
||
|
||
win_length : int > 0, <= n_fft
|
||
Window length. Defaults to ``n_fft``.
|
||
See `stft` for details.
|
||
|
||
window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
|
||
- a window specification (string, tuple, number);
|
||
see `scipy.signal.get_window`
|
||
- a window function, such as `scipy.signal.windows.hann`
|
||
- a user-specified window vector of length ``n_fft``
|
||
|
||
See `stft` for details.
|
||
|
||
.. see also:: `filters.get_window`
|
||
|
||
center : boolean
|
||
- If ``True``, the signal ``y`` is padded so that frame
|
||
``S[:, t]`` is centered at ``y[t * hop_length]``.
|
||
- If ``False``, then ``S[:, t]`` begins at ``y[t * hop_length]``.
|
||
|
||
dtype : numeric type
|
||
Complex numeric type for ``S``. Default is inferred to match
|
||
the precision of the input signal.
|
||
|
||
pad_mode : string
|
||
If ``center=True``, the padding mode to use at the edges of the signal.
|
||
By default, STFT uses zero padding.
|
||
|
||
Returns
|
||
-------
|
||
times : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=real]
|
||
Reassigned times:
|
||
``times[f, t]`` is the time for bin ``f``, frame ``t``.
|
||
S : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=complex]
|
||
Short-time Fourier transform
|
||
|
||
Warns
|
||
-----
|
||
RuntimeWarning
|
||
Time estimates with zero support will produce a divide-by-zero warning
|
||
and will be returned as `np.nan`.
|
||
|
||
See Also
|
||
--------
|
||
stft : Short-time Fourier Transform
|
||
reassigned_spectrogram : Time-frequency reassigned spectrogram
|
||
|
||
Examples
|
||
--------
|
||
>>> y, sr = librosa.load(librosa.ex('trumpet'))
|
||
>>> times, S = librosa.core.spectrum.__reassign_times(y, sr=sr)
|
||
>>> times
|
||
array([[ 2.268e-05, 1.144e-02, ..., 5.332e+00, 5.333e+00],
|
||
[ 2.268e-05, 1.451e-02, ..., 5.334e+00, 5.333e+00],
|
||
...,
|
||
[ 2.268e-05, -6.177e-04, ..., 5.368e+00, 5.327e+00],
|
||
[ 2.268e-05, 1.420e-03, ..., 5.307e+00, 5.328e+00]])
|
||
"""
|
||
# retrieve window samples if needed so that the time-weighted window can be
|
||
# calculated
|
||
if win_length is None:
|
||
win_length = n_fft
|
||
|
||
window = get_window(window, win_length, fftbins=True)
|
||
window = util.pad_center(window, size=n_fft)
|
||
|
||
# retrieve hop length if needed so that the frame times can be calculated
|
||
if hop_length is None:
|
||
hop_length = int(win_length // 4)
|
||
|
||
if S is None:
|
||
if dtype is None:
|
||
dtype = util.dtype_r2c(y.dtype)
|
||
S_h = stft(
|
||
y=y,
|
||
n_fft=n_fft,
|
||
hop_length=hop_length,
|
||
window=window,
|
||
center=center,
|
||
dtype=dtype,
|
||
pad_mode=pad_mode,
|
||
)
|
||
|
||
else:
|
||
if dtype is None:
|
||
dtype = S.dtype
|
||
S_h = S
|
||
|
||
# calculate window weighted by time
|
||
half_width = n_fft // 2
|
||
|
||
window_times: np.ndarray
|
||
if n_fft % 2:
|
||
window_times = np.arange(-half_width, half_width + 1)
|
||
|
||
else:
|
||
window_times = np.arange(0.5 - half_width, half_width)
|
||
|
||
window_time_weighted = window * window_times
|
||
|
||
S_th = stft(
|
||
y=y,
|
||
n_fft=n_fft,
|
||
hop_length=hop_length,
|
||
window=window_time_weighted,
|
||
center=center,
|
||
dtype=dtype,
|
||
pad_mode=pad_mode,
|
||
)
|
||
|
||
# equation 5.23 of Flandrin, Auger, & Chassande-Mottin 2002
|
||
# the sign of the correction is reversed in some papers - see Plante,
|
||
# Meyer, & Ainsworth 1998 pp. 283-284
|
||
correction = np.real(S_th / S_h)
|
||
|
||
if center:
|
||
pad_length = None
|
||
|
||
else:
|
||
pad_length = n_fft
|
||
|
||
times = convert.frames_to_time(
|
||
np.arange(S_h.shape[-1]), sr=sr, hop_length=hop_length, n_fft=pad_length
|
||
)
|
||
|
||
times = util.expand_to(times, ndim=correction.ndim, axes=-1) + correction / sr
|
||
|
||
return times, S_h
|
||
|
||
|
||
def reassigned_spectrogram(
|
||
y: np.ndarray,
|
||
*,
|
||
sr: float = 22050,
|
||
S: Optional[np.ndarray] = None,
|
||
n_fft: int = 2048,
|
||
hop_length: Optional[int] = None,
|
||
win_length: Optional[int] = None,
|
||
window: _WindowSpec = "hann",
|
||
center: bool = True,
|
||
reassign_frequencies: bool = True,
|
||
reassign_times: bool = True,
|
||
ref_power: Union[float, Callable] = 1e-6,
|
||
fill_nan: bool = False,
|
||
clip: bool = True,
|
||
dtype: Optional[DTypeLike] = None,
|
||
pad_mode: _PadModeSTFT = "constant",
|
||
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
|
||
r"""Time-frequency reassigned spectrogram.
|
||
|
||
The reassignment vectors are calculated using equations 5.20 and 5.23 in
|
||
[#]_::
|
||
|
||
t_reassigned = t + np.real(S_th/S_h)
|
||
omega_reassigned = omega - np.imag(S_dh/S_h)
|
||
|
||
where ``S_h`` is the complex STFT calculated using the original window,
|
||
``S_dh`` is the complex STFT calculated using the derivative of the original
|
||
window, and ``S_th`` is the complex STFT calculated using the original window
|
||
multiplied by the time offset from the window center. See [#]_ for
|
||
additional algorithms, and [#]_ and [#]_ for history and discussion of the
|
||
method.
|
||
|
||
.. [#] Flandrin, P., Auger, F., & Chassande-Mottin, E. (2002).
|
||
Time-Frequency reassignment: From principles to algorithms. In
|
||
Applications in Time-Frequency Signal Processing (Vol. 10, pp.
|
||
179-204). CRC Press.
|
||
|
||
.. [#] Fulop, S. A., & Fitz, K. (2006). Algorithms for computing the
|
||
time-corrected instantaneous frequency (reassigned) spectrogram, with
|
||
applications. The Journal of the Acoustical Society of America, 119(1),
|
||
360. doi:10.1121/1.2133000
|
||
|
||
.. [#] Auger, F., Flandrin, P., Lin, Y.-T., McLaughlin, S., Meignen, S.,
|
||
Oberlin, T., & Wu, H.-T. (2013). Time-Frequency Reassignment and
|
||
Synchrosqueezing: An Overview. IEEE Signal Processing Magazine, 30(6),
|
||
32-41. doi:10.1109/MSP.2013.2265316
|
||
|
||
.. [#] Hainsworth, S., Macleod, M. (2003). Time-frequency reassignment: a
|
||
review and analysis. Tech. Rep. CUED/FINFENG/TR.459, Cambridge
|
||
University Engineering Department
|
||
|
||
Parameters
|
||
----------
|
||
y : np.ndarray [shape=(..., n)], real-valued
|
||
audio time series. Multi-channel is supported.
|
||
|
||
sr : number > 0 [scalar]
|
||
sampling rate of ``y``
|
||
|
||
S : np.ndarray [shape=(..., d, t)] or None
|
||
(optional) complex STFT calculated using the other arguments provided
|
||
to ``reassigned_spectrogram``
|
||
|
||
n_fft : int > 0 [scalar]
|
||
FFT window size. Defaults to 2048.
|
||
|
||
hop_length : int > 0 [scalar]
|
||
hop length, number samples between subsequent frames.
|
||
If not supplied, defaults to ``win_length // 4``.
|
||
|
||
win_length : int > 0, <= n_fft
|
||
Window length. Defaults to ``n_fft``.
|
||
See `stft` for details.
|
||
|
||
window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
|
||
- a window specification (string, tuple, number);
|
||
see `scipy.signal.get_window`
|
||
- a window function, such as `scipy.signal.windows.hann`
|
||
- a user-specified window vector of length ``n_fft``
|
||
|
||
See `stft` for details.
|
||
|
||
.. see also:: `filters.get_window`
|
||
|
||
center : boolean
|
||
- If ``True`` (default), the signal ``y`` is padded so that frame
|
||
``S[:, t]`` is centered at ``y[t * hop_length]``. See `Notes` for
|
||
recommended usage in this function.
|
||
- If ``False``, then ``S[:, t]`` begins at ``y[t * hop_length]``.
|
||
|
||
reassign_frequencies : boolean
|
||
- If ``True`` (default), the returned frequencies will be instantaneous
|
||
frequency estimates.
|
||
- If ``False``, the returned frequencies will be a read-only view of the
|
||
STFT bin frequencies for all frames.
|
||
|
||
reassign_times : boolean
|
||
- If ``True`` (default), the returned times will be corrected
|
||
(reassigned) time estimates for each bin.
|
||
- If ``False``, the returned times will be a read-only view of the STFT
|
||
frame times for all bins.
|
||
|
||
ref_power : float >= 0 or callable
|
||
Minimum power threshold for estimating time-frequency reassignments.
|
||
Any bin with ``np.abs(S[f, t])**2 < ref_power`` will be returned as
|
||
`np.nan` in both frequency and time, unless ``fill_nan`` is ``True``. If 0
|
||
is provided, then only bins with zero power will be returned as
|
||
`np.nan` (unless ``fill_nan=True``).
|
||
|
||
fill_nan : boolean
|
||
- If ``False`` (default), the frequency and time reassignments for bins
|
||
below the power threshold provided in ``ref_power`` will be returned as
|
||
`np.nan`.
|
||
- If ``True``, the frequency and time reassignments for these bins will
|
||
be returned as the bin center frequencies and frame times.
|
||
|
||
clip : boolean
|
||
- If ``True`` (default), estimated frequencies outside the range
|
||
`[0, 0.5 * sr]` or times outside the range `[0, len(y) / sr]` will be
|
||
clipped to those ranges.
|
||
- If ``False``, estimated frequencies and times beyond the bounds of the
|
||
spectrogram may be returned.
|
||
|
||
dtype : numeric type
|
||
Complex numeric type for STFT calculation. Default is inferred to match
|
||
the precision of the input signal.
|
||
|
||
pad_mode : string
|
||
If ``center=True``, the padding mode to use at the edges of the signal.
|
||
By default, STFT uses zero padding.
|
||
|
||
Returns
|
||
-------
|
||
freqs, times, mags : np.ndarray [shape=(..., 1 + n_fft/2, t), dtype=real]
|
||
Instantaneous frequencies:
|
||
``freqs[..., f, t]`` is the frequency for bin ``f``, frame ``t``.
|
||
If ``reassign_frequencies=False``, this will instead be a read-only array
|
||
of the same shape containing the bin center frequencies for all frames.
|
||
|
||
Reassigned times:
|
||
``times[..., f, t]`` is the time for bin ``f``, frame ``t``.
|
||
If ``reassign_times=False``, this will instead be a read-only array of
|
||
the same shape containing the frame times for all bins.
|
||
|
||
Magnitudes from short-time Fourier transform:
|
||
``mags[..., f, t]`` is the magnitude for bin ``f``, frame ``t``.
|
||
|
||
Warns
|
||
-----
|
||
RuntimeWarning
|
||
Frequency or time estimates with zero support will produce a
|
||
divide-by-zero warning, and will be returned as `np.nan` unless
|
||
``fill_nan=True``.
|
||
|
||
See Also
|
||
--------
|
||
stft : Short-time Fourier Transform
|
||
|
||
Notes
|
||
-----
|
||
It is recommended to use ``center=False`` with this function rather than the
|
||
librosa default ``True``. Unlike ``stft``, reassigned times are not aligned to
|
||
the left or center of each frame, so padding the signal does not affect the
|
||
meaning of the reassigned times. However, reassignment assumes that the
|
||
energy in each FFT bin is associated with exactly one signal component and
|
||
impulse event.
|
||
|
||
If ``reassign_times`` is ``False``, the frame times that are returned will be
|
||
aligned to the left or center of the frame, depending on the value of
|
||
``center``. In this case, if ``center`` is ``True``, then ``pad_mode="wrap"`` is
|
||
recommended for valid estimation of the instantaneous frequencies in the
|
||
boundary frames.
|
||
|
||
Examples
|
||
--------
|
||
>>> import matplotlib.pyplot as plt
|
||
>>> amin = 1e-10
|
||
>>> n_fft = 64
|
||
>>> sr = 4000
|
||
>>> y = 1e-3 * librosa.clicks(times=[0.3], sr=sr, click_duration=1.0,
|
||
... click_freq=1200.0, length=8000) +\
|
||
... 1e-3 * librosa.clicks(times=[1.5], sr=sr, click_duration=0.5,
|
||
... click_freq=400.0, length=8000) +\
|
||
... 1e-3 * librosa.chirp(fmin=200, fmax=1600, sr=sr, duration=2.0) +\
|
||
... 1e-6 * np.random.randn(2*sr)
|
||
>>> freqs, times, mags = librosa.reassigned_spectrogram(y=y, sr=sr,
|
||
... n_fft=n_fft)
|
||
>>> mags_db = librosa.amplitude_to_db(mags, ref=np.max)
|
||
|
||
>>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
|
||
>>> img = librosa.display.specshow(mags_db, x_axis="s", y_axis="linear", sr=sr,
|
||
... hop_length=n_fft//4, ax=ax[0])
|
||
>>> ax[0].set(title="Spectrogram", xlabel=None)
|
||
>>> ax[0].label_outer()
|
||
>>> ax[1].scatter(times, freqs, c=mags_db, cmap="magma", alpha=0.1, s=5)
|
||
>>> ax[1].set_title("Reassigned spectrogram")
|
||
>>> fig.colorbar(img, ax=ax, format="%+2.f dB")
|
||
"""
|
||
if not callable(ref_power) and ref_power < 0:
|
||
raise ParameterError("ref_power must be non-negative or callable.")
|
||
|
||
if not reassign_frequencies and not reassign_times:
|
||
raise ParameterError("reassign_frequencies or reassign_times must be True.")
|
||
|
||
if win_length is None:
|
||
win_length = n_fft
|
||
|
||
if hop_length is None:
|
||
hop_length = int(win_length // 4)
|
||
|
||
# frequency and time reassignment if requested
|
||
if reassign_frequencies:
|
||
freqs, S = __reassign_frequencies(
|
||
y=y,
|
||
sr=sr,
|
||
S=S,
|
||
n_fft=n_fft,
|
||
hop_length=hop_length,
|
||
win_length=win_length,
|
||
window=window,
|
||
center=center,
|
||
dtype=dtype,
|
||
pad_mode=pad_mode,
|
||
)
|
||
|
||
if reassign_times:
|
||
times, S = __reassign_times(
|
||
y=y,
|
||
sr=sr,
|
||
S=S,
|
||
n_fft=n_fft,
|
||
hop_length=hop_length,
|
||
win_length=win_length,
|
||
window=window,
|
||
center=center,
|
||
dtype=dtype,
|
||
pad_mode=pad_mode,
|
||
)
|
||
|
||
assert S is not None
|
||
|
||
mags: np.ndarray = np.abs(S)
|
||
|
||
# clean up reassignment issues: divide-by-zero, bins with near-zero power,
|
||
# and estimates outside the spectrogram bounds
|
||
|
||
# retrieve bin frequencies and frame times to replace missing estimates
|
||
if fill_nan or not reassign_frequencies or not reassign_times:
|
||
if center:
|
||
pad_length = None
|
||
|
||
else:
|
||
pad_length = n_fft
|
||
|
||
bin_freqs = convert.fft_frequencies(sr=sr, n_fft=n_fft)
|
||
|
||
frame_times = convert.frames_to_time(
|
||
frames=np.arange(S.shape[-1]),
|
||
sr=sr,
|
||
hop_length=hop_length,
|
||
n_fft=pad_length,
|
||
)
|
||
|
||
# find bins below the power threshold
|
||
# reassigned bins with zero power will already be NaN
|
||
if callable(ref_power):
|
||
ref_p = ref_power(mags**2)
|
||
else:
|
||
ref_p = ref_power
|
||
mags_low = np.less(mags, ref_p**0.5, where=~np.isnan(mags))
|
||
|
||
# for reassigned estimates, optionally set thresholded bins to NaN, return
|
||
# bin frequencies and frame times in place of NaN generated by
|
||
# divide-by-zero and power threshold, and clip to spectrogram bounds
|
||
if reassign_frequencies:
|
||
if ref_p > 0:
|
||
freqs[mags_low] = np.nan
|
||
|
||
if fill_nan:
|
||
freqs = np.where(np.isnan(freqs), bin_freqs[:, np.newaxis], freqs)
|
||
|
||
if clip:
|
||
np.clip(freqs, 0, sr / 2.0, out=freqs)
|
||
|
||
# or if reassignment was not requested, return bin frequencies and frame
|
||
# times for every cell is the spectrogram
|
||
else:
|
||
freqs = np.broadcast_to(bin_freqs[:, np.newaxis], S.shape)
|
||
|
||
if reassign_times:
|
||
if ref_p > 0:
|
||
times[mags_low] = np.nan
|
||
|
||
if fill_nan:
|
||
times = np.where(np.isnan(times), frame_times[np.newaxis, :], times)
|
||
|
||
if clip:
|
||
np.clip(times, 0, y.shape[-1] / float(sr), out=times)
|
||
|
||
else:
|
||
times = np.broadcast_to(frame_times[np.newaxis, :], S.shape)
|
||
|
||
return freqs, times, mags
|
||
|
||
|
||
def magphase(D: np.ndarray, *, power: float = 1) -> Tuple[np.ndarray, np.ndarray]:
|
||
"""Separate a complex-valued spectrogram D into its magnitude (S)
|
||
and phase (P) components, so that ``D = S * P``.
|
||
|
||
Parameters
|
||
----------
|
||
D : np.ndarray [shape=(..., d, t), dtype=complex]
|
||
complex-valued spectrogram
|
||
power : float > 0
|
||
Exponent for the magnitude spectrogram,
|
||
e.g., 1 for energy, 2 for power, etc.
|
||
|
||
Returns
|
||
-------
|
||
D_mag : np.ndarray [shape=(..., d, t), dtype=real]
|
||
magnitude of ``D``, raised to ``power``
|
||
D_phase : np.ndarray [shape=(..., d, t), dtype=complex]
|
||
``exp(1.j * phi)`` where ``phi`` is the phase of ``D``
|
||
|
||
Examples
|
||
--------
|
||
>>> y, sr = librosa.load(librosa.ex('trumpet'))
|
||
>>> D = librosa.stft(y)
|
||
>>> magnitude, phase = librosa.magphase(D)
|
||
>>> magnitude
|
||
array([[5.395e-03, 3.332e-03, ..., 9.862e-07, 1.201e-05],
|
||
[3.244e-03, 2.690e-03, ..., 9.536e-07, 1.201e-05],
|
||
...,
|
||
[7.523e-05, 3.722e-05, ..., 1.188e-04, 1.031e-03],
|
||
[7.640e-05, 3.944e-05, ..., 5.180e-04, 1.346e-03]],
|
||
dtype=float32)
|
||
>>> phase
|
||
array([[ 1. +0.000e+00j, 1. +0.000e+00j, ...,
|
||
-1. -8.742e-08j, -1. -8.742e-08j],
|
||
[-1. -8.742e-08j, -0.775-6.317e-01j, ...,
|
||
-0.885-4.648e-01j, 0.472-8.815e-01j],
|
||
...,
|
||
[ 1. -4.342e-12j, 0.028-9.996e-01j, ...,
|
||
-0.222-9.751e-01j, -0.75 -6.610e-01j],
|
||
[-1. -8.742e-08j, -1. -8.742e-08j, ...,
|
||
1. +0.000e+00j, 1. +0.000e+00j]], dtype=complex64)
|
||
|
||
Or get the phase angle (in radians)
|
||
|
||
>>> np.angle(phase)
|
||
array([[ 0.000e+00, 0.000e+00, ..., -3.142e+00, -3.142e+00],
|
||
[-3.142e+00, -2.458e+00, ..., -2.658e+00, -1.079e+00],
|
||
...,
|
||
[-4.342e-12, -1.543e+00, ..., -1.794e+00, -2.419e+00],
|
||
[-3.142e+00, -3.142e+00, ..., 0.000e+00, 0.000e+00]],
|
||
dtype=float32)
|
||
"""
|
||
mag = np.abs(D)
|
||
|
||
# Prevent NaNs and return magnitude 0, phase 1+0j for zero
|
||
zeros_to_ones = mag == 0
|
||
mag_nonzero = mag + zeros_to_ones
|
||
# Compute real and imaginary separately, because complex division can
|
||
# produce NaNs when denormalized numbers are involved (< ~2e-39 for
|
||
# complex64, ~5e-309 for complex128)
|
||
phase = np.empty_like(D, dtype=util.dtype_r2c(D.dtype))
|
||
phase.real = D.real / mag_nonzero + zeros_to_ones
|
||
phase.imag = D.imag / mag_nonzero
|
||
|
||
mag **= power
|
||
|
||
return mag, phase
|
||
|
||
|
||
def phase_vocoder(
|
||
D: np.ndarray,
|
||
*,
|
||
rate: float,
|
||
hop_length: Optional[int] = None,
|
||
n_fft: Optional[int] = None,
|
||
) -> np.ndarray:
|
||
"""Phase vocoder. Given an STFT matrix D, speed up by a factor of ``rate``
|
||
|
||
Based on the implementation provided by [#]_.
|
||
|
||
This is a simplified implementation, intended primarily for
|
||
reference and pedagogical purposes. It makes no attempt to
|
||
handle transients, and is likely to produce many audible
|
||
artifacts. For a higher quality implementation, we recommend
|
||
the RubberBand library [#]_ and its Python wrapper `pyrubberband`.
|
||
|
||
.. [#] Ellis, D. P. W. "A phase vocoder in Matlab."
|
||
Columbia University, 2002.
|
||
https://www.ee.columbia.edu/~dpwe/resources/matlab/pvoc/
|
||
|
||
.. [#] https://breakfastquay.com/rubberband/
|
||
|
||
Examples
|
||
--------
|
||
>>> # Play at double speed
|
||
>>> y, sr = librosa.load(librosa.ex('trumpet'))
|
||
>>> D = librosa.stft(y, n_fft=2048, hop_length=512)
|
||
>>> D_fast = librosa.phase_vocoder(D, rate=2.0, hop_length=512)
|
||
>>> y_fast = librosa.istft(D_fast, hop_length=512)
|
||
|
||
>>> # Or play at 1/3 speed
|
||
>>> y, sr = librosa.load(librosa.ex('trumpet'))
|
||
>>> D = librosa.stft(y, n_fft=2048, hop_length=512)
|
||
>>> D_slow = librosa.phase_vocoder(D, rate=1./3, hop_length=512)
|
||
>>> y_slow = librosa.istft(D_slow, hop_length=512)
|
||
|
||
Parameters
|
||
----------
|
||
D : np.ndarray [shape=(..., d, t), dtype=complex]
|
||
STFT matrix
|
||
|
||
rate : float > 0 [scalar]
|
||
Speed-up factor: ``rate > 1`` is faster, ``rate < 1`` is slower.
|
||
|
||
hop_length : int > 0 [scalar] or None
|
||
The number of samples between successive columns of ``D``.
|
||
|
||
If None, defaults to ``n_fft//4 = (D.shape[0]-1)//2``
|
||
|
||
n_fft : int > 0 or None
|
||
The number of samples per frame in D.
|
||
By default (None), this will be inferred from the shape of D.
|
||
However, if D was constructed using an odd-length window, the correct
|
||
frame length can be specified here.
|
||
|
||
Returns
|
||
-------
|
||
D_stretched : np.ndarray [shape=(..., d, t / rate), dtype=complex]
|
||
time-stretched STFT
|
||
|
||
See Also
|
||
--------
|
||
pyrubberband
|
||
"""
|
||
if n_fft is None:
|
||
n_fft = 2 * (D.shape[-2] - 1)
|
||
|
||
if hop_length is None:
|
||
hop_length = int(n_fft // 4)
|
||
|
||
time_steps = np.arange(0, D.shape[-1], rate, dtype=np.float64)
|
||
|
||
# Create an empty output array
|
||
shape = list(D.shape)
|
||
shape[-1] = len(time_steps)
|
||
d_stretch = np.zeros_like(D, shape=shape)
|
||
|
||
# Expected phase advance in each bin
|
||
phi_advance = np.linspace(0, np.pi * hop_length, D.shape[-2])
|
||
|
||
# Phase accumulator; initialize to the first sample
|
||
phase_acc = np.angle(D[..., 0])
|
||
|
||
# Pad 0 columns to simplify boundary logic
|
||
padding = [(0, 0) for _ in D.shape]
|
||
padding[-1] = (0, 2)
|
||
D = np.pad(D, padding, mode="constant")
|
||
|
||
for t, step in enumerate(time_steps):
|
||
columns = D[..., int(step) : int(step + 2)]
|
||
|
||
# Weighting for linear magnitude interpolation
|
||
alpha = np.mod(step, 1.0)
|
||
mag = (1.0 - alpha) * np.abs(columns[..., 0]) + alpha * np.abs(columns[..., 1])
|
||
|
||
# Store to output array
|
||
d_stretch[..., t] = util.phasor(phase_acc, mag=mag)
|
||
|
||
# Compute phase advance
|
||
dphase = np.angle(columns[..., 1]) - np.angle(columns[..., 0]) - phi_advance
|
||
|
||
# Wrap to -pi:pi range
|
||
dphase = dphase - 2.0 * np.pi * np.round(dphase / (2.0 * np.pi))
|
||
|
||
# Accumulate phase
|
||
phase_acc += phi_advance + dphase
|
||
|
||
return d_stretch
|
||
|
||
|
||
@cache(level=20)
|
||
def iirt(
|
||
y: np.ndarray,
|
||
*,
|
||
sr: float = 22050,
|
||
win_length: int = 2048,
|
||
hop_length: Optional[int] = None,
|
||
center: bool = True,
|
||
tuning: float = 0.0,
|
||
pad_mode: _PadMode = "constant",
|
||
flayout: str = "sos",
|
||
res_type: str = "soxr_hq",
|
||
**kwargs: Any,
|
||
) -> np.ndarray:
|
||
r"""Time-frequency representation using IIR filters
|
||
|
||
This function will return a time-frequency representation
|
||
using a multirate filter bank consisting of IIR filters. [#]_
|
||
|
||
First, ``y`` is resampled as needed according to the provided ``sample_rates``.
|
||
|
||
Then, a filterbank with with ``n`` band-pass filters is designed.
|
||
|
||
The resampled input signals are processed by the filterbank as a whole.
|
||
(`scipy.signal.filtfilt` resp. `sosfiltfilt` is used to make the phase linear.)
|
||
The output of the filterbank is cut into frames.
|
||
For each band, the short-time mean-square power (STMSP) is calculated by
|
||
summing ``win_length`` subsequent filtered time samples.
|
||
|
||
When called with the default set of parameters, it will generate the TF-representation
|
||
(pitch filterbank):
|
||
|
||
* 85 filters with MIDI pitches [24, 108] as ``center_freqs``.
|
||
* each filter having a bandwidth of one semitone.
|
||
|
||
.. [#] Müller, Meinard.
|
||
"Information Retrieval for Music and Motion."
|
||
Springer Verlag. 2007.
|
||
|
||
Parameters
|
||
----------
|
||
y : np.ndarray [shape=(..., n)]
|
||
audio time series. Multi-channel is supported.
|
||
sr : number > 0 [scalar]
|
||
sampling rate of ``y``
|
||
win_length : int > 0, <= n_fft
|
||
Window length.
|
||
hop_length : int > 0 [scalar]
|
||
Hop length, number samples between subsequent frames.
|
||
If not supplied, defaults to ``win_length // 4``.
|
||
center : boolean
|
||
- If ``True``, the signal ``y`` is padded so that frame
|
||
``D[..., :, t]`` is centered at ``y[t * hop_length]``.
|
||
- If ``False``, then `D[..., :, t]`` begins at ``y[t * hop_length]``
|
||
tuning : float [scalar]
|
||
Tuning deviation from A440 in fractions of a bin.
|
||
pad_mode : string
|
||
If ``center=True``, the padding mode to use at the edges of the signal.
|
||
By default, this function uses zero padding.
|
||
flayout : string
|
||
- If `sos` (default), a series of second-order filters is used for filtering with `scipy.signal.sosfiltfilt`.
|
||
Minimizes numerical precision errors for high-order filters, but is slower.
|
||
- If `ba`, the standard difference equation is used for filtering with `scipy.signal.filtfilt`.
|
||
Can be unstable for high-order filters.
|
||
res_type : string
|
||
The resampling mode. See `librosa.resample` for details.
|
||
**kwargs : additional keyword arguments
|
||
Additional arguments for `librosa.filters.semitone_filterbank`
|
||
(e.g., could be used to provide another set of ``center_freqs`` and ``sample_rates``).
|
||
|
||
Returns
|
||
-------
|
||
bands_power : np.ndarray [shape=(..., n, t), dtype=dtype]
|
||
Short-time mean-square power for the input signal.
|
||
|
||
Raises
|
||
------
|
||
ParameterError
|
||
If ``flayout`` is not None, `ba`, or `sos`.
|
||
|
||
See Also
|
||
--------
|
||
librosa.filters.semitone_filterbank
|
||
librosa.filters.mr_frequencies
|
||
librosa.cqt
|
||
scipy.signal.filtfilt
|
||
scipy.signal.sosfiltfilt
|
||
|
||
Examples
|
||
--------
|
||
>>> import matplotlib.pyplot as plt
|
||
>>> y, sr = librosa.load(librosa.ex('trumpet'), duration=3)
|
||
>>> D = np.abs(librosa.iirt(y))
|
||
>>> C = np.abs(librosa.cqt(y=y, sr=sr))
|
||
>>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
|
||
>>> img = librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
|
||
... y_axis='cqt_hz', x_axis='time', ax=ax[0])
|
||
>>> ax[0].set(title='Constant-Q transform')
|
||
>>> ax[0].label_outer()
|
||
>>> img = librosa.display.specshow(librosa.amplitude_to_db(D, ref=np.max),
|
||
... y_axis='cqt_hz', x_axis='time', ax=ax[1])
|
||
>>> ax[1].set_title('Semitone spectrogram (iirt)')
|
||
>>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
|
||
"""
|
||
if flayout not in ("ba", "sos"):
|
||
raise ParameterError(f"Unsupported flayout={flayout}")
|
||
|
||
# check audio input
|
||
util.valid_audio(y, mono=False)
|
||
|
||
# Set the default hop, if it's not already specified
|
||
if hop_length is None:
|
||
hop_length = win_length // 4
|
||
|
||
# Pad the time series so that frames are centered
|
||
if center:
|
||
padding = [(0, 0) for _ in y.shape]
|
||
padding[-1] = (win_length // 2, win_length // 2)
|
||
y = np.pad(y, padding, mode=pad_mode)
|
||
|
||
# get the semitone filterbank
|
||
filterbank_ct, sample_rates = semitone_filterbank(
|
||
tuning=tuning, flayout=flayout, **kwargs
|
||
)
|
||
|
||
# create three downsampled versions of the audio signal
|
||
y_resampled = []
|
||
|
||
y_srs = np.unique(sample_rates)
|
||
|
||
for cur_sr in y_srs:
|
||
y_resampled.append(resample(y, orig_sr=sr, target_sr=cur_sr, res_type=res_type))
|
||
|
||
# Compute the number of frames that will fit. The end may get truncated.
|
||
n_frames = int(1 + (y.shape[-1] - win_length) // hop_length)
|
||
|
||
# Pre-allocate the output array
|
||
shape = list(y.shape)
|
||
# Time dimension reduces to n_frames
|
||
shape[-1] = n_frames
|
||
# Insert a new axis at position -2 for filter response
|
||
shape.insert(-1, len(filterbank_ct))
|
||
|
||
bands_power = np.empty_like(y, shape=shape)
|
||
|
||
slices: List[Union[int, slice]] = [slice(None) for _ in bands_power.shape]
|
||
for i, (cur_sr, cur_filter) in enumerate(zip(sample_rates, filterbank_ct)):
|
||
slices[-2] = i
|
||
|
||
# filter the signal
|
||
cur_sr_idx = np.flatnonzero(y_srs == cur_sr)[0]
|
||
|
||
if flayout == "ba":
|
||
cur_filter_output = scipy.signal.filtfilt(
|
||
cur_filter[0], cur_filter[1], y_resampled[cur_sr_idx], axis=-1
|
||
)
|
||
elif flayout == "sos":
|
||
cur_filter_output = scipy.signal.sosfiltfilt(
|
||
cur_filter, y_resampled[cur_sr_idx], axis=-1
|
||
)
|
||
|
||
factor = sr / cur_sr
|
||
hop_length_STMSP = hop_length / factor
|
||
win_length_STMSP_round = int(round(win_length / factor))
|
||
|
||
# hop_length_STMSP is used here as a floating-point number.
|
||
# The discretization happens at the end to avoid accumulated rounding errors.
|
||
start_idx = np.arange(
|
||
0, cur_filter_output.shape[-1] - win_length_STMSP_round, hop_length_STMSP
|
||
)
|
||
if len(start_idx) < n_frames:
|
||
min_length = (
|
||
int(np.ceil(n_frames * hop_length_STMSP)) + win_length_STMSP_round
|
||
)
|
||
cur_filter_output = util.fix_length(cur_filter_output, size=min_length)
|
||
start_idx = np.arange(
|
||
0,
|
||
cur_filter_output.shape[-1] - win_length_STMSP_round,
|
||
hop_length_STMSP,
|
||
)
|
||
start_idx = np.round(start_idx).astype(int)[:n_frames]
|
||
|
||
idx = np.add.outer(start_idx, np.arange(win_length_STMSP_round))
|
||
|
||
bands_power[tuple(slices)] = factor * np.sum(
|
||
cur_filter_output[..., idx] ** 2, axis=-1
|
||
)
|
||
|
||
return bands_power
|
||
|
||
|
||
@cache(level=30)
|
||
def power_to_db(
|
||
S: np.ndarray,
|
||
*,
|
||
ref: Union[float, Callable] = 1.0,
|
||
amin: float = 1e-10,
|
||
top_db: Optional[float] = 80.0,
|
||
) -> np.ndarray:
|
||
"""Convert a power spectrogram (amplitude squared) to decibel (dB) units
|
||
|
||
This computes the scaling ``10 * log10(S / ref)`` in a numerically
|
||
stable way.
|
||
|
||
Parameters
|
||
----------
|
||
S : np.ndarray
|
||
input power
|
||
|
||
ref : scalar or callable
|
||
If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``::
|
||
|
||
10 * log10(S / ref)
|
||
|
||
Zeros in the output correspond to positions where ``S == ref``.
|
||
|
||
If callable, the reference value is computed as ``ref(S)``.
|
||
|
||
amin : float > 0 [scalar]
|
||
minimum threshold for ``abs(S)`` and ``ref``
|
||
|
||
top_db : float >= 0 [scalar]
|
||
threshold the output at ``top_db`` below the peak:
|
||
``max(10 * log10(S/ref)) - top_db``
|
||
|
||
Returns
|
||
-------
|
||
S_db : np.ndarray
|
||
``S_db ~= 10 * log10(S) - 10 * log10(ref)``
|
||
|
||
See Also
|
||
--------
|
||
perceptual_weighting
|
||
db_to_power
|
||
amplitude_to_db
|
||
db_to_amplitude
|
||
|
||
Notes
|
||
-----
|
||
This function caches at level 30.
|
||
|
||
Examples
|
||
--------
|
||
Get a power spectrogram from a waveform ``y``
|
||
|
||
>>> y, sr = librosa.load(librosa.ex('trumpet'))
|
||
>>> S = np.abs(librosa.stft(y))
|
||
>>> librosa.power_to_db(S**2)
|
||
array([[-41.809, -41.809, ..., -41.809, -41.809],
|
||
[-41.809, -41.809, ..., -41.809, -41.809],
|
||
...,
|
||
[-41.809, -41.809, ..., -41.809, -41.809],
|
||
[-41.809, -41.809, ..., -41.809, -41.809]], dtype=float32)
|
||
|
||
Compute dB relative to peak power
|
||
|
||
>>> librosa.power_to_db(S**2, ref=np.max)
|
||
array([[-80., -80., ..., -80., -80.],
|
||
[-80., -80., ..., -80., -80.],
|
||
...,
|
||
[-80., -80., ..., -80., -80.],
|
||
[-80., -80., ..., -80., -80.]], dtype=float32)
|
||
|
||
Or compare to median power
|
||
|
||
>>> librosa.power_to_db(S**2, ref=np.median)
|
||
array([[16.578, 16.578, ..., 16.578, 16.578],
|
||
[16.578, 16.578, ..., 16.578, 16.578],
|
||
...,
|
||
[16.578, 16.578, ..., 16.578, 16.578],
|
||
[16.578, 16.578, ..., 16.578, 16.578]], dtype=float32)
|
||
|
||
And plot the results
|
||
|
||
>>> import matplotlib.pyplot as plt
|
||
>>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
|
||
>>> imgpow = librosa.display.specshow(S**2, sr=sr, y_axis='log', x_axis='time',
|
||
... ax=ax[0])
|
||
>>> ax[0].set(title='Power spectrogram')
|
||
>>> ax[0].label_outer()
|
||
>>> imgdb = librosa.display.specshow(librosa.power_to_db(S**2, ref=np.max),
|
||
... sr=sr, y_axis='log', x_axis='time', ax=ax[1])
|
||
>>> ax[1].set(title='Log-Power spectrogram')
|
||
>>> fig.colorbar(imgpow, ax=ax[0])
|
||
>>> fig.colorbar(imgdb, ax=ax[1], format="%+2.0f dB")
|
||
"""
|
||
S = np.asarray(S)
|
||
|
||
if amin <= 0:
|
||
raise ParameterError("amin must be strictly positive")
|
||
|
||
if np.issubdtype(S.dtype, np.complexfloating):
|
||
warnings.warn(
|
||
"power_to_db was called on complex input so phase "
|
||
"information will be discarded. To suppress this warning, "
|
||
"call power_to_db(np.abs(D)**2) instead.",
|
||
stacklevel=2,
|
||
)
|
||
magnitude = np.abs(S)
|
||
else:
|
||
magnitude = S
|
||
|
||
if callable(ref):
|
||
# User supplied a function to calculate reference power
|
||
ref_value = ref(magnitude)
|
||
else:
|
||
ref_value = np.abs(ref)
|
||
|
||
log_spec: np.ndarray = 10.0 * np.log10(np.maximum(amin, magnitude))
|
||
log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))
|
||
|
||
if top_db is not None:
|
||
if top_db < 0:
|
||
raise ParameterError("top_db must be non-negative")
|
||
log_spec = np.maximum(log_spec, log_spec.max() - top_db)
|
||
|
||
return log_spec
|
||
|
||
|
||
@cache(level=30)
|
||
def db_to_power(S_db: np.ndarray, *, ref: float = 1.0) -> np.ndarray:
|
||
"""Convert a dB-scale spectrogram to a power spectrogram.
|
||
|
||
This effectively inverts ``power_to_db``::
|
||
|
||
db_to_power(S_db) ~= ref * 10.0**(S_db / 10)
|
||
|
||
Parameters
|
||
----------
|
||
S_db : np.ndarray
|
||
dB-scaled spectrogram
|
||
ref : number > 0
|
||
Reference power: output will be scaled by this value
|
||
|
||
Returns
|
||
-------
|
||
S : np.ndarray
|
||
Power spectrogram
|
||
|
||
Notes
|
||
-----
|
||
This function caches at level 30.
|
||
"""
|
||
return ref * np.power(10.0, 0.1 * S_db)
|
||
|
||
|
||
@cache(level=30)
|
||
def amplitude_to_db(
|
||
S: np.ndarray,
|
||
*,
|
||
ref: Union[float, Callable] = 1.0,
|
||
amin: float = 1e-5,
|
||
top_db: Optional[float] = 80.0,
|
||
) -> np.ndarray:
|
||
"""Convert an amplitude spectrogram to dB-scaled spectrogram.
|
||
|
||
This is equivalent to ``power_to_db(S**2, ref=ref**2, amin=amin**2, top_db=top_db)``,
|
||
but is provided for convenience.
|
||
|
||
Parameters
|
||
----------
|
||
S : np.ndarray
|
||
input amplitude
|
||
|
||
ref : scalar or callable
|
||
If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``:
|
||
``20 * log10(S / ref)``.
|
||
Zeros in the output correspond to positions where ``S == ref``.
|
||
|
||
If callable, the reference value is computed as ``ref(S)``.
|
||
|
||
amin : float > 0 [scalar]
|
||
minimum threshold for ``S`` and ``ref``
|
||
|
||
top_db : float >= 0 [scalar]
|
||
threshold the output at ``top_db`` below the peak:
|
||
``max(20 * log10(S/ref)) - top_db``
|
||
|
||
Returns
|
||
-------
|
||
S_db : np.ndarray
|
||
``S`` measured in dB
|
||
|
||
See Also
|
||
--------
|
||
power_to_db, db_to_amplitude
|
||
|
||
Notes
|
||
-----
|
||
This function caches at level 30.
|
||
"""
|
||
S = np.asarray(S)
|
||
|
||
if np.issubdtype(S.dtype, np.complexfloating):
|
||
warnings.warn(
|
||
"amplitude_to_db was called on complex input so phase "
|
||
"information will be discarded. To suppress this warning, "
|
||
"call amplitude_to_db(np.abs(S)) instead.",
|
||
stacklevel=2,
|
||
)
|
||
|
||
magnitude = np.abs(S)
|
||
|
||
if callable(ref):
|
||
# User supplied a function to calculate reference power
|
||
ref_value = ref(magnitude)
|
||
else:
|
||
ref_value = np.abs(ref)
|
||
|
||
power = np.square(magnitude, out=magnitude)
|
||
|
||
return power_to_db(power, ref=ref_value**2, amin=amin**2, top_db=top_db)
|
||
|
||
|
||
@cache(level=30)
|
||
def db_to_amplitude(S_db: np.ndarray, *, ref: float = 1.0) -> np.ndarray:
|
||
"""Convert a dB-scaled spectrogram to an amplitude spectrogram.
|
||
|
||
This effectively inverts `amplitude_to_db`::
|
||
|
||
db_to_amplitude(S_db) ~= 10.0**(0.5 * S_db/10 + log10(ref))
|
||
|
||
Parameters
|
||
----------
|
||
S_db : np.ndarray
|
||
dB-scaled spectrogram
|
||
ref : number > 0
|
||
Optional reference power.
|
||
|
||
Returns
|
||
-------
|
||
S : np.ndarray
|
||
Linear magnitude spectrogram
|
||
|
||
Notes
|
||
-----
|
||
This function caches at level 30.
|
||
"""
|
||
return db_to_power(S_db, ref=ref**2) ** 0.5
|
||
|
||
|
||
@cache(level=30)
|
||
def perceptual_weighting(
|
||
S: np.ndarray, frequencies: np.ndarray, *, kind: str = "A", **kwargs: Any
|
||
) -> np.ndarray:
|
||
"""Perceptual weighting of a power spectrogram::
|
||
|
||
S_p[..., f, :] = frequency_weighting(f, 'A') + 10*log(S[..., f, :] / ref)
|
||
|
||
Parameters
|
||
----------
|
||
S : np.ndarray [shape=(..., d, t)]
|
||
Power spectrogram
|
||
frequencies : np.ndarray [shape=(d,)]
|
||
Center frequency for each row of` `S``
|
||
kind : str
|
||
The frequency weighting curve to use.
|
||
e.g. `'A'`, `'B'`, `'C'`, `'D'`, `None or 'Z'`
|
||
**kwargs : additional keyword arguments
|
||
Additional keyword arguments to `power_to_db`.
|
||
|
||
Returns
|
||
-------
|
||
S_p : np.ndarray [shape=(..., d, t)]
|
||
perceptually weighted version of ``S``
|
||
|
||
See Also
|
||
--------
|
||
power_to_db
|
||
|
||
Notes
|
||
-----
|
||
This function caches at level 30.
|
||
|
||
Examples
|
||
--------
|
||
Re-weight a CQT power spectrum, using peak power as reference
|
||
|
||
>>> y, sr = librosa.load(librosa.ex('trumpet'))
|
||
>>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('A1')))
|
||
>>> freqs = librosa.cqt_frequencies(C.shape[0],
|
||
... fmin=librosa.note_to_hz('A1'))
|
||
>>> perceptual_CQT = librosa.perceptual_weighting(C**2,
|
||
... freqs,
|
||
... ref=np.max)
|
||
>>> perceptual_CQT
|
||
array([[ -96.528, -97.101, ..., -108.561, -108.561],
|
||
[ -95.88 , -96.479, ..., -107.551, -107.551],
|
||
...,
|
||
[ -65.142, -53.256, ..., -80.098, -80.098],
|
||
[ -71.542, -53.197, ..., -80.311, -80.311]])
|
||
|
||
>>> import matplotlib.pyplot as plt
|
||
>>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
|
||
>>> img = librosa.display.specshow(librosa.amplitude_to_db(C,
|
||
... ref=np.max),
|
||
... fmin=librosa.note_to_hz('A1'),
|
||
... y_axis='cqt_hz', x_axis='time',
|
||
... ax=ax[0])
|
||
>>> ax[0].set(title='Log CQT power')
|
||
>>> ax[0].label_outer()
|
||
>>> imgp = librosa.display.specshow(perceptual_CQT, y_axis='cqt_hz',
|
||
... fmin=librosa.note_to_hz('A1'),
|
||
... x_axis='time', ax=ax[1])
|
||
>>> ax[1].set(title='Perceptually weighted log CQT')
|
||
>>> fig.colorbar(img, ax=ax[0], format="%+2.0f dB")
|
||
>>> fig.colorbar(imgp, ax=ax[1], format="%+2.0f dB")
|
||
"""
|
||
offset = convert.frequency_weighting(frequencies, kind=kind).reshape((-1, 1))
|
||
|
||
result: np.ndarray = offset + power_to_db(S, **kwargs)
|
||
return result
|
||
|
||
|
||
@cache(level=30)
|
||
def fmt(
|
||
y: np.ndarray,
|
||
*,
|
||
t_min: float = 0.5,
|
||
n_fmt: Optional[int] = None,
|
||
kind: str = "cubic",
|
||
beta: float = 0.5,
|
||
over_sample: float = 1,
|
||
axis: int = -1,
|
||
) -> np.ndarray:
|
||
"""Fast Mellin transform (FMT)
|
||
|
||
The Mellin of a signal `y` is performed by interpolating `y` on an exponential time
|
||
axis, applying a polynomial window, and then taking the discrete Fourier transform.
|
||
|
||
When the Mellin parameter (beta) is 1/2, it is also known as the scale transform. [#]_
|
||
The scale transform can be useful for audio analysis because its magnitude is invariant
|
||
to scaling of the domain (e.g., time stretching or compression). This is analogous
|
||
to the magnitude of the Fourier transform being invariant to shifts in the input domain.
|
||
|
||
.. [#] De Sena, Antonio, and Davide Rocchesso.
|
||
"A fast Mellin and scale transform."
|
||
EURASIP Journal on Applied Signal Processing 2007.1 (2007): 75-75.
|
||
|
||
.. [#] Cohen, L.
|
||
"The scale representation."
|
||
IEEE Transactions on Signal Processing 41, no. 12 (1993): 3275-3292.
|
||
|
||
Parameters
|
||
----------
|
||
y : np.ndarray, real-valued
|
||
The input signal(s). Can be multidimensional.
|
||
The target axis must contain at least 3 samples.
|
||
|
||
t_min : float > 0
|
||
The minimum time spacing (in samples).
|
||
This value should generally be less than 1 to preserve as much information as
|
||
possible.
|
||
|
||
n_fmt : int > 2 or None
|
||
The number of scale transform bins to use.
|
||
If None, then ``n_bins = over_sample * ceil(n * log((n-1)/t_min))`` is taken,
|
||
where ``n = y.shape[axis]``
|
||
|
||
kind : str
|
||
The type of interpolation to use when re-sampling the input.
|
||
See `scipy.interpolate.interp1d` for possible values.
|
||
|
||
Note that the default is to use high-precision (cubic) interpolation.
|
||
This can be slow in practice; if speed is preferred over accuracy,
|
||
then consider using ``kind='linear'``.
|
||
|
||
beta : float
|
||
The Mellin parameter. ``beta=0.5`` provides the scale transform.
|
||
|
||
over_sample : float >= 1
|
||
Over-sampling factor for exponential resampling.
|
||
|
||
axis : int
|
||
The axis along which to transform ``y``
|
||
|
||
Returns
|
||
-------
|
||
x_scale : np.ndarray [dtype=complex]
|
||
The scale transform of ``y`` along the ``axis`` dimension.
|
||
|
||
Raises
|
||
------
|
||
ParameterError
|
||
if ``n_fmt < 2`` or ``t_min <= 0``
|
||
or if ``y`` is not finite
|
||
or if ``y.shape[axis] < 3``.
|
||
|
||
Notes
|
||
-----
|
||
This function caches at level 30.
|
||
|
||
Examples
|
||
--------
|
||
>>> # Generate a signal and time-stretch it (with energy normalization)
|
||
>>> scale = 1.25
|
||
>>> freq = 3.0
|
||
>>> x1 = np.linspace(0, 1, num=1024, endpoint=False)
|
||
>>> x2 = np.linspace(0, 1, num=int(scale * len(x1)), endpoint=False)
|
||
>>> y1 = np.sin(2 * np.pi * freq * x1)
|
||
>>> y2 = np.sin(2 * np.pi * freq * x2) / np.sqrt(scale)
|
||
>>> # Verify that the two signals have the same energy
|
||
>>> np.sum(np.abs(y1)**2), np.sum(np.abs(y2)**2)
|
||
(255.99999999999997, 255.99999999999969)
|
||
>>> scale1 = librosa.fmt(y1, n_fmt=512)
|
||
>>> scale2 = librosa.fmt(y2, n_fmt=512)
|
||
|
||
>>> # And plot the results
|
||
>>> import matplotlib.pyplot as plt
|
||
>>> fig, ax = plt.subplots(nrows=2)
|
||
>>> ax[0].plot(y1, label='Original')
|
||
>>> ax[0].plot(y2, linestyle='--', label='Stretched')
|
||
>>> ax[0].set(xlabel='time (samples)', title='Input signals')
|
||
>>> ax[0].legend()
|
||
>>> ax[1].semilogy(np.abs(scale1), label='Original')
|
||
>>> ax[1].semilogy(np.abs(scale2), linestyle='--', label='Stretched')
|
||
>>> ax[1].set(xlabel='scale coefficients', title='Scale transform magnitude')
|
||
>>> ax[1].legend()
|
||
|
||
>>> # Plot the scale transform of an onset strength autocorrelation
|
||
>>> y, sr = librosa.load(librosa.ex('choice'))
|
||
>>> odf = librosa.onset.onset_strength(y=y, sr=sr)
|
||
>>> # Auto-correlate with up to 10 seconds lag
|
||
>>> odf_ac = librosa.autocorrelate(odf, max_size=10 * sr // 512)
|
||
>>> # Normalize
|
||
>>> odf_ac = librosa.util.normalize(odf_ac, norm=np.inf)
|
||
>>> # Compute the scale transform
|
||
>>> odf_ac_scale = librosa.fmt(librosa.util.normalize(odf_ac), n_fmt=512)
|
||
>>> # Plot the results
|
||
>>> fig, ax = plt.subplots(nrows=3)
|
||
>>> ax[0].plot(odf, label='Onset strength')
|
||
>>> ax[0].set(xlabel='Time (frames)', title='Onset strength')
|
||
>>> ax[1].plot(odf_ac, label='Onset autocorrelation')
|
||
>>> ax[1].set(xlabel='Lag (frames)', title='Onset autocorrelation')
|
||
>>> ax[2].semilogy(np.abs(odf_ac_scale), label='Scale transform magnitude')
|
||
>>> ax[2].set(xlabel='scale coefficients')
|
||
"""
|
||
n = y.shape[axis]
|
||
|
||
if n < 3:
|
||
raise ParameterError(f"y.shape[{axis}]=={n} < 3")
|
||
|
||
if t_min <= 0:
|
||
raise ParameterError(f"t_min={t_min} must be a positive number")
|
||
|
||
if n_fmt is None:
|
||
if over_sample < 1:
|
||
raise ParameterError(f"over_sample={over_sample} must be >= 1")
|
||
|
||
# The base is the maximum ratio between adjacent samples
|
||
# Since the sample spacing is increasing, this is simply the
|
||
# ratio between the positions of the last two samples: (n-1)/(n-2)
|
||
log_base = np.log(n - 1) - np.log(n - 2)
|
||
|
||
n_fmt = int(np.ceil(over_sample * (np.log(n - 1) - np.log(t_min)) / log_base))
|
||
|
||
elif n_fmt < 3:
|
||
raise ParameterError(f"n_fmt=={n_fmt} < 3")
|
||
else:
|
||
log_base = (np.log(n_fmt - 1) - np.log(n_fmt - 2)) / over_sample
|
||
|
||
if not np.all(np.isfinite(y)):
|
||
raise ParameterError("y must be finite everywhere")
|
||
|
||
base = np.exp(log_base)
|
||
# original grid: signal covers [0, 1). This range is arbitrary, but convenient.
|
||
# The final sample is positioned at (n-1)/n, so we omit the endpoint
|
||
x = np.linspace(0, 1, num=n, endpoint=False)
|
||
|
||
# build the interpolator
|
||
f_interp = scipy.interpolate.interp1d(x, y, kind=kind, axis=axis)
|
||
|
||
# build the new sampling grid
|
||
# exponentially spaced between t_min/n and 1 (exclusive)
|
||
# we'll go one past where we need, and drop the last sample
|
||
# When over-sampling, the last input sample contributions n_over samples.
|
||
# To keep the spacing consistent, we over-sample by n_over, and then
|
||
# trim the final samples.
|
||
n_over = int(np.ceil(over_sample))
|
||
x_exp = np.logspace(
|
||
(np.log(t_min) - np.log(n)) / log_base,
|
||
0,
|
||
num=n_fmt + n_over,
|
||
endpoint=False,
|
||
base=base,
|
||
)[:-n_over]
|
||
|
||
# Clean up any rounding errors at the boundaries of the interpolation
|
||
# The interpolator gets angry if we try to extrapolate, so clipping is necessary here.
|
||
if x_exp[0] < t_min or x_exp[-1] > float(n - 1.0) / n:
|
||
x_exp = np.clip(x_exp, float(t_min) / n, x[-1])
|
||
|
||
# Make sure that all sample points are unique
|
||
# This should never happen!
|
||
if len(np.unique(x_exp)) != len(x_exp):
|
||
raise ParameterError("Redundant sample positions in Mellin transform")
|
||
|
||
# Resample the signal
|
||
y_res = f_interp(x_exp)
|
||
|
||
# Broadcast the window correctly
|
||
shape = [1] * y_res.ndim
|
||
shape[axis] = -1
|
||
|
||
# Apply the window and fft
|
||
# Normalization is absorbed into the window here for expedience
|
||
fft = get_fftlib()
|
||
result: np.ndarray = fft.rfft(
|
||
y_res * ((x_exp**beta).reshape(shape) * np.sqrt(n) / n_fmt), axis=axis
|
||
)
|
||
return result
|
||
|
||
|
||
@overload
|
||
def pcen(
|
||
S: np.ndarray,
|
||
*,
|
||
sr: float = ...,
|
||
hop_length: int = ...,
|
||
gain: float = ...,
|
||
bias: float = ...,
|
||
power: float = ...,
|
||
time_constant: float = ...,
|
||
eps: float = ...,
|
||
b: Optional[float] = ...,
|
||
max_size: int = ...,
|
||
ref: Optional[np.ndarray] = ...,
|
||
axis: int = ...,
|
||
max_axis: Optional[int] = ...,
|
||
zi: Optional[np.ndarray] = ...,
|
||
return_zf: Literal[False] = ...,
|
||
) -> np.ndarray:
|
||
...
|
||
|
||
|
||
@overload
|
||
def pcen(
|
||
S: np.ndarray,
|
||
*,
|
||
sr: float = ...,
|
||
hop_length: int = ...,
|
||
gain: float = ...,
|
||
bias: float = ...,
|
||
power: float = ...,
|
||
time_constant: float = ...,
|
||
eps: float = ...,
|
||
b: Optional[float] = ...,
|
||
max_size: int = ...,
|
||
ref: Optional[np.ndarray] = ...,
|
||
axis: int = ...,
|
||
max_axis: Optional[int] = ...,
|
||
zi: Optional[np.ndarray] = ...,
|
||
return_zf: Literal[True],
|
||
) -> Tuple[np.ndarray, np.ndarray]:
|
||
...
|
||
|
||
|
||
@overload
|
||
def pcen(
|
||
S: np.ndarray,
|
||
*,
|
||
sr: float = ...,
|
||
hop_length: int = ...,
|
||
gain: float = ...,
|
||
bias: float = ...,
|
||
power: float = ...,
|
||
time_constant: float = ...,
|
||
eps: float = ...,
|
||
b: Optional[float] = ...,
|
||
max_size: int = ...,
|
||
ref: Optional[np.ndarray] = ...,
|
||
axis: int = ...,
|
||
max_axis: Optional[int] = ...,
|
||
zi: Optional[np.ndarray] = ...,
|
||
return_zf: bool = ...,
|
||
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
|
||
...
|
||
|
||
|
||
@cache(level=30)
|
||
def pcen(
|
||
S: np.ndarray,
|
||
*,
|
||
sr: float = 22050,
|
||
hop_length: int = 512,
|
||
gain: float = 0.98,
|
||
bias: float = 2,
|
||
power: float = 0.5,
|
||
time_constant: float = 0.400,
|
||
eps: float = 1e-6,
|
||
b: Optional[float] = None,
|
||
max_size: int = 1,
|
||
ref: Optional[np.ndarray] = None,
|
||
axis: int = -1,
|
||
max_axis: Optional[int] = None,
|
||
zi: Optional[np.ndarray] = None,
|
||
return_zf: bool = False,
|
||
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
|
||
"""Per-channel energy normalization (PCEN)
|
||
|
||
This function normalizes a time-frequency representation ``S`` by
|
||
performing automatic gain control, followed by nonlinear compression [#]_ ::
|
||
|
||
P[f, t] = (S / (eps + M[f, t])**gain + bias)**power - bias**power
|
||
|
||
IMPORTANT: the default values of eps, gain, bias, and power match the
|
||
original publication, in which ``S`` is a 40-band mel-frequency
|
||
spectrogram with 25 ms windowing, 10 ms frame shift, and raw audio values
|
||
in the interval [-2**31; 2**31-1[. If you use these default values, we
|
||
recommend to make sure that the raw audio is properly scaled to this
|
||
interval, and not to [-1, 1[ as is most often the case.
|
||
|
||
The matrix ``M`` is the result of applying a low-pass, temporal IIR filter
|
||
to ``S``::
|
||
|
||
M[f, t] = (1 - b) * M[f, t - 1] + b * S[f, t]
|
||
|
||
If ``b`` is not provided, it is calculated as::
|
||
|
||
b = (sqrt(1 + 4* T**2) - 1) / (2 * T**2)
|
||
|
||
where ``T = time_constant * sr / hop_length``. [#]_
|
||
|
||
This normalization is designed to suppress background noise and
|
||
emphasize foreground signals, and can be used as an alternative to
|
||
decibel scaling (`amplitude_to_db`).
|
||
|
||
This implementation also supports smoothing across frequency bins
|
||
by specifying ``max_size > 1``. If this option is used, the filtered
|
||
spectrogram ``M`` is computed as::
|
||
|
||
M[f, t] = (1 - b) * M[f, t - 1] + b * R[f, t]
|
||
|
||
where ``R`` has been max-filtered along the frequency axis, similar to
|
||
the SuperFlux algorithm implemented in `onset.onset_strength`::
|
||
|
||
R[f, t] = max(S[f - max_size//2: f + max_size//2, t])
|
||
|
||
This can be used to perform automatic gain control on signals that cross
|
||
or span multiple frequency bans, which may be desirable for spectrograms
|
||
with high frequency resolution.
|
||
|
||
.. [#] Wang, Y., Getreuer, P., Hughes, T., Lyon, R. F., & Saurous, R. A.
|
||
(2017, March). Trainable frontend for robust and far-field keyword spotting.
|
||
In Acoustics, Speech and Signal Processing (ICASSP), 2017
|
||
IEEE International Conference on (pp. 5670-5674). IEEE.
|
||
|
||
.. [#] Lostanlen, V., Salamon, J., McFee, B., Cartwright, M., Farnsworth, A.,
|
||
Kelling, S., and Bello, J. P. Per-Channel Energy Normalization: Why and How.
|
||
IEEE Signal Processing Letters, 26(1), 39-43.
|
||
|
||
Parameters
|
||
----------
|
||
S : np.ndarray (non-negative)
|
||
The input (magnitude) spectrogram
|
||
|
||
sr : number > 0 [scalar]
|
||
The audio sampling rate
|
||
|
||
hop_length : int > 0 [scalar]
|
||
The hop length of ``S``, expressed in samples
|
||
|
||
gain : number >= 0 [scalar]
|
||
The gain factor. Typical values should be slightly less than 1.
|
||
|
||
bias : number >= 0 [scalar]
|
||
The bias point of the nonlinear compression (default: 2)
|
||
|
||
power : number >= 0 [scalar]
|
||
The compression exponent. Typical values should be between 0 and 0.5.
|
||
Smaller values of ``power`` result in stronger compression.
|
||
At the limit ``power=0``, polynomial compression becomes logarithmic.
|
||
|
||
time_constant : number > 0 [scalar]
|
||
The time constant for IIR filtering, measured in seconds.
|
||
|
||
eps : number > 0 [scalar]
|
||
A small constant used to ensure numerical stability of the filter.
|
||
|
||
b : number in [0, 1] [scalar]
|
||
The filter coefficient for the low-pass filter.
|
||
If not provided, it will be inferred from ``time_constant``.
|
||
|
||
max_size : int > 0 [scalar]
|
||
The width of the max filter applied to the frequency axis.
|
||
If left as `1`, no filtering is performed.
|
||
|
||
ref : None or np.ndarray (shape=S.shape)
|
||
An optional pre-computed reference spectrum (``R`` in the above).
|
||
If not provided it will be computed from ``S``.
|
||
|
||
axis : int [scalar]
|
||
The (time) axis of the input spectrogram.
|
||
|
||
max_axis : None or int [scalar]
|
||
The frequency axis of the input spectrogram.
|
||
If `None`, and ``S`` is two-dimensional, it will be inferred
|
||
as the opposite from ``axis``.
|
||
If ``S`` is not two-dimensional, and ``max_size > 1``, an error
|
||
will be raised.
|
||
|
||
zi : np.ndarray
|
||
The initial filter delay values.
|
||
|
||
This may be the ``zf`` (final delay values) of a previous call to ``pcen``, or
|
||
computed by `scipy.signal.lfilter_zi`.
|
||
|
||
return_zf : bool
|
||
If ``True``, return the final filter delay values along with the PCEN output ``P``.
|
||
This is primarily useful in streaming contexts, where the final state of one
|
||
block of processing should be used to initialize the next block.
|
||
|
||
If ``False`` (default) only the PCEN values ``P`` are returned.
|
||
|
||
Returns
|
||
-------
|
||
P : np.ndarray, non-negative [shape=(n, m)]
|
||
The per-channel energy normalized version of ``S``.
|
||
zf : np.ndarray (optional)
|
||
The final filter delay values. Only returned if ``return_zf=True``.
|
||
|
||
See Also
|
||
--------
|
||
amplitude_to_db
|
||
librosa.onset.onset_strength
|
||
|
||
Examples
|
||
--------
|
||
Compare PCEN to log amplitude (dB) scaling on Mel spectra
|
||
|
||
>>> import matplotlib.pyplot as plt
|
||
>>> y, sr = librosa.load(librosa.ex('robin'))
|
||
|
||
>>> # We recommend scaling y to the range [-2**31, 2**31[ before applying
|
||
>>> # PCEN's default parameters. Furthermore, we use power=1 to get a
|
||
>>> # magnitude spectrum instead of a power spectrum.
|
||
>>> S = librosa.feature.melspectrogram(y=y, sr=sr, power=1)
|
||
>>> log_S = librosa.amplitude_to_db(S, ref=np.max)
|
||
>>> pcen_S = librosa.pcen(S * (2**31))
|
||
>>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
|
||
>>> img = librosa.display.specshow(log_S, x_axis='time', y_axis='mel', ax=ax[0])
|
||
>>> ax[0].set(title='log amplitude (dB)', xlabel=None)
|
||
>>> ax[0].label_outer()
|
||
>>> imgpcen = librosa.display.specshow(pcen_S, x_axis='time', y_axis='mel', ax=ax[1])
|
||
>>> ax[1].set(title='Per-channel energy normalization')
|
||
>>> fig.colorbar(img, ax=ax[0], format="%+2.0f dB")
|
||
>>> fig.colorbar(imgpcen, ax=ax[1])
|
||
|
||
Compare PCEN with and without max-filtering
|
||
|
||
>>> pcen_max = librosa.pcen(S * (2**31), max_size=3)
|
||
>>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
|
||
>>> librosa.display.specshow(pcen_S, x_axis='time', y_axis='mel', ax=ax[0])
|
||
>>> ax[0].set(title='Per-channel energy normalization (no max-filter)')
|
||
>>> ax[0].label_outer()
|
||
>>> img = librosa.display.specshow(pcen_max, x_axis='time', y_axis='mel', ax=ax[1])
|
||
>>> ax[1].set(title='Per-channel energy normalization (max_size=3)')
|
||
>>> fig.colorbar(img, ax=ax)
|
||
"""
|
||
if power < 0:
|
||
raise ParameterError(f"power={power} must be nonnegative")
|
||
|
||
if gain < 0:
|
||
raise ParameterError(f"gain={gain} must be non-negative")
|
||
|
||
if bias < 0:
|
||
raise ParameterError(f"bias={bias} must be non-negative")
|
||
|
||
if eps <= 0:
|
||
raise ParameterError(f"eps={eps} must be strictly positive")
|
||
|
||
if time_constant <= 0:
|
||
raise ParameterError(f"time_constant={time_constant} must be strictly positive")
|
||
|
||
if not util.is_positive_int(max_size):
|
||
raise ParameterError(f"max_size={max_size} must be a positive integer")
|
||
|
||
if b is None:
|
||
t_frames = time_constant * sr / float(hop_length)
|
||
# By default, this solves the equation for b:
|
||
# b**2 + (1 - b) / t_frames - 2 = 0
|
||
# which approximates the full-width half-max of the
|
||
# squared frequency response of the IIR low-pass filter
|
||
|
||
b = (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2)
|
||
|
||
if not 0 <= b <= 1:
|
||
raise ParameterError(f"b={b} must be between 0 and 1")
|
||
|
||
if np.issubdtype(S.dtype, np.complexfloating):
|
||
warnings.warn(
|
||
"pcen was called on complex input so phase "
|
||
"information will be discarded. To suppress this warning, "
|
||
"call pcen(np.abs(D)) instead.",
|
||
stacklevel=2,
|
||
)
|
||
S = np.abs(S)
|
||
|
||
if ref is None:
|
||
if max_size == 1:
|
||
ref = S
|
||
elif S.ndim == 1:
|
||
raise ParameterError(
|
||
"Max-filtering cannot be applied to 1-dimensional input"
|
||
)
|
||
else:
|
||
if max_axis is None:
|
||
if S.ndim != 2:
|
||
raise ParameterError(
|
||
f"Max-filtering a {S.ndim:d}-dimensional spectrogram "
|
||
"requires you to specify max_axis"
|
||
)
|
||
# if axis = 0, max_axis=1
|
||
# if axis = +- 1, max_axis = 0
|
||
max_axis = np.mod(1 - axis, 2)
|
||
|
||
ref = scipy.ndimage.maximum_filter1d(S, max_size, axis=max_axis)
|
||
|
||
if zi is None:
|
||
# Make sure zi matches dimension to input
|
||
shape = tuple([1] * ref.ndim)
|
||
zi = np.empty(shape)
|
||
zi[:] = scipy.signal.lfilter_zi([b], [1, b - 1])[:]
|
||
|
||
# Temporal integration
|
||
S_smooth: np.ndarray
|
||
zf: np.ndarray
|
||
S_smooth, zf = scipy.signal.lfilter([b], [1, b - 1], ref, zi=zi, axis=axis)
|
||
|
||
# Adaptive gain control
|
||
# Working in log-space gives us some stability, and a slight speedup
|
||
smooth = np.exp(-gain * (np.log(eps) + np.log1p(S_smooth / eps)))
|
||
|
||
# Dynamic range compression
|
||
S_out: np.ndarray
|
||
if power == 0:
|
||
S_out = np.log1p(S * smooth)
|
||
elif bias == 0:
|
||
S_out = np.exp(power * (np.log(S) + np.log(smooth)))
|
||
else:
|
||
S_out = (bias**power) * np.expm1(power * np.log1p(S * smooth / bias))
|
||
|
||
if return_zf:
|
||
return S_out, zf
|
||
else:
|
||
return S_out
|
||
|
||
|
||
def griffinlim(
|
||
S: np.ndarray,
|
||
*,
|
||
n_iter: int = 32,
|
||
hop_length: Optional[int] = None,
|
||
win_length: Optional[int] = None,
|
||
n_fft: Optional[int] = None,
|
||
window: _WindowSpec = "hann",
|
||
center: bool = True,
|
||
dtype: Optional[DTypeLike] = None,
|
||
length: Optional[int] = None,
|
||
pad_mode: _PadModeSTFT = "constant",
|
||
momentum: float = 0.99,
|
||
init: Optional[str] = "random",
|
||
random_state: Optional[
|
||
Union[int, np.random.RandomState, np.random.Generator]
|
||
] = None,
|
||
) -> np.ndarray:
|
||
"""Approximate magnitude spectrogram inversion using the "fast" Griffin-Lim algorithm.
|
||
|
||
Given a short-time Fourier transform magnitude matrix (``S``), the algorithm randomly
|
||
initializes phase estimates, and then alternates forward- and inverse-STFT
|
||
operations. [#]_
|
||
|
||
Note that this assumes reconstruction of a real-valued time-domain signal, and
|
||
that ``S`` contains only the non-negative frequencies (as computed by
|
||
`stft`).
|
||
|
||
The "fast" GL method [#]_ uses a momentum parameter to accelerate convergence.
|
||
|
||
.. [#] D. W. Griffin and J. S. Lim,
|
||
"Signal estimation from modified short-time Fourier transform,"
|
||
IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.
|
||
|
||
.. [#] Perraudin, N., Balazs, P., & Søndergaard, P. L.
|
||
"A fast Griffin-Lim algorithm,"
|
||
IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4),
|
||
Oct. 2013.
|
||
|
||
Parameters
|
||
----------
|
||
S : np.ndarray [shape=(..., n_fft // 2 + 1, t), non-negative]
|
||
An array of short-time Fourier transform magnitudes as produced by
|
||
`stft`.
|
||
|
||
n_iter : int > 0
|
||
The number of iterations to run
|
||
|
||
hop_length : None or int > 0
|
||
The hop length of the STFT. If not provided, it will default to ``n_fft // 4``
|
||
|
||
win_length : None or int > 0
|
||
The window length of the STFT. By default, it will equal ``n_fft``
|
||
|
||
n_fft : None or int > 0
|
||
The number of samples per frame.
|
||
By default, this will be inferred from the shape of ``S`` as an even number.
|
||
However, if an odd frame length was used, you can explicitly set ``n_fft``.
|
||
|
||
window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
|
||
A window specification as supported by `stft` or `istft`
|
||
|
||
center : boolean
|
||
If ``True``, the STFT is assumed to use centered frames.
|
||
If ``False``, the STFT is assumed to use left-aligned frames.
|
||
|
||
dtype : np.dtype
|
||
Real numeric type for the time-domain signal. Default is inferred
|
||
to match the precision of the input spectrogram.
|
||
|
||
length : None or int > 0
|
||
If provided, the output ``y`` is zero-padded or clipped to exactly ``length``
|
||
samples.
|
||
|
||
pad_mode : string
|
||
If ``center=True``, the padding mode to use at the edges of the signal.
|
||
By default, STFT uses zero padding.
|
||
|
||
momentum : number >= 0
|
||
The momentum parameter for fast Griffin-Lim.
|
||
Setting this to 0 recovers the original Griffin-Lim method [1]_.
|
||
Values near 1 can lead to faster convergence, but above 1 may not converge.
|
||
|
||
init : None or 'random' [default]
|
||
If 'random' (the default), then phase values are initialized randomly
|
||
according to ``random_state``. This is recommended when the input ``S`` is
|
||
a magnitude spectrogram with no initial phase estimates.
|
||
|
||
If `None`, then the phase is initialized from ``S``. This is useful when
|
||
an initial guess for phase can be provided, or when you want to resume
|
||
Griffin-Lim from a previous output.
|
||
|
||
random_state : None, int, np.random.RandomState, or np.random.Generator
|
||
If int, random_state is the seed used by the random number generator
|
||
for phase initialization.
|
||
|
||
If `np.random.RandomState` or `np.random.Generator` instance, the random number
|
||
generator itself.
|
||
|
||
If `None`, defaults to the `np.random.default_rng()` object.
|
||
|
||
Returns
|
||
-------
|
||
y : np.ndarray [shape=(..., n)]
|
||
time-domain signal reconstructed from ``S``
|
||
|
||
See Also
|
||
--------
|
||
stft
|
||
istft
|
||
magphase
|
||
filters.get_window
|
||
|
||
Examples
|
||
--------
|
||
A basic STFT inverse example
|
||
|
||
>>> y, sr = librosa.load(librosa.ex('trumpet'))
|
||
>>> # Get the magnitude spectrogram
|
||
>>> S = np.abs(librosa.stft(y))
|
||
>>> # Invert using Griffin-Lim
|
||
>>> y_inv = librosa.griffinlim(S)
|
||
>>> # Invert without estimating phase
|
||
>>> y_istft = librosa.istft(S)
|
||
|
||
Wave-plot the results
|
||
|
||
>>> import matplotlib.pyplot as plt
|
||
>>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
|
||
>>> librosa.display.waveshow(y, sr=sr, color='b', ax=ax[0])
|
||
>>> ax[0].set(title='Original', xlabel=None)
|
||
>>> ax[0].label_outer()
|
||
>>> librosa.display.waveshow(y_inv, sr=sr, color='g', ax=ax[1])
|
||
>>> ax[1].set(title='Griffin-Lim reconstruction', xlabel=None)
|
||
>>> ax[1].label_outer()
|
||
>>> librosa.display.waveshow(y_istft, sr=sr, color='r', ax=ax[2])
|
||
>>> ax[2].set_title('Magnitude-only istft reconstruction')
|
||
"""
|
||
if random_state is None:
|
||
rng = np.random.default_rng()
|
||
elif isinstance(random_state, int):
|
||
rng = np.random.RandomState(seed=random_state) # type: ignore
|
||
elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
|
||
rng = random_state # type: ignore
|
||
else:
|
||
raise ParameterError(f"Unsupported random_state={random_state!r}")
|
||
|
||
if momentum > 1:
|
||
warnings.warn(
|
||
f"Griffin-Lim with momentum={momentum} > 1 can be unstable. "
|
||
"Proceed with caution!",
|
||
stacklevel=2,
|
||
)
|
||
elif momentum < 0:
|
||
raise ParameterError(f"griffinlim() called with momentum={momentum} < 0")
|
||
|
||
# Infer n_fft from the spectrogram shape
|
||
if n_fft is None:
|
||
n_fft = 2 * (S.shape[-2] - 1)
|
||
|
||
# Infer the dtype from S
|
||
angles = np.empty(S.shape, dtype=util.dtype_r2c(S.dtype))
|
||
eps = util.tiny(angles)
|
||
|
||
if init == "random":
|
||
# randomly initialize the phase
|
||
angles[:] = util.phasor((2 * np.pi * rng.random(size=S.shape)))
|
||
elif init is None:
|
||
# Initialize an all ones complex matrix
|
||
angles[:] = 1.0
|
||
else:
|
||
raise ParameterError(f"init={init} must either None or 'random'")
|
||
|
||
# Place-holders for temporary data and reconstructed buffer
|
||
rebuilt = None
|
||
tprev = None
|
||
inverse = None
|
||
|
||
# Absorb magnitudes into angles
|
||
angles *= S
|
||
for _ in range(n_iter):
|
||
# Invert with our current estimate of the phases
|
||
inverse = istft(
|
||
angles,
|
||
hop_length=hop_length,
|
||
win_length=win_length,
|
||
n_fft=n_fft,
|
||
window=window,
|
||
center=center,
|
||
dtype=dtype,
|
||
length=length,
|
||
out=inverse,
|
||
)
|
||
|
||
# Rebuild the spectrogram
|
||
rebuilt = stft(
|
||
inverse,
|
||
n_fft=n_fft,
|
||
hop_length=hop_length,
|
||
win_length=win_length,
|
||
window=window,
|
||
center=center,
|
||
pad_mode=pad_mode,
|
||
out=rebuilt,
|
||
)
|
||
|
||
# Update our phase estimates
|
||
angles[:] = rebuilt
|
||
if tprev is not None:
|
||
angles -= (momentum / (1 + momentum)) * tprev
|
||
angles /= np.abs(angles) + eps
|
||
angles *= S
|
||
# Store
|
||
rebuilt, tprev = tprev, rebuilt
|
||
|
||
# Return the final phase estimates
|
||
return istft(
|
||
angles,
|
||
hop_length=hop_length,
|
||
win_length=win_length,
|
||
n_fft=n_fft,
|
||
window=window,
|
||
center=center,
|
||
dtype=dtype,
|
||
length=length,
|
||
out=inverse,
|
||
)
|
||
|
||
|
||
def _spectrogram(
|
||
*,
|
||
y: Optional[np.ndarray] = None,
|
||
S: Optional[np.ndarray] = None,
|
||
n_fft: Optional[int] = 2048,
|
||
hop_length: Optional[int] = 512,
|
||
power: float = 1,
|
||
win_length: Optional[int] = None,
|
||
window: _WindowSpec = "hann",
|
||
center: bool = True,
|
||
pad_mode: _PadModeSTFT = "constant",
|
||
) -> Tuple[np.ndarray, int]:
|
||
"""Retrieve a magnitude spectrogram.
|
||
|
||
This is primarily used in feature extraction functions that can operate on
|
||
either audio time-series or spectrogram input.
|
||
|
||
Parameters
|
||
----------
|
||
y : None or np.ndarray
|
||
If provided, an audio time series
|
||
|
||
S : None or np.ndarray
|
||
Spectrogram input, optional
|
||
|
||
n_fft : int > 0
|
||
STFT window size
|
||
|
||
hop_length : int > 0
|
||
STFT hop length
|
||
|
||
power : float > 0
|
||
Exponent for the magnitude spectrogram,
|
||
e.g., 1 for energy, 2 for power, etc.
|
||
|
||
win_length : int <= n_fft [scalar]
|
||
Each frame of audio is windowed by ``window``.
|
||
The window will be of length ``win_length`` and then padded
|
||
with zeros to match ``n_fft``.
|
||
|
||
If unspecified, defaults to ``win_length = n_fft``.
|
||
|
||
window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
|
||
- a window specification (string, tuple, or number);
|
||
see `scipy.signal.get_window`
|
||
- a window function, such as `scipy.signal.windows.hann`
|
||
- a vector or array of length ``n_fft``
|
||
|
||
.. see also:: `filters.get_window`
|
||
|
||
center : boolean
|
||
- If ``True``, the signal ``y`` is padded so that frame
|
||
``t`` is centered at ``y[t * hop_length]``.
|
||
- If ``False``, then frame ``t`` begins at ``y[t * hop_length]``
|
||
|
||
pad_mode : string
|
||
If ``center=True``, the padding mode to use at the edges of the signal.
|
||
By default, STFT uses zero padding.
|
||
|
||
Returns
|
||
-------
|
||
S_out : np.ndarray [dtype=np.float]
|
||
- If ``S`` is provided as input, then ``S_out == S``
|
||
- Else, ``S_out = |stft(y, ...)|**power``
|
||
n_fft : int > 0
|
||
- If ``S`` is provided, then ``n_fft`` is inferred from ``S``
|
||
- Else, copied from input
|
||
"""
|
||
if S is not None:
|
||
# Infer n_fft from spectrogram shape, but only if it mismatches
|
||
if n_fft is None or n_fft // 2 + 1 != S.shape[-2]:
|
||
n_fft = 2 * (S.shape[-2] - 1)
|
||
else:
|
||
# Otherwise, compute a magnitude spectrogram from input
|
||
if n_fft is None:
|
||
raise ParameterError(f"Unable to compute spectrogram with n_fft={n_fft}")
|
||
if y is None:
|
||
raise ParameterError(
|
||
"Input signal must be provided to compute a spectrogram"
|
||
)
|
||
S = (
|
||
np.abs(
|
||
stft(
|
||
y,
|
||
n_fft=n_fft,
|
||
hop_length=hop_length,
|
||
win_length=win_length,
|
||
center=center,
|
||
window=window,
|
||
pad_mode=pad_mode,
|
||
)
|
||
)
|
||
** power
|
||
)
|
||
|
||
return S, n_fft
|