ai-content-maker/.venv/Lib/site-packages/librosa/core/constantq.py

1502 lines
44 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Constant-Q transforms"""
import warnings
import numpy as np
from numba import jit
from . import audio
from .intervals import interval_frequencies
from .fft import get_fftlib
from .convert import cqt_frequencies, note_to_hz
from .spectrum import stft, istft
from .pitch import estimate_tuning
from .._cache import cache
from .. import filters
from .. import util
from ..util.exceptions import ParameterError
from numpy.typing import DTypeLike
from typing import Optional, Union, Collection, List
from .._typing import _WindowSpec, _PadMode, _FloatLike_co, _ensure_not_reachable
__all__ = ["cqt", "hybrid_cqt", "pseudo_cqt", "icqt", "griffinlim_cqt", "vqt"]
# TODO: ivqt, griffinlim_vqt
@cache(level=20)
def cqt(
y: np.ndarray,
*,
sr: float = 22050,
hop_length: int = 512,
fmin: Optional[_FloatLike_co] = None,
n_bins: int = 84,
bins_per_octave: int = 12,
tuning: Optional[float] = 0.0,
filter_scale: float = 1,
norm: Optional[float] = 1,
sparsity: float = 0.01,
window: _WindowSpec = "hann",
scale: bool = True,
pad_mode: _PadMode = "constant",
res_type: Optional[str] = "soxr_hq",
dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
"""Compute the constant-Q transform of an audio signal.
This implementation is based on the recursive sub-sampling method
described by [#]_.
.. [#] Schoerkhuber, Christian, and Anssi Klapuri.
"Constant-Q transform toolbox for music processing."
7th Sound and Music Computing Conference, Barcelona, Spain. 2010.
Parameters
----------
y : np.ndarray [shape=(..., n)]
audio time series. Multi-channel is supported.
sr : number > 0 [scalar]
sampling rate of ``y``
hop_length : int > 0 [scalar]
number of samples between successive CQT columns.
fmin : float > 0 [scalar]
Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
n_bins : int > 0 [scalar]
Number of frequency bins, starting at ``fmin``
bins_per_octave : int > 0 [scalar]
Number of bins per octave
tuning : None or float
Tuning offset in fractions of a bin.
If ``None``, tuning will be automatically estimated from the signal.
The minimum frequency of the resulting CQT will be modified to
``fmin * 2**(tuning / bins_per_octave)``.
filter_scale : float > 0
Filter scale factor. Small values (<1) use shorter windows
for improved time resolution.
norm : {inf, -inf, 0, float > 0}
Type of norm to use for basis function normalization.
See `librosa.util.normalize`.
sparsity : float in [0, 1)
Sparsify the CQT basis by discarding up to ``sparsity``
fraction of the energy in each basis.
Set ``sparsity=0`` to disable sparsification.
window : str, tuple, number, or function
Window specification for the basis filters.
See `filters.get_window` for details.
scale : bool
If ``True``, scale the CQT response by square-root the length of
each channel's filter. This is analogous to ``norm='ortho'`` in FFT.
If ``False``, do not scale the CQT. This is analogous to
``norm=None`` in FFT.
pad_mode : string
Padding mode for centered frame analysis.
See also: `librosa.stft` and `numpy.pad`.
res_type : string
The resampling mode for recursive downsampling.
dtype : np.dtype
The (complex) data type of the output array. By default, this is inferred to match
the numerical precision of the input signal.
Returns
-------
CQT : np.ndarray [shape=(..., n_bins, t)]
Constant-Q value each frequency at each time.
See Also
--------
vqt
librosa.resample
librosa.util.normalize
Notes
-----
This function caches at level 20.
Examples
--------
Generate and plot a constant-Q power spectrum
>>> import matplotlib.pyplot as plt
>>> y, sr = librosa.load(librosa.ex('trumpet'))
>>> C = np.abs(librosa.cqt(y, sr=sr))
>>> fig, ax = plt.subplots()
>>> img = librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
... sr=sr, x_axis='time', y_axis='cqt_note', ax=ax)
>>> ax.set_title('Constant-Q power spectrum')
>>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
Limit the frequency range
>>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
... n_bins=60))
>>> C
array([[6.830e-04, 6.361e-04, ..., 7.362e-09, 9.102e-09],
[5.366e-04, 4.818e-04, ..., 8.953e-09, 1.067e-08],
...,
[4.288e-02, 4.580e-01, ..., 1.529e-05, 5.572e-06],
[2.965e-03, 1.508e-01, ..., 8.965e-06, 1.455e-05]])
Using a higher frequency resolution
>>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
... n_bins=60 * 2, bins_per_octave=12 * 2))
>>> C
array([[5.468e-04, 5.382e-04, ..., 5.911e-09, 6.105e-09],
[4.118e-04, 4.014e-04, ..., 7.788e-09, 8.160e-09],
...,
[2.780e-03, 1.424e-01, ..., 4.225e-06, 2.388e-05],
[5.147e-02, 6.959e-02, ..., 1.694e-05, 5.811e-06]])
"""
# CQT is the special case of VQT with gamma=0
return vqt(
y=y,
sr=sr,
hop_length=hop_length,
fmin=fmin,
n_bins=n_bins,
intervals="equal",
gamma=0,
bins_per_octave=bins_per_octave,
tuning=tuning,
filter_scale=filter_scale,
norm=norm,
sparsity=sparsity,
window=window,
scale=scale,
pad_mode=pad_mode,
res_type=res_type,
dtype=dtype,
)
@cache(level=20)
def hybrid_cqt(
y: np.ndarray,
*,
sr: float = 22050,
hop_length: int = 512,
fmin: Optional[_FloatLike_co] = None,
n_bins: int = 84,
bins_per_octave: int = 12,
tuning: Optional[float] = 0.0,
filter_scale: float = 1,
norm: Optional[float] = 1,
sparsity: float = 0.01,
window: _WindowSpec = "hann",
scale: bool = True,
pad_mode: _PadMode = "constant",
res_type: str = "soxr_hq",
dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
"""Compute the hybrid constant-Q transform of an audio signal.
Here, the hybrid CQT uses the pseudo CQT for higher frequencies where
the hop_length is longer than half the filter length and the full CQT
for lower frequencies.
Parameters
----------
y : np.ndarray [shape=(..., n)]
audio time series. Multi-channel is supported.
sr : number > 0 [scalar]
sampling rate of ``y``
hop_length : int > 0 [scalar]
number of samples between successive CQT columns.
fmin : float > 0 [scalar]
Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
n_bins : int > 0 [scalar]
Number of frequency bins, starting at ``fmin``
bins_per_octave : int > 0 [scalar]
Number of bins per octave
tuning : None or float
Tuning offset in fractions of a bin.
If ``None``, tuning will be automatically estimated from the signal.
The minimum frequency of the resulting CQT will be modified to
``fmin * 2**(tuning / bins_per_octave)``.
filter_scale : float > 0
Filter filter_scale factor. Larger values use longer windows.
norm : {inf, -inf, 0, float > 0}
Type of norm to use for basis function normalization.
See `librosa.util.normalize`.
sparsity : float in [0, 1)
Sparsify the CQT basis by discarding up to ``sparsity``
fraction of the energy in each basis.
Set ``sparsity=0`` to disable sparsification.
window : str, tuple, number, or function
Window specification for the basis filters.
See `filters.get_window` for details.
scale : bool
If ``True``, scale the CQT response by square-root the length of
each channel's filter. This is analogous to ``norm='ortho'`` in FFT.
If ``False``, do not scale the CQT. This is analogous to
``norm=None`` in FFT.
pad_mode : string
Padding mode for centered frame analysis.
See also: `librosa.stft` and `numpy.pad`.
res_type : string
Resampling mode. See `librosa.cqt` for details.
dtype : np.dtype, optional
The complex dtype to use for computing the CQT.
By default, this is inferred to match the precision of
the input signal.
Returns
-------
CQT : np.ndarray [shape=(..., n_bins, t), dtype=np.float]
Constant-Q energy for each frequency at each time.
See Also
--------
cqt
pseudo_cqt
Notes
-----
This function caches at level 20.
"""
if fmin is None:
# C1 by default
fmin = note_to_hz("C1")
if tuning is None:
tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)
# Apply tuning correction
fmin = fmin * 2.0 ** (tuning / bins_per_octave)
# Get all CQT frequencies
freqs = cqt_frequencies(n_bins, fmin=fmin, bins_per_octave=bins_per_octave)
# Pre-compute alpha
if n_bins == 1:
alpha = __et_relative_bw(bins_per_octave)
else:
alpha = filters._relative_bandwidth(freqs=freqs)
# Compute the length of each constant-Q basis function
lengths, _ = filters.wavelet_lengths(
freqs=freqs, sr=sr, filter_scale=filter_scale, window=window, alpha=alpha
)
# Determine which filters to use with Pseudo CQT
# These are the ones that fit within 2 hop lengths after padding
pseudo_filters = 2.0 ** np.ceil(np.log2(lengths)) < 2 * hop_length
n_bins_pseudo = int(np.sum(pseudo_filters))
n_bins_full = n_bins - n_bins_pseudo
cqt_resp = []
if n_bins_pseudo > 0:
fmin_pseudo = np.min(freqs[pseudo_filters])
cqt_resp.append(
pseudo_cqt(
y,
sr=sr,
hop_length=hop_length,
fmin=fmin_pseudo,
n_bins=n_bins_pseudo,
bins_per_octave=bins_per_octave,
filter_scale=filter_scale,
norm=norm,
sparsity=sparsity,
window=window,
scale=scale,
pad_mode=pad_mode,
dtype=dtype,
)
)
if n_bins_full > 0:
cqt_resp.append(
np.abs(
cqt(
y,
sr=sr,
hop_length=hop_length,
fmin=fmin,
n_bins=n_bins_full,
bins_per_octave=bins_per_octave,
filter_scale=filter_scale,
norm=norm,
sparsity=sparsity,
window=window,
scale=scale,
pad_mode=pad_mode,
res_type=res_type,
dtype=dtype,
)
)
)
# Propagate dtype from the last component
return __trim_stack(cqt_resp, n_bins, cqt_resp[-1].dtype)
@cache(level=20)
def pseudo_cqt(
y: np.ndarray,
*,
sr: float = 22050,
hop_length: int = 512,
fmin: Optional[_FloatLike_co] = None,
n_bins: int = 84,
bins_per_octave: int = 12,
tuning: Optional[float] = 0.0,
filter_scale: float = 1,
norm: Optional[float] = 1,
sparsity: float = 0.01,
window: _WindowSpec = "hann",
scale: bool = True,
pad_mode: _PadMode = "constant",
dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
"""Compute the pseudo constant-Q transform of an audio signal.
This uses a single fft size that is the smallest power of 2 that is greater
than or equal to the max of:
1. The longest CQT filter
2. 2x the hop_length
Parameters
----------
y : np.ndarray [shape=(..., n)]
audio time series. Multi-channel is supported.
sr : number > 0 [scalar]
sampling rate of ``y``
hop_length : int > 0 [scalar]
number of samples between successive CQT columns.
fmin : float > 0 [scalar]
Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
n_bins : int > 0 [scalar]
Number of frequency bins, starting at ``fmin``
bins_per_octave : int > 0 [scalar]
Number of bins per octave
tuning : None or float
Tuning offset in fractions of a bin.
If ``None``, tuning will be automatically estimated from the signal.
The minimum frequency of the resulting CQT will be modified to
``fmin * 2**(tuning / bins_per_octave)``.
filter_scale : float > 0
Filter filter_scale factor. Larger values use longer windows.
norm : {inf, -inf, 0, float > 0}
Type of norm to use for basis function normalization.
See `librosa.util.normalize`.
sparsity : float in [0, 1)
Sparsify the CQT basis by discarding up to ``sparsity``
fraction of the energy in each basis.
Set ``sparsity=0`` to disable sparsification.
window : str, tuple, number, or function
Window specification for the basis filters.
See `filters.get_window` for details.
scale : bool
If ``True``, scale the CQT response by square-root the length of
each channel's filter. This is analogous to ``norm='ortho'`` in FFT.
If ``False``, do not scale the CQT. This is analogous to
``norm=None`` in FFT.
pad_mode : string
Padding mode for centered frame analysis.
See also: `librosa.stft` and `numpy.pad`.
dtype : np.dtype, optional
The complex data type for CQT calculations.
By default, this is inferred to match the precision of the input signal.
Returns
-------
CQT : np.ndarray [shape=(..., n_bins, t), dtype=np.float]
Pseudo Constant-Q energy for each frequency at each time.
Notes
-----
This function caches at level 20.
"""
if fmin is None:
# C1 by default
fmin = note_to_hz("C1")
if tuning is None:
tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)
if dtype is None:
dtype = util.dtype_r2c(y.dtype)
# Apply tuning correction
fmin = fmin * 2.0 ** (tuning / bins_per_octave)
freqs = cqt_frequencies(fmin=fmin, n_bins=n_bins, bins_per_octave=bins_per_octave)
if n_bins == 1:
alpha = __et_relative_bw(bins_per_octave)
else:
alpha = filters._relative_bandwidth(freqs=freqs)
lengths, _ = filters.wavelet_lengths(
freqs=freqs, sr=sr, window=window, filter_scale=filter_scale, alpha=alpha
)
fft_basis, n_fft, _ = __vqt_filter_fft(
sr,
freqs,
filter_scale,
norm,
sparsity,
hop_length=hop_length,
window=window,
dtype=dtype,
alpha=alpha,
)
fft_basis = np.abs(fft_basis)
# Compute the magnitude-only CQT response
C: np.ndarray = __cqt_response(
y,
n_fft,
hop_length,
fft_basis,
pad_mode,
window="hann",
dtype=dtype,
phase=False,
)
if scale:
C /= np.sqrt(n_fft)
else:
# reshape lengths to match dimension properly
lengths = util.expand_to(lengths, ndim=C.ndim, axes=-2)
C *= np.sqrt(lengths / n_fft)
return C
@cache(level=40)
def icqt(
C: np.ndarray,
*,
sr: float = 22050,
hop_length: int = 512,
fmin: Optional[_FloatLike_co] = None,
bins_per_octave: int = 12,
tuning: float = 0.0,
filter_scale: float = 1,
norm: Optional[float] = 1,
sparsity: float = 0.01,
window: _WindowSpec = "hann",
scale: bool = True,
length: Optional[int] = None,
res_type: str = "soxr_hq",
dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
"""Compute the inverse constant-Q transform.
Given a constant-Q transform representation ``C`` of an audio signal ``y``,
this function produces an approximation ``y_hat``.
Parameters
----------
C : np.ndarray, [shape=(..., n_bins, n_frames)]
Constant-Q representation as produced by `cqt`
sr : number > 0 [scalar]
sampling rate of the signal
hop_length : int > 0 [scalar]
number of samples between successive frames
fmin : float > 0 [scalar]
Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
bins_per_octave : int > 0 [scalar]
Number of bins per octave
tuning : float [scalar]
Tuning offset in fractions of a bin.
The minimum frequency of the CQT will be modified to
``fmin * 2**(tuning / bins_per_octave)``.
filter_scale : float > 0 [scalar]
Filter scale factor. Small values (<1) use shorter windows
for improved time resolution.
norm : {inf, -inf, 0, float > 0}
Type of norm to use for basis function normalization.
See `librosa.util.normalize`.
sparsity : float in [0, 1)
Sparsify the CQT basis by discarding up to ``sparsity``
fraction of the energy in each basis.
Set ``sparsity=0`` to disable sparsification.
window : str, tuple, number, or function
Window specification for the basis filters.
See `filters.get_window` for details.
scale : bool
If ``True``, scale the CQT response by square-root the length
of each channel's filter. This is analogous to ``norm='ortho'`` in FFT.
If ``False``, do not scale the CQT. This is analogous to ``norm=None``
in FFT.
length : int > 0, optional
If provided, the output ``y`` is zero-padded or clipped to exactly
``length`` samples.
res_type : string
Resampling mode.
See `librosa.resample` for supported modes.
dtype : numeric type
Real numeric type for ``y``. Default is inferred to match the numerical
precision of the input CQT.
Returns
-------
y : np.ndarray, [shape=(..., n_samples), dtype=np.float]
Audio time-series reconstructed from the CQT representation.
See Also
--------
cqt
librosa.resample
Notes
-----
This function caches at level 40.
Examples
--------
Using default parameters
>>> y, sr = librosa.load(librosa.ex('trumpet'))
>>> C = librosa.cqt(y=y, sr=sr)
>>> y_hat = librosa.icqt(C=C, sr=sr)
Or with a different hop length and frequency resolution:
>>> hop_length = 256
>>> bins_per_octave = 12 * 3
>>> C = librosa.cqt(y=y, sr=sr, hop_length=256, n_bins=7*bins_per_octave,
... bins_per_octave=bins_per_octave)
>>> y_hat = librosa.icqt(C=C, sr=sr, hop_length=hop_length,
... bins_per_octave=bins_per_octave)
"""
if fmin is None:
fmin = note_to_hz("C1")
# Apply tuning correction
fmin = fmin * 2.0 ** (tuning / bins_per_octave)
# Get the top octave of frequencies
n_bins = C.shape[-2]
n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
freqs = cqt_frequencies(fmin=fmin, n_bins=n_bins, bins_per_octave=bins_per_octave)
if n_bins == 1:
alpha = __et_relative_bw(bins_per_octave)
else:
alpha = filters._relative_bandwidth(freqs=freqs)
lengths, f_cutoff = filters.wavelet_lengths(
freqs=freqs, sr=sr, window=window, filter_scale=filter_scale, alpha=alpha
)
# Trim the CQT to only what's necessary for reconstruction
if length is not None:
n_frames = int(np.ceil((length + max(lengths)) / hop_length))
C = C[..., :n_frames]
C_scale = np.sqrt(lengths)
# This shape array will be used for broadcasting the basis scale
# we'll have to adapt this per octave within the loop
y: Optional[np.ndarray] = None
# Assume the top octave is at the full rate
srs = [sr]
hops = [hop_length]
for i in range(n_octaves - 1):
if hops[0] % 2 == 0:
# We can downsample:
srs.insert(0, srs[0] * 0.5)
hops.insert(0, hops[0] // 2)
else:
# We're out of downsamplings, carry forward
srs.insert(0, srs[0])
hops.insert(0, hops[0])
for i, (my_sr, my_hop) in enumerate(zip(srs, hops)):
# How many filters are in this octave?
n_filters = min(bins_per_octave, n_bins - bins_per_octave * i)
# Slice out the current octave
sl = slice(bins_per_octave * i, bins_per_octave * i + n_filters)
fft_basis, n_fft, _ = __vqt_filter_fft(
my_sr,
freqs[sl],
filter_scale,
norm,
sparsity,
window=window,
dtype=dtype,
alpha=alpha[sl],
)
# Transpose the basis
inv_basis = fft_basis.H.todense()
# Compute each filter's frequency-domain power
freq_power = 1 / np.sum(util.abs2(np.asarray(inv_basis)), axis=0)
# Compensate for length normalization in the forward transform
freq_power *= n_fft / lengths[sl]
# Inverse-project the basis for each octave
if scale:
# scale=True ==> re-scale by sqrt(lengths)
D_oct = np.einsum(
"fc,c,c,...ct->...ft",
inv_basis,
C_scale[sl],
freq_power,
C[..., sl, :],
optimize=True,
)
else:
D_oct = np.einsum(
"fc,c,...ct->...ft", inv_basis, freq_power, C[..., sl, :], optimize=True
)
y_oct = istft(D_oct, window="ones", hop_length=my_hop, dtype=dtype)
y_oct = audio.resample(
y_oct,
orig_sr=1,
target_sr=sr // my_sr,
res_type=res_type,
scale=False,
fix=False,
)
if y is None:
y = y_oct
else:
y[..., : y_oct.shape[-1]] += y_oct
# make mypy happy
assert y is not None
if length:
y = util.fix_length(y, size=length)
return y
@cache(level=20)
def vqt(
y: np.ndarray,
*,
sr: float = 22050,
hop_length: int = 512,
fmin: Optional[_FloatLike_co] = None,
n_bins: int = 84,
intervals: Union[str, Collection[float]] = "equal",
gamma: Optional[float] = None,
bins_per_octave: int = 12,
tuning: Optional[float] = 0.0,
filter_scale: float = 1,
norm: Optional[float] = 1,
sparsity: float = 0.01,
window: _WindowSpec = "hann",
scale: bool = True,
pad_mode: _PadMode = "constant",
res_type: Optional[str] = "soxr_hq",
dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
"""Compute the variable-Q transform of an audio signal.
This implementation is based on the recursive sub-sampling method
described by [#]_.
.. [#] Schörkhuber, Christian, Anssi Klapuri, Nicki Holighaus, and Monika Dörfler.
"A Matlab toolbox for efficient perfect reconstruction time-frequency
transforms with log-frequency resolution."
In Audio Engineering Society Conference: 53rd International Conference: Semantic Audio.
Audio Engineering Society, 2014.
Parameters
----------
y : np.ndarray [shape=(..., n)]
audio time series. Multi-channel is supported.
sr : number > 0 [scalar]
sampling rate of ``y``
hop_length : int > 0 [scalar]
number of samples between successive VQT columns.
fmin : float > 0 [scalar]
Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
n_bins : int > 0 [scalar]
Number of frequency bins, starting at ``fmin``
intervals : str or array of floats in [1, 2)
Either a string specification for an interval set, e.g.,
`'equal'`, `'pythagorean'`, `'ji3'`, etc. or an array of
intervals expressed as numbers between 1 and 2.
.. see also:: librosa.interval_frequencies
gamma : number > 0 [scalar]
Bandwidth offset for determining filter lengths.
If ``gamma=0``, produces the constant-Q transform.
If 'gamma=None', gamma will be calculated such that filter bandwidths are equal to a
constant fraction of the equivalent rectangular bandwidths (ERB). This is accomplished
by solving for the gamma which gives::
B_k = alpha * f_k + gamma = C * ERB(f_k),
where ``B_k`` is the bandwidth of filter ``k`` with center frequency ``f_k``, alpha
is the inverse of what would be the constant Q-factor, and ``C = alpha / 0.108`` is the
constant fraction across all filters.
Here we use ``ERB(f_k) = 24.7 + 0.108 * f_k``, the best-fit curve derived
from experimental data in [#]_.
.. [#] Glasberg, Brian R., and Brian CJ Moore.
"Derivation of auditory filter shapes from notched-noise data."
Hearing research 47.1-2 (1990): 103-138.
bins_per_octave : int > 0 [scalar]
Number of bins per octave
tuning : None or float
Tuning offset in fractions of a bin.
If ``None``, tuning will be automatically estimated from the signal.
The minimum frequency of the resulting VQT will be modified to
``fmin * 2**(tuning / bins_per_octave)``.
filter_scale : float > 0
Filter scale factor. Small values (<1) use shorter windows
for improved time resolution.
norm : {inf, -inf, 0, float > 0}
Type of norm to use for basis function normalization.
See `librosa.util.normalize`.
sparsity : float in [0, 1)
Sparsify the VQT basis by discarding up to ``sparsity``
fraction of the energy in each basis.
Set ``sparsity=0`` to disable sparsification.
window : str, tuple, number, or function
Window specification for the basis filters.
See `filters.get_window` for details.
scale : bool
If ``True``, scale the VQT response by square-root the length of
each channel's filter. This is analogous to ``norm='ortho'`` in FFT.
If ``False``, do not scale the VQT. This is analogous to
``norm=None`` in FFT.
pad_mode : string
Padding mode for centered frame analysis.
See also: `librosa.stft` and `numpy.pad`.
res_type : string
The resampling mode for recursive downsampling.
dtype : np.dtype
The dtype of the output array. By default, this is inferred to match the
numerical precision of the input signal.
Returns
-------
VQT : np.ndarray [shape=(..., n_bins, t), dtype=np.complex]
Variable-Q value each frequency at each time.
See Also
--------
cqt
Notes
-----
This function caches at level 20.
Examples
--------
Generate and plot a variable-Q power spectrum
>>> import matplotlib.pyplot as plt
>>> y, sr = librosa.load(librosa.ex('choice'), duration=5)
>>> C = np.abs(librosa.cqt(y, sr=sr))
>>> V = np.abs(librosa.vqt(y, sr=sr))
>>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
>>> librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
... sr=sr, x_axis='time', y_axis='cqt_note', ax=ax[0])
>>> ax[0].set(title='Constant-Q power spectrum', xlabel=None)
>>> ax[0].label_outer()
>>> img = librosa.display.specshow(librosa.amplitude_to_db(V, ref=np.max),
... sr=sr, x_axis='time', y_axis='cqt_note', ax=ax[1])
>>> ax[1].set_title('Variable-Q power spectrum')
>>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
"""
# If intervals are provided as an array, override BPO
if not isinstance(intervals, str):
bins_per_octave = len(intervals)
# How many octaves are we dealing with?
n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
n_filters = min(bins_per_octave, n_bins)
if fmin is None:
# C1 by default
fmin = note_to_hz("C1")
if tuning is None:
tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)
if dtype is None:
dtype = util.dtype_r2c(y.dtype)
# Apply tuning correction
fmin = fmin * 2.0 ** (tuning / bins_per_octave)
# First thing, get the freqs of the top octave
freqs = interval_frequencies(
n_bins=n_bins,
fmin=fmin,
intervals=intervals,
bins_per_octave=bins_per_octave,
sort=True,
)
freqs_top = freqs[-bins_per_octave:]
fmax_t: float = np.max(freqs_top)
if n_bins == 1:
alpha = __et_relative_bw(bins_per_octave)
else:
alpha = filters._relative_bandwidth(freqs=freqs)
lengths, filter_cutoff = filters.wavelet_lengths(
freqs=freqs,
sr=sr,
window=window,
filter_scale=filter_scale,
gamma=gamma,
alpha=alpha,
)
# Determine required resampling quality
nyquist = sr / 2.0
if filter_cutoff > nyquist:
raise ParameterError(
f"Wavelet basis with max frequency={fmax_t} would exceed the Nyquist frequency={nyquist}. "
"Try reducing the number of frequency bins."
)
if res_type is None:
warnings.warn(
"Support for VQT with res_type=None is deprecated in librosa 0.10\n"
"and will be removed in version 1.0.",
category=FutureWarning,
stacklevel=2,
)
res_type = "soxr_hq"
y, sr, hop_length = __early_downsample(
y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale
)
vqt_resp = []
# Iterate down the octaves
my_y, my_sr, my_hop = y, sr, hop_length
for i in range(n_octaves):
# Slice out the current octave of filters
if i == 0:
sl = slice(-n_filters, None)
else:
sl = slice(-n_filters * (i + 1), -n_filters * i)
# This may be incorrect with early downsampling
freqs_oct = freqs[sl]
alpha_oct = alpha[sl]
fft_basis, n_fft, _ = __vqt_filter_fft(
my_sr,
freqs_oct,
filter_scale,
norm,
sparsity,
window=window,
gamma=gamma,
dtype=dtype,
alpha=alpha_oct,
)
# Re-scale the filters to compensate for downsampling
fft_basis[:] *= np.sqrt(sr / my_sr)
# Compute the vqt filter response and append to the stack
vqt_resp.append(
__cqt_response(my_y, n_fft, my_hop, fft_basis, pad_mode, dtype=dtype)
)
if my_hop % 2 == 0:
my_hop //= 2
my_sr /= 2.0
my_y = audio.resample(
my_y, orig_sr=2, target_sr=1, res_type=res_type, scale=True
)
V = __trim_stack(vqt_resp, n_bins, dtype)
if scale:
# Recompute lengths here because early downsampling may have changed
# our sampling rate
lengths, _ = filters.wavelet_lengths(
freqs=freqs,
sr=sr,
window=window,
filter_scale=filter_scale,
gamma=gamma,
alpha=alpha,
)
# reshape lengths to match V shape
lengths = util.expand_to(lengths, ndim=V.ndim, axes=-2)
V /= np.sqrt(lengths)
return V
@cache(level=10)
def __vqt_filter_fft(
sr,
freqs,
filter_scale,
norm,
sparsity,
hop_length=None,
window="hann",
gamma=0.0,
dtype=np.complex64,
alpha=None,
):
"""Generate the frequency domain variable-Q filter basis."""
basis, lengths = filters.wavelet(
freqs=freqs,
sr=sr,
filter_scale=filter_scale,
norm=norm,
pad_fft=True,
window=window,
gamma=gamma,
alpha=alpha,
)
# Filters are padded up to the nearest integral power of 2
n_fft = basis.shape[1]
if hop_length is not None and n_fft < 2.0 ** (1 + np.ceil(np.log2(hop_length))):
n_fft = int(2.0 ** (1 + np.ceil(np.log2(hop_length))))
# re-normalize bases with respect to the FFT window length
basis *= lengths[:, np.newaxis] / float(n_fft)
# FFT and retain only the non-negative frequencies
fft = get_fftlib()
fft_basis = fft.fft(basis, n=n_fft, axis=1)[:, : (n_fft // 2) + 1]
# sparsify the basis
fft_basis = util.sparsify_rows(fft_basis, quantile=sparsity, dtype=dtype)
return fft_basis, n_fft, lengths
def __trim_stack(
cqt_resp: List[np.ndarray], n_bins: int, dtype: DTypeLike
) -> np.ndarray:
"""Trim and stack a collection of CQT responses"""
max_col = min(c_i.shape[-1] for c_i in cqt_resp)
# Grab any leading dimensions
shape = list(cqt_resp[0].shape)
shape[-2] = n_bins
shape[-1] = max_col
cqt_out = np.empty(shape, dtype=dtype, order="F")
# Copy per-octave data into output array
end = n_bins
for c_i in cqt_resp:
# By default, take the whole octave
n_oct = c_i.shape[-2]
# If the whole octave is more than we can fit,
# take the highest bins from c_i
if end < n_oct:
cqt_out[..., :end, :] = c_i[..., -end:, :max_col]
else:
cqt_out[..., end - n_oct : end, :] = c_i[..., :max_col]
end -= n_oct
return cqt_out
def __cqt_response(
y, n_fft, hop_length, fft_basis, mode, window="ones", phase=True, dtype=None
):
"""Compute the filter response with a target STFT hop."""
# Compute the STFT matrix
D = stft(
y, n_fft=n_fft, hop_length=hop_length, window=window, pad_mode=mode, dtype=dtype
)
if not phase:
D = np.abs(D)
# Reshape D to Dr
Dr = D.reshape((-1, D.shape[-2], D.shape[-1]))
output_flat = np.empty(
(Dr.shape[0], fft_basis.shape[0], Dr.shape[-1]), dtype=D.dtype
)
# iterate over channels
# project fft_basis.dot(Dr[i])
for i in range(Dr.shape[0]):
output_flat[i] = fft_basis.dot(Dr[i])
# reshape Dr to match D's leading dimensions again
shape = list(D.shape)
shape[-2] = fft_basis.shape[0]
return output_flat.reshape(shape)
def __early_downsample_count(nyquist, filter_cutoff, hop_length, n_octaves):
"""Compute the number of early downsampling operations"""
downsample_count1 = max(0, int(np.ceil(np.log2(nyquist / filter_cutoff)) - 1) - 1)
num_twos = __num_two_factors(hop_length)
downsample_count2 = max(0, num_twos - n_octaves + 1)
return min(downsample_count1, downsample_count2)
def __early_downsample(
y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale
):
"""Perform early downsampling on an audio signal, if it applies."""
downsample_count = __early_downsample_count(
nyquist, filter_cutoff, hop_length, n_octaves
)
if downsample_count > 0:
downsample_factor = 2 ** (downsample_count)
hop_length //= downsample_factor
if y.shape[-1] < downsample_factor:
raise ParameterError(
f"Input signal length={len(y):d} is too short for "
f"{n_octaves:d}-octave CQT"
)
new_sr = sr / float(downsample_factor)
y = audio.resample(
y, orig_sr=downsample_factor, target_sr=1, res_type=res_type, scale=True
)
# If we're not going to length-scale after CQT, we
# need to compensate for the downsampling factor here
if not scale:
y *= np.sqrt(downsample_factor)
sr = new_sr
return y, sr, hop_length
@jit(nopython=True, cache=True)
def __num_two_factors(x):
"""Return how many times integer x can be evenly divided by 2.
Returns 0 for non-positive integers.
"""
if x <= 0:
return 0
num_twos = 0
while x % 2 == 0:
num_twos += 1
x //= 2
return num_twos
def griffinlim_cqt(
C: np.ndarray,
*,
n_iter: int = 32,
sr: float = 22050,
hop_length: int = 512,
fmin: Optional[_FloatLike_co] = None,
bins_per_octave: int = 12,
tuning: float = 0.0,
filter_scale: float = 1,
norm: Optional[float] = 1,
sparsity: float = 0.01,
window: _WindowSpec = "hann",
scale: bool = True,
pad_mode: _PadMode = "constant",
res_type: str = "soxr_hq",
dtype: Optional[DTypeLike] = None,
length: Optional[int] = None,
momentum: float = 0.99,
init: Optional[str] = "random",
random_state: Optional[
Union[int, np.random.RandomState, np.random.Generator]
] = None,
) -> np.ndarray:
"""Approximate constant-Q magnitude spectrogram inversion using the "fast" Griffin-Lim
algorithm.
Given the magnitude of a constant-Q spectrogram (``C``), the algorithm randomly initializes
phase estimates, and then alternates forward- and inverse-CQT operations. [#]_
This implementation is based on the (fast) Griffin-Lim method for Short-time Fourier Transforms, [#]_
but adapted for use with constant-Q spectrograms.
.. [#] D. W. Griffin and J. S. Lim,
"Signal estimation from modified short-time Fourier transform,"
IEEE Trans. ASSP, vol.32, no.2, pp.236243, Apr. 1984.
.. [#] Perraudin, N., Balazs, P., & Søndergaard, P. L.
"A fast Griffin-Lim algorithm,"
IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4),
Oct. 2013.
Parameters
----------
C : np.ndarray [shape=(..., n_bins, n_frames)]
The constant-Q magnitude spectrogram
n_iter : int > 0
The number of iterations to run
sr : number > 0
Audio sampling rate
hop_length : int > 0
The hop length of the CQT
fmin : number > 0
Minimum frequency for the CQT.
If not provided, it defaults to `C1`.
bins_per_octave : int > 0
Number of bins per octave
tuning : float
Tuning deviation from A440, in fractions of a bin
filter_scale : float > 0
Filter scale factor. Small values (<1) use shorter windows
for improved time resolution.
norm : {inf, -inf, 0, float > 0}
Type of norm to use for basis function normalization.
See `librosa.util.normalize`.
sparsity : float in [0, 1)
Sparsify the CQT basis by discarding up to ``sparsity``
fraction of the energy in each basis.
Set ``sparsity=0`` to disable sparsification.
window : str, tuple, or function
Window specification for the basis filters.
See `filters.get_window` for details.
scale : bool
If ``True``, scale the CQT response by square-root the length
of each channel's filter. This is analogous to ``norm='ortho'``
in FFT.
If ``False``, do not scale the CQT. This is analogous to ``norm=None``
in FFT.
pad_mode : string
Padding mode for centered frame analysis.
See also: `librosa.stft` and `numpy.pad`.
res_type : string
The resampling mode for recursive downsampling.
See ``librosa.resample`` for a list of available options.
dtype : numeric type
Real numeric type for ``y``. Default is inferred to match the precision
of the input CQT.
length : int > 0, optional
If provided, the output ``y`` is zero-padded or clipped to exactly
``length`` samples.
momentum : float > 0
The momentum parameter for fast Griffin-Lim.
Setting this to 0 recovers the original Griffin-Lim method.
Values near 1 can lead to faster convergence, but above 1 may not converge.
init : None or 'random' [default]
If 'random' (the default), then phase values are initialized randomly
according to ``random_state``. This is recommended when the input ``C`` is
a magnitude spectrogram with no initial phase estimates.
If ``None``, then the phase is initialized from ``C``. This is useful when
an initial guess for phase can be provided, or when you want to resume
Griffin-Lim from a previous output.
random_state : None, int, np.random.RandomState, or np.random.Generator
If int, random_state is the seed used by the random number generator
for phase initialization.
If `np.random.RandomState` or `np.random.Generator` instance, the random number generator itself.
If ``None``, defaults to the `np.random.default_rng()` object.
Returns
-------
y : np.ndarray [shape=(..., n)]
time-domain signal reconstructed from ``C``
See Also
--------
cqt
icqt
griffinlim
filters.get_window
resample
Examples
--------
A basis CQT inverse example
>>> y, sr = librosa.load(librosa.ex('trumpet', hq=True), sr=None)
>>> # Get the CQT magnitude, 7 octaves at 36 bins per octave
>>> C = np.abs(librosa.cqt(y=y, sr=sr, bins_per_octave=36, n_bins=7*36))
>>> # Invert using Griffin-Lim
>>> y_inv = librosa.griffinlim_cqt(C, sr=sr, bins_per_octave=36)
>>> # And invert without estimating phase
>>> y_icqt = librosa.icqt(C, sr=sr, bins_per_octave=36)
Wave-plot the results
>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
>>> librosa.display.waveshow(y, sr=sr, color='b', ax=ax[0])
>>> ax[0].set(title='Original', xlabel=None)
>>> ax[0].label_outer()
>>> librosa.display.waveshow(y_inv, sr=sr, color='g', ax=ax[1])
>>> ax[1].set(title='Griffin-Lim reconstruction', xlabel=None)
>>> ax[1].label_outer()
>>> librosa.display.waveshow(y_icqt, sr=sr, color='r', ax=ax[2])
>>> ax[2].set(title='Magnitude-only icqt reconstruction')
"""
if fmin is None:
fmin = note_to_hz("C1")
if random_state is None:
rng = np.random.default_rng()
elif isinstance(random_state, int):
rng = np.random.RandomState(seed=random_state) # type: ignore
elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
rng = random_state # type: ignore
else:
_ensure_not_reachable(random_state)
raise ParameterError(f"Unsupported random_state={random_state!r}")
if momentum > 1:
warnings.warn(
f"Griffin-Lim with momentum={momentum} > 1 can be unstable. "
"Proceed with caution!",
stacklevel=2,
)
elif momentum < 0:
raise ParameterError(f"griffinlim_cqt() called with momentum={momentum} < 0")
# using complex64 will keep the result to minimal necessary precision
angles = np.empty(C.shape, dtype=np.complex64)
eps = util.tiny(angles)
if init == "random":
# randomly initialize the phase
angles[:] = util.phasor(2 * np.pi * rng.random(size=C.shape))
elif init is None:
# Initialize an all ones complex matrix
angles[:] = 1.0
else:
raise ParameterError(f"init={init} must either None or 'random'")
# And initialize the previous iterate to 0
rebuilt: np.ndarray = np.array(0.0)
for _ in range(n_iter):
# Store the previous iterate
tprev = rebuilt
# Invert with our current estimate of the phases
inverse = icqt(
C * angles,
sr=sr,
hop_length=hop_length,
bins_per_octave=bins_per_octave,
fmin=fmin,
tuning=tuning,
filter_scale=filter_scale,
window=window,
length=length,
res_type=res_type,
norm=norm,
scale=scale,
sparsity=sparsity,
dtype=dtype,
)
# Rebuild the spectrogram
rebuilt = cqt(
inverse,
sr=sr,
bins_per_octave=bins_per_octave,
n_bins=C.shape[-2],
hop_length=hop_length,
fmin=fmin,
tuning=tuning,
filter_scale=filter_scale,
window=window,
norm=norm,
scale=scale,
sparsity=sparsity,
pad_mode=pad_mode,
res_type=res_type,
)
# Update our phase estimates
angles[:] = rebuilt - (momentum / (1 + momentum)) * tprev
angles[:] /= np.abs(angles) + eps
# Return the final phase estimates
return icqt(
C * angles,
sr=sr,
hop_length=hop_length,
bins_per_octave=bins_per_octave,
tuning=tuning,
filter_scale=filter_scale,
fmin=fmin,
window=window,
length=length,
res_type=res_type,
norm=norm,
scale=scale,
sparsity=sparsity,
dtype=dtype,
)
def __et_relative_bw(bins_per_octave: int) -> np.ndarray:
"""Compute the relative bandwidth coefficient for equal
(geometric) freuqency spacing and a give number of bins
per octave.
This is a special case of the more general `relative_bandwidth`
calculation that can be used when only a single basis frequency
is used.
Parameters
----------
bins_per_octave : int
Returns
-------
alpha : np.ndarray > 0
Value is cast up to a 1d array to allow slicing
"""
r = 2 ** (1 / bins_per_octave)
return np.atleast_1d((r**2 - 1) / (r**2 + 1))