# coding=utf-8
# Copyright 2022 The OpenAI Team Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Jukebox configuration"""

import os
from typing import List, Union

from ...configuration_utils import PretrainedConfig
from ...utils import logging


logger = logging.get_logger(__name__)


from ..deprecated._archive_maps import JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP  # noqa: F401, E402


_LARGE_ATTENTION = [
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "cross_attention",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "cross_attention",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "cross_attention",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "cross_attention",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "cross_attention",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "cross_attention",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "block_attn",
    "transpose_block_attn",
    "prev_block_attn",
    "cross_attention",
]
_RawColumnPreviousRowAttention = ["block_attn", "transpose_block_attn", "prev_block_attn"]
_FullDenseAttention = ["dense_attention"]
_PrimePrimeDenseAttention = ["prime_attn", "prime_attn", "dense_attn"]


def full_dense_attention(layer):
    return _FullDenseAttention[0]


def raw_column_previous_row_attention(layer):
    return _RawColumnPreviousRowAttention[layer % 3]


def large_separated_enc_dec_w_lyrics(layer):
    return _LARGE_ATTENTION[layer % 79]


def enc_dec_with_lyrics(layer):
    if layer % 16 == 15:
        return _PrimePrimeDenseAttention[layer % 3]
    return _RawColumnPreviousRowAttention[layer % 3]


ATTENTION_PATTERNS = {
    "full_dense_attention": full_dense_attention,
    "raw_column_previous_row_attention": raw_column_previous_row_attention,  # Alternate row, column and previous row attn
    "large_separated_enc_dec_w_lyrics": large_separated_enc_dec_w_lyrics,  # Used by large separated_enc_dec model with lyrics
    "enc_dec_with_lyrics": enc_dec_with_lyrics,  # Used by encoder_decoder model with lyrics
}

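# A minimal sketch of how these pattern functions behave: each one maps a layer
# index to the attention type used at that layer. With `enc_dec_with_lyrics`,
# every 16th layer (index 15, 31, ...) draws from the prime/dense pattern, while
# all other layers cycle through row, column and previous-row attention:
#
#     >>> [enc_dec_with_lyrics(layer) for layer in range(4)]
#     ['block_attn', 'transpose_block_attn', 'prev_block_attn', 'block_attn']
#     >>> enc_dec_with_lyrics(15)
#     'prime_attn'

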
class JukeboxPriorConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`JukeboxPrior`]. It is used to instantiate a
    `JukeboxPrior` according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the top level prior from the
    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        act_fn (`str`, *optional*, defaults to `"quick_gelu"`):
            Activation function.
        alignment_head (`int`, *optional*, defaults to 2):
            Head that is responsible for the alignment between lyrics and music. Only used to compute the lyric to
            audio alignment.
        alignment_layer (`int`, *optional*, defaults to 68):
            Index of the layer that is responsible for the alignment between lyrics and music. Only used to compute
            the lyric to audio alignment.
        attention_multiplier (`float`, *optional*, defaults to 0.25):
            Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
            0.25 * width of the model will be used.
        attention_pattern (`str`, *optional*, defaults to `"enc_dec_with_lyrics"`):
            Which attention pattern to use for the decoder.
        attn_dropout (`int`, *optional*, defaults to 0):
            Dropout probability for the post-attention layer dropout in the decoder.
        attn_res_scale (`bool`, *optional*, defaults to `False`):
            Whether or not to scale the residuals in the attention conditioner block.
        blocks (`int`, *optional*, defaults to 64):
            Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as `[blocks, seq_len
            // blocks]` in the `JukeboxAttention` layer.
        conv_res_scale (`int`, *optional*):
            Whether or not to scale the residuals in the conditioner block. Since the top level prior does not have a
            conditioner, the default value is `None` and should not be modified.
        num_layers (`int`, *optional*, defaults to 72):
            Number of layers of the transformer architecture.
        emb_dropout (`int`, *optional*, defaults to 0):
            Embedding dropout used in the lyric decoder.
        encoder_config (`JukeboxPriorConfig`, *optional*):
            Configuration of the encoder which models the prior on the lyrics.
        encoder_loss_fraction (`float`, *optional*, defaults to 0.4):
            Multiplication factor used in front of the lyric encoder loss.
        hidden_size (`int`, *optional*, defaults to 2048):
            Hidden dimension of the attention layers.
        init_scale (`float`, *optional*, defaults to 0.2):
            Initialization scales for the prior modules.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether or not the prior is an encoder-decoder model. In case it is not, and `nb_relevant_lyric_tokens` is
            greater than 0, the `encoder` args should be specified for the lyric encoding.
        level (`int`, *optional*, defaults to 0):
            Level of the current prior in the hierarchy of priors. The defaults (level 0) correspond to the top level
            prior.
        lyric_vocab_size (`int`, *optional*, defaults to 80):
            Size of the vocabulary used for the lyric tokens.
        mask (`bool`, *optional*, defaults to `False`):
            Whether or not to mask the previous positions in the attention.
        max_duration (`int`, *optional*, defaults to 600):
            Maximum supported duration of the generated song in seconds.
        max_nb_genres (`int`, *optional*, defaults to 1):
            Maximum number of genres that can be used to condition the model.
        merged_decoder (`bool`, *optional*, defaults to `True`):
            Whether or not the decoder and the encoder inputs are merged. This is used for the separated
            encoder-decoder architecture.
        metadata_conditioning (`bool`, *optional*, defaults to `True`):
            Whether or not to condition on the artist and genre metadata.
        metadata_dims (`List[int]`, *optional*, defaults to `[604, 7898]`):
            Number of genres and the number of artists that were used to train the embedding layers of the prior
            models.
        min_duration (`int`, *optional*, defaults to 0):
            Minimum duration of the generated audio on which the model was trained.
        mlp_multiplier (`float`, *optional*, defaults to 1.0):
            Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25 * width
            of the model will be used.
        music_vocab_size (`int`, *optional*, defaults to 2048):
            Number of different music tokens. Should be similar to the `JukeboxVQVAEConfig.nb_discrete_codes`.
        n_ctx (`int`, *optional*, defaults to 6144):
            Number of context tokens for each prior. The context tokens are the music tokens that are attended to when
            generating music tokens.
        n_heads (`int`, *optional*, defaults to 2):
            Number of attention heads.
        nb_relevant_lyric_tokens (`int`, *optional*, defaults to 384):
            Number of lyric tokens that are used when sampling a single window of length `n_ctx`.
        res_conv_depth (`int`, *optional*, defaults to 3):
            Depth of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
            `JukeboxMusicTokenConditioner`.
        res_conv_width (`int`, *optional*, defaults to 128):
            Width of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
            `JukeboxMusicTokenConditioner`.
        res_convolution_multiplier (`int`, *optional*, defaults to 1):
            Multiplier used to scale the `hidden_dim` of the `JukeboxResConv1DBlock`.
        res_dilation_cycle (`int`, *optional*):
            Dilation cycle used to define the `JukeboxMusicTokenConditioner`. Usually similar to the ones used in the
            corresponding level of the VQVAE. The first prior does not use it as it is not conditioned on upper level
            tokens.
        res_dilation_growth_rate (`int`, *optional*, defaults to 1):
            Dilation growth rate used between each convolutional block of the `JukeboxMusicTokenConditioner`.
        res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
            Downsampling rates used in the audio conditioning network.
        res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
            Striding used in the audio conditioning network.
        resid_dropout (`int`, *optional*, defaults to 0):
            Residual dropout used in the attention pattern.
        sampling_rate (`int`, *optional*, defaults to 44100):
            Sampling rate used for training.
        spread (`int`, *optional*):
            Spread used in the `summary_spread_attention` pattern.
        timing_dims (`int`, *optional*, defaults to 64):
            Dimension of the timing embedding.
        zero_out (`bool`, *optional*, defaults to `False`):
            Whether or not to zero out convolution weights when initializing.
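
    Example:

    ```python
    >>> from transformers import JukeboxPriorConfig

    >>> # A minimal sketch: initializing a JukeboxPriorConfig with default values
    >>> # (per the note above, the defaults match the top level prior of openai/jukebox-1b-lyrics)
    >>> configuration = JukeboxPriorConfig()
    ```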
    """

    model_type = "jukebox_prior"
    attribute_map = {
        "max_position_embeddings": "n_positions",
        "num_attention_heads": "n_heads",
    }

    def __init__(
        self,
        act_fn="quick_gelu",
        level=0,
        alignment_head=2,
        alignment_layer=68,
        attention_multiplier=0.25,
        attention_pattern="enc_dec_with_lyrics",
        attn_dropout=0,
        attn_res_scale=False,
        blocks=64,
        conv_res_scale=None,
        num_layers=72,
        emb_dropout=0,
        encoder_config=None,
        encoder_loss_fraction=0.4,
        hidden_size=2048,
        init_scale=0.2,
        is_encoder_decoder=True,
        lyric_vocab_size=80,
        mask=False,
        max_duration=600,
        max_nb_genres=1,
        merged_decoder=True,
        metadata_conditioning=True,
        metadata_dims=[604, 7898],
        min_duration=0,
        mlp_multiplier=1.0,
        music_vocab_size=2048,
        n_ctx=6144,
        n_heads=2,
        nb_relevant_lyric_tokens=384,
        res_conv_depth=3,
        res_conv_width=128,
        res_convolution_multiplier=1,
        res_dilation_cycle=None,
        res_dilation_growth_rate=1,
        res_downs_t=[3, 2, 2],
        res_strides_t=[2, 2, 2],
        resid_dropout=0,
        sampling_rate=44100,
        spread=None,
        timing_dims=64,
        zero_out=False,
        **kwargs,
    ):
        self.act_fn = act_fn
        self.alignment_head = alignment_head
        self.alignment_layer = alignment_layer
        self.attention_multiplier = attention_multiplier
        self.attention_pattern = attention_pattern
        self.attn_dropout = attn_dropout
        self.attn_res_scale = attn_res_scale
        self.blocks = blocks
        self.conv_res_scale = conv_res_scale
        self.num_layers = num_layers
        self.emb_dropout = emb_dropout
        self.music_vocab_size = music_vocab_size
        if encoder_config is not None:
            self.encoder_config = JukeboxPriorConfig(**encoder_config)
        else:
            self.encoder_config = None
        self.encoder_loss_fraction = encoder_loss_fraction
        self.init_scale = init_scale
        self.is_encoder_decoder = is_encoder_decoder
        self.lyric_vocab_size = lyric_vocab_size
        self.level = level
        self.mask = mask
        self.max_duration = max_duration
        self.max_nb_genres = max_nb_genres
        self.merged_decoder = merged_decoder
        self.metadata_conditioning = metadata_conditioning
        self.metadata_dims = metadata_dims
        self.min_duration = min_duration
        self.mlp_multiplier = mlp_multiplier
        self.n_ctx = n_ctx
        self.n_heads = n_heads
        self.nb_relevant_lyric_tokens = nb_relevant_lyric_tokens
        self.res_conv_depth = res_conv_depth
        self.res_conv_width = res_conv_width
        self.res_convolution_multiplier = res_convolution_multiplier
        self.res_dilation_cycle = res_dilation_cycle
        self.res_dilation_growth_rate = res_dilation_growth_rate
        self.res_downs_t = res_downs_t
        self.res_strides_t = res_strides_t
        self.resid_dropout = resid_dropout
        self.sampling_rate = sampling_rate
        self.spread = spread
        self.timing_dims = timing_dims
        self.hidden_size = hidden_size
        self.zero_out = zero_out

    @classmethod
    def from_pretrained(
        cls, pretrained_model_name_or_path: Union[str, os.PathLike], level=0, **kwargs
    ) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # get the prior config dict if we are loading from JukeboxConfig
        if config_dict.get("model_type") == "jukebox":
            config_dict = config_dict[f"prior_{level}"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)


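# A hedged sketch of how the `level` argument above selects a sub-config: when
# loading from a full Jukebox checkpoint, the matching `prior_{level}` dict is
# picked out, so something like the following would return the configuration of
# the second prior:
#
#     >>> prior_config = JukeboxPriorConfig.from_pretrained("openai/jukebox-1b-lyrics", level=1)

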
class JukeboxVQVAEConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`JukeboxVQVAE`]. It is used to instantiate a
    `JukeboxVQVAE` according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the VQVAE from the
    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        act_fn (`str`, *optional*, defaults to `"relu"`):
            Activation function of the model.
        nb_discrete_codes (`int`, *optional*, defaults to 2048):
            Number of codes of the VQVAE.
        commit (`float`, *optional*, defaults to 0.02):
            Commit loss multiplier.
        conv_input_shape (`int`, *optional*, defaults to 1):
            Number of audio channels.
        conv_res_scale (`bool`, *optional*, defaults to `False`):
            Whether or not to scale the residuals of the `JukeboxResConv1DBlock`.
        embed_dim (`int`, *optional*, defaults to 64):
            Embedding dimension of the codebook vectors.
        hop_fraction (`List[float]`, *optional*, defaults to `[0.125, 0.5, 0.5]`):
            Fraction of non-intersecting window used when continuing the sampling process.
        levels (`int`, *optional*, defaults to 3):
            Number of hierarchical levels that are used in the VQVAE.
        lmu (`float`, *optional*, defaults to 0.99):
            Exponential moving average coefficient used in the codebook update. For more detail refer to Appendix A.1
            of the original [VQVAE paper](https://arxiv.org/pdf/1711.00937v2.pdf).
        multipliers (`List[int]`, *optional*, defaults to `[2, 1, 1]`):
            Depth and width multipliers used for each level. Applied to `res_conv_width` and `res_conv_depth`.
        res_conv_depth (`int`, *optional*, defaults to 4):
            Depth of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
        res_conv_width (`int`, *optional*, defaults to 32):
            Width of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
        res_convolution_multiplier (`int`, *optional*, defaults to 1):
            Scaling factor of the hidden dimension used in the `JukeboxResConv1DBlock`.
        res_dilation_cycle (`int`, *optional*):
            Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a depth
            reduced by a power of `res_dilation_cycle`.
        res_dilation_growth_rate (`int`, *optional*, defaults to 3):
            Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth).
        res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
            Downsampling rate for each level of the hierarchical VQ-VAE.
        res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
            Stride used for each level of the hierarchical VQ-VAE.
        sample_length (`int`, *optional*, defaults to 1058304):
            Provides the max input shape of the VQVAE. Is used to compute the input shape of each level.
        init_scale (`float`, *optional*, defaults to 0.2):
            Initialization scale.
        zero_out (`bool`, *optional*, defaults to `False`):
            Whether or not to zero out convolution weights when initializing.
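
    Example:

    ```python
    >>> from transformers import JukeboxVQVAEConfig

    >>> # A minimal sketch: initializing a JukeboxVQVAEConfig with default values,
    >>> # similar to the VQVAE of openai/jukebox-1b-lyrics
    >>> configuration = JukeboxVQVAEConfig()
    ```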
    """

    model_type = "jukebox_vqvae"

    def __init__(
        self,
        act_fn="relu",
        nb_discrete_codes=2048,
        commit=0.02,
        conv_input_shape=1,
        conv_res_scale=False,
        embed_dim=64,
        hop_fraction=[0.125, 0.5, 0.5],
        levels=3,
        lmu=0.99,
        multipliers=[2, 1, 1],
        res_conv_depth=4,
        res_conv_width=32,
        res_convolution_multiplier=1,
        res_dilation_cycle=None,
        res_dilation_growth_rate=3,
        res_downs_t=[3, 2, 2],
        res_strides_t=[2, 2, 2],
        sample_length=1058304,
        init_scale=0.2,
        zero_out=False,
        **kwargs,
    ):
        self.hop_fraction = hop_fraction
        self.conv_input_shape = conv_input_shape
        self.sample_length = sample_length

        # VQVAE parameters (all used)
        self.levels = levels
        self.embed_dim = embed_dim
        self.nb_discrete_codes = nb_discrete_codes
        self.res_conv_width = res_conv_width
        self.res_conv_depth = res_conv_depth
        self.res_convolution_multiplier = res_convolution_multiplier
        self.res_dilation_growth_rate = res_dilation_growth_rate
        self.res_dilation_cycle = res_dilation_cycle
        self.multipliers = multipliers
        self.res_downs_t = res_downs_t
        self.res_strides_t = res_strides_t
        self.lmu = lmu
        self.commit = commit
        self.conv_res_scale = conv_res_scale
        self.act_fn = act_fn
        self.init_scale = init_scale
        self.zero_out = zero_out

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # get the vqvae config dict if we are loading from JukeboxConfig
        if config_dict.get("model_type") == "jukebox":
            config_dict = config_dict["vqvae_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)


class JukeboxConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`JukeboxModel`].

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the defaults will
    yield a similar configuration to that of the
    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.

    The downsampling and stride are used to determine downsampling of the input sequence. For example, downsampling =
    (5, 3) and strides = (2, 2) will downsample the audio by 2**5 = 32 to get the first level of codes, and 2**8 = 256
    to get the second level codes. This is mostly true for training the top level prior and the upsamplers.

    Args:
        vqvae_config (`JukeboxVQVAEConfig`, *optional*):
            Configuration for the `JukeboxVQVAE` model.
        prior_config_list (`List[JukeboxPriorConfig]`, *optional*):
            List of the configs for each of the `JukeboxPrior` of the model. The original architecture uses 3 priors.
        nb_priors (`int`, *optional*, defaults to 3):
            Number of prior models that will sequentially sample tokens. Each prior is a conditional autoregressive
            (decoder) model, apart from the top prior, which can include a lyric encoder. The available models were
            trained using a top prior and 2 upsampler priors.
        sampling_rate (`int`, *optional*, defaults to 44100):
            Sampling rate of the raw audio.
        timing_dims (`int`, *optional*, defaults to 64):
            Dimensions of the JukeboxRangeEmbedding layer which is equivalent to a traditional positional embedding
            layer. The timing embedding layer converts the absolute and relative position in the currently sampled
            audio to a tensor of length `timing_dims` that will be added to the music tokens.
        min_duration (`int`, *optional*, defaults to 0):
            Minimum duration of the audios to generate.
        max_duration (`float`, *optional*, defaults to 600.0):
            Maximum duration of the audios to generate.
        max_nb_genres (`int`, *optional*, defaults to 5):
            Maximum number of genres that can be used to condition a single sample.
        metadata_conditioning (`bool`, *optional*, defaults to `True`):
            Whether or not to use metadata conditioning, corresponding to the artist, the genre and the
            minimum/maximum duration.

    Example:

    ```python
    >>> from transformers import JukeboxModel, JukeboxConfig

    >>> # Initializing a Jukebox configuration
    >>> configuration = JukeboxConfig()

    >>> # Initializing a model from the configuration
    >>> model = JukeboxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "jukebox"

    def __init__(
        self,
        vqvae_config=None,
        prior_config_list=None,
        nb_priors=3,
        sampling_rate=44100,
        timing_dims=64,
        min_duration=0,
        max_duration=600.0,
        max_nb_genres=5,
        metadata_conditioning=True,
        **kwargs,
    ):
        if vqvae_config is None:
            vqvae_config = {}
            logger.info("vqvae_config is None. Initializing the JukeboxVQVAE with default values.")

        self.vqvae_config = JukeboxVQVAEConfig(**vqvae_config)
        if prior_config_list is not None:
            self.prior_configs = [JukeboxPriorConfig(**prior_config) for prior_config in prior_config_list]
        else:
            self.prior_configs = []
            for prior_idx in range(nb_priors):
                prior_config = kwargs.pop(f"prior_{prior_idx}", None)
                if prior_config is None:
                    prior_config = {}
                    logger.info(
                        f"prior_{prior_idx}'s config is None. Initializing the JukeboxPriorConfig list with default"
                        " values."
                    )
                self.prior_configs.append(JukeboxPriorConfig(**prior_config))

        self.hop_fraction = self.vqvae_config.hop_fraction

        self.nb_priors = nb_priors

        # Metadata conditioning
        self.max_nb_genres = max_nb_genres
        self.sampling_rate = sampling_rate
        self.timing_dims = timing_dims
        self.min_duration = min_duration
        self.max_duration = max_duration
        self.metadata_conditioning = metadata_conditioning

        super().__init__(**kwargs)

    @classmethod
    def from_configs(cls, prior_configs: List[JukeboxPriorConfig], vqvae_config: JukeboxVQVAEConfig, **kwargs):
        r"""
        Instantiate a [`JukeboxConfig`] (or a derived class) from a list of prior model configurations and a VQVAE
        model configuration.

        Returns:
            [`JukeboxConfig`]: An instance of a configuration object
        """
        prior_config_list = [config.to_dict() for config in prior_configs]
        return cls(prior_config_list=prior_config_list, vqvae_config=vqvae_config.to_dict(), **kwargs)

    def to_dict(self):
        # Override the default to_dict to apply to_dict to the list of prior configs.
        result = super().to_dict()
        result["prior_config_list"] = [config.to_dict() for config in result.pop("prior_configs")]
        return result
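

# A minimal sketch of composing a full configuration from the sub-configurations
# defined above (three priors plus a VQVAE), using only the classes in this file:
#
#     >>> priors = [JukeboxPriorConfig(level=level) for level in range(3)]
#     >>> config = JukeboxConfig.from_configs(priors, JukeboxVQVAEConfig())
#     >>> config.nb_priors
#     3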