ai-content-maker/.venv/Lib/site-packages/transformers/models/jukebox/configuration_jukebox.py

# coding=utf-8
# Copyright 2022 The OpenAI Team Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Jukebox configuration"""
import os
from typing import List, Union
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
from ..deprecated._archive_maps import JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP # noqa: F401, E402
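# Attention layout used by the large separated encoder/decoder prior with lyrics: the list cycles
# through block / transpose-block / previous-block attention and periodically inserts a
# cross-attention layer. Its 79 entries match the `layer % 79` indexing used in
# `large_separated_enc_dec_w_lyrics` below.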
_LARGE_ATTENTION = [
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"cross_attention",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"cross_attention",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"cross_attention",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"cross_attention",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"cross_attention",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"cross_attention",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"block_attn",
"transpose_block_attn",
"prev_block_attn",
"cross_attention",
]
_RawColumnPreviousRowAttention = ["block_attn", "transpose_block_attn", "prev_block_attn"]
_FullDenseAttention = ["dense_attention"]
_PrimePrimeDenseAttention = ["prime_attn", "prime_attn", "dense_attn"]
def full_dense_attention(layer):
return _FullDenseAttention[0]
def raw_column_previous_row_attention(layer):
return _RawColumnPreviousRowAttention[layer % 3]
def large_separated_enc_dec_w_lyrics(layer):
return _LARGE_ATTENTION[layer % 79]
def enc_dec_with_lyrics(layer):
if layer % 16 == 15:
return _PrimePrimeDenseAttention[layer % 3]
return _RawColumnPreviousRowAttention[layer % 3]
ATTENTION_PATTERNS = {
"full_dense_attention": full_dense_attention,
"raw_column_previous_row_attention": raw_column_previous_row_attention, # Alternate row, column and previous row attn
"large_separated_enc_dec_w_lyrics": large_separated_enc_dec_w_lyrics, # Used by large separated_enc_dec model with lyrics
"enc_dec_with_lyrics": enc_dec_with_lyrics, # Used by encoder_decoder model with lyrics
}
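# Example (illustrative): how a pattern function maps a layer index to an attention type. For
# `enc_dec_with_lyrics`, every 16th layer (layer % 16 == 15) draws from the prime/dense pattern,
# otherwise the row / column / previous-row cycle is used:
#
#   >>> [enc_dec_with_lyrics(layer) for layer in range(4)]
#   ['block_attn', 'transpose_block_attn', 'prev_block_attn', 'block_attn']
#   >>> enc_dec_with_lyrics(15)  # 15 % 16 == 15 and 15 % 3 == 0
#   'prime_attn'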
class JukeboxPriorConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`JukeboxPrior`]. It is used to instantiate a
`JukeboxPrior` according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the top level prior from the
[openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
act_fn (`str`, *optional*, defaults to `"quick_gelu"`):
Activation function.
alignment_head (`int`, *optional*, defaults to 2):
Head that is responsible for the alignment between lyrics and music. Only used to compute the lyric to audio
alignment.
alignment_layer (`int`, *optional*, defaults to 68):
Index of the layer that is responsible for the alignment between lyrics and music. Only used to compute the
lyric to audio alignment.
attention_multiplier (`float`, *optional*, defaults to 0.25):
Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
0.25*width of the model will be used.
attention_pattern (`str`, *optional*, defaults to `"enc_dec_with_lyrics"`):
Which attention pattern to use for the decoder.
attn_dropout (`int`, *optional*, defaults to 0):
Dropout probability for the post-attention layer dropout in the decoder.
attn_res_scale (`bool`, *optional*, defaults to `False`):
Whether or not to scale the residuals in the attention conditioner block.
blocks (`int`, *optional*, defaults to 64):
Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as `[blocks, seq_len //
blocks]` in the `JukeboxAttention` layer.
conv_res_scale (`int`, *optional*):
Whether or not to scale the residuals in the conditioner block. Since the top level prior does not have a
conditioner, the default value is `None` and should not be modified.
num_layers (`int`, *optional*, defaults to 72):
Number of layers of the transformer architecture.
emb_dropout (`int`, *optional*, defaults to 0):
Embedding dropout used in the lyric decoder.
encoder_config (`JukeboxPriorConfig`, *optional*) :
Configuration of the encoder which models the prior on the lyrics.
encoder_loss_fraction (`float`, *optional*, defaults to 0.4):
Multiplication factor used in front of the lyric encoder loss.
hidden_size (`int`, *optional*, defaults to 2048):
Hidden dimension of the attention layers.
init_scale (`float`, *optional*, defaults to 0.2):
Initialization scales for the prior modules.
is_encoder_decoder (`bool`, *optional*, defaults to `True`):
Whether or not the prior is an encoder-decoder model. In case it is not, and `nb_relevant_lyric_tokens` is
greater than 0, the `encoder` args should be specified for the lyric encoding.
mask (`bool`, *optional*, defaults to `False`):
Whether or not to mask the previous positions in the attention.
max_duration (`int`, *optional*, defaults to 600):
Maximum supported duration of the generated song in seconds.
max_nb_genres (`int`, *optional*, defaults to 1):
Maximum number of genres that can be used to condition the model.
merged_decoder (`bool`, *optional*, defaults to `True`):
Whether or not the decoder and the encoder inputs are merged. This is used for the separated
encoder-decoder architecture
metadata_conditioning (`bool`, *optional*, defaults to `True`):
Whether or not to condition on the artist and genre metadata.
metadata_dims (`List[int]`, *optional*, defaults to `[604, 7898]`):
Number of genres and the number of artists that were used to train the embedding layers of the prior
models.
min_duration (`int`, *optional*, defaults to 0):
Minimum duration of the generated audio on which the model was trained.
mlp_multiplier (`float`, *optional*, defaults to 1.0):
Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
the model will be used.
music_vocab_size (`int`, *optional*, defaults to 2048):
Number of different music tokens. Should be similar to the `JukeboxVQVAEConfig.nb_discrete_codes`.
n_ctx (`int`, *optional*, defaults to 6144):
Number of context tokens for each prior. The context tokens are the music tokens that are attended to when
generating music tokens.
n_heads (`int`, *optional*, defaults to 2):
Number of attention heads.
nb_relevant_lyric_tokens (`int`, *optional*, defaults to 384):
Number of lyric tokens that are used when sampling a single window of length `n_ctx`
res_conv_depth (`int`, *optional*, defaults to 3):
Depth of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
`JukeboxMusicTokenConditioner`.
res_conv_width (`int`, *optional*, defaults to 128):
Width of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
`JukeboxMusicTokenConditioner`.
res_convolution_multiplier (`int`, *optional*, defaults to 1):
Multiplier used to scale the `hidden_dim` of the `JukeboxResConv1DBlock`.
res_dilation_cycle (`int`, *optional*):
Dilation cycle used to define the `JukeboxMusicTokenConditioner`. Usually similar to the ones used in the
corresponding level of the VQVAE. The first prior does not use it as it is not conditioned on upper level
tokens.
res_dilation_growth_rate (`int`, *optional*, defaults to 1):
Dilation growth rate used between each convolutional block of the `JukeboxMusicTokenConditioner`.
res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
Downsampling rates used in the audio conditioning network
res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
Striding used in the audio conditioning network
resid_dropout (`int`, *optional*, defaults to 0):
Residual dropout used in the attention pattern.
sampling_rate (`int`, *optional*, defaults to 44100):
Sampling rate used for training.
spread (`int`, *optional*):
Spread used in the `summary_spread_attention` pattern
timing_dims (`int`, *optional*, defaults to 64):
Dimension of the timing embedding.
zero_out (`bool`, *optional*, defaults to `False`):
Whether or not to zero out convolution weights when initializing.
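Example (a minimal instantiation sketch, assuming `JukeboxPriorConfig` is importable from the top-level `transformers` package):
```python
>>> from transformers import JukeboxPriorConfig
>>> # Initializing a JukeboxPriorConfig with the default (top level prior) values
>>> configuration = JukeboxPriorConfig()
>>> # Accessing an attribute documented above
>>> configuration.hidden_size
2048
```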
"""
model_type = "jukebox_prior"
attribute_map = {
"max_position_embeddings": "n_positions",
"num_attention_heads": "n_head",
}
def __init__(
self,
act_fn="quick_gelu",
level=0,
alignment_head=2,
alignment_layer=68,
attention_multiplier=0.25,
attention_pattern="enc_dec_with_lyrics",
attn_dropout=0,
attn_res_scale=False,
blocks=64,
conv_res_scale=None,
num_layers=72,
emb_dropout=0,
encoder_config=None,
encoder_loss_fraction=0.4,
hidden_size=2048,
init_scale=0.2,
is_encoder_decoder=True,
lyric_vocab_size=80,
mask=False,
max_duration=600,
max_nb_genres=1,
merged_decoder=True,
metadata_conditioning=True,
metadata_dims=[604, 7898],
min_duration=0,
mlp_multiplier=1.0,
music_vocab_size=2048,
n_ctx=6144,
n_heads=2,
nb_relevant_lyric_tokens=384,
res_conv_depth=3,
res_conv_width=128,
res_convolution_multiplier=1,
res_dilation_cycle=None,
res_dilation_growth_rate=1,
res_downs_t=[3, 2, 2],
res_strides_t=[2, 2, 2],
resid_dropout=0,
sampling_rate=44100,
spread=None,
timing_dims=64,
zero_out=False,
**kwargs,
):
self.act_fn = act_fn
self.alignment_head = alignment_head
self.alignment_layer = alignment_layer
self.attention_multiplier = attention_multiplier
self.attention_pattern = attention_pattern
self.attn_dropout = attn_dropout
self.attn_res_scale = attn_res_scale
self.blocks = blocks
self.conv_res_scale = conv_res_scale
self.num_layers = num_layers
self.emb_dropout = emb_dropout
self.music_vocab_size = music_vocab_size
if encoder_config is not None:
self.encoder_config = JukeboxPriorConfig(**encoder_config)
else:
self.encoder_config = None
self.encoder_loss_fraction = encoder_loss_fraction
self.init_scale = init_scale
self.is_encoder_decoder = is_encoder_decoder
self.lyric_vocab_size = lyric_vocab_size
self.level = level
self.mask = mask
self.max_duration = max_duration
self.max_nb_genres = max_nb_genres
self.merged_decoder = merged_decoder
self.metadata_conditioning = metadata_conditioning
self.metadata_dims = metadata_dims
self.min_duration = min_duration
self.mlp_multiplier = mlp_multiplier
self.n_ctx = n_ctx
self.n_heads = n_heads
self.nb_relevant_lyric_tokens = nb_relevant_lyric_tokens
self.res_conv_depth = res_conv_depth
self.res_conv_width = res_conv_width
self.res_convolution_multiplier = res_convolution_multiplier
self.res_dilation_cycle = res_dilation_cycle
self.res_dilation_growth_rate = res_dilation_growth_rate
self.res_downs_t = res_downs_t
self.res_strides_t = res_strides_t
self.resid_dropout = resid_dropout
self.sampling_rate = sampling_rate
self.spread = spread
self.timing_dims = timing_dims
self.hidden_size = hidden_size
self.zero_out = zero_out
@classmethod
def from_pretrained(
cls, pretrained_model_name_or_path: Union[str, os.PathLike], level=0, **kwargs
) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# get the prior config dict if we are loading from JukeboxConfig
if config_dict.get("model_type") == "jukebox":
config_dict = config_dict[f"prior_{level}"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
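# Usage sketch: this override extracts the `prior_{level}` sub-configuration when the loaded config
# is a full `jukebox` config, so a single prior can be loaded from a complete checkpoint (assuming
# the checkpoint's config exposes `prior_0`, `prior_1`, ... sections), e.g.:
#
#   top_prior_config = JukeboxPriorConfig.from_pretrained("openai/jukebox-1b-lyrics", level=0)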
class JukeboxVQVAEConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`JukeboxVQVAE`]. It is used to instantiate a
`JukeboxVQVAE` according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the VQVAE from
[openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
act_fn (`str`, *optional*, defaults to `"relu"`):
Activation function of the model.
nb_discrete_codes (`int`, *optional*, defaults to 2048):
Number of codes of the VQVAE.
commit (`float`, *optional*, defaults to 0.02):
Commit loss multiplier.
conv_input_shape (`int`, *optional*, defaults to 1):
Number of audio channels.
conv_res_scale (`bool`, *optional*, defaults to `False`):
Whether or not to scale the residuals of the `JukeboxResConv1DBlock`.
embed_dim (`int`, *optional*, defaults to 64):
Embedding dimension of the codebook vectors.
hop_fraction (`List[int]`, *optional*, defaults to `[0.125, 0.5, 0.5]`):
Fraction of non-intersecting window used when continuing the sampling process.
levels (`int`, *optional*, defaults to 3):
Number of hierarchical levels used in the VQVAE.
lmu (`float`, *optional*, defaults to 0.99):
Exponential moving average coefficient used in the codebook update. For more detail, refer to Appendix A.1
of the original [VQVAE paper](https://arxiv.org/pdf/1711.00937v2.pdf).
multipliers (`List[int]`, *optional*, defaults to `[2, 1, 1]`):
Depth and width multipliers used for each level. Used on the `res_conv_width` and `res_conv_depth`
res_conv_depth (`int`, *optional*, defaults to 4):
Depth of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
res_conv_width (`int`, *optional*, defaults to 32):
Width of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
res_convolution_multiplier (`int`, *optional*, defaults to 1):
Scaling factor of the hidden dimension used in the `JukeboxResConv1DBlock`.
res_dilation_cycle (`int`, *optional*):
Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1D block will have a depth
reduced by a power of `res_dilation_cycle`.
res_dilation_growth_rate (`int`, *optional*, defaults to 3):
Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)
res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
Downsampling rate for each level of the hierarchical VQ-VAE.
res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
Stride used for each level of the hierarchical VQ-VAE.
sample_length (`int`, *optional*, defaults to 1058304):
Provides the max input shape of the VQVAE. It is used to compute the input shape of each level.
init_scale (`float`, *optional*, defaults to 0.2):
Initialization scale.
zero_out (`bool`, *optional*, defaults to `False`):
Whether or not to zero out convolution weights when initializing.
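Example (a minimal instantiation sketch, assuming `JukeboxVQVAEConfig` is importable from the top-level `transformers` package):
```python
>>> from transformers import JukeboxVQVAEConfig
>>> # Initializing a JukeboxVQVAEConfig with the default values
>>> configuration = JukeboxVQVAEConfig()
>>> # The number of hierarchical levels and the codebook size documented above
>>> configuration.levels, configuration.nb_discrete_codes
(3, 2048)
```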
"""
model_type = "jukebox_vqvae"
def __init__(
self,
act_fn="relu",
nb_discrete_codes=2048,
commit=0.02,
conv_input_shape=1,
conv_res_scale=False,
embed_dim=64,
hop_fraction=[0.125, 0.5, 0.5],
levels=3,
lmu=0.99,
multipliers=[2, 1, 1],
res_conv_depth=4,
res_conv_width=32,
res_convolution_multiplier=1,
res_dilation_cycle=None,
res_dilation_growth_rate=3,
res_downs_t=[3, 2, 2],
res_strides_t=[2, 2, 2],
sample_length=1058304,
init_scale=0.2,
zero_out=False,
**kwargs,
):
self.hop_fraction = hop_fraction
self.conv_input_shape = conv_input_shape
self.sample_length = sample_length
# VQVAE parameters (all used)
self.levels = levels
self.embed_dim = embed_dim
self.nb_discrete_codes = nb_discrete_codes
self.res_conv_width = res_conv_width
self.res_conv_depth = res_conv_depth
self.res_convolution_multiplier = res_convolution_multiplier
self.res_dilation_growth_rate = res_dilation_growth_rate
self.res_dilation_cycle = res_dilation_cycle
self.multipliers = multipliers
self.res_downs_t = res_downs_t
self.res_strides_t = res_strides_t
self.lmu = lmu
self.commit = commit
self.conv_res_scale = conv_res_scale
self.act_fn = act_fn
self.init_scale = init_scale
self.zero_out = zero_out
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
# get the vqvae config dict if we are loading from JukeboxConfig
if config_dict.get("model_type") == "jukebox":
config_dict = config_dict["vqvae_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
class JukeboxConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`JukeboxModel`].
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the defaults will
yield a similar configuration to that of
[openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
The downsampling and stride are used to determine downsampling of the input sequence. For example, downsampling =
(5, 3) and strides = (2, 2) will downsample the audio by 2**5 = 32 to get the first level of codes, and 2**8 = 256
to get the second level of codes. This is mostly true for training the top level prior and the upsamplers.
Args:
vqvae_config (`JukeboxVQVAEConfig`, *optional*):
Configuration for the `JukeboxVQVAE` model.
prior_config_list (`List[JukeboxPriorConfig]`, *optional*):
List of the configs for each of the `JukeboxPrior` of the model. The original architecture uses 3 priors.
nb_priors (`int`, *optional*, defaults to 3):
Number of prior models that will sequentially sample tokens. Each prior is a conditional autoregressive
(decoder) model, apart from the top prior, which can include a lyric encoder. The available models were
trained using a top prior and 2 upsampler priors.
sampling_rate (`int`, *optional*, defaults to 44100):
Sampling rate of the raw audio.
timing_dims (`int`, *optional*, defaults to 64):
Dimensions of the JukeboxRangeEmbedding layer, which is equivalent to a traditional positional embedding
layer. The timing embedding layer converts the absolute and relative position in the currently sampled
audio to a tensor of length `timing_dims` that will be added to the music tokens.
min_duration (`int`, *optional*, defaults to 0):
Minimum duration of the audio to generate.
max_duration (`float`, *optional*, defaults to 600.0):
Maximum duration of the audio to generate.
max_nb_genres (`int`, *optional*, defaults to 5):
Maximum number of genres that can be used to condition a single sample.
metadata_conditioning (`bool`, *optional*, defaults to `True`):
Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
duration.
Example:
```python
>>> from transformers import JukeboxModel, JukeboxConfig
>>> # Initializing a Jukebox configuration
>>> configuration = JukeboxConfig()
>>> # Initializing a model from the configuration
>>> model = JukeboxModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "jukebox"
def __init__(
self,
vqvae_config=None,
prior_config_list=None,
nb_priors=3,
sampling_rate=44100,
timing_dims=64,
min_duration=0,
max_duration=600.0,
max_nb_genres=5,
metadata_conditioning=True,
**kwargs,
):
if vqvae_config is None:
vqvae_config = {}
logger.info("vqvae_config is None. initializing the JukeboxVQVAE with default values.")
self.vqvae_config = JukeboxVQVAEConfig(**vqvae_config)
if prior_config_list is not None:
self.prior_configs = [JukeboxPriorConfig(**prior_config) for prior_config in prior_config_list]
else:
self.prior_configs = []
for prior_idx in range(nb_priors):
prior_config = kwargs.pop(f"prior_{prior_idx}", None)
if prior_config is None:
prior_config = {}
logger.info(
f"prior_{prior_idx}'s config is None. Initializing the JukeboxPriorConfig list with default"
" values."
)
self.prior_configs.append(JukeboxPriorConfig(**prior_config))
self.hop_fraction = self.vqvae_config.hop_fraction
self.nb_priors = nb_priors
# Metadata conditioning
self.max_nb_genres = max_nb_genres
self.sampling_rate = sampling_rate
self.timing_dims = timing_dims
self.min_duration = min_duration
self.max_duration = max_duration
self.metadata_conditioning = metadata_conditioning
super().__init__(**kwargs)
@classmethod
def from_configs(cls, prior_configs: List[JukeboxPriorConfig], vqvae_config: JukeboxVQVAEConfig, **kwargs):
r"""
Instantiate a [`JukeboxConfig`] (or a derived class) from a list of Jukebox prior configurations and a Jukebox
VQVAE model configuration.
Returns:
[`JukeboxConfig`]: An instance of a configuration object
"""
prior_config_list = [config.to_dict() for config in prior_configs]
return cls(prior_config_list=prior_config_list, vqvae_config=vqvae_config.to_dict(), **kwargs)
def to_dict(self):
# Override the default to_dict to apply to_dict to the list of prior configs.
result = super().to_dict()
result["prior_config_list"] = [config.to_dict() for config in result.pop("prior_configs")]
return result
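# Composition sketch (illustrative, using the default sub-configurations): `from_configs` assembles
# a full `JukeboxConfig` from already-built sub-configurations, and the `to_dict` override above
# serializes the prior configs back into `prior_config_list`.
#
#   vqvae = JukeboxVQVAEConfig()
#   priors = [JukeboxPriorConfig(level=level) for level in range(3)]
#   config = JukeboxConfig.from_configs(priors, vqvae)
#   serialized = config.to_dict()  # "prior_config_list" holds the three prior config dicts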