# coding=utf-8
# Copyright 2020, The RAG Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RAG model configuration"""


from ...configuration_utils import PretrainedConfig
from ...utils import add_start_docstrings


RAG_CONFIG_DOC = r"""
    [`RagConfig`] stores the configuration of a *RagModel*. Configuration objects inherit from [`PretrainedConfig`] and
    can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information.

    Args:
        title_sep (`str`, *optional*, defaults to `" / "`):
            Separator inserted between the title and the text of the retrieved document when calling [`RagRetriever`].
        doc_sep (`str`, *optional*, defaults to `" // "`):
            Separator inserted between the text of the retrieved document and the original input when calling
            [`RagRetriever`].
        n_docs (`int`, *optional*, defaults to 5):
            Number of documents to retrieve.
        max_combined_length (`int`, *optional*, defaults to 300):
            Max length of contextualized input returned by [`~RagRetriever.__call__`].
        retrieval_vector_size (`int`, *optional*, defaults to 768):
            Dimensionality of the document embeddings indexed by [`RagRetriever`].
        retrieval_batch_size (`int`, *optional*, defaults to 8):
            Retrieval batch size, defined as the number of queries issued concurrently to the faiss index encapsulated
            by [`RagRetriever`].
        dataset (`str`, *optional*, defaults to `"wiki_dpr"`):
            A dataset identifier of the indexed dataset in HuggingFace Datasets (list all available datasets and ids
            using `datasets.list_datasets()`).
        dataset_split (`str`, *optional*, defaults to `"train"`):
            Which split of the `dataset` to load.
        index_name (`str`, *optional*, defaults to `"compressed"`):
            The index name of the index associated with the `dataset`. One can choose between `"legacy"`, `"exact"`
            and `"compressed"`.
        index_path (`str`, *optional*):
            The path to the serialized faiss index on disk.
        passages_path (`str`, *optional*):
            A path to text passages compatible with the faiss index. Required if using
            [`~models.rag.retrieval_rag.LegacyIndex`].
        use_dummy_dataset (`bool`, *optional*, defaults to `False`):
            Whether to load a "dummy" variant of the dataset specified by `dataset`.
        label_smoothing (`float`, *optional*, defaults to 0.0):
            Only relevant if `return_loss` is set to `True`. Controls the `epsilon` parameter value for label smoothing
            in the loss calculation. If set to 0, no label smoothing is performed.
        do_marginalize (`bool`, *optional*, defaults to `False`):
            If `True`, the logits are marginalized over all documents by making use of
            `torch.nn.functional.log_softmax`.
        reduce_loss (`bool`, *optional*, defaults to `False`):
            Whether or not to reduce the NLL loss using the `torch.Tensor.sum` operation.
        do_deduplication (`bool`, *optional*, defaults to `True`):
            Whether or not to deduplicate the generations from different context documents for a given input. Has to
            be set to `False` if used while training with a distributed backend.
        exclude_bos_score (`bool`, *optional*, defaults to `False`):
            Whether or not to disregard the BOS token when computing the loss.
        output_retrieved (`bool`, *optional*, defaults to `False`):
            If set to `True`, `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
            `context_attention_mask` are returned. See returned tensors for more detail.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        forced_eos_token_id (`int`, *optional*):
            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
            `eos_token_id`.
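
    Example (a minimal usage sketch; `facebook/rag-token-nq` is a public RAG checkpoint, used here purely for
    illustration):

    ```python
    >>> from transformers import RagConfig

    >>> # load the configuration of a pretrained RAG model and adjust its retrieval settings
    >>> config = RagConfig.from_pretrained("facebook/rag-token-nq")
    >>> config.n_docs = 10  # retrieve 10 documents instead of the default 5
    ```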
"""


@add_start_docstrings(RAG_CONFIG_DOC)
class RagConfig(PretrainedConfig):
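    # RagConfig is a composite configuration: it nests a `question_encoder` config and a `generator`
    # config rather than defining a single model's hyperparameters itself.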
    model_type = "rag"
    is_composition = True

    def __init__(
        self,
        vocab_size=None,
        is_encoder_decoder=True,
        prefix=None,
        bos_token_id=None,
        pad_token_id=None,
        eos_token_id=None,
        decoder_start_token_id=None,
        title_sep=" / ",
        doc_sep=" // ",
        n_docs=5,
        max_combined_length=300,
        retrieval_vector_size=768,
        retrieval_batch_size=8,
        dataset="wiki_dpr",
        dataset_split="train",
        index_name="compressed",
        index_path=None,
        passages_path=None,
        use_dummy_dataset=False,
        reduce_loss=False,
        label_smoothing=0.0,
        do_deduplication=True,
        exclude_bos_score=False,
        do_marginalize=False,
        output_retrieved=False,
        use_cache=True,
        forced_eos_token_id=None,
        dataset_revision=None,
        **kwargs,
    ):
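        # forward the generation-related fields shared by both sub-models to the base PretrainedConfig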
        super().__init__(
            bos_token_id=bos_token_id,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            decoder_start_token_id=decoder_start_token_id,
            forced_eos_token_id=forced_eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            prefix=prefix,
            vocab_size=vocab_size,
            **kwargs,
        )
        if "question_encoder" not in kwargs or "generator" not in kwargs:
            raise ValueError("Config has to be initialized with question_encoder and generator config")
        question_encoder_config = kwargs.pop("question_encoder")
        question_encoder_model_type = question_encoder_config.pop("model_type")
        decoder_config = kwargs.pop("generator")
        decoder_model_type = decoder_config.pop("model_type")
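
        # local import: AutoConfig is imported here, rather than at module level, to avoid a
        # circular import between the rag and auto configuration modules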
        from ..auto.configuration_auto import AutoConfig

        self.question_encoder = AutoConfig.for_model(question_encoder_model_type, **question_encoder_config)
        self.generator = AutoConfig.for_model(decoder_model_type, **decoder_config)

        self.reduce_loss = reduce_loss
        self.label_smoothing = label_smoothing
        self.exclude_bos_score = exclude_bos_score
        self.do_marginalize = do_marginalize

        self.title_sep = title_sep
        self.doc_sep = doc_sep
        self.n_docs = n_docs
        self.max_combined_length = max_combined_length

        self.dataset = dataset
        self.dataset_split = dataset_split
        self.index_name = index_name

        self.retrieval_vector_size = retrieval_vector_size
        self.retrieval_batch_size = retrieval_batch_size
        self.passages_path = passages_path
        self.index_path = index_path
        self.use_dummy_dataset = use_dummy_dataset
        self.dataset_revision = dataset_revision

        self.output_retrieved = output_retrieved

        self.do_deduplication = do_deduplication

        self.use_cache = use_cache
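
        # fall back to the generator's forced EOS token id when none was set on the RAG config itself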
        if self.forced_eos_token_id is None:
            self.forced_eos_token_id = getattr(self.generator, "forced_eos_token_id", None)

    @classmethod
    def from_question_encoder_generator_configs(
        cls, question_encoder_config: PretrainedConfig, generator_config: PretrainedConfig, **kwargs
    ) -> PretrainedConfig:
        r"""
        Instantiate a [`RagConfig`] (or a derived class) from a pre-trained question encoder model configuration and
        a generator model configuration.

        Returns:
            [`RagConfig`]: An instance of a configuration object
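
        Example (a minimal sketch; the checkpoint names below are standard public models, used here for
        illustration):

        ```python
        >>> from transformers import AutoConfig, RagConfig

        >>> # build sub-configurations for a DPR question encoder and a BART generator
        >>> question_encoder_config = AutoConfig.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
        >>> generator_config = AutoConfig.from_pretrained("facebook/bart-large")

        >>> config = RagConfig.from_question_encoder_generator_configs(
        ...     question_encoder_config, generator_config, n_docs=5
        ... )
        ```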
        """
        return cls(question_encoder=question_encoder_config.to_dict(), generator=generator_config.to_dict(), **kwargs)