# Generated content DO NOT EDIT
class AddedToken:
    """
    Represents a token that can be added to a :class:`~tokenizers.Tokenizer`.
    It can have special options that define the way it should behave.

    Args:
        content (:obj:`str`): The content of the token

        single_word (:obj:`bool`, defaults to :obj:`False`):
            Defines whether this token should only match single words. If :obj:`True`, this
            token will never match inside of a word. For example the token ``ing`` would match
            on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
            The notion of "`inside of a word`" is defined by the word boundaries pattern in
            regular expressions (ie. the token should start and end with word boundaries).

        lstrip (:obj:`bool`, defaults to :obj:`False`):
            Defines whether this token should strip all potential whitespaces on its left side.
            If :obj:`True`, this token will greedily match any whitespace on its left. For
            example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
            ``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).

        rstrip (:obj:`bool`, defaults to :obj:`False`):
            Defines whether this token should strip all potential whitespaces on its right
            side. If :obj:`True`, this token will greedily match any whitespace on its right.
            It works just like :obj:`lstrip` but on the right.

        normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
            Defines whether this token should match against the normalized version of the input
            text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
            lowercasing the text, the token could be extracted from the input ``"I saw a lion
            Yesterday"``.

        special (:obj:`bool`, defaults to :obj:`False`):
            Defines whether this token should be skipped when decoding.
    """

    def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
        pass

    @property
    def content(self):
        """
        Get the content of this :obj:`AddedToken`
        """
        pass

    @property
    def lstrip(self):
        """
        Get the value of the :obj:`lstrip` option
        """
        pass

    @property
    def normalized(self):
        """
        Get the value of the :obj:`normalized` option
        """
        pass

    @property
    def rstrip(self):
        """
        Get the value of the :obj:`rstrip` option
        """
        pass

    @property
    def single_word(self):
        """
        Get the value of the :obj:`single_word` option
        """
        pass

    @property
    def special(self):
        """
        Get the value of the :obj:`special` option
        """
        pass

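# Illustrative sketch (not part of the generated stub): constructing an AddedToken and
# registering it as a special token. The pretrained identifier is an assumption, and the
# calls only do real work against the compiled `tokenizers` package, not these stubs.
def _example_added_token():
    from tokenizers import AddedToken, Tokenizer

    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")  # hypothetical checkpoint
    entity = AddedToken("[ENT]", single_word=True, lstrip=True)
    # Returns the number of tokens that were actually created in the vocabulary.
    return tokenizer.add_special_tokens([entity])
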
class Encoding:
    """
    The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
    """

    @property
    def attention_mask(self):
        """
        The attention mask

        This indicates to the LM which tokens should be attended to, and which should not.
        This is especially important when batching sequences, where we need to apply
        padding.

        Returns:
            :obj:`List[int]`: The attention mask
        """
        pass

    def char_to_token(self, char_pos, sequence_index=0):
        """
        Get the token that contains the char at the given position in the input sequence.

        Args:
            char_pos (:obj:`int`):
                The position of a char in the input string
            sequence_index (:obj:`int`, defaults to :obj:`0`):
                The index of the sequence that contains the target char

        Returns:
            :obj:`int`: The index of the token that contains this char in the encoded sequence
        """
        pass

    def char_to_word(self, char_pos, sequence_index=0):
        """
        Get the word that contains the char at the given position in the input sequence.

        Args:
            char_pos (:obj:`int`):
                The position of a char in the input string
            sequence_index (:obj:`int`, defaults to :obj:`0`):
                The index of the sequence that contains the target char

        Returns:
            :obj:`int`: The index of the word that contains this char in the input sequence
        """
        pass

    @property
    def ids(self):
        """
        The generated IDs

        The IDs are the main input to a Language Model. They are the token indices,
        the numerical representations that a LM understands.

        Returns:
            :obj:`List[int]`: The list of IDs
        """
        pass

    @staticmethod
    def merge(encodings, growing_offsets=True):
        """
        Merge the list of encodings into one final :class:`~tokenizers.Encoding`

        Args:
            encodings (A :obj:`List` of :class:`~tokenizers.Encoding`):
                The list of encodings that should be merged into one

            growing_offsets (:obj:`bool`, defaults to :obj:`True`):
                Whether the offsets should accumulate while merging

        Returns:
            :class:`~tokenizers.Encoding`: The resulting Encoding
        """
        pass

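    # Illustrative sketch (not part of the generated API): merging chunk-level encodings
    # back into a single Encoding. The pretrained identifier is an assumption, and the
    # comment on ``growing_offsets`` reflects the documented "accumulate" behavior.
    @staticmethod
    def _example_merge():
        from tokenizers import Encoding, Tokenizer

        tokenizer = Tokenizer.from_pretrained("bert-base-uncased")  # hypothetical checkpoint
        chunks = tokenizer.encode_batch(["first chunk", "second chunk"], add_special_tokens=False)
        # With growing_offsets=True, offsets keep accumulating across chunks, as if
        # both chunks came from one contiguous string.
        merged = Encoding.merge(chunks, growing_offsets=True)
        return merged.tokens, merged.offsets
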
    @property
    def n_sequences(self):
        """
        The number of sequences represented

        Returns:
            :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
        """
        pass

    @property
    def offsets(self):
        """
        The offsets associated with each token

        These offsets let you slice the input string, and thus retrieve the original
        part that led to producing the corresponding token.

        Returns:
            A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
        """
        pass

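    # Illustrative sketch (not part of the generated API): using ``offsets`` to recover,
    # for each token, the exact slice of the original text it was produced from.
    # The pretrained identifier is an assumption.
    @staticmethod
    def _example_offsets():
        from tokenizers import Tokenizer

        tokenizer = Tokenizer.from_pretrained("bert-base-uncased")  # hypothetical checkpoint
        text = "Tokenizers are fast"
        encoding = tokenizer.encode(text)
        return [text[start:end] for start, end in encoding.offsets]
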
    @property
    def overflowing(self):
        """
        A :obj:`List` of overflowing :class:`~tokenizers.Encoding`

        When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
        the output into as many pieces as required to match the specified maximum length.
        This field lets you retrieve all the subsequent pieces.

        When you use pairs of sequences, the overflowing pieces will contain enough
        variations to cover all the possible combinations, while respecting the provided
        maximum length.
        """
        pass

    def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"):
        """
        Pad the :class:`~tokenizers.Encoding` at the given length

        Args:
            length (:obj:`int`):
                The desired length

            direction (:obj:`str`, defaults to :obj:`right`):
                The expected padding direction. Can be either :obj:`right` or :obj:`left`

            pad_id (:obj:`int`, defaults to :obj:`0`):
                The ID corresponding to the padding token

            pad_type_id (:obj:`int`, defaults to :obj:`0`):
                The type ID corresponding to the padding token

            pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
                The pad token to use
        """
        pass

    @property
    def sequence_ids(self):
        """
        The generated sequence indices.

        They represent the index of the input sequence associated with each token.
        The sequence id can be None if the token is not related to any input sequence,
        like for example with special tokens.

        Returns:
            A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence indices.
        """
        pass

    def set_sequence_id(self, sequence_id):
        """
        Set the given sequence index

        Set the given sequence index for the whole range of tokens contained in this
        :class:`~tokenizers.Encoding`.
        """
        pass

    @property
    def special_tokens_mask(self):
        """
        The special token mask

        This indicates which tokens are special tokens, and which are not.

        Returns:
            :obj:`List[int]`: The special tokens mask
        """
        pass

    def token_to_chars(self, token_index):
        """
        Get the offsets of the token at the given index.

        The returned offsets are related to the input sequence that contains the
        token. In order to determine to which input sequence it belongs, you
        must call :meth:`~tokenizers.Encoding.token_to_sequence()`.

        Args:
            token_index (:obj:`int`):
                The index of a token in the encoded sequence.

        Returns:
            :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)`
        """
        pass

    def token_to_sequence(self, token_index):
        """
        Get the index of the sequence represented by the given token.

        In the general use case, this method returns :obj:`0` for a single sequence or
        the first sequence of a pair, and :obj:`1` for the second sequence of a pair

        Args:
            token_index (:obj:`int`):
                The index of a token in the encoded sequence.

        Returns:
            :obj:`int`: The sequence id of the given token
        """
        pass

    def token_to_word(self, token_index):
        """
        Get the index of the word that contains the token in one of the input sequences.

        The returned word index is related to the input sequence that contains
        the token. In order to determine to which input sequence it belongs, you
        must call :meth:`~tokenizers.Encoding.token_to_sequence()`.

        Args:
            token_index (:obj:`int`):
                The index of a token in the encoded sequence.

        Returns:
            :obj:`int`: The index of the word in the relevant input sequence.
        """
        pass

    @property
    def tokens(self):
        """
        The generated tokens

        They are the string representation of the IDs.

        Returns:
            :obj:`List[str]`: The list of tokens
        """
        pass

    def truncate(self, max_length, stride=0, direction="right"):
        """
        Truncate the :class:`~tokenizers.Encoding` at the given length

        If this :class:`~tokenizers.Encoding` represents multiple sequences, this
        information is lost when truncating: the result will be considered as
        representing a single sequence.

        Args:
            max_length (:obj:`int`):
                The desired length

            stride (:obj:`int`, defaults to :obj:`0`):
                The length of previous content to be included in each overflowing piece

            direction (:obj:`str`, defaults to :obj:`right`):
                Truncate direction
        """
        pass

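    # Illustrative sketch (not part of the generated API): truncating an Encoding and then
    # padding it back to a fixed size. The pretrained identifier is an assumption.
    @staticmethod
    def _example_truncate_and_pad():
        from tokenizers import Tokenizer

        tokenizer = Tokenizer.from_pretrained("bert-base-uncased")  # hypothetical checkpoint
        encoding = tokenizer.encode("a fairly long input sentence that we want to shorten")
        encoding.truncate(8, stride=2)  # the overflow ends up in encoding.overflowing
        encoding.pad(16, direction="right", pad_token="[PAD]")
        return encoding.ids, encoding.attention_mask
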
    @property
    def type_ids(self):
        """
        The generated type IDs

        Generally used for tasks like sequence classification or question answering,
        these IDs let the LM know which input sequence corresponds to each token.

        Returns:
            :obj:`List[int]`: The list of type ids
        """
        pass

    @property
    def word_ids(self):
        """
        The generated word indices.

        They represent the index of the word associated with each token.
        When the input is pre-tokenized, they correspond to the ID of the given input label,
        otherwise they correspond to the word indices as defined by the
        :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.

        For special tokens and such (any token that was generated from something that was
        not part of the input), the output is :obj:`None`

        Returns:
            A :obj:`List` of :obj:`Optional[int]`: A list of optional word indices.
        """
        pass

    def word_to_chars(self, word_index, sequence_index=0):
        """
        Get the offsets of the word at the given index in one of the input sequences.

        Args:
            word_index (:obj:`int`):
                The index of a word in one of the input sequences.
            sequence_index (:obj:`int`, defaults to :obj:`0`):
                The index of the sequence that contains the target word

        Returns:
            :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
        """
        pass

    def word_to_tokens(self, word_index, sequence_index=0):
        """
        Get the encoded tokens corresponding to the word at the given index
        in one of the input sequences.

        Args:
            word_index (:obj:`int`):
                The index of a word in one of the input sequences.
            sequence_index (:obj:`int`, defaults to :obj:`0`):
                The index of the sequence that contains the target word

        Returns:
            :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
        """
        pass

    @property
    def words(self):
        """
        The generated word indices.

        .. warning::
            This is deprecated and will be removed in a future version.
            Please use :obj:`~tokenizers.Encoding.word_ids` instead.

        They represent the index of the word associated with each token.
        When the input is pre-tokenized, they correspond to the ID of the given input label,
        otherwise they correspond to the word indices as defined by the
        :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.

        For special tokens and such (any token that was generated from something that was
        not part of the input), the output is :obj:`None`

        Returns:
            A :obj:`List` of :obj:`Optional[int]`: A list of optional word indices.
        """
        pass

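# Illustrative sketch (not part of the generated stub): the alignment helpers of
# :class:`Encoding` used together. The pretrained identifier is an assumption; the calls
# only do real work against the compiled `tokenizers` package, not these stubs.
def _example_encoding_alignment():
    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")  # hypothetical checkpoint
    encoding = tokenizer.encode("Tokenization keeps alignment")
    first_word_tokens = encoding.word_to_tokens(0)  # (first, last + 1) token range of word 0
    token_for_char = encoding.char_to_token(3)      # token containing the 4th character
    word_per_token = encoding.word_ids              # word index (or None) for every token
    return first_word_tokens, token_for_char, word_per_token
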
class NormalizedString:
    """
    NormalizedString

    A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one.
    While making all the requested modifications, it keeps track of the alignment information
    between the two versions of the string.

    Args:
        sequence: str:
            The string sequence used to initialize this NormalizedString
    """

    def append(self, s):
        """
        Append the given sequence to the string
        """
        pass

    def clear(self):
        """
        Clears the string
        """
        pass

    def filter(self, func):
        """
        Filter each character of the string using the given func
        """
        pass

    def for_each(self, func):
        """
        Calls the given function for each character of the string
        """
        pass

    def lowercase(self):
        """
        Lowercase the string
        """
        pass

    def lstrip(self):
        """
        Strip the left of the string
        """
        pass

    def map(self, func):
        """
        Calls the given function for each character of the string

        Replaces each character of the string using the returned value. Each
        returned value **must** be a str of length 1 (ie a character).
        """
        pass

    def nfc(self):
        """
        Runs the NFC normalization
        """
        pass

    def nfd(self):
        """
        Runs the NFD normalization
        """
        pass

    def nfkc(self):
        """
        Runs the NFKC normalization
        """
        pass

    def nfkd(self):
        """
        Runs the NFKD normalization
        """
        pass

    @property
    def normalized(self):
        """
        The normalized part of the string
        """
        pass

    def prepend(self, s):
        """
        Prepend the given sequence to the string
        """
        pass

    def replace(self, pattern, content):
        """
        Replace the content of the given pattern with the provided content

        Args:
            pattern: Pattern:
                A pattern used to match the string. Usually a string or a Regex

            content: str:
                The content to be used as replacement
        """
        pass

    def rstrip(self):
        """
        Strip the right of the string
        """
        pass

    def slice(self, range):
        """
        Slice the string using the given range
        """
        pass

    def split(self, pattern, behavior):
        """
        Split the NormalizedString using the given pattern and the specified behavior

        Args:
            pattern: Pattern:
                A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`

            behavior: SplitDelimiterBehavior:
                The behavior to use when splitting.
                Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
                "contiguous"

        Returns:
            A list of NormalizedString, representing each split
        """
        pass

    def strip(self):
        """
        Strip both ends of the string
        """
        pass

    def uppercase(self):
        """
        Uppercase the string
        """
        pass

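# Illustrative sketch (not part of the generated stub): chaining NormalizedString
# operations as one would inside a custom normalizer. Assumes the compiled `tokenizers`
# package; the whitespace-collapsing pattern is an example choice.
def _example_normalized_string():
    from tokenizers import NormalizedString, Regex

    normalized = NormalizedString("  Héllo,   World!  ")
    normalized.nfd()
    normalized.lowercase()
    normalized.strip()
    normalized.replace(Regex(r"\s+"), " ")  # collapse runs of whitespace
    # `.normalized` exposes the modified text; alignment to the original is kept internally.
    return normalized.normalized
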
class PreTokenizedString:
    """
    PreTokenizedString

    Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
    underlying string, while keeping track of the alignment information (offsets).

    The PreTokenizedString manages what we call `splits`. Each split represents a substring
    which is a subpart of the original string, with the relevant offsets and tokens.

    When calling one of the methods used to modify the PreTokenizedString (namely one of
    `split`, `normalize` or `tokenize`), only the `splits` that don't have any associated
    tokens will get modified.

    Args:
        sequence: str:
            The string sequence used to initialize this PreTokenizedString
    """

    def __init__(self, sequence):
        pass

    def get_splits(self, offset_referential="original", offset_type="char"):
        """
        Get the splits currently managed by the PreTokenizedString

        Args:
            offset_referential: :obj:`str`
                Whether the returned splits should have offsets expressed relative
                to the original string, or the normalized one. choices: "original", "normalized".

            offset_type: :obj:`str`
                Whether the returned splits should have offsets expressed in bytes or chars.
                When slicing a str, we usually want to use chars, which is the default value.
                Now in some cases it might be interesting to get these offsets expressed in bytes,
                so it is possible to change this here.
                choices: "char", "bytes"

        Returns:
            A list of splits
        """
        pass

    def normalize(self, func):
        """
        Normalize each split of the `PreTokenizedString` using the given `func`

        Args:
            func: Callable[[NormalizedString], None]:
                The function used to normalize each underlying split. This function
                does not need to return anything; simply calling the methods on the provided
                NormalizedString modifies it in place.
        """
        pass

    def split(self, func):
        """
        Split the PreTokenizedString using the given `func`

        Args:
            func: Callable[[index, NormalizedString], List[NormalizedString]]:
                The function used to split each underlying split.
                It is expected to return a list of `NormalizedString`, that represent the new
                splits. If the given `NormalizedString` does not need any splitting, we can
                just return it directly.
                In order for the offsets to be tracked accurately, any returned `NormalizedString`
                should come from calling either `.split` or `.slice` on the received one.
        """
        pass

    def to_encoding(self, type_id=0, word_idx=None):
        """
        Return an Encoding generated from this PreTokenizedString

        Args:
            type_id: int = 0:
                The type_id to be used on the generated Encoding.

            word_idx: Optional[int] = None:
                An optional word index to be used for each token of this Encoding. If provided,
                all the word indices in the generated Encoding will use this value, instead
                of the one automatically tracked during pre-tokenization.

        Returns:
            An Encoding
        """
        pass

    def tokenize(self, func):
        """
        Tokenize each split of the `PreTokenizedString` using the given `func`

        Args:
            func: Callable[[str], List[Token]]:
                The function used to tokenize each underlying split. This function must return
                a list of Token generated from the input str.
        """
        pass

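# Illustrative sketch (not part of the generated stub): splitting a PreTokenizedString on
# whitespace, as a custom pre-tokenizer would. Assumes the compiled `tokenizers` package
# and that the string form of SplitDelimiterBehavior ("removed") is accepted, as the
# docstring's Choices suggest.
def _example_pre_tokenized_string():
    from tokenizers import PreTokenizedString

    pre_tokenized = PreTokenizedString("Hello world from Rust")
    # Split every split that has no tokens yet, dropping the whitespace delimiter itself.
    pre_tokenized.split(lambda i, normalized: normalized.split(" ", "removed"))
    return pre_tokenized.get_splits(offset_referential="original", offset_type="char")
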
class Regex:
    """
    Instantiate a new Regex with the given pattern
    """

    def __init__(self, pattern):
        pass

class Token:
    pass

class Tokenizer:
    """
    A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
    and outputs an :class:`~tokenizers.Encoding`.

    Args:
        model (:class:`~tokenizers.models.Model`):
            The core algorithm that this :obj:`Tokenizer` should be using.
    """

    def __init__(self, model):
        pass

    def add_special_tokens(self, tokens):
        """
        Add the given special tokens to the Tokenizer.

        If these tokens are already part of the vocabulary, this just lets the Tokenizer know
        about them. If they don't exist, the Tokenizer creates them, giving them a new id.

        These special tokens will never be processed by the model (ie won't be split into
        multiple tokens), and they can be removed from the output when decoding.

        Args:
            tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
                The list of special tokens we want to add to the vocabulary. Each token can either
                be a string or an instance of :class:`~tokenizers.AddedToken` for more
                customization.

        Returns:
            :obj:`int`: The number of tokens that were created in the vocabulary
        """
        pass

    def add_tokens(self, tokens):
        """
        Add the given tokens to the vocabulary

        The given tokens are added only if they don't already exist in the vocabulary.
        Each token then gets a newly attributed id.

        Args:
            tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
                The list of tokens we want to add to the vocabulary. Each token can be either a
                string or an instance of :class:`~tokenizers.AddedToken` for more customization.

        Returns:
            :obj:`int`: The number of tokens that were created in the vocabulary
        """
        pass

    def decode(self, ids, skip_special_tokens=True):
        """
        Decode the given list of ids back to a string

        This is used to decode anything coming back from a Language Model

        Args:
            ids (A :obj:`List/Tuple` of :obj:`int`):
                The list of ids that we want to decode

            skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
                Whether the special tokens should be removed from the decoded string

        Returns:
            :obj:`str`: The decoded string
        """
        pass

    def decode_batch(self, sequences, skip_special_tokens=True):
        """
        Decode a batch of ids back to their corresponding string

        Args:
            sequences (:obj:`List` of :obj:`List[int]`):
                The batch of sequences we want to decode

            skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
                Whether the special tokens should be removed from the decoded strings

        Returns:
            :obj:`List[str]`: A list of decoded strings
        """
        pass

    @property
    def decoder(self):
        """
        The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
        """
        pass

    def enable_padding(
        self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
    ):
        """
        Enable the padding

        Args:
            direction (:obj:`str`, `optional`, defaults to :obj:`right`):
                The direction in which to pad. Can be either ``right`` or ``left``

            pad_to_multiple_of (:obj:`int`, `optional`):
                If specified, the padding length should always snap to the next multiple of the
                given value. For example if we were going to pad with a length of 250 but
                ``pad_to_multiple_of=8`` then we will pad to 256.

            pad_id (:obj:`int`, defaults to 0):
                The id to be used when padding

            pad_type_id (:obj:`int`, defaults to 0):
                The type id to be used when padding

            pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
                The pad token to be used when padding

            length (:obj:`int`, `optional`):
                If specified, the length at which to pad. If not specified we pad using the size of
                the longest sequence in a batch.
        """
        pass

    def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"):
        """
        Enable truncation

        Args:
            max_length (:obj:`int`):
                The max length at which to truncate

            stride (:obj:`int`, `optional`):
                The length of the previous first sequence to be included in the overflowing
                sequence

            strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
                The strategy used for truncation. Can be one of ``longest_first``, ``only_first`` or
                ``only_second``.

            direction (:obj:`str`, defaults to :obj:`right`):
                Truncate direction
        """
        pass

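    # Illustrative sketch (not part of the generated API): a typical truncation/padding
    # configuration, written as an instance method for documentation purposes only; the
    # chosen lengths and pad token are example values.
    def _example_truncation_and_padding(self):
        self.enable_truncation(max_length=128, stride=16, strategy="longest_first")
        self.enable_padding(pad_token="[PAD]", pad_id=0, pad_to_multiple_of=8)
        batch = self.encode_batch(["short", "a noticeably longer input sentence"])
        return [encoding.ids for encoding in batch]
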
    def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True):
        """
        Encode the given sequence and pair. This method can process raw text sequences
        as well as already pre-tokenized sequences.

        Example:
            Here are some examples of the inputs that are accepted::

                encode("A single sequence")
                encode("A sequence", "And its pair")
                encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)
                encode(
                    [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
                    is_pretokenized=True
                )

        Args:
            sequence (:obj:`~tokenizers.InputSequence`):
                The main input sequence we want to encode. This sequence can be either raw
                text or pre-tokenized, according to the ``is_pretokenized`` argument:

                - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`

            pair (:obj:`~tokenizers.InputSequence`, `optional`):
                An optional input sequence. The expected format is the same as for ``sequence``.

            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
                Whether the input is already pre-tokenized

            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
                Whether to add the special tokens

        Returns:
            :class:`~tokenizers.Encoding`: The encoded result
        """
        pass

    def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True):
        """
        Encode the given batch of inputs. This method accepts both raw text sequences
        as well as already pre-tokenized sequences.

        Example:
            Here are some examples of the inputs that are accepted::

                encode_batch([
                    "A single sequence",
                    ("A tuple with a sequence", "And its pair"),
                    [ "A", "pre", "tokenized", "sequence" ],
                    ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
                ])

        Args:
            input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
                A list of single sequences or pair sequences to encode. Each sequence
                can be either raw text or pre-tokenized, according to the ``is_pretokenized``
                argument:

                - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
                - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`

            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
                Whether the input is already pre-tokenized

            add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
                Whether to add the special tokens

        Returns:
            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
        """
        pass

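    # Illustrative sketch (not part of the generated API): the accepted input shapes for
    # ``encode``/``encode_batch`` and a decode round trip, written as an instance method
    # for documentation purposes; it is only meaningful on a real, compiled Tokenizer.
    def _example_encode_and_decode(self):
        single = self.encode("A single sequence")
        pair = self.encode("A sequence", "And its pair")
        pretok = self.encode(["A", "pre", "tokenized", "sequence"], is_pretokenized=True)
        batch = self.encode_batch(["first", ("question", "context")])
        text = self.decode(single.ids, skip_special_tokens=True)
        return single.tokens, pair.type_ids, pretok.word_ids, len(batch), text
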
    @property
    def encode_special_tokens(self):
        """
        Modifies whether the tokenizer uses the special tokens during encoding.

        Args:
            value (:obj:`bool`):
                Whether to use the special tokens or not
        """
        pass

    @staticmethod
    def from_buffer(buffer):
        """
        Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.

        Args:
            buffer (:obj:`bytes`):
                A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`

        Returns:
            :class:`~tokenizers.Tokenizer`: The new tokenizer
        """
        pass

    @staticmethod
    def from_file(path):
        """
        Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.

        Args:
            path (:obj:`str`):
                A path to a local JSON file representing a previously serialized
                :class:`~tokenizers.Tokenizer`

        Returns:
            :class:`~tokenizers.Tokenizer`: The new tokenizer
        """
        pass

    @staticmethod
    def from_pretrained(identifier, revision="main", auth_token=None):
        """
        Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
        Hugging Face Hub.

        Args:
            identifier (:obj:`str`):
                The identifier of a Model on the Hugging Face Hub that contains
                a tokenizer.json file
            revision (:obj:`str`, defaults to `main`):
                A branch or commit id
            auth_token (:obj:`str`, `optional`, defaults to `None`):
                An optional auth token used to access private repositories on the
                Hugging Face Hub

        Returns:
            :class:`~tokenizers.Tokenizer`: The new tokenizer
        """
        pass

    @staticmethod
    def from_str(json):
        """
        Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.

        Args:
            json (:obj:`str`):
                A valid JSON string representing a previously serialized
                :class:`~tokenizers.Tokenizer`

        Returns:
            :class:`~tokenizers.Tokenizer`: The new tokenizer
        """
        pass

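    # Illustrative sketch (not part of the generated API): the deserialization entry points
    # mirror each other. The local path and Hub identifier are assumptions for the example,
    # and the calls only work against the compiled `tokenizers` package.
    @staticmethod
    def _example_loading():
        from tokenizers import Tokenizer

        from_file = Tokenizer.from_file("tokenizer.json")     # local JSON file
        from_string = Tokenizer.from_str(from_file.to_str())  # raw JSON string
        from_hub = Tokenizer.from_pretrained("bert-base-uncased", revision="main")
        return from_string.get_vocab_size(), from_hub.get_vocab_size()
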
    def get_added_tokens_decoder(self):
        """
        Get the added tokens, as a mapping from id to :class:`~tokenizers.AddedToken`

        Returns:
            :obj:`Dict[int, AddedToken]`: The added tokens, keyed by id
        """
        pass

    def get_vocab(self, with_added_tokens=True):
        """
        Get the underlying vocabulary

        Args:
            with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
                Whether to include the added tokens

        Returns:
            :obj:`Dict[str, int]`: The vocabulary
        """
        pass

    def get_vocab_size(self, with_added_tokens=True):
        """
        Get the size of the underlying vocabulary

        Args:
            with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
                Whether to include the added tokens

        Returns:
            :obj:`int`: The size of the vocabulary
        """
        pass

    def id_to_token(self, id):
        """
        Convert the given id to its corresponding token if it exists

        Args:
            id (:obj:`int`):
                The id to convert

        Returns:
            :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
        """
        pass

    @property
    def model(self):
        """
        The :class:`~tokenizers.models.Model` in use by the Tokenizer
        """
        pass

    def no_padding(self):
        """
        Disable padding
        """
        pass

    def no_truncation(self):
        """
        Disable truncation
        """
        pass

    @property
    def normalizer(self):
        """
        The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
        """
        pass

    def num_special_tokens_to_add(self, is_pair):
        """
        Return the number of special tokens that would be added for single/pair sentences.

        Args:
            is_pair (:obj:`bool`):
                Whether the input would be a pair of sequences (:obj:`True`) or a single
                sentence (:obj:`False`)

        Returns:
            :obj:`int`: The number of special tokens that would be added
        """
        pass

    @property
    def padding(self):
        """
        Get the current padding parameters

        `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`

        Returns:
            (:obj:`dict`, `optional`):
                A dict with the current padding parameters if padding is enabled
        """
        pass

    def post_process(self, encoding, pair=None, add_special_tokens=True):
        """
        Apply all the post-processing steps to the given encodings.

        The various steps are:

        1. Truncate according to the set truncation params (provided with
           :meth:`~tokenizers.Tokenizer.enable_truncation`)
        2. Apply the :class:`~tokenizers.processors.PostProcessor`
        3. Pad according to the set padding params (provided with
           :meth:`~tokenizers.Tokenizer.enable_padding`)

        Args:
            encoding (:class:`~tokenizers.Encoding`):
                The :class:`~tokenizers.Encoding` corresponding to the main sequence.

            pair (:class:`~tokenizers.Encoding`, `optional`):
                An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.

            add_special_tokens (:obj:`bool`):
                Whether to add the special tokens

        Returns:
            :class:`~tokenizers.Encoding`: The final post-processed encoding
        """
        pass

    @property
    def post_processor(self):
        """
        The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
        """
        pass

    @property
    def pre_tokenizer(self):
        """
        The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
        """
        pass

    def save(self, path, pretty=True):
        """
        Save the :class:`~tokenizers.Tokenizer` to the file at the given path.

        Args:
            path (:obj:`str`):
                A path to a file in which to save the serialized tokenizer.

            pretty (:obj:`bool`, defaults to :obj:`True`):
                Whether the JSON file should be pretty formatted.
        """
        pass

    def to_str(self, pretty=False):
        """
        Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.

        Args:
            pretty (:obj:`bool`, defaults to :obj:`False`):
                Whether the JSON string should be pretty formatted.

        Returns:
            :obj:`str`: A string representing the serialized Tokenizer
        """
        pass

    def token_to_id(self, token):
        """
        Convert the given token to its corresponding id if it exists

        Args:
            token (:obj:`str`):
                The token to convert

        Returns:
            :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
        """
        pass

    def train(self, files, trainer=None):
        """
        Train the Tokenizer using the given files.

        Reads the files line by line, while keeping all the whitespace, even new lines.
        If you want to train from data stored in memory, you can check
        :meth:`~tokenizers.Tokenizer.train_from_iterator`

        Args:
            files (:obj:`List[str]`):
                A list of paths to the files that we should use for training

            trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
                An optional trainer that should be used to train our Model
        """
        pass

    def train_from_iterator(self, iterator, trainer=None, length=None):
        """
        Train the Tokenizer using the provided iterator.

        You can provide anything that is a Python Iterator

        * A list of sequences :obj:`List[str]`
        * A generator that yields :obj:`str` or :obj:`List[str]`
        * A Numpy array of strings
        * ...

        Args:
            iterator (:obj:`Iterator`):
                Any iterator over strings or list of strings

            trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
                An optional trainer that should be used to train our Model

            length (:obj:`int`, `optional`):
                The total number of sequences in the iterator. This is used to
                provide meaningful progress tracking
        """
        pass

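    # Illustrative sketch (not part of the generated API): training a small BPE tokenizer
    # from an in-memory corpus. The vocabulary size, special tokens, and output path are
    # example choices; the calls only work against the compiled `tokenizers` package.
    @staticmethod
    def _example_train_from_iterator():
        from tokenizers import Tokenizer
        from tokenizers.models import BPE
        from tokenizers.pre_tokenizers import Whitespace
        from tokenizers.trainers import BpeTrainer

        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = BpeTrainer(vocab_size=1000, special_tokens=["[UNK]", "[PAD]"])
        corpus = ["an in-memory corpus", "made of plain strings"]
        # `length` is optional; it only drives progress reporting.
        tokenizer.train_from_iterator(corpus, trainer=trainer, length=len(corpus))
        tokenizer.save("tokenizer.json")
        return tokenizer.get_vocab_size()
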
    @property
    def truncation(self):
        """
        Get the currently set truncation parameters

        `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`

        Returns:
            (:obj:`dict`, `optional`):
                A dict with the current truncation parameters if truncation is enabled
        """
        pass