# Generated content DO NOT EDIT
|
|
class Trainer:
    """
    Common base class shared by every trainer.

    Do not construct this class directly: each concrete trainer
    implementation returns an instance of this class when it is
    instantiated.
    """
|
|
|
|
class BpeTrainer(Trainer):
    """
    Trainer capable of training a BPE model

    Args:
        vocab_size (:obj:`int`, `optional`):
            The size of the final vocabulary, including all tokens and alphabet.

        min_frequency (:obj:`int`, `optional`):
            The minimum frequency a pair should have in order to be merged.

        show_progress (:obj:`bool`, `optional`):
            Whether to show progress bars while training.

        special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
            A list of special tokens the model should know of.

        limit_alphabet (:obj:`int`, `optional`):
            The maximum number of different characters to keep in the alphabet.

        initial_alphabet (:obj:`List[str]`, `optional`):
            A list of characters to include in the initial alphabet, even
            if not seen in the training dataset.
            If the strings contain more than one character, only the first one
            is kept.

        continuing_subword_prefix (:obj:`str`, `optional`):
            A prefix to be used for every subword that is not a beginning-of-word.

        end_of_word_suffix (:obj:`str`, `optional`):
            A suffix to be used for every subword that is an end-of-word.

        max_token_length (:obj:`int`, `optional`):
            Prevents creating tokens longer than the specified size.
            This can help with reducing the pollution of your vocabulary with
            highly repetitive tokens like `======` for wikipedia
    """
|
|
|
|
class UnigramTrainer(Trainer):
    """
    Trainer capable of training a Unigram model

    Args:
        vocab_size (:obj:`int`):
            The size of the final vocabulary, including all tokens and alphabet.

        show_progress (:obj:`bool`):
            Whether to show progress bars while training.

        special_tokens (:obj:`List[Union[str, AddedToken]]`):
            A list of special tokens the model should know of.

        initial_alphabet (:obj:`List[str]`):
            A list of characters to include in the initial alphabet, even
            if not seen in the training dataset.
            If the strings contain more than one character, only the first one
            is kept.

        shrinking_factor (:obj:`float`):
            The shrinking factor used at each step of the training to prune the
            vocabulary.

        unk_token (:obj:`str`):
            The token used for out-of-vocabulary tokens.

        max_piece_length (:obj:`int`):
            The maximum length of a given token.

        n_sub_iterations (:obj:`int`):
            The number of iterations of the EM algorithm to perform before
            pruning the vocabulary.
    """

    # NOTE(review): generated stub — the empty body suggests the real logic is
    # provided elsewhere (presumably a compiled extension; confirm). The mutable
    # default list for `special_tokens` is inert here since the body never runs
    # and mirrors the generated signature, so it is intentionally left as-is.
    def __init__(
        self,
        vocab_size=8000,
        show_progress=True,
        special_tokens=[],
        shrinking_factor=0.75,
        unk_token=None,
        max_piece_length=16,
        n_sub_iterations=2,
    ):
        pass
|
|
|
|
class WordLevelTrainer(Trainer):
    """
    Trainer capable of training a WordLevel model

    Args:
        vocab_size (:obj:`int`, `optional`):
            The size of the final vocabulary, including all tokens and alphabet.

        min_frequency (:obj:`int`, `optional`):
            The minimum frequency a pair should have in order to be merged.

        show_progress (:obj:`bool`, `optional`):
            Whether to show progress bars while training.

        special_tokens (:obj:`List[Union[str, AddedToken]]`):
            A list of special tokens the model should know of.
    """
|
|
|
|
class WordPieceTrainer(Trainer):
    """
    Trainer capable of training a WordPiece model

    Args:
        vocab_size (:obj:`int`, `optional`):
            The size of the final vocabulary, including all tokens and alphabet.

        min_frequency (:obj:`int`, `optional`):
            The minimum frequency a pair should have in order to be merged.

        show_progress (:obj:`bool`, `optional`):
            Whether to show progress bars while training.

        special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
            A list of special tokens the model should know of.

        limit_alphabet (:obj:`int`, `optional`):
            The maximum number of different characters to keep in the alphabet.

        initial_alphabet (:obj:`List[str]`, `optional`):
            A list of characters to include in the initial alphabet, even
            if not seen in the training dataset.
            If the strings contain more than one character, only the first one
            is kept.

        continuing_subword_prefix (:obj:`str`, `optional`):
            A prefix to be used for every subword that is not a beginning-of-word.

        end_of_word_suffix (:obj:`str`, `optional`):
            A suffix to be used for every subword that is an end-of-word.
    """

    # NOTE(review): generated stub — the empty body suggests the real logic is
    # provided elsewhere (presumably a compiled extension; confirm). The mutable
    # default lists (`special_tokens`, `initial_alphabet`) are inert here since
    # the body never runs and mirror the generated signature, so they are
    # intentionally left as-is.
    def __init__(
        self,
        vocab_size=30000,
        min_frequency=0,
        show_progress=True,
        special_tokens=[],
        limit_alphabet=None,
        initial_alphabet=[],
        continuing_subword_prefix="##",
        end_of_word_suffix=None,
    ):
        pass
|