from enum import Enum from typing import List, Tuple, Union Offsets = Tuple[int, int] TextInputSequence = str """A :obj:`str` that represents an input sequence """ PreTokenizedInputSequence = Union[List[str], Tuple[str]] """A pre-tokenized input sequence. Can be one of: - A :obj:`List` of :obj:`str` - A :obj:`Tuple` of :obj:`str` """ TextEncodeInput = Union[ TextInputSequence, Tuple[TextInputSequence, TextInputSequence], List[TextInputSequence], ] """Represents a textual input for encoding. Can be either: - A single sequence: :data:`~tokenizers.TextInputSequence` - A pair of sequences: - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence` - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2 """ PreTokenizedEncodeInput = Union[ PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence], List[PreTokenizedInputSequence], ] """Represents a pre-tokenized input for encoding. Can be either: - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence` - A pair of sequences: - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence` - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2 """ InputSequence = Union[TextInputSequence, PreTokenizedInputSequence] """Represents all the possible types of input sequences for encoding. Can be: - When ``is_pretokenized=False``: :data:`~TextInputSequence` - When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence` """ EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput] """Represents all the possible types of input for encoding. Can be: - When ``is_pretokenized=False``: :data:`~TextEncodeInput` - When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput` """ class OffsetReferential(Enum): ORIGINAL = "original" NORMALIZED = "normalized" class OffsetType(Enum): BYTE = "byte" CHAR = "char" class SplitDelimiterBehavior(Enum): REMOVED = "removed" ISOLATED = "isolated" MERGED_WITH_PREVIOUS = "merged_with_previous" MERGED_WITH_NEXT = "merged_with_next" CONTIGUOUS = "contiguous" from .tokenizers import ( AddedToken, Encoding, NormalizedString, PreTokenizedString, Regex, Token, Tokenizer, decoders, models, normalizers, pre_tokenizers, processors, trainers, __version__, ) from .implementations import ( BertWordPieceTokenizer, ByteLevelBPETokenizer, CharBPETokenizer, SentencePieceBPETokenizer, SentencePieceUnigramTokenizer, )