# ai-content-maker/.venv/Lib/site-packages/tokenizers/__init__.py

from enum import Enum
from typing import List, Tuple, Union

Offsets = Tuple[int, int]

TextInputSequence = str
"""A :obj:`str` that represents an input sequence"""

PreTokenizedInputSequence = Union[List[str], Tuple[str]]
"""A pre-tokenized input sequence. Can be one of:

    - A :obj:`List` of :obj:`str`
    - A :obj:`Tuple` of :obj:`str`
"""
TextEncodeInput = Union[
    TextInputSequence,
    Tuple[TextInputSequence, TextInputSequence],
    List[TextInputSequence],
]
"""Represents a textual input for encoding. Can be either:

    - A single sequence: :data:`~tokenizers.TextInputSequence`
    - A pair of sequences:

        - A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
        - Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
"""
PreTokenizedEncodeInput = Union[
    PreTokenizedInputSequence,
    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
    List[PreTokenizedInputSequence],
]
"""Represents a pre-tokenized input for encoding. Can be either:

    - A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
    - A pair of sequences:

        - A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
        - Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
"""
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
"""Represents all the possible types of input sequences for encoding. Can be:

    - When ``is_pretokenized=False``: :data:`~TextInputSequence`
    - When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence`
"""

EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
"""Represents all the possible types of input for encoding. Can be:

    - When ``is_pretokenized=False``: :data:`~TextEncodeInput`
    - When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput`
"""
class OffsetReferential(Enum):
    ORIGINAL = "original"
    NORMALIZED = "normalized"


class OffsetType(Enum):
    BYTE = "byte"
    CHAR = "char"


class SplitDelimiterBehavior(Enum):
    REMOVED = "removed"
    ISOLATED = "isolated"
    MERGED_WITH_PREVIOUS = "merged_with_previous"
    MERGED_WITH_NEXT = "merged_with_next"
    CONTIGUOUS = "contiguous"
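# Sketch: these enum values mirror the lowercase strings accepted by the Rust
# bindings, e.g. the ``behavior`` argument of ``pre_tokenizers.Split`` (the exact
# parameter form here is an assumption; check the pre_tokenizers docs):
#
#     split = pre_tokenizers.Split(pattern="-", behavior="isolated")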
from .tokenizers import (
    AddedToken,
    Encoding,
    NormalizedString,
    PreTokenizedString,
    Regex,
    Token,
    Tokenizer,
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    __version__,
)
from .implementations import (
    BertWordPieceTokenizer,
    ByteLevelBPETokenizer,
    CharBPETokenizer,
    SentencePieceBPETokenizer,
    SentencePieceUnigramTokenizer,
)
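# Minimal end-to-end usage sketch for the re-exported helpers (file names are
# placeholders, not shipped with the package):
#
#     tokenizer = ByteLevelBPETokenizer()
#     tokenizer.train(files=["corpus.txt"], vocab_size=30_000, min_frequency=2)
#     encoding = tokenizer.encode("Hello, world!")
#     print(encoding.tokens, encoding.ids)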