Files
MoFin/venv/lib/python3.12/site-packages/tokenizers/__init__.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

112 lines
3.1 KiB
Python

"""Tokenizers — fast, batteries-included tokenization library.
Free-threaded Python (3.14t) note:
Wheels built against free-threaded CPython declare ``Py_MOD_GIL_NOT_USED``
and use ``RwLock``-guarded interior mutability so component setters are
safe to call from multiple threads. Compound mutations
(``tokenizer.post_processor.special_tokens = …``) are still not atomic —
use a Python lock if you need the read-then-write to be serialized.
See ``docs/free-threading-audit.md`` for the full analysis.
"""
from enum import Enum
from typing import List, Tuple, Union
Offsets = Tuple[int, int]
TextInputSequence = str
"""A :obj:`str` that represents an input sequence """
PreTokenizedInputSequence = Union[List[str], Tuple[str]]
"""A pre-tokenized input sequence. Can be one of:
- A :obj:`List` of :obj:`str`
- A :obj:`Tuple` of :obj:`str`
"""
TextEncodeInput = Union[
TextInputSequence,
Tuple[TextInputSequence, TextInputSequence],
List[TextInputSequence],
]
"""Represents a textual input for encoding. Can be either:
- A single sequence: :data:`~tokenizers.TextInputSequence`
- A pair of sequences:
- A :obj:`Tuple` of :data:`~tokenizers.TextInputSequence`
- Or a :obj:`List` of :data:`~tokenizers.TextInputSequence` of size 2
"""
PreTokenizedEncodeInput = Union[
PreTokenizedInputSequence,
Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
List[PreTokenizedInputSequence],
]
"""Represents a pre-tokenized input for encoding. Can be either:
- A single sequence: :data:`~tokenizers.PreTokenizedInputSequence`
- A pair of sequences:
- A :obj:`Tuple` of :data:`~tokenizers.PreTokenizedInputSequence`
- Or a :obj:`List` of :data:`~tokenizers.PreTokenizedInputSequence` of size 2
"""
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
"""Represents all the possible types of input sequences for encoding. Can be:
- When ``is_pretokenized=False``: :data:`~TextInputSequence`
- When ``is_pretokenized=True``: :data:`~PreTokenizedInputSequence`
"""
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
"""Represents all the possible types of input for encoding. Can be:
- When ``is_pretokenized=False``: :data:`~TextEncodeInput`
- When ``is_pretokenized=True``: :data:`~PreTokenizedEncodeInput`
"""
class OffsetReferential(Enum):
ORIGINAL = "original"
NORMALIZED = "normalized"
class OffsetType(Enum):
BYTE = "byte"
CHAR = "char"
class SplitDelimiterBehavior(Enum):
REMOVED = "removed"
ISOLATED = "isolated"
MERGED_WITH_PREVIOUS = "merged_with_previous"
MERGED_WITH_NEXT = "merged_with_next"
CONTIGUOUS = "contiguous"
from .tokenizers import (
AddedToken,
Encoding,
NormalizedString,
PreTokenizedString,
Regex,
Token,
Tokenizer,
decoders,
models,
normalizers,
pre_tokenizers,
processors,
trainers,
__version__,
)
from .implementations import (
BertWordPieceTokenizer,
ByteLevelBPETokenizer,
CharBPETokenizer,
SentencePieceBPETokenizer,
SentencePieceUnigramTokenizer,
)