Files
MoFin/venv/lib/python3.12/site-packages/nltk/tokenize/api.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

85 lines
2.2 KiB
Python

# Natural Language Toolkit: Tokenizer Interface
#
# Copyright (C) 2001-2026 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Tokenizer Interface
"""
from abc import ABC, abstractmethod
from collections.abc import Iterator
from typing import List, Tuple
from nltk.internals import overridden
from nltk.tokenize.util import string_span_tokenize
class TokenizerI(ABC):
    """
    A processing interface for tokenizing a string.
    Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).

    Note that ``tokenize()`` is declared abstract, so a concrete subclass
    always has to provide it.  A subclass whose real work happens in
    ``tokenize_sents()`` can implement ``tokenize()`` by calling
    ``super().tokenize(s)``, which delegates to ``tokenize_sents([s])[0]``.
    """

    @abstractmethod
    def tokenize(self, s: str) -> list[str]:
        """
        Return a tokenized copy of *s*.

        The base implementation only does work when
        ``overridden(self.tokenize_sents)`` is true (presumably: a subclass
        replaced ``tokenize_sents()``); in that case *s* is wrapped in a
        one-element list and the first result is returned.  Otherwise the
        body falls through and implicitly returns ``None``.

        :param s: the string to tokenize
        :rtype: List[str]
        """
        if overridden(self.tokenize_sents):
            return self.tokenize_sents([s])[0]

    def span_tokenize(self, s: str) -> Iterator[tuple[int, int]]:
        """
        Identify the tokens using integer offsets ``(start_i, end_i)``,
        where ``s[start_i:end_i]`` is the corresponding token.

        The base class provides no implementation.

        :param s: the string to tokenize
        :rtype: Iterator[Tuple[int, int]]
        :raise NotImplementedError: always, unless overridden by a subclass
        """
        raise NotImplementedError()

    def tokenize_sents(self, strings: list[str]) -> list[list[str]]:
        """
        Apply ``self.tokenize()`` to each element of ``strings``.  I.e.:

            return [self.tokenize(s) for s in strings]

        :param strings: the strings to tokenize, one result list per string
        :rtype: List[List[str]]
        """
        return [self.tokenize(s) for s in strings]

    def span_tokenize_sents(
        self, strings: list[str]
    ) -> Iterator[list[tuple[int, int]]]:
        """
        Apply ``self.span_tokenize()`` to each element of ``strings``.  I.e.:

            for s in strings:
                yield list(self.span_tokenize(s))

        This is a generator: nothing is computed (and no exception from
        ``span_tokenize()`` is raised) until the result is iterated.

        :param strings: the strings to tokenize
        :yield: List[Tuple[int, int]]
        """
        for s in strings:
            # Materialize each string's spans so callers get one list per input.
            yield list(self.span_tokenize(s))
class StringTokenizer(TokenizerI):
    """A tokenizer that divides a string into substrings by splitting
    on the specified string (defined in subclasses).
    """

    @property
    @abstractmethod
    def _string(self):
        """The separator string to split on; subclasses must supply it."""
        raise NotImplementedError

    def tokenize(self, s):
        """Return the pieces of *s* produced by ``s.split(self._string)``."""
        return s.split(self._string)

    def span_tokenize(self, s):
        """Yield every item from ``string_span_tokenize(s, self._string)``.

        NOTE(review): presumably these are ``(start, end)`` offsets of the
        substrings between separators -- confirm against
        ``nltk.tokenize.util.string_span_tokenize``.
        """
        yield from string_span_tokenize(s, self._string)