# Natural Language Toolkit: Tokenizer Interface
#
# Copyright (C) 2001-2026 NLTK Project
# Author: Edward Loper
#         Steven Bird
# URL:
# For license information, see LICENSE.TXT

"""
Tokenizer Interface
"""

from abc import ABC, abstractmethod
from collections.abc import Iterator
from typing import List, Tuple

from nltk.internals import overridden
from nltk.tokenize.util import string_span_tokenize


class TokenizerI(ABC):
    """
    A processing interface for tokenizing a string.

    ``tokenize()`` is declared abstract, so every concrete subclass must
    define it.  A subclass may additionally define ``tokenize_sents()``;
    in that case its ``tokenize()`` can delegate to the base
    implementation via ``super().tokenize(s)``, which forwards to the
    subclass's ``tokenize_sents()``.
    """

    @abstractmethod
    def tokenize(self, s: str) -> list[str]:
        """
        Return a tokenized copy of *s*.

        This base implementation is only reachable through ``super()``.
        When ``overridden()`` reports that the subclass supplies its own
        ``tokenize_sents()``, *s* is tokenized by calling it on the
        one-element list ``[s]`` and returning the first result.

        :raises NotImplementedError: if ``tokenize_sents()`` has not been
            overridden, since there is then nothing to delegate to.
        :rtype: List[str]
        """
        if overridden(self.tokenize_sents):
            return self.tokenize_sents([s])[0]
        # Fail loudly instead of implicitly returning None, which would
        # violate the declared return type; mirrors span_tokenize().
        raise NotImplementedError()

    def span_tokenize(self, s: str) -> Iterator[tuple[int, int]]:
        """
        Identify the tokens using integer offsets ``(start_i, end_i)``,
        where ``s[start_i:end_i]`` is the corresponding token.

        The base implementation always raises; subclasses that support
        span tokenization override this method.

        :raises NotImplementedError: always, in this base implementation.
        :rtype: Iterator[Tuple[int, int]]
        """
        raise NotImplementedError()

    def tokenize_sents(self, strings: list[str]) -> list[list[str]]:
        """
        Apply ``self.tokenize()`` to each element of ``strings``.  I.e.:

            return [self.tokenize(s) for s in strings]

        :rtype: List[List[str]]
        """
        return [self.tokenize(s) for s in strings]

    def span_tokenize_sents(
        self, strings: list[str]
    ) -> Iterator[list[tuple[int, int]]]:
        """
        Apply ``self.span_tokenize()`` to each element of ``strings``,
        lazily.  I.e.:

            for s in strings:
                yield list(self.span_tokenize(s))

        This is a generator: nothing is computed until it is iterated,
        and each yielded item is the fully materialized list of spans
        for one input string.

        :yield: List[Tuple[int, int]]
        """
        for s in strings:
            yield list(self.span_tokenize(s))


class StringTokenizer(TokenizerI):
    """A tokenizer that divides a string into substrings by splitting
    on the specified string (defined in subclasses).
    """

    @property
    @abstractmethod
    def _string(self):
        """The separator string to split on; subclasses must provide it."""
        raise NotImplementedError

    def tokenize(self, s):
        """Return the substrings of *s* obtained by ``s.split(self._string)``."""
        return s.split(self._string)

    def span_tokenize(self, s):
        """
        Yield the spans that ``string_span_tokenize()`` produces for *s*
        with ``self._string`` as the separator.
        """
        yield from string_span_tokenize(s, self._string)