Files
MoFin/venv/lib/python3.12/site-packages/tldextract/tldextract.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

737 lines
27 KiB
Python

"""`tldextract` accurately separates a URL's subdomain, domain, and public suffix.
It does this via the Public Suffix List (PSL).
>>> import tldextract
>>> tldextract.extract("http://forums.news.cnn.com/")
ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)
>>> tldextract.extract("http://forums.bbc.co.uk/") # United Kingdom
ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False)
>>> tldextract.extract("http://www.worldbank.org.kg/") # Kyrgyzstan
ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg', is_private=False)
Note subdomain and suffix are _optional_. Not all URL-like inputs have a
subdomain or a valid suffix.
>>> tldextract.extract("google.com")
ExtractResult(subdomain='', domain='google', suffix='com', is_private=False)
>>> tldextract.extract("google.notavalidsuffix")
ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='', is_private=False)
>>> tldextract.extract("http://127.0.0.1:8080/deployed/")
ExtractResult(subdomain='', domain='127.0.0.1', suffix='', is_private=False)
To rejoin the original hostname, if it was indeed a valid, registered hostname:
>>> ext = tldextract.extract("http://forums.bbc.co.uk")
>>> ext.top_domain_under_public_suffix
'bbc.co.uk'
>>> ext.fqdn
'forums.bbc.co.uk'
"""
from __future__ import annotations
import os
import urllib.parse
import warnings
from collections.abc import Collection, Sequence
from dataclasses import dataclass, field
from functools import wraps
import idna
import requests
from .cache import DiskCache, get_cache_dir
from .remote import lenient_netloc, looks_like_ip, looks_like_ipv6
from .suffix_list import get_suffix_lists
# Default for `TLDExtract(cache_fetch_timeout=...)`. Read once at import time;
# it is the raw environment string (or None when the variable is unset) and is
# converted to float inside `TLDExtract.__init__`.
CACHE_TIMEOUT = os.environ.get("TLDEXTRACT_CACHE_TIMEOUT")
# Default for `TLDExtract(suffix_list_urls=...)`: the Public Suffix List and
# its mirror. Per the `TLDExtract.__init__` docs the URLs are requested in
# order and the first successful response is used.
PUBLIC_SUFFIX_LIST_URLS = (
"https://publicsuffix.org/list/public_suffix_list.dat",
"https://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.dat",
)
@dataclass(order=True)
class ExtractResult:
    """The subdomain, domain, and suffix pulled out of a single URL.

    The first three fields are what most callers want. They are the
    non-overlapping pieces of the input's hostname and can be joined back
    together to rebuild it.

    The remaining fields carry metadata, e.g. whether the suffix came from
    the private section of the Public Suffix List.
    """

    subdomain: str
    """Every label left of `domain`, dot-joined, or the empty string when the input had none."""

    domain: str
    """The topmost domain label of the input, or else everything hostname-like in it.

    When the input held no real domain name, `suffix` is empty and this field
    receives the whole value, such as an IP address or a private network
    hostname like "localhost".
    """

    suffix: str
    """The public suffix of the input, or the empty string when it had none.

    With `include_psl_private_domains=False` this equals `registry_suffix`,
    i.e. a domain under which people register names through a registrar.
    With `include_psl_private_domains=True` it may instead be a PSL private
    domain such as "blogspot.com".
    """

    is_private: bool
    """Whether the input falls under the Public Suffix List's private domains.

    Always `False` when `include_psl_private_domains` was `False`.
    """

    registry_suffix: str = field(repr=False)
    """The registry suffix of the input, or the empty string when it had none.

    A registry suffix is a domain under which people can register subdomains
    through a registrar.

    This field does not depend on `include_psl_private_domains`. When that
    setting was `False`, it is always identical to `suffix`.
    """

    @property
    def fqdn(self) -> str:
        """The Fully Qualified Domain Name, or the empty string without a proper `domain` and `suffix`.

        >>> extract("http://forums.bbc.co.uk/path/to/file").fqdn
        'forums.bbc.co.uk'
        >>> extract("http://localhost:8080").fqdn
        ''
        """
        if not self.suffix:
            return ""
        if not (self.domain or self.is_private):
            return ""
        # Skip empty components so no leading or doubled dots appear.
        pieces = (self.subdomain, self.domain, self.suffix)
        return ".".join(piece for piece in pieces if piece)

    @property
    def ipv4(self) -> str:
        """The IPv4 address when the input was exactly that, or else the empty string.

        >>> extract("http://127.0.0.1/path/to/file").ipv4
        '127.0.0.1'
        >>> extract("http://127.0.0.1.1/path/to/file").ipv4
        ''
        >>> extract("http://256.1.1.1").ipv4
        ''
        """
        candidate = self.domain
        # An address is stored whole in `domain`, with nothing around it.
        if not candidate or self.suffix or self.subdomain:
            return ""
        return candidate if looks_like_ip(candidate) else ""

    @property
    def ipv6(self) -> str:
        """The IPv6 address when the input was exactly that, or else the empty string.

        The brackets surrounding the address in the URL are not included.

        >>> extract(
        ...     "http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]/path/to/file"
        ... ).ipv6
        'aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1'
        >>> extract(
        ...     "http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1.1]/path/to/file"
        ... ).ipv6
        ''
        >>> extract("http://[aBcD:ef01:2345:6789:aBcD:ef01:256.0.0.1]").ipv6
        ''
        """
        shortest_bracketed_len = 4
        host = self.domain
        is_bracketed = (
            len(host) >= shortest_bracketed_len
            and host.startswith("[")
            and host.endswith("]")
        )
        if not is_bracketed or self.suffix or self.subdomain:
            return ""
        inner = host[1:-1]
        return inner if looks_like_ipv6(inner) else ""

    @property
    def registered_domain(self) -> str:
        """The `domain` and `suffix` fields joined with a dot, or the empty string unless both are set.

        >>> extract("http://forums.bbc.co.uk").registered_domain
        'bbc.co.uk'
        >>> extract("http://localhost:8080").registered_domain
        ''

        .. deprecated:: 5.3.1
           Use `top_domain_under_public_suffix` instead, which has the same
           behavior but a more accurate name.

        .. versionremoved:: 6.0.0
           This property will be removed in the next major version.

        This is an alias for `top_domain_under_public_suffix`; accessing it
        emits a `DeprecationWarning`.

        The name reflects that the value is roughly the domain its owner paid
        to register with a registrar or, for a private domain, "registered"
        with the domain owner. If the input was not something one could
        register, the result is the empty string.

        Blogspot, which sits in the PSL's private domains, shows the
        difference. With `include_psl_private_domains=False` the value is the
        domain Blogspot's owner registered with a registrar, "blogspot.com".
        With `include_psl_private_domains=True` it is the "blogspot.com"
        _subdomain_ that a blog owner "registered" with Blogspot.

        >>> extract(
        ...     "http://waiterrant.blogspot.com", include_psl_private_domains=False
        ... ).registered_domain
        'blogspot.com'
        >>> extract(
        ...     "http://waiterrant.blogspot.com", include_psl_private_domains=True
        ... ).registered_domain
        'waiterrant.blogspot.com'

        For a joined string that does not vary with the
        `include_psl_private_domains` setting, see
        `top_domain_under_registry_suffix`.
        """
        # stacklevel=2 attributes the warning to the code reading the property.
        warnings.warn(
            "The 'registered_domain' property is deprecated and will be removed in the next major version. "
            "Use 'top_domain_under_public_suffix' instead, which has the same behavior but a more accurate name.",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.top_domain_under_public_suffix

    @property
    def reverse_domain_name(self) -> str:
        """The domain name in Reverse Domain Name Notation.

        The suffix comes first, then the domain, then the subdomain's labels
        in reverse order.

        Reverse Domain Name Notation is typically used to organize namespaces
        for packages and plugins. A strict reversal would also flip the labels
        of the suffix ("co.uk" would become "uk.co"), but that is not done in
        practice, so the `suffix` keeps its original order here.

        >>> extract("login.example.com").reverse_domain_name
        'com.example.login'
        >>> extract("login.example.co.uk").reverse_domain_name
        'co.uk.example.login'
        """
        parts = [self.suffix, self.domain]
        if self.subdomain:
            parts += self.subdomain.split(".")[::-1]
        return ".".join(parts)

    @property
    def top_domain_under_registry_suffix(self) -> str:
        """The rightmost domain label joined to `registry_suffix` with a dot, or the empty string if unavailable.

        The rightmost domain label may live in the `domain` field or, when
        the input's suffix is a PSL private domain, inside the `suffix` field.

        For inputs outside the PSL's private domains this equals
        `top_domain_under_public_suffix`.

        >>> extract(
        ...     "http://waiterrant.blogspot.com", include_psl_private_domains=False
        ... ).top_domain_under_registry_suffix
        'blogspot.com'
        >>> extract(
        ...     "http://waiterrant.blogspot.com", include_psl_private_domains=True
        ... ).top_domain_under_registry_suffix
        'blogspot.com'
        >>> extract("http://localhost:8080").top_domain_under_registry_suffix
        ''
        """
        joined = self.top_domain_under_public_suffix
        if self.is_private and joined:
            # Keep the registry suffix's labels plus the one label before it.
            labels_to_keep = self.registry_suffix.count(".") + 2
            return ".".join(joined.split(".")[-labels_to_keep:])
        return joined

    @property
    def top_domain_under_public_suffix(self) -> str:
        """The `domain` and `suffix` fields joined with a dot, or the empty string unless both are set.

        >>> extract("http://forums.bbc.co.uk").top_domain_under_public_suffix
        'bbc.co.uk'
        >>> extract("http://localhost:8080").top_domain_under_public_suffix
        ''
        """
        if not (self.suffix and self.domain):
            return ""
        return ".".join((self.domain, self.suffix))
class TLDExtract:
    """A callable that splits a URL into subdomain, domain, and suffix components."""

    # TODO: too-many-arguments
    def __init__(
        self,
        cache_dir: str | None = get_cache_dir(),
        suffix_list_urls: Sequence[str] = PUBLIC_SUFFIX_LIST_URLS,
        fallback_to_snapshot: bool = True,
        include_psl_private_domains: bool = False,
        extra_suffixes: Sequence[str] = (),
        cache_fetch_timeout: str | float | None = CACHE_TIMEOUT,
    ) -> None:
        """Build a callable that extracts subdomain, domain, and suffix from a URL.

        When called, the object first looks for a cached JSON in `cache_dir`.
        By default `cache_dir` lives in the tldextract directory. Pass
        `cache_dir=None` to turn this module's caching off.

        If no cached version exists, e.g. on the first run, the URLs in
        `suffix_list_urls` are requested over HTTP in order and the first
        successful response supplies the public suffix definitions; later
        URLs are not tried. The defaults are the latest Mozilla Public Suffix
        List and its mirror, but any similar document URL works. Local files
        can be given with the `file://` protocol. Pass an empty sequence to
        disable HTTP requests.

        If nothing is cached and `suffix_list_urls` yields no data, the
        bundled TLD snapshot is used. Set `fallback_to_snapshot` to False to
        have an exception raised instead.

        The Public Suffix List also lists "private domains" such as
        blogspot.com. They do not fit `tldextract`'s definition of a suffix
        and are excluded by default; set `include_psl_private_domains` to
        True to include them.

        `extra_suffixes` are merged into whatever public suffix definitions
        are already in use.

        `cache_fetch_timeout` is handed unmodified to the underlying request
        object, per the requests documentation:
        http://docs.python-requests.org/en/master/user/advanced/#timeouts

        It can also be set to a single value through the environment variable
        TLDEXTRACT_CACHE_TIMEOUT, like so:

            TLDEXTRACT_CACHE_TIMEOUT="1.2"

        In that case the same value is used for both the connect and the read
        timeout. A string value is converted with `float()`.

        Raises:
            ValueError: If the arguments leave no way to obtain suffix data
                (no URLs, no `cache_dir`, and no snapshot fallback).
        """
        # Drop entries that are blank after stripping; tolerate a None argument.
        stripped_urls = (candidate.strip() for candidate in suffix_list_urls or ())
        self.suffix_list_urls = tuple(url for url in stripped_urls if url)
        self.fallback_to_snapshot = fallback_to_snapshot

        if (
            not self.suffix_list_urls
            and not cache_dir
            and not self.fallback_to_snapshot
        ):
            raise ValueError(
                "The arguments you have provided disable all ways for tldextract "
                "to obtain data. Please provide a suffix list data, a cache_dir, "
                "or set `fallback_to_snapshot` to `True`."
            )

        self.include_psl_private_domains = include_psl_private_domains
        self.extra_suffixes = extra_suffixes
        # Built lazily by `_get_tld_extractor` on first use.
        self._extractor: _PublicSuffixListTLDExtractor | None = None

        if isinstance(cache_fetch_timeout, str):
            cache_fetch_timeout = float(cache_fetch_timeout)
        self.cache_fetch_timeout = cache_fetch_timeout

        self._cache = DiskCache(cache_dir)

    def __call__(
        self,
        url: str,
        include_psl_private_domains: bool | None = None,
        session: requests.Session | None = None,
    ) -> ExtractResult:
        """Alias for `extract_str`."""
        return self.extract_str(url, include_psl_private_domains, session=session)

    def extract_str(
        self,
        url: str,
        include_psl_private_domains: bool | None = None,
        session: requests.Session | None = None,
    ) -> ExtractResult:
        """Split a URL string into its subdomain, domain, and suffix components.

        Args:
            url: The URL string to extract components from
            include_psl_private_domains: Whether to treat PSL private domains as suffixes.
                If None, uses the instance default.
            session: Optional requests.Session for HTTP configuration (e.g., proxies)

        Returns:
            ExtractResult: Dataclass holding subdomain, domain, suffix, and metadata

        Examples:
            Basic extraction:

            >>> extractor = TLDExtract()
            >>> extractor.extract_str("http://forums.news.cnn.com/")
            ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)
            >>> extractor.extract_str("http://forums.bbc.co.uk/")
            ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False)

            Using a custom session:

            >>> import requests
            >>> session = requests.Session()
            >>> # customize your session here
            >>> with session:
            ...     extractor.extract_str(
            ...         "http://forums.news.cnn.com/", session=session
            ...     )
            ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)
        """
        netloc = lenient_netloc(url)
        return self._extract_netloc(
            netloc, include_psl_private_domains, session=session
        )

    def extract_urllib(
        self,
        url: urllib.parse.ParseResult | urllib.parse.SplitResult,
        include_psl_private_domains: bool | None = None,
        session: requests.Session | None = None,
    ) -> ExtractResult:
        """Extract components from an already-parsed URL object.

        Args:
            url: ParseResult or SplitResult from urllib.parse methods
            include_psl_private_domains: Whether to treat PSL private domains as suffixes.
                If None, uses the instance default.
            session: Optional requests.Session for HTTP configuration

        Returns:
            ExtractResult: Dataclass holding subdomain, domain, suffix, and metadata

        Note:
            The object's `netloc` is used directly, skipping the
            `lenient_netloc` step that `extract_str` performs.

        Examples:
            >>> extractor = TLDExtract()
            >>> extractor.extract_urllib(
            ...     urllib.parse.urlsplit("http://forums.news.cnn.com/")
            ... )
            ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)
            >>> extractor.extract_urllib(
            ...     urllib.parse.urlsplit("http://forums.bbc.co.uk/")
            ... )
            ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False)
        """
        return self._extract_netloc(
            url.netloc, include_psl_private_domains, session=session
        )

    def _extract_netloc(
        self,
        netloc: str,
        include_psl_private_domains: bool | None,
        session: requests.Session | None = None,
    ) -> ExtractResult:
        """Split a network location into an `ExtractResult`.

        Bracketed IPv6 literals and dotted IPv4 addresses with no known
        suffix are returned whole in `domain`. Hosts with no known suffix get
        their last label as `domain` and everything before it as `subdomain`.
        """
        # Replace the three non-ASCII dot characters with an ASCII full stop.
        host = netloc
        for unicode_dot in ("\u3002", "\uff0e", "\uff61"):
            host = host.replace(unicode_dot, "\u002e")

        shortest_bracketed_len = 4
        is_bracketed = (
            len(host) >= shortest_bracketed_len
            and host[0] == "["
            and host[-1] == "]"
        )
        if is_bracketed and looks_like_ipv6(host[1:-1]):
            return ExtractResult("", host, "", is_private=False, registry_suffix="")

        labels = host.split(".")
        maybe_indexes = self._get_tld_extractor(session).suffix_index(
            labels, include_psl_private_domains=include_psl_private_domains
        )

        if not maybe_indexes:
            ipv4_label_count = 4
            if len(labels) == ipv4_label_count and looks_like_ip(host):
                return ExtractResult(
                    "", host, "", is_private=False, registry_suffix=""
                )
            return ExtractResult(
                subdomain=".".join(labels[:-1]),
                domain=labels[-1],
                suffix="",
                is_private=False,
                registry_suffix="",
            )

        (public_suffix_index, public_suffix_node), (registry_suffix_index, _) = (
            maybe_indexes
        )

        # The domain is the single label immediately left of the public suffix.
        domain_index = public_suffix_index - 1
        subdomain = ".".join(labels[:domain_index]) if domain_index >= 1 else ""
        domain = labels[domain_index] if domain_index >= 0 else ""
        public_suffix = ".".join(labels[public_suffix_index:])

        is_private = public_suffix_node.is_private
        if is_private:
            registry_suffix = ".".join(labels[registry_suffix_index:])
        else:
            registry_suffix = public_suffix

        return ExtractResult(
            subdomain=subdomain,
            domain=domain,
            suffix=public_suffix,
            is_private=is_private,
            registry_suffix=registry_suffix,
        )

    def update(
        self, fetch_now: bool = False, session: requests.Session | None = None
    ) -> None:
        """Clear the cache so the next extraction loads the suffix lists afresh.

        Args:
            fetch_now: If True, immediately fetch updated suffix lists
            session: Optional requests.Session for HTTP configuration
        """
        self._extractor = None
        self._cache.clear()
        if not fetch_now:
            return
        self._get_tld_extractor(session=session)

    @property
    def tlds(self, session: requests.Session | None = None) -> list[str]:
        """The list of TLDs used by default.

        This will vary based on `include_psl_private_domains` and `extra_suffixes`.
        """
        # Plain attribute access cannot pass `session`, so it is None there.
        extractor = self._get_tld_extractor(session=session)
        return list(extractor.tlds())

    def _get_tld_extractor(
        self, session: requests.Session | None = None
    ) -> _PublicSuffixListTLDExtractor:
        """Return this object's TLD extractor, building it on first use.

        Looks up the extractor in roughly the following order, based on the
        settings passed to __init__:

        1. Memoized on `self`
        2. Local system _cache file
        3. Remote PSL, over HTTP
        4. Bundled PSL snapshot file

        Raises:
            ValueError: If no public, private, or extra suffixes are available.
        """
        memoized = self._extractor
        if memoized:
            return memoized

        public_tlds, private_tlds = get_suffix_lists(
            cache=self._cache,
            urls=self.suffix_list_urls,
            cache_fetch_timeout=self.cache_fetch_timeout,
            fallback_to_snapshot=self.fallback_to_snapshot,
            session=session,
        )

        if not (public_tlds or private_tlds or self.extra_suffixes):
            raise ValueError("No tlds set. Cannot proceed without tlds.")

        built = _PublicSuffixListTLDExtractor(
            public_tlds=public_tlds,
            private_tlds=private_tlds,
            extra_tlds=list(self.extra_suffixes),
            include_psl_private_domains=self.include_psl_private_domains,
        )
        self._extractor = built
        return built
# Shared, default-configured extractor that backs the module-level `extract`
# and `update` helpers below. Constructed at import time.
TLD_EXTRACTOR = TLDExtract()
class Trie:
    """Trie for storing eTLDs with their labels in reverse-order."""

    def __init__(
        self,
        matches: dict[str, Trie] | None = None,
        end: bool = False,
        is_private: bool = False,
    ) -> None:
        """Initialize one trie node.

        Args:
            matches: Child nodes keyed by label; a fresh dict is used when
                this is omitted or empty.
            end: Whether a complete suffix terminates at this node.
            is_private: Whether the suffix ending here was added as private.
        """
        self.matches = matches or {}
        self.end = end
        self.is_private = is_private

    @staticmethod
    def create(
        public_suffixes: Collection[str],
        private_suffixes: Collection[str] | None = None,
    ) -> Trie:
        """Build a Trie from the given suffixes and return its root node.

        Public suffixes are inserted first, then private ones, so a suffix
        present in both collections ends up flagged as private.
        """
        root_node = Trie()
        for public_suffix in public_suffixes:
            root_node.add_suffix(public_suffix)
        for private_suffix in () if private_suffixes is None else private_suffixes:
            root_node.add_suffix(private_suffix, True)
        return root_node

    def add_suffix(self, suffix: str, is_private: bool = False) -> None:
        """Insert a suffix's labels, rightmost label first, below this node."""
        cursor = self
        for label in reversed(suffix.split(".")):
            child = cursor.matches.get(label)
            if child is None:
                child = cursor.matches[label] = Trie()
            cursor = child
        # Mark the node reached by the leftmost label as a complete suffix.
        cursor.end = True
        cursor.is_private = is_private
@wraps(TLD_EXTRACTOR.__call__)
def extract(  # noqa: D103
    url: str,
    include_psl_private_domains: bool | None = False,
    session: requests.Session | None = None,
) -> ExtractResult:
    # Module-level convenience wrapper around the shared TLD_EXTRACTOR; its
    # docstring is copied from `TLDExtract.__call__` by `wraps`.
    # `include_psl_private_domains` defaults to False here, whereas
    # `TLDExtract.__call__` defaults to None (meaning "use the instance
    # default").
    result = TLD_EXTRACTOR(
        url,
        include_psl_private_domains=include_psl_private_domains,
        session=session,
    )
    return result
@wraps(TLD_EXTRACTOR.update)
def update(*args, **kwargs):  # type: ignore[no-untyped-def]  # noqa: D103
    # Module-level convenience wrapper: forwards every argument unchanged to
    # the shared TLD_EXTRACTOR's `update` method and returns its result.
    refresh = TLD_EXTRACTOR.update
    return refresh(*args, **kwargs)
class _PublicSuffixListTLDExtractor:
    """Wrapper around this project's main algo for PSL lookups."""

    def __init__(
        self,
        public_tlds: list[str],
        private_tlds: list[str],
        extra_tlds: list[str],
        include_psl_private_domains: bool = False,
    ):
        """Index the given suffixes for lookup.

        Args:
            public_tlds: Suffixes treated as public.
            private_tlds: Suffixes treated as private.
            extra_tlds: Additional suffixes, grouped with the public ones.
            include_psl_private_domains: Default used by `tlds` and
                `suffix_index` when the caller passes None.
        """
        # set the default value
        self.include_psl_private_domains = include_psl_private_domains
        self.public_tlds = public_tlds
        self.private_tlds = private_tlds
        self.tlds_incl_private = frozenset(public_tlds + private_tlds + extra_tlds)
        self.tlds_excl_private = frozenset(public_tlds + extra_tlds)
        # Two tries: one holding public + private suffixes, one public only.
        self.tlds_incl_private_trie = Trie.create(
            self.tlds_excl_private, frozenset(private_tlds)
        )
        self.tlds_excl_private_trie = Trie.create(self.tlds_excl_private)

    def tlds(self, include_psl_private_domains: bool | None = None) -> frozenset[str]:
        """Get the currently filtered list of suffixes."""
        if include_psl_private_domains is None:
            include_psl_private_domains = self.include_psl_private_domains

        return (
            self.tlds_incl_private
            if include_psl_private_domains
            else self.tlds_excl_private
        )

    def suffix_index(
        self, spl: list[str], include_psl_private_domains: bool | None = None
    ) -> tuple[tuple[int, Trie], tuple[int, Trie]] | None:
        """Return the index of the first public suffix label, the index of the first registry suffix label, and their corresponding trie nodes.

        Args:
            spl: The hostname split into labels, leftmost label first.
            include_psl_private_domains: Whether private suffixes take part
                in the match. If None, uses the instance default.

        Returns:
            `((suffix_idx, suffix_node), (reg_idx, reg_node))`, where each
            index points into `spl` and each node is the trie node of the
            rule that produced that index. Returns `None` if no suffix is
            found.
        """
        if include_psl_private_domains is None:
            include_psl_private_domains = self.include_psl_private_domains

        node = suffix_node = reg_node = (
            self.tlds_incl_private_trie
            if include_psl_private_domains
            else self.tlds_excl_private_trie
        )
        suffix_idx = reg_idx = label_idx = len(spl)
        for label in reversed(spl):
            decoded_label = _decode_punycode(label)
            if decoded_label in node.matches:
                label_idx -= 1
                node = node.matches[decoded_label]
                if node.end:
                    suffix_idx = label_idx
                    # Remember the node of the last complete suffix. The walk
                    # may continue into non-terminal nodes of a longer rule,
                    # and those must not supply the suffix's `is_private`.
                    suffix_node = node
                    if not node.is_private:
                        reg_node = node
                        reg_idx = label_idx
                continue

            if "*" in node.matches:
                # A "!label" entry is an exception to the wildcard: the
                # current label is then not part of the suffix.
                is_wildcard_exception = "!" + decoded_label in node.matches
                return (
                    label_idx if is_wildcard_exception else label_idx - 1,
                    node.matches["*"],
                ), (
                    reg_idx,
                    reg_node,
                )

            break

        if suffix_idx == len(spl):
            return None

        return ((suffix_idx, suffix_node), (reg_idx, reg_node))
def _decode_punycode(label: str) -> str:
    """Lowercase a label and, if it carries the "xn--" prefix, try to IDNA-decode it.

    Decoding is best effort: when `idna.decode` raises `UnicodeError` or
    `IndexError`, the lowercased label is returned undecoded.
    """
    normalized = label.lower()
    if not normalized.startswith("xn--"):
        return normalized
    try:
        return idna.decode(normalized)
    except (UnicodeError, IndexError):
        # Not valid punycode after all; fall back to the plain label.
        return normalized