"""`tldextract` accurately separates a URL's subdomain, domain, and public suffix. It does this via the Public Suffix List (PSL). >>> import tldextract >>> tldextract.extract("http://forums.news.cnn.com/") ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False) >>> tldextract.extract("http://forums.bbc.co.uk/") # United Kingdom ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False) >>> tldextract.extract("http://www.worldbank.org.kg/") # Kyrgyzstan ExtractResult(subdomain='www', domain='worldbank', suffix='org.kg', is_private=False) Note subdomain and suffix are _optional_. Not all URL-like inputs have a subdomain or a valid suffix. >>> tldextract.extract("google.com") ExtractResult(subdomain='', domain='google', suffix='com', is_private=False) >>> tldextract.extract("google.notavalidsuffix") ExtractResult(subdomain='google', domain='notavalidsuffix', suffix='', is_private=False) >>> tldextract.extract("http://127.0.0.1:8080/deployed/") ExtractResult(subdomain='', domain='127.0.0.1', suffix='', is_private=False) To rejoin the original hostname, if it was indeed a valid, registered hostname: >>> ext = tldextract.extract("http://forums.bbc.co.uk") >>> ext.top_domain_under_public_suffix 'bbc.co.uk' >>> ext.fqdn 'forums.bbc.co.uk' """ from __future__ import annotations import os import urllib.parse import warnings from collections.abc import Collection, Sequence from dataclasses import dataclass, field from functools import wraps import idna import requests from .cache import DiskCache, get_cache_dir from .remote import lenient_netloc, looks_like_ip, looks_like_ipv6 from .suffix_list import get_suffix_lists CACHE_TIMEOUT = os.environ.get("TLDEXTRACT_CACHE_TIMEOUT") PUBLIC_SUFFIX_LIST_URLS = ( "https://publicsuffix.org/list/public_suffix_list.dat", "https://raw.githubusercontent.com/publicsuffix/list/master/public_suffix_list.dat", ) @dataclass(order=True) class ExtractResult: """A URL's extracted subdomain, domain, and suffix. These first 3 fields are what most users of this library will care about. They are the split, non-overlapping hostname components of the input URL. They can be used to rebuild the original URL's hostname. Beyond the first 3 fields, the class contains metadata fields, like a flag that indicates if the input URL's suffix is from a private domain. """ subdomain: str """All subdomains beneath the domain of the input URL, if it contained any such subdomains, or else the empty string.""" domain: str """The topmost domain of the input URL, if it contained a domain name, or else everything hostname-like in the input. If the input URL didn't contain a real domain name, the `suffix` field will be empty, and this field will catch values like an IP address, or private network hostnames like "localhost". """ suffix: str """The public suffix of the input URL, if it contained one, or else the empty string. If `include_psl_private_domains` was set to `False`, this field is the same as `registry_suffix`, i.e. a domain under which people can register subdomains through a registrar. If `include_psl_private_domains` was set to `True`, this field may be a PSL private domain, like "blogspot.com". """ is_private: bool """Whether the input URL belongs in the Public Suffix List's private domains. If `include_psl_private_domains` was set to `False`, this field is always `False`. """ registry_suffix: str = field(repr=False) """The registry suffix of the input URL, if it contained one, or else the empty string. This field is a domain under which people can register subdomains through a registar. This field is unaffected by the `include_psl_private_domains` setting. If `include_psl_private_domains` was set to `False`, this field is always the same as `suffix`. """ @property def fqdn(self) -> str: """The Fully Qualified Domain Name (FQDN), if there is a proper `domain` and `suffix`, or else the empty string. >>> extract("http://forums.bbc.co.uk/path/to/file").fqdn 'forums.bbc.co.uk' >>> extract("http://localhost:8080").fqdn '' """ if self.suffix and (self.domain or self.is_private): return ".".join(i for i in (self.subdomain, self.domain, self.suffix) if i) return "" @property def ipv4(self) -> str: """The IPv4 address, if that is what the input domain/URL was, or else the empty string. >>> extract("http://127.0.0.1/path/to/file").ipv4 '127.0.0.1' >>> extract("http://127.0.0.1.1/path/to/file").ipv4 '' >>> extract("http://256.1.1.1").ipv4 '' """ if ( self.domain and not (self.suffix or self.subdomain) and looks_like_ip(self.domain) ): return self.domain return "" @property def ipv6(self) -> str: """The IPv6 address, if that is what the input domain/URL was, or else the empty string. >>> extract( ... "http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1]/path/to/file" ... ).ipv6 'aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1' >>> extract( ... "http://[aBcD:ef01:2345:6789:aBcD:ef01:127.0.0.1.1]/path/to/file" ... ).ipv6 '' >>> extract("http://[aBcD:ef01:2345:6789:aBcD:ef01:256.0.0.1]").ipv6 '' """ min_num_ipv6_chars = 4 if ( len(self.domain) >= min_num_ipv6_chars and self.domain[0] == "[" and self.domain[-1] == "]" and not (self.suffix or self.subdomain) ): debracketed = self.domain[1:-1] if looks_like_ipv6(debracketed): return debracketed return "" @property def registered_domain(self) -> str: """The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string. >>> extract("http://forums.bbc.co.uk").registered_domain 'bbc.co.uk' >>> extract("http://localhost:8080").registered_domain '' .. deprecated:: 5.3.1 Use `top_domain_under_public_suffix` instead, which has the same behavior but a more accurate name. .. versionremoved:: 6.0.0 This property will be removed in the next major version. This is an alias for the `top_domain_under_public_suffix` property. `registered_domain` is so called because is roughly the domain the owner paid to register with a registrar or, in the case of a private domain, "registered" with the domain owner. If the input was not something one could register, this property returns the empty string. To distinguish the case of private domains, consider Blogspot, which is in the PSL's private domains. If `include_psl_private_domains` was set to `False`, the `registered_domain` property of a Blogspot URL represents the domain the owner of Blogspot registered with a registrar, i.e. Google registered "blogspot.com". If `include_psl_private_domains=True`, the `registered_domain` property represents the "blogspot.com" _subdomain_ the owner of a blog "registered" with Blogspot. >>> extract( ... "http://waiterrant.blogspot.com", include_psl_private_domains=False ... ).registered_domain 'blogspot.com' >>> extract( ... "http://waiterrant.blogspot.com", include_psl_private_domains=True ... ).registered_domain 'waiterrant.blogspot.com' To always get the same joined string, regardless of the `include_psl_private_domains` setting, consider the `top_domain_under_registry_suffix` property. """ warnings.warn( "The 'registered_domain' property is deprecated and will be removed in the next major version. " "Use 'top_domain_under_public_suffix' instead, which has the same behavior but a more accurate name.", DeprecationWarning, stacklevel=2, ) return self.top_domain_under_public_suffix @property def reverse_domain_name(self) -> str: """The domain name in Reverse Domain Name Notation. Joins extracted components of the input URL in reverse domain name notation. The suffix is used as the leftmost component, followed by the domain, then followed by the subdomain with its parts reversed. Reverse Domain Name Notation is typically used to organize namespaces for packages and plugins. Technically, a full reversal would reverse the parts of the suffix, e.g. "co.uk" would become "uk.co", but this is not done in practice when Reverse Domain Name Notation is called for. So this property leaves the `suffix` part in its original order. >>> extract("login.example.com").reverse_domain_name 'com.example.login' >>> extract("login.example.co.uk").reverse_domain_name 'co.uk.example.login' """ stack = [self.suffix, self.domain] if self.subdomain: stack.extend(reversed(self.subdomain.split("."))) return ".".join(stack) @property def top_domain_under_registry_suffix(self) -> str: """The rightmost domain label and `registry_suffix` joined with a dot, if such a domain is available and `registry_suffix` is set, or else the empty string. The rightmost domain label might be in the `domain` field, or, if the input URL's suffix is a PSL private domain, in the public suffix `suffix` field. If the input was not in the PSL's private domains, this property is equivalent to `top_domain_under_public_suffix`. >>> extract( ... "http://waiterrant.blogspot.com", include_psl_private_domains=False ... ).top_domain_under_registry_suffix 'blogspot.com' >>> extract( ... "http://waiterrant.blogspot.com", include_psl_private_domains=True ... ).top_domain_under_registry_suffix 'blogspot.com' >>> extract("http://localhost:8080").top_domain_under_registry_suffix '' """ top_domain_under_public_suffix = self.top_domain_under_public_suffix if not top_domain_under_public_suffix or not self.is_private: return top_domain_under_public_suffix num_labels = self.registry_suffix.count(".") + 2 return ".".join(top_domain_under_public_suffix.split(".")[-num_labels:]) @property def top_domain_under_public_suffix(self) -> str: """The `domain` and `suffix` fields joined with a dot, if they're both set, or else the empty string. >>> extract("http://forums.bbc.co.uk").top_domain_under_public_suffix 'bbc.co.uk' >>> extract("http://localhost:8080").top_domain_under_public_suffix '' """ if self.suffix and self.domain: return f"{self.domain}.{self.suffix}" return "" class TLDExtract: """A callable for extracting, subdomain, domain, and suffix components from a URL.""" # TODO: too-many-arguments def __init__( self, cache_dir: str | None = get_cache_dir(), suffix_list_urls: Sequence[str] = PUBLIC_SUFFIX_LIST_URLS, fallback_to_snapshot: bool = True, include_psl_private_domains: bool = False, extra_suffixes: Sequence[str] = (), cache_fetch_timeout: str | float | None = CACHE_TIMEOUT, ) -> None: """Construct a callable for extracting subdomain, domain, and suffix components from a URL. Upon calling it, it first checks for a JSON in `cache_dir`. By default, the `cache_dir` will live in the tldextract directory. You can disable the caching functionality of this module by setting `cache_dir` to `None`. If the cached version does not exist, such as on the first run, HTTP request the URLs in `suffix_list_urls` in order, and use the first successful response for public suffix definitions. Subsequent, untried URLs are ignored. The default URLs are the latest version of the Mozilla Public Suffix List and its mirror, but any similar document URL could be specified. Local files can be specified by using the `file://` protocol (see `urllib2` documentation). To disable HTTP requests, set this to an empty sequence. If there is no cached version loaded and no data is found from the `suffix_list_urls`, the module will fall back to the included TLD set snapshot. If you do not want this behavior, you may set `fallback_to_snapshot` to False, and an exception will be raised instead. The Public Suffix List includes a list of "private domains" as TLDs, such as blogspot.com. These do not fit `tldextract`'s definition of a suffix, so these domains are excluded by default. If you'd like them included instead, set `include_psl_private_domains` to True. You can specify additional suffixes in the `extra_suffixes` argument. These will be merged into whatever public suffix definitions are already in use by `tldextract`, above. cache_fetch_timeout is passed unmodified to the underlying request object per the requests documentation here: http://docs.python-requests.org/en/master/user/advanced/#timeouts cache_fetch_timeout can also be set to a single value with the environment variable TLDEXTRACT_CACHE_TIMEOUT, like so: TLDEXTRACT_CACHE_TIMEOUT="1.2" When set this way, the same timeout value will be used for both connect and read timeouts """ suffix_list_urls = suffix_list_urls or () self.suffix_list_urls = tuple( url.strip() for url in suffix_list_urls if url.strip() ) self.fallback_to_snapshot = fallback_to_snapshot if not (self.suffix_list_urls or cache_dir or self.fallback_to_snapshot): raise ValueError( "The arguments you have provided disable all ways for tldextract " "to obtain data. Please provide a suffix list data, a cache_dir, " "or set `fallback_to_snapshot` to `True`." ) self.include_psl_private_domains = include_psl_private_domains self.extra_suffixes = extra_suffixes self._extractor: _PublicSuffixListTLDExtractor | None = None self.cache_fetch_timeout = ( float(cache_fetch_timeout) if isinstance(cache_fetch_timeout, str) else cache_fetch_timeout ) self._cache = DiskCache(cache_dir) def __call__( self, url: str, include_psl_private_domains: bool | None = None, session: requests.Session | None = None, ) -> ExtractResult: """Alias for `extract_str`.""" return self.extract_str(url, include_psl_private_domains, session=session) def extract_str( self, url: str, include_psl_private_domains: bool | None = None, session: requests.Session | None = None, ) -> ExtractResult: """Take a string URL and splits it into its subdomain, domain, and suffix components. Args: url: The URL string to extract components from include_psl_private_domains: Whether to treat PSL private domains as suffixes. If None, uses the instance default. session: Optional requests.Session for HTTP configuration (e.g., proxies) Returns: ExtractResult: Named tuple containing subdomain, domain, suffix, and metadata Examples: Basic extraction: >>> extractor = TLDExtract() >>> extractor.extract_str("http://forums.news.cnn.com/") ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False) >>> extractor.extract_str("http://forums.bbc.co.uk/") ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False) Using a custom session: >>> import requests >>> session = requests.Session() >>> # customize your session here >>> with session: ... extractor.extract_str( ... "http://forums.news.cnn.com/", session=session ... ) ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False) """ return self._extract_netloc( lenient_netloc(url), include_psl_private_domains, session=session ) def extract_urllib( self, url: urllib.parse.ParseResult | urllib.parse.SplitResult, include_psl_private_domains: bool | None = None, session: requests.Session | None = None, ) -> ExtractResult: """Extract components from a pre-parsed URL object. Args: url: ParseResult or SplitResult from urllib.parse methods include_psl_private_domains: Whether to treat PSL private domains as suffixes. If None, uses the instance default. session: Optional requests.Session for HTTP configuration Returns: ExtractResult: Named tuple containing subdomain, domain, suffix, and metadata Note: This method is faster than `extract_str` since the URL is already parsed. Examples: >>> extractor = TLDExtract() >>> extractor.extract_urllib( ... urllib.parse.urlsplit("http://forums.news.cnn.com/") ... ) ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False) >>> extractor.extract_urllib( ... urllib.parse.urlsplit("http://forums.bbc.co.uk/") ... ) ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False) """ return self._extract_netloc( url.netloc, include_psl_private_domains, session=session ) def _extract_netloc( self, netloc: str, include_psl_private_domains: bool | None, session: requests.Session | None = None, ) -> ExtractResult: netloc_with_ascii_dots = ( netloc.replace("\u3002", "\u002e") .replace("\uff0e", "\u002e") .replace("\uff61", "\u002e") ) min_num_ipv6_chars = 4 if ( len(netloc_with_ascii_dots) >= min_num_ipv6_chars and netloc_with_ascii_dots[0] == "[" and netloc_with_ascii_dots[-1] == "]" and looks_like_ipv6(netloc_with_ascii_dots[1:-1]) ): return ExtractResult( "", netloc_with_ascii_dots, "", is_private=False, registry_suffix="" ) labels = netloc_with_ascii_dots.split(".") maybe_indexes = self._get_tld_extractor(session).suffix_index( labels, include_psl_private_domains=include_psl_private_domains ) num_ipv4_labels = 4 if ( not maybe_indexes and len(labels) == num_ipv4_labels and looks_like_ip(netloc_with_ascii_dots) ): return ExtractResult( "", netloc_with_ascii_dots, "", is_private=False, registry_suffix="" ) elif not maybe_indexes: return ExtractResult( subdomain=".".join(labels[:-1]), domain=labels[-1], suffix="", is_private=False, registry_suffix="", ) ( (public_suffix_index, public_suffix_node), (registry_suffix_index, registry_suffix_node), ) = maybe_indexes subdomain = ( ".".join(labels[: public_suffix_index - 1]) if public_suffix_index >= 2 else "" ) domain = labels[public_suffix_index - 1] if public_suffix_index > 0 else "" public_suffix = ".".join(labels[public_suffix_index:]) registry_suffix = ( ".".join(labels[registry_suffix_index:]) if public_suffix_node.is_private else public_suffix ) return ExtractResult( subdomain=subdomain, domain=domain, suffix=public_suffix, is_private=public_suffix_node.is_private, registry_suffix=registry_suffix, ) def update( self, fetch_now: bool = False, session: requests.Session | None = None ) -> None: """Clear cache and force fresh suffix list fetch on next extraction. Args: fetch_now: If True, immediately fetch updated suffix lists session: Optional requests.Session for HTTP configuration """ self._extractor = None self._cache.clear() if fetch_now: self._get_tld_extractor(session=session) @property def tlds(self, session: requests.Session | None = None) -> list[str]: """The list of TLDs used by default. This will vary based on `include_psl_private_domains` and `extra_suffixes`. """ return list(self._get_tld_extractor(session=session).tlds()) def _get_tld_extractor( self, session: requests.Session | None = None ) -> _PublicSuffixListTLDExtractor: """Get or compute this object's TLDExtractor. Looks up the TLDExtractor in roughly the following order, based on the settings passed to __init__: 1. Memoized on `self` 2. Local system _cache file 3. Remote PSL, over HTTP 4. Bundled PSL snapshot file """ if self._extractor: return self._extractor public_tlds, private_tlds = get_suffix_lists( cache=self._cache, urls=self.suffix_list_urls, cache_fetch_timeout=self.cache_fetch_timeout, fallback_to_snapshot=self.fallback_to_snapshot, session=session, ) if not any([public_tlds, private_tlds, self.extra_suffixes]): raise ValueError("No tlds set. Cannot proceed without tlds.") self._extractor = _PublicSuffixListTLDExtractor( public_tlds=public_tlds, private_tlds=private_tlds, extra_tlds=list(self.extra_suffixes), include_psl_private_domains=self.include_psl_private_domains, ) return self._extractor TLD_EXTRACTOR = TLDExtract() class Trie: """Trie for storing eTLDs with their labels in reverse-order.""" def __init__( self, matches: dict[str, Trie] | None = None, end: bool = False, is_private: bool = False, ) -> None: """TODO.""" self.matches = matches if matches else {} self.end = end self.is_private = is_private @staticmethod def create( public_suffixes: Collection[str], private_suffixes: Collection[str] | None = None, ) -> Trie: """Create a Trie from a list of suffixes and return its root node.""" root_node = Trie() for suffix in public_suffixes: root_node.add_suffix(suffix) if private_suffixes is None: private_suffixes = [] for suffix in private_suffixes: root_node.add_suffix(suffix, True) return root_node def add_suffix(self, suffix: str, is_private: bool = False) -> None: """Append a suffix's labels to this Trie node.""" node = self labels = suffix.split(".") labels.reverse() for label in labels: if label not in node.matches: node.matches[label] = Trie() node = node.matches[label] node.end = True node.is_private = is_private @wraps(TLD_EXTRACTOR.__call__) def extract( # noqa: D103 url: str, include_psl_private_domains: bool | None = False, session: requests.Session | None = None, ) -> ExtractResult: return TLD_EXTRACTOR( url, include_psl_private_domains=include_psl_private_domains, session=session ) @wraps(TLD_EXTRACTOR.update) def update(*args, **kwargs): # type: ignore[no-untyped-def] # noqa: D103 return TLD_EXTRACTOR.update(*args, **kwargs) class _PublicSuffixListTLDExtractor: """Wrapper around this project's main algo for PSL lookups.""" def __init__( self, public_tlds: list[str], private_tlds: list[str], extra_tlds: list[str], include_psl_private_domains: bool = False, ): # set the default value self.include_psl_private_domains = include_psl_private_domains self.public_tlds = public_tlds self.private_tlds = private_tlds self.tlds_incl_private = frozenset(public_tlds + private_tlds + extra_tlds) self.tlds_excl_private = frozenset(public_tlds + extra_tlds) self.tlds_incl_private_trie = Trie.create( self.tlds_excl_private, frozenset(private_tlds) ) self.tlds_excl_private_trie = Trie.create(self.tlds_excl_private) def tlds(self, include_psl_private_domains: bool | None = None) -> frozenset[str]: """Get the currently filtered list of suffixes.""" if include_psl_private_domains is None: include_psl_private_domains = self.include_psl_private_domains return ( self.tlds_incl_private if include_psl_private_domains else self.tlds_excl_private ) def suffix_index( self, spl: list[str], include_psl_private_domains: bool | None = None ) -> tuple[tuple[int, Trie], tuple[int, Trie]] | None: """Return the index of the first public suffix label, the index of the first registry suffix label, and their corresponding trie nodes. Returns `None` if no suffix is found. """ if include_psl_private_domains is None: include_psl_private_domains = self.include_psl_private_domains node = reg_node = ( self.tlds_incl_private_trie if include_psl_private_domains else self.tlds_excl_private_trie ) suffix_idx = reg_idx = label_idx = len(spl) for label in reversed(spl): decoded_label = _decode_punycode(label) if decoded_label in node.matches: label_idx -= 1 node = node.matches[decoded_label] if node.end: suffix_idx = label_idx if not node.is_private: reg_node = node reg_idx = label_idx continue is_wildcard = "*" in node.matches if is_wildcard: is_wildcard_exception = "!" + decoded_label in node.matches return ( label_idx if is_wildcard_exception else label_idx - 1, node.matches["*"], ), ( reg_idx, reg_node, ) break if suffix_idx == len(spl): return None return ((suffix_idx, node), (reg_idx, reg_node)) def _decode_punycode(label: str) -> str: lowered = label.lower() looks_like_puny = lowered.startswith("xn--") if looks_like_puny: try: return idna.decode(lowered) except (UnicodeError, IndexError): pass return lowered