fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
114 lines
3.7 KiB
Python
114 lines
3.7 KiB
Python
from typing import Collection, Iterable, Literal, Pattern, TypeVar, overload
|
|
|
|
from lxml.etree import _ElementTree
|
|
from lxml.html import HtmlElement
|
|
|
|
# Constrained type variables that tie a function's return type to the type of
# its input: whichever of the listed kinds goes in is the kind declared to
# come back out.
# _DT: text, bytes, or a single HTML element.
_DT = TypeVar("_DT", str, bytes, HtmlElement)
# _ET_DT: everything _DT allows, plus a whole element tree of HTML elements.
_ET_DT = TypeVar("_ET_DT", str, bytes, HtmlElement, _ElementTree[HtmlElement])
|
|
|
|
|
|
def _get_authority_from_url(url: str) -> str | None: ...
|
|
|
|
|
|
# Base warning category declared by this module; a plain subclass of the
# built-in Warning that adds no members of its own.
class LXMLHTMLCleanWarning(Warning):
    pass
|
|
|
|
|
|
# More specific warning category derived from LXMLHTMLCleanWarning, so a
# filter on the base category also matches this one. Adds no members.
# NOTE(review): the name suggests it is issued for URLs that cannot be
# interpreted unambiguously -- confirm against the implementation.
class AmbiguousURLWarning(LXMLHTMLCleanWarning):
    pass
|
|
|
|
|
|
class Cleaner:
    # Type declarations for the cleaner object.
    #
    # Every constructor option is keyword-only (note the bare ``*``). The
    # constructor is declared as two overloads so that a type checker can
    # enforce the relationship between ``allow_tags`` and
    # ``remove_unknown_tags``:
    #   1. ``allow_tags`` may be given, and then ``remove_unknown_tags`` is
    #      restricted to ``Literal[False]``;
    #   2. ``allow_tags`` is not an accepted argument, and
    #      ``remove_unknown_tags`` is an ordinary ``bool`` defaulting to True.
    # All remaining options are declared identically in both overloads.
    # ``safe_attrs`` uses ``...`` as its default, i.e. the concrete runtime
    # default is not spelled out in this stub.

    @overload  # allow_tags present, remove_unknown_tags must be False
    def __init__(
        self,
        *,
        scripts: bool = True,
        javascript: bool = True,
        comments: bool = True,
        style: bool = False,
        inline_style: bool | None = None,
        links: bool = True,
        meta: bool = True,
        page_structure: bool = True,
        processing_instructions: bool = True,
        embedded: bool = True,
        frames: bool = True,
        forms: bool = True,
        annoying_tags: bool = True,
        remove_tags: Collection[str] = (),
        allow_tags: Collection[str] = (),
        kill_tags: Collection[str] = (),
        remove_unknown_tags: Literal[False] = False,
        safe_attrs_only: bool = True,
        safe_attrs: Collection[str] = ...,
        add_nofollow: bool = False,
        host_whitelist: Collection[str] = (),
        whitelist_tags: Collection[str] | None = {"iframe", "embed"},
    ) -> None: ...
    @overload  # ... otherwise, allow_tags must not be used
    def __init__(
        self,
        *,
        scripts: bool = True,
        javascript: bool = True,
        comments: bool = True,
        style: bool = False,
        inline_style: bool | None = None,
        links: bool = True,
        meta: bool = True,
        page_structure: bool = True,
        processing_instructions: bool = True,
        embedded: bool = True,
        frames: bool = True,
        forms: bool = True,
        annoying_tags: bool = True,
        remove_tags: Collection[str] = (),
        kill_tags: Collection[str] = (),
        remove_unknown_tags: bool = True,
        safe_attrs_only: bool = True,
        safe_attrs: Collection[str] = ...,
        add_nofollow: bool = False,
        host_whitelist: Collection[str] = (),
        # Accepts None exactly like the first overload; previously this
        # overload rejected None although the option does not depend on
        # allow_tags.
        # NOTE(review): upstream documents None as "include all tags" --
        # confirm against the implementation.
        whitelist_tags: Collection[str] | None = {"iframe", "embed"},
    ) -> None: ...
    # Calling the instance takes an element or an element tree and is declared
    # to return None.
    # NOTE(review): presumably the document is modified in place -- confirm.
    def __call__(self, doc: HtmlElement | _ElementTree[HtmlElement]) -> None: ...
    # Predicate over an anchor element; returns a bool.
    def allow_follow(self, anchor: HtmlElement) -> bool: ...
    # Predicate over an arbitrary element; returns a bool.
    def allow_element(self, el: HtmlElement) -> bool: ...
    # Predicate over an element together with a URL string; returns a bool.
    def allow_embedded_url(self, el: HtmlElement, url: str) -> bool: ...
    # Takes an element or an element tree; declared to return None.
    def kill_conditional_comments(self, doc: HtmlElement | _ElementTree[HtmlElement]) -> None: ...
    # Accepts str, bytes, an element or an element tree and is typed to return
    # the same kind of object it was given (constrained type variable).
    def clean_html(self, html: _ET_DT) -> _ET_DT: ...
|
|
|
|
# Module-level Cleaner instance (declared here, not constructed in this stub).
clean: Cleaner
# Module-level alias typed as the bound ``clean_html`` method of ``clean``.
clean_html = clean.clean_html
|
|
|
|
# Declaration of ``autolink``: takes an HTML element and is typed to return
# None.
# NOTE(review): presumably the element is modified in place -- confirm.
#   el             -- the element to process.
#   link_regexes   -- iterable of compiled ``str`` patterns.
#   avoid_elements -- collection of strings (tag names, going by the name).
#   avoid_hosts    -- iterable of compiled ``str`` patterns.
#   avoid_classes  -- collection of strings; declared default ["nolink"].
# A ``...`` default means the runtime default is not spelled out in this stub.
def autolink(
    el: HtmlElement,
    link_regexes: Iterable[Pattern[str]] = ...,
    avoid_elements: Collection[str] = ...,
    avoid_hosts: Iterable[Pattern[str]] = ...,
    avoid_classes: Collection[str] = ["nolink"],
) -> None: ...
|
|
# Declaration of ``autolink_html``: the counterpart of ``autolink`` that takes
# ``html`` as str, bytes or an element and is typed to return the same kind of
# object it was given (constrained type variable ``_DT``).
#   html           -- the input document.
#   link_regexes   -- iterable of compiled ``str`` patterns.
#   avoid_elements -- collection of strings (tag names, going by the name).
#   avoid_hosts    -- iterable of compiled ``str`` patterns.
#   avoid_classes  -- collection of strings; declared default ["nolink"].
# A ``...`` default means the runtime default is not spelled out in this stub.
def autolink_html(
    html: _DT,
    link_regexes: Iterable[Pattern[str]] = ...,
    avoid_elements: Collection[str] = ...,
    avoid_hosts: Iterable[Pattern[str]] = ...,
    avoid_classes: Collection[str] = ["nolink"],
) -> _DT: ...
|
|
# Declaration of ``word_break``: takes an HTML element and is typed to return
# None.
# NOTE(review): presumably the element is modified in place -- confirm.
#   el              -- the element to process.
#   max_width       -- integer width limit; declared default 40.
#   avoid_elements  -- collection of strings; default ["pre", "textarea", "code"].
#   avoid_classes   -- collection of strings; default ["nobreak"].
#   break_character -- string; default is the single character U+200B.
def word_break(
    el: HtmlElement,
    max_width: int = 40,
    avoid_elements: Collection[str] = ["pre", "textarea", "code"],
    avoid_classes: Collection[str] = ["nobreak"],
    break_character: str = chr(0x200B),
) -> None: ...
|
|
# Declaration of ``word_break_html``: the counterpart of ``word_break`` that
# takes ``html`` as str, bytes or an element and is typed to return the same
# kind of object it was given (constrained type variable ``_DT``).
#   html            -- the input document.
#   max_width       -- integer width limit; declared default 40.
#   avoid_elements  -- collection of strings; default ["pre", "textarea", "code"].
#   avoid_classes   -- collection of strings; default ["nobreak"].
#   break_character -- string; default is the single character U+200B.
def word_break_html(
    html: _DT,
    max_width: int = 40,
    avoid_elements: Collection[str] = ["pre", "textarea", "code"],
    avoid_classes: Collection[str] = ["nobreak"],
    break_character: str = chr(0x200B),
) -> _DT: ...
|