Files
MoFin/venv/lib/python3.12/site-packages/newspaper/api.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

94 lines
2.6 KiB
Python

# -*- coding: utf-8 -*-
"""
Ignore the unused imports; this file's purpose is to make visible
anything which a user might need to import from newspaper.
See newspaper/__init__.py for its usage.
"""
# Package metadata dunders.
__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'
import feedparser
from .article import Article
from .configuration import Configuration
from .settings import POPULAR_URLS, TRENDING_URL
from .source import Source
from .utils import extend_config, print_available_languages
def build(url='', dry=False, config=None, **kwargs) -> Source:
    """Create a Source for ``url`` and, unless ``dry`` is set, build it.

    Args:
        url: Address of the news source; a falsy value becomes ''.
        dry: When true, the ``Source.build()`` call is skipped and the
            freshly constructed object is returned as-is.
        config: Base configuration; a falsy value is replaced by a new
            ``Configuration()``.
        **kwargs: Extra options passed to ``extend_config`` together
            with the base configuration.

    Returns:
        The Source instance created for ``url``.
    """
    base_config = config if config else Configuration()
    merged_config = extend_config(base_config, kwargs)
    target_url = url if url else ''
    source = Source(target_url, config=merged_config)
    if dry:
        # Dry run: hand back the object without calling build().
        return source
    source.build()
    return source
def build_article(url='', config=None, **kwargs) -> Article:
    """Create an Article for ``url`` without calling any of its methods.

    Args:
        url: Address of the article; a falsy value becomes ''.
        config: Base configuration; a falsy value is replaced by a new
            ``Configuration()``.
        **kwargs: Extra options passed to ``extend_config`` together
            with the base configuration.

    Returns:
        The newly constructed Article instance.
    """
    base_config = config if config else Configuration()
    merged_config = extend_config(base_config, kwargs)
    target_url = url if url else ''
    return Article(target_url, config=merged_config)
def languages():
    """Print the supported languages.

    Delegates to ``print_available_languages``. This function itself
    returns None; it does not hand back a list.
    """
    print_available_languages()
def popular_urls():
    """Return the pre-extracted popular source URLs.

    Each non-blank line of the file at ``POPULAR_URLS`` is stripped of
    surrounding whitespace and prefixed with 'http://'.

    Returns:
        A list of URL strings, in file order.
    """
    # Explicit encoding: the default used by open() depends on the
    # platform locale, which makes reading the bundled list non-portable.
    with open(POPULAR_URLS, encoding='utf-8') as f:
        # Iterate the file directly instead of materialising readlines().
        hosts = (line.strip() for line in f)
        # Blank lines would otherwise yield a bare 'http://' entry.
        return ['http://' + host for host in hosts if host]
def hot():
    """Return a list of hot terms via Google Trends.

    The feed at ``TRENDING_URL`` is parsed with feedparser and the
    'title' of every entry is collected.

    Returns:
        A list of entry titles, or None when any exception is raised
        while fetching or reading the feed (the error is printed).
    """
    try:
        entries = feedparser.parse(TRENDING_URL)['entries']
        return [entry['title'] for entry in entries]
    except Exception as err:
        # Deliberate best-effort: report the failure and fall through.
        print('ERR hot terms failed!', str(err))
    return None
def fulltext(html, language='en'):
    """Take an article HTML string and return its full text.

    The input string is decoded via UnicodeDammit if needed (per the
    original note; that decoding is not performed in this function
    itself).

    Args:
        html: The article HTML to extract text from.
        language: Value assigned to ``config.language`` on the
            Configuration used for extraction; defaults to 'en'.

    Returns:
        The text element of the pair returned by
        ``OutputFormatter.get_formatted`` for the cleaned-up best node.
    """
    # Function-scope imports kept as in the original.
    # NOTE(review): presumably these avoid import cycles — confirm.
    # Configuration is not re-imported; it is already imported at module
    # level.
    from .cleaners import DocumentCleaner
    from .extractors import ContentExtractor
    from .outputformatters import OutputFormatter

    config = Configuration()
    config.language = language

    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)

    doc = config.get_parser().fromstring(html)
    doc = document_cleaner.clean(doc)

    top_node = extractor.calculate_best_node(doc)
    top_node = extractor.post_cleanup(top_node)

    # get_formatted returns a (text, article_html) pair; only the text
    # is needed here.
    text, _ = output_formatter.get_formatted(top_node)
    return text