fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
94 lines
2.6 KiB
Python
94 lines
2.6 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
Ignore the unused imports, this file's purpose is to make visible
|
|
anything which a user might need to import from newspaper.
|
|
View newspaper/__init__.py for its usage.
|
|
"""
|
|
__title__ = 'newspaper'
|
|
__author__ = 'Lucas Ou-Yang'
|
|
__license__ = 'MIT'
|
|
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'
|
|
|
|
import feedparser
|
|
|
|
from .article import Article
|
|
from .configuration import Configuration
|
|
from .settings import POPULAR_URLS, TRENDING_URL
|
|
from .source import Source
|
|
from .utils import extend_config, print_available_languages
|
|
|
|
|
|
def build(url='', dry=False, config=None, **kwargs) -> Source:
    """Construct a ``Source`` for ``url`` without downloading or
    parsing its articles.

    ``config`` defaults to a fresh ``Configuration``; it is passed through
    ``extend_config`` together with any extra keyword arguments. A falsy
    ``url`` is replaced by the empty string. When ``dry`` is truthy the
    source is returned as constructed; otherwise ``Source.build()`` is
    called on it first.
    """
    settings = extend_config(config or Configuration(), kwargs)
    source = Source(url or '', config=settings)
    if dry:
        return source
    source.build()
    return source
|
|
|
|
|
|
def build_article(url='', config=None, **kwargs) -> Article:
    """Construct an ``Article`` for ``url`` without downloading or
    parsing it.

    ``config`` defaults to a fresh ``Configuration``; it is passed through
    ``extend_config`` together with any extra keyword arguments. A falsy
    ``url`` is replaced by the empty string.
    """
    settings = extend_config(config or Configuration(), kwargs)
    return Article(url or '', config=settings)
|
|
|
|
|
|
def languages():
    """Show the supported languages.

    This delegates to ``print_available_languages()`` and returns
    nothing itself.
    """
    print_available_languages()
|
|
|
|
|
|
def popular_urls():
    """Return the pre-extracted popular source urls as a list of strings.

    Every non-blank line of the file at ``POPULAR_URLS`` is stripped of
    surrounding whitespace and prefixed with ``http://``.
    """
    # Explicit encoding so decoding does not depend on the platform's
    # locale default.
    with open(POPULAR_URLS, encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising it with readlines().
        hosts = (line.strip() for line in f)
        # Blank lines are skipped; they would otherwise yield a bare 'http://'.
        return ['http://' + host for host in hosts if host]
|
|
|
|
|
|
def hot():
    """Return a list of hot terms via google trends.

    The feed at ``TRENDING_URL`` is parsed with ``feedparser`` and the
    ``title`` of each entry is collected. This is best-effort: if anything
    raises, the error is printed and ``None`` is returned instead.
    """
    try:
        entries = feedparser.parse(TRENDING_URL)['entries']
        return [entry['title'] for entry in entries]
    except Exception as e:
        print('ERR hot terms failed!', str(e))
        return None
|
|
|
|
|
|
def fulltext(html, language='en'):
    """Extract the full article text from an HTML string.

    A fresh ``Configuration`` is created with its ``language`` set to
    ``language``. The HTML is parsed with the configuration's parser,
    cleaned, reduced to the best-scoring node, post-cleaned and formatted;
    only the text part of the formatter's result is returned.

    Per the original note, the input string is decoded via UnicodeDammit
    if needed (presumably inside the parser; not visible from here).
    """
    # Imported inside the function rather than at module level, as in the
    # original.
    from .cleaners import DocumentCleaner
    from .configuration import Configuration
    from .extractors import ContentExtractor
    from .outputformatters import OutputFormatter

    cfg = Configuration()
    cfg.language = language

    extractor = ContentExtractor(cfg)
    cleaner = DocumentCleaner(cfg)
    formatter = OutputFormatter(cfg)

    parsed = cfg.get_parser().fromstring(html)
    cleaned = cleaner.clean(parsed)

    best_node = extractor.post_cleanup(extractor.calculate_best_node(cleaned))

    # The formatter returns a (text, html) pair; only the text is needed.
    text, _ = formatter.get_formatted(best_node)
    return text
|