fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
132 lines
4.2 KiB
Python
132 lines
4.2 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
This module holds configuration objects, which can be thought of
|
|
as settings.py but dynamic and changing for whatever parent object
|
|
holds them. For example, pass in a config object to an Article
|
|
object, Source object, or even network methods, and it just works.
|
|
"""
|
|
__title__ = 'newspaper'
|
|
__author__ = 'Lucas Ou-Yang'
|
|
__license__ = 'MIT'
|
|
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'
|
|
|
|
import logging
|
|
|
|
from .parsers import Parser
|
|
from .text import (StopWords, StopWordsArabic, StopWordsChinese,
|
|
StopWordsKorean, StopWordsHindi, StopWordsJapanese)
|
|
from .version import __version__
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
class Configuration(object):
    """Container for the tunable settings of the object that holds it.

    Every setting is a plain instance attribute initialised in
    ``__init__``. ``language`` is exposed as a property because assigning
    it also selects the matching stopwords class and disables
    ``use_meta_language``.
    """

    def __init__(self):
        """
        Modify any of these Article / Source properties
        TODO: Have a separate ArticleConfig and SourceConfig extend this!
        """
        self.MIN_WORD_COUNT = 300  # num of word tokens in text
        self.MIN_SENT_COUNT = 7  # num of sentence tokens
        self.MAX_TITLE = 200  # num of chars
        self.MAX_TEXT = 100000  # num of chars
        self.MAX_KEYWORDS = 35  # num of strings in list
        self.MAX_AUTHORS = 10  # num strings in list
        self.MAX_SUMMARY = 5000  # num of chars
        self.MAX_SUMMARY_SENT = 5  # num of sentences

        # max number of urls we cache for each news source
        self.MAX_FILE_MEMO = 20000

        # Cache and save articles run after run
        self.memoize_articles = True

        # Set this to false if you don't care about getting images
        self.fetch_images = True
        # NOTE(review): "ration" looks like a typo for "ratio"; the name is
        # kept as-is because code outside this class may reference it.
        self.image_dimension_ration = 16 / 9.0

        # Follow meta refresh redirect when downloading
        self.follow_meta_refresh = False

        # Don't toggle this variable, done internally
        # (set_language() switches it off when a language is set explicitly)
        self.use_meta_language = True

        # You may keep the html of just the main article body
        self.keep_article_html = False

        # Fail for error responses (e.g. 404 page)
        self.http_success_only = True

        # English is the fallback
        self._language = 'en'

        # Stopwords class matching the current language; kept in sync by
        # set_language(), don't toggle
        self.stopwords_class = StopWords

        self.browser_user_agent = 'newspaper/%s' % __version__
        self.headers = {}
        self.request_timeout = 7
        self.proxies = {}
        self.number_threads = 10

        self.verbose = False  # for debugging

        self.thread_timeout_seconds = 1

        # Set this to False if you want to recompute the categories
        # *every* time you build a `Source` object
        # TODO: Actually make this work
        # self.use_cached_categories = True

    def get_language(self):
        """Return the currently configured language code."""
        return self._language

    def del_language(self):
        """Refuse deletion of the ``language`` property.

        Raises:
            AttributeError: always.
        """
        raise AttributeError("the 'language' property cannot be deleted")

    def set_language(self, language):
        """Set the language code and select the matching stopwords class.

        The language must be set through this method because some
        languages require a separate stopwords class.

        Raises:
            ValueError: if ``language`` is empty or is not exactly two
                characters long.
        """
        if not language or len(language) != 2:
            # Adjacent literals instead of a backslash continuation inside
            # the string, so source indentation does not leak into the text.
            raise ValueError(
                "Your input language must be a 2 char language code, "
                "for example: english-->en and german-->de")

        # If explicitly set language, don't use meta
        self.use_meta_language = False

        # Keep the stopwords class in sync with the new language
        self._language = language
        self.stopwords_class = self.get_stopwords_class(language)

    language = property(get_language, set_language,
                        del_language, "language prop")

    @staticmethod
    def get_stopwords_class(language):
        """Return the stopwords class to use for ``language``.

        Language codes without a dedicated class fall back to ``StopWords``.
        """
        if language == 'ko':
            return StopWordsKorean
        if language == 'hi':
            return StopWordsHindi
        if language == 'zh':
            return StopWordsChinese
        # Persian and Arabic share an alphabet
        # There is a persian parser https://github.com/sobhe/hazm, but nltk
        # is likely sufficient
        if language == 'ar' or language == 'fa':
            return StopWordsArabic
        if language == 'ja':
            return StopWordsJapanese
        return StopWords

    @staticmethod
    def get_parser():
        """Return the ``Parser`` class imported by this module."""
        return Parser
|
|
|
|
|
|
class ArticleConfiguration(Configuration):
    """Configuration subclass that currently adds nothing to ``Configuration``."""
|
|
|
|
|
|
class SourceConfiguration(Configuration):
    """Configuration subclass that currently adds nothing to ``Configuration``."""
|