Files
MoFin/venv/lib/python3.12/site-packages/newspaper/configuration.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

132 lines
4.2 KiB
Python

# -*- coding: utf-8 -*-
"""
This module holds configuration objects, which can be thought of
as settings.py but dynamic and changing for whatever parent object
holds them. For example, pass in a config object to an Article
object, Source object, or even network methods, and it just works.
"""
# Package metadata exposed as module-level dunder constants.
__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'
import logging
from .parsers import Parser
from .text import (StopWords, StopWordsArabic, StopWordsChinese,
StopWordsKorean, StopWordsHindi, StopWordsJapanese)
from .version import __version__
# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)
class Configuration(object):
    """Mutable collection of settings.

    Every setting is a plain instance attribute assigned in ``__init__``
    and may be overwritten freely. The one exception is ``language``,
    which is a property: assigning to it validates the code and updates
    ``stopwords_class`` and ``use_meta_language`` accordingly.
    """

    def __init__(self):
        """Populate every setting with its default value.

        Modify any of these Article / Source properties.
        TODO: Have a separate ArticleConfig and SourceConfig extend this!
        """
        self.MIN_WORD_COUNT = 300  # num of word tokens in text
        self.MIN_SENT_COUNT = 7  # num of sentence tokens
        self.MAX_TITLE = 200  # num of chars
        self.MAX_TEXT = 100000  # num of chars
        self.MAX_KEYWORDS = 35  # num of strings in list
        self.MAX_AUTHORS = 10  # num strings in list
        self.MAX_SUMMARY = 5000  # num of chars
        self.MAX_SUMMARY_SENT = 5  # num of sentences

        # Max number of urls we cache for each news source
        self.MAX_FILE_MEMO = 20000

        # Cache and save articles run after run
        self.memoize_articles = True

        # Set this to False if you don't care about getting images
        self.fetch_images = True
        # NOTE(review): "ration" looks like a typo for "ratio"; the name is
        # kept as-is because code outside this class may read it.
        self.image_dimension_ration = 16 / 9.0

        # Follow meta refresh redirect when downloading
        self.follow_meta_refresh = False

        # Don't toggle this variable, done internally
        # (set_language() switches it off when a language is set explicitly)
        self.use_meta_language = True

        # You may keep the html of just the main article body
        self.keep_article_html = False

        # Fail for error responses (e.g. 404 page)
        self.http_success_only = True

        # English is the fallback
        self._language = 'en'

        # Language-specific stopword class; don't set directly, it is
        # chosen by set_language() via get_stopwords_class()
        self.stopwords_class = StopWords

        self.browser_user_agent = 'newspaper/%s' % __version__
        self.headers = {}
        self.request_timeout = 7  # presumably seconds -- TODO confirm
        self.proxies = {}
        self.number_threads = 10
        self.verbose = False  # for debugging
        self.thread_timeout_seconds = 1

        # Set this to False if you want to recompute the categories
        # *every* time you build a `Source` object
        # TODO: Actually make this work
        # self.use_cached_categories = True

    def get_language(self):
        """Return the current language code."""
        return self._language

    def del_language(self):
        """Refuse to delete the language setting.

        Raises:
            AttributeError: always.
        """
        raise AttributeError("the 'language' setting cannot be deleted")

    def set_language(self, language):
        """Set the language code and pick the matching stopwords class.

        The language must be set through this method (or the ``language``
        property) because some languages require a separate stopwords
        class.

        Raises:
            ValueError: if ``language`` is not a two-character string.
        """
        # Check the type first so that non-string input (e.g. an int) gets
        # the same validation error instead of a TypeError from len().
        if not isinstance(language, str) or len(language) != 2:
            raise ValueError(
                "Your input language must be a 2 char language code, "
                "for example: english-->en \n and german-->de")

        # If the language is set explicitly, don't use meta
        self.use_meta_language = False

        self._language = language
        self.stopwords_class = self.get_stopwords_class(language)

    language = property(get_language, set_language,
                        del_language, "language prop")

    @staticmethod
    def get_stopwords_class(language):
        """Return the stopwords class to use for ``language``.

        Languages without a dedicated class fall back to ``StopWords``.
        """
        if language == 'ko':
            return StopWordsKorean
        if language == 'hi':
            return StopWordsHindi
        if language == 'zh':
            return StopWordsChinese
        # Persian and Arabic share an alphabet.
        # There is a persian parser https://github.com/sobhe/hazm,
        # but nltk is likely sufficient
        if language == 'ar' or language == 'fa':
            return StopWordsArabic
        if language == 'ja':
            return StopWordsJapanese
        return StopWords

    @staticmethod
    def get_parser():
        """Return the ``Parser`` class imported by this module."""
        return Parser
class ArticleConfiguration(Configuration):
    """Subclass of ``Configuration`` that currently adds nothing to it.

    NOTE(review): presumably a placeholder for article-specific
    settings -- confirm before relying on it being distinct.
    """
class SourceConfiguration(Configuration):
    """Subclass of ``Configuration`` that currently adds nothing to it.

    NOTE(review): presumably a placeholder for source-specific
    settings -- confirm before relying on it being distinct.
    """