Files
MoFin/venv/lib/python3.12/site-packages/newspaper/mthreading.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

134 lines
4.0 KiB
Python

# -*- coding: utf-8 -*-
"""
Anything that has to do with threading in this library
must be abstracted in this file. If we decide to do gevent
also, it will deserve its own gevent file.
"""
__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'
import logging
import queue
import traceback
from threading import Thread
from .configuration import Configuration
log = logging.getLogger(__name__)
class ConcurrencyException(Exception):
pass
class Worker(Thread):
"""
Thread executing tasks from a given tasks queue.
"""
def __init__(self, tasks, timeout_seconds):
Thread.__init__(self)
self.tasks = tasks
self.timeout = timeout_seconds
self.daemon = True
self.start()
def run(self):
while True:
try:
func, args, kargs = self.tasks.get(timeout=self.timeout)
except queue.Empty:
# Extra thread allocated, no job, exit gracefully
break
try:
func(*args, **kargs)
except Exception:
traceback.print_exc()
self.tasks.task_done()
class ThreadPool:
def __init__(self, num_threads, timeout_seconds):
self.tasks = queue.Queue(num_threads)
for _ in range(num_threads):
Worker(self.tasks, timeout_seconds)
def add_task(self, func, *args, **kargs):
self.tasks.put((func, args, kargs))
def wait_completion(self):
self.tasks.join()
class NewsPool(object):
def __init__(self, config=None):
"""
Abstraction of a threadpool. A newspool can accept any number of
source OR article objects together in a list. It allocates one
thread to every source and then joins.
We allocate one thread per source to avoid rate limiting.
5 sources = 5 threads, one per source.
>>> import newspaper
>>> from newspaper import news_pool
>>> cnn_paper = newspaper.build('http://cnn.com')
>>> tc_paper = newspaper.build('http://techcrunch.com')
>>> espn_paper = newspaper.build('http://espn.com')
>>> papers = [cnn_paper, tc_paper, espn_paper]
>>> news_pool.set(papers)
>>> news_pool.join()
# All of your papers should have their articles html all populated now.
>>> cnn_paper.articles[50].html
u'<html>blahblah ... '
"""
self.pool = None
self.config = config or Configuration()
def join(self):
"""
Runs the mtheading and returns when all threads have joined
resets the task.
"""
if self.pool is None:
raise ConcurrencyException('Call set(..) with a list of source objects '
'before calling .join(..)')
self.pool.wait_completion()
self.pool = None
def set(self, news_list, threads_per_source=1, override_threads=None):
"""
news_list can be a list of `Article`, `Source`, or both.
If caller wants to decide how many threads to use, they can use
`override_threads` which takes precedence over all. Otherwise,
this api infers that if the input is all `Source` objects, to
allocate one thread per `Source` to not spam the host.
If both of the above conditions are not true, default to 1 thread.
"""
from .source import Source
if override_threads is not None:
num_threads = override_threads
elif all([isinstance(n, Source) for n in news_list]):
num_threads = threads_per_source * len(news_list)
else:
num_threads = 1
timeout = self.config.thread_timeout_seconds
self.pool = ThreadPool(num_threads, timeout)
for news_object in news_list:
if isinstance(news_object, Source):
self.pool.add_task(news_object.download_articles)
else:
self.pool.add_task(news_object.download)