Files
MoFin/venv/lib/python3.12/site-packages/newspaper/network.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

135 lines
4.0 KiB
Python

# -*- coding: utf-8 -*-
"""
All code involving requests and responses over the http network
must be abstracted in this file.
"""
__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'
import logging
import requests
from .configuration import Configuration
from .mthreading import ThreadPool
from .settings import cj
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Encoding value that _get_html_from_response() compares response.encoding
# against; a match triggers its fallback charset detection.
FAIL_ENCODING = 'ISO-8859-1'
def get_request_kwargs(timeout, useragent, proxies, headers):
    """Build the keyword arguments passed to ``requests.get``.

    This is a function rather than a constant dict because some values
    (the cookie jar returned by ``cj()``) must be produced anew for every
    request.

    ``headers`` is used as given when truthy; otherwise a minimal header
    dict carrying only ``useragent`` is sent.
    """
    request_headers = headers or {'User-Agent': useragent}
    kwargs = dict(
        headers=request_headers,
        cookies=cj(),
        timeout=timeout,
        allow_redirects=True,
        proxies=proxies,
    )
    return kwargs
def get_html(url, config=None, response=None):
    """Fetch the HTML for ``url`` without propagating request errors.

    Delegates to ``get_html_2XX_only``; any ``RequestException`` it raises
    is logged at debug level and an empty string is returned instead.
    """
    try:
        html = get_html_2XX_only(url, config, response)
    except requests.exceptions.RequestException as err:
        message = 'get_html() error. %s on URL: %s' % (err, url)
        log.debug(message)
        html = ''
    return html
def get_html_2XX_only(url, config=None, response=None):
    """Central entry point for newspaper's HTTP fetches.

    - If ``response`` is supplied, no request is made; the HTML is taken
      straight from that response.
    - Otherwise ``url`` is fetched with the settings from ``config`` (a
      default ``Configuration`` is created when none is given).
    - When ``config.http_success_only`` is set, a non 2XX status raises via
      ``response.raise_for_status()``.

    Encoding detection is delegated to ``_get_html_from_response``.
    """
    cfg = config or Configuration()
    useragent, timeout, proxies, headers = (
        cfg.browser_user_agent,
        cfg.request_timeout,
        cfg.proxies,
        cfg.headers,
    )

    if response is None:
        request_kwargs = get_request_kwargs(
            timeout, useragent, proxies, headers)
        response = requests.get(url=url, **request_kwargs)
        html = _get_html_from_response(response)
        if cfg.http_success_only:
            # fail if HTTP sends a non 2XX response
            response.raise_for_status()
        return html

    # A pre-fetched response was handed in: just extract its HTML.
    return _get_html_from_response(response)
def _get_html_from_response(response):
    """Extract the HTML payload from a ``requests`` response.

    If the response's encoding differs from ``FAIL_ENCODING``, the decoded
    ``response.text`` is returned. Otherwise the raw ``response.content``
    is the starting point and, when the Content-Type header does not
    declare a charset, the encoding is looked up in the document itself;
    if one is found it is applied to the response and the re-decoded text
    is returned.

    A missing Content-Type header is treated as "no charset declared".

    Returns the HTML, or ``''`` when the payload is empty.
    """
    if response.encoding != FAIL_ENCODING:
        # return response as a unicode string
        return response.text or ''

    html = response.content
    # headers.get() yields None when the server sent no Content-Type header;
    # normalise to '' so the membership test below cannot raise TypeError.
    content_type = response.headers.get('content-type') or ''
    if 'charset' not in content_type:
        encodings = requests.utils.get_encodings_from_content(response.text)
        if encodings:
            response.encoding = encodings[0]
            html = response.text
    return html or ''
class MRequest(object):
    """Wrapper around a single GET request, for use with multithreading.

    ``send()`` stores the response on ``self.resp``. If ``requests.get``
    itself fails (e.g. the crawled domain is under heavy load),
    ``self.resp`` is left as ``None``; ``self.url`` still reports which url
    failed so it can (perhaps) be retried later. If the response arrives
    but is rejected by ``raise_for_status()``, it remains on ``self.resp``.
    """

    def __init__(self, url, config=None):
        """Capture the url and the request settings taken from ``config``.

        When ``config`` is falsy a default ``Configuration`` is created.
        """
        self.url = url
        # Store the *resolved* configuration: send() reads
        # self.config.http_success_only, which would raise AttributeError
        # if the caller's None were kept here.
        config = config or Configuration()
        self.config = config
        self.useragent = config.browser_user_agent
        self.timeout = config.request_timeout
        self.proxies = config.proxies
        self.headers = config.headers
        self.resp = None

    def send(self):
        """Perform the GET request and record the response on ``self.resp``.

        ``RequestException`` errors are logged at critical level and
        swallowed rather than propagated.
        """
        try:
            self.resp = requests.get(self.url, **get_request_kwargs(
                self.timeout, self.useragent, self.proxies, self.headers))
            if self.config.http_success_only:
                self.resp.raise_for_status()
        except requests.exceptions.RequestException as e:
            log.critical('[REQUEST FAILED] %s', e)
def multithread_request(urls, config=None):
    """Fetch several urls through the thread pool.

    One ``MRequest`` is built per url, in the order the urls were given,
    and each request's ``send`` is queued on a ``ThreadPool`` sized from
    ``config``. After the pool reports completion the same ``MRequest``
    objects are returned, now carrying whatever ``send`` stored on them.
    """
    cfg = config or Configuration()
    pool = ThreadPool(cfg.number_threads, cfg.thread_timeout_seconds)

    m_requests = [MRequest(url, cfg) for url in urls]
    for m_request in m_requests:
        pool.add_task(m_request.send)

    pool.wait_completion()
    return m_requests