Files
MoFin/venv/lib/python3.12/site-packages/newspaper/utils.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

414 lines
12 KiB
Python

# -*- coding: utf-8 -*-
"""
Holds misc. utility methods which prove to be
useful throughout this library.
"""
__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'
import codecs
import hashlib
import logging
import os
import pickle
import random
import re
import string
import sys
import threading
import time
from hashlib import sha1
from bs4 import BeautifulSoup
from . import settings
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
class FileHelper(object):
@staticmethod
def loadResourceFile(filename):
if not os.path.isabs(filename):
dirpath = os.path.abspath(os.path.dirname(__file__))
path = os.path.join(dirpath, 'resources', filename)
else:
path = filename
try:
f = codecs.open(path, 'r', 'utf-8')
content = f.read()
f.close()
return content
except IOError:
raise IOError("Couldn't open file %s" % path)
class ParsingCandidate(object):
def __init__(self, url, link_hash):
self.url = url
self.link_hash = link_hash
class RawHelper(object):
@staticmethod
def get_parsing_candidate(url, raw_html):
if isinstance(raw_html, str):
raw_html = raw_html.encode('utf-8', 'replace')
link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time())
return ParsingCandidate(url, link_hash)
class URLHelper(object):
@staticmethod
def get_parsing_candidate(url_to_crawl):
# Replace shebang in urls
final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \
if '#!' in url_to_crawl else url_to_crawl
link_hash = '%s.%s' % (hashlib.md5(final_url).hexdigest(), time.time())
return ParsingCandidate(final_url, link_hash)
class StringSplitter(object):
def __init__(self, pattern):
self.pattern = re.compile(pattern)
def split(self, string):
if not string:
return []
return self.pattern.split(string)
class StringReplacement(object):
def __init__(self, pattern, replaceWith):
self.pattern = pattern
self.replaceWith = replaceWith
def replaceAll(self, string):
if not string:
return ''
return string.replace(self.pattern, self.replaceWith)
class ReplaceSequence(object):
def __init__(self):
self.replacements = []
def create(self, firstPattern, replaceWith=None):
result = StringReplacement(firstPattern, replaceWith or '')
self.replacements.append(result)
return self
def append(self, pattern, replaceWith=None):
return self.create(pattern, replaceWith)
def replaceAll(self, string):
if not string:
return ''
mutatedString = string
for rp in self.replacements:
mutatedString = rp.replaceAll(mutatedString)
return mutatedString
class TimeoutError(Exception):
pass
def timelimit(timeout):
"""Borrowed from web.py, rip Aaron Swartz
"""
def _1(function):
def _2(*args, **kw):
class Dispatch(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
self.result = None
self.error = None
self.setDaemon(True)
self.start()
def run(self):
try:
self.result = function(*args, **kw)
except:
self.error = sys.exc_info()
c = Dispatch()
c.join(timeout)
if c.isAlive():
raise TimeoutError()
if c.error:
raise c.error[0](c.error[1])
return c.result
return _2
return _1
def domain_to_filename(domain):
"""All '/' are turned into '-', no trailing. schema's
are gone, only the raw domain + ".txt" remains
"""
filename = domain.replace('/', '-')
if filename[-1] == '-':
filename = filename[:-1]
filename += ".txt"
return filename
def filename_to_domain(filename):
"""[:-4] for the .txt at end
"""
return filename.replace('-', '/')[:-4]
def is_ascii(word):
"""True if a word is only ascii chars
"""
def onlyascii(char):
if ord(char) > 127:
return ''
else:
return char
for c in word:
if not onlyascii(c):
return False
return True
def extract_meta_refresh(html):
""" Parses html for a tag like:
<meta http-equiv="refresh" content="0;URL='http://sfbay.craigslist.org/eby/cto/5617800926.html'" />
Example can be found at: https://www.google.com/url?rct=j&sa=t&url=http://sfbay.craigslist.org/eby/cto/
5617800926.html&ct=ga&cd=CAAYATIaYTc4ZTgzYjAwOTAwY2M4Yjpjb206ZW46VVM&usg=AFQjCNF7zAl6JPuEsV4PbEzBomJTUpX4Lg
"""
soup = BeautifulSoup(html, 'html.parser')
element = soup.find('meta', attrs={'http-equiv': 'refresh'})
if element:
try:
wait_part, url_part = element['content'].split(";")
except ValueError:
# In case there are not enough values to unpack
# for instance: <meta http-equiv="refresh" content="600" />
return None
else:
# Get rid of any " or ' inside the element
# for instance:
# <meta http-equiv="refresh" content="0;URL='http://sfbay.craigslist.org/eby/cto/5617800926.html'" />
if url_part.lower().startswith("url="):
return url_part[4:].replace('"', '').replace("'", '')
def to_valid_filename(s):
"""Converts arbitrary string (for us domain name)
into a valid file name for caching
"""
valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
return ''.join(c for c in s if c in valid_chars)
def cache_disk(seconds=(86400 * 5), cache_folder="/tmp"):
"""Caching extracting category locations & rss feeds for 5 days
"""
def do_cache(function):
def inner_function(*args, **kwargs):
"""Calculate a cache key based on the decorated method signature
args[1] indicates the domain of the inputs, we hash on domain!
"""
key = sha1((str(args[1]) +
str(kwargs)).encode('utf-8')).hexdigest()
filepath = os.path.join(cache_folder, key)
# verify that the cached object exists and is less than
# X seconds old
if os.path.exists(filepath):
modified = os.path.getmtime(filepath)
age_seconds = time.time() - modified
if age_seconds < seconds:
return pickle.load(open(filepath, "rb"))
# call the decorated function...
result = function(*args, **kwargs)
# ... and save the cached object for next time
pickle.dump(result, open(filepath, "wb"))
return result
return inner_function
return do_cache
def print_duration(method):
"""Prints out the runtime duration of a method in seconds
"""
def timed(*args, **kw):
ts = time.time()
result = method(*args, **kw)
te = time.time()
print('%r %2.2f sec' % (method.__name__, te - ts))
return result
return timed
def chunks(l, n):
"""Yield n successive chunks from l
"""
newn = int(len(l) / n)
for i in range(0, n - 1):
yield l[i * newn:i * newn + newn]
yield l[n * newn - newn:]
def purge(fn, pattern):
"""Delete files in a dir matching pattern
"""
for f in os.listdir(fn):
if re.search(pattern, f):
os.remove(os.path.join(fn, f))
def clear_memo_cache(source):
"""Clears the memoization cache for this specific news domain
"""
d_pth = os.path.join(settings.MEMO_DIR, domain_to_filename(source.domain))
if os.path.exists(d_pth):
os.remove(d_pth)
else:
print('memo file for', source.domain, 'has already been deleted!')
def memoize_articles(source, articles):
"""When we parse the <a> links in an <html> page, on the 2nd run
and later, check the <a> links of previous runs. If they match,
it means the link must not be an article, because article urls
change as time passes. This method also uniquifies articles.
"""
source_domain = source.domain
config = source.config
if len(articles) == 0:
return []
memo = {}
cur_articles = {article.url: article for article in articles}
d_pth = os.path.join(settings.MEMO_DIR, domain_to_filename(source_domain))
if os.path.exists(d_pth):
f = codecs.open(d_pth, 'r', 'utf8')
urls = f.readlines()
f.close()
urls = [u.strip() for u in urls]
memo = {url: True for url in urls}
# prev_length = len(memo)
for url, article in list(cur_articles.items()):
if memo.get(url):
del cur_articles[url]
valid_urls = list(memo.keys()) + list(cur_articles.keys())
memo_text = '\r\n'.join(
[href.strip() for href in (valid_urls)])
# Our first run with memoization, save every url as valid
else:
memo_text = '\r\n'.join(
[href.strip() for href in list(cur_articles.keys())])
# new_length = len(cur_articles)
if len(memo) > config.MAX_FILE_MEMO:
# We still keep current batch of articles though!
log.critical('memo overflow, dumping')
memo_text = ''
# TODO if source: source.write_upload_times(prev_length, new_length)
ff = codecs.open(d_pth, 'w', 'utf-8')
ff.write(memo_text)
ff.close()
return list(cur_articles.values())
def get_useragent():
"""Uses generator to return next useragent in saved file
"""
with open(settings.USERAGENTS, 'r') as f:
agents = f.readlines()
selection = random.randint(0, len(agents) - 1)
agent = agents[selection]
return agent.strip()
def get_available_languages():
"""Returns a list of available languages and their 2 char input codes
"""
stopword_files = os.listdir(os.path.join(settings.STOPWORDS_DIR))
two_dig_codes = [f.split('-')[1].split('.')[0] for f in stopword_files]
for d in two_dig_codes:
assert len(d) == 2
return two_dig_codes
def print_available_languages():
"""Prints available languages with their full names
"""
language_dict = {
'ar': 'Arabic',
'ru': 'Russian',
'nl': 'Dutch',
'de': 'German',
'en': 'English',
'es': 'Spanish',
'fr': 'French',
'he': 'Hebrew',
'it': 'Italian',
'ko': 'Korean',
'no': 'Norwegian',
'nb': 'Norwegian (Bokmål)',
'fa': 'Persian',
'pl': 'Polish',
'pt': 'Portuguese',
'sv': 'Swedish',
'hu': 'Hungarian',
'fi': 'Finnish',
'da': 'Danish',
'zh': 'Chinese',
'id': 'Indonesian',
'vi': 'Vietnamese',
'mk': 'Macedonian',
'tr': 'Turkish',
'el': 'Greek',
'uk': 'Ukrainian',
'hi': 'Hindi',
'sw': 'Swahili',
'bg': 'Bulgarian',
'hr': 'Croatian',
'ro': 'Romanian',
'sl': 'Slovenian',
'sr': 'Serbian',
'et': 'Estonian',
'ja': 'Japanese',
'be': 'Belarusian'
}
codes = get_available_languages()
print('\nYour available languages are:')
print('\ninput code\t\tfull name')
for code in codes:
print(' %s\t\t\t %s' % (code, language_dict[code]))
print()
def extend_config(config, config_items):
"""
We are handling config value setting like this for a cleaner api.
Users just need to pass in a named param to this source and we can
dynamically generate a config object for it.
"""
for key, val in list(config_items.items()):
if hasattr(config, key):
setattr(config, key, val)
return config