fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
414 lines
12 KiB
Python
414 lines
12 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
Holds misc. utility methods which prove to be
|
|
useful throughout this library.
|
|
"""
|
|
__title__ = 'newspaper'
|
|
__author__ = 'Lucas Ou-Yang'
|
|
__license__ = 'MIT'
|
|
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'
|
|
|
|
import codecs
|
|
import hashlib
|
|
import logging
|
|
import os
|
|
import pickle
|
|
import random
|
|
import re
|
|
import string
|
|
import sys
|
|
import threading
|
|
import time
|
|
|
|
from hashlib import sha1
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from . import settings
|
|
|
|
# Logger shared by the helpers in this module. Its level is pinned to DEBUG
# at import time, so any filtering is left to the handlers attached to it.
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
|
|
|
|
|
|
class FileHelper(object):
    """Helpers for reading text files from disk."""

    @staticmethod
    def loadResourceFile(filename):
        """Return the UTF-8 decoded contents of a file.

        A relative ``filename`` is resolved inside the ``resources``
        directory located next to this module; an absolute ``filename``
        is opened exactly as given.

        Raises:
            IOError: if the file cannot be opened or read. The message
                carries the resolved path and the original error is
                chained as the cause.
        """
        if os.path.isabs(filename):
            path = filename
        else:
            dirpath = os.path.abspath(os.path.dirname(__file__))
            path = os.path.join(dirpath, 'resources', filename)
        try:
            # The context manager closes the handle even when read() fails,
            # which the manual open/read/close sequence did not guarantee.
            with codecs.open(path, 'r', 'utf-8') as f:
                return f.read()
        except IOError as err:
            raise IOError("Couldn't open file %s" % path) from err
|
|
|
|
|
|
class ParsingCandidate(object):
    """Plain record pairing a url with the hash string computed for it."""

    def __init__(self, url, link_hash):
        self.url, self.link_hash = url, link_hash
|
|
|
|
|
|
class RawHelper(object):
    """Builds parsing candidates from html that is already in memory."""

    @staticmethod
    def get_parsing_candidate(url, raw_html):
        """Return a ParsingCandidate for ``url`` hashed on ``raw_html``.

        The link hash is the md5 hex digest of the html (text input is
        encoded as UTF-8 with the 'replace' error handler first), followed
        by a dot and the current ``time.time()`` value.
        """
        payload = raw_html
        if isinstance(payload, str):
            payload = payload.encode('utf-8', 'replace')
        digest = hashlib.md5(payload).hexdigest()
        stamp = time.time()
        return ParsingCandidate(url, '%s.%s' % (digest, stamp))
|
|
|
|
|
|
class URLHelper(object):
    """Builds parsing candidates from urls."""

    @staticmethod
    def get_parsing_candidate(url_to_crawl):
        """Return a ParsingCandidate for ``url_to_crawl``.

        Every '#!' in the url is rewritten to '?_escaped_fragment_='.
        The link hash is the md5 hex digest of the rewritten url, followed
        by a dot and the current ``time.time()`` value.
        """
        # Replace shebang in urls; str.replace is already a no-op when the
        # url contains no '#!', so no separate membership test is needed.
        final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=')
        # hashlib only accepts bytes on Python 3, so encode the text first
        # (same encoding and error handler RawHelper uses).
        digest = hashlib.md5(final_url.encode('utf-8', 'replace')).hexdigest()
        link_hash = '%s.%s' % (digest, time.time())
        return ParsingCandidate(final_url, link_hash)
|
|
|
|
|
|
class StringSplitter(object):
    """Splits text on a regular expression compiled at construction."""

    def __init__(self, pattern):
        self.pattern = re.compile(pattern)

    def split(self, string):
        """Return the pieces of ``string`` found between pattern matches.

        Any falsy input (``None``, ``''``) gives an empty list.
        """
        return self.pattern.split(string) if string else []
|
|
|
|
|
|
class StringReplacement(object):
    """One literal (non-regex) substitution rule."""

    def __init__(self, pattern, replaceWith):
        self.pattern, self.replaceWith = pattern, replaceWith

    def replaceAll(self, string):
        """Return ``string`` with every occurrence of the pattern replaced.

        Any falsy input (``None``, ``''``) gives an empty string.
        """
        if string:
            return string.replace(self.pattern, self.replaceWith)
        return ''
|
|
|
|
|
|
class ReplaceSequence(object):
    """Ordered chain of StringReplacement rules applied one after another."""

    def __init__(self):
        self.replacements = []

    def create(self, firstPattern, replaceWith=None):
        """Add a rule and return ``self`` so that calls can be chained.

        A missing (or otherwise falsy) ``replaceWith`` means the pattern
        is replaced by the empty string.
        """
        substitute = replaceWith or ''
        self.replacements.append(StringReplacement(firstPattern, substitute))
        return self

    def append(self, pattern, replaceWith=None):
        """Same as ``create``; delegates to it."""
        return self.create(pattern, replaceWith)

    def replaceAll(self, string):
        """Run every rule over ``string`` in insertion order.

        Any falsy input (``None``, ``''``) gives an empty string.
        """
        if not string:
            return ''

        current = string
        for rule in self.replacements:
            current = rule.replaceAll(current)
        return current
|
|
|
|
|
|
class TimeoutError(Exception):
    """Module-local timeout exception.

    Derives directly from Exception and shadows the builtin of the same
    name inside this module.
    """
|
|
|
|
|
|
def timelimit(timeout):
    """Decorator factory bounding how long a caller waits for a function.

    Borrowed from web.py, rip Aaron Swartz

    The decorated function runs in a daemon thread and the caller waits
    at most ``timeout`` seconds for it to finish:

    * finished in time  -> its return value is returned;
    * raised            -> the same exception is re-raised in the caller;
    * still running     -> ``TimeoutError`` is raised.

    The worker thread is not stopped on timeout; it simply keeps running
    in the background as a daemon thread.
    """
    def _1(function):
        def _2(*args, **kw):
            class Dispatch(threading.Thread):
                def __init__(self):
                    threading.Thread.__init__(self)
                    self.result = None
                    self.error = None

                    # ``daemon`` attribute instead of the deprecated
                    # setDaemon() call.
                    self.daemon = True
                    self.start()

                def run(self):
                    try:
                        self.result = function(*args, **kw)
                    except BaseException:
                        # Deliberately broad: whatever the call raised is
                        # stored and re-raised in the waiting thread.
                        self.error = sys.exc_info()

            c = Dispatch()
            c.join(timeout)
            # Thread.isAlive() was removed in Python 3.9; is_alive() is the
            # supported spelling.
            if c.is_alive():
                raise TimeoutError()
            if c.error:
                # Re-raise the original instance with its traceback rather
                # than wrapping it in a freshly constructed exception.
                raise c.error[1].with_traceback(c.error[2])
            return c.result
        return _2
    return _1
|
|
|
|
|
|
def domain_to_filename(domain):
    """Turn a domain string into a flat ".txt" file name.

    Every '/' becomes '-', one trailing '-' (if present) is dropped and
    ".txt" is appended. Nothing else is altered. An empty ``domain``
    yields ".txt".
    """
    filename = domain.replace('/', '-')
    # endswith() is safe on an empty string, unlike indexing filename[-1].
    if filename.endswith('-'):
        filename = filename[:-1]
    return filename + ".txt"
|
|
|
|
|
|
def filename_to_domain(filename):
    """Drop the last four characters (the ".txt" suffix) of ``filename``
    and turn every '-' of what is left into '/'.
    """
    stem = filename[:-4]
    return stem.replace('-', '/')
|
|
|
|
|
|
def is_ascii(word):
    """True if every character of ``word`` has a code point of at most 127
    (an empty word counts as ascii)
    """
    return all(ord(ch) <= 127 for ch in word)
|
|
|
|
|
|
def extract_meta_refresh(html):
    """Return the redirect target of a meta-refresh tag in ``html``.

    Parses html for a tag like:
        <meta http-equiv="refresh" content="0;URL='http://sfbay.craigslist.org/eby/cto/5617800926.html'" />

    Returns:
        The url with every " and ' removed, or None when there is no such
        tag, the tag has no ``content`` attribute, the content holds no
        ';' (for instance ``content="600"``), or the part after the first
        ';' does not start with "url=" (case-insensitive).
    """
    soup = BeautifulSoup(html, 'html.parser')
    # The http-equiv value is matched case-insensitively so that
    # "Refresh"/"REFRESH" are found as well.
    element = soup.find(
        'meta', attrs={'http-equiv': re.compile(r'^refresh$', re.I)})
    if not element:
        return None

    # .get() avoids a KeyError on a <meta> without a content attribute.
    content = element.get('content')
    if not content:
        return None

    # Split on the first ';' only: the url itself may contain ';'.
    wait_part, separator, url_part = content.partition(';')
    if not separator:
        # Not enough values, for instance:
        # <meta http-equiv="refresh" content="600" />
        return None

    # Tolerate whitespace around the url part, e.g. content="0; URL=...".
    url_part = url_part.strip()
    if url_part.lower().startswith("url="):
        # Get rid of any " or ' inside the element
        return url_part[4:].replace('"', '').replace("'", '')
    return None
|
|
|
|
|
|
def to_valid_filename(s):
    """Keep only the characters of ``s`` that are ascii letters, digits,
    a space or one of ``-_.()``; everything else is dropped.
    """
    allowed = "-_.() " + string.ascii_letters + string.digits
    kept = [ch for ch in s if ch in allowed]
    return ''.join(kept)
|
|
|
|
|
|
def cache_disk(seconds=(86400 * 5), cache_folder="/tmp"):
    """Decorator factory caching a function's result on disk as a pickle.

    Args:
        seconds: maximum age, in seconds, of a cache file that is still
            served (default: 5 days).
        cache_folder: directory holding the cache files.

    The cache key is the SHA-1 of ``str(args[1]) + str(kwargs)``, so the
    decorated callable has to be called with at least two positional
    arguments (an IndexError is raised otherwise).
    """
    def do_cache(function):
        def inner_function(*args, **kwargs):
            """Calculate a cache key based on the decorated method signature
            args[1] indicates the domain of the inputs, we hash on domain!
            """
            key = sha1((str(args[1]) +
                        str(kwargs)).encode('utf-8')).hexdigest()
            filepath = os.path.join(cache_folder, key)

            # verify that the cached object exists and is less than
            # X seconds old
            if os.path.exists(filepath):
                age_seconds = time.time() - os.path.getmtime(filepath)
                if age_seconds < seconds:
                    # NOTE(security): unpickling executes arbitrary code;
                    # cache_folder must only be writable by trusted users.
                    with open(filepath, "rb") as cached:
                        return pickle.load(cached)

            # call the decorated function...
            result = function(*args, **kwargs)
            # ... and save the cached object for next time; the context
            # manager makes sure the handle is flushed and closed.
            with open(filepath, "wb") as cached:
                pickle.dump(result, cached)
            return result
        return inner_function
    return do_cache
|
|
|
|
|
|
def print_duration(method):
    """Decorator that prints the wrapped callable's name and its wall-clock
    run time in seconds after every call, then hands back its result
    """
    def timed(*args, **kw):
        started = time.time()
        outcome = method(*args, **kw)
        elapsed = time.time() - started
        print('%r %2.2f sec' % (method.__name__, elapsed))
        return outcome
    return timed
|
|
|
|
|
|
def chunks(l, n):
    """Yield n successive chunks from l

    The first n - 1 chunks each hold int(len(l) / n) items; the final
    chunk holds everything that is left.
    """
    per_chunk = int(len(l) / n)
    for index in range(n - 1):
        begin = index * per_chunk
        yield l[begin:begin + per_chunk]
    yield l[(n - 1) * per_chunk:]
|
|
|
|
|
|
def purge(fn, pattern):
    """Remove each entry of directory ``fn`` whose name is matched by the
    regular expression ``pattern`` (``re.search`` semantics)
    """
    for entry in os.listdir(fn):
        if not re.search(pattern, entry):
            continue
        target = os.path.join(fn, entry)
        os.remove(target)
|
|
|
|
|
|
def clear_memo_cache(source):
    """Delete the memo file kept for ``source.domain`` inside
    ``settings.MEMO_DIR``; print a notice when no such file exists
    """
    memo_dir = settings.MEMO_DIR
    memo_path = os.path.join(memo_dir, domain_to_filename(source.domain))
    if not os.path.exists(memo_path):
        print('memo file for', source.domain, 'has already been deleted!')
        return
    os.remove(memo_path)
|
|
|
|
|
|
def memoize_articles(source, articles):
    """When we parse the <a> links in an <html> page, on the 2nd run
    and later, check the <a> links of previous runs. If they match,
    it means the link must not be an article, because article urls
    change as time passes. This method also uniquifies articles.

    Args:
        source: object providing ``domain`` and ``config`` (the latter
            with a ``MAX_FILE_MEMO`` limit).
        articles: sized collection of objects exposing ``url``.

    Returns:
        The articles (one per distinct url) whose url is not listed in
        the memo file of the domain. An empty input returns ``[]``
        without touching the memo file.

    Side effect: the memo file under ``settings.MEMO_DIR`` is rewritten
    with the previously stored urls plus the new ones, or emptied when
    the stored memo already held more than ``MAX_FILE_MEMO`` urls.
    """
    source_domain = source.domain
    config = source.config

    if len(articles) == 0:
        return []

    memo = {}
    # Keyed by url, so later duplicates replace earlier ones.
    cur_articles = {article.url: article for article in articles}
    d_pth = os.path.join(settings.MEMO_DIR, domain_to_filename(source_domain))

    if os.path.exists(d_pth):
        # Context manager: the handle is closed even if reading fails.
        with codecs.open(d_pth, 'r', 'utf8') as f:
            urls = [u.strip() for u in f.readlines()]

        memo = {url: True for url in urls}
        # Urls already seen on a previous run are not articles: drop them.
        cur_articles = {url: article
                        for url, article in cur_articles.items()
                        if url not in memo}
        valid_urls = list(memo) + list(cur_articles)
    else:
        # Our first run with memoization, save every url as valid
        valid_urls = list(cur_articles)

    memo_text = '\r\n'.join(href.strip() for href in valid_urls)

    if len(memo) > config.MAX_FILE_MEMO:
        # We still keep current batch of articles though!
        log.critical('memo overflow, dumping')
        memo_text = ''

    # TODO if source: source.write_upload_times(prev_length, new_length)
    with codecs.open(d_pth, 'w', 'utf-8') as ff:
        ff.write(memo_text)
    return list(cur_articles.values())
|
|
|
|
|
|
def get_useragent():
    """Return one randomly picked line, stripped of surrounding
    whitespace, from the file at ``settings.USERAGENTS``
    """
    with open(settings.USERAGENTS, 'r') as handle:
        lines = handle.readlines()
        last_index = len(lines) - 1
        picked = lines[random.randint(0, last_index)]
        return picked.strip()
|
|
|
|
|
|
def get_available_languages():
    """Return the language codes found in ``settings.STOPWORDS_DIR``

    For each file name the code is the text between the first '-' and
    the following '.'; every code is asserted to be 2 characters long.
    """
    stopword_dir = os.path.join(settings.STOPWORDS_DIR)
    codes = []
    for name in os.listdir(stopword_dir):
        after_dash = name.split('-')[1]
        codes.append(after_dash.split('.')[0])
    assert all(len(code) == 2 for code in codes)
    return codes
|
|
|
|
|
|
def print_available_languages():
    """Print a two-column table (input code, full name) with one row per
    code returned by ``get_available_languages``, followed by a blank line
    """
    full_names = {
        'ar': 'Arabic',
        'ru': 'Russian',
        'nl': 'Dutch',
        'de': 'German',
        'en': 'English',
        'es': 'Spanish',
        'fr': 'French',
        'he': 'Hebrew',
        'it': 'Italian',
        'ko': 'Korean',
        'no': 'Norwegian',
        'nb': 'Norwegian (Bokmål)',
        'fa': 'Persian',
        'pl': 'Polish',
        'pt': 'Portuguese',
        'sv': 'Swedish',
        'hu': 'Hungarian',
        'fi': 'Finnish',
        'da': 'Danish',
        'zh': 'Chinese',
        'id': 'Indonesian',
        'vi': 'Vietnamese',
        'mk': 'Macedonian',
        'tr': 'Turkish',
        'el': 'Greek',
        'uk': 'Ukrainian',
        'hi': 'Hindi',
        'sw': 'Swahili',
        'bg': 'Bulgarian',
        'hr': 'Croatian',
        'ro': 'Romanian',
        'sl': 'Slovenian',
        'sr': 'Serbian',
        'et': 'Estonian',
        'ja': 'Japanese',
        'be': 'Belarusian',
    }

    # Fetch the codes before printing anything, so a failure in the lookup
    # leaves no partial table behind.
    available = get_available_languages()
    print('\nYour available languages are:')
    print('\ninput code\t\tfull name')
    for lang_code in available:
        row = (lang_code, full_names[lang_code])
        print(' %s\t\t\t %s' % row)
    print()
|
|
|
|
|
|
def extend_config(config, config_items):
    """
    Copy onto ``config`` every entry of ``config_items`` whose key is
    already an attribute of ``config``; unknown keys are ignored.
    The same ``config`` object is returned.
    """
    # Iterate over a snapshot of the items, as the mapping itself is not
    # walked live.
    for name, value in tuple(config_items.items()):
        if not hasattr(config, name):
            continue
        setattr(config, name, value)

    return config
|