# -*- coding: utf-8 -*- """ Holds misc. utility methods which prove to be useful throughout this library. """ __title__ = 'newspaper' __author__ = 'Lucas Ou-Yang' __license__ = 'MIT' __copyright__ = 'Copyright 2014, Lucas Ou-Yang' import codecs import hashlib import logging import os import pickle import random import re import string import sys import threading import time from hashlib import sha1 from bs4 import BeautifulSoup from . import settings log = logging.getLogger(__name__) log.setLevel(logging.DEBUG) class FileHelper(object): @staticmethod def loadResourceFile(filename): if not os.path.isabs(filename): dirpath = os.path.abspath(os.path.dirname(__file__)) path = os.path.join(dirpath, 'resources', filename) else: path = filename try: f = codecs.open(path, 'r', 'utf-8') content = f.read() f.close() return content except IOError: raise IOError("Couldn't open file %s" % path) class ParsingCandidate(object): def __init__(self, url, link_hash): self.url = url self.link_hash = link_hash class RawHelper(object): @staticmethod def get_parsing_candidate(url, raw_html): if isinstance(raw_html, str): raw_html = raw_html.encode('utf-8', 'replace') link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time()) return ParsingCandidate(url, link_hash) class URLHelper(object): @staticmethod def get_parsing_candidate(url_to_crawl): # Replace shebang in urls final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \ if '#!' in url_to_crawl else url_to_crawl link_hash = '%s.%s' % (hashlib.md5(final_url).hexdigest(), time.time()) return ParsingCandidate(final_url, link_hash) class StringSplitter(object): def __init__(self, pattern): self.pattern = re.compile(pattern) def split(self, string): if not string: return [] return self.pattern.split(string) class StringReplacement(object): def __init__(self, pattern, replaceWith): self.pattern = pattern self.replaceWith = replaceWith def replaceAll(self, string): if not string: return '' return string.replace(self.pattern, self.replaceWith) class ReplaceSequence(object): def __init__(self): self.replacements = [] def create(self, firstPattern, replaceWith=None): result = StringReplacement(firstPattern, replaceWith or '') self.replacements.append(result) return self def append(self, pattern, replaceWith=None): return self.create(pattern, replaceWith) def replaceAll(self, string): if not string: return '' mutatedString = string for rp in self.replacements: mutatedString = rp.replaceAll(mutatedString) return mutatedString class TimeoutError(Exception): pass def timelimit(timeout): """Borrowed from web.py, rip Aaron Swartz """ def _1(function): def _2(*args, **kw): class Dispatch(threading.Thread): def __init__(self): threading.Thread.__init__(self) self.result = None self.error = None self.setDaemon(True) self.start() def run(self): try: self.result = function(*args, **kw) except: self.error = sys.exc_info() c = Dispatch() c.join(timeout) if c.isAlive(): raise TimeoutError() if c.error: raise c.error[0](c.error[1]) return c.result return _2 return _1 def domain_to_filename(domain): """All '/' are turned into '-', no trailing. schema's are gone, only the raw domain + ".txt" remains """ filename = domain.replace('/', '-') if filename[-1] == '-': filename = filename[:-1] filename += ".txt" return filename def filename_to_domain(filename): """[:-4] for the .txt at end """ return filename.replace('-', '/')[:-4] def is_ascii(word): """True if a word is only ascii chars """ def onlyascii(char): if ord(char) > 127: return '' else: return char for c in word: if not onlyascii(c): return False return True def extract_meta_refresh(html): """ Parses html for a tag like: Example can be found at: https://www.google.com/url?rct=j&sa=t&url=http://sfbay.craigslist.org/eby/cto/ 5617800926.html&ct=ga&cd=CAAYATIaYTc4ZTgzYjAwOTAwY2M4Yjpjb206ZW46VVM&usg=AFQjCNF7zAl6JPuEsV4PbEzBomJTUpX4Lg """ soup = BeautifulSoup(html, 'html.parser') element = soup.find('meta', attrs={'http-equiv': 'refresh'}) if element: try: wait_part, url_part = element['content'].split(";") except ValueError: # In case there are not enough values to unpack # for instance: return None else: # Get rid of any " or ' inside the element # for instance: # if url_part.lower().startswith("url="): return url_part[4:].replace('"', '').replace("'", '') def to_valid_filename(s): """Converts arbitrary string (for us domain name) into a valid file name for caching """ valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits) return ''.join(c for c in s if c in valid_chars) def cache_disk(seconds=(86400 * 5), cache_folder="/tmp"): """Caching extracting category locations & rss feeds for 5 days """ def do_cache(function): def inner_function(*args, **kwargs): """Calculate a cache key based on the decorated method signature args[1] indicates the domain of the inputs, we hash on domain! """ key = sha1((str(args[1]) + str(kwargs)).encode('utf-8')).hexdigest() filepath = os.path.join(cache_folder, key) # verify that the cached object exists and is less than # X seconds old if os.path.exists(filepath): modified = os.path.getmtime(filepath) age_seconds = time.time() - modified if age_seconds < seconds: return pickle.load(open(filepath, "rb")) # call the decorated function... result = function(*args, **kwargs) # ... and save the cached object for next time pickle.dump(result, open(filepath, "wb")) return result return inner_function return do_cache def print_duration(method): """Prints out the runtime duration of a method in seconds """ def timed(*args, **kw): ts = time.time() result = method(*args, **kw) te = time.time() print('%r %2.2f sec' % (method.__name__, te - ts)) return result return timed def chunks(l, n): """Yield n successive chunks from l """ newn = int(len(l) / n) for i in range(0, n - 1): yield l[i * newn:i * newn + newn] yield l[n * newn - newn:] def purge(fn, pattern): """Delete files in a dir matching pattern """ for f in os.listdir(fn): if re.search(pattern, f): os.remove(os.path.join(fn, f)) def clear_memo_cache(source): """Clears the memoization cache for this specific news domain """ d_pth = os.path.join(settings.MEMO_DIR, domain_to_filename(source.domain)) if os.path.exists(d_pth): os.remove(d_pth) else: print('memo file for', source.domain, 'has already been deleted!') def memoize_articles(source, articles): """When we parse the links in an page, on the 2nd run and later, check the links of previous runs. If they match, it means the link must not be an article, because article urls change as time passes. This method also uniquifies articles. """ source_domain = source.domain config = source.config if len(articles) == 0: return [] memo = {} cur_articles = {article.url: article for article in articles} d_pth = os.path.join(settings.MEMO_DIR, domain_to_filename(source_domain)) if os.path.exists(d_pth): f = codecs.open(d_pth, 'r', 'utf8') urls = f.readlines() f.close() urls = [u.strip() for u in urls] memo = {url: True for url in urls} # prev_length = len(memo) for url, article in list(cur_articles.items()): if memo.get(url): del cur_articles[url] valid_urls = list(memo.keys()) + list(cur_articles.keys()) memo_text = '\r\n'.join( [href.strip() for href in (valid_urls)]) # Our first run with memoization, save every url as valid else: memo_text = '\r\n'.join( [href.strip() for href in list(cur_articles.keys())]) # new_length = len(cur_articles) if len(memo) > config.MAX_FILE_MEMO: # We still keep current batch of articles though! log.critical('memo overflow, dumping') memo_text = '' # TODO if source: source.write_upload_times(prev_length, new_length) ff = codecs.open(d_pth, 'w', 'utf-8') ff.write(memo_text) ff.close() return list(cur_articles.values()) def get_useragent(): """Uses generator to return next useragent in saved file """ with open(settings.USERAGENTS, 'r') as f: agents = f.readlines() selection = random.randint(0, len(agents) - 1) agent = agents[selection] return agent.strip() def get_available_languages(): """Returns a list of available languages and their 2 char input codes """ stopword_files = os.listdir(os.path.join(settings.STOPWORDS_DIR)) two_dig_codes = [f.split('-')[1].split('.')[0] for f in stopword_files] for d in two_dig_codes: assert len(d) == 2 return two_dig_codes def print_available_languages(): """Prints available languages with their full names """ language_dict = { 'ar': 'Arabic', 'ru': 'Russian', 'nl': 'Dutch', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'he': 'Hebrew', 'it': 'Italian', 'ko': 'Korean', 'no': 'Norwegian', 'nb': 'Norwegian (Bokmål)', 'fa': 'Persian', 'pl': 'Polish', 'pt': 'Portuguese', 'sv': 'Swedish', 'hu': 'Hungarian', 'fi': 'Finnish', 'da': 'Danish', 'zh': 'Chinese', 'id': 'Indonesian', 'vi': 'Vietnamese', 'mk': 'Macedonian', 'tr': 'Turkish', 'el': 'Greek', 'uk': 'Ukrainian', 'hi': 'Hindi', 'sw': 'Swahili', 'bg': 'Bulgarian', 'hr': 'Croatian', 'ro': 'Romanian', 'sl': 'Slovenian', 'sr': 'Serbian', 'et': 'Estonian', 'ja': 'Japanese', 'be': 'Belarusian' } codes = get_available_languages() print('\nYour available languages are:') print('\ninput code\t\tfull name') for code in codes: print(' %s\t\t\t %s' % (code, language_dict[code])) print() def extend_config(config, config_items): """ We are handling config value setting like this for a cleaner api. Users just need to pass in a named param to this source and we can dynamically generate a config object for it. """ for key, val in list(config_items.items()): if hasattr(config, key): setattr(config, key, val) return config