# -*- coding: utf-8 -*-
"""
The following image extraction implementation was taken from an old
copy of Reddit's source code.
"""
__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'

import logging
import math
import io
import traceback
import urllib.parse

import requests
from PIL import Image, ImageFile

from . import urls

log = logging.getLogger(__name__)

# Number of bytes read from the response per parser feed.
chunk_size = 1024
# Maximum (width, height) of the thumbnail produced by ``prepare_image``.
thumbnail_size = 90, 90
# Images whose pixel area is below this are ignored by ``Scraper``.
minimal_area = 5000


def image_to_str(image):
    """Serialize a PIL image to bytes using the image's own format.

    Image data is binary, so an in-memory *bytes* buffer is required;
    a text buffer would reject the bytes that ``Image.save`` writes.
    """
    buf = io.BytesIO()
    image.save(buf, image.format)
    return buf.getvalue()


def str_to_image(s):
    """Open a PIL image from the raw bytes ``s``.

    This is the inverse of ``image_to_str`` and accepts the ``content``
    bytes returned by ``fetch_url``.
    """
    return Image.open(io.BytesIO(s))


def prepare_image(image):
    """Square off ``image`` and shrink it in place to ``thumbnail_size``.

    Returns the squared, thumbnailed image.
    """
    image = square_image(image)
    # ``Image.ANTIALIAS`` was removed in Pillow 10; ``LANCZOS`` is the same
    # filter under its current name. Fall back for very old PIL versions.
    resample = getattr(Image, 'LANCZOS', None)
    if resample is None:
        resample = Image.ANTIALIAS
    image.thumbnail(thumbnail_size, resample)
    return image


def image_entropy(img):
    """Calculate the entropy of an image from its histogram.
    """
    hist = img.histogram()
    hist_size = sum(hist)
    probabilities = (float(h) / hist_size for h in hist)
    # Empty bins contribute nothing (and log(0) is undefined), so skip them.
    return -sum(p * math.log(p, 2) for p in probabilities if p != 0)


def square_image(img):
    """If the image is taller than it is wide, square it off.
    Determine which pieces to cut off based on the entropy of the pieces.

    Images that are already square or wider than tall are returned
    unchanged.
    """
    x, y = img.size
    while y > x:
        # Slice 10px at a time until square
        slice_height = min(y - x, 10)

        bottom = img.crop((0, y - slice_height, x, y))
        top = img.crop((0, 0, x, slice_height))

        # Remove the slice with the least entropy
        if image_entropy(bottom) < image_entropy(top):
            img = img.crop((0, 0, x, y - slice_height))
        else:
            img = img.crop((0, slice_height, x, y))

        x, y = img.size

    return img


def clean_url(url):
    """Url quotes unicode data out of urls.

    Every character with a code point >= 127 is percent-encoded; all
    other characters are left untouched.
    """
    return ''.join(
        urllib.parse.quote(c) if ord(c) >= 127 else c for c in url)


def fetch_url(url, useragent, referer=None, retries=1, dimension=False):
    """Download ``url`` and return either its content or its image size.

    :param url: address to fetch; must be http:// or https://.
    :param useragent: value sent as the ``User-Agent`` header.
    :param referer: value sent as the ``Referer`` header.
    :param retries: number of attempts made before giving up when
        ``requests`` raises a ``RequestException``.
    :param dimension: when true, return only the image's ``(width, height)``
        and read no more of the body than the parser needs.
    :returns: with ``dimension`` false, ``(content_type, content)`` or
        ``(None, None)`` on failure; with ``dimension`` true, the image
        size tuple or ``None`` on failure.
    """
    cur_try = 0
    nothing = None if dimension else (None, None)
    url = clean_url(url)
    if not url.startswith(('http://', 'https://')):
        return nothing

    while True:
        # Reset per attempt so ``finally`` never touches a response that
        # belongs to an earlier try.
        response = None
        try:
            response = requests.get(url, stream=True, timeout=5, headers={
                'User-Agent': useragent,
                'Referer': referer,
            })

            # if we only need the dimension of the image, we may not
            # need to download the entire thing
            if dimension:
                content = response.raw.read(chunk_size)
            else:
                content = response.raw.read()

            content_type = response.headers.get('Content-Type')
            if not content_type:
                return nothing

            if 'image' in content_type:
                p = ImageFile.Parser()
                new_data = content
                # Feed the parser chunk by chunk until it has decoded
                # enough to expose ``p.image`` or the body is exhausted.
                while not p.image and new_data:
                    try:
                        p.feed(new_data)
                    except (IOError, ValueError):
                        traceback.print_exc()
                        p = None
                        break
                    except Exception:
                        # For some favicon.ico images, the image is so small
                        # that our PIL feed() method fails a length test.
                        if urls.url_to_filetype(url) != 'ico':
                            raise
                        p = None
                        break
                    new_data = response.raw.read(chunk_size)
                    content += new_data

                if p is None:
                    return nothing
                # return the size, or return the data
                if dimension and p.image:
                    return p.image.size
                elif dimension:
                    return nothing
            elif dimension:
                # expected an image, but didn't get one
                return nothing

            return content_type, content

        except requests.exceptions.RequestException:
            cur_try += 1
            if cur_try >= retries:
                log.debug('error while fetching: %s refer: %s', url, referer)
                return nothing
        finally:
            if response is not None:
                response.raw.close()
                # ``_connection`` is a private attribute of the raw
                # response; tolerate it being absent.
                connection = getattr(response.raw, '_connection', None)
                if connection:
                    connection.close()


def fetch_image_dimension(url, useragent, referer=None, retries=1):
    """Return the ``(width, height)`` of the image at ``url``, or ``None``.
    """
    return fetch_url(url, useragent, referer, retries, dimension=True)


class Scraper:
    """Chooses the best image for an article and builds its thumbnail."""

    def __init__(self, article):
        self.url = article.url
        self.imgs = article.imgs
        self.top_img = article.top_img
        self.config = article.config
        self.useragent = self.config.browser_user_agent

    def largest_image_url(self):
        """Return ``top_img`` if set, else the url of the largest image.

        Returns ``None`` when the article has no images, or when no
        candidate has a positive area according to ``calculate_area``.
        """
        # TODO: remove. It is not the responsibility of Scraper
        if not self.imgs and not self.top_img:
            return None
        if self.top_img:
            return self.top_img

        max_area = 0
        max_url = None
        for img_url in self.imgs:
            dimension = fetch_image_dimension(
                img_url, self.useragent, referer=self.url)
            area = self.calculate_area(img_url, dimension)
            if area > max_area:
                max_area = area
                max_url = img_url

        log.debug('using max img {}'.format(max_url))
        return max_url

    def calculate_area(self, img_url, dimension):
        """Score an image by pixel area, returning 0 for rejected images.

        :param img_url: url of the image, used for logging and for the
            sprite/logo penalty.
        :param dimension: ``(width, height)`` or a falsy value when the
            size could not be determined.
        """
        if not dimension:
            return 0

        area = dimension[0] * dimension[1]

        # Ignore tiny images
        if area < minimal_area:
            log.debug('ignore little %s' % img_url)
            return 0

        # PIL won't scale up, so set a min width and
        # maintain the aspect ratio
        if dimension[0] < thumbnail_size[0]:
            return 0

        # Ignore excessively long/wide images. Neither side can be zero
        # here because a zero side gives a zero area, rejected above.
        current_ratio = max(dimension) / min(dimension)
        if current_ratio > self.config.image_dimension_ration:
            log.debug('ignore dims %s' % img_url)
            return 0

        # Penalize images with "sprite" or "logo" in their name
        lower_case_url = img_url.lower()
        if 'sprite' in lower_case_url or 'logo' in lower_case_url:
            log.debug('penalizing sprite %s' % img_url)
            area /= 10

        return area

    def satisfies_requirements(self, img_url):
        """Return whether the image's scored area exceeds ``minimal_area``.
        """
        dimension = fetch_image_dimension(
            img_url, self.useragent, referer=self.url)
        area = self.calculate_area(img_url, dimension)
        return area > minimal_area

    def thumbnail(self):
        """Identifies top image, trims out a thumbnail and also has a url.

        Returns ``(image, image_url)`` on success and ``(None, None)`` when
        there is no usable image. For an image PIL reports as interlaced
        a bare ``None`` is returned instead.
        """
        image_url = self.largest_image_url()
        if image_url:
            # ``useragent`` is a required positional argument of fetch_url.
            content_type, image_str = fetch_url(
                image_url, self.useragent, referer=self.url)
            if image_str:
                image = str_to_image(image_str)
                try:
                    image = prepare_image(image)
                except IOError as e:
                    # Exceptions have no ``.message`` on Python 3.
                    if 'interlaced' in str(e):
                        return None
                    # Any other IOError: fall through and return the
                    # image without the thumbnail preparation applied.
                return image, image_url
        return None, None