fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
243 lines
7.4 KiB
Python
243 lines
7.4 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
The following image extraction implementation was taken from an old
|
|
copy of Reddit's source code.
|
|
"""
|
|
__title__ = 'newspaper'
|
|
__author__ = 'Lucas Ou-Yang'
|
|
__license__ = 'MIT'
|
|
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'
|
|
|
|
import logging
|
|
import math
|
|
import io
|
|
import traceback
|
|
import urllib.parse
|
|
|
|
import requests
|
|
from PIL import Image, ImageFile
|
|
|
|
from . import urls
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
chunk_size = 1024
|
|
thumbnail_size = 90, 90
|
|
minimal_area = 5000
|
|
|
|
|
|
def image_to_str(image):
|
|
s = io.StringIO()
|
|
image.save(s, image.format)
|
|
s.seek(0)
|
|
return s.read()
|
|
|
|
|
|
def str_to_image(s):
|
|
s = io.StringIO(s)
|
|
s.seek(0)
|
|
image = Image.open(s)
|
|
return image
|
|
|
|
|
|
def prepare_image(image):
|
|
image = square_image(image)
|
|
image.thumbnail(thumbnail_size, Image.ANTIALIAS)
|
|
return image
|
|
|
|
|
|
def image_entropy(img):
|
|
""" Calculate the entropy of an image
|
|
"""
|
|
hist = img.histogram()
|
|
hist_size = sum(hist)
|
|
hist = [float(h) / hist_size for h in hist]
|
|
return -sum([p * math.log(p, 2) for p in hist if p != 0])
|
|
|
|
|
|
def square_image(img):
|
|
"""If the image is taller than it is wide, square it off. determine
|
|
which pieces to cut off based on the entropy pieces
|
|
"""
|
|
x, y = img.size
|
|
while y > x:
|
|
# Slice 10px at a time until square
|
|
slice_height = min(y - x, 10)
|
|
bottom = img.crop((0, y - slice_height, x, y))
|
|
top = img.crop((0, 0, x, slice_height))
|
|
# remove the slice with the least entropy
|
|
if image_entropy(bottom) < image_entropy(top):
|
|
img = img.crop((0, 0, x, y - slice_height))
|
|
else:
|
|
img = img.crop((0, slice_height, x, y))
|
|
x, y = img.size
|
|
return img
|
|
|
|
|
|
def clean_url(url):
|
|
"""Url quotes unicode data out of urls
|
|
"""
|
|
url = url.encode('utf8')
|
|
url = ''.join([urllib.parse.quote(c)
|
|
if ord(c) >= 127 else c for c in url.decode('utf-8')])
|
|
return url
|
|
|
|
|
|
def fetch_url(url, useragent, referer=None, retries=1, dimension=False):
|
|
cur_try = 0
|
|
nothing = None if dimension else (None, None)
|
|
url = clean_url(url)
|
|
if not url.startswith(('http://', 'https://')):
|
|
return nothing
|
|
|
|
response = None
|
|
while True:
|
|
try:
|
|
response = requests.get(url, stream=True, timeout=5, headers={
|
|
'User-Agent': useragent,
|
|
'Referer': referer,
|
|
})
|
|
|
|
# if we only need the dimension of the image, we may not
|
|
# need to download the entire thing
|
|
if dimension:
|
|
content = response.raw.read(chunk_size)
|
|
else:
|
|
content = response.raw.read()
|
|
|
|
content_type = response.headers.get('Content-Type')
|
|
|
|
if not content_type:
|
|
return nothing
|
|
|
|
if 'image' in content_type:
|
|
p = ImageFile.Parser()
|
|
new_data = content
|
|
while not p.image and new_data:
|
|
try:
|
|
p.feed(new_data)
|
|
except IOError:
|
|
traceback.print_exc()
|
|
p = None
|
|
break
|
|
except ValueError:
|
|
traceback.print_exc()
|
|
p = None
|
|
break
|
|
except Exception as e:
|
|
# For some favicon.ico images, the image is so small
|
|
# that our PIL feed() method fails a length test.
|
|
is_favicon = (urls.url_to_filetype(url) == 'ico')
|
|
if is_favicon:
|
|
pass
|
|
else:
|
|
raise e
|
|
p = None
|
|
break
|
|
new_data = response.raw.read(chunk_size)
|
|
content += new_data
|
|
|
|
if p is None:
|
|
return nothing
|
|
# return the size, or return the data
|
|
if dimension and p.image:
|
|
return p.image.size
|
|
elif dimension:
|
|
return nothing
|
|
elif dimension:
|
|
# expected an image, but didn't get one
|
|
return nothing
|
|
|
|
return content_type, content
|
|
|
|
except requests.exceptions.RequestException as e:
|
|
cur_try += 1
|
|
if cur_try >= retries:
|
|
log.debug('error while fetching: %s refer: %s' %
|
|
(url, referer))
|
|
return nothing
|
|
finally:
|
|
if response is not None:
|
|
response.raw.close()
|
|
if response.raw._connection:
|
|
response.raw._connection.close()
|
|
|
|
|
|
def fetch_image_dimension(url, useragent, referer=None, retries=1):
|
|
return fetch_url(url, useragent, referer, retries, dimension=True)
|
|
|
|
|
|
class Scraper:
|
|
|
|
def __init__(self, article):
|
|
self.url = article.url
|
|
self.imgs = article.imgs
|
|
self.top_img = article.top_img
|
|
self.config = article.config
|
|
self.useragent = self.config.browser_user_agent
|
|
|
|
def largest_image_url(self):
|
|
# TODO: remove. it is not responsibility of Scrapper
|
|
if not self.imgs and not self.top_img:
|
|
return None
|
|
if self.top_img:
|
|
return self.top_img
|
|
|
|
max_area = 0
|
|
max_url = None
|
|
for img_url in self.imgs:
|
|
dimension = fetch_image_dimension(
|
|
img_url, self.useragent, referer=self.url)
|
|
area = self.calculate_area(img_url, dimension)
|
|
if area > max_area:
|
|
max_area = area
|
|
max_url = img_url
|
|
log.debug('using max img {}'.format(max_url))
|
|
return max_url
|
|
|
|
def calculate_area(self, img_url, dimension):
|
|
if not dimension:
|
|
return 0
|
|
area = dimension[0] * dimension[1]
|
|
# Ignore tiny images
|
|
if area < minimal_area:
|
|
log.debug('ignore little %s' % img_url)
|
|
return 0
|
|
# PIL won't scale up, so set a min width and
|
|
# maintain the aspect ratio
|
|
if dimension[0] < thumbnail_size[0]:
|
|
return 0
|
|
# Ignore excessively long/wide images
|
|
current_ratio = max(dimension) / min(dimension)
|
|
if current_ratio > self.config.image_dimension_ration:
|
|
log.debug('ignore dims %s' % img_url)
|
|
return 0
|
|
# Penalize images with "sprite" in their name
|
|
lower_case_url = img_url.lower()
|
|
if 'sprite' in lower_case_url or 'logo' in lower_case_url:
|
|
log.debug('penalizing sprite %s' % img_url)
|
|
area /= 10
|
|
return area
|
|
|
|
def satisfies_requirements(self, img_url):
|
|
dimension = fetch_image_dimension(
|
|
img_url, self.useragent, referer=self.url)
|
|
area = self.calculate_area(img_url, dimension)
|
|
return area > minimal_area
|
|
|
|
def thumbnail(self):
|
|
"""Identifies top image, trims out a thumbnail and also has a url
|
|
"""
|
|
image_url = self.largest_image_url()
|
|
if image_url:
|
|
content_type, image_str = fetch_url(image_url, referer=self.url)
|
|
if image_str:
|
|
image = str_to_image(image_str)
|
|
try:
|
|
image = prepare_image(image)
|
|
except IOError as e:
|
|
if 'interlaced' in e.message:
|
|
return None
|
|
return image, image_url
|
|
return None, None
|