Files
MoFin/venv/lib/python3.12/site-packages/newspaper/images.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

243 lines
7.4 KiB
Python

# -*- coding: utf-8 -*-
"""
The following image extraction implementation was taken from an old
copy of Reddit's source code.
"""
__title__ = 'newspaper'
__author__ = 'Lucas Ou-Yang'
__license__ = 'MIT'
__copyright__ = 'Copyright 2014, Lucas Ou-Yang'
import logging
import math
import io
import traceback
import urllib.parse
import requests
from PIL import Image, ImageFile
from . import urls
# Module-level logger named after this module.
log = logging.getLogger(__name__)

# Number of bytes pulled from the raw response stream per read when
# only part of the payload is needed.
chunk_size = 1024

# (width, height) bound passed to PIL when generating thumbnails; the
# width is also the minimum accepted width of a candidate image.
thumbnail_size = (90, 90)

# Candidate images whose pixel area is below this value are ignored.
minimal_area = 5000
def image_to_str(image):
    """Serialize ``image`` to its encoded byte representation.

    The image is saved into an in-memory buffer using its own
    ``image.format`` and the buffer contents are returned.

    A binary buffer is required here: encoders write ``bytes``, which a
    text buffer (``io.StringIO``) rejects with ``TypeError`` on Python 3.

    :param image: object exposing ``save(fp, format)`` and ``format``
                  (e.g. a PIL image).
    :return: the encoded image data as ``bytes``.
    """
    buffer = io.BytesIO()
    image.save(buffer, image.format)
    return buffer.getvalue()
def str_to_image(s):
    """Open encoded image data held in memory as a PIL image.

    The data is wrapped in a binary buffer because ``io.StringIO`` does
    not accept ``bytes`` on Python 3 and PIL needs a binary file object.

    :param s: encoded image data as ``bytes`` (for example the content
              returned by ``fetch_url``).
    :return: the object returned by ``Image.open`` for that data.
    """
    buffer = io.BytesIO(s)
    return Image.open(buffer)
def prepare_image(image):
    """Square off ``image`` and shrink it to ``thumbnail_size``.

    The image is first passed through ``square_image`` and then reduced
    in place with ``Image.thumbnail`` using the Lanczos filter.

    :param image: a PIL image.
    :return: the squared, thumbnailed image.
    """
    image = square_image(image)
    # ``Image.ANTIALIAS`` was an alias of the Lanczos filter and was removed
    # in Pillow 10; prefer the ``Resampling`` enum and only fall back to the
    # legacy constant on older Pillow releases that lack the enum.
    try:
        resample = Image.Resampling.LANCZOS
    except AttributeError:
        resample = Image.ANTIALIAS
    image.thumbnail(thumbnail_size, resample)
    return image
def image_entropy(img):
    """Return the Shannon entropy, in bits, of an image's histogram.

    Each histogram bucket is converted to a probability by dividing it
    by the histogram total; empty buckets contribute nothing.
    """
    counts = img.histogram()
    total = sum(counts)
    probabilities = (float(count) / total for count in counts)
    return -sum(p * math.log(p, 2) for p in probabilities if p != 0)
def square_image(img):
    """Trim a portrait image until it is no taller than it is wide.

    Strips of at most 10px are compared at the top and bottom edges and
    the strip with the lower entropy is discarded, repeating until the
    height no longer exceeds the width. Images that are already square
    or wider than tall are returned untouched.
    """
    width, height = img.size
    while height > width:
        # Never remove more than what is left to reach a square.
        strip = min(height - width, 10)
        bottom_strip = img.crop((0, height - strip, width, height))
        top_strip = img.crop((0, 0, width, strip))

        # Keep the edge carrying more information: drop the bottom strip
        # only when it is strictly less "busy" than the top one.
        bottom_entropy = image_entropy(bottom_strip)
        top_entropy = image_entropy(top_strip)
        if bottom_entropy < top_entropy:
            keep_box = (0, 0, width, height - strip)
        else:
            keep_box = (0, strip, width, height)

        img = img.crop(keep_box)
        width, height = img.size
    return img
def clean_url(url):
    """Percent-encode every character of ``url`` whose code point is 127
    or above, leaving all other characters untouched.
    """
    # Round-trip through UTF-8 exactly as before so that text which cannot
    # be encoded still fails at this point.
    text = url.encode('utf8').decode('utf-8')
    pieces = []
    for char in text:
        if ord(char) < 127:
            pieces.append(char)
        else:
            pieces.append(urllib.parse.quote(char))
    return ''.join(pieces)
def fetch_url(url, useragent, referer=None, retries=1, dimension=False):
    """Download ``url`` and return either its content or its image size.

    :param url: address to fetch; it is passed through ``clean_url`` and
                must then start with ``http://`` or ``https://``.
    :param useragent: value sent as the ``User-Agent`` request header.
    :param referer: value sent as the ``Referer`` request header.
    :param retries: number of attempts made before giving up when
                    ``requests`` raises a ``RequestException``.
    :param dimension: when true, only the image size is wanted, so the
                      body is read in ``chunk_size`` pieces and only as
                      much as the parser needs is downloaded.
    :return: with ``dimension=True`` the parsed ``image.size``, or
             ``None`` on any failure; with ``dimension=False`` a
             ``(content_type, content)`` tuple, or ``(None, None)`` on
             failure.
    :raises Exception: any error from the image parser other than
                       ``IOError``/``ValueError`` is re-raised unless
                       ``urls.url_to_filetype(url)`` reports ``'ico'``.
    """
    cur_try = 0
    # The failure value mirrors the shape of the success value for the
    # requested mode, so callers can unpack it the same way.
    nothing = None if dimension else (None, None)
    url = clean_url(url)
    if not url.startswith(('http://', 'https://')):
        return nothing

    response = None
    while True:
        try:
            response = requests.get(url, stream=True, timeout=5, headers={
                'User-Agent': useragent,
                'Referer': referer,
            })

            # if we only need the dimension of the image, we may not
            # need to download the entire thing
            if dimension:
                content = response.raw.read(chunk_size)
            else:
                content = response.raw.read()

            content_type = response.headers.get('Content-Type')
            if not content_type:
                return nothing

            if 'image' in content_type:
                # Feed the parser what has been read so far, then keep
                # pulling chunks until it has produced an image or the
                # stream yields no more data.
                p = ImageFile.Parser()
                new_data = content
                while not p.image and new_data:
                    try:
                        p.feed(new_data)
                    except IOError:
                        traceback.print_exc()
                        p = None
                        break
                    except ValueError:
                        traceback.print_exc()
                        p = None
                        break
                    except Exception as e:
                        # For some favicon.ico images, the image is so small
                        # that our PIL feed() method fails a length test.
                        is_favicon = (urls.url_to_filetype(url) == 'ico')
                        if is_favicon:
                            pass
                        else:
                            raise e
                        p = None
                        break
                    new_data = response.raw.read(chunk_size)
                    content += new_data

                # ``p`` is reset to None whenever feeding the parser failed.
                if p is None:
                    return nothing
                # return the size, or return the data
                if dimension and p.image:
                    return p.image.size
                elif dimension:
                    return nothing
            elif dimension:
                # expected an image, but didn't get one
                return nothing

            return content_type, content

        except requests.exceptions.RequestException as e:
            # Count the failed attempt; the loop tries again until the
            # attempt budget is used up.
            cur_try += 1
            if cur_try >= retries:
                log.debug('error while fetching: %s refer: %s' %
                          (url, referer))
                return nothing
        finally:
            # Runs on every exit path (return, retry or raised error):
            # close the raw stream and its underlying connection.
            if response is not None:
                response.raw.close()
                if response.raw._connection:
                    response.raw._connection.close()
def fetch_image_dimension(url, useragent, referer=None, retries=1):
    """Return the result of ``fetch_url`` run in dimension-only mode for
    ``url``, forwarding the user agent, referer and retry count.
    """
    return fetch_url(
        url,
        useragent,
        referer=referer,
        retries=retries,
        dimension=True,
    )
class Scraper:
    """Select the most suitable image of an article and thumbnail it.

    The scraper copies the URL, candidate image URLs, preselected top
    image and configuration from the given article object, and uses the
    configured browser user agent for every image request it makes.
    """

    def __init__(self, article):
        """Capture the fields of ``article`` needed for image selection.

        :param article: object exposing ``url``, ``imgs``, ``top_img``
                        and ``config`` (with ``browser_user_agent``).
        """
        self.url = article.url
        self.imgs = article.imgs
        self.top_img = article.top_img
        self.config = article.config
        self.useragent = self.config.browser_user_agent

    def largest_image_url(self):
        """Return the URL of the best image candidate, or ``None``.

        A preselected ``top_img`` always wins. Otherwise the dimensions
        of every URL in ``imgs`` are fetched and the one with the
        largest area as scored by ``calculate_area`` is returned.
        """
        # TODO: remove. it is not responsibility of Scrapper
        if not self.imgs and not self.top_img:
            return None
        if self.top_img:
            return self.top_img

        max_area = 0
        max_url = None
        for img_url in self.imgs:
            dimension = fetch_image_dimension(
                img_url, self.useragent, referer=self.url)
            area = self.calculate_area(img_url, dimension)
            if area > max_area:
                max_area = area
                max_url = img_url

        log.debug('using max img %s', max_url)
        return max_url

    def calculate_area(self, img_url, dimension):
        """Score an image by its pixel area, or return 0 if it is unusable.

        :param img_url: URL of the image, used for logging and for the
                        name-based penalty.
        :param dimension: ``(width, height)`` of the image, or a falsy
                          value when the size could not be determined.
        :return: 0 for rejected images, otherwise the area, divided by
                 10 when the URL contains ``sprite`` or ``logo``.
        """
        if not dimension:
            return 0

        area = dimension[0] * dimension[1]

        # Ignore tiny images
        if area < minimal_area:
            log.debug('ignore little %s' % img_url)
            return 0

        # PIL won't scale up, so set a min width and
        # maintain the aspect ratio
        if dimension[0] < thumbnail_size[0]:
            return 0

        # Ignore excessively long/wide images
        current_ratio = max(dimension) / min(dimension)
        if current_ratio > self.config.image_dimension_ration:
            log.debug('ignore dims %s' % img_url)
            return 0

        # Penalize images with "sprite" or "logo" in their name
        lower_case_url = img_url.lower()
        if 'sprite' in lower_case_url or 'logo' in lower_case_url:
            log.debug('penalizing sprite %s' % img_url)
            area /= 10

        return area

    def satisfies_requirements(self, img_url):
        """Return whether the scored area of ``img_url`` exceeds
        ``minimal_area``.
        """
        dimension = fetch_image_dimension(
            img_url, self.useragent, referer=self.url)
        area = self.calculate_area(img_url, dimension)
        return area > minimal_area

    def thumbnail(self):
        """Identify the top image and build a thumbnail of it.

        :return: ``(image, image_url)`` on success, or ``(None, None)``
                 when there is no candidate image, it could not be
                 downloaded, or it is an interlaced image that cannot be
                 thumbnailed.
        """
        image_url = self.largest_image_url()
        if not image_url:
            return None, None

        # ``useragent`` is a required positional parameter of fetch_url;
        # omitting it made every call here fail with a TypeError.
        _content_type, image_str = fetch_url(
            image_url, self.useragent, referer=self.url)
        if not image_str:
            return None, None

        image = str_to_image(image_str)
        try:
            image = prepare_image(image)
        except IOError as e:
            # Python 3 exceptions have no ``.message``; use the string
            # form. Return the same two-tuple as every other failure
            # path so callers can always unpack the result.
            if 'interlaced' in str(e):
                return None, None
            # Any other IOError is tolerated on purpose: the image is
            # returned as opened, without being squared or shrunk.
        return image, image_url