Files
MoFin/venv/lib/python3.12/site-packages/newspaper/videos/extractors.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

116 lines
3.7 KiB
Python

# -*- coding: utf-8 -*-
from .videos import Video
# HTML tag names that VideoExtractor.get_videos() collects as candidates.
VIDEOS_TAGS = [
    'iframe',
    'embed',
    'object',
    'video',
]
# Provider names looked for as substrings of a video's source URL.
VIDEO_PROVIDERS = [
    'youtube',
    'vimeo',
    'dailymotion',
    'kewego',
]
class VideoExtractor(object):
    """Extract the videos embedded under an article's top node.

    Candidate elements are the tags listed in ``VIDEOS_TAGS`` found under
    ``top_node``. Each candidate is handed to the matching
    ``get_<tag>_tag`` method, and only results whose ``provider`` is not
    ``None`` are kept.
    """

    def __init__(self, config, top_node):
        """Store the configuration and the node to scan.

        :param config: object exposing ``get_parser()``; the parser it
            returns is used for every DOM query made by this class.
        :param top_node: root node under which video tags are searched.
        """
        self.config = config
        self.parser = self.config.get_parser()
        self.top_node = top_node
        # Elements still waiting to be examined by get_videos().
        self.candidates = []
        # Video objects accepted by the most recent get_videos() run.
        self.movies = []

    def get_embed_code(self, node):
        """Return the markup of ``node`` collapsed onto a single line.

        Every line of the serialized node is stripped of surrounding
        whitespace before the lines are concatenated.
        """
        markup = self.parser.nodeToString(node)
        return "".join(line.strip() for line in markup.splitlines())

    def get_embed_type(self, node):
        """Return the tag name of ``node`` as reported by the parser."""
        return self.parser.getTag(node)

    def get_width(self, node):
        """Return the ``width`` attribute of ``node``."""
        return self.parser.getAttribute(node, 'width')

    def get_height(self, node):
        """Return the ``height`` attribute of ``node``."""
        return self.parser.getAttribute(node, 'height')

    def get_src(self, node):
        """Return the ``src`` attribute of ``node``."""
        return self.parser.getAttribute(node, 'src')

    def get_provider(self, src):
        """Return the first known provider whose name occurs in ``src``.

        The comparison ignores case because the host part of a URL is
        case-insensitive (``www.YouTube.com`` is still YouTube).

        :param src: source URL of the embed; may be ``None`` or empty.
        :returns: an entry of ``VIDEO_PROVIDERS``, or ``None`` when ``src``
            is empty or matches no provider.
        """
        if not src:
            return None
        lowered = src.lower()
        for provider in VIDEO_PROVIDERS:
            if provider.lower() in lowered:
                return provider
        return None

    def get_video(self, node):
        """Create a ``Video`` object describing the embed ``node``.

        Fills in the embed code, tag name, width, height, source and the
        provider derived from that source.
        """
        video = Video()
        video.embed_code = self.get_embed_code(node)
        video.embed_type = self.get_embed_type(node)
        video.width = self.get_width(node)
        video.height = self.get_height(node)
        video.src = self.get_src(node)
        video.provider = self.get_provider(video.src)
        return video

    def get_iframe_tag(self, node):
        """Build a video from an ``<iframe>`` tag."""
        return self.get_video(node)

    def get_video_tag(self, node):
        """Handle an HTML ``<video>`` tag.

        Nothing is read from ``node``; a freshly constructed ``Video`` is
        returned.
        """
        # NOTE(review): unless Video() defaults to a non-None provider,
        # get_videos() will discard this result -- confirm this is intended.
        return Video()

    def get_embed_tag(self, node):
        """Build a video from an ``<embed>`` tag.

        When the parent of the embed is an ``object`` element the work is
        delegated to :meth:`get_object_tag`; otherwise the embed itself is
        converted with :meth:`get_video`.
        """
        parent = self.parser.getParent(node)
        if parent is not None and self.parser.getTag(parent) == 'object':
            # NOTE(review): the embed node itself, not ``parent``, is
            # forwarded here although the intent appears to be to use the
            # enclosing object. Kept unchanged because passing the parent
            # could report the same object twice -- confirm desired result.
            return self.get_object_tag(node)
        return self.get_video(node)

    def get_object_tag(self, node):
        """Build a video from an ``<object>`` tag.

        The source is taken from the ``value`` attribute of the first
        ``param`` element the parser returns for ``name``/``movie``.

        :returns: a ``Video``, or ``None`` when no such ``param`` exists
            or its value matches no known provider.
        """
        # An object may wrap an embed that is also a pending candidate;
        # drop it from the list so it is not parsed a second time.
        child_embeds = self.parser.getElementsByTag(node, 'embed')
        if child_embeds and child_embeds[0] in self.candidates:
            self.candidates.remove(child_embeds[0])

        # Without a "movie" param there is no source to extract.
        src_nodes = self.parser.getElementsByTag(
            node, tag="param", attr="name", value="movie")
        if not src_nodes:
            return None
        src = self.parser.getAttribute(src_nodes[0], "value")

        provider = self.get_provider(src)
        if not provider:
            return None

        video = self.get_video(node)
        # The object tag has its source in the param, so override what
        # get_video() derived from the node's own attributes.
        video.provider = provider
        video.src = src
        return video

    def get_videos(self):
        """Collect the videos found under ``top_node``.

        Each call starts from an empty result, so calling the method more
        than once does not duplicate entries.

        :returns: a new list holding the accepted ``Video`` objects; the
            same objects are also kept in ``self.movies``.
        """
        self.candidates = self.parser.getElementsByTags(
            self.top_node, VIDEOS_TAGS)
        self.movies = []

        # get_object_tag() may remove entries from self.candidates, so walk
        # a snapshot and skip whatever has been removed in the meantime.
        for candidate in list(self.candidates):
            if candidate not in self.candidates:
                continue
            tag = self.parser.getTag(candidate)
            handler = getattr(self, "get_%s_tag" % tag, None)
            if handler is None:
                continue
            movie = handler(candidate)
            # Keep only embeds that belong to a known provider.
            if movie is not None and movie.provider is not None:
                self.movies.append(movie)
        return list(self.movies)