fix: group chat detection + article URL handling (Xiaohongshu/WeChat/Zhihu)
- 群聊检测: 通过 protocol 的 group_id 字段识别群消息
- 引用消息: 处理 ref_msg 中的转发文章/引用内容
- 文章URL: 支持 mp.weixin.qq.com / xiaohongshu.com / xhslink.com / zhihu.com
- CDN图片: 占位函数,后续实现 AES 解密下载
- 图片OCR: 统一失败处理,不再发送两条消息
This commit is contained in:
@@ -257,13 +257,36 @@ def fetch_article(url: str) -> dict | None:
|
||||
|
||||
# ── Main Message Handler ──────────────────────────────────────
|
||||
|
||||
def build_message_display_id(msg: IncomingMessage, bot: WeixinBot) -> str:
|
||||
def detect_group(msg: IncomingMessage) -> str:
    """Return the group identifier of a group-chat message, or "" otherwise.

    Two fields of the raw protocol payload (``msg.raw``) are inspected, in
    this order:

    1. ``group_id`` -- if present and truthy, it is returned (as a string).
    2. ``session_id`` -- if it contains ``"@chatroom"``, the part before the
       first ``"#"`` is returned (the whole value when there is no ``"#"``).

    Args:
        msg: The incoming message; only its ``raw`` mapping is read.

    Returns:
        The group identifier, or an empty string for a private chat.
    """
    # Tolerate a missing payload so one malformed message cannot crash the
    # handler; it is simply treated as a private chat.
    raw = msg.raw or {}

    group_id = raw.get("group_id")
    if group_id:
        # Coerce so the declared ``str`` return type holds even if the
        # protocol delivers a numeric id.
        return str(group_id)

    # Fallback: only the "@chatroom" marker is checked here.
    session_id = str(raw.get("session_id") or "")
    if "@chatroom" in session_id:
        # split() returns the whole string when no "#" is present, so no
        # separate membership test is needed.
        return session_id.split("#", 1)[0]

    return ""
|
||||
|
||||
|
||||
def extract_article_url(text: str) -> str | None:
|
||||
"""Extract article URL from text. Supports WeChat public accounts, Xiaohongshu, and common share links."""
|
||||
patterns = [
|
||||
r"https?://mp\.weixin\.qq\.com[^\"'\s<)<>\[\]]+",
|
||||
r"https?://(?:www\.)?xiaohongshu\.com[^\"'\s<)<>\[\]]+",
|
||||
r"https?://xhslink\.com[^\"'\s<)<>\[\]]+",
|
||||
r"https?://(?:www\.)?zhihu\.com[^\"'\s<)<>\[\]]+",
|
||||
]
|
||||
for p in patterns:
|
||||
m = re.search(p, text)
|
||||
if m:
|
||||
return m.group(0)
|
||||
return None
|
||||
|
||||
|
||||
async def handle_incoming(msg: IncomingMessage, bot: WeixinBot):
|
||||
@@ -274,15 +297,40 @@ async def handle_incoming(msg: IncomingMessage, bot: WeixinBot):
|
||||
content = msg.text or ""
|
||||
msg_type = msg.type
|
||||
|
||||
# ── text message ──
|
||||
# ── Group chat detection ──
|
||||
group_id = detect_group(msg)
|
||||
if group_id:
|
||||
log.info(f"GROUP message from {group_id}, sender={fu}")
|
||||
# Use the group_id as the conversation identifier (like Windows version uses roomid)
|
||||
conv_id = group_id
|
||||
prefix = "[Group]"
|
||||
else:
|
||||
conv_id = fu
|
||||
prefix = "[Private]"
|
||||
|
||||
# ── TEXT message ──
|
||||
if msg_type == "text":
|
||||
handler_input = content
|
||||
|
||||
# Detect forwarded articles
|
||||
if "mp.weixin.qq.com" in content:
|
||||
url_match = re.search(r"https?://mp\.weixin\.qq\.com[^\"'\s<)<>\[\]]+", content)
|
||||
if url_match:
|
||||
article = fetch_article(url_match.group(0))
|
||||
# Check for ref_msg (forwarded/quoted content like articles)
|
||||
ref_text = ""
|
||||
for item in msg.raw.get("item_list", []):
|
||||
ref_msg = item.get("ref_msg")
|
||||
if ref_msg:
|
||||
ref_title = ref_msg.get("title", "")
|
||||
ref_item = ref_msg.get("message_item", {})
|
||||
if isinstance(ref_item, dict) and ref_item.get("type") == 1:
|
||||
ref_text_data = ref_item.get("text_item", {}).get("text", "")
|
||||
if ref_text_data:
|
||||
ref_text = f"\n[引用消息] {ref_text_data[:500]}"
|
||||
if ref_title:
|
||||
handler_input = f"[老莫转发了一篇文章] 标题: {ref_title}\n\n{content}{ref_text}"
|
||||
|
||||
# Detect article URLs (WeChat public account, Xiaohongshu, etc.)
|
||||
article_url = extract_article_url(handler_input)
|
||||
if article_url:
|
||||
log.info(f"Article URL detected: {article_url[:80]}")
|
||||
article = fetch_article(article_url)
|
||||
if article:
|
||||
title = article.get("title", "")
|
||||
article_text = article.get("content", "")[:2000]
|
||||
@@ -292,63 +340,72 @@ async def handle_incoming(msg: IncomingMessage, bot: WeixinBot):
|
||||
+ (f"({images}张图片已OCR)\n" if images else "")
|
||||
+ f"\n{article_text}"
|
||||
)
|
||||
else:
|
||||
# Article processor failed, send original content
|
||||
log.warning("Article processor returned no content, sending raw text")
|
||||
|
||||
reply = call_hermes(fu, handler_input)
|
||||
reply = call_hermes(conv_id, handler_input)
|
||||
if reply and reply.strip():
|
||||
await process_reply(reply, fu, bot)
|
||||
await process_reply(reply, conv_id, bot)
|
||||
|
||||
# ── image message ──
|
||||
# ── IMAGE message ──
|
||||
elif msg_type == "image":
|
||||
log.info(f"Image from {fu}, attempting OCR...")
|
||||
log.info(f"Image from {conv_id}, attempting OCR...")
|
||||
|
||||
# Get image URL from the raw message
|
||||
img_url = None
|
||||
for item in msg.raw.get("item_list", []):
|
||||
if item.get("type") in (2,): # IMAGE
|
||||
img_item = item.get("image_item", {})
|
||||
# Try direct URL first, then CDN
|
||||
img_url = img_item.get("url", "")
|
||||
if not img_url:
|
||||
# CDN media - need decryption
|
||||
# CDN media - download via CDN API
|
||||
media = img_item.get("media", {})
|
||||
aes_key = media.get("aes_key", "")
|
||||
encrypt_query = media.get("encrypt_query_param", "")
|
||||
# For now, log the CDN info and continue
|
||||
log.info(f"Image has CDN media: aes_key={aes_key[:15]}...")
|
||||
img_url = None
|
||||
if media:
|
||||
log.info("Image has CDN media, attempting CDN download...")
|
||||
img_url = download_cdn_image(media)
|
||||
break
|
||||
|
||||
ocr_text = None
|
||||
if img_url:
|
||||
ocr_text = ocr_image_from_url(img_url)
|
||||
if not ocr_text:
|
||||
log.warning("OCR returned no text from image URL")
|
||||
else:
|
||||
log.info("No direct image URL, sending raw iLink image to Hermes for description")
|
||||
handler_input = "[老莫发送了一张图片]"
|
||||
reply = call_hermes(fu, handler_input)
|
||||
if reply and reply.strip():
|
||||
await bot.reply(msg, reply.strip())
|
||||
return
|
||||
log.info("No image URL available, cannot OCR")
|
||||
|
||||
if ocr_text:
|
||||
handler_input = f"[老莫发送了一张图片,OCR识别结果如下]\n{ocr_text}"
|
||||
else:
|
||||
handler_input = "[老莫发送了一张图片,但OCR识别失败,无法读取内容]"
|
||||
handler_input = "[老莫发送了一张图片,但无法识别图片内容]"
|
||||
|
||||
reply = call_hermes(fu, handler_input)
|
||||
reply = call_hermes(conv_id, handler_input)
|
||||
if reply and reply.strip():
|
||||
await bot.reply(msg, reply.strip())
|
||||
|
||||
# ── voice message ──
|
||||
# ── VOICE message ──
|
||||
elif msg_type == "voice":
|
||||
reply = call_hermes(fu, "[voice message]")
|
||||
reply = call_hermes(conv_id, "[voice message]")
|
||||
if reply and reply.strip():
|
||||
await bot.reply(msg, reply.strip())
|
||||
|
||||
# ── unknown type ──
|
||||
# ── Unknown type ──
|
||||
else:
|
||||
log.info(f"Unhandled message type: {msg_type}")
|
||||
|
||||
|
||||
def download_cdn_image(media: dict) -> str | None:
    """Placeholder for downloading an image from the WeChat CDN.

    The real download is not implemented yet. Per the original author's
    note, the media dict carries ``aes_key`` and ``encrypt_query_param`` and
    the payload needs AES-128-ECB decryption -- TODO confirm against the
    iLink media protocol when implementing.

    Args:
        media: The ``media`` mapping taken from an image item.

    Returns:
        Always None for now, which tells the caller that no image URL is
        available.
    """
    log.info("CDN media available but direct download not yet implemented")
    # Lazy %-formatting: the key list is only rendered when DEBUG is enabled.
    log.debug("CDN media keys: %s", list(media))
    return None
|
||||
|
||||
|
||||
async def process_reply(reply: str, fu: str, bot: WeixinBot):
|
||||
"""
|
||||
Process Hermes reply text, handling tags like [FILE], [IMG], [EMOJI].
|
||||
|
||||
+340
-967
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user