fix: decode HTML entities in wechat article URL from XML
WeChat XML uses &amp; to encode & in forwarded article URLs. Without html.unescape(), chksm and other query parameters were passed to WeChat servers still entity-encoded, causing a signature mismatch and a captcha block.
Ultraworked with Sisyphus
Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
@@ -636,12 +636,16 @@ def process_msg(raw_data):
|
|||||||
if not urls:
|
if not urls:
|
||||||
urls = re.findall(r'<shareUrlOpen>(https?://mp\.weixin\.qq\.com[^<]+)</shareUrlOpen>', ct)
|
urls = re.findall(r'<shareUrlOpen>(https?://mp\.weixin\.qq\.com[^<]+)</shareUrlOpen>', ct)
|
||||||
url = urls[0] if urls else None
|
url = urls[0] if urls else None
|
||||||
|
# Decode HTML entities (&amp; → &) — WeChat XML uses &amp; in URLs
|
||||||
|
if url:
|
||||||
|
import html as _html
|
||||||
|
url = _html.unescape(url)
|
||||||
# Extract title from XML
|
# Extract title from XML
|
||||||
titles = re.findall(r'<title>(.*?)</title>', ct)
|
titles = re.findall(r'<title>(.*?)</title>', ct)
|
||||||
title = titles[0] if titles else ""
|
title = _html.unescape(titles[0]) if titles else ""
|
||||||
# Extract description
|
# Extract description
|
||||||
descs = re.findall(r'<des>(.*?)</des>', ct)
|
descs = re.findall(r'<des>(.*?)</des>', ct)
|
||||||
desc = descs[0] if descs else ""
|
desc = _html.unescape(descs[0]) if descs else ""
|
||||||
|
|
||||||
if url:
|
if url:
|
||||||
log(f"ARTICLE URL: {url}")
|
log(f"ARTICLE URL: {url}")
|
||||||
|
|||||||
Reference in New Issue
Block a user