From c9311ad31bf1efcabdadc553421dd7a27f18299d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=A5=E5=BE=AE?= <zhiwei@mofin.local>
Date: Sun, 21 Jun 2026 00:05:17 +0800
Subject: [PATCH] =?UTF-8?q?xiaoguo=5Fnews=5Fprocessor:=20=E5=B8=A6?=
 =?UTF-8?q?=E5=85=A8=E6=96=87=E5=88=86=E6=9E=905=E7=AF=8712=E7=A7=92?=
 =?UTF-8?q?=EF=BC=8C=E6=91=98=E8=A6=81=E4=B8=8D=E9=99=90=E5=AD=97=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 xiaoguo_news_processor.py | 45 ++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 13 deletions(-)

diff --git a/xiaoguo_news_processor.py b/xiaoguo_news_processor.py
index 7c0942c..e0393c5 100644
--- a/xiaoguo_news_processor.py
+++ b/xiaoguo_news_processor.py
@@ -43,20 +43,25 @@ def get_conn():
 
 
 def search_akshare_news(code, max_results=3):
-    """用 akshare 搜个股新闻"""
-    titles = []
+    """用 akshare 搜个股新闻（含全文）"""
+    articles = []
     if not HAS_AKSHARE:
-        return titles
+        return articles
     try:
         clean_proxy()
         df = ak.stock_news_em(symbol=code)
         for _, r in df.head(max_results).iterrows():
             title = r.get('新闻标题', '')
+            content = r.get('新闻内容', '')
             if title and len(title) > 5:
-                titles.append({"title": title, "url": r.get('新闻链接', '')})
+                articles.append({
+                    "title": title,
+                    "content": content,
+                    "url": r.get('新闻链接', '')
+                })
     except:
         pass
-    return titles
+    return articles
 
 
 def extract_json(text):
@@ -90,11 +95,17 @@ def call_xiaoguo(articles):
     """调小果LLM：给摘要+情感"""
     lines = []
     for a in articles:
-        # 清理标题：去掉股票代码（6位数字+前后空格）
         title = re.sub(r'\b\d{6}\b', '', a['title']).strip()
         title = re.sub(r'\s+', ' ', title)
-        lines.append(f"{len(lines)+1}. {title}")
-    prompt = "\n".join(lines) + "\n\n逐篇给一句话摘要和情感（positive/negative/neutral）。JSON数组。"
+        content = (a.get('content') or '')[:200]
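+        # Insert a "。" every 20 characters when the text contains none of 。，！？； (akshare bodies arrive unpunctuated, which stalls model inference)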
+        if content and not any(c in content for c in '。，！？；'):
+            content = '。'.join([content[i:i+20] for i in range(0, len(content), 20)])
+        if content:
+            lines.append(f"{len(lines)+1}. {title}\n   {content}")
+        else:
+            lines.append(f"{len(lines)+1}. {title}")
+    prompt = "\n".join(lines) + "\n\n逐篇分析：给摘要（概括核心内容）和情感（positive/negative/neutral）。JSON数组。"
 
     payload = json.dumps({
         "model": XIAOGUO_MODEL,
@@ -174,7 +185,16 @@ def main():
         conn.close()
         return
 
-    batch = all_articles[:MAX_ARTICLES]
+    # Keep at most MAX_ARTICLES articles, skipping dirty content that carries tabular data
+    filtered = []
+    for a in all_articles:
+        c = a.get('content', '') or ''
+        if any(kw in c for kw in ['主力资金', '资金净流入', '代码', '简称']):
+            continue
+        filtered.append(a)
+        if len(filtered) >= MAX_ARTICLES:
+            break
+    batch = filtered[:MAX_ARTICLES]
     print(f"  共{len(all_articles)}篇，送小果分析{len(batch)}篇", flush=True)
 
     results = call_xiaoguo(batch)
@@ -183,13 +203,12 @@ def main():
         conn.close()
         return
 
-    # 合并结果（用索引位置匹配，不依赖标题文本）
+    # 合并结果（用索引位置匹配）
     for i, r in enumerate(results):
         if i < len(batch):
-            batch[i]["sentiment"] = translate_sentiment(r.get("sentiment", ""))
-            batch[i]["summary"] = r.get("summary", "")
+            batch[i]["sentiment"] = translate_sentiment(r.get("sentiment", r.get("情感", "")))
+            batch[i]["summary"] = r.get("summary", r.get("摘要", ""))
         else:
-            # 超过批次的额外结果
             break
 
     # 汇总情感