From c9311ad31bf1efcabdadc553421dd7a27f18299d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=A5=E5=BE=AE?= Date: Sun, 21 Jun 2026 00:05:17 +0800 Subject: [PATCH] =?UTF-8?q?xiaoguo=5Fnews=5Fprocessor:=20=E5=B8=A6?= =?UTF-8?q?=E5=85=A8=E6=96=87=E5=88=86=E6=9E=905=E7=AF=8712=E7=A7=92?= =?UTF-8?q?=EF=BC=8C=E6=91=98=E8=A6=81=E4=B8=8D=E9=99=90=E5=AD=97=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- xiaoguo_news_processor.py | 45 ++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/xiaoguo_news_processor.py b/xiaoguo_news_processor.py index 7c0942c..e0393c5 100644 --- a/xiaoguo_news_processor.py +++ b/xiaoguo_news_processor.py @@ -43,20 +43,25 @@ def get_conn(): def search_akshare_news(code, max_results=3): - """用 akshare 搜个股新闻""" - titles = [] + """用 akshare 搜个股新闻(含全文)""" + articles = [] if not HAS_AKSHARE: - return titles + return articles try: clean_proxy() df = ak.stock_news_em(symbol=code) for _, r in df.head(max_results).iterrows(): title = r.get('新闻标题', '') + content = r.get('新闻内容', '') if title and len(title) > 5: - titles.append({"title": title, "url": r.get('新闻链接', '')}) + articles.append({ + "title": title, + "content": content, + "url": r.get('新闻链接', '') + }) except: pass - return titles + return articles def extract_json(text): @@ -90,11 +95,17 @@ def call_xiaoguo(articles): """调小果LLM:给摘要+情感""" lines = [] for a in articles: - # 清理标题:去掉股票代码(6位数字+前后空格) title = re.sub(r'\b\d{6}\b', '', a['title']).strip() title = re.sub(r'\s+', ' ', title) - lines.append(f"{len(lines)+1}. {title}") - prompt = "\n".join(lines) + "\n\n逐篇给一句话摘要和情感(positive/negative/neutral)。JSON数组。" + content = (a.get('content') or '')[:200] + # 给正文加标点分隔(akshare正文无标点,模型推理会卡) + if content and not any(c in content for c in '。,!?;'): + content = '。'.join([content[i:i+20] for i in range(0, len(content), 20)]) + if content: + lines.append(f"{len(lines)+1}. {title}\n {content}") + else: + lines.append(f"{len(lines)+1}. {title}") + prompt = "\n".join(lines) + "\n\n逐篇分析:给摘要(概括核心内容)和情感(positive/negative/neutral)。JSON数组。" payload = json.dumps({ "model": XIAOGUO_MODEL, @@ -174,7 +185,16 @@ def main(): conn.close() return - batch = all_articles[:MAX_ARTICLES] + # 只取前5篇,跳过含有表格数据的脏内容 + filtered = [] + for a in all_articles: + c = a.get('content', '') or '' + if any(kw in c for kw in ['主力资金', '资金净流入', '代码', '简称']): + continue + filtered.append(a) + if len(filtered) >= MAX_ARTICLES: + break + batch = filtered[:MAX_ARTICLES] print(f" 共{len(all_articles)}篇,送小果分析{len(batch)}篇", flush=True) results = call_xiaoguo(batch) @@ -183,13 +203,12 @@ def main(): conn.close() return - # 合并结果(用索引位置匹配,不依赖标题文本) + # 合并结果(用索引位置匹配) for i, r in enumerate(results): if i < len(batch): - batch[i]["sentiment"] = translate_sentiment(r.get("sentiment", "")) - batch[i]["summary"] = r.get("summary", "") + batch[i]["sentiment"] = translate_sentiment(r.get("sentiment", r.get("情感", ""))) + batch[i]["summary"] = r.get("summary", r.get("摘要", "")) else: - # 超过批次的额外结果 break # 汇总情感