Files
MoFin/scripts/macro_context_collector.py
T

266 lines
10 KiB
Python

#!/usr/bin/env python3
"""
macro_context_collector.py — 宏观新闻采集器+实时红绿灯(no_agent)
采集来源:
1. akshare.stock_info_global_em() — 东方财富全球宏观新闻(200条实时)
核心功能:
A. 采集新闻→macro_raw_news(去重入库)
B. 实时红绿灯检测——用关键词规则快速识别HIGH/MEDIUM风险
- 不等LLM,采集完立刻判断
- 检测到风险→写入signal_news+macro_risk_state.json
- 盘中的监控cron(每15-25分)读到后立刻调整策略
红绿灯规则:
HIGH(单条就触发): 全球巨头+核心产业负面/美联储意外/指数暴跌/地缘冲突
MEDIUM(累计2条触发): 常规宏观事件/板块级/资金面
"""
import sys, json, hashlib, os, re
from datetime import datetime
from pathlib import Path
DATA_DIR = Path("/home/hmo/MoFin/data")
DB_PATH = DATA_DIR / "mofin.db"
STATE_PATH = DATA_DIR / "macro_risk_state.json"
# ── 红绿灯 关键词规则 ──
# HIGH: 任何一条匹配 → 立即 HIGH 预警
HIGH_PATTERNS = [
# 全球巨头+核心产业
r"苹果[^。]*(?:涨价|降价|推迟|取消|禁|制裁|调查|召回|大跌|暴跌)",
r"openai[^。]*(?:推迟|取消|风险|调查|起诉|倒闭|ipo)",
r"(?:英伟达|nvidia)[^。]*(?:跌|调查|制裁|推迟|禁令)",
r"台积电[^。]*(?:跌|推迟|取消|地震|火灾|禁)",
r"特斯拉.*(?:暴跌|召回|调查|破产|禁)",
# 美联储/央行意外
r"美联储.*(?:意外|紧急|缩表|风暴|警告|超预期|加息\s*50|降息\s*50|紧急\s*(?:会议|声明))",
r"美联储.*(?:利率|决议).*(?:超预期|意外|紧急)",
r"fed.*(?:emergency|unexpected|surprise|hike|cut)",
# 指数暴跌(需 ≥2% 跌幅或使用更强范围词)
r"指数[^。]*?(?:暴跌|熔断|闪崩|重挫)",
r"指数[^。]*?(?:跌幅|下跌)[^。]*?[2-9]%",
r"(?:暴跌|重挫|熔断).*[5-9]%",
r"熔断|闪崩",
# 地缘+贸易
r"关税.*(?:升级|新|报复|制裁)",
r"制裁.*(?:新|升级|全面)",
r"战争|开战|入侵|核(?:威胁|武器|弹头|试验|攻击|冲突|导弹|战争|潜艇|问题|危机|设施)|导弹[^。]*(?:发射|袭击|攻击)",
# 系统性能源
r"原油.*(?:跌破|暴跌|崩盘|断供)",
r"石油.*(?:禁运|制裁|断供)",
r"能源危机|粮食危机",
# 系统金融
r"银行.*(?:倒闭|挤兑|破产|接管|危机)",
r"金融危机(?:风险|爆发|蔓延|冲击|预警|警示|逼近|担忧|席卷|升级|恐慌|进入|出现|形成|即将|来袭|警报|当前|新一轮|全面|全球性)|债务危机|违约潮|系统性(?:风险|危机)",
# AI/科技板块重挫
r"半导体.*(?:暴跌|熔断|崩盘|跌幅)",
r"科技股.*(?:暴跌|熔断|崩盘|重挫)",
r"(?:费城半导体|sox)[^。]*?(?:跌|崩)",
]
# MEDIUM: 累计匹配2条以上 → MEDIUM 预警
MEDIUM_PATTERNS = [
r"加息|降息",
r"通胀|CPI|PPI",
r"汇率.*(?:大幅|波动|贬值|升值)",
r"外资.*(?:流出|撤离|减持)",
r"北向.*(?:流出|净卖出|大幅)",
r"季末|年末|半年末|资金回笼|流动性紧张",
r"解禁.*(?:大额|巨量|千亿)",
r"大跌|重挫|杀跌|恐慌",
r"评级.*(?:下调|负面|降级)",
r"预警|风险提示|谨慎",
r"期货.*(?:暴跌|跌停|熔断)",
r"黑天鹅|灰犀牛",
]
# ── 模式完整性校验(防 .pyc 缓存/版本回退) ──
# 直接检查关键特征字符串是否存在于模式中
# 原理: 旧版有 standalone |核| , 新版有 核(?:威胁
# 旧版有 银行.*倒闭|挤兑|破产 , 新版有 银行.*(?:倒闭|挤兑|破产
_PATTERN_CHECKS = {
8: ["暴跌|熔断|闪崩|重挫"], # index pattern must use strong crash words, not "跌幅"
9: ["[2-9]%"], # 指数+跌幅 requires ≥2%
14: ["核(?:威胁", "核威胁|武器|弹头"], # must NOT have standalone 核
18: ["倒闭|挤兑|破产"], # bank pattern must have crisis keywords
19: ["金融危机(?:风险", "危机|债务危机"], # must NOT have standalone 金融危机
}
_KNOWN_BAD_SIGS = {
# Known stale .pyc signature fragments that indicate wrong version
"指数.*跌幅": "旧版用 .* 跨句匹配且无 ≥2% 阈值",
"|核|": "旧版有独立单字核",
"英伟达|nvidia.*跌": "旧版 alternation 分组错误",
"导弹.*发射": "旧版只匹配发射不匹配袭击",
"|金融危机|": "旧版 standalone 金融危机匹配历史参照",
}
for idx, required_list in _PATTERN_CHECKS.items():
if not any(req in HIGH_PATTERNS[idx] for req in required_list):
print(f"[MACRO-安全] ⚠️ HIGH_PATTERNS[{idx}] 签名不匹配!")
print(f"[MACRO-安全] 当前: {HIGH_PATTERNS[idx][:100]}")
print(f"[MACRO-安全] 预期应包含: {required_list[0]}")
print(f"[MACRO-安全] 可能原因: .pyc 缓存过期 / 回退到旧版本")
# 额外扫描:检查是否有已知的旧版签名残留
_all_patterns_text = "\n".join(HIGH_PATTERNS)
for bad_sig, reason in _KNOWN_BAD_SIGS.items():
if bad_sig in _all_patterns_text:
print(f"[MACRO-安全] ⚠️ 检测到旧版模式签名 '{bad_sig}' ({reason})")
print(f"[MACRO-安全] .pyc 缓存可能未刷新,当前 HIGH_PATTERNS 可能仍为旧版本")
def ensure_tables(conn):
conn.execute("""
CREATE TABLE IF NOT EXISTS macro_raw_news (
id INTEGER PRIMARY KEY AUTOINCREMENT,
title TEXT NOT NULL,
summary TEXT DEFAULT '',
url TEXT DEFAULT '',
source_ts TEXT DEFAULT '',
fetched_at TEXT DEFAULT (datetime('now','localtime')),
risk_level TEXT DEFAULT 'unassessed',
risk_reason TEXT DEFAULT ''
)
""")
conn.commit()
def fetch_news():
try:
import akshare as ak
df = ak.stock_info_global_em()
items = []
for _, row in df.iterrows():
items.append({
"title": str(row.get("标题", "")),
"summary": str(row.get("摘要", "")),
"url": str(row.get("链接", "")),
"source_ts": str(row.get("发布时间", "")),
})
return items
except ImportError:
print("[MACRO] akshare 未安装", file=sys.stderr)
return []
except Exception as e:
print(f"[MACRO] 采集失败: {e}", file=sys.stderr)
return []
def title_hash(title):
return hashlib.md5(title.encode()).hexdigest()[:16]
def quick_risk_check(items):
"""红绿灯:快速关键词检测,返回 (level, matched_articles, summary)"""
hits_high = []
hits_medium = []
for item in items:
text = (item["title"] + " " + item["summary"]).lower()
for pattern in HIGH_PATTERNS:
if re.search(pattern, text):
hits_high.append(item)
break
else:
# 没进 HIGH 才检查 MEDIUM
for pattern in MEDIUM_PATTERNS:
if re.search(pattern, text):
hits_medium.append(item)
break
level = "none"
summary = ""
matched = []
if hits_high:
level = "high"
matched = hits_high[:5]
titles = [f"· {h['title'][:50]}" for h in hits_high[:5]]
summary = f"【高风险】{len(hits_high)}条紧急信号:\n" + "\n".join(titles)
elif len(hits_medium) >= 2:
level = "medium"
matched = hits_medium[:5]
titles = [f"· {h['title'][:50]}" for h in hits_medium[:5]]
summary = f"【中风险】{len(hits_medium)}条预警信号:\n" + "\n".join(titles)
return level, matched, summary
def write_risk_signal(conn, level, matched, summary):
"""写入 signal_news + macro_risk_state.json"""
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
sentiment_map = {"high": "宏观-WATCH_HIGH", "medium": "宏观-WATCH_MEDIUM"}
sentiment = sentiment_map.get(level, "")
# 写入 signal_news
articles_json = json.dumps([{"title": a["title"][:80], "summary": a["summary"][:120]} for a in matched], ensure_ascii=False)
conn.execute(
"INSERT INTO signal_news (signal_id, sector, overall_sentiment, summary, key_articles, searched_stocks, source) VALUES (0, ?, ?, ?, ?, '', ?)",
("宏观", sentiment, summary[:500], articles_json, "macro_watch")
)
conn.commit()
# 写入状态文件(供监控cron实时读取)
state = {
"level": level,
"signals": [{"sentiment": sentiment, "summary": summary[:300], "key_articles": articles_json, "created_at": now}],
"signal_count": len(matched),
"created_at": now,
"expired": False,
"source": "collector_realtime",
}
STATE_PATH.write_text(json.dumps(state, ensure_ascii=False, indent=2))
def main():
conn = sqlite3.connect(str(DB_PATH))
ensure_tables(conn)
# 去重基础
existing = set()
for row in conn.execute("SELECT title FROM macro_raw_news ORDER BY id DESC LIMIT 200"):
existing.add(title_hash(row[0]))
items = fetch_news()
if not items:
conn.close()
return
# 去重写入
new_items = []
now_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
for item in items:
if not item["title"].strip():
continue
h = title_hash(item["title"])
if h in existing:
continue
existing.add(h)
try:
conn.execute(
"INSERT INTO macro_raw_news (title, summary, url, source_ts, fetched_at) VALUES (?, ?, ?, ?, ?)",
(item["title"][:300], item["summary"][:500], item["url"][:500], item["source_ts"][:20], now_str)
)
new_items.append(item)
except Exception:
pass
conn.commit()
# 红绿灯检测(只针对新采集的)
level, matched, summary = "none", [], ""
if new_items:
level, matched, summary = quick_risk_check(new_items)
if level in ("high", "medium"):
write_risk_signal(conn, level, matched, summary)
conn.close()
# no_agent 输出
if new_items:
print(f"[MACRO] {len(new_items)}条新宏观新闻")
# HIGH风险:输出预警(会推送到XMPP)
if level == "high":
print(f"⚠️ HIGH风险预警 ({len(matched)}条): {summary[:200]}")
elif level == "medium":
print(f" MEDIUM风险: {len(matched)}条匹配")
if __name__ == "__main__":
import sqlite3
main()