feat: 数据治理+深套解套方案

- data_governance.py: holding_strategies去重(1642→345) - 检查缺失策略的持仓(中际旭创已补) - 深套持仓统计 - 中际旭创(300308)技术面策略已生成 - 止损1287 止盈1453 买入区1297~1350 RR=3.59 - 深套解套方向: 丘钛科技 -48% → 反弹到9.7卖1/3 万科企业 -53% → 反弹到2.8卖1/3 紫金矿业 -36% → 反弹到30.4卖1/3 比亚迪股份 -27% → 反弹到89.8卖1/3 中科电气 -32% → 反弹到18.6卖1/3
2026-06-25 21:22:59 +08:00
parent b63475402e
commit fa3fc93f25
3 changed files with 108 additions and 20 deletions
@@ -0,0 +1,354 @@
+#!/usr/bin/env python3
+"""cron_to_xmpp.py — 智能cron报告推送
+
+只推送LLM驱动的分析报告（有实质内容），不推送纯脚本输出。
+关键规则：
+1. 跳过 no_agent 脚本的输出（价格监控、数据同步等机器数据）
+2. 跳过自己的输出目录（30908cdc44a8），避免循环推送
+3. 正文太短（<20字）或只有 [SILENT] 的不推
+4. 超时自动跳过，不影响后续
+"""
+import json
+import subprocess
+import re
+import sys
+from datetime import datetime
+from pathlib import Path
+
+# 使用绝对路径，不受 profile 环境变量影响
+REAL_HOME = Path("/home/hmo")
+
+# 扫描目录
+CRON_DIRS = [
+    REAL_HOME / ".hermes" / "cron" / "output",
+    REAL_HOME / ".hermes" / "profiles" / "position-analyst" / "cron" / "output",
+]
+JOURNAL = REAL_HOME / ".hermes" / "cron" / ".relay_journal.json"
+SILENT_STATS = REAL_HOME / ".hermes" / "cron" / ".silent_daily_count.json"
+MAX_AGE_HOURS = 6  # 只推送6小时内的报告，防止清journal后爆历史
+
+
+def load_no_agent_job_ids():
+    """从两个profile的jobs.json中读取所有no_agent=true的job ID"""
+    ids = set()
+    for jobs_path in [
+        REAL_HOME / ".hermes" / "cron" / "jobs.json",
+        REAL_HOME / ".hermes" / "profiles" / "position-analyst" / "cron" / "jobs.json",
+    ]:
+        try:
+            with open(jobs_path) as f:
+                data = json.load(f)
+            for j in data.get("jobs", []):
+                if j.get("no_agent"):
+                    ids.add(j["id"])
+        except:
+            pass
+    return ids
+
+
+# 硬编码保底（如果 jobs.json 读不到）
+SKIP_DIRS = {
+    "30908cdc44a8",  # cron-推XMPP中继自身输出
+    "a231e9c39b4e",  # 知识研究-日常（由莫荷负责推送）
+    "7bda62d24d22",  # 梦境循环-知识库归并（由莫荷负责推送）
+    "health",         # 健康检查输出
+    "b9fa4482dc1a",  # 自成长知识库-22:10中继推送（莫荷的通道）
+}
+
+FROM = "zhiwei@yoin.fun"
+TO = "hmo@yoin.fun"
+
+
+def load_journal():
+    try:
+        return set(json.loads(JOURNAL.read_text()))
+    except:
+        return set()
+
+
+def save_journal(entries):
+    JOURNAL.write_text(json.dumps(sorted(entries)))
+
+
+def is_pure_script_output(content):
+    """判断文件是否是纯脚本的机器输出（不是LLM报告）"""
+    # LLM报告的特征：有 ## Response 节（包含agent的回复）
+    if "## Response" in content:
+        return False
+    # 以 # Cron Job: 开头但没有 ## Response 的可能是脚本输出
+    if content.startswith("# Cron Job:"):
+        return True
+    # 价格监控的触发输出
+    if content.startswith("🔔") and "⏱" in content:
+        return True
+    # 健康检查报告
+    if "MoFin 系统健康检查" in content:
+        return True
+    # [SILENT] 标记一概不拦 — 用户想看到报告结构，不想被静默
+    # 移除 [SILENT] 过滤，让报告始终送达
+    # 结构化数据标签（价格监控的机器数据）
+    if "<structured_data>" in content:
+        return True
+    # no_agent 脚本的输出特征（Hermes自动添加的header）
+    if "**Mode:** no_agent (script)" in content:
+        return True
+    return False
+
+
+def validate_report_body(body):
+    """质量检查 — 不拦截，返回改进建议"""
+    issues = []
+    text = body.strip()
+
+    if "重点推荐操作" not in text:
+        issues.append("缺少【重点推荐操作】区域（如无需操作可写「无」）")
+
+    if "风险关注" not in text:
+        issues.append("缺少【风险关注】区域（如无风险可写「无」）")
+
+    if len(text) > 600:
+        issues.append(f"报告偏长({len(text)}字)，建议压缩到600字以内")
+
+    fuzzy = re.findall(r"可关注|可考虑|建议观察|试试|谨慎关注|择机|根据情况", text)
+    if fuzzy:
+        issues.append(f"含模糊词: {', '.join(set(fuzzy))}，建议替换为明确操作指令")
+
+    if re.search(r"如果.*就.*如果.*就|若.*则.*若.*则", text):
+        issues.append("含选择题句式，建议只给一个确定建议")
+
+    return issues
+
+
+def send_feedback(issues, job_name):
+    """发送质量反馈给知微自己"""
+    from xml.sax.saxutils import escape
+    feedback = f"[自我反馈] 报告质量检查发现以下问题，下次注意：\n" + "\n".join(f"• {i}" for i in issues)
+    safe = escape(feedback)
+    stanza = (
+        f"<message from='{FROM}' to='{FROM}' "
+        f"type='chat' xml:lang='en'>"
+        f"<body>{safe}</body></message>"
+    )
+    try:
+        subprocess.run(
+            ["docker", "exec", "ejabberd", "ejabberdctl",
+             "send_stanza", FROM, FROM, stanza],
+            capture_output=True, timeout=10, text=True,
+        )
+    except:
+        pass
+
+
+def extract_body(path):
+    content = path.read_text(encoding="utf-8", errors="replace")
+
+    if is_pure_script_output(content):
+        return None
+
+    parts = content.split("## Response")
+    body = parts[1].strip() if len(parts) > 1 else content.strip()
+    body = re.sub(r'^#.*?\n', '', body, flags=re.MULTILINE).strip()
+    body = re.sub(r'\n?\s*<structured_data>.*?</structured_data>\s*', '', body, flags=re.DOTALL).strip()
+    body = re.sub(r'\*\*(.*?)\*\*', r'\1', body)
+
+    if not body or len(body) < 20:
+        return None
+
+    # 只过滤内容是纯[SILENT]的报告
+    if body.strip() == "[SILENT]":
+        return None
+
+    # 正文中混了[SILENT]标记（LLM写了报告又在末尾加了这个）— 去掉标记保留正文
+    body = body.replace("[SILENT]", "").strip()
+    if len(body) < 20:
+        return None
+
+    return body
+
+
+def send(body):
+    from xml.sax.saxutils import escape
+    safe = escape(f"【知微】{body}")
+    stanza = (
+        f"<message from='{FROM}' to='{TO}' "
+        f"type='chat' xml:lang='en'>"
+        f"<body>{safe}</body></message>"
+    )
+    # 重试3次
+    for attempt in range(3):
+        try:
+            r = subprocess.run(
+                ["docker", "exec", "ejabberd", "ejabberdctl",
+                 "send_stanza", FROM, TO, stanza],
+                capture_output=True, timeout=10, text=True,
+            )
+            if r.stderr and "error" in r.stderr.lower():
+                print(f"send error (attempt {attempt+1}): {r.stderr.strip()[:100]}", file=sys.stderr)
+                if attempt < 2:
+                    continue
+                return False
+            return r.returncode == 0
+        except subprocess.TimeoutExpired:
+            print(f"send timeout (attempt {attempt+1})", file=sys.stderr)
+            if attempt < 2:
+                continue
+            return False
+        except Exception as e:
+            print(f"send err (attempt {attempt+1}): {e}", file=sys.stderr)
+            if attempt < 2:
+                continue
+            return False
+    return False
+
+
+def validate_format(body):
+    """格式检查 — 只记录不拦截，标记改进点"""
+    text = body.strip()
+    issues = []
+
+    # 必含区域检查
+    has_key = "重点推荐操作" in text
+    has_risk = "风险关注" in text
+    has_rest = "其余持仓" in text or "今日关注" in text
+    if not has_key:
+        issues.append("缺【重点推荐操作】区域")
+    if not has_risk:
+        issues.append("缺【风险关注】区域")
+
+    # 超长提醒
+    if len(text) > 600:
+        issues.append(f"报告偏长({len(text)}字)，建议压缩到600字内")
+
+    # 模糊词提醒
+    fuzzy = re.findall(r"可关注|可考虑|建议观察|试试|谨慎关注|择机|根据情况", text)
+    if fuzzy:
+        issues.append(f"含模糊词({', '.join(list(set(fuzzy))[:3])})，应给唯一结论")
+
+    # 选择题句式提醒
+    if re.search(r"如果.*就|若.*则|可以.*也可以", text):
+        issues.append("含选择题句式，应给唯一建议")
+
+    return text, issues  # 始终通过，issues 为空就是干净
+
+
+def load_silent_stats():
+    """加载当日静默统计"""
+    try:
+        return json.loads(SILENT_STATS.read_text())
+    except:
+        return {"date": "", "silent": 0, "short": 0, "script": 0}
+
+
+def save_silent_stats(stats):
+    SILENT_STATS.write_text(json.dumps(stats))
+
+
+def send_silent_summary(stats):
+    """发送当日静默报告汇总"""
+    parts = []
+    if stats.get("silent", 0) > 0:
+        parts.append(f"静默[SILENT] {stats['silent']}次")
+    if stats.get("short", 0) > 0:
+        parts.append(f"过短(<20字) {stats['short']}次")
+    if stats.get("script", 0) > 0:
+        parts.append(f"脚本输出 {stats['script']}次")
+
+    if not parts:
+        body = "【每日汇总】今日所有cron报告已正常送达，无被拦截的报告。"
+    else:
+        body = "【每日汇总】今日以下cron报告未送达（已拦截）：\n" + "\n".join(f"• {p}" for p in parts) + "\n\n无操作信号的报告正常静默，有操作信号的都已送达。"
+
+    send(body)
+
+
+def scan():
+    processed = load_journal()
+    new = set()
+    n_pushed = 0
+    n_silent = 0
+    n_short = 0
+    n_script = 0
+    no_agent_ids = load_no_agent_job_ids()
+    skip_all = SKIP_DIRS | no_agent_ids
+
+    for cron_dir in CRON_DIRS:
+        if not cron_dir.exists():
+            continue
+
+        for d in sorted(cron_dir.iterdir()):
+            if not d.is_dir():
+                continue
+            if d.name in skip_all:
+                continue
+
+            for f in sorted(d.iterdir()):
+                if f.suffix != ".md":
+                    continue
+                key = str(f.resolve())
+                if key in processed or key in new:
+                    continue
+                new.add(key)
+
+                # 跳过超过MAX_AGE_HOURS小时的旧文件
+                age_hours = (datetime.now() - datetime.fromtimestamp(f.stat().st_mtime)).total_seconds() / 3600
+                if age_hours > MAX_AGE_HOURS:
+                    continue
+
+                content = f.read_text(encoding="utf-8", errors="replace")
+
+                # 提前判断脚本输出
+                if is_pure_script_output(content):
+                    n_script += 1
+                    continue
+
+                parts = content.split("## Response")
+                body = parts[1].strip() if len(parts) > 1 else content.strip()
+                body = re.sub(r'^#.*?\n', '', body, flags=re.MULTILINE).strip()
+                body = re.sub(r'\n?\s*<structured_data>.*?</structured_data>\s*', '', body, flags=re.DOTALL).strip()
+                body = re.sub(r'\*\*(.*?)\*\*', r'\1', body)
+
+                if not body or len(body) < 20:
+                    n_short += 1
+                    continue
+
+                # SILENT → 拦截，记数
+                if "[SILENT]" in body:
+                    n_silent += 1
+                    continue
+
+                # 格式校验 — 记录改进点，不拦截
+                ok_body, issues = validate_format(body)
+
+                n_pushed += 1
+                ok_sent = send(body)
+                if not ok_sent:
+                    print(f"  {d.name}: send failed", file=sys.stderr)
+                if issues:
+                    print(f"  {d.name}/{f.name}: 改进建议: {'; '.join(issues)}", file=sys.stderr)
+
+    if new:
+        save_journal(processed | new)
+
+    # 保存当日汇总到文件（供16:30汇总用）
+    today = datetime.now().strftime("%Y-%m-%d")
+    stats = load_silent_stats()
+    if stats.get("date") != today:
+        stats = {"date": today, "silent": 0, "short": 0, "script": 0}
+    stats["silent"] += n_silent
+    stats["short"] += n_short
+    stats["script"] += n_script
+    save_silent_stats(stats)
+
+    # 16:30~16:35 发送当日汇总（收盘后）
+    now = datetime.now()
+    hhmm = now.hour * 60 + now.minute
+    if 990 <= hhmm <= 995:  # 16:30~16:35
+        send_silent_summary(stats)
+
+    log = f"推送{n_pushed}份，静默拦截{n_silent}份，过短{n_short}份，跳过脚本{n_script}份"
+    print(log, file=sys.stderr)
+    return n_pushed
+
+
+if __name__ == "__main__":
+    scan()