# MoFin/scripts/cron_to_xmpp.py

#!/usr/bin/env python3
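"""cron_to_xmpp.py — smart relay that pushes cron reports over XMPP.

Only LLM-driven analysis reports (with real content) are pushed; plain script
output is never pushed.
Key rules:
1. Skip the output of no_agent scripts (price monitoring, data sync and other
   machine-generated data).
2. Skip this relay's own output directory (30908cdc44a8) to avoid a push loop.
3. Do not push a body that is too short (<20 characters) or that contains
   [SILENT].
4. A send that keeps timing out is given up on after its retries and does not
   block the reports that follow.
"""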
import json
import subprocess
import re
import sys
from datetime import datetime
from pathlib import Path

# Use absolute paths so they are not affected by profile environment variables.
REAL_HOME = Path("/home/hmo")

# Directories to scan: the cron output of the default profile and of the
# "position-analyst" profile.
CRON_DIRS = [
    REAL_HOME / ".hermes" / "cron" / "output",
    REAL_HOME / ".hermes" / "profiles" / "position-analyst" / "cron" / "output",
]
# JSON file listing the resolved paths of report files already handled.
JOURNAL = REAL_HOME / ".hermes" / "cron" / ".relay_journal.json"
# JSON file holding the current day's counters of intercepted reports.
SILENT_STATS = REAL_HOME / ".hermes" / "cron" / ".silent_daily_count.json"
MAX_AGE_HOURS = 6  # Only push reports from the last 6 hours, so clearing the journal does not flood history


def load_no_agent_job_ids():
    """Collect the IDs of all jobs flagged ``no_agent`` in the jobs.json files.

    Two files are consulted: the default profile's jobs.json and the
    ``position-analyst`` profile's. Loading is best-effort so that a broken
    jobs file never stops the relay: a missing file is skipped silently, an
    unreadable or malformed one is skipped with a note on stderr, and a
    malformed job entry only skips that entry (not the rest of the file).

    Returns:
        set[str]: IDs of the jobs whose ``no_agent`` field is truthy.
    """
    ids = set()
    for jobs_path in [
        REAL_HOME / ".hermes" / "cron" / "jobs.json",
        REAL_HOME / ".hermes" / "profiles" / "position-analyst" / "cron" / "jobs.json",
    ]:
        try:
            # JSON is UTF-8 by specification; do not depend on the cron locale.
            with open(jobs_path, encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            continue
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"jobs.json unreadable ({jobs_path}): {exc}", file=sys.stderr)
            continue

        jobs = data.get("jobs", []) if isinstance(data, dict) else []
        if not isinstance(jobs, list):
            continue
        for job in jobs:
            if not isinstance(job, dict) or not job.get("no_agent"):
                continue
            job_id = job.get("id")
            # IDs are compared against directory names, so only strings can match.
            if isinstance(job_id, str):
                ids.add(job_id)
    return ids


# Hard-coded fallback (still applies when jobs.json cannot be read)
SKIP_DIRS = {
    "30908cdc44a8",  # this cron-to-XMPP relay's own output
    "health",         # health-check output
}

# XMPP addresses: FROM is the sender JID, TO is the recipient of pushed reports.
FROM = "zhiwei@yoin.fun"
TO = "hmo@yoin.fun"


def load_journal():
    """Load the set of report keys that earlier runs already handled.

    The journal is expected to be a JSON array of strings, as written by
    ``save_journal``. Loading is best-effort: a missing file yields an empty
    set silently; an unreadable, malformed or wrongly-shaped file yields an
    empty set with a note on stderr.

    Returns:
        set[str]: the journaled keys (empty when nothing usable was found).
    """
    try:
        entries = json.loads(JOURNAL.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return set()
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"journal unreadable, starting empty: {exc}", file=sys.stderr)
        return set()

    if not isinstance(entries, list):
        # e.g. a bare JSON string would otherwise become a set of characters.
        print("journal is not a JSON array, starting empty", file=sys.stderr)
        return set()
    # Non-string entries can never match a path key and would break sorting
    # when the journal is written back.
    return {entry for entry in entries if isinstance(entry, str)}


def save_journal(entries):
    """Write the processed-report keys to the journal file as a sorted JSON array."""
    serialized = json.dumps(sorted(entries))
    JOURNAL.write_text(serialized)


def is_pure_script_output(content):
    """Tell whether a cron output file is machine output rather than an LLM report.

    Args:
        content: full text of the cron output file.

    Returns:
        bool: False as soon as the text has a ``## Response`` section (the
        agent's reply); otherwise True when any known script-output signature
        is present, and False when none is.
    """
    # An agent reply section marks an LLM report, whatever else the file holds.
    if "## Response" in content:
        return False

    # A cron header without a Response section is treated as script output.
    if content.startswith("# Cron Job:"):
        return True

    # Trigger output of the price monitor.
    if content.startswith("🔔") and "⏱" in content:
        return True

    machine_markers = (
        "MoFin 系统健康检查",            # health-check report
        "<structured_data>",             # structured machine data tag
        "**Mode:** no_agent (script)",   # header of no_agent script output
    )
    return any(marker in content for marker in machine_markers)


def validate_report_body(body):
    """Run quality checks on a report body; never blocks, only advises.

    Checks for the two expected section names, an over-long body (more than
    600 characters after stripping), vague wording, and a repeated
    "if ... then ... if ... then" construction on a single line.

    Args:
        body: report text.

    Returns:
        list[str]: improvement suggestions, empty when nothing was found.
    """
    issues = []
    text = body.strip()

    if "重点推荐操作" not in text:
        issues.append("缺少【重点推荐操作】区域（如无需操作可写「无」）")

    if "风险关注" not in text:
        issues.append("缺少【风险关注】区域（如无风险可写「无」）")

    if len(text) > 600:
        issues.append(f"报告偏长({len(text)}字)，建议压缩到600字以内")

    fuzzy = re.findall(r"可关注|可考虑|建议观察|试试|谨慎关注|择机|根据情况", text)
    if fuzzy:
        # Deduplicate in first-occurrence order; a plain set would list the
        # words in a different order from one run to the next.
        unique_fuzzy = list(dict.fromkeys(fuzzy))
        issues.append(f"含模糊词: {', '.join(unique_fuzzy)}，建议替换为明确操作指令")

    if re.search(r"如果.*就.*如果.*就|若.*则.*若.*则", text):
        issues.append("含选择题句式，建议只给一个确定建议")

    return issues


def send_feedback(issues, job_name):
    """Send the quality-check findings to the sender's own XMPP account.

    The message goes from ``FROM`` to ``FROM`` through ``ejabberdctl
    send_stanza`` inside the ``ejabberd`` docker container. Delivery is
    best-effort: a failure to run the command is reported on stderr and never
    raised.

    Args:
        issues: iterable of suggestion strings, one bullet per item.
        job_name: currently unused; kept so existing callers keep working.
    """
    from xml.sax.saxutils import escape

    feedback = "[自我反馈] 报告质量检查发现以下问题，下次注意：\n" + "\n".join(f"• {i}" for i in issues)
    # Escape &, < and > so the text is safe inside the <body> element.
    safe = escape(feedback)
    stanza = (
        f"<message from='{FROM}' to='{FROM}' "
        f"type='chat' xml:lang='en'>"
        f"<body>{safe}</body></message>"
    )
    try:
        subprocess.run(
            ["docker", "exec", "ejabberd", "ejabberdctl",
             "send_stanza", FROM, FROM, stanza],
            capture_output=True, timeout=10, text=True,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # OSError: docker missing or not executable; SubprocessError: timeout.
        print(f"feedback send failed: {exc}", file=sys.stderr)


def extract_body(path):
    """Read a cron output file and return the pushable report text, if any.

    Args:
        path: ``pathlib.Path`` of the cron output file.

    Returns:
        str | None: the cleaned report body, or None when the file is script
        output, when nothing is left after cleaning, when the body contains
        ``[SILENT]``, or when it is shorter than 20 characters.
    """
    raw = path.read_text(encoding="utf-8", errors="replace")
    if is_pure_script_output(raw):
        return None

    # Keep the text after the first "## Response" marker (up to the next such
    # marker, if any); without a marker the whole file is used.
    segments = raw.split("## Response")
    text = (segments[1] if len(segments) > 1 else raw).strip()

    # Drop every line that starts with '#'.
    text = re.sub(r'^#.*?\n', '', text, flags=re.MULTILINE).strip()
    # Drop <structured_data>...</structured_data> sections.
    text = re.sub(r'\n?\s*<structured_data>.*?</structured_data>\s*', '', text, flags=re.DOTALL).strip()
    # Unwrap **bold** markers, keeping the inner text.
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)

    # Drop a leading paragraph of agent "thinking" ("Now let me...", "Let me...",
    # "Now I have...", etc.).
    text = re.sub(r'^(Now let me|Let me|I need|I will|First let me|First,? I|Now I have|Here.i|I.ll|I.m ).*?\n\n', '', text, flags=re.DOTALL).strip()
    # Drop a trailing "thinking" tail, from its first matching line to the end.
    text = re.sub(r'\n\s*(Now I|This |I have |I used |The report|The data).*?$', '', text, flags=re.DOTALL).strip()

    # A body of at most ten Chinese characters/commas/periods is a bare
    # acknowledgement rather than a report.
    is_short_ack = re.match(r'^[\u4e00-\u9fff，。]{1,10}$', text) is not None

    # [SILENT] bodies are not pushed (the counting happens in scan()).
    if is_short_ack or not text or "[SILENT]" in text or len(text) < 20:
        return None
    return text


def send(body):
    """Push one report to ``TO`` over XMPP, trying up to three times.

    The body is prefixed with ``【知微】``, XML-escaped, wrapped in a chat
    ``<message>`` stanza and handed to ``ejabberdctl send_stanza`` inside the
    ``ejabberd`` docker container. An attempt succeeds only when the command
    exits with code 0 and its stderr does not mention "error"; every other
    outcome (timeout, failure to launch, non-zero exit, error text) is logged
    to stderr and retried until the attempts are used up.

    Args:
        body: report text to send.

    Returns:
        bool: True once an attempt succeeds, False when all attempts fail.
    """
    from xml.sax.saxutils import escape

    max_attempts = 3
    safe = escape(f"【知微】{body}")
    stanza = (
        f"<message from='{FROM}' to='{TO}' "
        f"type='chat' xml:lang='en'>"
        f"<body>{safe}</body></message>"
    )
    command = ["docker", "exec", "ejabberd", "ejabberdctl",
               "send_stanza", FROM, TO, stanza]

    for attempt in range(1, max_attempts + 1):
        try:
            r = subprocess.run(command, capture_output=True, timeout=10, text=True)
        except subprocess.TimeoutExpired:
            print(f"send timeout (attempt {attempt})", file=sys.stderr)
            continue
        except Exception as e:
            # Best-effort boundary: log whatever went wrong and try again.
            print(f"send err (attempt {attempt}): {e}", file=sys.stderr)
            continue

        stderr_text = (r.stderr or "").strip()
        if r.returncode == 0 and "error" not in stderr_text.lower():
            return True

        # A non-zero exit with a quiet stderr is a failure too, so it is
        # logged and retried like any other.
        detail = stderr_text[:100] or f"exit code {r.returncode}"
        print(f"send error (attempt {attempt}): {detail}", file=sys.stderr)

    return False


def validate_format(body):
    """Check a report's format; records improvement points, never blocks.

    Args:
        body: report text.

    Returns:
        tuple[str, list[str]]: the stripped text and the list of issues found.
        The text is always returned; an empty list means the report is clean.
    """
    text = body.strip()
    issues = []

    # Required sections.
    if "重点推荐操作" not in text:
        issues.append("缺【重点推荐操作】区域")
    if "风险关注" not in text:
        issues.append("缺【风险关注】区域")

    # Over-long reminder.
    if len(text) > 600:
        issues.append(f"报告偏长({len(text)}字)，建议压缩到600字内")

    # Vague-wording reminder: show at most the first three distinct words, in
    # order of appearance (a plain set would pick an arbitrary three).
    fuzzy = re.findall(r"可关注|可考虑|建议观察|试试|谨慎关注|择机|根据情况", text)
    if fuzzy:
        shown = list(dict.fromkeys(fuzzy))[:3]
        issues.append(f"含模糊词({', '.join(shown)})，应给唯一结论")

    # "Either/or" phrasing reminder.
    if re.search(r"如果.*就|若.*则|可以.*也可以", text):
        issues.append("含选择题句式，应给唯一建议")

    return text, issues


def load_silent_stats():
    """Load the current day's interception counters from ``SILENT_STATS``.

    Loading is best-effort: when the file is missing, unreadable, not valid
    JSON, or not a JSON object, fresh defaults are returned. Missing keys are
    filled from the defaults and non-integer counters are reset to 0, so
    callers can always do arithmetic on ``silent``, ``short`` and ``script``.
    Any extra keys stored in the file are kept.

    Returns:
        dict: at least ``date`` (str) and the integer counters ``silent``,
        ``short`` and ``script``.
    """
    defaults = {"date": "", "silent": 0, "short": 0, "script": 0}
    try:
        loaded = json.loads(SILENT_STATS.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return defaults
    if not isinstance(loaded, dict):
        return defaults

    stats = {**defaults, **loaded}
    for counter in ("silent", "short", "script"):
        value = stats[counter]
        # bool is a subclass of int but is not a meaningful count.
        if isinstance(value, bool) or not isinstance(value, int):
            stats[counter] = 0
    return stats


def save_silent_stats(stats):
    """Write the daily interception counters to ``SILENT_STATS`` as JSON."""
    payload = json.dumps(stats)
    SILENT_STATS.write_text(payload)


def send_silent_summary(stats):
    """Send the daily summary of reports that were intercepted instead of pushed.

    Args:
        stats: mapping with the optional counters ``silent``, ``short`` and
            ``script``; a missing counter counts as 0.
    """
    silent = stats.get("silent", 0)
    short = stats.get("short", 0)
    script = stats.get("script", 0)

    # One line per counter that is above zero.
    lines = []
    if silent > 0:
        lines.append(f"静默[SILENT] {silent}次")
    if short > 0:
        lines.append(f"过短(<20字) {short}次")
    if script > 0:
        lines.append(f"脚本输出 {script}次")

    if lines:
        bullets = "\n".join(f"• {line}" for line in lines)
        body = "【每日汇总】今日以下cron报告未送达（已拦截）：\n" + bullets + "\n\n无操作信号的报告正常静默，有操作信号的都已送达。"
    else:
        body = "【每日汇总】今日所有cron报告已正常送达，无被拦截的报告。"

    send(body)


def scan():
    """Run one relay pass: find new cron reports, push them, update the stats.

    For every ``.md`` file in the job directories under ``CRON_DIRS`` (except
    directories named in ``SKIP_DIRS`` or matching a ``no_agent`` job ID) that
    is not yet in the journal:

    * files older than ``MAX_AGE_HOURS`` are journaled and ignored;
    * script output, empty bodies, ``[SILENT]`` bodies and bodies shorter than
      20 characters are journaled and counted, not pushed;
    * everything else is pushed with ``send``. A file is journaled even when
      the send fails, so it is not attempted again on a later run.

    A file that cannot be stat'ed or read is logged and left out of the
    journal so that a later run retries it; it no longer aborts the pass
    before the journal of already-sent reports is saved.

    Afterwards the daily counters are updated and, between 16:30 and 16:35
    local time, the daily summary is sent — at most once per day.

    Returns:
        int: number of reports successfully sent during this pass.
    """
    processed = load_journal()
    new = set()
    n_pushed = 0
    n_silent = 0
    n_short = 0
    n_script = 0
    skip_all = SKIP_DIRS | load_no_agent_job_ids()

    for cron_dir in CRON_DIRS:
        if not cron_dir.exists():
            continue

        for d in sorted(cron_dir.iterdir()):
            if not d.is_dir() or d.name in skip_all:
                continue

            for f in sorted(d.iterdir()):
                if f.suffix != ".md":
                    continue
                key = str(f.resolve())
                if key in processed or key in new:
                    continue

                try:
                    modified = datetime.fromtimestamp(f.stat().st_mtime)
                    age_hours = (datetime.now() - modified).total_seconds() / 3600
                    # Skip files older than MAX_AGE_HOURS without reading them.
                    is_stale = age_hours > MAX_AGE_HOURS
                    content = "" if is_stale else f.read_text(encoding="utf-8", errors="replace")
                except OSError as exc:
                    # Not journaled, so the file is retried on the next run.
                    print(f"  {d.name}/{f.name}: unreadable, will retry next run: {exc}", file=sys.stderr)
                    continue

                new.add(key)
                if is_stale:
                    continue

                # Detect script output early.
                if is_pure_script_output(content):
                    n_script += 1
                    continue

                parts = content.split("## Response")
                body = parts[1].strip() if len(parts) > 1 else content.strip()
                body = re.sub(r'^#.*?\n', '', body, flags=re.MULTILINE).strip()
                body = re.sub(r'\n?\s*<structured_data>.*?</structured_data>\s*', '', body, flags=re.DOTALL).strip()
                body = re.sub(r'\*\*(.*?)\*\*', r'\1', body)

                if not body:
                    n_short += 1
                    continue

                # SILENT -> intercept and count (checked before the length
                # test, because "[SILENT]" alone is only 8 characters).
                if "[SILENT]" in body:
                    n_silent += 1
                    continue

                if len(body) < 20:
                    n_short += 1
                    continue

                # Format validation: records improvement points, never blocks.
                _, issues = validate_format(body)

                # Count a report as pushed only when the send succeeded.
                if send(body):
                    n_pushed += 1
                else:
                    print(f"  {d.name}: send failed", file=sys.stderr)
                if issues:
                    print(f"  {d.name}/{f.name}: 改进建议: {'; '.join(issues)}", file=sys.stderr)

    if new:
        save_journal(processed | new)

    # Accumulate today's counters in the stats file (used by the 16:30 summary).
    now = datetime.now()
    today = now.strftime("%Y-%m-%d")
    stats = load_silent_stats()
    if not isinstance(stats, dict) or stats.get("date") != today:
        stats = {"date": today, "silent": 0, "short": 0, "script": 0}
    stats["silent"] = stats.get("silent", 0) + n_silent
    stats["short"] = stats.get("short", 0) + n_short
    stats["script"] = stats.get("script", 0) + n_script
    save_silent_stats(stats)

    # 16:30~16:35 (after market close): send the daily summary. The
    # "summary_sent" marker keeps several runs inside the window from each
    # sending it again.
    hhmm = now.hour * 60 + now.minute
    if 990 <= hhmm <= 995 and stats.get("summary_sent") != today:
        send_silent_summary(stats)
        stats["summary_sent"] = today
        save_silent_stats(stats)

    log = f"推送{n_pushed}份，静默拦截{n_silent}份，过短{n_short}份，跳过脚本{n_script}份"
    print(log, file=sys.stderr)
    return n_pushed


# Script entry point: run a single scan pass when executed directly.
if __name__ == "__main__":
    scan()