Files
MoFin/scripts/cron_to_xmpp.py
T
2026-06-20 12:11:33 +08:00

360 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""cron_to_xmpp.py — 智能cron报告推送
只推送LLM驱动的分析报告(有实质内容),不推送纯脚本输出。
关键规则:
1. 跳过 no_agent 脚本的输出(价格监控、数据同步等机器数据)
2. 跳过自己的输出目录(30908cdc44a8),避免循环推送
3. 正文太短(<20字)或只有 [SILENT] 的不推
4. 超时自动跳过,不影响后续
"""
import json
import subprocess
import re
import sys
from datetime import datetime
from pathlib import Path
# Absolute home directory, hard-coded so that profile environment variables
# cannot redirect it.
REAL_HOME = Path("/home/hmo")

# Cron output directories that are scanned for reports.
CRON_DIRS = [
    REAL_HOME.joinpath(".hermes", "cron", "output"),
    REAL_HOME.joinpath(".hermes", "profiles", "position-analyst", "cron", "output"),
]

# State files kept next to the default profile's cron data.
JOURNAL = REAL_HOME.joinpath(".hermes", "cron", ".relay_journal.json")
SILENT_STATS = REAL_HOME.joinpath(".hermes", "cron", ".silent_daily_count.json")

# Only reports modified within the last 6 hours are pushed, so that clearing
# the journal does not replay the whole history.
MAX_AGE_HOURS = 6
def load_no_agent_job_ids():
    """Collect the IDs of all jobs flagged ``no_agent`` in both jobs.json files.

    Reads the default profile's ``jobs.json`` and the ``position-analyst``
    profile's ``jobs.json`` under ``REAL_HOME``. A file that is missing is
    skipped silently; a file that is unreadable or not valid JSON is skipped
    with a message on stderr. A malformed job entry only skips that entry,
    not the rest of the file.

    Returns:
        set: the ``id`` values of every job whose ``no_agent`` field is truthy.
    """
    ids = set()
    jobs_files = (
        REAL_HOME / ".hermes" / "cron" / "jobs.json",
        REAL_HOME / ".hermes" / "profiles" / "position-analyst" / "cron" / "jobs.json",
    )
    for jobs_path in jobs_files:
        try:
            with open(jobs_path, encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            # NOTE(review): presumably a profile without jobs.json is normal
            # (SKIP_DIRS exists as a fallback) - stay quiet in that case.
            continue
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"load_no_agent_job_ids: cannot read {jobs_path}: {exc}", file=sys.stderr)
            continue

        jobs = data.get("jobs", []) if isinstance(data, dict) else []
        if not isinstance(jobs, list):
            continue
        for job in jobs:
            if not isinstance(job, dict) or not job.get("no_agent"):
                continue
            try:
                ids.add(job["id"])
            except (KeyError, TypeError):
                # Missing or unhashable id: ignore this entry only.
                continue
    return ids
# Hard-coded fallback skip list (still applies if jobs.json cannot be read).
SKIP_DIRS = {
    "health",        # health-check output
    "30908cdc44a8",  # output of this cron-to-XMPP relay job itself
}

# XMPP addresses used in the stanzas: sender and recipient.
FROM, TO = "zhiwei@yoin.fun", "hmo@yoin.fun"
def load_journal():
    """Load the set of already-processed file keys from ``JOURNAL``.

    Returns:
        set: the entries stored in the journal file, or an empty set when the
        file is missing, unreadable, not valid JSON, or not a JSON array of
        hashable values.
    """
    try:
        entries = json.loads(JOURNAL.read_text(encoding="utf-8"))
    except FileNotFoundError:
        # No journal yet: nothing has been processed.
        return set()
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"load_journal: ignoring unreadable journal {JOURNAL}: {exc}", file=sys.stderr)
        return set()

    if not isinstance(entries, list):
        # A dict or string would otherwise be turned into a set of keys/characters.
        return set()
    try:
        return set(entries)
    except TypeError:
        # Unhashable elements (nested lists/dicts) mean the file is not a journal.
        return set()
def save_journal(entries):
    """Write *entries* to ``JOURNAL`` as a sorted JSON array."""
    serialized = json.dumps(sorted(entries))
    JOURNAL.write_text(serialized)
def is_pure_script_output(content):
    """Tell whether a cron output file is machine output rather than an LLM report.

    Args:
        content: full text of the output file.

    Returns:
        bool: False as soon as the text contains a ``## Response`` section
        (the marker of an agent reply); otherwise True when any of the
        script-output markers below is present, else False.
    """
    if "## Response" in content:
        return False

    # Price-monitor trigger output.
    # NOTE(review): `"" in content` is always true, so this reduces to the
    # startswith check; the literal may have lost a character - confirm
    # against the real file.
    bell_alert = content.startswith("🔔") and "" in content

    substring_markers = (
        "MoFin 系统健康检查",            # health-check report
        "<structured_data>",             # structured machine data tag
        "**Mode:** no_agent (script)",   # header found on no_agent script output
    )
    return (
        # Starts with the cron header but has no "## Response" section.
        content.startswith("# Cron Job:")
        or bell_alert
        or any(marker in content for marker in substring_markers)
    )
def validate_report_body(body):
    """Quality-check a report body; never blocks, only returns suggestions.

    Checks for the two required section names, a length above 600
    characters, vague wording, and repeated "if ... then ..." phrasing.

    Args:
        body: report text.

    Returns:
        list[str]: one message per finding, empty when nothing was found.
    """
    issues = []
    text = body.strip()
    if "重点推荐操作" not in text:
        issues.append("缺少【重点推荐操作】区域(如无需操作可写「无」)")
    if "风险关注" not in text:
        issues.append("缺少【风险关注】区域(如无风险可写「无」)")
    if len(text) > 600:
        issues.append(f"报告偏长({len(text)}字),建议压缩到600字以内")
    fuzzy = re.findall(r"可关注|可考虑|建议观察|试试|谨慎关注|择机|根据情况", text)
    if fuzzy:
        # dict.fromkeys de-duplicates while keeping first-occurrence order, so
        # the message is identical on every run (a set's order is not).
        unique_words = list(dict.fromkeys(fuzzy))
        issues.append(f"含模糊词: {', '.join(unique_words)},建议替换为明确操作指令")
    # Two conditional clauses on the same line read as a multiple-choice answer.
    if re.search(r"如果.*就.*如果.*就|若.*则.*若.*则", text):
        issues.append("含选择题句式,建议只给一个确定建议")
    return issues
def send_feedback(issues, job_name):
    """Send the quality-check findings as an XMPP message from FROM to FROM.

    Best effort: a failure to run the command, a timeout, or a non-zero exit
    is reported on stderr and never raised to the caller.

    Args:
        issues: iterable of finding messages, one per line in the message.
        job_name: name of the job the findings belong to; only used to label
            the stderr diagnostics, it is not part of the message.
    """
    from xml.sax.saxutils import escape

    feedback = "[自我反馈] 报告质量检查发现以下问题,下次注意:\n" + "\n".join(str(i) for i in issues)
    safe = escape(feedback)
    stanza = (
        f"<message from='{FROM}' to='{FROM}' "
        f"type='chat' xml:lang='en'>"
        f"<body>{safe}</body></message>"
    )
    try:
        result = subprocess.run(
            ["docker", "exec", "ejabberd", "ejabberdctl",
             "send_stanza", FROM, FROM, stanza],
            capture_output=True, timeout=10, text=True,
        )
    except (OSError, ValueError, subprocess.SubprocessError) as exc:
        # OSError: docker missing; ValueError: bad argument (e.g. NUL byte);
        # SubprocessError: includes TimeoutExpired.
        print(f"send_feedback({job_name}): {exc}", file=sys.stderr)
        return
    if result.returncode != 0:
        print(
            f"send_feedback({job_name}): exit code {result.returncode}: "
            f"{(result.stderr or '').strip()[:100]}",
            file=sys.stderr,
        )
def extract_body(path):
    """Read a cron output file and return the pushable report text, or None.

    Args:
        path: ``pathlib.Path`` of the output file.

    Returns:
        The cleaned body, or None when the file is script output, when the
        cleaned text is empty, is a run of at most 10 Chinese characters and
        punctuation, contains ``[SILENT]``, or is shorter than 20 characters.
    """
    raw = path.read_text(encoding="utf-8", errors="replace")
    if is_pure_script_output(raw):
        return None

    # Keep what follows the first "## Response" marker (up to the next one),
    # or the whole file when the marker is absent.
    pieces = raw.split("## Response")
    text = (pieces[1] if len(pieces) > 1 else raw).strip()

    # Cleaning steps applied in order: (pattern, replacement, flags, strip afterwards).
    steps = (
        # Lines starting with '#'.
        (r'^#.*?\n', '', re.MULTILINE, True),
        # Embedded structured-data blocks.
        (r'\n?\s*<structured_data>.*?</structured_data>\s*', '', re.DOTALL, True),
        # Bold markers, keeping the enclosed text (no strip after this step).
        (r'\*\*(.*?)\*\*', r'\1', 0, False),
        # Leading agent "thinking" paragraph ("Now let me...", "Let me...", ...).
        (r'^(Now let me|Let me|I need|I will|First let me|First,? I|Now I have|Here.i|I.ll|I.m ).*?\n\n', '', re.DOTALL, True),
        # Trailing "thinking" tail, from its first line to the end of the text.
        (r'\n\s*(Now I|This |I have |I used |The report|The data).*?$', '', re.DOTALL, True),
    )
    for pattern, replacement, flags, strip_after in steps:
        text = re.sub(pattern, replacement, text, flags=flags)
        if strip_after:
            text = text.strip()

    # Only a short acknowledgement left.
    only_short_ack = re.match(r'^[\u4e00-\u9fff,。]{1,10}$', text) is not None
    # [SILENT] reports are never pushed (scan() does its own counting).
    if only_short_ack or not text or "[SILENT]" in text or len(text) < 20:
        return None
    return text
def send(body):
    """Send *body* as an XMPP chat message from FROM to TO via ejabberdctl.

    The text is prefixed with "【知微】", XML-escaped and wrapped in a
    ``<message>`` stanza, then handed to ``ejabberdctl send_stanza`` inside
    the ``ejabberd`` docker container. Up to three attempts are made; every
    kind of failure (timeout, exception, "error" on stderr, non-zero exit)
    is logged to stderr and retried.

    Args:
        body: message text.

    Returns:
        bool: True when one attempt exits with code 0 and no "error" on
        stderr, False when all attempts failed.
    """
    from xml.sax.saxutils import escape

    safe = escape(f"【知微】{body}")
    stanza = (
        f"<message from='{FROM}' to='{TO}' "
        f"type='chat' xml:lang='en'>"
        f"<body>{safe}</body></message>"
    )
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            r = subprocess.run(
                ["docker", "exec", "ejabberd", "ejabberdctl",
                 "send_stanza", FROM, TO, stanza],
                capture_output=True, timeout=10, text=True,
            )
        except subprocess.TimeoutExpired:
            print(f"send timeout (attempt {attempt})", file=sys.stderr)
            continue
        except Exception as e:
            # Deliberately broad: one undeliverable report must not stop the
            # scan (e.g. OSError if docker is missing, ValueError on a NUL byte).
            print(f"send err (attempt {attempt}): {e}", file=sys.stderr)
            continue

        stderr_text = r.stderr or ""
        if "error" in stderr_text.lower():
            print(f"send error (attempt {attempt}): {stderr_text.strip()[:100]}", file=sys.stderr)
            continue
        if r.returncode == 0:
            return True
        # A non-zero exit without "error" on stderr is still a failure: retry
        # it like the other failure modes instead of giving up immediately.
        print(f"send failed (attempt {attempt}): exit code {r.returncode}", file=sys.stderr)
    return False
def validate_format(body):
    """Format check that only records improvement points and never blocks.

    Args:
        body: report text.

    Returns:
        tuple[str, list[str]]: the stripped text and the list of findings;
        an empty list means the report is clean.
    """
    text = body.strip()
    issues = []

    # Required sections.
    # NOTE(review): the previous code also computed whether "其余持仓" or
    # "今日关注" is present but never used the result; that check is not enforced.
    if "重点推荐操作" not in text:
        issues.append("缺【重点推荐操作】区域")
    if "风险关注" not in text:
        issues.append("缺【风险关注】区域")

    # Over-long report.
    if len(text) > 600:
        issues.append(f"报告偏长({len(text)}字),建议压缩到600字内")

    # Vague wording: list at most the first three distinct words, in order of
    # appearance (dict.fromkeys keeps insertion order, unlike a set, so the
    # message is the same on every run).
    fuzzy = re.findall(r"可关注|可考虑|建议观察|试试|谨慎关注|择机|根据情况", text)
    if fuzzy:
        shown = list(dict.fromkeys(fuzzy))[:3]
        issues.append(f"含模糊词({', '.join(shown)}),应给唯一结论")

    # "Either ... or ..." phrasing.
    if re.search(r"如果.*就|若.*则|可以.*也可以", text):
        issues.append("含选择题句式,应给唯一建议")

    return text, issues
def load_silent_stats():
    """Load the per-day suppression counters from ``SILENT_STATS``.

    Returns:
        dict: the stored counters with any missing key filled in from the
        defaults (``date`` "", ``silent`` 0, ``short`` 0, ``script`` 0); the
        defaults alone when the file is missing, unreadable, not valid JSON,
        or not a JSON object.
    """
    defaults = {"date": "", "silent": 0, "short": 0, "script": 0}
    try:
        stored = json.loads(SILENT_STATS.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return defaults
    if not isinstance(stored, dict):
        return defaults
    # Fill gaps so callers can index the counter keys directly.
    return {**defaults, **stored}
def save_silent_stats(stats):
    """Write the counters dict *stats* to ``SILENT_STATS`` as JSON."""
    serialized = json.dumps(stats)
    SILENT_STATS.write_text(serialized)
def send_silent_summary(stats):
    """Send the daily summary of suppressed reports through ``send``.

    Args:
        stats: counters dict; the keys ``silent``, ``short`` and ``script``
            are read, and a missing key counts as zero.
    """
    # (counter key, label) pairs in the order they appear in the message.
    labelled = (
        ("silent", "静默[SILENT]"),
        ("short", "过短(<20字)"),
        ("script", "脚本输出"),
    )
    lines = [
        f"{label} {stats[key]}"
        for key, label in labelled
        if stats.get(key, 0) > 0
    ]
    if lines:
        body = (
            "【每日汇总】今日以下cron报告未送达(已拦截):\n"
            + "\n".join(lines)
            + "\n\n无操作信号的报告正常静默,有操作信号的都已送达。"
        )
    else:
        body = "【每日汇总】今日所有cron报告已正常送达,无被拦截的报告。"
    send(body)
def _clean_report_body(content):
    """Return the report text of *content* with headings and markup removed.

    Takes what follows the first "## Response" marker (up to the next one),
    or the whole text when the marker is absent, then drops lines starting
    with '#', ``<structured_data>`` blocks, and bold markers.
    """
    parts = content.split("## Response")
    body = parts[1].strip() if len(parts) > 1 else content.strip()
    body = re.sub(r'^#.*?\n', '', body, flags=re.MULTILINE).strip()
    body = re.sub(r'\n?\s*<structured_data>.*?</structured_data>\s*', '', body, flags=re.DOTALL).strip()
    return re.sub(r'\*\*(.*?)\*\*', r'\1', body)


def scan():
    """Scan the cron output directories and push every new report over XMPP.

    For each ``.md`` file not yet in the journal: files older than
    ``MAX_AGE_HOURS`` are journaled and skipped; script output, empty or
    short bodies (< 20 characters) and ``[SILENT]`` bodies are counted and
    skipped; everything else is sent with ``send``. Afterwards the journal
    and the per-day counters are saved, and between 16:30 and 16:35 the
    daily summary is sent.

    A file that cannot be stat'ed or read is logged and left out of the
    journal so a later run can retry it, instead of aborting the whole scan.

    Returns:
        int: number of reports for which ``send`` returned True.
    """
    processed = load_journal()
    new = set()
    n_pushed = n_silent = n_short = n_script = 0
    skip_all = SKIP_DIRS | load_no_agent_job_ids()
    scan_started = datetime.now()

    for cron_dir in CRON_DIRS:
        if not cron_dir.exists():
            continue
        for d in sorted(cron_dir.iterdir()):
            if not d.is_dir() or d.name in skip_all:
                continue
            for f in sorted(d.iterdir()):
                if f.suffix != ".md":
                    continue
                key = str(f.resolve())
                if key in processed or key in new:
                    continue

                try:
                    modified = datetime.fromtimestamp(f.stat().st_mtime)
                    age_hours = (scan_started - modified).total_seconds() / 3600
                    # Old files are journaled without being read.
                    content = None
                    if age_hours <= MAX_AGE_HOURS:
                        content = f.read_text(encoding="utf-8", errors="replace")
                except OSError as exc:
                    print(f"  {d.name}/{f.name}: unreadable, skipped: {exc}", file=sys.stderr)
                    continue

                new.add(key)
                if content is None:
                    continue

                # Detect script output before any body extraction.
                if is_pure_script_output(content):
                    n_script += 1
                    continue

                body = _clean_report_body(content)
                if not body:
                    n_short += 1
                    continue
                # [SILENT] is checked before the length check because the
                # marker itself is shorter than 20 characters.
                if "[SILENT]" in body:
                    n_silent += 1
                    continue
                if len(body) < 20:
                    n_short += 1
                    continue

                # Format findings are logged only; they never block the push.
                _, issues = validate_format(body)
                if send(body):
                    n_pushed += 1
                else:
                    print(f"  {d.name}: send failed", file=sys.stderr)
                if issues:
                    print(f"  {d.name}/{f.name}: 改进建议: {'; '.join(issues)}", file=sys.stderr)

    if new:
        save_journal(processed | new)

    # Accumulate today's counters in the stats file (used by the daily summary).
    now = datetime.now()
    today = now.strftime("%Y-%m-%d")
    stats = load_silent_stats()
    if not isinstance(stats, dict) or stats.get("date") != today:
        stats = {"date": today, "silent": 0, "short": 0, "script": 0}
    for name, count in (("silent", n_silent), ("short", n_short), ("script", n_script)):
        stats[name] = stats.get(name, 0) + count
    save_silent_stats(stats)

    # Daily summary window: 16:30-16:35 inclusive, in minutes since midnight.
    # NOTE(review): every run that falls inside the window sends a summary;
    # confirm the cron schedule fires exactly once in it.
    hhmm = now.hour * 60 + now.minute
    if 990 <= hhmm <= 995:
        send_silent_summary(stats)

    log = f"推送{n_pushed}份,静默拦截{n_silent}份,过短{n_short}份,跳过脚本{n_script}"
    print(log, file=sys.stderr)
    return n_pushed
# Script entry point: run a single scan when the file is executed directly.
if __name__ == "__main__":
    scan()