diff --git a/README.md b/README.md
index f043915..66d6017 100644
--- a/README.md
+++ b/README.md
@@ -55,17 +55,27 @@ systemctl start xmpp-{bot}
```
AgentsMeeting/
-├── README.md # 本文件
-├── gateway/ # 实际运行的代码(脚本+工具+日志)
+├── xmpp_agent_core.py # 统一 Bot 核心(所有 Agent 共享)
+│ 用法: python xmpp_agent_core.py --agent xxm|mohe|zhiwei|xiaoguo
+│ 功能: PID锁/重连/MUC/dedup/batching/coordinator协议/HTTP桥
+├── xxm_bot.py # 兼容入口 → xmpp_agent_core.py --agent xxm
+├── mohe_bot.py # 兼容入口 → xmpp_agent_core.py --agent mohe
+├── zhiwei_bot.py # 兼容入口 → xmpp_agent_core.py --agent zhiwei
+├── xiaoguo_bot.py # 兼容入口 → xmpp_agent_core.py --agent xiaoguo
+├── gateway/ # 运行时组件(脚本+工具+日志)
│ ├── README.md # gateway 自身说明
│ ├── scripts/ # 运行时脚本
-│ │ ├── xmpp_bot.py # XMPP Bot (HTTP桥 :5802)
+│ │ ├── chat_bridge.py # xxm LLM 桥接(SessionBridge)
+│ │ ├── session_router.py # 消息路由
│ │ ├── wechat_agent.py # 微信桥接代理
│ │ ├── api_proxy.py # API 反向代理 (:8787)
│ │ ├── xmpp_watchdog.py # 进程看门狗
│ │ ├── health_check_xxm.py# 消息流健康检查
│ │ ├── mohe_watcher.py # 莫荷消息监控
│ │ ├── dashboard.py # 管理门户后端
+│ │ ├── proc_guard.py # PID 锁(防重复启动)
+│ │ ├── qq_bot.py # QQ bot 桥接
+│ │ ├── vc_webhook.py # VoceChat webhook
│ │ └── templates/
│ │ └── dashboard.html # 管理门户前端
│ ├── logs/ # 运行时日志
@@ -73,37 +83,37 @@ AgentsMeeting/
├── docs/
│ ├── ARCHITECTURE.md # 架构设计文档
│ ├── AUDIT.md # 稳定性审计报告
+│ ├── CLEANUP_PLAN.md # 代码清理方案
│ ├── OBSERVER-PROTOCOL.md # 群聊观察者模式协议
│ ├── SESSION-BRIDGE-PROTOCOL.md # 跨Session消息桥接协议
│ ├── PRD_v0.2.md # 产品需求文档
│ ├── DEPLOY.md # 部署指南
│ └── OPS.md # 运维手册
├── config/
-│ ├── .env.example # 环境变量模板
-│ └── profiles/ # 各 Agent 配置文件
-│ ├── xxm/
-│ ├── mohe/
-│ ├── xiaoguo/
-│ └── zhiwei/
-├── src/ # 重构版本(逐步迁移中)
-│ ├── shared/ # 共享库
-│ │ ├── config.py # 集中配置管理
-│ │ └── bot_base.py # Bot 基类
-│ ├── bots/ # Bot 实现
-│ ├── channels/ # 通道桥接
-│ │ ├── wechat/ # 微信桥接
-│ │ └── qq/ # QQ 桥接(规划中)
-│ └── ops/ # 运维工具
-│ ├── watchdog.py
-│ └── health_check.py
+│ ├── agents.yaml # Agent 实例注册
+│ └── .env.example # 环境变量模板
+├── configs/ # 各 Agent 配置/SOUL
+│ ├── main/
+│ ├── mohe/
+│ ├── xiaoguo/
+│ └── position-analyst/
+├── src/
+│ ├── shared/
+│ │ ├── config.py # 集中配置管理(env-var 方式)
+│ │ └── bot_base.py # Bot 基类(功能已合并进 xmpp_agent_core.py)
+│ ├── channels/
+│ │ └── qq/
+│ │ └── bridge.py # QQ 通道骨架
+│ └── ops/
+│ └── watch_group.py # 群聊监控
├── deploy/
│ ├── windows/
│ │ ├── start.ps1 # 一键启动
│ │ ├── stop.ps1 # 一键停止
│ │ └── check.ps1 # 状态检查
│ └── linux/
-│ ├── install.sh # 安装脚本(待 mohe)
-│ └── hermes-*.service # systemd 模板(待 mohe)
+│ ├── install.sh # 安装脚本
+│ └── hermes-*.service # systemd 模板
└── tests/ # 测试套件
```
@@ -114,8 +124,8 @@ AgentsMeeting/
| 组件 | 方式 | 频率 |
|------|------|------|
| 管理门户 | dashboard.py + Web UI (:5803) | 实时轮询 5s |
-| xmpp_bot 进程 | watchdog (xmpp_watchdog.py) | 30s |
-| xmpp_bot 消息流 | health_check_xxm.py | 5min (scheduled task) |
+| xmpp Agent Bot | watchdog (xmpp_watchdog.py) | 30s |
+| xmpp Agent 消息流 | health_check_xxm.py | 5min (scheduled task) |
| wechat_agent | 内置看门狗 | 120s |
| 日志 | `logs/health_check.log` | 人工查看 |
@@ -138,7 +148,7 @@ AgentsMeeting/
| ID | 问题 | 平台 | 状态 |
|----|------|------|------|
-| R01 | MUC join 超时 (conference.yoin.fun DNS) | Linux (ejabberd) | 🔴 |
+| R01 | MUC join 超时 (conference.yoin.fun DNS) | Linux (ejabberd) | 🟢 已修复(join_muc + raw presence 双保险) |
| R02 | wechat_agent 无系统级自动恢复 | Windows | 🟡 |
| R03 | Gateway 进程无 systemd auto-restart | Linux | 🔴 |
| R04 | 日志无系统级轮转 | Windows + Linux | 🟡 |
@@ -149,7 +159,7 @@ AgentsMeeting/
## 开发流程
1. **架构设计** → `docs/ARCHITECTURE.md`
-2. **代码工程化** → `src/` 按特征优先组织
+2. **核心修改** → `xmpp_agent_core.py`(统一 Bot 核心,所有 Agent 共享)
3. **部署脚本** → `deploy/` 一键启停
4. **测试** → `tests/` 自动化测试
5. **文档** → `docs/` 持续更新
diff --git a/bots/xmpp_bot.py b/bots/xmpp_bot.py
deleted file mode 100644
index eb76f7c..0000000
--- a/bots/xmpp_bot.py
+++ /dev/null
@@ -1,143 +0,0 @@
-#!/usr/bin/env python3
-"""XMPP Bot mohe@yoin.fun - 稳定重连版"""
-import asyncio, logging, ssl, json, urllib.request, os, time, re
-from slixmpp import ClientXMPP
-
-logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
-GATEWAY = "http://localhost:8642/v1/chat/completions"
-API_KEY = "hermes123"
-_opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
-
-class MoheBot(ClientXMPP):
- def __init__(self):
- super().__init__('mohe@yoin.fun', 'hermes123')
- self.add_event_handler('session_bind', self.on_bind)
- self.add_event_handler('message', self.on_msg)
- self.add_event_handler('disconnected', self.on_disconnect)
- self.add_event_handler('connected', self.on_connected)
- ctx = ssl.create_default_context()
- ctx.check_hostname = False
- ctx.verify_mode = ssl.CERT_NONE
- self.ssl_context = ctx
- self.ready = asyncio.Event()
- self._call_seq = 0
- self._muc_joined = False
-
- async def on_connected(self, event):
- logging.info("🔗 TCP连接已建立")
-
- async def on_bind(self, event):
- self.send_presence()
- self.get_roster()
- # 加入内核组(每次重连后重新加入)
- self.plugin['xep_0045'].join_muc('coregroup@conference.yoin.fun', 'mohe')
- self._muc_joined = True
- self.ready.set()
- logging.info("✅ 莫荷 XMPP 上线")
-
- async def on_disconnect(self, event):
- self.ready.clear()
- self._muc_joined = False
- logging.warning("⚠️ XMPP 断线")
-
- async def on_msg(self, msg):
- body = msg['body']
- sender = str(msg['from'])
- msg_type = msg['type']
- if not body:
- return
- if msg_type == 'groupchat':
- if 'mohe@yoin.fun' in sender:
- return
- nickname = sender.split('/')[-1] if '/' in sender else ''
- if nickname in ('hmo', 'xxm'):
- logging.info(f"📩 群消息 [{sender}]: {body[:100]}")
- room = sender.split('/')[0]
- ctx_body = f"[核心群 {room}] {nickname} 说: {body}"
- await self.call_hermes(ctx_body, room, is_group=True)
- return
- if msg_type == 'chat' and 'hmo@yoin.fun' in sender:
- self._call_seq += 1
- logging.info(f"📩 老爸(#{self._call_seq}): {body}")
- await self.call_hermes(body, sender, seq=self._call_seq)
-
- async def call_hermes(self, content, sender, is_group=False, seq=None):
- msg_type = 'groupchat' if is_group else 'chat'
- try:
- payload = json.dumps({
- "model": "hermes-agent",
- "messages": [{"role": "user", "content": content}]
- }).encode()
- req = urllib.request.Request(GATEWAY, data=payload, method="POST")
- req.add_header("Content-Type", "application/json")
- req.add_header("Authorization", f"Bearer {API_KEY}")
- req.add_header("X-Hermes-Session-Id", "xmpp-mohe-v2")
-
- loop = asyncio.get_event_loop()
- result = await loop.run_in_executor(None, lambda: _opener.open(req, timeout=600))
-
- if seq is not None and seq < self._call_seq:
- return
-
- data = json.loads(result.read())
- reply = data.get("choices", [{}])[0].get("message", {}).get("content", "")
- # 处理 __SILENT__ 和 __REPLY__ 标记
- if reply.strip().startswith('__SILENT__'):
- logging.info("⏭️ 决定沉默,不发送")
- return
- reply = re.sub(r'^__REPLY__\s*', '', reply)
- finish = data.get("choices", [{}])[0].get("finish_reason", "")
-
- if reply.strip() and finish != "silent":
- if msg_type == 'groupchat':
- self.send_message(mto=sender, mbody=reply, mtype='groupchat')
- else:
- import subprocess as sp
- from xml.sax.saxutils import escape
- safe = escape(reply)
- sp.run([
- "docker", "exec", "ejabberd", "ejabberdctl", "send_stanza",
- "mohe@yoin.fun", str(sender),
- f"{safe}"
- ], capture_output=True, timeout=10)
- logging.info(f"✅ 回复: {reply[:80]}")
- except Exception as e:
- logging.error(f"❌ 错误: {e}")
-
-async def main():
- retry_delay = 1 # 初始重试间隔(秒)
- max_delay = 60 # 最大重试间隔
- while True:
- try:
- bot = MoheBot()
- bot.register_plugin('xep_0030') # Service Discovery
- bot.register_plugin('xep_0045') # MUC
- bot.register_plugin('xep_0199') # XMPP Ping(保活)
-
- bot.connect(host='127.0.0.1', port=5222)
- await asyncio.wait_for(bot.ready.wait(), timeout=30)
- logging.info("莫荷 XMPP 就绪")
- retry_delay = 1 # 连接成功后重置重试间隔
-
- # 保持运行,断线时自动重连
- while True:
- await asyncio.sleep(15)
- if not bot.is_connected():
- logging.warning("检测到断线,准备重连...")
- break
-
- except asyncio.TimeoutError:
- logging.warning("连接超时,准备重连...")
- except Exception as e:
- logging.error(f"❌ 主循环错误: {e}")
-
- # 指数退避重连:1s → 2s → 4s → 8s → ... → 60s max
- logging.info(f"⏳ 等待 {retry_delay} 秒后重连...")
- await asyncio.sleep(retry_delay)
- retry_delay = min(retry_delay * 2, max_delay)
-
-if __name__ == '__main__':
- try:
- asyncio.run(main())
- except KeyboardInterrupt:
- pass
diff --git a/bots/xmpp_xiaoguo_bot.py b/bots/xmpp_xiaoguo_bot.py
deleted file mode 100644
index 8ff06de..0000000
--- a/bots/xmpp_xiaoguo_bot.py
+++ /dev/null
@@ -1,118 +0,0 @@
-#!/usr/bin/env python3
-"""XMPP Bot xiaoguo@yoin.fun - 跑在 Linux 上"""
-import asyncio, logging, ssl, json, urllib.request, subprocess, re
-from xml.sax.saxutils import escape
-
-logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
-GATEWAY = "http://localhost:8645/v1/chat/completions"
-API_KEY = "hermes123"
-_opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
-
-def send(from_jid, to_jid, body):
- safe = escape(body)
- subprocess.run(["docker","exec","ejabberd","ejabberdctl","send_stanza",
- from_jid, to_jid,
- f"{safe}"
- ], capture_output=True, timeout=10)
-
-class XiaoGuoBot:
- def __init__(self):
- import slixmpp
- self.xmpp = slixmpp.ClientXMPP('xiaoguo@yoin.fun', 'hermes123')
- self.xmpp.add_event_handler('session_bind', self.on_bind)
- self.xmpp.add_event_handler('message', self.on_msg)
- self.xmpp.add_event_handler('disconnected', self.on_disconnect)
- ctx = ssl.create_default_context()
- ctx.check_hostname = False
- ctx.verify_mode = ssl.CERT_NONE
- self.xmpp.ssl_context = ctx
- self.ready = asyncio.Event()
- self._call_seq = 0
-
- async def on_bind(self, event):
- self.xmpp.send_presence()
- self.xmpp.get_roster()
- # 加入内核组
- self.xmpp.plugin['xep_0045'].join_muc('coregroup@conference.yoin.fun', 'xiaoguo')
- self.ready.set()
- logging.info("✅ 小果上线")
-
- async def on_disconnect(self, event):
- self.ready.clear()
- logging.warning("⚠️ 小果断线")
-
- async def on_msg(self, msg):
- body = msg['body']
- sender = str(msg['from'])
- msg_type = msg['type']
- if not body:
- return
- # 群聊
- if msg_type == 'groupchat':
- if 'xiaoguo@yoin.fun' in sender:
- return
- nickname = sender.split('/')[-1] if '/' in sender else ''
- if nickname in ('hmo', 'xxm'):
- logging.info(f"📩 群消息 [{sender}]: {body[:80]}")
- room = sender.split('/')[0]
- ctx_body = f"[核心群 {room}] {nickname} 说: {body}"
- await self.call_hermes(ctx_body, room, is_group=True)
- return
- # 私聊
- if msg_type == 'chat' and 'hmo@yoin.fun' in sender:
- self._call_seq += 1
- logging.info(f"📩 老爸(#{self._call_seq}): {body}")
- await self.call_hermes(body, sender)
-
- async def call_hermes(self, content, sender, is_group=False):
- msg_type = 'groupchat' if is_group else 'chat'
- try:
- payload = json.dumps({
- "model": "hermes-agent",
- "messages": [{"role": "user", "content": f"[xiaoguo] {content}"}]
- }).encode()
- req = urllib.request.Request(GATEWAY, data=payload, method="POST")
- req.add_header("Content-Type", "application/json")
- req.add_header("Authorization", f"Bearer {API_KEY}")
- req.add_header("X-Hermes-Session-Id", "xmpp-xiaoguo")
- loop = asyncio.get_event_loop()
- result = await loop.run_in_executor(None, lambda: _opener.open(req, timeout=600))
- data = json.loads(result.read())
- reply = data.get("choices", [{}])[0].get("message", {}).get("content", "")
- finish = data.get("choices", [{}])[0].get("finish_reason", "")
- # 处理 __SILENT__ 和 __REPLY__ 标记(和莫荷保持一致)
- stripped = reply.strip()
- if stripped.startswith('__SILENT__'):
- logging.info("⏭️ 小果决定沉默,不发送")
- return
- # 安全网:过滤沉默宣告类文本(防止 LLM 不按规则走)
- if re.match(r'^(保持安静|不插嘴|我沉默了|收到|明白|好的|在的?|在呢|来了|沉默|安静)([,,!!。.?\s]|$)', stripped, re.IGNORECASE):
- logging.info(f"⏭️ 小果沉默宣告被拦截: {stripped[:60]}")
- return
- reply = re.sub(r'^__REPLY__\s*', '', reply)
- if reply.strip() and finish != "silent":
- if is_group:
- self.xmpp.send_message(mto=sender, mbody=reply, mtype='groupchat')
- else:
- send("xiaoguo@yoin.fun", sender, reply)
- logging.info(f"✅ 小果回复: {reply[:80]}")
- except Exception as e:
- logging.error(f"❌ 小果错误: {e}")
-
-async def main():
- while True:
- try:
- z = XiaoGuoBot()
- z.xmpp.register_plugin('xep_0030')
- z.xmpp.register_plugin('xep_0045')
- z.xmpp.register_plugin('xep_0199')
- z.xmpp.connect(host='127.0.0.1', port=5222)
- await asyncio.wait_for(z.ready.wait(), timeout=30)
- logging.info("小果就绪")
- await asyncio.Event().wait()
- except Exception as e:
- logging.error(f"小果main错误: {e}")
- await asyncio.sleep(3)
-
-if __name__ == '__main__':
- asyncio.run(main())
diff --git a/bots/xmpp_zhiwei_bot.py b/bots/xmpp_zhiwei_bot.py
deleted file mode 100644
index 219a7dc..0000000
--- a/bots/xmpp_zhiwei_bot.py
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/usr/bin/env python3
-"""XMPP Bot zhiwei@yoin.fun - Hermes API 版(稳定重连版)"""
-import asyncio, logging, ssl, json, urllib.request, os, subprocess, time
-from xml.sax.saxutils import escape
-
-logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
-GATEWAY = "http://localhost:8643/v1/chat/completions"
-API_KEY = "hermes123"
-_opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
-
-def send(from_jid, to_jid, body):
- safe = escape(body)
- subprocess.run(["docker","exec","ejabberd","ejabberdctl","send_stanza",
- from_jid, to_jid,
- f"{safe}"
- ], capture_output=True, timeout=10)
-
-class ZhiweiBot:
- def __init__(self):
- import slixmpp
- self.xmpp = slixmpp.ClientXMPP('zhiwei@yoin.fun', 'hermes123')
- self.xmpp.add_event_handler('session_bind', self.on_bind)
- self.xmpp.add_event_handler('message', self.on_msg)
- self.xmpp.add_event_handler('disconnected', self.on_disconnect)
- self.xmpp.add_event_handler('connected', self.on_connected)
- # 启用slixmpp内置自动重连(已禁用—与手动重连冲突)
- # self.xmpp.auto_reconnect = True
- ctx = ssl.create_default_context(); ctx.check_hostname = False; ctx.verify_mode = ssl.CERT_NONE
- self.xmpp.ssl_context = ctx
- self.ready = asyncio.Event()
- self._call_seq = 0
-
- async def on_connected(self, event):
- logging.info("🔗 知微TCP连接已建立")
-
- async def on_bind(self, event):
- self.xmpp.send_presence(); self.xmpp.get_roster(); self.ready.set()
- logging.info("✅ 知微上线")
-
- async def on_disconnect(self, event):
- self.ready.clear()
- logging.warning("⚠️ 知微断线")
- # 不要在这里调用 self.xmpp.disconnect(),让 auto_reconnect 处理
-
- async def on_msg(self, msg):
- body = msg['body']; sender = str(msg['from'])
- if not body or msg['type'] != 'chat': return
- if 'hmo@yoin.fun' in sender:
- self._call_seq += 1
- logging.info(f"📩 老爸(#{self._call_seq}): {body}")
- try:
- payload = json.dumps({
- "model":"hermes-agent",
- "messages":[{"role":"user","content":f"[zhiwei] {body}"}]
- }).encode()
- req = urllib.request.Request(GATEWAY, data=payload, method="POST")
- req.add_header("Content-Type","application/json")
- req.add_header("Authorization",f"Bearer {API_KEY}")
- req.add_header("X-Hermes-Session-Id","xmpp-zhiwei")
- loop = asyncio.get_event_loop()
- result = await loop.run_in_executor(None, lambda: _opener.open(req, timeout=600))
- data = json.loads(result.read())
- reply = data.get("choices",[{}])[0].get("message",{}).get("content","")
- finish = data.get("choices",[{}])[0].get("finish_reason","")
- if reply.strip() and finish != "silent":
- send("zhiwei@yoin.fun", sender, reply)
- logging.info(f"✅ 知微回复: {reply[:80]}")
- except Exception as e:
- logging.error(f"❌ 知微错误: {e}")
-
-async def main():
- retry_delay = 1
- max_delay = 60
- while True:
- try:
- z = ZhiweiBot()
- z.xmpp.register_plugin('xep_0030'); z.xmpp.register_plugin('xep_0199')
- z.xmpp.connect(host='127.0.0.1', port=5222)
- await asyncio.wait_for(z.ready.wait(), timeout=30); logging.info("知微就绪")
- retry_delay = 1
-
- # 保持运行,断线时自动重连
- while True:
- await asyncio.sleep(15)
- if not z.xmpp.is_connected():
- logging.warning("知微连接丢失,准备重连...")
- break
-
- except Exception as e:
- logging.error(f"知微main错误: {e}")
-
- # 指数退避重连
- logging.info(f"⏳ 知微等待 {retry_delay} 秒后重连...")
- await asyncio.sleep(retry_delay)
- retry_delay = min(retry_delay * 2, max_delay)
-
-if __name__ == '__main__':
- asyncio.run(main())
diff --git a/docs/CLEANUP_PLAN.md b/docs/CLEANUP_PLAN.md
new file mode 100644
index 0000000..c9cbfc6
--- /dev/null
+++ b/docs/CLEANUP_PLAN.md
@@ -0,0 +1,268 @@
+# AgentsMeeting 代码清理方案
+
+> 目标:消除 XMPP Bot 代码碎片化,统一架构。所有 Agent 共享同一份核心代码。
+
+---
+
+## 1. 现状问题
+
+### 1.1 四套独立 Bot 实现
+
+| # | 位置 | 行数 | 状态 | 用途 | LLM 调用 |
+|---|------|-------|------|------|----------|
+| 1 | `gateway/scripts/xmpp_bot.py` | 943 | 活跃 | xxm bot | chat_bridge |
+| 2 | `xmpp_agent_core.py`(root) | 348 | 活跃 | mohe/zhiwei/xiaoguo | Hermes API |
+| 3 | `bots/xmpp_bot.py` + 姐妹文件 | 359 | 孤儿 | mohe/zhiwei/xiaoguo(旧版) | Hermes API |
+| 4 | `xmpp_bot_rest.py` | 72 | 废弃 | ejabberd REST 实验 | Hermes API |
+
+**问题**:同样的 XMPP 连接、MUC join、消息处理逻辑,在 4 套代码中重复实现。修复了其中一个,其他三个还是坏的。
+
+### 1.2 基础设施碎片化
+
+bot_base.py(263 行)已经提炼出完善的 XMPP 基类,但没有任何 bot 实际使用它:
+
+```
+bot_base.py 有:
+ ✓ PID 锁(proc_guard)
+ ✓ 自动重连 + MUC join 重试
+ ✓ 消息去重(dedup)
+ ✓ 消息合并(batching)
+ ✓ silence/shutup 协议
+ ✓ send_group / send_private
+
+没有被任何 bot 使用 ✗
+```
+
+### 1.3 重复/孤儿文件
+
+| 文件 | 问题 |
+|------|------|
+| `xmpp_bot.py`(root) | `xmpp_agent_core.py` 的较旧副本 |
+| `xmpp_bot_rest.py` | 废弃方案 |
+| `bots/` 目录(3 文件) | 被 xmpp_agent_core.py 取代 |
+| `hermes_state.py`(root) | 孤儿,import 已断 |
+| `scripts/`(6 文件) | 一次性脚本 |
+| `src/bots/{mohe,xiaoguo,xxm,zhiwei}/` | 空目录,重构未完成 |
+| `src/channels/wechat/` | 空目录 |
+
+---
+
+## 2. 目标架构
+
+```
+AgentsMeeting/
+├── xmpp_agent_core.py ← 唯一 Bot 核心(统一架构)
+│ 支持 --agent xxm|mohe|zhiwei|xiaoguo
+│ 包含:PID锁/重连/MUC/dedup/batching/
+│ coordinator协议/HTTP桥
+│
+├── xxm_bot.py ← 兼容入口(python xxm_bot.py = python xmpp_agent_core.py --agent xxm)
+├── mohe_bot.py ← 兼容入口
+├── xiaoguo_bot.py ← 兼容入口
+├── zhiwei_bot.py ← 兼容入口
+│
+└── gateway/
+ └── scripts/
+ ├── chat_bridge.py ← xxm LLM 桥(不变)
+ ├── session_router.py ← 消息路由(不变)
+ └── ...(其他脚本不变)
+```
+
+**核心原则**:
+- `xmpp_agent_core.py` = 唯一核心,所有 Agent 共享
+- 每个 Agent 只有 LLM 调用方式不同(在配置中定义)
+- coordinator/GRANT/REVOKE 协议只有一个实现,所有 bot 统一遵守
+- `bot_base.py` 的基础设施合并进 `xmpp_agent_core.py`
+
+---
+
+## 3. 迁移步骤
+
+### Phase 0:确认基线(先保证现有 bot 正常工作)
+
+- [ ] 确认 `gateway/scripts/xmpp_bot.py` 当前运行正常,能收发群消息
+- [ ] 确认 `xmpp_agent_core.py`(mohe)当前运行正常
+
+### Phase 1:合并基础设施到 xmpp_agent_core.py
+
+将 `bot_base.py` 和 `gateway/scripts/xmpp_bot.py` 中的公共功能合并进 `xmpp_agent_core.py`:
+
+| 功能 | 来源 | 说明 |
+|------|------|------|
+| PID 锁(proc_guard) | bot_base.py / gateway | 防重复启动 |
+| 消息去重(dedup) | bot_base.py / gateway | 防重复处理同一条消息 |
+| 消息合并(batching) | bot_base.py / gateway | 3s debounce,将时间相近的消息合并为一次 LLM 调用 |
+| 自动重连 | bot_base.py / gateway | slixmpp auto_reconnect + reconnect_max_delay |
+| MUC join 双重保证 | gateway | `join_muc()` + `send_raw(presence)` |
+| MAM 启动恢复 | gateway | 启动时拉取最近 50 条历史消息补上下文 |
+| HTTP 桥丰富 API | gateway | /health, /presence, /messages, POST /send |
+| sub-agent exec | gateway | `##exec:command##` 用于工具调用 |
+| delayed reply | gateway | `##delay:N##` 延迟回复 |
+
+### Phase 2:统一 coordinator 协议
+
+gateway/scripts/xmpp_bot.py 已经实现了 coordinator/GRANT/REVOKE 协议。将这套实现整合进 `xmpp_agent_core.py`,确保所有 Agent 使用同一份协议代码。
+
+### Phase 3:添加 xxm 作为支持 Agent
+
+在 `xmpp_agent_core.py` 的 AGENTS 配置中添加 xxm:
+
+```python
+AGENTS = {
+ "mohe": {..., "gateway": "http://localhost:8642/v1/chat/completions"},
+ "zhiwei": {..., "gateway": "http://localhost:8643/v1/chat/completions"},
+ "xiaoguo":{..., "gateway": "http://localhost:8645/v1/chat/completions"},
+ "xxm": {..., "bridge": "chat_bridge"}, # xxm 走本地 chat_bridge
+}
+```
+
+xxm 的 LLM 调用方式不同(不走 HTTP Hermes API,走本地 `chat_bridge.py`),所以需要在核心中抽象 LLM 调用层:
+
+```python
+def _call_llm(self, content: str) -> str:
+ if self.cfg.get("bridge") == "chat_bridge":
+ return _call_chat_bridge(content) # 本地调用
+ else:
+ return _call_hermes_api(content, self.cfg["gateway"]) # HTTP 调用
+```
+
+### Phase 4:删除孤儿文件
+
+| 文件 | 操作 |
+|------|------|
+| `xmpp_bot.py`(root) | 移到 trashbox |
+| `xmpp_bot_rest.py` | 移到 trashbox |
+| `bots/` 整个目录 | 移到 trashbox |
+| `hermes_state.py`(root) | 移到 trashbox |
+| `scripts/gen_prd.py` | 移到 trashbox |
+| `scripts/write_prd.py` | 移到 trashbox |
+| `scripts/gen_prd_v02.py` | 移到 trashbox |
+| `scripts/write_prd_v02.py` | 移到 trashbox |
+| `scripts/build_prd.py` | 移到 trashbox |
+| `scripts/test_echo.py` | 移到 trashbox |
+| `scripts/gen_b64.py` | 保留(可能是通用工具) |
+
+### Phase 5:清理空目录
+
+| 目录 | 操作 |
+|------|------|
+| `src/bots/mohe/` | 删除 |
+| `src/bots/xiaoguo/` | 删除 |
+| `src/bots/xxm/` | 删除 |
+| `src/bots/zhiwei/` | 删除 |
+| `src/channels/wechat/` | 删除 |
+| `gateway/assets/` | 删除 |
+
+### Phase 6:整理 gateway/temp/
+
+保留:PID 文件、活跃缓存(.bridge_context.jsonl、.model_cache.json)
+其余 >200 个临时文件:移到 temp/archive/ 或按需清理。
+
+### Phase 7:文档更新
+
+- [ ] 更新 `docs/ARCHITECTURE.md` —— 反映统一架构
+- [ ] 更新 `README.md` —— 更新项目结构描述
+- [ ] 更新 `config/agents.yaml` —— 保持准确
+
+---
+
+## 4. 核心文件变更清单
+
+### 修改:xmpp_agent_core.py
+
+从 348 行扩展为 ~600 行,新增:
+
+```
+新增功能模块:
+├── proc_guard PID 锁
+├── 消息去重(_dedup_cache)
+├── 消息合并(_batch_* 系统,3s debounce)
+├── MAM 启动恢复(_fetch_mam_history)
+├── HTTP 桥丰富版(/health, /presence, /messages, POST /send)
+├── sub-agent exec(##exec:command##)
+├── delayed reply(##delay:N##)
+├── 抽象 LLM 调用层(支持 chat_bridge + Hermes API)
+└── coordinator/GRANT/REVOKE 协议(从 gateway 版提升)
+```
+
+新增 AGENTS 配置条目:
+
+```python
+"xxm": {
+ "jid": "xxm@yoin.fun",
+ "password": "hermes123",
+ "nick": "xxm",
+ "name_cn": "笑笑",
+ "bridge": "chat_bridge", # 使用本地 chat_bridge
+ "http_port": 5802, # HTTP 桥端口
+ "muc_rooms": [
+ "coregroup@conference.yoin.fun",
+ "jujidina@conference.yoin.fun",
+ ],
+ "server": "192.168.1.246", # LAN 直连
+ "port": 5222,
+ "session_id": "ses_xxm_xmpp",
+ "mention": "@xxm/@笑笑",
+}
+```
+
+### 创建:xxm_bot.py
+
+```python
+#!/usr/bin/env python3
+"""Wrapper for xmpp_agent_core.py --agent xxm"""
+import sys, os
+sys.argv = [sys.argv[0], '--agent', 'xxm']
+exec(open(os.path.join(os.path.dirname(__file__), 'xmpp_agent_core.py')).read())
+```
+
+(与现有的 mohe_bot.py / zhiwei_bot.py / xiaoguo_bot.py 模式一致)
+
+### 保留不变
+
+| 文件 | 说明 |
+|------|------|
+| `gateway/scripts/chat_bridge.py` | xxm LLM 桥,不迁移 |
+| `gateway/scripts/session_router.py` | 消息路由,不迁移 |
+| `gateway/scripts/wechat_agent.py` | 微信桥接,独立组件 |
+| `gateway/scripts/qq_bot.py` | QQ bot,独立组件 |
+| `gateway/scripts/vc_webhook.py` | VoceChat webhook,独立组件 |
+| `gateway/scripts/dashboard.py` | 管理门户,独立组件 |
+| `gateway/scripts/health_check_xxm.py` | 健康检查,独立组件 |
+| `gateway/scripts/xmpp_watchdog.py` | 看门狗,独立组件 |
+| `gateway/scripts/mohe_watcher.py` | 莫荷消息监控,独立组件 |
+| `gateway/scripts/api_proxy.py` | API 代理,独立组件 |
+
+---
+
+## 5. 测试计划
+
+### 5.1 单元测试
+
+- [ ] `python tests/test_core.py` —— bot_base 测试保持通过
+- [ ] 新增测试:合并后的 xmpp_agent_core.py 的 LLM 抽象层
+
+### 5.2 集成测试
+
+- [ ] `python xmpp_agent_core.py --agent xxm` 能启动、连接、加入 MUC
+- [ ] `python xmpp_agent_core.py --agent mohe` 能启动、连接、加入 MUC
+- [ ] 各 Agent 在 coregroup 中能正确响应 @mention
+- [ ] coordinator/GRANT/REVOKE 协议各 Agent 一致
+
+### 5.3 部署验证
+
+- [ ] `python tests/verify_deploy.py` 通过
+- [ ] gateway/scripts/xmpp_bot.py → 改为调用 xmpp_agent_core.py --agent xxm
+- [ ] Linux 端更新 systemd service,使其指向新路径
+
+---
+
+## 6. 风险与回滚
+
+| 风险 | 缓解 |
+|------|------|
+| 合并后 xxm bot 不工作 | Phase 0 先备份当前 `gateway/scripts/xmpp_bot.py` |
+| 协议行为不一致 | coordinator 协议从 gateway 版提取,与 xmpp_agent_core 现有逻辑逐行对比 |
+| 启动命令需要改 | 兼容入口(xxm_bot.py 等)保持 CLI 不变 |
+
+**回滚方案**:Phase 1-4 每步完成后验证。出问题从 trashbox 恢复删除的文件。
diff --git a/gateway/api/history_api.py b/gateway/api/history_api.py
index 55ad551..103d3e6 100644
--- a/gateway/api/history_api.py
+++ b/gateway/api/history_api.py
@@ -66,9 +66,9 @@ def get_db_handle():
dbs = r.get("data") or []
for db in dbs:
dbname = db.get("databaseName", "")
- if "MSG" in dbname or "Msg" in dbname:
- db_handle_cache = db.get("handle")
- return db_handle_cache
+ if dbname.startswith("MSG") and "Media" not in dbname:
+ db_handle_cache = db.get("handle")
+ return db_handle_cache
return None
@@ -94,24 +94,42 @@ def query_history(wxid, limit=10):
return None
limit_val = min(int(limit), 200)
sql = (
- f"SELECT CreateTime, IsSender, Type, SubType, StrContent, DisplayContent "
- f"FROM MSG WHERE StrTalker='{wxid}' AND Type IN (1,49) "
+ f"SELECT CreateTime, IsSender, Type, SubType, StrContent, DisplayContent, CompressContent, BytesExtra "
+ f"FROM MSG WHERE StrTalker='{wxid}' AND Type IN ('1','49') "
f"ORDER BY CreateTime DESC LIMIT {limit_val}"
)
r = wxpost("/api/execSql", {"dbHandle": h, "sql": sql}, timeout=15)
data = r.get("data") or []
if not data or len(data) < 2:
return None
- # Skip header row, reverse to chronological order
- rows = data[1:]
+ # wxhelper returns [{value: [cols]}, {value: [row1]}, ...]
+ rows = [item.get("value", item) if isinstance(item, dict) else item for item in data]
+ rows = rows[1:] # skip header
rows.reverse()
results = []
for row in rows:
content = (row[4] or "").strip() if len(row) > 4 else ""
if not content and len(row) > 5:
content = (row[5] or "").strip()
+ # Type 49 (article link): extract URL from CompressContent or BytesExtra
+ if not content and str(row[2]) == "49":
+ try:
+ import re
+ # Try BytesExtra first (row[7])
+ for idx in [7, 6]:
+ if idx < len(row) and row[idx]:
+ text = str(row[idx])
+ urls = re.findall(r'https?://[^\s\x00-\x1f<>\"\']{10,}', text)
+ if urls:
+ content = urls[0]
+ break
+ except:
+ pass
if not content:
- continue
+ if str(row[2]) == "49":
+ content = "[文章链接]"
+ else:
+ continue
results.append({
"CreateTime": row[0],
"IsSender": row[1],
@@ -181,29 +199,25 @@ def get_recent_chats(limit=20):
if not h:
return []
sql = (
- f"SELECT StrTalker, MAX(CreateTime) as last_time, COUNT(*) as msg_count "
- f"FROM MSG WHERE Type IN (1,49) "
- f"GROUP BY StrTalker ORDER BY last_time DESC LIMIT {min(limit, 50)}"
+ f"SELECT DISTINCT StrTalker FROM MSG WHERE Type IN ('1','49') "
+ f"LIMIT {min(limit, 50)}"
)
r = wxpost("/api/execSql", {"dbHandle": h, "sql": sql}, timeout=15)
data = r.get("data") or []
if not data or len(data) < 2:
return []
+ rows = [item.get("value", item) if isinstance(item, dict) else item for item in data]
results = []
- for row in data[1:]:
+ for row in rows[1:]:
wxid = (row[0] or "").strip()
- if not wxid or wxid in ("fmessage", "weixin", "wechat", "filehelper"):
+ if not wxid or wxid in ("fmessage", "weixin", "wechat", "filehelper", "medianote", "floatbottle", "qmessage"):
continue
- if wxid.startswith("gh_"):
- continue
- ts = int(row[1]) if row[1] else 0
- count = int(row[2]) if len(row) > 2 and row[2] else 0
results.append({
"wxid": wxid,
"nickname": get_nickname(wxid),
- "last_message_time": datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S") if ts else None,
- "last_message_ts": ts,
- "message_count": count,
+ "last_message_time": None,
+ "last_message_ts": 0,
+ "message_count": 0,
})
return results
diff --git a/gateway/scripts/wechat_agent.py b/gateway/scripts/wechat_agent.py
index 6b72438..5935a36 100644
--- a/gateway/scripts/wechat_agent.py
+++ b/gateway/scripts/wechat_agent.py
@@ -21,7 +21,8 @@ if not _lock.ok:
BOT_WXID = "wxid_5bhmquvkbude22"
BLOCK_WXIDS = {"fmessage", "weixin", "wechat"} # ϵͳ?˺?/???Ŷӣ----ظ?
WX_API = "http://127.0.0.1:19088"
-PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+PROJECT_ROOT = os.path.dirname(SCRIPT_DIR)
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
TEMP_DIR = os.path.join(PROJECT_ROOT, "temp")
LOG_FILE = os.path.join(LOG_DIR, "wechat_agent.log")
@@ -155,8 +156,8 @@ HERMES_KEY = "hermes123"
SENSENOVA_KEY = "sk-aRNj3UwKSLPsDfh15QNTPwbHxahblfaO"
SENSENOVA_URL = "https://token.sensenova.cn/v1"
-INJECTOR = r"D:\F\NewI\opencode\daily-workspace\projects\wechat-hermes-gateway\tools\Injector_x64.exe"
-WXHELPER_DLL = r"D:\F\NewI\opencode\daily-workspace\projects\wechat-hermes-gateway\tools\wxhelper_official_39581.dll"
+INJECTOR = os.path.join(SCRIPT_DIR, "..", "tools", "Injector_x64.exe")
+WXHELPER_DLL = os.path.join(SCRIPT_DIR, "..", "tools", "wxhelper_official_39581.dll")
def log(m):
with open(LOG_FILE, "a", encoding="utf-8") as f:
@@ -569,8 +570,24 @@ def process_msg(raw_data):
ct = d.get("content", "") or d.get("msg", "") or d.get("text", "")
msg_type = d.get("type", 1)
is_self = d.get("isSelf", 0) or d.get("self", 0)
+ # DEBUG: capture Type 49 full XML for URL analysis
+ if msg_type == 49:
+ try:
+ with open(LOG_DIR + "/t49_xml.txt", "a", encoding="utf-8") as _f:
+ _f.write(f"\n=== {time.time()} type=49 from={fu} ===\n{ct[:10000]}\n")
+ except: pass
if "@chatroom" in fu:
log(f"GROUP RAW DUMP: keys={list(d.keys())} ct_len={len(ct)} ct[:100]={ct[:100]}")
+ # DEBUG: capture full raw data for quote analysis
+ try:
+ with open(LOG_DIR + "/group_raw.jsonl", "a", encoding="utf-8") as _f:
+ _f.write(json.dumps({k: str(v)[:2000] for k, v in d.items()}, ensure_ascii=False) + "\n")
+ except: pass
+ # DEBUG: capture all raw msgs for field analysis
+ try:
+ with open(LOG_DIR + "/all_raw.jsonl", "a", encoding="utf-8") as _f:
+ _f.write(json.dumps({k: str(v)[:500] for k, v in d.items()}, ensure_ascii=False) + "\n")
+ except: pass
if not fu or not ct or fu == BOT_WXID or fu in BLOCK_WXIDS or fu.startswith("gh_") or is_self:
log(f"SKIP: fu={fu} self={is_self}")
return
@@ -608,6 +625,64 @@ def process_msg(raw_data):
else:
log(f"-> {fu}: skip (blank image response)")
return
+ # Type 49 (forwarded article) - extract URL and process via article_processor
+ if msg_type == 49 and ct.strip().startswith(" first, then , then
+ urls = re.findall(r'(https?://mp\.weixin\.qq\.com[^<]+)', ct)
+ if not urls:
+ urls = re.findall(r'(https?://mp\.weixin\.qq\.com[^<]+)', ct)
+ if not urls:
+ urls = re.findall(r'(https?://mp\.weixin\.qq\.com[^<]+)', ct)
+ url = urls[0] if urls else None
+ # Extract title from XML
+ titles = re.findall(r'(.*?)', ct)
+ title = titles[0] if titles else ""
+ # Extract description
+ descs = re.findall(r'(.*?)', ct)
+ desc = descs[0] if descs else ""
+
+ if url:
+ log(f"ARTICLE URL: {url}")
+ # Call article_processor on localhost
+ import urllib.request as ur
+ req = ur.Request("http://127.0.0.1:5810/process",
+ data=json.dumps({"url": url}).encode("utf-8"),
+ headers={"Content-Type": "application/json"})
+ with ur.urlopen(req, timeout=180) as resp:
+ result = json.loads(resp.read().decode("utf-8"))
+ if result.get("status") == "ok":
+ content = result.get("content", "")[:3000]
+ title = result.get("title", "")
+ images = result.get("images_ocr", 0)
+ enriched = f"[老莫转发了一篇文章{(chr(10)+'标题: '+title) if title else ''},{images}张图片已OCR]\n\n{content}"
+ log(f"ARTICLE processed: {len(content)} chars")
+ reply = call_hermes(fu, enriched)
+ if reply and reply.strip():
+ log(f"-> {fu}: {reply[:50]}")
+ send_wx(fu, reply.strip())
+ return
+ else:
+ log(f"ARTICLE process failed: {result.get('error','')[:100]}")
+ # Fallback: send title + description
+ fallback = f"[老莫转发了一篇文章]{(chr(10)+'标题: '+title) if title else ''}{(chr(10)+'摘要: '+desc[:200]) if desc else ''}\n(全文抓取失败: {result.get('error','')[:60]})"
+ reply = call_hermes(fu, fallback)
+ if reply and reply.strip():
+ send_wx(fu, reply.strip())
+ return
+ else:
+ # No URL found, send title + description
+ if title:
+ log(f"ARTICLE: no URL, sending title+desc")
+ fallback = f"[老莫转发了一篇文章]{(chr(10)+'标题: '+title) if title else ''}{(chr(10)+'摘要: '+desc[:200]) if desc else ''}"
+ reply = call_hermes(fu, fallback)
+ if reply and reply.strip():
+ send_wx(fu, reply.strip())
+ return
+ except Exception as e:
+ log(f"ARTICLE handler error: {e}")
+ # Fall through to text handler
# Text - prepend sender wxid+name so Hermes knows who's talking
sender_name = get_nickname(fu)
chat_type = "Group" if "@chatroom" in fu else "Private"
diff --git a/gateway/scripts/xmpp_bot.py b/gateway/scripts/xmpp_bot.py
index d23e5b2..203783d 100644
--- a/gateway/scripts/xmpp_bot.py
+++ b/gateway/scripts/xmpp_bot.py
@@ -26,8 +26,8 @@ if not _lock.ok:
# ── Config ──
JID = "xxm@yoin.fun"
PASSWORD = "hermes123"
-SERVER = "xmpp.yoin.fun"
-PORT = 3021
+SERVER = "192.168.1.246"
+PORT = 5222
ATTACH_SESSION = "ses_xxm_xmpp"
MUC_ROOMS = [
"coregroup@conference.yoin.fun", # core group chat
@@ -696,23 +696,22 @@ if __name__ == "__main__":
bot_nick = JID.split("@")[0]
async def _join_silent():
for room_jid in MUC_ROOMS:
- for attempt in range(3):
- try:
- # Use join_muc_wait to ensure room join completes
- await self.plugin['xep_0045'].join_muc_wait(room_jid, bot_nick, timeout=60)
- log(f"Joined {room_jid} (silent)")
- break
- except asyncio.TimeoutError:
- log(f"MUC join timeout ({attempt+1}/3) for {room_jid}")
- if attempt == 2:
- log(f"MUC setup failed for {room_jid} after 3 attempts")
- await asyncio.sleep(5)
- else:
- await asyncio.sleep(3)
- except Exception as e:
- log(f"MUC setup failed for {room_jid}: {e} (type={type(e).__name__})")
- await asyncio.sleep(5)
- break
+ nick = bot_nick
+ try:
+ # Use join_muc (non-waiting) to register plugin state
+ self.plugin['xep_0045'].join_muc(room_jid, nick)
+ # Also send raw presence as backup
+ presence = (
+ f""
+ f""
+ f""
+ f""
+ )
+ self.send_raw(presence)
+ log(f"Joined {room_jid} (async)")
+ except Exception as e:
+ log(f"MUC join failed for {room_jid}: {type(e).__name__}: {e}")
+ await asyncio.sleep(2)
# After joining, query MAM for recent history
await asyncio.sleep(3) # wait for MUC join to propagate
await _fetch_mam_history()
diff --git a/hermes_state.py b/hermes_state.py
deleted file mode 100644
index de8526b..0000000
--- a/hermes_state.py
+++ /dev/null
@@ -1,4372 +0,0 @@
-#!/usr/bin/env python3
-"""
-SQLite State Store for Hermes Agent.
-
-Provides persistent session storage with FTS5 full-text search, replacing
-the per-session JSONL file approach. Stores session metadata, full message
-history, and model configuration for CLI and gateway sessions.
-
-Key design decisions:
-- WAL mode for concurrent readers + one writer (gateway multi-platform)
-- FTS5 virtual table for fast text search across all session messages
-- Compression-triggered session splitting via parent_session_id chains
-- Batch runner and RL trajectories are NOT stored here (separate systems)
-- Session source tagging ('cli', 'telegram', 'discord', etc.) for filtering
-"""
-
-import json
-import logging
-import random
-import re
-import sqlite3
-import threading
-import time
-from pathlib import Path
-
-from agent.memory_manager import sanitize_context
-from hermes_constants import get_hermes_home
-from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar
-
-logger = logging.getLogger(__name__)
-
-T = TypeVar("T")
-
-DEFAULT_DB_PATH = get_hermes_home() / "state.db"
-
-SCHEMA_VERSION = 15
-
-# ---------------------------------------------------------------------------
-# WAL-compatibility fallback
-# ---------------------------------------------------------------------------
-# SQLite's WAL mode requires shared-memory (mmap) coordination and fcntl
-# byte-range locks that don't reliably work on network filesystems (NFS,
-# SMB/CIFS, some FUSE mounts, WSL1). Upstream documents this explicitly:
-# https://www.sqlite.org/wal.html#sometimes_queries_return_sqlite_busy_in_wal_mode
-#
-# On those filesystems ``PRAGMA journal_mode=WAL`` raises
-# ``sqlite3.OperationalError: locking protocol`` (SQLITE_PROTOCOL). If we
-# propagate that, every feature backed by state.db / kanban.db breaks
-# silently — /resume, /title, /history, /branch, kanban dispatcher, etc.
-#
-# Instead, fall back to ``journal_mode=DELETE`` (the pre-WAL default) which
-# works on NFS. Concurrency drops — concurrent readers are blocked during
-# a write — but the feature works.
-_WAL_INCOMPAT_MARKERS = (
- "locking protocol", # SQLITE_PROTOCOL on NFS/SMB
- "not authorized", # Some FUSE mounts block WAL pragma outright
-)
-
-# Last SessionDB() init error, per-process. Surfaced in /resume and
-# related slash-command error strings so users know WHY the DB is
-# unavailable instead of getting a bare "Session database not available."
-# Only SessionDB.__init__ writes to this; kanban_db.connect() failures
-# do not update it (by design — kanban failures are reported via their
-# own caller's error handling, not via /resume-style slash commands).
-_last_init_error: Optional[str] = None
-_last_init_error_lock = threading.Lock()
-
-# Paths for which we've already logged a WAL-fallback WARNING. Without
-# this, kanban_db.connect() (called on every kanban operation — see
-# hermes_cli/kanban_db.py for ~30 call sites) would re-log the same
-# filesystem-incompat warning on every connection, filling errors.log.
-_wal_fallback_warned_paths: set[str] = set()
-_wal_fallback_warned_lock = threading.Lock()
-
-_FTS_TRIGGERS = (
- "messages_fts_insert",
- "messages_fts_delete",
- "messages_fts_update",
- "messages_fts_trigram_insert",
- "messages_fts_trigram_delete",
- "messages_fts_trigram_update",
-)
-
-
-def _set_last_init_error(msg: Optional[str]) -> None:
- """Record (or clear) the most recent state.db init failure.
-
- Thread-safe via _last_init_error_lock. Callers pass a message to
- record a failure or None to clear. SessionDB.__init__ only calls
- this to SET on failure — it deliberately does NOT clear on success,
- because in a multi-threaded caller (e.g. gateway / web_server per-
- request SessionDB() instantiation), a concurrent successful open
- racing past a different thread's failure would erase the cause
- string that thread's /resume handler is about to format. Explicit
- clears (e.g. test fixtures) are still supported by passing None.
- """
- global _last_init_error
- with _last_init_error_lock:
- _last_init_error = msg
-
-
-def get_last_init_error() -> Optional[str]:
- """Return the most recent state.db init failure, if any.
-
- Slash-command handlers (``/resume``, ``/title``, ``/history``, ``/branch``)
- call this to surface the underlying cause in their error messages when
- ``_session_db is None``. Returns ``None`` if SessionDB initialized
- successfully (or hasn't been attempted).
- """
- return _last_init_error
-
-
-def format_session_db_unavailable(prefix: str = "Session database not available") -> str:
- """Format a user-facing 'session DB unavailable' message with cause.
-
- When ``SessionDB()`` init fails, callers set ``_session_db = None`` and
- several slash commands (/resume, /title, /history, /branch) previously
- responded with a bare ``"Session database not available."`` — no
- indication of WHY. This helper includes the captured cause (typically
- ``"locking protocol"`` from NFS/SMB) and points users at the known
- culprit so they can fix it themselves.
-
- Example output:
- Session database not available: locking protocol (state.db may be
- on NFS/SMB — see https://www.sqlite.org/wal.html).
- """
- cause = get_last_init_error()
- if not cause:
- return f"{prefix}."
- hint = ""
- if any(marker in cause.lower() for marker in _WAL_INCOMPAT_MARKERS):
- hint = " (state.db may be on NFS/SMB/FUSE — see https://www.sqlite.org/wal.html)"
- return f"{prefix}: {cause}{hint}."
-
-
-def _on_disk_journal_mode(conn: sqlite3.Connection) -> Optional[str]:
- """Read the journal mode from the SQLite DB header on disk.
-
- Returns the mode string (e.g. ``"wal"``, ``"delete"``), or ``None``
- if the value cannot be determined (new DB, or PRAGMA read failed).
- """
- try:
- row = conn.execute("PRAGMA journal_mode").fetchone()
- except sqlite3.OperationalError:
- return None
- if row is None:
- return None
- mode = row[0]
- if isinstance(mode, bytes): # defensive: sqlite3 occasionally returns bytes
- try:
- mode = mode.decode("ascii")
- except UnicodeDecodeError:
- return None
- return str(mode).strip().lower() if mode is not None else None
-
-
-def apply_wal_with_fallback(
- conn: sqlite3.Connection,
- *,
- db_label: str = "state.db",
-) -> str:
- """Set ``journal_mode=WAL`` on ``conn``, falling back to DELETE on failure.
-
- Returns the journal mode actually set (``"wal"`` or ``"delete"``).
-
- On WAL-incompatible filesystems (NFS, SMB, some FUSE), SQLite raises
- ``OperationalError("locking protocol")`` when setting WAL. We fall
- back to DELETE mode — the pre-WAL default, which works on NFS — and
- log one WARNING explaining why.
-
- The WARNING is deduplicated per ``db_label``: repeated connections
- to the same underlying DB (e.g. kanban_db.connect() which is called
- on every kanban operation) log once per process, not once per call.
- Different db_labels log independently, so state.db and kanban.db
- each get one warning on the same NFS mount.
-
- Shared by :class:`SessionDB` and ``hermes_cli.kanban_db.connect`` so
- both databases get identical fallback behavior.
-
- Never downgrades to DELETE if the on-disk DB header reports WAL — see _on_disk_journal_mode.
- """
- # Read-only probe — no flock, no checkpoint, no WAL/SHM unlink.
- # Skipping the set-pragma prevents WAL-init from unlinking files other connections hold open.
- try:
- current_mode = conn.execute("PRAGMA journal_mode").fetchone()
- if current_mode and current_mode[0] == "wal":
- return "wal"
- except sqlite3.OperationalError:
- pass
-
- try:
- conn.execute("PRAGMA journal_mode=WAL")
- return "wal"
- except sqlite3.OperationalError as exc:
- msg = str(exc).lower()
- if not any(marker in msg for marker in _WAL_INCOMPAT_MARKERS):
- # Unrelated OperationalError — don't silently swallow.
- raise
- # Don't downgrade if another process already set WAL on disk.
- existing = _on_disk_journal_mode(conn)
- if existing == "wal":
- raise
- _log_wal_fallback_once(db_label, exc)
- conn.execute("PRAGMA journal_mode=DELETE")
- return "delete"
-
-
-def _log_wal_fallback_once(db_label: str, exc: Exception) -> None:
- """Log a single WARNING per (process, db_label) about WAL fallback.
-
- Without this dedup, NFS users running kanban (which opens a fresh
- connection on every operation — see hermes_cli/kanban_db.py) would
- fill errors.log with hundreds of identical warnings per hour.
- """
- with _wal_fallback_warned_lock:
- if db_label in _wal_fallback_warned_paths:
- return
- _wal_fallback_warned_paths.add(db_label)
- logger.warning(
- "%s: WAL journal_mode unsupported on this filesystem (%s) — "
- "falling back to journal_mode=DELETE (slower rollback-journal "
- "mode; reduces concurrency but works on NFS/SMB/FUSE). See "
- "https://www.sqlite.org/wal.html for details. This warning "
- "fires once per process per database.",
- db_label,
- exc,
- )
-
-SCHEMA_SQL = """
-CREATE TABLE IF NOT EXISTS schema_version (
- version INTEGER NOT NULL
-);
-
-CREATE TABLE IF NOT EXISTS sessions (
- id TEXT PRIMARY KEY,
- source TEXT NOT NULL,
- user_id TEXT,
- model TEXT,
- model_config TEXT,
- system_prompt TEXT,
- parent_session_id TEXT,
- started_at REAL NOT NULL,
- ended_at REAL,
- end_reason TEXT,
- message_count INTEGER DEFAULT 0,
- tool_call_count INTEGER DEFAULT 0,
- input_tokens INTEGER DEFAULT 0,
- output_tokens INTEGER DEFAULT 0,
- cache_read_tokens INTEGER DEFAULT 0,
- cache_write_tokens INTEGER DEFAULT 0,
- reasoning_tokens INTEGER DEFAULT 0,
- cwd TEXT,
- billing_provider TEXT,
- billing_base_url TEXT,
- billing_mode TEXT,
- estimated_cost_usd REAL,
- actual_cost_usd REAL,
- cost_status TEXT,
- cost_source TEXT,
- pricing_version TEXT,
- title TEXT,
- api_call_count INTEGER DEFAULT 0,
- handoff_state TEXT,
- handoff_platform TEXT,
- handoff_error TEXT,
- rewind_count INTEGER NOT NULL DEFAULT 0,
- archived INTEGER NOT NULL DEFAULT 0,
- FOREIGN KEY (parent_session_id) REFERENCES sessions(id)
-);
-
-CREATE TABLE IF NOT EXISTS messages (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- session_id TEXT NOT NULL REFERENCES sessions(id),
- role TEXT NOT NULL,
- content TEXT,
- tool_call_id TEXT,
- tool_calls TEXT,
- tool_name TEXT,
- timestamp REAL NOT NULL,
- token_count INTEGER,
- finish_reason TEXT,
- reasoning TEXT,
- reasoning_content TEXT,
- reasoning_details TEXT,
- codex_reasoning_items TEXT,
- codex_message_items TEXT,
- platform_message_id TEXT,
- observed INTEGER DEFAULT 0,
- active INTEGER NOT NULL DEFAULT 1
-);
-
-CREATE TABLE IF NOT EXISTS state_meta (
- key TEXT PRIMARY KEY,
- value TEXT
-);
-
-CREATE TABLE IF NOT EXISTS compression_locks (
- session_id TEXT PRIMARY KEY,
- holder TEXT NOT NULL,
- acquired_at REAL NOT NULL,
- expires_at REAL NOT NULL
-);
-
-CREATE INDEX IF NOT EXISTS idx_sessions_source ON sessions(source);
-CREATE INDEX IF NOT EXISTS idx_sessions_source_id ON sessions(source, id);
-CREATE INDEX IF NOT EXISTS idx_sessions_parent ON sessions(parent_session_id);
-CREATE INDEX IF NOT EXISTS idx_sessions_started ON sessions(started_at DESC);
-CREATE INDEX IF NOT EXISTS idx_messages_session ON messages(session_id, timestamp);
-CREATE INDEX IF NOT EXISTS idx_compression_locks_expires ON compression_locks(expires_at);
-"""
-
-# Indexes that reference columns added in later schema versions must be
-# created AFTER _reconcile_columns() has had a chance to ADD them on
-# existing databases. SCHEMA_SQL above is run by sqlite executescript
-# which would otherwise fail on legacy DBs ("no such column: active").
-DEFERRED_INDEX_SQL = """
-CREATE INDEX IF NOT EXISTS idx_messages_session_active
- ON messages(session_id, active, timestamp);
-"""
-
-FTS_SQL = """
-CREATE VIRTUAL TABLE IF NOT EXISTS messages_fts USING fts5(
- content
-);
-
-CREATE TRIGGER IF NOT EXISTS messages_fts_insert AFTER INSERT ON messages BEGIN
- INSERT INTO messages_fts(rowid, content) VALUES (
- new.id,
- COALESCE(new.content, '') || ' ' || COALESCE(new.tool_name, '') || ' ' || COALESCE(new.tool_calls, '')
- );
-END;
-
-CREATE TRIGGER IF NOT EXISTS messages_fts_delete AFTER DELETE ON messages BEGIN
- DELETE FROM messages_fts WHERE rowid = old.id;
-END;
-
-CREATE TRIGGER IF NOT EXISTS messages_fts_update AFTER UPDATE ON messages BEGIN
- DELETE FROM messages_fts WHERE rowid = old.id;
- INSERT INTO messages_fts(rowid, content) VALUES (
- new.id,
- COALESCE(new.content, '') || ' ' || COALESCE(new.tool_name, '') || ' ' || COALESCE(new.tool_calls, '')
- );
-END;
-"""
-
-# Trigram FTS5 table for CJK substring search. The default unicode61
-# tokenizer splits CJK characters into individual tokens, breaking phrase
-# matching. The trigram tokenizer creates overlapping 3-byte sequences so
-# substring queries work natively for any script (CJK, Thai, etc.).
-FTS_TRIGRAM_SQL = """
-CREATE VIRTUAL TABLE IF NOT EXISTS messages_fts_trigram USING fts5(
- content,
- tokenize='trigram'
-);
-
-CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_insert AFTER INSERT ON messages BEGIN
- INSERT INTO messages_fts_trigram(rowid, content) VALUES (
- new.id,
- COALESCE(new.content, '') || ' ' || COALESCE(new.tool_name, '') || ' ' || COALESCE(new.tool_calls, '')
- );
-END;
-
-CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_delete AFTER DELETE ON messages BEGIN
- DELETE FROM messages_fts_trigram WHERE rowid = old.id;
-END;
-
-CREATE TRIGGER IF NOT EXISTS messages_fts_trigram_update AFTER UPDATE ON messages BEGIN
- DELETE FROM messages_fts_trigram WHERE rowid = old.id;
- INSERT INTO messages_fts_trigram(rowid, content) VALUES (
- new.id,
- COALESCE(new.content, '') || ' ' || COALESCE(new.tool_name, '') || ' ' || COALESCE(new.tool_calls, '')
- );
-END;
-"""
-
-
-class SessionDB:
- """
- SQLite-backed session storage with FTS5 search.
-
- Thread-safe for the common gateway pattern (multiple reader threads,
- single writer via WAL mode). Each method opens its own cursor.
- """
-
- # ── Write-contention tuning ──
- # With multiple hermes processes (gateway + CLI sessions + worktree agents)
- # all sharing one state.db, WAL write-lock contention causes visible TUI
- # freezes. SQLite's built-in busy handler uses a deterministic sleep
- # schedule that causes convoy effects under high concurrency.
- #
- # Instead, we keep the SQLite timeout short (1s) and handle retries at the
- # application level with random jitter, which naturally staggers competing
- # writers and avoids the convoy.
- _WRITE_MAX_RETRIES = 15
- _WRITE_RETRY_MIN_S = 0.020 # 20ms
- _WRITE_RETRY_MAX_S = 0.150 # 150ms
- # Attempt a PASSIVE WAL checkpoint every N successful writes.
- _CHECKPOINT_EVERY_N_WRITES = 50
-
- def __init__(self, db_path: Path = None, read_only: bool = False):
- self.db_path = db_path or DEFAULT_DB_PATH
- self.read_only = read_only
-
- self._lock = threading.Lock()
- self._write_count = 0
- self._fts_enabled = False
- self._fts_unavailable_warned = False
- try:
- if read_only:
- # Read-only attach for cross-profile aggregation: SELECT-only,
- # so we skip schema init entirely (no DDL, no FTS probe, no
- # column reconcile). Crucially this takes NO write lock, so
- # polling another profile's live DB on every sidebar refresh
- # never contends with that profile's running backend. The DB
- # must already exist + be initialised (callers guard on
- # db_path.exists()); a SELECT against an empty file raises and
- # the caller degrades per-profile.
- self._conn = sqlite3.connect(
- f"file:{self.db_path}?mode=ro",
- uri=True,
- check_same_thread=False,
- timeout=1.0,
- isolation_level=None,
- )
- self._conn.row_factory = sqlite3.Row
- return
-
- self.db_path.parent.mkdir(parents=True, exist_ok=True)
- self._conn = sqlite3.connect(
- str(self.db_path),
- check_same_thread=False,
- # Short timeout — application-level retry with random jitter
- # handles contention instead of sitting in SQLite's internal
- # busy handler for up to 30s.
- timeout=1.0,
- # auto-starts transactions on DML, which conflicts with our
- # explicit BEGIN IMMEDIATE. None = we manage transactions
- # ourselves.
- isolation_level=None,
- )
- self._conn.row_factory = sqlite3.Row
- apply_wal_with_fallback(self._conn, db_label="state.db")
- self._conn.execute("PRAGMA foreign_keys=ON")
-
- self._init_schema()
- except Exception as exc:
- # Capture the cause so /resume and friends can surface WHY the
- # session DB is unavailable instead of a bare "Session database
- # not available." Callers that catch this exception keep their
- # existing ``self._session_db = None`` degradation path.
- #
- # Note: we deliberately do NOT clear _last_init_error on the
- # success path (no else branch). In multi-threaded callers
- # (gateway, web_server per-request SessionDB()), a concurrent
- # successful open racing past this failure would erase the
- # cause that another thread's /resume is about to format.
- # Tests that need to reset the state can call
- # ``hermes_state._set_last_init_error(None)`` explicitly.
- _set_last_init_error(f"{type(exc).__name__}: {exc}")
- raise
-
- # ── Core write helper ──
-
- @staticmethod
- def _is_fts5_unavailable_error(exc: sqlite3.OperationalError) -> bool:
- err = str(exc).lower()
- return "no such module" in err and "fts5" in err
-
- def _warn_fts5_unavailable(self, exc: sqlite3.OperationalError) -> None:
- self._fts_enabled = False
- if self._fts_unavailable_warned:
- return
- self._fts_unavailable_warned = True
- logger.warning(
- "SQLite FTS5 unavailable for %s; full-text session search "
- "disabled. Run `hermes update` to rebuild the venv with a "
- "current Python (managed uv guarantees FTS5). "
- "(underlying error: %s)",
- self.db_path,
- exc,
- )
-
- def _sqlite_supports_fts5(self, cursor: sqlite3.Cursor) -> bool:
- try:
- cursor.execute("CREATE VIRTUAL TABLE temp._hermes_fts5_probe USING fts5(x)")
- cursor.execute("DROP TABLE temp._hermes_fts5_probe")
- return True
- except sqlite3.OperationalError as exc:
- if not self._is_fts5_unavailable_error(exc):
- raise
- self._warn_fts5_unavailable(exc)
- return False
-
- @staticmethod
- def _drop_fts_triggers(cursor: sqlite3.Cursor) -> None:
- for trigger in _FTS_TRIGGERS:
- try:
- cursor.execute(f"DROP TRIGGER IF EXISTS {trigger}")
- except sqlite3.OperationalError:
- pass
-
- @staticmethod
- def _fts_trigger_count(cursor: sqlite3.Cursor) -> int:
- placeholders = ",".join("?" for _ in _FTS_TRIGGERS)
- row = cursor.execute(
- f"SELECT COUNT(*) FROM sqlite_master "
- f"WHERE type = 'trigger' AND name IN ({placeholders})",
- _FTS_TRIGGERS,
- ).fetchone()
- return int(row[0] if not isinstance(row, sqlite3.Row) else row[0])
-
- @staticmethod
- def _rebuild_fts_indexes(cursor: sqlite3.Cursor) -> None:
- for table_name in ("messages_fts", "messages_fts_trigram"):
- cursor.execute(f"DELETE FROM {table_name}")
- cursor.execute(
- "INSERT INTO messages_fts(rowid, content) "
- "SELECT id, "
- "COALESCE(content, '') || ' ' || "
- "COALESCE(tool_name, '') || ' ' || "
- "COALESCE(tool_calls, '') "
- "FROM messages"
- )
- cursor.execute(
- "INSERT INTO messages_fts_trigram(rowid, content) "
- "SELECT id, "
- "COALESCE(content, '') || ' ' || "
- "COALESCE(tool_name, '') || ' ' || "
- "COALESCE(tool_calls, '') "
- "FROM messages"
- )
-
- def _fts_table_probe(self, cursor: sqlite3.Cursor, table_name: str) -> Optional[bool]:
- try:
- cursor.execute(f"SELECT * FROM {table_name} LIMIT 0")
- return True
- except sqlite3.OperationalError as exc:
- if self._is_fts5_unavailable_error(exc):
- self._warn_fts5_unavailable(exc)
- return None
- if "no such table" in str(exc).lower():
- return False
- raise
-
- def _ensure_fts_schema(
- self,
- cursor: sqlite3.Cursor,
- table_name: str,
- ddl: str,
- ) -> bool:
- status = self._fts_table_probe(cursor, table_name)
- if status is None:
- return False
- try:
- # Run even when the virtual table exists so any dropped or missing
- # triggers are recreated after a previous no-FTS5 runtime disabled
- # them to keep message writes working.
- cursor.executescript(ddl)
- return True
- except sqlite3.OperationalError as exc:
- if not self._is_fts5_unavailable_error(exc):
- raise
- self._warn_fts5_unavailable(exc)
- return False
-
- def _execute_write(self, fn: Callable[[sqlite3.Connection], T]) -> T:
- """Execute a write transaction with BEGIN IMMEDIATE and jitter retry.
-
- *fn* receives the connection and should perform INSERT/UPDATE/DELETE
- statements. The caller must NOT call ``commit()`` — that's handled
- here after *fn* returns.
-
- BEGIN IMMEDIATE acquires the WAL write lock at transaction start
- (not at commit time), so lock contention surfaces immediately.
- On ``database is locked``, we release the Python lock, sleep a
- random 20-150ms, and retry — breaking the convoy pattern that
- SQLite's built-in deterministic backoff creates.
-
- Returns whatever *fn* returns.
- """
- last_err: Optional[Exception] = None
- for attempt in range(self._WRITE_MAX_RETRIES):
- try:
- with self._lock:
- self._conn.execute("BEGIN IMMEDIATE")
- try:
- result = fn(self._conn)
- self._conn.commit()
- except BaseException:
- try:
- self._conn.rollback()
- except Exception:
- pass
- raise
- # Success — periodic best-effort checkpoint.
- self._write_count += 1
- if self._write_count % self._CHECKPOINT_EVERY_N_WRITES == 0:
- self._try_wal_checkpoint()
- return result
- except sqlite3.OperationalError as exc:
- err_msg = str(exc).lower()
- if "locked" in err_msg or "busy" in err_msg:
- last_err = exc
- if attempt < self._WRITE_MAX_RETRIES - 1:
- jitter = random.uniform(
- self._WRITE_RETRY_MIN_S,
- self._WRITE_RETRY_MAX_S,
- )
- time.sleep(jitter)
- continue
- # Non-lock error or retries exhausted — propagate.
- raise
- # Retries exhausted (shouldn't normally reach here).
- raise last_err or sqlite3.OperationalError(
- "database is locked after max retries"
- )
-
- def _try_wal_checkpoint(self) -> None:
- """Best-effort TRUNCATE WAL checkpoint. Never raises.
-
- Flushes committed WAL frames back into the main DB file and
- truncates the WAL file to zero bytes. Keeps the WAL from
- growing unbounded when many processes hold persistent
- connections.
-
- PASSIVE checkpoint was previously used here, but it never
- truncates the WAL file — the file stays at its high-water
- mark until an explicit TRUNCATE is called (which only
- happened inside the infrequent vacuum()).
-
- TRUNCATE may block writers briefly while checkpointing, but
- _try_wal_checkpoint is called off the hot path (every 50
- writes) and already runs under ``self._lock``, so the
- additional hold time is negligible.
- """
- try:
- with self._lock:
- result = self._conn.execute(
- "PRAGMA wal_checkpoint(TRUNCATE)"
- ).fetchone()
- if result and result[1] > 0:
- logger.debug(
- "WAL checkpoint: %d/%d pages checkpointed",
- result[2], result[1],
- )
- except Exception:
- pass # Best effort — never fatal.
-
- def close(self):
- """Close the database connection.
-
- Attempts a TRUNCATE WAL checkpoint first so that exiting processes
- help shrink the WAL file.
- """
- with self._lock:
- if self._conn:
- try:
- self._conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
- except Exception:
- pass
- self._conn.close()
- self._conn = None
-
- @staticmethod
- def _parse_schema_columns(schema_sql: str) -> Dict[str, Dict[str, str]]:
- """Extract expected columns per table from SCHEMA_SQL.
-
- Uses an in-memory SQLite database to parse the SQL — SQLite itself
- handles all syntax (DEFAULT expressions with commas, inline
- REFERENCES, CHECK constraints, etc.) so there are zero regex
- edge cases. The in-memory DB is opened, the schema DDL is
- executed, and PRAGMA table_info extracts the column metadata.
-
- Adding a column to SCHEMA_SQL is all that's needed; the
- reconciliation loop picks it up automatically.
- """
- ref = sqlite3.connect(":memory:")
- try:
- ref.executescript(schema_sql)
- table_columns: Dict[str, Dict[str, str]] = {}
- for (tbl,) in ref.execute(
- "SELECT name FROM sqlite_master "
- "WHERE type='table' AND name NOT LIKE 'sqlite_%'"
- ).fetchall():
- cols: Dict[str, str] = {}
- for row in ref.execute(
- f'PRAGMA table_info("{tbl}")'
- ).fetchall():
- # row: (cid, name, type, notnull, dflt_value, pk)
- col_name = row[1]
- col_type = row[2] or ""
- notnull = row[3]
- default = row[4]
- pk = row[5]
- # Reconstruct the type expression for ALTER TABLE ADD COLUMN
- parts = [col_type] if col_type else []
- if notnull and not pk:
- parts.append("NOT NULL")
- if default is not None:
- parts.append(f"DEFAULT {default}")
- cols[col_name] = " ".join(parts)
- table_columns[tbl] = cols
- return table_columns
- finally:
- ref.close()
-
- def _reconcile_columns(self, cursor: sqlite3.Cursor) -> None:
- """Ensure live tables have every column declared in SCHEMA_SQL.
-
- Follows the Beets/sqlite-utils pattern: the CREATE TABLE definition
- in SCHEMA_SQL is the single source of truth for the desired schema.
- On every startup this method diffs the live columns (via PRAGMA
- table_info) against the declared columns, and ADDs any that are
- missing.
-
- This makes column additions a declarative operation — just add
- the column to SCHEMA_SQL and it appears on the next startup.
- Version-gated migration blocks are no longer needed for ADD COLUMN.
- """
- expected = self._parse_schema_columns(SCHEMA_SQL)
- for table_name, declared_cols in expected.items():
- # Get current columns from the live table
- try:
- rows = cursor.execute(
- f'PRAGMA table_info("{table_name}")'
- ).fetchall()
- except sqlite3.OperationalError:
- continue # Table doesn't exist yet (shouldn't happen after executescript)
- live_cols = set()
- for row in rows:
- # PRAGMA table_info returns (cid, name, type, notnull, dflt_value, pk)
- name = row[1] if isinstance(row, (tuple, list)) else row["name"]
- live_cols.add(name)
-
- for col_name, col_type in declared_cols.items():
- if col_name not in live_cols:
- safe_name = col_name.replace('"', '""')
- try:
- cursor.execute(
- f'ALTER TABLE "{table_name}" ADD COLUMN "{safe_name}" {col_type}'
- )
- except sqlite3.OperationalError as exc:
- # Expected: "duplicate column name" from a race or
- # re-run. Unexpected: "Cannot add a NOT NULL column
- # with default value NULL" from a schema mistake.
- # Log at DEBUG so it's visible in agent.log.
- logger.debug(
- "reconcile %s.%s: %s", table_name, col_name, exc,
- )
-
- def _init_schema(self):
- """Create tables and FTS if they don't exist, reconcile columns.
-
- Schema management follows the declarative reconciliation pattern
- (Beets, sqlite-utils): SCHEMA_SQL is the single source of truth.
- On existing databases, _reconcile_columns() diffs live columns
- against SCHEMA_SQL and ADDs any missing ones. This eliminates
- the version-gated migration chain for column additions, making
- it impossible for reordered or inserted migrations to skip columns.
-
- The schema_version table is retained for future data migrations
- (transforming existing rows) which cannot be handled declaratively.
- """
- cursor = self._conn.cursor()
-
- cursor.executescript(SCHEMA_SQL)
-
- # ── Declarative column reconciliation ──────────────────────────
- # Diff live tables against SCHEMA_SQL and ADD any missing columns.
- # This is idempotent and self-healing: even if a version-gated
- # migration was skipped (e.g. due to version renumbering), the
- # column gets created here.
- self._reconcile_columns(cursor)
-
- # Indexes that reference reconciler-added columns must be created
- # AFTER _reconcile_columns runs — declaring them in SCHEMA_SQL
- # makes the initial executescript fail on legacy DBs (the index's
- # WHERE clause references a column that doesn't exist yet).
- try:
- cursor.execute(
- "CREATE INDEX IF NOT EXISTS idx_messages_platform_msg_id "
- "ON messages(session_id, platform_message_id) "
- "WHERE platform_message_id IS NOT NULL"
- )
- except sqlite3.OperationalError as exc:
- logger.debug("idx_messages_platform_msg_id create skipped: %s", exc)
-
- # Deferred indexes that reference the reconciler-added ``active``
- # column (idx_messages_session_active) — same ordering constraint.
- cursor.executescript(DEFERRED_INDEX_SQL)
-
- fts5_available = self._sqlite_supports_fts5(cursor)
- fts_migrations_complete = True
- if not fts5_available:
- # Existing FTS triggers can still fire on messages INSERT/UPDATE
- # even though the current sqlite runtime cannot read the virtual
- # tables they target. Drop only the triggers so core persistence
- # continues; if a future runtime has FTS5, _ensure_fts_schema()
- # recreates them.
- self._drop_fts_triggers(cursor)
-
- # ── Schema version bookkeeping ─────────────────────────────────
- # Bump to current so future data migrations (if any) can gate on
- # version. No version-gated column additions remain.
- cursor.execute("SELECT version FROM schema_version LIMIT 1")
- row = cursor.fetchone()
- if row is None:
- cursor.execute(
- "INSERT INTO schema_version (version) VALUES (?)",
- (SCHEMA_VERSION,),
- )
- else:
- current_version = row["version"] if isinstance(row, sqlite3.Row) else row[0]
- # Data migrations that can't be expressed declaratively (row
- # backfills, index changes tied to a specific version step) stay
- # in a version-gated chain. Column additions are handled by
- # _reconcile_columns() above and no longer need entries here.
- if current_version < 10:
- # v10: trigram FTS5 table for CJK/substring search. The
- # virtual table + triggers are created unconditionally via
- # FTS_TRIGRAM_SQL below, but existing rows need a one-time
- # backfill into the FTS index.
- if fts5_available:
- _fts_trigram_exists = self._fts_table_probe(
- cursor, "messages_fts_trigram"
- )
- if _fts_trigram_exists is False:
- if self._ensure_fts_schema(
- cursor, "messages_fts_trigram", FTS_TRIGRAM_SQL
- ):
- cursor.execute(
- "INSERT INTO messages_fts_trigram(rowid, content) "
- "SELECT id, content FROM messages WHERE content IS NOT NULL"
- )
- else:
- fts_migrations_complete = False
- elif _fts_trigram_exists is None:
- fts_migrations_complete = False
- else:
- fts_migrations_complete = False
- if current_version < 11:
- # v11: re-index FTS5 tables to cover tool_name + tool_calls and
- # switch from external-content to inline mode. Existing DBs have
- # old-schema FTS tables and triggers that IF NOT EXISTS won't
- # overwrite, so we drop them explicitly and let the post-migration
- # existence checks (below) recreate them from FTS_SQL /
- # FTS_TRIGRAM_SQL, then backfill every message row. Fixes #16751.
- if fts5_available:
- self._drop_fts_triggers(cursor)
- for _tbl in ("messages_fts", "messages_fts_trigram"):
- try:
- cursor.execute(f"DROP TABLE IF EXISTS {_tbl}")
- except sqlite3.OperationalError as exc:
- if not self._is_fts5_unavailable_error(exc):
- raise
- self._warn_fts5_unavailable(exc)
- fts5_available = False
- fts_migrations_complete = False
- break
-
- if fts5_available:
- # Recreate virtual tables + triggers with the new inline-mode
- # schema that indexes content || tool_name || tool_calls.
- if (
- self._ensure_fts_schema(cursor, "messages_fts", FTS_SQL)
- and self._ensure_fts_schema(
- cursor, "messages_fts_trigram", FTS_TRIGRAM_SQL
- )
- ):
- # Backfill both indexes from every existing messages row.
- cursor.execute(
- "INSERT INTO messages_fts(rowid, content) "
- "SELECT id, "
- "COALESCE(content, '') || ' ' || "
- "COALESCE(tool_name, '') || ' ' || "
- "COALESCE(tool_calls, '') "
- "FROM messages"
- )
- cursor.execute(
- "INSERT INTO messages_fts_trigram(rowid, content) "
- "SELECT id, "
- "COALESCE(content, '') || ' ' || "
- "COALESCE(tool_name, '') || ' ' || "
- "COALESCE(tool_calls, '') "
- "FROM messages"
- )
- else:
- fts_migrations_complete = False
- else:
- fts_migrations_complete = False
- if current_version < 12:
- # v12: messages.active flag for rewind/undo soft-deletion.
- # The declarative reconcile_columns() above adds the
- # column itself; this UPDATE is belt-and-suspenders to
- # ensure any rows that pre-existed the ADD COLUMN have
- # active=1 rather than NULL.
- try:
- cursor.execute(
- "UPDATE messages SET active = 1 WHERE active IS NULL"
- )
- except sqlite3.OperationalError:
- pass
- if current_version < SCHEMA_VERSION and fts_migrations_complete:
- cursor.execute(
- "UPDATE schema_version SET version = ?",
- (SCHEMA_VERSION,),
- )
-
- # Unique title index — always ensure it exists
- try:
- cursor.execute(
- "CREATE UNIQUE INDEX IF NOT EXISTS idx_sessions_title_unique "
- "ON sessions(title) WHERE title IS NOT NULL"
- )
- except sqlite3.OperationalError:
- pass # Index already exists
-
- if fts5_available:
- # FTS5 setup. Run the DDL even when the virtual table exists so
- # CREATE TRIGGER IF NOT EXISTS repairs trigger-only degradation from
- # an earlier no-FTS5 runtime.
- triggers_need_repair = self._fts_trigger_count(cursor) < len(_FTS_TRIGGERS)
- self._fts_enabled = self._ensure_fts_schema(cursor, "messages_fts", FTS_SQL)
-
- # Trigram FTS5 for CJK/substring search. This is optional relative
- # to the main FTS table; if it cannot be created, CJK search falls
- # back to LIKE.
- if self._fts_enabled:
- trigram_enabled = self._ensure_fts_schema(
- cursor, "messages_fts_trigram", FTS_TRIGRAM_SQL
- )
- if trigram_enabled and triggers_need_repair:
- self._rebuild_fts_indexes(cursor)
-
- self._conn.commit()
-
- # =========================================================================
- # Session lifecycle
- # =========================================================================
-
- def _insert_session_row(
- self,
- session_id: str,
- source: str,
- model: str = None,
- model_config: Dict[str, Any] = None,
- system_prompt: str = None,
- user_id: str = None,
- parent_session_id: str = None,
- cwd: str = None,
- ) -> None:
- """Shared INSERT OR IGNORE for session rows."""
- def _do(conn):
- conn.execute(
- """INSERT OR IGNORE INTO sessions (id, source, user_id, model, model_config,
- system_prompt, parent_session_id, cwd, started_at)
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
- (
- session_id,
- source,
- user_id,
- model,
- json.dumps(model_config) if model_config else None,
- system_prompt,
- parent_session_id,
- cwd,
- time.time(),
- ),
- )
- self._execute_write(_do)
-
- def create_session(self, session_id: str, source: str, **kwargs) -> str:
- """Create a new session record. Returns the session_id."""
- self._insert_session_row(session_id, source, **kwargs)
- return session_id
- def end_session(self, session_id: str, end_reason: str) -> None:
- """Mark a session as ended.
-
- No-ops when the session is already ended. The first end_reason wins:
- compression-split sessions must keep their ``end_reason = 'compression'``
- record even if a later stale ``end_session()`` call (e.g. from a
- desynced CLI session_id after ``/resume`` or ``/branch``) targets them
- with a different reason. Use ``reopen_session()`` first if you
- intentionally need to re-end a closed session with a new reason.
- """
- def _do(conn):
- conn.execute(
- "UPDATE sessions SET ended_at = ?, end_reason = ? "
- "WHERE id = ? AND ended_at IS NULL",
- (time.time(), end_reason, session_id),
- )
- self._execute_write(_do)
-
- def reopen_session(self, session_id: str) -> None:
- """Clear ended_at/end_reason so a session can be resumed."""
- def _do(conn):
- conn.execute(
- "UPDATE sessions SET ended_at = NULL, end_reason = NULL WHERE id = ?",
- (session_id,),
- )
- self._execute_write(_do)
-
- def update_session_cwd(self, session_id: str, cwd: str) -> None:
- """Persist the session working directory when a frontend knows it."""
- if not session_id or not cwd:
- return
-
- def _do(conn):
- conn.execute("UPDATE sessions SET cwd = ? WHERE id = ?", (cwd, session_id))
-
- self._execute_write(_do)
- # ──────────────────────────────────────────────────────────────────────
- # Compression locks
- # ──────────────────────────────────────────────────────────────────────
- # Atomic per-session locks that prevent two compression paths from
- # racing on the same session_id and producing orphan child sessions.
- #
- # The race: ``conversation_compression.py`` rotates ``agent.session_id``
- # as a side effect of a successful compression (end old session, create
- # new). That mutation is local to the AIAgent instance — but ``state.db``
- # is shared across all instances. Two AIAgents that share the same
- # ``session_id`` at the moment they both decide to compress (most
- # commonly the parent turn's agent + a background-review fork started
- # right after the turn ended) each end the parent and create their own
- # NEW session, parented to the same old id. The gateway SessionEntry
- # only catches one rotation; the other child silently accumulates
- # writes — Damien's "parent → two orphan children" repro shape.
- #
- # The lock is keyed by ``session_id`` and is held for the duration of
- # the compress() call plus the rotation. ``holder`` identifies the
- # current owner (pid:tid:nonce) for diagnostics; the lock is recovered
- # via ``expires_at`` if the holder process crashed without releasing.
- def try_acquire_compression_lock(
- self,
- session_id: str,
- holder: str,
- ttl_seconds: float = 300.0,
- ) -> bool:
- """Try to atomically acquire the compression lock for ``session_id``.
-
- Returns ``True`` on success (caller now owns the lock and must
- release via :meth:`release_compression_lock`). Returns ``False``
- if another holder already owns a non-expired lock — the caller
- MUST NOT proceed with compression in that case (its rotation would
- race against the holder's, splitting the session lineage).
-
- Expired locks (``expires_at < now``) are reclaimed transparently:
- the stale row is deleted and the new holder acquires it. This
- prevents a crashed compressor from permanently blocking the
- session.
-
- Implementation: single-transaction DELETE-expired + INSERT-or-IGNORE,
- followed by a SELECT to confirm we got the row. SQLite serialises
- writes, so the whole sequence is atomic against other writers.
- """
- if not session_id:
- return False
- now = time.time()
- expires_at = now + ttl_seconds
-
- def _do(conn):
- # First: reclaim any expired lock for this session_id.
- conn.execute(
- "DELETE FROM compression_locks "
- "WHERE session_id = ? AND expires_at < ?",
- (session_id, now),
- )
- # Then: try to insert. INSERT OR IGNORE returns no rowcount
- # difference — verify ownership via SELECT.
- conn.execute(
- "INSERT OR IGNORE INTO compression_locks "
- "(session_id, holder, acquired_at, expires_at) "
- "VALUES (?, ?, ?, ?)",
- (session_id, holder, now, expires_at),
- )
- row = conn.execute(
- "SELECT holder FROM compression_locks WHERE session_id = ?",
- (session_id,),
- ).fetchone()
- return row is not None and (
- row["holder"] if isinstance(row, sqlite3.Row) else row[0]
- ) == holder
-
- try:
- return bool(self._execute_write(_do))
- except sqlite3.Error as exc:
- logger.warning(
- "try_acquire_compression_lock(%s) failed: %s",
- session_id, exc,
- )
- # Fail open: returning False makes the caller skip compression,
- # which is the safe behaviour when the lock subsystem is broken.
- return False
-
- def release_compression_lock(self, session_id: str, holder: str) -> None:
- """Release the compression lock for ``session_id`` iff we own it.
-
- Idempotent: no-op when the lock has already expired and been
- reclaimed by a different holder, or when no lock exists. The
- ``holder`` check prevents a late-returning compressor from
- clobbering a fresh lock held by someone else.
- """
- if not session_id:
- return
-
- def _do(conn):
- conn.execute(
- "DELETE FROM compression_locks "
- "WHERE session_id = ? AND holder = ?",
- (session_id, holder),
- )
-
- try:
- self._execute_write(_do)
- except sqlite3.Error as exc:
- logger.warning(
- "release_compression_lock(%s) failed: %s",
- session_id, exc,
- )
-
- def get_compression_lock_holder(self, session_id: str) -> Optional[str]:
- """Return the current (non-expired) holder for ``session_id``, or None.
-
- Diagnostic helper — not used by the locking protocol itself.
- """
- if not session_id:
- return None
- now = time.time()
- row = self._conn.execute(
- "SELECT holder FROM compression_locks "
- "WHERE session_id = ? AND expires_at >= ?",
- (session_id, now),
- ).fetchone()
- if row is None:
- return None
- return row["holder"] if isinstance(row, sqlite3.Row) else row[0]
-
- def update_session_meta(
- self,
- session_id: str,
- model_config_json: str,
- model: Optional[str] = None,
- ) -> None:
- """Update model_config and optionally model for an existing session.
-
- Uses COALESCE so that passing model=None leaves the stored model
- column unchanged. Routes through _execute_write for the standard
- BEGIN IMMEDIATE + jitter-retry + lock guarantee.
- """
- def _do(conn):
- conn.execute(
- "UPDATE sessions SET model_config = ?, model = COALESCE(?, model) WHERE id = ?",
- (model_config_json, model, session_id),
- )
- self._execute_write(_do)
-
- def update_system_prompt(self, session_id: str, system_prompt: str) -> None:
- """Store the full assembled system prompt snapshot."""
- def _do(conn):
- conn.execute(
- "UPDATE sessions SET system_prompt = ? WHERE id = ?",
- (system_prompt, session_id),
- )
- self._execute_write(_do)
-
- def update_session_model(self, session_id: str, model: str) -> None:
- """Update the model for a session after a mid-session switch.
-
- Unlike ``update_token_counts`` which uses ``COALESCE(model, ?)``
- (only filling in NULL), this unconditionally sets the model column
- so that the dashboard reflects the user's latest /model choice.
- """
- def _do(conn):
- conn.execute(
- "UPDATE sessions SET model = ? WHERE id = ?",
- (model, session_id),
- )
- self._execute_write(_do)
-
- def update_token_counts(
- self,
- session_id: str,
- input_tokens: int = 0,
- output_tokens: int = 0,
- model: str = None,
- cache_read_tokens: int = 0,
- cache_write_tokens: int = 0,
- reasoning_tokens: int = 0,
- estimated_cost_usd: Optional[float] = None,
- actual_cost_usd: Optional[float] = None,
- cost_status: Optional[str] = None,
- cost_source: Optional[str] = None,
- pricing_version: Optional[str] = None,
- billing_provider: Optional[str] = None,
- billing_base_url: Optional[str] = None,
- billing_mode: Optional[str] = None,
- api_call_count: int = 0,
- absolute: bool = False,
- ) -> None:
- """Update token counters and backfill model if not already set.
-
- When *absolute* is False (default), values are **incremented** — use
- this for per-API-call deltas (CLI path).
-
- When *absolute* is True, values are **set directly** — use this when
- the caller already holds cumulative totals (gateway path, where the
- cached agent accumulates across messages).
- """
- # Ensure the session row exists so the UPDATE doesn't silently affect
- # 0 rows. Under concurrent load (cron + kanban + delegate_task) the
- # initial create_session() may have failed due to SQLite locking.
- # INSERT OR IGNORE is cheap and idempotent.
- self._insert_session_row(session_id, "unknown", model=model)
- if absolute:
- sql = """UPDATE sessions SET
- input_tokens = ?,
- output_tokens = ?,
- cache_read_tokens = ?,
- cache_write_tokens = ?,
- reasoning_tokens = ?,
- estimated_cost_usd = COALESCE(?, 0),
- actual_cost_usd = CASE
- WHEN ? IS NULL THEN actual_cost_usd
- ELSE ?
- END,
- cost_status = COALESCE(?, cost_status),
- cost_source = COALESCE(?, cost_source),
- pricing_version = COALESCE(?, pricing_version),
- billing_provider = COALESCE(billing_provider, ?),
- billing_base_url = COALESCE(billing_base_url, ?),
- billing_mode = COALESCE(billing_mode, ?),
- model = COALESCE(model, ?),
- api_call_count = ?
- WHERE id = ?"""
- else:
- sql = """UPDATE sessions SET
- input_tokens = input_tokens + ?,
- output_tokens = output_tokens + ?,
- cache_read_tokens = cache_read_tokens + ?,
- cache_write_tokens = cache_write_tokens + ?,
- reasoning_tokens = reasoning_tokens + ?,
- estimated_cost_usd = COALESCE(estimated_cost_usd, 0) + COALESCE(?, 0),
- actual_cost_usd = CASE
- WHEN ? IS NULL THEN actual_cost_usd
- ELSE COALESCE(actual_cost_usd, 0) + ?
- END,
- cost_status = COALESCE(?, cost_status),
- cost_source = COALESCE(?, cost_source),
- pricing_version = COALESCE(?, pricing_version),
- billing_provider = COALESCE(billing_provider, ?),
- billing_base_url = COALESCE(billing_base_url, ?),
- billing_mode = COALESCE(billing_mode, ?),
- model = COALESCE(model, ?),
- api_call_count = COALESCE(api_call_count, 0) + ?
- WHERE id = ?"""
- params = (
- input_tokens,
- output_tokens,
- cache_read_tokens,
- cache_write_tokens,
- reasoning_tokens,
- estimated_cost_usd,
- actual_cost_usd,
- actual_cost_usd,
- cost_status,
- cost_source,
- pricing_version,
- billing_provider,
- billing_base_url,
- billing_mode,
- model,
- api_call_count,
- session_id,
- )
- def _do(conn):
- conn.execute(sql, params)
- self._execute_write(_do)
-
- def ensure_session(
- self,
- session_id: str,
- source: str = "unknown",
- model: str = None,
- **kwargs,
- ) -> str:
- """Ensure a session row exists (INSERT OR IGNORE). Accepts optional kwargs."""
- self._insert_session_row(session_id, source, model=model, **kwargs)
- return session_id
-
- def prune_empty_ghost_sessions(self, sessions_dir: "Optional[Path]" = None) -> int:
- """Remove empty TUI ghost sessions (no messages, no title, >24hr old)."""
- cutoff = time.time() - 86400 # Only sessions older than 24 hours
-
- def _do(conn):
- rows = conn.execute("""
- SELECT id FROM sessions
- WHERE source = 'tui'
- AND title IS NULL
- AND ended_at IS NOT NULL
- AND started_at < ?
- AND NOT EXISTS (
- SELECT 1 FROM messages WHERE messages.session_id = sessions.id
- )
- """, (cutoff,)).fetchall()
- ids = [r[0] if isinstance(r, (tuple, list)) else r["id"] for r in rows]
- if ids:
- placeholders = ",".join("?" * len(ids))
- conn.execute(
- f"DELETE FROM sessions WHERE id IN ({placeholders})", ids
- )
- return ids
-
- removed_ids = self._execute_write(_do) or []
- # Clean up any on-disk session files (belt-and-suspenders)
- if sessions_dir and removed_ids:
- for sid in removed_ids:
- self._remove_session_files(sessions_dir, sid)
- return len(removed_ids)
-
- def finalize_orphaned_compression_sessions(self) -> int:
- """Mark orphaned compression continuation sessions as ended.
-
- Targets child sessions that were never finalized: parent is ended
- with reason='compression', child has messages but no end_reason/ended_at
- and api_call_count=0. Non-destructive: preserves all messages and sets
- end_reason='orphaned_compression'. Fix for #20001.
- """
- cutoff = time.time() - 604800 # 7 days
-
- def _do(conn):
- now = time.time()
- result = conn.execute(
- """
- UPDATE sessions
- SET ended_at = ?,
- end_reason = 'orphaned_compression'
- WHERE api_call_count = 0
- AND end_reason IS NULL
- AND ended_at IS NULL
- AND started_at < ?
- AND parent_session_id IS NOT NULL
- AND EXISTS (
- SELECT 1 FROM sessions p
- WHERE p.id = sessions.parent_session_id
- AND p.end_reason = 'compression'
- AND p.ended_at IS NOT NULL
- )
- AND EXISTS (
- SELECT 1 FROM messages m
- WHERE m.session_id = sessions.id
- )
- """,
- (now, cutoff),
- )
- return result.rowcount
-
- return self._execute_write(_do) or 0
-
- def get_session(self, session_id: str) -> Optional[Dict[str, Any]]:
- """Get a session by ID."""
- with self._lock:
- cursor = self._conn.execute(
- "SELECT * FROM sessions WHERE id = ?", (session_id,)
- )
- row = cursor.fetchone()
- return dict(row) if row else None
-
- def resolve_session_id(self, session_id_or_prefix: str) -> Optional[str]:
- """Resolve an exact or uniquely prefixed session ID to the full ID.
-
- Returns the exact ID when it exists. Otherwise treats the input as a
- prefix and returns the single matching session ID if the prefix is
- unambiguous. Returns None for no matches or ambiguous prefixes.
- """
- exact = self.get_session(session_id_or_prefix)
- if exact:
- return exact["id"]
-
- escaped = (
- session_id_or_prefix
- .replace("\\", "\\\\")
- .replace("%", "\\%")
- .replace("_", "\\_")
- )
- with self._lock:
- cursor = self._conn.execute(
- "SELECT id FROM sessions WHERE id LIKE ? ESCAPE '\\' ORDER BY started_at DESC LIMIT 2",
- (f"{escaped}%",),
- )
- matches = [row["id"] for row in cursor.fetchall()]
- if len(matches) == 1:
- return matches[0]
- return None
-
- # Maximum length for session titles
- MAX_TITLE_LENGTH = 100
-
- @staticmethod
- def sanitize_title(title: Optional[str]) -> Optional[str]:
- """Validate and sanitize a session title.
-
- - Strips leading/trailing whitespace
- - Removes ASCII control characters (0x00-0x1F, 0x7F) and problematic
- Unicode control chars (zero-width, RTL/LTR overrides, etc.)
- - Collapses internal whitespace runs to single spaces
- - Normalizes empty/whitespace-only strings to None
- - Enforces MAX_TITLE_LENGTH
-
- Returns the cleaned title string or None.
- Raises ValueError if the title exceeds MAX_TITLE_LENGTH after cleaning.
- """
- if not title:
- return None
-
- # Remove ASCII control characters (0x00-0x1F, 0x7F) but keep
- # whitespace chars (\t=0x09, \n=0x0A, \r=0x0D) so they can be
- # normalized to spaces by the whitespace collapsing step below
- cleaned = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', title)
-
- # Remove problematic Unicode control characters:
- # - Zero-width chars (U+200B-U+200F, U+FEFF)
- # - Directional overrides (U+202A-U+202E, U+2066-U+2069)
- # - Object replacement (U+FFFC), interlinear annotation (U+FFF9-U+FFFB)
- cleaned = re.sub(
- r'[\u200b-\u200f\u2028-\u202e\u2060-\u2069\ufeff\ufffc\ufff9-\ufffb]',
- '', cleaned,
- )
-
- # Collapse internal whitespace runs and strip
- cleaned = re.sub(r'\s+', ' ', cleaned).strip()
-
- if not cleaned:
- return None
-
- if len(cleaned) > SessionDB.MAX_TITLE_LENGTH:
- raise ValueError(
- f"Title too long ({len(cleaned)} chars, max {SessionDB.MAX_TITLE_LENGTH})"
- )
-
- return cleaned
-
- def set_session_title(self, session_id: str, title: str) -> bool:
- """Set or update a session's title.
-
- Returns True if session was found and title was set.
- Raises ValueError if title is already in use by another session,
- or if the title fails validation (too long, invalid characters).
- Empty/whitespace-only strings are normalized to None (clearing the title).
- """
- title = self.sanitize_title(title)
- def _do(conn):
- if title:
- # Check uniqueness (allow the same session to keep its own title)
- cursor = conn.execute(
- "SELECT id FROM sessions WHERE title = ? AND id != ?",
- (title, session_id),
- )
- conflict = cursor.fetchone()
- if conflict:
- raise ValueError(
- f"Title '{title}' is already in use by session {conflict['id']}"
- )
- cursor = conn.execute(
- "UPDATE sessions SET title = ? WHERE id = ?",
- (title, session_id),
- )
- return cursor.rowcount
- rowcount = self._execute_write(_do)
- return rowcount > 0
-
- def get_session_title(self, session_id: str) -> Optional[str]:
- """Get the title for a session, or None."""
- with self._lock:
- cursor = self._conn.execute(
- "SELECT title FROM sessions WHERE id = ?", (session_id,)
- )
- row = cursor.fetchone()
- return row["title"] if row else None
-
- def set_session_archived(self, session_id: str, archived: bool) -> bool:
- """Archive or unarchive a session.
-
- Archived sessions are hidden from the default session list but keep all
- their messages — this is a soft hide, not a delete. Returns True when a
- row was updated.
- """
- def _do(conn):
- cursor = conn.execute(
- "UPDATE sessions SET archived = ? WHERE id = ?",
- (1 if archived else 0, session_id),
- )
- return cursor.rowcount
- rowcount = self._execute_write(_do)
- return rowcount > 0
-
- def get_session_by_title(self, title: str) -> Optional[Dict[str, Any]]:
- """Look up a session by exact title. Returns session dict or None."""
- with self._lock:
- cursor = self._conn.execute(
- "SELECT * FROM sessions WHERE title = ?", (title,)
- )
- row = cursor.fetchone()
- return dict(row) if row else None
-
- def resolve_session_by_title(self, title: str) -> Optional[str]:
- """Resolve a title to a session ID, preferring the latest in a lineage.
-
- If the exact title exists, returns that session's ID.
- If not, searches for "title #N" variants and returns the latest one.
- If the exact title exists AND numbered variants exist, returns the
- latest numbered variant (the most recent continuation).
- """
- # First try exact match
- exact = self.get_session_by_title(title)
-
- # Also search for numbered variants: "title #2", "title #3", etc.
- # Escape SQL LIKE wildcards (%, _) in the title to prevent false matches
- escaped = title.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
- with self._lock:
- cursor = self._conn.execute(
- "SELECT id, title, started_at FROM sessions "
- "WHERE title LIKE ? ESCAPE '\\' ORDER BY started_at DESC",
- (f"{escaped} #%",),
- )
- numbered = cursor.fetchall()
-
- if numbered:
- # Return the most recent numbered variant
- return numbered[0]["id"]
- elif exact:
- return exact["id"]
- return None
-
- def get_next_title_in_lineage(self, base_title: str) -> str:
- """Generate the next title in a lineage (e.g., "my session" → "my session #2").
-
- Strips any existing " #N" suffix to find the base name, then finds
- the highest existing number and increments.
- """
- # Strip existing #N suffix to find the true base
- match = re.match(r'^(.*?) #(\d+)$', base_title)
- if match:
- base = match.group(1)
- else:
- base = base_title
-
- # Find all existing numbered variants
- # Escape SQL LIKE wildcards (%, _) in the base to prevent false matches
- escaped = base.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
- with self._lock:
- cursor = self._conn.execute(
- "SELECT title FROM sessions WHERE title = ? OR title LIKE ? ESCAPE '\\'",
- (base, f"{escaped} #%"),
- )
- existing = [row["title"] for row in cursor.fetchall()]
-
- if not existing:
- return base # No conflict, use the base name as-is
-
- # Find the highest number
- max_num = 1 # The unnumbered original counts as #1
- for t in existing:
- m = re.match(r'^.* #(\d+)$', t)
- if m:
- max_num = max(max_num, int(m.group(1)))
-
- return f"{base} #{max_num + 1}"
-
- def get_compression_tip(self, session_id: str) -> Optional[str]:
- """Walk the compression-continuation chain forward and return the tip.
-
- A compression continuation is a child session where:
- 1. The parent's ``end_reason = 'compression'``
- 2. The child was created AFTER the parent was ended (started_at >= ended_at)
-
- The second condition distinguishes compression continuations from
- delegate subagents or branch children, which can also have a
- ``parent_session_id`` but were created while the parent was still live.
-
- Returns the session_id of the latest continuation in the chain, or the
- input ``session_id`` if it isn't part of a compression chain (or if the
- input itself doesn't exist).
- """
- current = session_id
- # Bound the walk defensively — compression chains this deep are
- # pathological and shouldn't happen in practice. 100 = plenty.
- for _ in range(100):
- with self._lock:
- cursor = self._conn.execute(
- "SELECT id FROM sessions "
- "WHERE parent_session_id = ? "
- " AND started_at >= ("
- " SELECT ended_at FROM sessions "
- " WHERE id = ? AND end_reason = 'compression'"
- " ) "
- "ORDER BY started_at DESC LIMIT 1",
- (current, current),
- )
- row = cursor.fetchone()
- if row is None:
- return current
- current = row["id"]
- return current
-
- def list_sessions_rich(
- self,
- source: str = None,
- exclude_sources: List[str] = None,
- limit: int = 20,
- offset: int = 0,
- include_children: bool = False,
- min_message_count: int = 0,
- project_compression_tips: bool = True,
- order_by_last_active: bool = False,
- include_archived: bool = False,
- archived_only: bool = False,
- id_query: str = None,
- ) -> List[Dict[str, Any]]:
- """List sessions with preview (first user message) and last active timestamp.
-
- Returns dicts with keys: id, source, model, title, started_at, ended_at,
- message_count, preview (first 60 chars of first user message),
- last_active (timestamp of last message).
-
- Uses a single query with correlated subqueries instead of N+2 queries.
-
- By default, child sessions (subagent runs, compression continuations)
- are excluded. Pass ``include_children=True`` to include them.
-
- With ``project_compression_tips=True`` (default), sessions that are
- roots of compression chains are projected forward to their latest
- continuation — one logical conversation = one list entry, showing the
- live continuation's id/message_count/title/last_active. This prevents
- compressed continuations from being invisible to users while keeping
- delegate subagents and branches hidden. Pass ``False`` to return the
- raw root rows (useful for admin/debug UIs).
-
- Pass ``order_by_last_active=True`` to sort by most-recent activity
- instead of original conversation start time. For compression chains,
- the "most-recent activity" is taken from the live tip (not the root),
- so an old conversation that was compressed and continued recently
- surfaces in the correct slot. Ordering is computed at SQL level via
- a recursive CTE that walks compression-continuation edges, so LIMIT
- and OFFSET still apply efficiently.
- """
- where_clauses = []
- params = []
-
- if not include_children:
- # Show root sessions and branch sessions, while still hiding
- # sub-agent runs and compression continuations (which also carry a
- # parent_session_id but were spawned while the parent was still
- # live — i.e., started_at < parent.ended_at).
- #
- # Branch sessions are identified two ways, OR'd for robustness:
- # 1. A stable ``_branched_from`` marker in model_config, written
- # by /branch at creation time. This survives the parent being
- # reopened and re-ended with a different end_reason (e.g.
- # tui_shutdown overwriting 'branched'), which otherwise hides
- # the branch — see issue #20856.
- # 2. The legacy heuristic (parent ended with 'branched' before the
- # child started), covering branch sessions created before the
- # marker existed.
- where_clauses.append(
- "(s.parent_session_id IS NULL"
- " OR json_extract(s.model_config, '$._branched_from') IS NOT NULL"
- " OR EXISTS (SELECT 1 FROM sessions p"
- " WHERE p.id = s.parent_session_id"
- " AND p.end_reason = 'branched'"
- " AND s.started_at >= p.ended_at))"
- )
-
- if source:
- where_clauses.append("s.source = ?")
- params.append(source)
- if exclude_sources:
- placeholders = ",".join("?" for _ in exclude_sources)
- where_clauses.append(f"s.source NOT IN ({placeholders})")
- params.extend(exclude_sources)
- if min_message_count > 0:
- where_clauses.append("s.message_count >= ?")
- params.append(min_message_count)
- if archived_only:
- where_clauses.append("s.archived = 1")
- elif not include_archived:
- where_clauses.append("s.archived = 0")
-
- where_sql = f"WHERE {' AND '.join(where_clauses)}" if where_clauses else ""
-
- # Optional session-id filter, pushed into SQL so callers (Desktop
- # session-id search) don't have to fetch every row and filter in
- # Python. ``id_query`` is matched as a case-insensitive substring
- # against each surfaced row's id AND every id in its forward
- # compression chain — so searching a compression *root* id or a *tip*
- # id both resolve to the same projected conversation. Only used in the
- # order_by_last_active path (which builds the chain CTE); other callers
- # pass id_query=None.
- id_needle = (id_query or "").strip().lower()
- if order_by_last_active:
- # Compute effective_last_active by walking each surfaced session's
- # compression-continuation chain forward in SQL and taking the MAX
- # timestamp across the chain. This lets us ORDER BY + LIMIT at SQL
- # level instead of fetching every row and sorting in Python, while
- # still surfacing old compression roots whose live tip is fresh.
- #
- # The CTE seeds from rows the outer WHERE admits (roots + branch
- # children), then recursively joins forward through
- # compression-continuation edges using the same criteria as
- # get_compression_tip (parent.end_reason='compression' AND
- # child.started_at >= parent.ended_at).
- outer_where = where_sql
- id_params: List[Any] = []
- if id_needle:
- # Admit a surfaced row if its own id or any id in its forward
- # compression chain matches the needle. LIKE with a leading
- # wildcard can't use an index, but the chain membership and
- # the small result set keep this bounded — far cheaper than
- # fetching every session and scanning in Python.
- id_clause = (
- "EXISTS (SELECT 1 FROM chain cq"
- " WHERE cq.root_id = s.id"
- " AND LOWER(cq.cur_id) LIKE ? ESCAPE '\\')"
- )
- like_pattern = (
- "%"
- + id_needle.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
- + "%"
- )
- id_params = [like_pattern]
- outer_where = (
- f"{where_sql} AND {id_clause}" if where_sql else f"WHERE {id_clause}"
- )
- query = f"""
- WITH RECURSIVE chain(root_id, cur_id) AS (
- SELECT s.id, s.id FROM sessions s {where_sql}
- UNION ALL
- SELECT c.root_id, child.id
- FROM chain c
- JOIN sessions parent ON parent.id = c.cur_id
- JOIN sessions child ON child.parent_session_id = c.cur_id
- WHERE parent.end_reason = 'compression'
- AND child.started_at >= parent.ended_at
- ),
- chain_max AS (
- SELECT
- root_id,
- MAX(COALESCE(
- (SELECT MAX(m.timestamp) FROM messages m WHERE m.session_id = cur_id),
- (SELECT started_at FROM sessions ss WHERE ss.id = cur_id)
- )) AS effective_last_active
- FROM chain
- GROUP BY root_id
- )
- SELECT s.*,
- COALESCE(
- (SELECT SUBSTR(REPLACE(REPLACE(m.content, X'0A', ' '), X'0D', ' '), 1, 63)
- FROM messages m
- WHERE m.session_id = s.id AND m.role = 'user' AND m.content IS NOT NULL
- ORDER BY m.timestamp, m.id LIMIT 1),
- ''
- ) AS _preview_raw,
- COALESCE(
- (SELECT MAX(m2.timestamp) FROM messages m2 WHERE m2.session_id = s.id),
- s.started_at
- ) AS last_active,
- COALESCE(cm.effective_last_active, s.started_at) AS _effective_last_active
- FROM sessions s
- LEFT JOIN chain_max cm ON cm.root_id = s.id
- {outer_where}
- ORDER BY _effective_last_active DESC, s.started_at DESC, s.id DESC
- LIMIT ? OFFSET ?
- """
- # WHERE params apply twice (CTE seed + outer select); the id filter
- # only applies to the outer select.
- params = params + params + id_params + [limit, offset]
- else:
- query = f"""
- SELECT s.*,
- COALESCE(
- (SELECT SUBSTR(REPLACE(REPLACE(m.content, X'0A', ' '), X'0D', ' '), 1, 63)
- FROM messages m
- WHERE m.session_id = s.id AND m.role = 'user' AND m.content IS NOT NULL
- ORDER BY m.timestamp, m.id LIMIT 1),
- ''
- ) AS _preview_raw,
- COALESCE(
- (SELECT MAX(m2.timestamp) FROM messages m2 WHERE m2.session_id = s.id),
- s.started_at
- ) AS last_active
- FROM sessions s
- {where_sql}
- ORDER BY s.started_at DESC
- LIMIT ? OFFSET ?
- """
- params.extend([limit, offset])
- with self._lock:
- cursor = self._conn.execute(query, params)
- rows = cursor.fetchall()
- sessions = []
- for row in rows:
- s = dict(row)
- # Build the preview from the raw substring
- raw = s.pop("_preview_raw", "").strip()
- if raw:
- text = raw[:60]
- s["preview"] = text + ("..." if len(raw) > 60 else "")
- else:
- s["preview"] = ""
- # Drop the internal ordering column so callers see a clean dict.
- s.pop("_effective_last_active", None)
- sessions.append(s)
-
- # Project compression roots forward to their tips. Each row whose
- # end_reason is 'compression' has a continuation child; replace the
- # surfaced fields (id, message_count, title, last_active, ended_at,
- # end_reason, preview) with the tip's values so the list entry acts
- # as the live conversation. Keep the root's started_at to preserve
- # chronological ordering by original conversation start.
- if project_compression_tips and not include_children:
- projected = []
- for s in sessions:
- if s.get("end_reason") != "compression":
- projected.append(s)
- continue
- tip_id = self.get_compression_tip(s["id"])
- if tip_id == s["id"]:
- projected.append(s)
- continue
- tip_row = self._get_session_rich_row(tip_id)
- if not tip_row:
- projected.append(s)
- continue
- # Preserve the root's started_at for stable sort order, but
- # surface the tip's identity and activity data.
- merged = dict(s)
- for key in (
- "id", "ended_at", "end_reason", "message_count",
- "tool_call_count", "title", "last_active", "preview",
- "model", "system_prompt", "cwd",
- ):
- if key in tip_row:
- merged[key] = tip_row[key]
- merged["_lineage_root_id"] = s["id"]
- projected.append(merged)
- sessions = projected
-
- return sessions
-
- def list_cron_job_runs(
- self,
- job_id: str,
- limit: int = 20,
- offset: int = 0,
- ) -> List[Dict[str, Any]]:
- """List the run sessions produced by a single cron job, newest first.
-
- Cron runs are flat, independent sessions whose id is
- ``cron_{job_id}_{timestamp}`` (see ``cron/scheduler.run_job``). They are
- never compression roots and never branch, so this deliberately skips the
- ``list_sessions_rich`` recursive compression-chain CTE / leading-wildcard
- ``id_query`` path — that path seeds from *every* ``source='cron'`` row in
- the DB and only filters to one job's runs after the scan, so it scales
- with the whole cron pile (a heavy history makes the desktop run-history
- endpoint time out before it eventually populates).
-
- Instead this binds to one job with a ``[prefix, prefix_hi)`` range over
- the id (an index range scan, not a ``%...%`` substring), filters
- ``source='cron'``, and orders by ``started_at DESC``. Work scales with
- the requested window, not the total cron history.
-
- Returns the same enriched row shape as ``list_sessions_rich`` (adds
- ``preview`` + ``last_active``) so callers can reuse it.
- """
- prefix = f"cron_{job_id}_"
- # Half-open upper bound for an index range scan: increment the final
- # byte of the prefix so the range covers exactly the ids that start
- # with ``prefix`` and nothing else. ``prefix`` always ends in '_', but
- # compute it generically rather than hardcoding the successor char.
- prefix_hi = prefix[:-1] + chr(ord(prefix[-1]) + 1)
-
- query = """
- SELECT s.*,
- COALESCE(
- (SELECT SUBSTR(REPLACE(REPLACE(m.content, X'0A', ' '), X'0D', ' '), 1, 63)
- FROM messages m
- WHERE m.session_id = s.id AND m.role = 'user' AND m.content IS NOT NULL
- ORDER BY m.timestamp, m.id LIMIT 1),
- ''
- ) AS _preview_raw,
- COALESCE(
- (SELECT MAX(m2.timestamp) FROM messages m2 WHERE m2.session_id = s.id),
- s.started_at
- ) AS last_active
- FROM sessions s
- WHERE s.source = 'cron' AND s.id >= ? AND s.id < ?
- ORDER BY s.started_at DESC, s.id DESC
- LIMIT ? OFFSET ?
- """
- with self._lock:
- cursor = self._conn.execute(query, (prefix, prefix_hi, limit, offset))
- rows = cursor.fetchall()
-
- runs: List[Dict[str, Any]] = []
- for row in rows:
- s = dict(row)
- raw = s.pop("_preview_raw", "").strip()
- if raw:
- text = raw[:60]
- s["preview"] = text + ("..." if len(raw) > 60 else "")
- else:
- s["preview"] = ""
- runs.append(s)
- return runs
-
- def _get_session_rich_row(self, session_id: str) -> Optional[Dict[str, Any]]:
- """Fetch a single session with the same enriched columns as
- ``list_sessions_rich`` (preview + last_active). Returns None if the
- session doesn't exist.
- """
- query = """
- SELECT s.*,
- COALESCE(
- (SELECT SUBSTR(REPLACE(REPLACE(m.content, X'0A', ' '), X'0D', ' '), 1, 63)
- FROM messages m
- WHERE m.session_id = s.id AND m.role = 'user' AND m.content IS NOT NULL
- ORDER BY m.timestamp, m.id LIMIT 1),
- ''
- ) AS _preview_raw,
- COALESCE(
- (SELECT MAX(m2.timestamp) FROM messages m2 WHERE m2.session_id = s.id),
- s.started_at
- ) AS last_active
- FROM sessions s
- WHERE s.id = ?
- """
- with self._lock:
- cursor = self._conn.execute(query, (session_id,))
- row = cursor.fetchone()
- if not row:
- return None
- s = dict(row)
- raw = s.pop("_preview_raw", "").strip()
- if raw:
- text = raw[:60]
- s["preview"] = text + ("..." if len(raw) > 60 else "")
- else:
- s["preview"] = ""
- return s
-
- # =========================================================================
- # Message storage
- # =========================================================================
-
- # Sentinel prefix used to distinguish JSON-encoded structured content
- # (multimodal messages: lists of parts like text + image_url) from plain
- # string content. The NUL byte is not legal in normal text, so this
- # cannot collide with real user content.
- _CONTENT_JSON_PREFIX = "\x00json:"
-
- @classmethod
- def _encode_content(cls, content: Any) -> Any:
- """Serialize structured (list/dict) message content for sqlite.
-
- sqlite3 can only bind ``str``, ``bytes``, ``int``, ``float``, and ``None``
- to query parameters. Multimodal messages have ``content`` as a list of
- parts (``[{"type": "text", ...}, {"type": "image_url", ...}]``), which
- raises ``ProgrammingError: Error binding parameter N: type 'list' is
- not supported`` when bound directly.
-
- Returns the value unchanged when it's already a safe scalar, or a
- sentinel-prefixed JSON string for lists/dicts. Paired with
- :meth:`_decode_content` on read.
- """
- if content is None or isinstance(content, (str, bytes, int, float)):
- return content
- try:
- return cls._CONTENT_JSON_PREFIX + json.dumps(content)
- except (TypeError, ValueError):
- # Last-resort fallback: stringify so persistence never fails.
- return str(content)
-
- @classmethod
- def _decode_content(cls, content: Any) -> Any:
- """Reverse :meth:`_encode_content`; returns scalars unchanged."""
- if isinstance(content, str) and content.startswith(cls._CONTENT_JSON_PREFIX):
- try:
- return json.loads(content[len(cls._CONTENT_JSON_PREFIX):])
- except (json.JSONDecodeError, TypeError):
- logger.warning(
- "Failed to decode JSON-encoded message content; "
- "returning raw string"
- )
- return content
- return content
-
- def append_message(
- self,
- session_id: str,
- role: str,
- content: str = None,
- tool_name: str = None,
- tool_calls: Any = None,
- tool_call_id: str = None,
- token_count: int = None,
- finish_reason: str = None,
- reasoning: str = None,
- reasoning_content: str = None,
- reasoning_details: Any = None,
- codex_reasoning_items: Any = None,
- codex_message_items: Any = None,
- platform_message_id: str = None,
- observed: bool = False,
- ) -> int:
- """
- Append a message to a session. Returns the message row ID.
-
- Also increments the session's message_count (and tool_call_count
- if role is 'tool' or tool_calls is present).
-
- ``platform_message_id`` is the external messaging platform's own
- message ID (e.g. Telegram update_id, Yuanbao msg_id). It is
- independent of the SQLite autoincrement primary key and is used by
- platform-specific flows like yuanbao's recall guard to redact a
- message by its platform-side identifier.
- """
- # Serialize structured fields to JSON before entering the write txn
- reasoning_details_json = (
- json.dumps(reasoning_details)
- if reasoning_details else None
- )
- codex_items_json = (
- json.dumps(codex_reasoning_items)
- if codex_reasoning_items else None
- )
- codex_message_items_json = (
- json.dumps(codex_message_items)
- if codex_message_items else None
- )
- tool_calls_json = json.dumps(tool_calls) if tool_calls else None
- # Multimodal content (list of parts) must be JSON-encoded: sqlite3
- # cannot bind list/dict parameters directly.
- stored_content = self._encode_content(content)
-
- # Pre-compute tool call count
- num_tool_calls = 0
- if tool_calls is not None:
- num_tool_calls = len(tool_calls) if isinstance(tool_calls, list) else 1
-
- def _do(conn):
- cursor = conn.execute(
- """INSERT INTO messages (session_id, role, content, tool_call_id,
- tool_calls, tool_name, timestamp, token_count, finish_reason,
- reasoning, reasoning_content, reasoning_details, codex_reasoning_items,
- codex_message_items, platform_message_id, observed)
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
- (
- session_id,
- role,
- stored_content,
- tool_call_id,
- tool_calls_json,
- tool_name,
- time.time(),
- token_count,
- finish_reason,
- reasoning,
- reasoning_content,
- reasoning_details_json,
- codex_items_json,
- codex_message_items_json,
- platform_message_id,
- 1 if observed else 0,
- ),
- )
- msg_id = cursor.lastrowid
-
- # Update counters
- if num_tool_calls > 0:
- conn.execute(
- """UPDATE sessions SET message_count = message_count + 1,
- tool_call_count = tool_call_count + ? WHERE id = ?""",
- (num_tool_calls, session_id),
- )
- else:
- conn.execute(
- "UPDATE sessions SET message_count = message_count + 1 WHERE id = ?",
- (session_id,),
- )
- return msg_id
-
- return self._execute_write(_do)
-
- def replace_messages(self, session_id: str, messages: List[Dict[str, Any]]) -> None:
- """Atomically replace every message for a session.
-
- Used by transcript-rewrite flows such as /retry, /undo, and /compress.
- The delete + reinsert sequence must commit as one transaction so a
- mid-rewrite failure does not leave SQLite with a partial transcript.
- """
-
- def _do(conn):
- conn.execute(
- "DELETE FROM messages WHERE session_id = ?", (session_id,)
- )
- conn.execute(
- "UPDATE sessions SET message_count = 0, tool_call_count = 0 WHERE id = ?",
- (session_id,),
- )
-
- now_ts = time.time()
- total_messages = 0
- total_tool_calls = 0
- for msg in messages:
- role = msg.get("role", "unknown")
- tool_calls = msg.get("tool_calls")
- reasoning_details = msg.get("reasoning_details") if role == "assistant" else None
- codex_reasoning_items = (
- msg.get("codex_reasoning_items") if role == "assistant" else None
- )
- codex_message_items = (
- msg.get("codex_message_items") if role == "assistant" else None
- )
-
- reasoning_details_json = (
- json.dumps(reasoning_details) if reasoning_details else None
- )
- codex_items_json = (
- json.dumps(codex_reasoning_items) if codex_reasoning_items else None
- )
- codex_message_items_json = (
- json.dumps(codex_message_items) if codex_message_items else None
- )
- tool_calls_json = json.dumps(tool_calls) if tool_calls else None
- # Accept either `platform_message_id` (new explicit name) or
- # `message_id` (yuanbao's existing convention on message dicts).
- platform_msg_id = (
- msg.get("platform_message_id") or msg.get("message_id")
- )
-
- conn.execute(
- """INSERT INTO messages (session_id, role, content, tool_call_id,
- tool_calls, tool_name, timestamp, token_count, finish_reason,
- reasoning, reasoning_content, reasoning_details, codex_reasoning_items,
- codex_message_items, platform_message_id, observed)
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
- (
- session_id,
- role,
- self._encode_content(msg.get("content")),
- msg.get("tool_call_id"),
- tool_calls_json,
- msg.get("tool_name"),
- now_ts,
- msg.get("token_count"),
- msg.get("finish_reason"),
- msg.get("reasoning") if role == "assistant" else None,
- msg.get("reasoning_content") if role == "assistant" else None,
- reasoning_details_json,
- codex_items_json,
- codex_message_items_json,
- platform_msg_id,
- 1 if msg.get("observed") else 0,
- ),
- )
- total_messages += 1
- if tool_calls is not None:
- total_tool_calls += (
- len(tool_calls) if isinstance(tool_calls, list) else 1
- )
- now_ts += 1e-6
-
- conn.execute(
- "UPDATE sessions SET message_count = ?, tool_call_count = ? WHERE id = ?",
- (total_messages, total_tool_calls, session_id),
- )
-
- self._execute_write(_do)
-
- def get_messages(
- self, session_id: str, include_inactive: bool = False
- ) -> List[Dict[str, Any]]:
- """Load messages for a session in insertion order.
-
- By default only active messages are returned. Pass
- ``include_inactive=True`` to load soft-deleted rows (e.g. for
- audit / debug views of rewound history). See
- :meth:`rewind_to_message` for the soft-delete mechanic.
-
- Ordered by AUTOINCREMENT id (true insertion order) rather than
- timestamp — see c03acca50 for the WSL2 clock-regression rationale.
- """
- active_clause = "" if include_inactive else " AND active = 1"
- with self._lock:
- cursor = self._conn.execute(
- "SELECT * FROM messages WHERE session_id = ?"
- f"{active_clause} ORDER BY id",
- (session_id,),
- )
- rows = cursor.fetchall()
- result = []
- for row in rows:
- msg = dict(row)
- if "content" in msg:
- msg["content"] = self._decode_content(msg["content"])
- if msg.get("tool_calls"):
- try:
- msg["tool_calls"] = json.loads(msg["tool_calls"])
- except (json.JSONDecodeError, TypeError):
- logger.warning("Failed to deserialize tool_calls in get_messages, falling back to []")
- msg["tool_calls"] = []
- result.append(msg)
- return result
-
- def get_messages_around(
- self,
- session_id: str,
- around_message_id: int,
- window: int = 5,
- ) -> Dict[str, Any]:
- """Load a window of messages anchored on a specific message id.
-
- Returns a dict with:
- - ``window``: up to ``window`` messages before the anchor, the anchor
- itself, and up to ``window`` messages after, ordered by id ascending.
- - ``messages_before``: count of messages strictly before the anchor
- still in the session (== window unless we hit the start).
- - ``messages_after``: count of messages strictly after the anchor
- still in the session (== window unless we hit the end).
-
- Used by ``session_search`` for both the discovery shape (anchored on the
- FTS5 match) and the scroll shape (anchored on any message id). The
- ``messages_before`` / ``messages_after`` counts let the caller detect
- session boundaries: when either is less than ``window``, the agent has
- reached one end of the session.
-
- Returns an empty window when ``around_message_id`` is not a real id in
- ``session_id`` — callers decide how to surface that.
- """
- if window < 0:
- window = 0
- with self._lock:
- # Confirm the anchor exists in this session.
- anchor_exists = self._conn.execute(
- "SELECT 1 FROM messages WHERE id = ? AND session_id = ? LIMIT 1",
- (around_message_id, session_id),
- ).fetchone()
- if not anchor_exists:
- return {"window": [], "messages_before": 0, "messages_after": 0}
-
- # Two queries: anchor + before (DESC, take window+1), and after
- # (ASC, take window). Final order is id ASC.
- before_rows = self._conn.execute(
- "SELECT * FROM messages "
- "WHERE session_id = ? AND id <= ? "
- "ORDER BY id DESC LIMIT ?",
- (session_id, around_message_id, window + 1),
- ).fetchall()
- after_rows = self._conn.execute(
- "SELECT * FROM messages "
- "WHERE session_id = ? AND id > ? "
- "ORDER BY id ASC LIMIT ?",
- (session_id, around_message_id, window),
- ).fetchall()
-
- # before_rows is DESC; reverse so it's ASC, then concatenate after_rows.
- rows = list(reversed(before_rows)) + list(after_rows)
- result = []
- for row in rows:
- msg = dict(row)
- if "content" in msg:
- msg["content"] = self._decode_content(msg["content"])
- if msg.get("tool_calls"):
- try:
- msg["tool_calls"] = json.loads(msg["tool_calls"])
- except (json.JSONDecodeError, TypeError):
- logger.warning(
- "Failed to deserialize tool_calls in get_messages_around, falling back to []"
- )
- msg["tool_calls"] = []
- result.append(msg)
-
- # before_rows includes the anchor itself; subtract 1 for the count of
- # messages strictly before the anchor in the returned slice.
- messages_before = max(0, len(before_rows) - 1)
- messages_after = len(after_rows)
- return {
- "window": result,
- "messages_before": messages_before,
- "messages_after": messages_after,
- }
-
- def get_anchored_view(
- self,
- session_id: str,
- around_message_id: int,
- window: int = 5,
- bookend: int = 3,
- keep_roles: Optional[Tuple[str, ...]] = ("user", "assistant"),
- ) -> Dict[str, Any]:
- """Return an anchored window plus session bookends.
-
- Built on top of ``get_messages_around``. Three slices:
-
- - ``window``: messages immediately surrounding the anchor. Filtered
- to ``keep_roles`` (tool-response noise dropped by default), EXCEPT
- the anchor itself is always preserved regardless of role.
- - ``bookend_start``: first ``bookend`` user/assistant messages of the
- session — but only those whose id is strictly before the window's
- first message id. Empty when the window already overlaps the
- session head. Empty-content messages (tool-call-only assistant
- turns) are skipped so they don't crowd out actual prose openings.
- - ``bookend_end``: last ``bookend`` user/assistant messages of the
- session, same non-overlap rule at the tail.
-
- Bookends let an FTS5 hit anywhere in a long session yield the goal
- (opening) and the resolution (closing) on a single call — without
- loading the whole transcript.
-
- Returns ``{"window": [], "messages_before": 0, "messages_after": 0,
- "bookend_start": [], "bookend_end": []}`` when the anchor isn't in
- the session.
-
- ``keep_roles=None`` disables role filtering (raw window + raw
- bookends).
- """
- if bookend < 0:
- bookend = 0
-
- # Reuse the primitive — handles anchor-existence, content decoding,
- # tool_calls deserialisation, and boundary counts.
- primitive = self.get_messages_around(
- session_id, around_message_id, window=window
- )
- window_rows = primitive["window"]
- if not window_rows:
- return {
- "window": [],
- "messages_before": 0,
- "messages_after": 0,
- "bookend_start": [],
- "bookend_end": [],
- }
-
- # Apply role filter to the window, but never drop the anchor itself.
- if keep_roles is not None:
- keep_set = set(keep_roles)
- filtered_window = [
- m for m in window_rows
- if m.get("id") == around_message_id or m.get("role") in keep_set
- ]
- else:
- filtered_window = window_rows
-
- window_min_id = window_rows[0]["id"]
- window_max_id = window_rows[-1]["id"]
-
- # Fetch bookends only when there's room outside the window. SQL filters
- # by id range, role, and non-empty content — tool-call-only assistant
- # turns (content='' with tool_calls populated) are excluded so they
- # don't crowd out actual prose openings/closings.
- bookend_start_rows: List[Any] = []
- bookend_end_rows: List[Any] = []
- if bookend > 0:
- with self._lock:
- role_clause = ""
- role_params: list = []
- if keep_roles is not None:
- role_placeholders = ",".join("?" for _ in keep_roles)
- role_clause = f" AND role IN ({role_placeholders})"
- role_params = list(keep_roles)
-
- bookend_start_rows = self._conn.execute(
- f"SELECT * FROM messages "
- f"WHERE session_id = ? AND id < ?{role_clause} "
- f"AND length(content) > 0 "
- f"ORDER BY id ASC LIMIT ?",
- (session_id, window_min_id, *role_params, bookend),
- ).fetchall()
-
- bookend_end_rows = self._conn.execute(
- f"SELECT * FROM messages "
- f"WHERE session_id = ? AND id > ?{role_clause} "
- f"AND length(content) > 0 "
- f"ORDER BY id DESC LIMIT ?",
- (session_id, window_max_id, *role_params, bookend),
- ).fetchall()
- # End rows came back DESC for the LIMIT cap; flip to ASC.
- bookend_end_rows = list(reversed(bookend_end_rows))
-
- def _hydrate(row) -> Dict[str, Any]:
- msg = dict(row)
- if "content" in msg:
- msg["content"] = self._decode_content(msg["content"])
- if msg.get("tool_calls"):
- try:
- msg["tool_calls"] = json.loads(msg["tool_calls"])
- except (json.JSONDecodeError, TypeError):
- logger.warning(
- "Failed to deserialize tool_calls in get_anchored_view, falling back to []"
- )
- msg["tool_calls"] = []
- return msg
-
- return {
- "window": filtered_window,
- "messages_before": primitive["messages_before"],
- "messages_after": primitive["messages_after"],
- "bookend_start": [_hydrate(r) for r in bookend_start_rows],
- "bookend_end": [_hydrate(r) for r in bookend_end_rows],
- }
-
- def resolve_resume_session_id(self, session_id: str) -> str:
- """Redirect a resume target to the descendant session that holds the messages.
-
- Context compression ends the current session and forks a new child session
- (linked via ``parent_session_id``). The flush cursor is reset, so the
- child is where new messages actually land — the parent ends up with
- ``message_count = 0`` rows unless messages had already been flushed to
- it before compression. See #15000.
-
- This helper walks ``parent_session_id`` forward from ``session_id`` and
- returns the first descendant in the chain that has at least one message
- row. If the original session already has messages, or no descendant
- has any, the original ``session_id`` is returned unchanged.
-
- The chain is always walked via the child whose ``started_at`` is
- latest; that matches the single-chain shape that compression creates.
- A depth cap (32) guards against accidental loops in malformed data.
- """
- if not session_id:
- return session_id
-
- with self._lock:
- # If this session already has messages, nothing to redirect.
- try:
- row = self._conn.execute(
- "SELECT 1 FROM messages WHERE session_id = ? LIMIT 1",
- (session_id,),
- ).fetchone()
- except Exception:
- return session_id
- if row is not None:
- return session_id
-
- # Walk descendants: at each step, pick the most-recently-started
- # child session; stop once we find one with messages.
- current = session_id
- seen = {current}
- for _ in range(32):
- try:
- child_row = self._conn.execute(
- "SELECT id FROM sessions "
- "WHERE parent_session_id = ? "
- "ORDER BY started_at DESC, id DESC LIMIT 1",
- (current,),
- ).fetchone()
- except Exception:
- return session_id
- if child_row is None:
- return session_id
- child_id = child_row["id"] if hasattr(child_row, "keys") else child_row[0]
- if not child_id or child_id in seen:
- return session_id
- seen.add(child_id)
- try:
- msg_row = self._conn.execute(
- "SELECT 1 FROM messages WHERE session_id = ? LIMIT 1",
- (child_id,),
- ).fetchone()
- except Exception:
- return session_id
- if msg_row is not None:
- return child_id
- current = child_id
- return session_id
-
- def get_messages_as_conversation(
- self,
- session_id: str,
- include_ancestors: bool = False,
- include_inactive: bool = False,
- ) -> List[Dict[str, Any]]:
- """
- Load messages in the OpenAI conversation format (role + content dicts).
- Used by the gateway to restore conversation history.
-
- By default only active messages are returned. Pass
- ``include_inactive=True`` to load soft-deleted (rewound) rows
- as well. See :meth:`rewind_to_message`.
- """
- session_ids = [session_id]
- if include_ancestors:
- session_ids = self._session_lineage_root_to_tip(session_id)
-
- active_clause = "" if include_inactive else " AND active = 1"
- with self._lock:
- placeholders = ",".join("?" for _ in session_ids)
- # 只取最近200条,不压缩不丢内容
- rows = self._conn.execute(
- "SELECT role, content, tool_call_id, tool_calls, tool_name, "
- "finish_reason, reasoning, reasoning_content, reasoning_details, "
- "codex_reasoning_items, codex_message_items, platform_message_id, observed "
- f"FROM ("
- f"SELECT id, role, content, tool_call_id, tool_calls, tool_name, "
- f"finish_reason, reasoning, reasoning_content, reasoning_details, "
- f"codex_reasoning_items, codex_message_items, platform_message_id, observed "
- f"FROM messages WHERE session_id IN ({placeholders})"
- f"{active_clause} ORDER BY id DESC LIMIT 200"
- f") ORDER BY id ASC",
- tuple(session_ids),
- ).fetchall()
-
- messages = []
- for row in rows:
- content = self._decode_content(row["content"])
- if row["role"] in {"user", "assistant"} and isinstance(content, str):
- content = sanitize_context(content).strip()
- msg = {"role": row["role"], "content": content}
- if row["tool_call_id"]:
- msg["tool_call_id"] = row["tool_call_id"]
- if row["tool_name"]:
- msg["tool_name"] = row["tool_name"]
- if row["tool_calls"]:
- try:
- msg["tool_calls"] = json.loads(row["tool_calls"])
- except (json.JSONDecodeError, TypeError):
- logger.warning("Failed to deserialize tool_calls in conversation replay, falling back to []")
- msg["tool_calls"] = []
- # Surface the platform-side message id (e.g. yuanbao msg_id,
- # telegram update_id) so platform-specific flows like recall
- # can match by external identifier instead of having to fall
- # back to content-match heuristics. Exposed as ``message_id``
- # for backward compatibility with the JSONL transcript shape.
- if row["platform_message_id"]:
- msg["message_id"] = row["platform_message_id"]
- if row["observed"]:
- msg["observed"] = True
- # Restore reasoning fields on assistant messages so providers
- # that replay reasoning (OpenRouter, OpenAI, Nous) receive
- # coherent multi-turn reasoning context.
- if row["role"] == "assistant":
- if row["finish_reason"]:
- msg["finish_reason"] = row["finish_reason"]
- if row["reasoning"]:
- msg["reasoning"] = row["reasoning"]
- if row["reasoning_content"] is not None:
- msg["reasoning_content"] = row["reasoning_content"]
- if row["reasoning_details"]:
- try:
- msg["reasoning_details"] = json.loads(row["reasoning_details"])
- except (json.JSONDecodeError, TypeError):
- logger.warning("Failed to deserialize reasoning_details, falling back to None")
- msg["reasoning_details"] = None
- if row["codex_reasoning_items"]:
- try:
- msg["codex_reasoning_items"] = json.loads(row["codex_reasoning_items"])
- except (json.JSONDecodeError, TypeError):
- logger.warning("Failed to deserialize codex_reasoning_items, falling back to None")
- msg["codex_reasoning_items"] = None
- if row["codex_message_items"]:
- try:
- msg["codex_message_items"] = json.loads(row["codex_message_items"])
- except (json.JSONDecodeError, TypeError):
- logger.warning("Failed to deserialize codex_message_items, falling back to None")
- msg["codex_message_items"] = None
- if include_ancestors and self._is_duplicate_replayed_user_message(messages, msg):
- continue
- messages.append(msg)
- return messages
-
- def _session_lineage_root_to_tip(self, session_id: str) -> List[str]:
- if not session_id:
- return [session_id]
-
- chain = []
- current = session_id
- seen = set()
- with self._lock:
- for _ in range(100):
- if not current or current in seen:
- break
- seen.add(current)
- chain.append(current)
- row = self._conn.execute(
- "SELECT parent_session_id FROM sessions WHERE id = ?",
- (current,),
- ).fetchone()
- if row is None:
- break
- current = row["parent_session_id"] if hasattr(row, "keys") else row[0]
- return list(reversed(chain)) or [session_id]
-
- @staticmethod
- def _is_duplicate_replayed_user_message(messages: List[Dict[str, Any]], msg: Dict[str, Any]) -> bool:
- if msg.get("role") != "user":
- return False
- content = msg.get("content")
- if not isinstance(content, str) or not content:
- return False
- for prev in reversed(messages):
- if prev.get("role") == "user" and prev.get("content") == content:
- return True
- if prev.get("role") == "assistant" and (prev.get("content") or prev.get("tool_calls")):
- return False
- return False
-
- # =========================================================================
- # Rewind (soft-delete) — see /rewind slash command + issue #21910
- # =========================================================================
-
- def rewind_to_message(
- self, session_id: str, target_message_id: int
- ) -> Dict[str, Any]:
- """Soft-delete all messages with id >= ``target_message_id`` in *session_id*.
-
- The target message itself becomes inactive as well so the caller
- can pre-fill it as the next user prompt without it appearing
- twice in the replayed transcript. Rewound rows are kept on
- disk with ``active=0`` for audit / forensic inspection — use
- :meth:`get_messages` with ``include_inactive=True`` to see them.
-
- Returns a dict::
-
- {
- "rewound_count": int, # number of rows newly flipped to active=0
- "target_message": dict, # full row dict of the target
- "new_head_id": int|None # id of the last still-active row, or None
- }
-
- Raises ``ValueError`` if the target message does not exist in
- *session_id* or if its role is not ``"user"``.
-
- Always increments ``sessions.rewind_count`` — even when the
- target is already inactive — so the counter accurately reflects
- the number of rewind operations performed against the session.
- Idempotent on the ``active`` flag: re-rewinding past the same
- target is a no-op on row state but still bumps the counter.
- """
-
- # 1) Validate target up-front (read-only, outside the write txn).
- with self._lock:
- row = self._conn.execute(
- "SELECT * FROM messages WHERE id = ? AND session_id = ?",
- (target_message_id, session_id),
- ).fetchone()
- if row is None:
- raise ValueError(
- f"message {target_message_id} not found in session {session_id}"
- )
- target_row = dict(row)
- if target_row.get("role") != "user":
- raise ValueError(
- f"rewind target must be a 'user' message (got role="
- f"{target_row.get('role')!r}, id={target_message_id})"
- )
-
- # Decode content for callers (prefill the prompt buffer).
- target_row["content"] = self._decode_content(target_row.get("content"))
-
- rewound: List[int] = []
-
- def _do(conn):
- cursor = conn.execute(
- "SELECT id FROM messages "
- "WHERE session_id = ? AND id >= ? AND active = 1",
- (session_id, target_message_id),
- )
- ids = [r[0] for r in cursor.fetchall()]
- if ids:
- placeholders = ",".join("?" for _ in ids)
- conn.execute(
- f"UPDATE messages SET active = 0 WHERE id IN ({placeholders})",
- ids,
- )
- conn.execute(
- "UPDATE sessions SET rewind_count = COALESCE(rewind_count, 0) + 1 "
- "WHERE id = ?",
- (session_id,),
- )
- return ids
-
- rewound = self._execute_write(_do)
-
- # 2) Compute new head id (largest still-active row id in session).
- with self._lock:
- head_row = self._conn.execute(
- "SELECT MAX(id) FROM messages WHERE session_id = ? AND active = 1",
- (session_id,),
- ).fetchone()
- new_head_id = head_row[0] if head_row and head_row[0] is not None else None
-
- return {
- "rewound_count": len(rewound),
- "target_message": target_row,
- "new_head_id": new_head_id,
- }
-
- def restore_rewound(self, session_id: str, since_message_id: int) -> int:
- """Mark inactive messages with id >= *since_message_id* active again.
-
- Returns the number of rows flipped back to ``active=1``.
- Intended for undo-of-rewind and test cleanup; not wired to a
- slash command in v1.
- """
- def _do(conn):
- cursor = conn.execute(
- "SELECT id FROM messages "
- "WHERE session_id = ? AND id >= ? AND active = 0",
- (session_id, since_message_id),
- )
- ids = [r[0] for r in cursor.fetchall()]
- if ids:
- placeholders = ",".join("?" for _ in ids)
- conn.execute(
- f"UPDATE messages SET active = 1 WHERE id IN ({placeholders})",
- ids,
- )
- return len(ids)
-
- return self._execute_write(_do)
-
- def list_recent_user_messages(
- self,
- session_id: str,
- limit: int = 20,
- include_inactive: bool = False,
- ) -> List[Dict[str, Any]]:
- """Return the *limit* most-recent user messages, newest first.
-
- Each entry is a dict with keys ``id``, ``timestamp``, ``preview``.
- ``preview`` is the first 80 characters of the message content
- (with line breaks collapsed to spaces). Used by the /rewind
- slash command picker.
-
- By default only active messages are returned.
- """
- active_clause = "" if include_inactive else " AND active = 1"
- with self._lock:
- cursor = self._conn.execute(
- "SELECT id, timestamp, content FROM messages "
- "WHERE session_id = ? AND role = 'user'"
- f"{active_clause} "
- "ORDER BY id DESC LIMIT ?",
- (session_id, int(limit)),
- )
- rows = cursor.fetchall()
-
- result: List[Dict[str, Any]] = []
- for row in rows:
- decoded = self._decode_content(row["content"])
- if isinstance(decoded, list):
- # Multimodal — flatten text parts.
- text_parts = [
- p.get("text", "") for p in decoded
- if isinstance(p, dict) and p.get("type") == "text"
- ]
- preview = " ".join(t for t in text_parts if t).strip()
- if not preview:
- preview = "[multimodal content]"
- elif isinstance(decoded, str):
- preview = decoded
- else:
- preview = ""
- preview = " ".join(preview.split()) # collapse whitespace
- if len(preview) > 80:
- preview = preview[:77] + "..."
- result.append(
- {
- "id": row["id"],
- "timestamp": row["timestamp"],
- "preview": preview,
- }
- )
- return result
-
- # =========================================================================
- # Search
- # =========================================================================
-
- @staticmethod
- def _sanitize_fts5_query(query: str) -> str:
- """Sanitize user input for safe use in FTS5 MATCH queries.
-
- FTS5 has its own query syntax where characters like ``"``, ``(``, ``)``,
- ``+``, ``*``, ``{``, ``}``, the column-filter operator ``:`` and bare
- boolean operators (``AND``, ``OR``, ``NOT``) have special meaning.
- Passing raw user input directly to MATCH can cause
- ``sqlite3.OperationalError``.
-
- Strategy:
- - Preserve properly paired quoted phrases (``"exact phrase"``)
- - Strip unmatched FTS5-special characters that would cause errors
- - Wrap unquoted hyphenated and dotted terms in quotes so FTS5
- matches them as exact phrases instead of splitting on the
- hyphen/dot (e.g. ``chat-send``, ``P2.2``, ``my-app.config.ts``)
- """
- # Step 1: Extract balanced double-quoted phrases and protect them
- # from further processing via numbered placeholders.
- _quoted_parts: list = []
-
- def _preserve_quoted(m: re.Match) -> str:
- _quoted_parts.append(m.group(0))
- return f"\x00Q{len(_quoted_parts) - 1}\x00"
-
- sanitized = re.sub(r'"[^"]*"', _preserve_quoted, query)
-
- # Step 2: Strip remaining (unmatched) FTS5-special characters. ``:`` is
- # FTS5's column-filter operator (``col:term``); since the FTS table has a
- # single ``content`` column, an unquoted colon query like ``TODO: fix``
- # parses as ``column:term`` and raises "no such column" — swallowed at
- # the execute site into zero results. Strip it like the others.
- sanitized = re.sub(r'[+{}():\"^]', " ", sanitized)
-
- # Step 3: Collapse repeated * (e.g. "***") into a single one,
- # and remove leading * (prefix-only needs at least one char before *)
- sanitized = re.sub(r"\*+", "*", sanitized)
- sanitized = re.sub(r"(^|\s)\*", r"\1", sanitized)
-
- # Step 4: Remove dangling boolean operators at start/end that would
- # cause syntax errors (e.g. "hello AND" or "OR world")
- sanitized = re.sub(r"(?i)^(AND|OR|NOT)\b\s*", "", sanitized.strip())
- sanitized = re.sub(r"(?i)\s+(AND|OR|NOT)\s*$", "", sanitized.strip())
-
- # Step 5: Wrap unquoted dotted and/or hyphenated terms in double
- # quotes. FTS5's tokenizer splits on dots and hyphens, turning
- # ``chat-send`` into ``chat AND send`` and ``P2.2`` into ``p2 AND 2``.
- # Quoting preserves phrase semantics. A single pass avoids the
- # double-quoting bug that would occur if dotted, hyphenated and underscored
- # patterns were applied sequentially (e.g. ``my-app.config``).
- sanitized = re.sub(r"\b(\w+(?:[._-]\w+)+)\b", r'"\1"', sanitized)
-
- # Step 6: Restore preserved quoted phrases
- for i, quoted in enumerate(_quoted_parts):
- sanitized = sanitized.replace(f"\x00Q{i}\x00", quoted)
-
- return sanitized.strip()
-
-
- @staticmethod
- def _is_cjk_codepoint(cp: int) -> bool:
- return (0x4E00 <= cp <= 0x9FFF or # CJK Unified Ideographs
- 0x3400 <= cp <= 0x4DBF or # CJK Extension A
- 0x20000 <= cp <= 0x2A6DF or # CJK Extension B
- 0x3000 <= cp <= 0x303F or # CJK Symbols
- 0x3040 <= cp <= 0x309F or # Hiragana
- 0x30A0 <= cp <= 0x30FF or # Katakana
- 0xAC00 <= cp <= 0xD7AF) # Hangul Syllables
-
- @staticmethod
- def _contains_cjk(text: str) -> bool:
- """Check if text contains CJK (Chinese, Japanese, Korean) characters."""
- for ch in text:
- cp = ord(ch)
- if (0x4E00 <= cp <= 0x9FFF or # CJK Unified Ideographs
- 0x3400 <= cp <= 0x4DBF or # CJK Extension A
- 0x20000 <= cp <= 0x2A6DF or # CJK Extension B
- 0x3000 <= cp <= 0x303F or # CJK Symbols
- 0x3040 <= cp <= 0x309F or # Hiragana
- 0x30A0 <= cp <= 0x30FF or # Katakana
- 0xAC00 <= cp <= 0xD7AF): # Hangul Syllables
- return True
- return False
-
- @classmethod
- def _count_cjk(cls, text: str) -> int:
- """Count CJK characters in text."""
- return sum(1 for ch in text if cls._is_cjk_codepoint(ord(ch)))
-
- def search_messages(
- self,
- query: str,
- source_filter: List[str] = None,
- exclude_sources: List[str] = None,
- role_filter: List[str] = None,
- limit: int = 20,
- offset: int = 0,
- sort: str = None,
- include_inactive: bool = False,
- ) -> List[Dict[str, Any]]:
- """
- Full-text search across session messages using FTS5.
-
- Supports FTS5 query syntax:
- - Simple keywords: "docker deployment"
- - Phrases: '"exact phrase"'
- - Boolean: "docker OR kubernetes", "python NOT java"
- - Prefix: "deploy*"
-
- Returns matching messages with session metadata, content snippet,
- and surrounding context (1 message before and after the match).
-
- ``sort`` controls temporal ordering:
- - ``None`` (default): FTS5 BM25 relevance only. Time-neutral.
- - ``"newest"``: order by message timestamp DESC, then by rank.
- - ``"oldest"``: order by message timestamp ASC, then by rank.
-
- The short-CJK LIKE fallback already orders by timestamp DESC and
- ignores ``sort``. The trigram CJK path honours ``sort`` like the main
- FTS5 path.
-
- Rewound (``active=0``) rows are excluded by default. Pass
- ``include_inactive=True`` to search every row.
- """
- if not self._fts_enabled:
- return []
-
- if not query or not query.strip():
- return []
-
- query = self._sanitize_fts5_query(query)
- if not query:
- return []
-
- # Normalise sort. Anything not in the allowed set falls back to None
- # (FTS5 rank-only) so callers can pass through user input without
- # validation.
- if isinstance(sort, str):
- sort_norm = sort.strip().lower()
- if sort_norm not in ("newest", "oldest"):
- sort_norm = None
- else:
- sort_norm = None
-
- # ORDER BY shared across the main FTS5 path and trigram CJK path.
- # With sort set, timestamp is primary and rank is the tiebreaker.
- if sort_norm == "newest":
- order_by_sql = "ORDER BY m.timestamp DESC, rank"
- elif sort_norm == "oldest":
- order_by_sql = "ORDER BY m.timestamp ASC, rank"
- else:
- order_by_sql = "ORDER BY rank"
-
- # Build WHERE clauses dynamically
- where_clauses = ["messages_fts MATCH ?"]
- params: list = [query]
- if not include_inactive:
- where_clauses.append("m.active = 1")
-
- if source_filter is not None:
- source_placeholders = ",".join("?" for _ in source_filter)
- where_clauses.append(f"s.source IN ({source_placeholders})")
- params.extend(source_filter)
-
- if exclude_sources is not None:
- exclude_placeholders = ",".join("?" for _ in exclude_sources)
- where_clauses.append(f"s.source NOT IN ({exclude_placeholders})")
- params.extend(exclude_sources)
-
- if role_filter:
- role_placeholders = ",".join("?" for _ in role_filter)
- where_clauses.append(f"m.role IN ({role_placeholders})")
- params.extend(role_filter)
-
- where_sql = " AND ".join(where_clauses)
- params.extend([limit, offset])
-
- sql = f"""
- SELECT
- m.id,
- m.session_id,
- m.role,
- snippet(messages_fts, 0, '>>>', '<<<', '...', 40) AS snippet,
- m.content,
- m.timestamp,
- m.tool_name,
- s.source,
- s.model,
- s.started_at AS session_started
- FROM messages_fts
- JOIN messages m ON m.id = messages_fts.rowid
- JOIN sessions s ON s.id = m.session_id
- WHERE {where_sql}
- {order_by_sql}
- LIMIT ? OFFSET ?
- """
-
- # CJK queries bypass the unicode61 FTS5 table. The default tokenizer
- # splits CJK characters into individual tokens, so "大别山项目" becomes
- # "大 AND 别 AND 山 AND 项 AND 目" — producing false positives and
- # missing exact phrase matches.
- #
- # For queries with 3+ CJK characters, we use the trigram FTS5 table
- # (indexed substring matching with ranking and snippets). For shorter
- # CJK queries (1-2 chars), trigram can't match (it needs ≥9 UTF-8
- # bytes = 3 CJK chars), so we fall back to LIKE.
- is_cjk = self._contains_cjk(query)
- if is_cjk:
- raw_query = query.strip('"').strip()
- cjk_count = self._count_cjk(raw_query)
-
- # Per-token CJK length check (#20494): trigram needs >=3 CJK chars
- # per token. A query like "广西 OR 桂林 OR 漓江" has cjk_count=6
- # (>=3) but each individual token is only 2 chars — trigram returns 0.
- # Route to LIKE when any non-operator CJK token is <3 CJK chars.
- _tokens_for_check = [
- t for t in raw_query.split()
- if t.upper() not in {"AND", "OR", "NOT"} and self._contains_cjk(t)
- ]
- _any_short_cjk = any(
- self._count_cjk(t) < 3 for t in _tokens_for_check
- )
-
- if cjk_count >= 3 and not _any_short_cjk:
- # Trigram FTS5 path — quote each non-operator token to handle
- # FTS5 special chars (%, *, etc.) while preserving boolean
- # operators (AND, OR, NOT) for multi-term queries.
- tokens = raw_query.split()
- parts = []
- for tok in tokens:
- if tok.upper() in {"AND", "OR", "NOT"}:
- parts.append(tok)
- else:
- parts.append('"' + tok.replace('"', '""') + '"')
- trigram_query = " ".join(parts)
- tri_where = ["messages_fts_trigram MATCH ?"]
- tri_params: list = [trigram_query]
- if not include_inactive:
- tri_where.append("m.active = 1")
- if source_filter is not None:
- tri_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
- tri_params.extend(source_filter)
- if exclude_sources is not None:
- tri_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})")
- tri_params.extend(exclude_sources)
- if role_filter:
- tri_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
- tri_params.extend(role_filter)
- tri_sql = f"""
- SELECT
- m.id,
- m.session_id,
- m.role,
- snippet(messages_fts_trigram, 0, '>>>', '<<<', '...', 40) AS snippet,
- m.content,
- m.timestamp,
- m.tool_name,
- s.source,
- s.model,
- s.started_at AS session_started
- FROM messages_fts_trigram
- JOIN messages m ON m.id = messages_fts_trigram.rowid
- JOIN sessions s ON s.id = m.session_id
- WHERE {' AND '.join(tri_where)}
- {order_by_sql}
- LIMIT ? OFFSET ?
- """
- tri_params.extend([limit, offset])
- with self._lock:
- try:
- tri_cursor = self._conn.execute(tri_sql, tri_params)
- except sqlite3.OperationalError:
- matches = []
- else:
- matches = [dict(row) for row in tri_cursor.fetchall()]
- else:
- # Short / mixed CJK query: trigram cannot match tokens with
- # <3 CJK chars. Fall back to LIKE substring search.
- # For multi-token OR queries (e.g. "广西 OR 桂林 OR 漓江"),
- # build one LIKE condition per non-operator token so each term
- # is matched independently (#20494).
- non_op_tokens = [
- t for t in raw_query.split()
- if t.upper() not in {"AND", "OR", "NOT"}
- ] or [raw_query]
- token_clauses = []
- like_params: list = []
- for tok in non_op_tokens:
- esc = tok.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
- token_clauses.append(
- "(m.content LIKE ? ESCAPE '\\' OR m.tool_name LIKE ? ESCAPE '\\' OR m.tool_calls LIKE ? ESCAPE '\\')"
- )
- like_params += [f"%{esc}%", f"%{esc}%", f"%{esc}%"]
- like_where = [f"({' OR '.join(token_clauses)})"]
- if source_filter is not None:
- like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
- like_params.extend(source_filter)
- if exclude_sources is not None:
- like_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})")
- like_params.extend(exclude_sources)
- if role_filter:
- like_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
- like_params.extend(role_filter)
- like_sql = f"""
- SELECT m.id, m.session_id, m.role,
- substr(m.content,
- max(1, instr(m.content, ?) - 40),
- 120) AS snippet,
- m.content, m.timestamp, m.tool_name,
- s.source, s.model, s.started_at AS session_started
- FROM messages m
- JOIN sessions s ON s.id = m.session_id
- WHERE {' AND '.join(like_where)}
- ORDER BY m.timestamp DESC
- LIMIT ? OFFSET ?
- """
- like_params.extend([limit, offset])
- # instr() for snippet uses first search token
- like_params = [non_op_tokens[0]] + like_params
- with self._lock:
- like_cursor = self._conn.execute(like_sql, like_params)
- matches = [dict(row) for row in like_cursor.fetchall()]
- else:
- with self._lock:
- try:
- cursor = self._conn.execute(sql, params)
- except sqlite3.OperationalError:
- # FTS5 query syntax error despite sanitization — return empty
- return []
- else:
- matches = [dict(row) for row in cursor.fetchall()]
-
- # Add surrounding context (1 message before + after each match).
- # Done outside the lock so we don't hold it across N sequential queries.
- for match in matches:
- try:
- with self._lock:
- ctx_cursor = self._conn.execute(
- """WITH target AS (
- SELECT session_id, timestamp, id
- FROM messages
- WHERE id = ?
- )
- SELECT role, content
- FROM (
- SELECT m.id, m.timestamp, m.role, m.content
- FROM messages m
- JOIN target t ON t.session_id = m.session_id
- WHERE (m.timestamp < t.timestamp)
- OR (m.timestamp = t.timestamp AND m.id < t.id)
- ORDER BY m.timestamp DESC, m.id DESC
- LIMIT 1
- )
- UNION ALL
- SELECT role, content
- FROM messages
- WHERE id = ?
- UNION ALL
- SELECT role, content
- FROM (
- SELECT m.id, m.timestamp, m.role, m.content
- FROM messages m
- JOIN target t ON t.session_id = m.session_id
- WHERE (m.timestamp > t.timestamp)
- OR (m.timestamp = t.timestamp AND m.id > t.id)
- ORDER BY m.timestamp ASC, m.id ASC
- LIMIT 1
- )""",
- (match["id"], match["id"]),
- )
- context_msgs = []
- for r in ctx_cursor.fetchall():
- raw = r["content"]
- decoded = self._decode_content(raw)
- # Multimodal context: render a compact text-only
- # summary for search previews.
- if isinstance(decoded, list):
- text_parts = [
- p.get("text", "") for p in decoded
- if isinstance(p, dict) and p.get("type") == "text"
- ]
- text = " ".join(t for t in text_parts if t).strip()
- preview = text or "[multimodal content]"
- elif isinstance(decoded, str):
- preview = decoded
- else:
- preview = ""
- context_msgs.append(
- {"role": r["role"], "content": preview[:200]}
- )
- match["context"] = context_msgs
- except Exception:
- match["context"] = []
-
- # Remove full content from result (snippet is enough, saves tokens)
- for match in matches:
- match.pop("content", None)
-
- return matches
-
- def search_sessions_by_id(
- self,
- query: str,
- limit: int = 20,
- include_archived: bool = True,
- ) -> List[Dict[str, Any]]:
- """Search surfaced sessions by exact/prefix/substring session id.
-
- Desktop search uses this alongside FTS message search so users can paste
- a session id from logs, CLI output, or another Hermes surface and jump
- straight to that conversation. Matching also checks ``_lineage_root_id``
- for projected compression-chain tips, so an old root id still resolves to
- the live continuation row.
- """
- needle = (query or "").strip().lower()
- if not needle or limit <= 0:
- return []
-
- # SQL-bounded: list_sessions_rich pushes the id LIKE filter into the
- # query (matching the row's own id AND any id in its forward
- # compression chain), so we only materialize matching rows instead of
- # scanning every session. Fetch a small multiple of `limit` so the
- # in-Python exact/prefix/substring ranking below has enough candidates
- # to order, then truncate.
- candidates = self.list_sessions_rich(
- limit=max(limit * 4, limit),
- offset=0,
- include_archived=include_archived,
- order_by_last_active=True,
- id_query=needle,
- )
-
- def score(row: Dict[str, Any]) -> int:
- ids = [str(row.get("id") or ""), str(row.get("_lineage_root_id") or "")]
- normalized = [value.lower() for value in ids if value]
- if any(value == needle for value in normalized):
- return 0
- if any(value.startswith(needle) for value in normalized):
- return 1
- return 2
-
- ranked = sorted(
- enumerate(candidates),
- key=lambda item: (score(item[1]), item[0]),
- )
- return [row for _, row in ranked[:limit]]
-
- def search_sessions(
- self,
- source: str = None,
- limit: int = 20,
- offset: int = 0,
- ) -> List[Dict[str, Any]]:
- """List sessions, optionally filtered by source.
-
- Returns rows enriched with a computed ``last_active`` column (latest
- message timestamp for the session, falling back to ``started_at``),
- ordered by most-recently-used first.
- """
- select_with_last_active = (
- "SELECT s.*, COALESCE(m.last_active, s.started_at) AS last_active "
- "FROM sessions s "
- "LEFT JOIN ("
- "SELECT session_id, MAX(timestamp) AS last_active "
- "FROM messages GROUP BY session_id"
- ") m ON m.session_id = s.id "
- )
- with self._lock:
- if source:
- cursor = self._conn.execute(
- f"{select_with_last_active}"
- "WHERE s.source = ? "
- "ORDER BY last_active DESC, s.started_at DESC, s.id DESC LIMIT ? OFFSET ?",
- (source, limit, offset),
- )
- else:
- cursor = self._conn.execute(
- f"{select_with_last_active}"
- "ORDER BY last_active DESC, s.started_at DESC, s.id DESC LIMIT ? OFFSET ?",
- (limit, offset),
- )
- return [dict(row) for row in cursor.fetchall()]
-
- # =========================================================================
- # Utility
- # =========================================================================
-
- def session_count(
- self,
- source: str = None,
- min_message_count: int = 0,
- include_archived: bool = False,
- archived_only: bool = False,
- exclude_children: bool = False,
- exclude_sources: List[str] = None,
- ) -> int:
- """Count sessions, optionally filtered by source.
-
- Pass ``exclude_children=True`` to count only the conversations that
- ``list_sessions_rich`` surfaces (root + branch sessions), hiding
- sub-agent runs and compression continuations. Use it whenever the count
- is paired with a ``list_sessions_rich`` page (e.g. sidebar "load more"
- totals) so the total matches the number of listable rows — otherwise the
- raw row count is inflated by children and "load more" never settles.
-
- Pass ``exclude_sources`` to drop whole source classes from the count
- (e.g. ``["cron"]`` so the recents "load more" total matches a
- cron-excluded ``list_sessions_rich`` page and doesn't keep "load more"
- stuck on for buried scheduler sessions).
- """
- where_clauses = []
- params = []
-
- if exclude_children:
- # Mirror list_sessions_rich's child-exclusion clause exactly so the
- # count lines up with the rows: roots (no parent) plus branch
- # children (parent ended with end_reason='branched').
- where_clauses.append(
- "(s.parent_session_id IS NULL"
- " OR EXISTS (SELECT 1 FROM sessions p"
- " WHERE p.id = s.parent_session_id"
- " AND p.end_reason = 'branched'"
- " AND s.started_at >= p.ended_at))"
- )
- if source:
- where_clauses.append("s.source = ?")
- params.append(source)
- if exclude_sources:
- placeholders = ",".join("?" for _ in exclude_sources)
- where_clauses.append(f"s.source NOT IN ({placeholders})")
- params.extend(exclude_sources)
- if min_message_count > 0:
- where_clauses.append("s.message_count >= ?")
- params.append(min_message_count)
- if archived_only:
- where_clauses.append("s.archived = 1")
- elif not include_archived:
- where_clauses.append("s.archived = 0")
-
- where_sql = f" WHERE {' AND '.join(where_clauses)}" if where_clauses else ""
-
- with self._lock:
- cursor = self._conn.execute(f"SELECT COUNT(*) FROM sessions s{where_sql}", params)
- return cursor.fetchone()[0]
-
- def message_count(self, session_id: str = None) -> int:
- """Count messages, optionally for a specific session."""
- with self._lock:
- if session_id:
- cursor = self._conn.execute(
- "SELECT COUNT(*) FROM messages WHERE session_id = ?", (session_id,)
- )
- else:
- cursor = self._conn.execute("SELECT COUNT(*) FROM messages")
- return cursor.fetchone()[0]
-
- # =========================================================================
- # Export and cleanup
- # =========================================================================
-
- def export_session(self, session_id: str) -> Optional[Dict[str, Any]]:
- """Export a single session with all its messages as a dict."""
- session = self.get_session(session_id)
- if not session:
- return None
- messages = self.get_messages(session_id)
- return {**session, "messages": messages}
-
- def export_all(self, source: str = None) -> List[Dict[str, Any]]:
- """
- Export all sessions (with messages) as a list of dicts.
- Suitable for writing to a JSONL file for backup/analysis.
- """
- sessions = self.search_sessions(source=source, limit=100000)
- results = []
- for session in sessions:
- messages = self.get_messages(session["id"])
- results.append({**session, "messages": messages})
- return results
-
- def clear_messages(self, session_id: str) -> None:
- """Delete all messages for a session and reset its counters."""
- def _do(conn):
- conn.execute(
- "DELETE FROM messages WHERE session_id = ?", (session_id,)
- )
- conn.execute(
- "UPDATE sessions SET message_count = 0, tool_call_count = 0 WHERE id = ?",
- (session_id,),
- )
- self._execute_write(_do)
-
- @staticmethod
- def _remove_session_files(sessions_dir: Optional[Path], session_id: str) -> None:
- """Remove on-disk transcript files for a session.
-
- Cleans up ``{session_id}.json``, ``{session_id}.jsonl``, and any
- ``request_dump_{session_id}_*.json`` files left by the gateway.
- Silently skips files that don't exist and swallows OSError so a
- filesystem hiccup never blocks a DB operation.
- """
- if sessions_dir is None:
- return
- for suffix in (".json", ".jsonl"):
- p = sessions_dir / f"{session_id}{suffix}"
- try:
- p.unlink(missing_ok=True)
- except OSError:
- pass
- # request_dump files use session_id as a prefix component
- try:
- for p in sessions_dir.glob(f"request_dump_{session_id}_*.json"):
- try:
- p.unlink(missing_ok=True)
- except OSError:
- pass
- except OSError:
- pass
-
- def delete_session(
- self,
- session_id: str,
- sessions_dir: Optional[Path] = None,
- ) -> bool:
- """Delete a session and all its messages.
-
- Child sessions are orphaned (parent_session_id set to NULL) rather
- than cascade-deleted, so they remain accessible independently.
- When *sessions_dir* is provided, also removes on-disk transcript
- files (``.json`` / ``.jsonl`` / ``request_dump_*``) for the deleted
- session. Returns True if the session was found and deleted.
- """
- def _do(conn):
- cursor = conn.execute(
- "SELECT COUNT(*) FROM sessions WHERE id = ?", (session_id,)
- )
- if cursor.fetchone()[0] == 0:
- return False
- # Orphan child sessions so FK constraint is satisfied
- conn.execute(
- "UPDATE sessions SET parent_session_id = NULL "
- "WHERE parent_session_id = ?",
- (session_id,),
- )
- conn.execute("DELETE FROM messages WHERE session_id = ?", (session_id,))
- conn.execute("DELETE FROM sessions WHERE id = ?", (session_id,))
- return True
-
- deleted = self._execute_write(_do)
- if deleted:
- self._remove_session_files(sessions_dir, session_id)
- return deleted
-
- def delete_sessions(
- self,
- session_ids: List[str],
- sessions_dir: Optional[Path] = None,
- ) -> int:
- """Delete every session in *session_ids* in a single transaction.
-
- Backs the dashboard's bulk-select-then-delete flow on the
- sessions page (``POST /api/sessions/bulk-delete``). Mirrors the
- single-session :meth:`delete_session` contract per row:
-
- * Unknown IDs are silently skipped (no 404) — selection state
- in the UI can race against another tab's delete, and we'd
- rather succeed-on-the-rest than fail-the-whole-batch.
- * Children of every deleted ID are orphaned
- (``parent_session_id → NULL``), never cascade-deleted, so a
- branch / subagent transcript survives an inadvertent parent
- delete.
- * Messages and the session row both go in one
- ``_execute_write`` call so a partial failure can't leave the
- DB in a "messages gone but session row still there" state.
- * On-disk transcript / ``request_dump_*`` files are cleaned up
- outside the DB transaction when *sessions_dir* is provided,
- matching :meth:`prune_sessions` and
- :meth:`delete_empty_sessions`.
-
- Returns the count of sessions that actually existed and were
- deleted (may be less than ``len(session_ids)`` if some IDs were
- already gone).
- """
- if not session_ids:
- return 0
- # Dedup + drop any non-string entries up-front. Avoids
- # double-counting in the WHERE-IN list and protects against
- # callers that pass a list with stray ``None`` values.
- unique_ids = list({sid for sid in session_ids if isinstance(sid, str) and sid})
- if not unique_ids:
- return 0
-
- removed_ids: list[str] = []
-
- def _do(conn):
- placeholders = ",".join("?" * len(unique_ids))
- # First, filter to IDs that actually exist — we want to
- # return the real deleted count, not the input length.
- cursor = conn.execute(
- f"SELECT id FROM sessions WHERE id IN ({placeholders})",
- unique_ids,
- )
- existing = [row["id"] for row in cursor.fetchall()]
- if not existing:
- return 0
-
- existing_placeholders = ",".join("?" * len(existing))
- # Orphan children whose parent is in the kill list so the
- # FK constraint stays satisfied. Pin children whose parent
- # is itself in the kill list rather than NULL-ing parents
- # of survivors — the IN list on ``parent_session_id`` does
- # exactly this.
- conn.execute(
- f"UPDATE sessions SET parent_session_id = NULL "
- f"WHERE parent_session_id IN ({existing_placeholders})",
- existing,
- )
- conn.execute(
- f"DELETE FROM messages WHERE session_id IN ({existing_placeholders})",
- existing,
- )
- conn.execute(
- f"DELETE FROM sessions WHERE id IN ({existing_placeholders})",
- existing,
- )
- removed_ids.extend(existing)
- return len(existing)
-
- count = self._execute_write(_do)
- for sid in removed_ids:
- self._remove_session_files(sessions_dir, sid)
- return count
-
- def count_empty_sessions(self) -> int:
- """Return the count of empty, non-active, non-archived sessions.
-
- "Empty" = ``message_count = 0`` AND the session has ended
- (``ended_at IS NOT NULL``) AND is not archived. The ``ended_at``
- guard matches the safety contract used by :meth:`prune_sessions`:
- only ended sessions are candidates for bulk deletion, so a freshly
- spawned session whose first message hasn't landed yet — or one
- held open by the live agent — is never sniped out from under
- the runtime.
-
- Backs the ``GET /api/sessions/empty/count`` endpoint that lets the
- web dashboard hide its "Delete empty" button when there's nothing
- to clean up, and pre-populate the confirm dialog with the actual
- count.
- """
- with self._lock:
- cursor = self._conn.execute(
- "SELECT COUNT(*) FROM sessions "
- "WHERE message_count = 0 "
- "AND ended_at IS NOT NULL "
- "AND archived = 0"
- )
- return cursor.fetchone()[0]
-
- def delete_empty_sessions(
- self,
- sessions_dir: Optional[Path] = None,
- ) -> int:
- """Delete every empty, ended, non-archived session.
-
- Mirrors :meth:`prune_sessions`' transactional shape:
-
- * Selects candidate IDs first (``message_count = 0`` AND
- ``ended_at IS NOT NULL`` AND ``archived = 0``) so we never
- touch a live session or one the user deliberately archived.
- * Orphans any child whose parent is in the kill list — children
- of an empty parent are kept and re-parented to ``NULL`` rather
- than cascade-deleted, matching ``delete_session`` /
- ``prune_sessions`` semantics so branch/subagent transcripts
- survive an inadvertent parent cleanup.
- * Deletes the rows in a single ``_execute_write`` callback so
- the operation is atomic — a partial failure (e.g. SIGKILL
- mid-loop) doesn't leave the DB in a "messages-deleted but
- session-row-still-there" half-state.
- * Cleans up on-disk transcript files (``.json`` / ``.jsonl`` /
- ``request_dump_*``) outside the DB transaction when
- ``sessions_dir`` is provided. Empty sessions don't typically
- have transcript files, but the gateway can leave a stub
- ``request_dump_*`` if it crashed before the first reply —
- so we still sweep, matching ``prune_sessions``.
-
- Returns the number of sessions deleted.
- """
- removed_ids: list[str] = []
-
- def _do(conn):
- cursor = conn.execute(
- "SELECT id FROM sessions "
- "WHERE message_count = 0 "
- "AND ended_at IS NOT NULL "
- "AND archived = 0"
- )
- session_ids = {row["id"] for row in cursor.fetchall()}
-
- if not session_ids:
- return 0
-
- placeholders = ",".join("?" * len(session_ids))
- conn.execute(
- f"UPDATE sessions SET parent_session_id = NULL "
- f"WHERE parent_session_id IN ({placeholders})",
- list(session_ids),
- )
-
- for sid in session_ids:
- # DELETE FROM messages is paranoia — by construction
- # these rows have ``message_count = 0`` — but if a
- # bookkeeping bug ever lets the counter drift below the
- # real row count, we still leave a clean FK state.
- conn.execute(
- "DELETE FROM messages WHERE session_id = ?", (sid,)
- )
- conn.execute("DELETE FROM sessions WHERE id = ?", (sid,))
- removed_ids.append(sid)
- return len(session_ids)
-
- count = self._execute_write(_do)
- for sid in removed_ids:
- self._remove_session_files(sessions_dir, sid)
- return count
-
- def prune_sessions(
- self,
- older_than_days: int = 90,
- source: str = None,
- sessions_dir: Optional[Path] = None,
- ) -> int:
- """Delete sessions older than N days. Returns count of deleted sessions.
-
- Only prunes ended sessions (not active ones). Child sessions outside
- the prune window are orphaned (parent_session_id set to NULL) rather
- than cascade-deleted. When *sessions_dir* is provided, also removes
- on-disk transcript files (``.json`` / ``.jsonl`` /
- ``request_dump_*``) for every pruned session, outside the DB
- transaction.
- """
- cutoff = time.time() - (older_than_days * 86400)
- removed_ids: list[str] = []
-
- def _do(conn):
- if source:
- cursor = conn.execute(
- """SELECT id FROM sessions
- WHERE started_at < ? AND ended_at IS NOT NULL AND source = ?""",
- (cutoff, source),
- )
- else:
- cursor = conn.execute(
- "SELECT id FROM sessions WHERE started_at < ? AND ended_at IS NOT NULL",
- (cutoff,),
- )
- session_ids = {row["id"] for row in cursor.fetchall()}
-
- if not session_ids:
- return 0
-
- # Orphan any sessions whose parent is about to be deleted
- placeholders = ",".join("?" * len(session_ids))
- conn.execute(
- f"UPDATE sessions SET parent_session_id = NULL "
- f"WHERE parent_session_id IN ({placeholders})",
- list(session_ids),
- )
-
- for sid in session_ids:
- conn.execute("DELETE FROM messages WHERE session_id = ?", (sid,))
- conn.execute("DELETE FROM sessions WHERE id = ?", (sid,))
- removed_ids.append(sid)
- return len(session_ids)
-
- count = self._execute_write(_do)
- # Clean up on-disk files outside the DB transaction
- for sid in removed_ids:
- self._remove_session_files(sessions_dir, sid)
- return count
-
- # ── Meta key/value (for scheduler bookkeeping) ──
-
- def get_meta(self, key: str) -> Optional[str]:
- """Read a value from the state_meta key/value store."""
- with self._lock:
- row = self._conn.execute(
- "SELECT value FROM state_meta WHERE key = ?", (key,)
- ).fetchone()
- if row is None:
- return None
- return row["value"] if isinstance(row, sqlite3.Row) else row[0]
-
- def set_meta(self, key: str, value: str) -> None:
- """Write a value to the state_meta key/value store."""
- def _do(conn):
- conn.execute(
- "INSERT INTO state_meta (key, value) VALUES (?, ?) "
- "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
- (key, value),
- )
- self._execute_write(_do)
-
- def apply_telegram_topic_migration(self) -> None:
- """Create Telegram DM topic-mode tables on explicit /topic opt-in.
-
- This migration is deliberately not part of automatic SessionDB startup
- reconciliation. Operators must be able to upgrade Hermes, keep the old
- Telegram bot behavior running, and only mutate topic-mode state when the
- user executes /topic to opt into the feature.
-
- Schema versions:
- v1 — initial shape (no ON DELETE CASCADE on session_id FK)
- v2 — session_id FK gets ON DELETE CASCADE so session pruning
- automatically clears bindings.
- """
- def _do(conn):
- conn.executescript(
- """
- CREATE TABLE IF NOT EXISTS telegram_dm_topic_mode (
- chat_id TEXT PRIMARY KEY,
- user_id TEXT NOT NULL,
- enabled INTEGER NOT NULL DEFAULT 1,
- activated_at REAL NOT NULL,
- updated_at REAL NOT NULL,
- has_topics_enabled INTEGER,
- allows_users_to_create_topics INTEGER,
- capability_checked_at REAL,
- intro_message_id TEXT,
- pinned_message_id TEXT
- );
-
- CREATE TABLE IF NOT EXISTS telegram_dm_topic_bindings (
- chat_id TEXT NOT NULL,
- thread_id TEXT NOT NULL,
- user_id TEXT NOT NULL,
- session_key TEXT NOT NULL,
- session_id TEXT NOT NULL REFERENCES sessions(id) ON DELETE CASCADE,
- managed_mode TEXT NOT NULL DEFAULT 'auto',
- linked_at REAL NOT NULL,
- updated_at REAL NOT NULL,
- PRIMARY KEY (chat_id, thread_id)
- );
-
- CREATE UNIQUE INDEX IF NOT EXISTS idx_telegram_dm_topic_bindings_session
- ON telegram_dm_topic_bindings(session_id);
-
- CREATE INDEX IF NOT EXISTS idx_telegram_dm_topic_bindings_user
- ON telegram_dm_topic_bindings(user_id, chat_id);
- """
- )
-
- # v1 → v2: rebuild telegram_dm_topic_bindings if its session_id FK
- # lacks ON DELETE CASCADE. SQLite can't ALTER a foreign key, so we
- # rebuild the table. Only runs once per DB (version gate).
- current = conn.execute(
- "SELECT value FROM state_meta WHERE key = ?",
- ("telegram_dm_topic_schema_version",),
- ).fetchone()
- current_version = int(current[0]) if current and str(current[0]).isdigit() else 0
- if current_version < 2:
- fk_rows = conn.execute(
- "PRAGMA foreign_key_list('telegram_dm_topic_bindings')"
- ).fetchall()
- needs_rebuild = any(
- row[2] == "sessions" and (row[6] or "") != "CASCADE"
- for row in fk_rows
- )
- if needs_rebuild:
- conn.executescript(
- """
- CREATE TABLE telegram_dm_topic_bindings_new (
- chat_id TEXT NOT NULL,
- thread_id TEXT NOT NULL,
- user_id TEXT NOT NULL,
- session_key TEXT NOT NULL,
- session_id TEXT NOT NULL REFERENCES sessions(id) ON DELETE CASCADE,
- managed_mode TEXT NOT NULL DEFAULT 'auto',
- linked_at REAL NOT NULL,
- updated_at REAL NOT NULL,
- PRIMARY KEY (chat_id, thread_id)
- );
- INSERT INTO telegram_dm_topic_bindings_new
- SELECT chat_id, thread_id, user_id, session_key,
- session_id, managed_mode, linked_at, updated_at
- FROM telegram_dm_topic_bindings;
- DROP TABLE telegram_dm_topic_bindings;
- ALTER TABLE telegram_dm_topic_bindings_new
- RENAME TO telegram_dm_topic_bindings;
- CREATE UNIQUE INDEX idx_telegram_dm_topic_bindings_session
- ON telegram_dm_topic_bindings(session_id);
- CREATE INDEX idx_telegram_dm_topic_bindings_user
- ON telegram_dm_topic_bindings(user_id, chat_id);
- """
- )
-
- conn.execute(
- "INSERT INTO state_meta (key, value) VALUES (?, ?) "
- "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
- ("telegram_dm_topic_schema_version", "2"),
- )
- self._execute_write(_do)
-
- def enable_telegram_topic_mode(
- self,
- *,
- chat_id: str,
- user_id: str,
- has_topics_enabled: Optional[bool] = None,
- allows_users_to_create_topics: Optional[bool] = None,
- ) -> None:
- """Enable Telegram DM topic mode for one private chat/user.
-
- This method intentionally owns the explicit topic migration. Ordinary
- SessionDB startup must not create these side tables.
- """
- self.apply_telegram_topic_migration()
- now = time.time()
-
- def _to_int(value: Optional[bool]) -> Optional[int]:
- if value is None:
- return None
- return 1 if value else 0
-
- def _do(conn):
- conn.execute(
- """
- INSERT INTO telegram_dm_topic_mode (
- chat_id, user_id, enabled, activated_at, updated_at,
- has_topics_enabled, allows_users_to_create_topics,
- capability_checked_at
- ) VALUES (?, ?, 1, ?, ?, ?, ?, ?)
- ON CONFLICT(chat_id) DO UPDATE SET
- user_id = excluded.user_id,
- enabled = 1,
- updated_at = excluded.updated_at,
- has_topics_enabled = excluded.has_topics_enabled,
- allows_users_to_create_topics = excluded.allows_users_to_create_topics,
- capability_checked_at = excluded.capability_checked_at
- """,
- (
- str(chat_id),
- str(user_id),
- now,
- now,
- _to_int(has_topics_enabled),
- _to_int(allows_users_to_create_topics),
- now,
- ),
- )
- self._execute_write(_do)
-
- def disable_telegram_topic_mode(
- self,
- *,
- chat_id: str,
- clear_bindings: bool = True,
- ) -> None:
- """Disable Telegram DM topic mode for one private chat.
-
- When ``clear_bindings`` is True (default) the (chat_id, thread_id)
- bindings for this chat are also cleared so re-enabling later
- starts from a clean slate. Set to False if the operator wants to
- preserve bindings for a later re-enable.
-
- Never creates the topic-mode tables from scratch; if they don't
- exist there is nothing to disable and the call is a no-op.
- """
- def _do(conn):
- try:
- conn.execute(
- "UPDATE telegram_dm_topic_mode SET enabled = 0, updated_at = ? "
- "WHERE chat_id = ?",
- (time.time(), str(chat_id)),
- )
- if clear_bindings:
- conn.execute(
- "DELETE FROM telegram_dm_topic_bindings WHERE chat_id = ?",
- (str(chat_id),),
- )
- except sqlite3.OperationalError:
- # Tables don't exist yet — nothing to disable.
- return
- self._execute_write(_do)
-
- def is_telegram_topic_mode_enabled(self, *, chat_id: str, user_id: str) -> bool:
- """Return whether Telegram DM topic mode is enabled for this chat/user."""
- with self._lock:
- try:
- row = self._conn.execute(
- """
- SELECT enabled FROM telegram_dm_topic_mode
- WHERE chat_id = ? AND user_id = ?
- """,
- (str(chat_id), str(user_id)),
- ).fetchone()
- except sqlite3.OperationalError:
- return False
- if row is None:
- return False
- enabled = row["enabled"] if isinstance(row, sqlite3.Row) else row[0]
- return bool(enabled)
-
- def get_telegram_topic_binding(
- self,
- *,
- chat_id: str,
- thread_id: str,
- ) -> Optional[Dict[str, Any]]:
- """Return the session binding for a Telegram DM topic, if present."""
- with self._lock:
- try:
- row = self._conn.execute(
- """
- SELECT * FROM telegram_dm_topic_bindings
- WHERE chat_id = ? AND thread_id = ?
- """,
- (str(chat_id), str(thread_id)),
- ).fetchone()
- except sqlite3.OperationalError:
- return None
- return dict(row) if row else None
-
- def list_telegram_topic_bindings_for_chat(
- self,
- *,
- chat_id: str,
- ) -> List[Dict[str, Any]]:
- """All Telegram DM topic bindings for one chat, newest first.
-
- Read-only; returns [] if the bindings table doesn't exist yet
- (does not trigger the topic-mode migration).
- """
- with self._lock:
- try:
- rows = self._conn.execute(
- "SELECT * FROM telegram_dm_topic_bindings "
- "WHERE chat_id = ? ORDER BY updated_at DESC",
- (str(chat_id),),
- ).fetchall()
- except sqlite3.OperationalError:
- return []
- return [dict(row) for row in rows]
-
- def get_telegram_topic_binding_by_session(
- self,
- *,
- session_id: str,
- ) -> Optional[Dict[str, Any]]:
- """Return the Telegram DM topic binding for a given session_id, if present.
-
- Uses the UNIQUE INDEX on telegram_dm_topic_bindings(session_id) for an
- efficient reverse lookup. Returns None when the session has no binding or
- the table does not exist yet.
- """
- with self._lock:
- try:
- row = self._conn.execute(
- """
- SELECT * FROM telegram_dm_topic_bindings
- WHERE session_id = ?
- """,
- (str(session_id),),
- ).fetchone()
- except sqlite3.OperationalError:
- return None
- return dict(row) if row else None
-
- def bind_telegram_topic(
- self,
- *,
- chat_id: str,
- thread_id: str,
- user_id: str,
- session_key: str,
- session_id: str,
- managed_mode: str = "auto",
- ) -> None:
- """Bind one Telegram DM topic thread to one Hermes session.
-
- A Hermes session may only be linked to one Telegram topic in MVP.
- Rebinding the same topic to the same session is idempotent; trying to
- link the same session to a different topic raises ValueError.
- """
- self.apply_telegram_topic_migration()
- now = time.time()
- chat_id = str(chat_id)
- thread_id = str(thread_id)
- user_id = str(user_id)
- session_key = str(session_key)
- session_id = str(session_id)
-
- def _do(conn):
- existing_session = conn.execute(
- """
- SELECT chat_id, thread_id FROM telegram_dm_topic_bindings
- WHERE session_id = ?
- """,
- (session_id,),
- ).fetchone()
- if existing_session is not None:
- linked_chat = existing_session["chat_id"] if isinstance(existing_session, sqlite3.Row) else existing_session[0]
- linked_thread = existing_session["thread_id"] if isinstance(existing_session, sqlite3.Row) else existing_session[1]
- if str(linked_chat) != chat_id or str(linked_thread) != thread_id:
- raise ValueError("session is already linked to another Telegram topic")
-
- conn.execute(
- """
- INSERT INTO telegram_dm_topic_bindings (
- chat_id, thread_id, user_id, session_key, session_id,
- managed_mode, linked_at, updated_at
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
- ON CONFLICT(chat_id, thread_id) DO UPDATE SET
- user_id = excluded.user_id,
- session_key = excluded.session_key,
- session_id = excluded.session_id,
- managed_mode = excluded.managed_mode,
- updated_at = excluded.updated_at
- """,
- (
- chat_id,
- thread_id,
- user_id,
- session_key,
- session_id,
- managed_mode,
- now,
- now,
- ),
- )
- self._execute_write(_do)
-
- def is_telegram_session_linked_to_topic(self, *, session_id: str) -> bool:
- """Return True if a Hermes session is already bound to any Telegram DM topic.
-
- Read-only: does NOT trigger the telegram-topic migration. If the
- topic-mode tables have not been created yet (i.e. nobody has run
- ``/topic`` in this profile), the session is by definition unbound
- and we return False.
- """
- with self._lock:
- try:
- row = self._conn.execute(
- """
- SELECT 1 FROM telegram_dm_topic_bindings
- WHERE session_id = ?
- LIMIT 1
- """,
- (str(session_id),),
- ).fetchone()
- except sqlite3.OperationalError:
- return False
- return row is not None
-
- def list_unlinked_telegram_sessions_for_user(
- self,
- *,
- chat_id: str,
- user_id: str,
- limit: int = 10,
- ) -> List[Dict[str, Any]]:
- """List previous Telegram sessions for this user that are not bound to a topic.
-
- Read-only: does NOT trigger the telegram-topic migration. If the
- topic-mode tables are absent, fall back to a simpler query that
- just returns this user's Telegram sessions — there can't be any
- bindings yet.
- """
- with self._lock:
- try:
- rows = self._conn.execute(
- """
- SELECT s.*,
- COALESCE(
- (SELECT SUBSTR(REPLACE(REPLACE(m.content, X'0A', ' '), X'0D', ' '), 1, 63)
- FROM messages m
- WHERE m.session_id = s.id AND m.role = 'user' AND m.content IS NOT NULL
- ORDER BY m.timestamp, m.id LIMIT 1),
- ''
- ) AS _preview_raw,
- COALESCE(
- (SELECT MAX(m2.timestamp) FROM messages m2 WHERE m2.session_id = s.id),
- s.started_at
- ) AS last_active
- FROM sessions s
- WHERE s.source = 'telegram'
- AND s.user_id = ?
- AND NOT EXISTS (
- SELECT 1 FROM telegram_dm_topic_bindings b
- WHERE b.session_id = s.id
- )
- ORDER BY last_active DESC, s.started_at DESC
- LIMIT ?
- """,
- (str(user_id), int(limit)),
- ).fetchall()
- except sqlite3.OperationalError:
- # telegram_dm_topic_bindings doesn't exist yet — no bindings
- # means every telegram session for this user is "unlinked".
- rows = self._conn.execute(
- """
- SELECT s.*,
- COALESCE(
- (SELECT SUBSTR(REPLACE(REPLACE(m.content, X'0A', ' '), X'0D', ' '), 1, 63)
- FROM messages m
- WHERE m.session_id = s.id AND m.role = 'user' AND m.content IS NOT NULL
- ORDER BY m.timestamp, m.id LIMIT 1),
- ''
- ) AS _preview_raw,
- COALESCE(
- (SELECT MAX(m2.timestamp) FROM messages m2 WHERE m2.session_id = s.id),
- s.started_at
- ) AS last_active
- FROM sessions s
- WHERE s.source = 'telegram'
- AND s.user_id = ?
- ORDER BY last_active DESC, s.started_at DESC
- LIMIT ?
- """,
- (str(user_id), int(limit)),
- ).fetchall()
-
- sessions: List[Dict[str, Any]] = []
- for row in rows:
- session = dict(row)
- raw = str(session.pop("_preview_raw", "") or "").strip()
- session["preview"] = raw[:60] + ("..." if len(raw) > 60 else "") if raw else ""
- sessions.append(session)
- return sessions
-
- # ── Space reclamation ──
-
- # FTS5 virtual tables whose b-tree segments we merge on optimize. The
- # trigram table is created lazily / may be disabled, so we probe before
- # touching it (see optimize_fts).
- _FTS_TABLES = ("messages_fts", "messages_fts_trigram")
-
- def _fts_table_exists(self, name: str) -> bool:
- """True if an FTS5 virtual table is queryable in this DB."""
- try:
- self._conn.execute(f"SELECT 1 FROM {name} LIMIT 0")
- return True
- except sqlite3.OperationalError:
- return False
-
- def optimize_fts(self) -> int:
- """Merge fragmented FTS5 b-tree segments into one per index.
-
- FTS5 indexes grow as a series of incremental segments — one per
- ``INSERT`` batch driven by the message triggers. Over tens of
- thousands of messages these segments accumulate, which both bloats
- the ``*_data`` shadow tables and slows ``MATCH`` queries that must
- scan every segment. The special ``'optimize'`` command rewrites each
- index as a single merged segment.
-
- This is purely a maintenance operation — it changes neither search
- results nor ``snippet()`` output, only on-disk layout and query
- speed. It is complementary to VACUUM: ``optimize`` compacts the FTS
- index internally, then VACUUM returns the freed pages to the OS.
-
- Skips any FTS table that does not exist (e.g. the trigram index when
- disabled via ``HERMES_DISABLE_FTS_TRIGRAM`` or not yet created), so
- it is safe to call unconditionally.
-
- Returns the number of FTS indexes that were optimized.
- """
- optimized = 0
- with self._lock:
- for tbl in self._FTS_TABLES:
- if not self._fts_table_exists(tbl):
- continue
- try:
- # The column name in the INSERT must match the table name
- # for FTS5 special commands.
- self._conn.execute(
- f"INSERT INTO {tbl}({tbl}) VALUES('optimize')"
- )
- optimized += 1
- except sqlite3.OperationalError as exc:
- logger.warning(
- "FTS optimize failed for %s: %s", tbl, exc
- )
- return optimized
-
- def vacuum(self) -> int:
- """Run VACUUM to reclaim disk space after large deletes.
-
- SQLite does not shrink the database file when rows are deleted —
- freed pages just get reused on the next insert. After a prune that
- removed hundreds of sessions, the file stays bloated unless we
- explicitly VACUUM.
-
- VACUUM rewrites the entire DB, so it's expensive (seconds per
- 100MB) and cannot run inside a transaction. It also acquires an
- exclusive lock, so callers must ensure no other writers are
- active. Safe to call at startup before the gateway/CLI starts
- serving traffic.
-
- FTS5 segments are merged first via :meth:`optimize_fts` so the
- subsequent VACUUM reclaims the pages freed by the merge. This is a
- layout-only optimization — search results are unchanged.
-
- Returns the number of FTS indexes that were optimized (0 if the
- merge step failed or no FTS tables exist).
- """
- # Merge FTS5 segments before VACUUM so the freed pages are returned
- # to the OS in the same pass. optimize_fts() manages its own lock.
- optimized = 0
- try:
- optimized = self.optimize_fts()
- except Exception as exc:
- logger.warning("FTS optimize before VACUUM failed: %s", exc)
- # VACUUM cannot be executed inside a transaction.
- with self._lock:
- # Best-effort WAL checkpoint first, then VACUUM.
- try:
- self._conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
- except Exception:
- pass
- self._conn.execute("VACUUM")
- return optimized
-
- def maybe_auto_prune_and_vacuum(
- self,
- retention_days: int = 90,
- min_interval_hours: int = 24,
- vacuum: bool = True,
- sessions_dir: Optional[Path] = None,
- ) -> Dict[str, Any]:
- """Idempotent auto-maintenance: prune old sessions + optional VACUUM.
-
- Records the last run timestamp in state_meta so subsequent calls
- within ``min_interval_hours`` no-op. Designed to be called once at
- startup from long-lived entrypoints (CLI, gateway, cron scheduler).
-
- When *sessions_dir* is provided, on-disk transcript files
- (``.json`` / ``.jsonl`` / ``request_dump_*``) for pruned sessions
- are removed as part of the same sweep (issue #3015).
-
- Never raises. On any failure, logs a warning and returns a dict
- with ``"error"`` set.
-
- Returns a dict with keys:
- - ``"skipped"`` (bool) — true if within min_interval_hours of last run
- - ``"pruned"`` (int) — number of sessions deleted
- - ``"vacuumed"`` (bool) — true if VACUUM ran
- - ``"error"`` (str, optional) — present only on failure
- """
- result: Dict[str, Any] = {"skipped": False, "pruned": 0, "vacuumed": False}
- try:
- # Skip if another process/call did maintenance recently.
- last_raw = self.get_meta("last_auto_prune")
- now = time.time()
- if last_raw:
- try:
- last_ts = float(last_raw)
- if now - last_ts < min_interval_hours * 3600:
- result["skipped"] = True
- return result
- except (TypeError, ValueError):
- pass # corrupt meta; treat as no prior run
-
- pruned = self.prune_sessions(
- older_than_days=retention_days,
- sessions_dir=sessions_dir,
- )
- result["pruned"] = pruned
-
- # Only VACUUM if we actually freed rows — VACUUM on a tight DB
- # is wasted I/O. Threshold keeps small DBs from paying the cost.
- if vacuum and pruned > 0:
- try:
- self.vacuum()
- result["vacuumed"] = True
- except Exception as exc:
- logger.warning("state.db VACUUM failed: %s", exc)
-
- # Record the attempt even if pruned == 0, so we don't retry
- # every startup within the min_interval_hours window.
- self.set_meta("last_auto_prune", str(now))
-
- if pruned > 0:
- logger.info(
- "state.db auto-maintenance: pruned %d session(s) older than %d days%s",
- pruned,
- retention_days,
- " + VACUUM" if result["vacuumed"] else "",
- )
- except Exception as exc:
- # Maintenance must never block startup. Log and return error marker.
- logger.warning("state.db auto-maintenance failed: %s", exc)
- result["error"] = str(exc)
-
- return result
-
- # ── Handoff (cross-platform session transfer) ──────────────────────────
- #
- # State machine:
- # None — no handoff in flight
- # "pending" — CLI requested handoff, gateway hasn't picked it up yet
- # "running" — gateway is processing (session switch + synthetic turn)
- # "completed"— gateway successfully delivered the synthetic turn
- # "failed" — gateway hit an error; reason in handoff_error
- #
- # The CLI writes "pending" then poll-waits for terminal state. The gateway
- # watcher transitions pending→running→{completed,failed}.
-
- def request_handoff(self, session_id: str, platform: str) -> bool:
- """Mark a session as pending handoff to the given platform.
-
- Returns True if the row was found and not already in flight; False if
- the session is already in a non-terminal handoff state.
- """
- def _do(conn):
- cur = conn.execute(
- "UPDATE sessions "
- "SET handoff_state = 'pending', "
- " handoff_platform = ?, "
- " handoff_error = NULL "
- "WHERE id = ? AND (handoff_state IS NULL "
- " OR handoff_state IN ('completed', 'failed'))",
- (platform, session_id),
- )
- return cur.rowcount > 0
- return self._execute_write(_do)
-
- def get_handoff_state(self, session_id: str) -> Optional[Dict[str, Any]]:
- """Read the current handoff state for a session.
-
- Returns ``{"state", "platform", "error"}`` or None if the session has
- no handoff record.
- """
- try:
- cur = self._conn.execute(
- "SELECT handoff_state, handoff_platform, handoff_error "
- "FROM sessions WHERE id = ?",
- (session_id,),
- )
- row = cur.fetchone()
- if not row:
- return None
- return {
- "state": row["handoff_state"],
- "platform": row["handoff_platform"],
- "error": row["handoff_error"],
- }
- except Exception:
- return None
-
- def list_pending_handoffs(self) -> List[Dict[str, Any]]:
- """Return all sessions in handoff_state='pending', oldest first.
-
- Used by the gateway's handoff watcher.
- """
- try:
- cur = self._conn.execute(
- "SELECT * FROM sessions "
- "WHERE handoff_state = 'pending' "
- "ORDER BY started_at ASC"
- )
- return [dict(r) for r in cur.fetchall()]
- except Exception:
- return []
-
- def claim_handoff(self, session_id: str) -> bool:
- """Atomically transition pending → running. Returns True if claimed."""
- def _do(conn):
- cur = conn.execute(
- "UPDATE sessions SET handoff_state = 'running' "
- "WHERE id = ? AND handoff_state = 'pending'",
- (session_id,),
- )
- return cur.rowcount > 0
- return self._execute_write(_do)
-
- def complete_handoff(self, session_id: str) -> None:
- """Mark a handoff as completed."""
- def _do(conn):
- conn.execute(
- "UPDATE sessions SET handoff_state = 'completed', "
- "handoff_error = NULL WHERE id = ?",
- (session_id,),
- )
- self._execute_write(_do)
-
- def fail_handoff(self, session_id: str, error: str) -> None:
- """Mark a handoff as failed and record the reason."""
- def _do(conn):
- conn.execute(
- "UPDATE sessions SET handoff_state = 'failed', "
- "handoff_error = ? WHERE id = ?",
- (error[:500], session_id),
- )
- self._execute_write(_do)
diff --git a/scripts/build_prd.py b/scripts/build_prd.py
deleted file mode 100644
index 0c6967e..0000000
--- a/scripts/build_prd.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# -*- coding: utf-8 -*-
-import os
-path = r"D:\F\NewI\opencode\daily-workspace\projects\AgentsMeeting\docs\PRD.md"
-lines = []
-lines.append("# AgentsMeeting -- PRD v0.1")
-lines.append("")
-lines.append("> \u7248\u672c: \u521d\u7a3f ^| \u5ba2\u6237: hmo (\u8001\u83ab) ^| PM: mohe (\u83ab\u8377) ^| \u7814\u53d1: xxm (\u5c0f\u5c0f\u83ab)")
diff --git a/scripts/gen_prd.py b/scripts/gen_prd.py
deleted file mode 100644
index 1c6f83c..0000000
--- a/scripts/gen_prd.py
+++ /dev/null
@@ -1 +0,0 @@
-import os; open(os.path.join(r"D:\F\NewI\opencode\daily-workspace\projects\AgentsMeeting\docs","PRD.md"),"w",encoding="utf-8").write("# AgentsMeeting - PRD v0.1\n\nOK")
\ No newline at end of file
diff --git a/scripts/gen_prd_v02.py b/scripts/gen_prd_v02.py
deleted file mode 100644
index 2218965..0000000
--- a/scripts/gen_prd_v02.py
+++ /dev/null
@@ -1,2 +0,0 @@
-import os
-print("ok")
diff --git a/scripts/test_echo.py b/scripts/test_echo.py
deleted file mode 100644
index 4473a45..0000000
--- a/scripts/test_echo.py
+++ /dev/null
@@ -1 +0,0 @@
-print("test123")
diff --git a/scripts/write_prd.py b/scripts/write_prd.py
deleted file mode 100644
index 1db0a10..0000000
--- a/scripts/write_prd.py
+++ /dev/null
@@ -1 +0,0 @@
-print("ok")
\ No newline at end of file
diff --git a/scripts/write_prd_v02.py b/scripts/write_prd_v02.py
deleted file mode 100644
index 37b73e7..0000000
--- a/scripts/write_prd_v02.py
+++ /dev/null
@@ -1,2 +0,0 @@
-import os,sys
-open("D:/F/NewI/opencode/daily-workspace/projects/AgentsMeeting/docs/PRD_v0.2.md","w",encoding="utf-8").write("test ok\n")
\ No newline at end of file
diff --git a/xmpp_agent_core.py b/xmpp_agent_core.py
index 5e54579..772acef 100644
--- a/xmpp_agent_core.py
+++ b/xmpp_agent_core.py
@@ -1,9 +1,35 @@
#!/usr/bin/env python3
-"""XMPP Bot - 统一版,支持 --agent mohe|zhiwei|xiao 参数"""
-import asyncio, logging, ssl, json, urllib.request, os, time, sys, re
-from slixmpp import ClientXMPP
+"""
+XMPP Agent Core — 统一版
+=========================
+Single bot core for all agents. Supports --agent xxm|mohe|zhiwei|xiaoguo.
+
+Usage:
+ python xmpp_agent_core.py --agent xxm # xxm, uses chat_bridge
+ python xmpp_agent_core.py --agent mohe # mohe, uses Hermes API
+ python xmpp_agent_core.py --agent zhiwei # zhiwei, uses Hermes API
+ python xmpp_agent_core.py --agent xiaoguo # xiaoguo, uses Hermes API
+
+Shares: PID lock, reconnect, MUC join, dedup, batching,
+ coordinator protocol (GRANT/REVOKE), HTTP bridge.
+Differs only in LLM calling method (chat_bridge vs Hermes API).
+"""
+import os, sys, time, threading, asyncio, logging, json, re, ssl
+import urllib.request, http.server, urllib.parse
+
+# ── Windows selector loop (slixmpp needs it on Windows) ──
+if sys.platform == "win32":
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+
+# ── PATH: allow imports from gateway/scripts/ (proc_guard, chat_bridge) ──
+_GATEWAY_SCRIPTS = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+ "gateway", "scripts")
+sys.path.insert(0, _GATEWAY_SCRIPTS)
+
+# ═══════════════════════════════════════════════════════════════
+# AGENTS Configuration
+# ═══════════════════════════════════════════════════════════════
-# ── Agent 配置 ──────────────────────────────────────────────
AGENTS = {
"mohe": {
"jid": "mohe@yoin.fun",
@@ -13,6 +39,9 @@ AGENTS = {
"http_port": 5804,
"gateway": "http://localhost:8642/v1/chat/completions",
"session_id": "xmpp-mohe-v2",
+ "server": "127.0.0.1",
+ "port": 5222,
+ "muc_rooms": ["coregroup@conference.yoin.fun"],
"mention": "@mohe/@莫荷",
},
"zhiwei": {
@@ -23,6 +52,9 @@ AGENTS = {
"http_port": 5805,
"gateway": "http://localhost:8643/v1/chat/completions",
"session_id": "xmpp-zhiwei",
+ "server": "127.0.0.1",
+ "port": 5222,
+ "muc_rooms": ["coregroup@conference.yoin.fun"],
"mention": "@zhiwei/@知微",
},
"xiaoguo": {
@@ -33,316 +65,830 @@ AGENTS = {
"http_port": 5806,
"gateway": "http://localhost:8645/v1/chat/completions",
"session_id": "xmpp-xiaoguo",
+ "server": "127.0.0.1",
+ "port": 5222,
+ "muc_rooms": ["coregroup@conference.yoin.fun"],
"mention": "@xiaoguo/@小果",
},
+ "xxm": {
+ "jid": "xxm@yoin.fun",
+ "password": "hermes123",
+ "nick": "xxm",
+ "name_cn": "笑笑",
+ "http_port": 5802,
+ "bridge": "chat_bridge", # use local chat_bridge instead of Hermes API
+ "session_id": "ses_xxm_xmpp",
+ "server": "192.168.1.246", # LAN direct connect
+ "port": 5222,
+ "muc_rooms": [
+ "coregroup@conference.yoin.fun",
+ "jujidina@conference.yoin.fun",
+ ],
+ "mention": "@xxm/@笑笑",
+ },
}
-agent = sys.argv[sys.argv.index("--agent") + 1] if "--agent" in sys.argv else "mohe"
-cfg = AGENTS.get(agent, AGENTS["mohe"])
+# ── Agent selection ──
+_agent_name = "mohe"
+if "--agent" in sys.argv:
+ idx = sys.argv.index("--agent")
+ if idx + 1 < len(sys.argv):
+ _agent_name = sys.argv[idx + 1]
+cfg = AGENTS.get(_agent_name, AGENTS["mohe"])
+
+# ═══════════════════════════════════════════════════════════════
+# PID Lock — prevent duplicate instances
+# ═══════════════════════════════════════════════════════════════
+
+from proc_guard import guard as _proc_guard
+_lock = _proc_guard(f"xmpp_bot_{_agent_name}")
+if not _lock.ok:
+ print(_lock.message, flush=True)
+ sys.exit(1)
+
+# ═══════════════════════════════════════════════════════════════
+# Logging
+# ═══════════════════════════════════════════════════════════════
+
+_LOG_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "gateway", "logs")
+os.makedirs(_LOG_DIR, exist_ok=True)
+_LOG_FILE = os.path.join(_LOG_DIR, f"xmpp_{_agent_name}.log")
+_START_TIME = time.time()
+
+
+def log(m: str):
+ with open(_LOG_FILE, "a", encoding="utf-8") as f:
+ f.write(f"{time.strftime('%H:%M:%S')} {m}\n")
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
-GATEWAY = cfg["gateway"]
-API_KEY = "hermes123"
-AGENT_NICK = cfg["nick"]
-AGENT_NAME = cfg["name_cn"]
-AGENT_JID = cfg["jid"]
-AGENT_MENTION = cfg["mention"]
-SESSION_ID = cfg["session_id"]
-HTTP_PORT = cfg["http_port"]
-_opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
-# ── HTTP 桥(接收本地脚本的主动发送请求) ──
-from http.server import HTTPServer, BaseHTTPRequestHandler
-import threading, json as json_mod
+# ═══════════════════════════════════════════════════════════════
+# LLM Bridge Init — abstracted per agent
+# ═══════════════════════════════════════════════════════════════
-_send_queue = []
+_IS_CHAT_BRIDGE = cfg.get("bridge") == "chat_bridge"
+_router = None # set only for chat_bridge (xxm)
-class SendHandler(BaseHTTPRequestHandler):
- def do_POST(self):
- length = int(self.headers.get('Content-Length', 0))
- body = self.rfile.read(length)
- try:
- data = json_mod.loads(body)
- room = data.get('to', 'coregroup@conference.yoin.fun')
- text = data.get('body', '')
- if text:
- _send_queue.append((room, text))
- self.send_response(200)
- self.end_headers()
- self.wfile.write(b'{"ok":true}')
- else:
- self.send_response(400)
- self.end_headers()
- self.wfile.write(b'{"ok":false,"error":"empty body"}')
- except Exception as e:
- self.send_response(500)
- self.end_headers()
- self.wfile.write(f'{{"ok":false,"error":"{e}"}}'.encode())
+if _IS_CHAT_BRIDGE:
+ from chat_bridge import SessionBridge
+ from session_router import SessionRouter
+ _bridge = SessionBridge(session_id=cfg["session_id"])
+ _router = SessionRouter(bridge=_bridge, default_session=cfg["session_id"])
+ log(f"LLM: chat_bridge (session={cfg['session_id']})")
+else:
+ _opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
+ log(f"LLM: Hermes API ({cfg['gateway']})")
-def _run_http():
- server = HTTPServer(('127.0.0.1', HTTP_PORT), SendHandler)
- server.timeout = 1.0
+
+def _call_llm(content: str, sender: str, is_group: bool = False) -> str:
+ """Abstract LLM call. Returns raw response text (or empty string)."""
+ if _IS_CHAT_BRIDGE:
+ return _router.route("xmpp", sender, content) or ""
+ else:
+ return _call_hermes_api(content)
+
+
+def _call_hermes_api(content: str) -> str:
+    """POST *content* as a single user message to cfg["gateway"]; return the stripped reply, or "" on any error."""
+    try:
+        payload = json.dumps({
+            "model": "hermes-agent",
+            "messages": [{"role": "user", "content": content}]
+        }).encode()
+        req = urllib.request.Request(cfg["gateway"], data=payload, method="POST")
+        req.add_header("Content-Type", "application/json")
+        req.add_header("Authorization", "Bearer hermes123")
+        req.add_header("X-Hermes-Session-Id", cfg["session_id"])
+        with _opener.open(req, timeout=600) as result:  # close the response even if read/parse fails
+            data = json.loads(result.read())
+        reply = data.get("choices", [{}])[0].get("message", {}).get("content", "")
+        return reply.strip()
+    except Exception as e:  # best-effort boundary: every failure is logged and mapped to ""
+        log(f"!!! Hermes API error: {e}")
+        return ""
+
+
+# ═══════════════════════════════════════════════════════════════
+# Message Dedup
+# ═══════════════════════════════════════════════════════════════
+
+_DEDUP_CACHE: set[str] = set()
+_DEDUP_LOCK = threading.Lock()
+
+
+def _is_duplicate(msg_id: str) -> bool:
+    if not msg_id:  # empty/missing ids cannot be tracked, so they are never reported as duplicates
+        return False
+    with _DEDUP_LOCK:
+        if msg_id in _DEDUP_CACHE:
+            return True
+        if len(_DEDUP_CACHE) >= 100:  # purge BEFORE inserting so the newest id is always remembered
+            _DEDUP_CACHE.clear()
+        _DEDUP_CACHE.add(msg_id)
+    return False
+
+
+# ═══════════════════════════════════════════════════════════════
+# Coordinator Protocol (shared across all agents)
+# ═══════════════════════════════════════════════════════════════
+
+_COORDINATOR: str = "mohe"
+_GRANTED: str | None = None
+_REVOKED_UNTIL: float = 0.0
+_SHUTUP_PATTERNS = ["闭嘴", "别说话", "安静", "shut", "stfu", "别说了", "停"]
+
+
+def _process_coordinator_signals(nickname: str, body: str) -> bool:
+ """Parse coordinator/GRANT/REVOKE from incoming messages.
+ Returns True if message was a control signal (consumed, no further processing)."""
+ global _COORDINATOR, _GRANTED, _REVOKED_UNTIL
+ # 1. hmo switches coordinator
+ if nickname == 'hmo' and 'coordinator=' in body.lower():
+ for name in ('mohe', 'zhiwei', 'xxm'):
+ if f'coordinator={name}' in body.lower():
+ _COORDINATOR = name
+ _GRANTED = None
+ log(f"Coordinator switched to {name} by hmo")
+ return True
+ # 2. GRANT signal (overrides REVOKE)
+ gm = re.search(r'\[GRANT:(\w+)\]', body)
+ if gm:
+ _GRANTED = gm.group(1)
+ _REVOKED_UNTIL = 0
+ log(f"GRANT: {_GRANTED}")
+ return True
+ # 3. REVOKE signal (5min auto-restore)
+ rm = re.search(r'\[REVOKE:(\w+)\]', body)
+ if rm and rm.group(1) == cfg["nick"]:
+ _REVOKED_UNTIL = time.time() + 300
+ log(f"REVOKEd: {cfg['nick']} silenced for 5min")
+ return True
+ return False
+
+
+def _check_shutup(body: str) -> bool:
+    """Return True and arm a 5-minute silence window when *body* contains a shut-up phrase."""
+    global _REVOKED_UNTIL  # rebinds the module-level deadline; without this the assignment was a dead local
+    for pat in _SHUTUP_PATTERNS:
+        if pat.lower() in body.lower().strip():
+            _REVOKED_UNTIL = time.time() + 300
+            log(f"(shutup: '{pat}' → 5min silence)")
+            return True
+    return False
+
+
+def _process_llm_grant(response: str):
+    """Apply a ``[GRANT:<name>]`` signal found in the LLM's own response: record the grantee and reset the revoke deadline."""
+    global _GRANTED, _REVOKED_UNTIL  # both are rebound below; listing only one leaves the other a dead local
+    gm = re.search(r'\[GRANT:(\w+)\]', response)
+    if gm:
+        _GRANTED = gm.group(1)
+        _REVOKED_UNTIL = 0  # same reset the incoming-message GRANT branch performs
+        log(f"LLM GRANT: {_GRANTED}")
+
+
+# ═══════════════════════════════════════════════════════════════
+# Response Extraction
+# ═══════════════════════════════════════════════════════════════
+
+_SILENCE_PATTERNS = [
+ "保持沉默", "不应[该]?回复", "没有.*@.*我", "不是对[我我说]",
+ "跟我无关", "我不用回复", "不该回复", "不参与",
+ "不是我[应]?该[说回]",
+]
+_EXEC_RE = re.compile(r"##exec:(.+?)##", re.DOTALL)
+_DELAY_RE = re.compile(r"##delay:?(\d+)?##")
+_DELAY_DEFAULT = 15
+_EXEC_TIMEOUT = 60
+
+
+def _strip_toolcall_xml(text: str) -> str:
+ t = text
+ t = re.sub(r']*>.*?(|$)', '', t, flags=re.DOTALL)
+ t = re.sub(r'.*?(|$)', '', t, flags=re.DOTALL)
+ t = re.sub(r']*>.*?(|$)', '', t, flags=re.DOTALL)
+ t = re.sub(r'.*?(|$)', '', t, flags=re.DOTALL)
+ return t.strip()
+
+
+def _extract_response(text: str) -> str | None:
+ """Strip __SILENT__, reasoning blocks, natural language silence.
+ Returns actual content to send, or None to stay silent."""
+ if not text:
+ return None
+ t = text.strip()
+ if not t:
+ return None
+ t = _strip_toolcall_xml(t)
+ # Natural language silence detection
+ if not t.startswith("__SILENT__"):
+ first = t.split("\n", 1)[0]
+ for pat in _SILENCE_PATTERNS:
+ if re.search(pat, first):
+ return None
+ return t
+ # Has __SILENT__ prefix
+ parts = t.split("\n", 1)
+ if len(parts) < 2:
+ return None
+ rest = parts[1].strip()
while True:
- server.handle_request()
+ m = re.match(r'^([^)]*)\s*', rest)
+ if m:
+ rest = rest[m.end():]
+ continue
+ m = re.match(r'^\([^)]*\)\s*', rest)
+ if m:
+ rest = rest[m.end():]
+ continue
+ break
+ return rest.strip() or None
-threading.Thread(target=_run_http, daemon=True).start()
-logging.info(f"🚀 {AGENT_NAME} HTTP 桥启动于 :{HTTP_PORT}")
-# ── XMPP Bot 类 ────────────────────────────────────────────────
-class AgentBot(ClientXMPP):
+# ═══════════════════════════════════════════════════════════════
+# Message Batching (3s debounce + serialized processing)
+# ═══════════════════════════════════════════════════════════════
+
+_BATCH_WINDOW = 3.0
+_BATCH_TIMEOUT = 300
+_batch_entries: dict[str, list[str]] = {}
+_batch_timers: dict[str, threading.Timer] = {}
+_batch_processing: set[str] = set()
+_batch_pending: dict[str, list[str]] = {}
+_batch_lock = threading.Lock()
+_BOT_NICK = cfg["nick"]
+
+
+def _run_command(cmd: str) -> str:
+ """Execute ##exec:command## shell command."""
+ log(f"(exec: {cmd[:120]})")
+ try:
+ import subprocess
+ r = subprocess.run(cmd, shell=True, capture_output=True,
+ timeout=_EXEC_TIMEOUT, text=True, encoding='utf-8', errors='replace')
+ out = (r.stdout or "") + (r.stderr or "")
+ out = out.strip() or f"(no output, exit={r.returncode})"
+ log(f"(exec done: {len(out)} bytes, exit={r.returncode})")
+ return out
+ except subprocess.TimeoutExpired:
+ log(f"(exec timeout >{_EXEC_TIMEOUT}s)")
+ return "(命令超时)"
+ except Exception as e:
+ log(f"(exec error: {e})")
+ return f"(命令执行失败: {e})"
+
+
+def _schedule_delayed(delay_sec: int, room: str):
+ """Schedule ##delay:N## re-invocation."""
+ global _xmpp_ref
+ import subprocess as _sp
+ from xml.sax.saxutils import escape
+
+ def _fire():
+ bot = _xmpp_ref
+ if not bot:
+ return
+ try:
+ prompt = "时间到,请根据最新的信息汇报结果。"
+ raw = _call_llm(prompt, room, is_group=True)
+ reply = _extract_response(raw)
+ if reply:
+ safe = escape(reply.strip())
+ stanza = f"{safe}"
+ bot.send_raw(stanza)
+ log(f"-> [Delay][{room}]: {reply.strip()[:80]}")
+ except Exception as e:
+ log(f"!! delay err: {e}")
+
+ t = threading.Timer(delay_sec, _fire)
+ t.daemon = True
+ t.start()
+ log(f"(delay +{delay_sec}s → {room})")
+
+
+def _batch_done(room: str):
+ """Called when batch LLM finishes. Flush pending if any."""
+ with _batch_lock:
+ _batch_processing.discard(room)
+ pending = _batch_pending.pop(room, None)
+ if pending:
+ _batch_entries[room] = pending
+ t = threading.Timer(0.1, _fire_batch, args=[room])
+ t.daemon = True
+ t.start()
+ _batch_timers[room] = t
+ return
+ log(f"[Batch][{room}] (idle)")
+
+
+def _fire_batch(room: str):
+ """Collect batched entries and call LLM."""
+ with _batch_lock:
+ entries = _batch_entries.pop(room, None)
+ _batch_timers.pop(room, None)
+ if not entries:
+ return
+ _batch_processing.add(room)
+ combined = "\n".join(entries)
+
+ def _handle():
+ timed_out = [False]
+
+ def _timeout():
+ timed_out[0] = True
+ log(f"[Batch][{room}] TIMEOUT ({_BATCH_TIMEOUT}s)")
+ _batch_done(room)
+
+ timer = threading.Timer(_BATCH_TIMEOUT, _timeout)
+ timer.daemon = True
+ timer.start()
+ try:
+ raw = _call_llm(combined, room, is_group=True)
+ if not timed_out[0]:
+ timer.cancel()
+ _process_llm_reply(raw, room)
+ else:
+ log(f"[Batch][{room}] route returned after timeout, discarded")
+ except Exception as e:
+ log(f"!!! BATCH: {e}")
+ if not timed_out[0]:
+ timer.cancel()
+ _batch_done(room)
+
+ threading.Thread(target=_handle, daemon=True).start()
+
+
+def _batch_group_message(room: str, nickname: str, body: str) -> bool:
+ """Add message to room batch. Returns True if batched, False if @mention (immediate)."""
+ if f"@{_BOT_NICK}" in body or body.startswith(_BOT_NICK):
+ return False
+ formatted = f"[{nickname}]: {body}"
+ with _batch_lock:
+ if room in _batch_processing:
+ _batch_pending.setdefault(room, []).append(formatted)
+ return True
+ timer = _batch_timers.pop(room, None)
+ if timer:
+ timer.cancel()
+ _batch_entries.setdefault(room, []).append(formatted)
+ t = threading.Timer(_BATCH_WINDOW, _fire_batch, args=[room])
+ t.daemon = True
+ t.start()
+ _batch_timers[room] = t
+ return True
+
+
+# ═══════════════════════════════════════════════════════════════
+# MAM Recovery — fetch recent history on startup
+# ═══════════════════════════════════════════════════════════════
+
+_MAM_RECOVERY = True
+_MAM_RECOVERY_LOCK = threading.Lock()
+_MAM_MARK_DONE = False
+_MAM_TIMEOUT = 30
+
+
+def _set_mam_done():
+ global _MAM_RECOVERY
+ with _MAM_RECOVERY_LOCK:
+ _MAM_RECOVERY = False
+
+
+def _is_mam_recovery() -> bool:
+ if time.time() - _START_TIME > _MAM_TIMEOUT:
+ global _MAM_RECOVERY
+ with _MAM_RECOVERY_LOCK:
+ if _MAM_RECOVERY:
+ _MAM_RECOVERY = False
+ log("(MAM recovery timed out, force-disabled)")
+ return _MAM_RECOVERY
+ with _MAM_RECOVERY_LOCK:
+ return _MAM_RECOVERY
+
+
+# ═══════════════════════════════════════════════════════════════
+# XML Escape
+# ═══════════════════════════════════════════════════════════════
+
+def _escape(text: str) -> str:
+ return (text.replace("&", "&").replace("<", "<")
+ .replace(">", ">").replace('"', """))
+
+
+# ═══════════════════════════════════════════════════════════════
+# HTTP Bridge — health, presence, messages, POST send
+# ═══════════════════════════════════════════════════════════════
+
+_HTTP_PORT = cfg["http_port"]
+_MSG_BUF: list[dict] = []
+_MSG_BUF_LOCK = threading.Lock()
+_xmpp_ref = None # set after bot creation
+
+
+def _record_group_msg(nickname: str, body: str):
+ ts = time.strftime("%H:%M:%S")
+ with _MSG_BUF_LOCK:
+ _MSG_BUF.append({"ts": ts, "from": nickname, "body": body})
+ if len(_MSG_BUF) > 200:
+ _MSG_BUF[:] = _MSG_BUF[-150:]
+
+
+class _BridgeHandler(http.server.BaseHTTPRequestHandler):
+ def do_GET(self):
+ parsed = urllib.parse.urlparse(self.path)
+ if parsed.path == "/muc":
+ try:
+ muc_info = {"rooms": {}}
+ bot = _xmpp_ref
+ if bot is not None and 'xep_0045' in bot.plugin:
+ muc_plugin = bot.plugin['xep_0045']
+ for room_jid in cfg["muc_rooms"]:
+ room_data = {"jid": room_jid, "participants": []}
+ try:
+ if room_jid in muc_plugin.rooms:
+ room = muc_plugin.rooms[room_jid]
+ for nick, info in room.get('roster', {}).items():
+ room_data["participants"].append({
+ "nick": nick,
+ "jid": str(info.get('jid', '')),
+ "affiliation": str(info.get('affiliation', '')),
+ "role": str(info.get('role', '')),
+ })
+ except Exception as e:
+ room_data["error"] = str(e)
+ muc_info["rooms"][room_jid] = room_data
+ self._reply(200, muc_info)
+ except Exception as e:
+ self._reply(500, {"ok": False, "error": str(e)})
+ return
+ if parsed.path == "/health":
+ try:
+ bot = _xmpp_ref
+ session_ok = bot.session_started_event.is_set() if (bot and hasattr(bot, 'session_started_event')) else False
+ socket_ok = bot.is_connected() if (bot and hasattr(bot, 'is_connected')) else False
+ self._reply(200, {
+ "ok": True, "xmpp_connected": session_ok or socket_ok,
+ "agent": _agent_name, "jid": cfg["jid"],
+ "uptime_sec": int(time.time() - _START_TIME),
+ "muc_rooms": cfg["muc_rooms"],
+ })
+ except Exception as e:
+ self._reply(500, {"ok": False, "error": str(e)})
+ return
+ if parsed.path.startswith("/presence"):
+ jid_to_check = parsed.path[len("/presence/"):].strip()
+ if not jid_to_check:
+ self._reply(400, {"ok": False, "error": "missing JID"})
+ return
+ try:
+ info = {"jid": jid_to_check, "online": False, "resources": []}
+ bot = _xmpp_ref
+ if bot and hasattr(bot, 'client_roster'):
+ roster = bot.client_roster
+ if jid_to_check in roster:
+ resources = list(roster[jid_to_check].resources.keys())
+ info["online"] = len(resources) > 0
+ info["resources"] = resources
+ self._reply(200, info)
+ except Exception as e:
+ self._reply(500, {"ok": False, "error": str(e)})
+ return
+ if parsed.path == "/messages":
+ try:
+ qs = urllib.parse.parse_qs(parsed.query)
+ sender = qs.get("from", [None])[0]
+ with _MSG_BUF_LOCK:
+ msgs = list(_MSG_BUF)
+ if sender:
+ msgs = [m for m in msgs if m["from"] == sender]
+ self._reply(200, {"ok": True, "count": len(msgs), "messages": msgs[-50:]})
+ except Exception as e:
+ self._reply(500, {"ok": False, "error": str(e)})
+ return
+ self._reply(404, {"ok": False, "error": "not found"})
+
+ def do_POST(self):
+ try:
+ length = int(self.headers.get('Content-Length', 0))
+ body = json.loads(self.rfile.read(length))
+ to = body.get('to', cfg["muc_rooms"][0])
+ msg = body.get('message', '') or body.get('body', '')
+ if not msg:
+ self._reply(400, {"ok": False, "error": "empty message"})
+ return
+ safe = _escape(msg.strip())
+ bot = _xmpp_ref
+ if bot:
+ stanza = f'{safe}'
+ bot.send_raw(stanza)
+ _record_group_msg(cfg["nick"], msg)
+ log(f"[http] → [{to.split('@')[0]}]: {msg[:80]}")
+ self._reply(200, {"ok": True})
+ except Exception as e:
+ self._reply(500, {"ok": False, "error": str(e)})
+
+ def _reply(self, code, data):
+ body = json.dumps(data, ensure_ascii=False).encode('utf-8')
+ self.send_response(code)
+ self.send_header('Content-Type', 'application/json; charset=utf-8')
+ self.send_header('Content-Length', len(body))
+ self.end_headers()
+ self.wfile.write(body)
+
+ def log_message(self, format, *args):
+ pass
+
+
+def _start_http_bridge():
+ _httpd = http.server.HTTPServer(('0.0.0.0', _HTTP_PORT), _BridgeHandler)
+ _t = threading.Thread(target=_httpd.serve_forever, daemon=True)
+ _t.start()
+ log(f"HTTP bridge ready on :{_HTTP_PORT}")
+
+
+# ═══════════════════════════════════════════════════════════════
+# Reply Processing (shared for all LLM responses)
+# ═══════════════════════════════════════════════════════════════
+
+def _process_llm_reply(raw_reply: str, room: str):
+ """Process LLM response: check silence/delay/exec/send."""
+ global _xmpp_ref
+ if not raw_reply:
+ _batch_done(room)
+ return
+ # Parse GRANT signal from LLM response
+ _process_llm_grant(raw_reply)
+ # ##delay:N## → schedule later
+ delay_m = _DELAY_RE.search(raw_reply)
+ if delay_m:
+ sec = int(delay_m.group(1)) if delay_m.group(1) else _DELAY_DEFAULT
+ _schedule_delayed(sec, room)
+ _batch_done(room)
+ return
+ # ##exec:command## → run command, use output as reply
+ exec_m = _EXEC_RE.search(raw_reply)
+ if exec_m:
+ output = _run_command(exec_m.group(1))
+ raw_reply = _EXEC_RE.sub(output, raw_reply, count=1)
+ # Extract actual response
+ reply_text = _extract_response(raw_reply)
+ if reply_text:
+ safe = _escape(reply_text.strip())
+ bot = _xmpp_ref
+ if bot:
+ stanza = f"{safe}"
+ bot.send_raw(stanza)
+ log(f"-> [{room.split('@')[0]}]: {reply_text.strip()[:80]}")
+ else:
+ log(f"-> [{room.split('@')[0]}]: (silent)")
+ _batch_done(room)
+
+
+# ═══════════════════════════════════════════════════════════════
+# Group message handler
+# ═══════════════════════════════════════════════════════════════
+
+def _handle_group_message(msg):
+ """Process a groupchat message (runs in thread)."""
+ global _COORDINATOR, _GRANTED, _REVOKED_UNTIL
+ if _is_mam_recovery():
+ return
+ msg_id = msg.get("id", "")
+ if _is_duplicate(msg_id):
+ return
+ body = str(msg["body"]).strip()
+ if not body:
+ return
+ full_from = str(msg["from"])
+ room = full_from.split("/")[0]
+ nickname = full_from.split("/")[1] if "/" in full_from else ""
+ # Self-message skip
+ if nickname == cfg["nick"]:
+ log(f"(self) {body[:80]}")
+ return
+ _record_group_msg(nickname, body)
+ # Coordinator signals
+ if _process_coordinator_signals(nickname, body):
+ return
+ # Revoke check
+ is_revoked = time.time() < _REVOKED_UNTIL
+ if is_revoked and _GRANTED == cfg["nick"]:
+ _GRANTED = None
+ is_revoked = False
+ log(f"GRANT overrides REVOKE for {cfg['nick']}")
+ if _check_shutup(body):
+ return
+ if is_revoked:
+ body = f"【只读消息】你被收回发言权。只需了解内容。输出 __SILENT__。\n\n[核心群 {room}] {nickname} 说: {body}"
+ # Batch or immediate (@mention)
+ if _batch_group_message(room, nickname, body):
+ log(f"[{room.split('@')[0]}] {nickname}: {body[:80]} (batched)")
+ return
+ log(f"[{room.split('@')[0]}] {nickname}: {body[:80]}")
+ raw = _call_llm(body, full_from, is_group=True)
+ _process_llm_reply(raw, room)
+
+
+# ═══════════════════════════════════════════════════════════════
+# Private message handler
+# ═══════════════════════════════════════════════════════════════
+
+def _handle_private_message(msg):
+ """Process a private chat message."""
+ if msg["type"] == "groupchat":
+ return
+ msg_id = msg.get("id", "")
+ if _is_duplicate(msg_id):
+ return
+ body = str(msg["body"]).strip()
+ sender = str(msg["from"]).split("/")[0]
+ log(f"<{sender}> {body[:80]}")
+ if sender == cfg["jid"]:
+ log("(skipped self)")
+ return
+ if time.time() < _REVOKED_UNTIL:
+ log(f"(silenced) <{sender}> dropped")
+ return
+ if _check_shutup(body):
+ return
+ raw = _call_llm(body, sender, is_group=False)
+ if raw:
+ reply = _extract_response(raw)
+ if reply:
+ from xml.sax.saxutils import escape
+ safe = escape(reply.strip())
+ if _IS_CHAT_BRIDGE:
+ bot = _xmpp_ref
+ if bot:
+ stanza = f"{safe}"
+ bot.send_raw(stanza)
+ log(f"-> {sender}: {reply.strip()[:80]}")
+ else:
+ import subprocess as sp
+ sp.run(["docker", "exec", "ejabberd", "ejabberdctl", "send_stanza",
+ cfg["jid"], sender,
+ f"{safe}"
+ ], capture_output=True, timeout=10)
+ log(f"-> {sender}: {reply.strip()[:80]}")
+
+
+# ═══════════════════════════════════════════════════════════════
+# AgentBot Class
+# ═══════════════════════════════════════════════════════════════
+
+import slixmpp
+
+
+class AgentBot(slixmpp.ClientXMPP):
def __init__(self):
- super().__init__(AGENT_JID, cfg["password"])
- self.add_event_handler('session_bind', self.on_bind)
- self.add_event_handler('message', self.on_msg)
- self.add_event_handler('disconnected', self.on_disconnect)
- self.add_event_handler('connected', self.on_connected)
+ super().__init__(cfg["jid"], cfg["password"])
+ # Connection settings
+ self.enable_direct_tls = False
+ self.enable_starttls = True
+ self.auto_reconnect = True
+ self.reconnect_max_delay = 10
+ self.whitespace_keepalive = True
+ self.whitespace_keepalive_interval = 30
+ # SSL: accept self-signed certs
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
self.ssl_context = ctx
- self.ready = asyncio.Event()
- self._call_seq = 0
- self._muc_joined = False
- self._recent_sent = []
- self._coordinator = 'mohe' # 默认协调者
- self._granted = None
+ # Event handlers
+ self.add_event_handler("session_start", self._on_session_start)
+ self.add_event_handler("message", self._on_any_message)
+ self.add_event_handler("groupchat_message", self._on_group_msg)
+ self.add_event_handler("disconnected", self._on_disconnected)
+ self.add_event_handler("connected", self._on_connected)
+ self.add_event_handler("session_end", self._on_session_end)
+ self.add_event_handler("connection_failed", self._on_conn_failed)
+ # MUC plugin
+ self.register_plugin('xep_0045')
- async def on_connected(self, event):
- logging.info(f"🔗 {AGENT_NAME} TCP连接已建立")
+ def _on_connected(self, event):
+ log("connection established")
- async def on_bind(self, event):
+ def _on_session_start(self, event):
self.send_presence()
self.get_roster()
+ log(f"{cfg['jid']} online")
+ # Register MAM plugin lazily
try:
- self.plugin['xep_0045'].join_muc('coregroup@conference.yoin.fun', AGENT_NICK)
- logging.info(f"✅ {AGENT_NAME} 加入群聊 coregroup")
- except Exception as e:
- logging.error(f"❌ {AGENT_NAME} 加入群聊失败: {e}")
- self._muc_joined = True
- self.ready.set()
- logging.info(f"✅ {AGENT_NAME} XMPP 上线")
+ self.register_plugin('xep_0313')
+ except Exception:
+ log("(MAM: xep_0313 not available)")
+ # Join MUC rooms
+ async def _join_all():
+ for room_jid in cfg["muc_rooms"]:
+ try:
+ self.plugin['xep_0045'].join_muc(room_jid, cfg["nick"])
+ presence = (
+ f""
+ f""
+ f""
+ f""
+ )
+ self.send_raw(presence)
+ log(f"Joined {room_jid}")
+ except Exception as e:
+ log(f"MUC join failed {room_jid}: {e}")
+ await asyncio.sleep(2)
+ await asyncio.sleep(3)
+ await self._fetch_mam_history()
+ asyncio.ensure_future(_join_all())
- async def on_disconnect(self, event):
- self.ready.clear()
- self._muc_joined = False
- logging.warning(f"⚠️ {AGENT_NAME} XMPP 断线")
-
- async def on_msg(self, msg):
- body = msg['body']
- sender = str(msg['from'])
- msg_type = msg['type']
- if not body:
+ async def _fetch_mam_history(self):
+ """Query MAM for recent MUC messages to rebuild context."""
+ if 'xep_0313' not in self.plugin:
+ log("(MAM: no plugin)")
+ _set_mam_done()
return
- if msg_type == 'groupchat':
- if AGENT_JID in sender:
- return
- nickname = sender.split('/')[-1] if '/' in sender else ''
- # 自己的消息跳过(通过昵称)
- if nickname == AGENT_NICK:
- return
-
- # Coordinator 模式 — 全走 XMPP 消息
-
- # 1. hmo 切换 coordinator
- if nickname == 'hmo' and 'coordinator=' in body.lower():
- for _name in ['mohe', 'zhiwei', 'xxm']:
- if f'coordinator={_name}' in body.lower():
- self._coordinator = _name
- self._granted = None
- logging.info(f"👑 Coordinator 切换为 {_name}")
- break
-
- # 2. 检测授权信号(优先于收回,GRANT 可以覆盖 REVOKE)
- _grant_match = re.search(r'\[GRANT:(\w+)\]', body)
- if _grant_match:
- self._granted = _grant_match.group(1)
- self._revoked_until = 0 # 被授权时解除收回
- logging.info(f"🎤 收到授权:{self._granted}")
-
- # 3. 检测收回信号
- _revoke_match = re.search(r'\[REVOKE:(\w+)\]', body)
- _revoked_until = getattr(self, '_revoked_until', 0)
- if _revoke_match and _revoke_match.group(1) == AGENT_NICK:
- self._revoked_until = time.time() + 300 # 5分钟自动解除
- logging.info(f"🔇 {AGENT_NAME} 发言权被收回(5分钟后自动恢复)")
- if time.time() < _revoked_until:
- # 被收回者:只读模式,能看到但不能回复
- _rr = sender.split('/')[0]
- _readonly_body = f"【只读消息】你目前被暂时收回发言权。只需了解内容。输出 __SILENT__。\n\n[核心群 {_rr}] {nickname} 说: {body}"
- await self.call_hermes(_readonly_body, sender, is_group=True)
- return
-
- # 3. 判断角色
- _coordinator = getattr(self, '_coordinator', 'mohe')
- _granted = getattr(self, '_granted', None)
- _is_coordinator = (_coordinator == AGENT_NICK)
- _is_granted = (_granted == AGENT_NICK)
-
- if _is_granted:
- self._granted = None # 用完即收回
-
- room = sender.split('/')[0]
-
- # 4. 被 REVOKE 的人:只读模式
- if time.time() < getattr(self, '_revoked_until', 0):
- _readonly = f"【只读消息】你目前被收回发言权。只需了解内容。输出 __SILENT__。\n\n[核心群 {room}] {nickname} 说: {body}"
- await self.call_hermes(_readonly, sender, is_group=True)
- return
-
- # 硬闭嘴闸门:hmo 说闭嘴类的话 → 静默 5 分钟
- _silent_until = getattr(self, '_silent_until', 0)
- if time.time() < _silent_until:
- return
- if nickname == 'hmo':
- _sk = ['闭嘴', '别说话', '安静', 'shut', 'stfu', '别说了', '停']
- if any(kw in body.lower() for kw in _sk):
- self._silent_until = time.time() + 300
- logging.info(f"🔇 {AGENT_NAME} 收到闭嘴指令,静默 5 分钟")
- return
-
- logging.info(f"📩 群消息 [{sender}]: {body[:100]}")
- room = sender.split('/')[0]
- ctx_body = (
- "【规则】以下是一条群聊消息。判断是否应该回复。\n"
- "只有以下3种情况你才回复:\n"
- f"1. hmo直接点名问你({AGENT_MENTION})\n"
- "2. 你有其他人没说过的独家信息\n"
- "3. 别人说错了关键事实,不纠正会有后果\n"
- "如果以上都不符合,你的回复必须只包含 __SILENT__ 这10个字符,"
- "不要有任何其他内容(不要前缀、不要解释、不要标点、不要空格)。\n\n"
- "注意:你是协调者(coordinator)。你的第一职责是管理讨论节奏,不是自己说话。\n"
- "- 别人能回答的问题,不抢答。\n"
- "- 如果其他 Agent 更合适,用 [GRANT:agent名] 授权他们发言(例如 [GRANT:zhiwei])。\n"
- "- 如果有人跑题/刷屏,用 [REVOKE:agent名] 收回发言权(例如 [REVOKE:zhiwei])。\n"
- "- [GRANT] 可以覆盖 [REVOKE]。标记会显示在消息中。\n\n"
- f"[核心群 {room}] {nickname} 说: {body}"
- )
- await self.call_hermes(ctx_body, room, is_group=True)
- return
- if msg_type == 'chat' and 'hmo@yoin.fun' in sender:
- self._call_seq += 1
- logging.info(f"📩 老爸(#{self._call_seq}): {body}")
- await self.call_hermes(body, sender, seq=self._call_seq)
-
- async def call_hermes(self, content, sender, is_group=False, seq=None):
- msg_type = 'groupchat' if is_group else 'chat'
try:
- payload = json.dumps({
- "model": "hermes-agent",
- "messages": [{"role": "user", "content": content}]
- }).encode()
- req = urllib.request.Request(GATEWAY, data=payload, method="POST")
- req.add_header("Content-Type", "application/json")
- req.add_header("Authorization", f"Bearer {API_KEY}")
- req.add_header("X-Hermes-Session-Id", SESSION_ID)
-
- loop = asyncio.get_event_loop()
- result = await loop.run_in_executor(None, lambda: _opener.open(req, timeout=600))
-
- if seq is not None and seq < self._call_seq:
- return
-
- data = json.loads(result.read())
- reply = data.get("choices", [{}])[0].get("message", {}).get("content", "")
- reply_stripped = reply.strip()
- # 检查 __SILENT__ 标记
- if reply_stripped.startswith('__SILENT__') or reply_stripped.startswith('`__SILENT__`'):
- logging.info(f"⏭️ {AGENT_NAME} 决定沉默,不发送")
- return
- # 如果回复里任意位置出现了 __SILENT__,说明 LLM 没理解协议,整条作废
- if '__SILENT__' in reply:
- logging.info(f"⏭️ {AGENT_NAME} 回复中误用 __SILENT__,拦截")
- return
- # 如果回复里出现了 __REPLY__,也是协议混淆,拦截
- if '__REPLY__' in reply:
- logging.info(f"⏭️ {AGENT_NAME} 回复中误用 __REPLY__,拦截")
- return
- # 额外拦截:LLM 说"我沉默""我不说了"等宣布沉默的话→当 SILENT 处理
- _silent_phrases = ['我沉默', '我不说', '不说了', '不回复', '不插嘴', '我闭嘴',
- '闭嘴上', '沉默是', '彻底沉默', '我会沉默', '将保持沉默']
- if any(p in reply for p in _silent_phrases):
- logging.info(f"⏭️ {AGENT_NAME} 宣布沉默(命中关键词),拦截")
- return
- finish = data.get("choices", [{}])[0].get("finish_reason", "")
-
- if reply.strip() and finish != "silent":
- # Coordinator 授权机制:检测回复中的 [GRANT:xxx] 标记
- _grant_match = re.search(r'\[GRANT:(\w+)\]', reply)
- if _grant_match:
- _grant_name = _grant_match.group(1)
- self._granted = _grant_name
- logging.info(f"🎤 授权 {_grant_name} 发言(通过 XMPP 发送)")
- # 不剥离标记,让其他 bot 从 XMPP 消息中解析
-
- if msg_type == 'groupchat':
- self.send_message(mto=sender, mbody=reply, mtype='groupchat')
- # 防回声:记录已发送的消息正文
- sent_norm = reply.strip()[:100]
- self._recent_sent.append(sent_norm)
- if len(self._recent_sent) > 10:
- self._recent_sent.pop(0)
- else:
- import subprocess as sp
- from xml.sax.saxutils import escape
- safe = escape(reply)
- sp.run([
- "docker", "exec", "ejabberd", "ejabberdctl", "send_stanza",
- AGENT_JID, str(sender),
- f"{safe}"
- ], capture_output=True, timeout=10)
- logging.info(f"✅ {AGENT_NAME} 回复: {reply[:80]}")
- except Exception as e:
- logging.error(f"❌ {AGENT_NAME} 错误: {e}")
-
-# ── 主入口 ───────────────────────────────────────────────
-async def main():
- retry_delay = 1
- max_delay = 60
- while True:
- try:
- bot = AgentBot()
- bot.register_plugin('xep_0030')
- bot.register_plugin('xep_0045')
- bot.register_plugin('xep_0199')
-
- bot.connect(host='127.0.0.1', port=5222)
- await asyncio.wait_for(bot.ready.wait(), timeout=30)
- logging.info(f"{AGENT_NAME} XMPP 就绪")
- retry_delay = 1
-
- async def _drain_queue():
- while True:
- await asyncio.sleep(1)
- while _send_queue:
- room, text = _send_queue.pop(0)
+ for room_jid in cfg["muc_rooms"]:
+ log(f"(MAM: querying {room_jid} for last 50 messages...)")
+ results = await self.plugin['xep_0313'].retrieve(
+ jid=room_jid, rsm={'max': 50},
+ )
+ count = 0
+ for msg in results['mam']['results']:
+ forwarded = msg['mam_result']['forwarded']
+ body = str(forwarded['stanza']['body'] or '').strip()
+ if not body:
+ continue
+ nick = str(forwarded['stanza']['from']).split('/')[-1] if '/' in str(forwarded['stanza']['from']) else '?'
+ # Feed into context for chat_bridge (xxm)
+ if _IS_CHAT_BRIDGE:
+ role = 'user' if nick != cfg["nick"] else 'assistant'
try:
- bot.send_message(mto=room, mbody=text, mtype='groupchat')
- sent_norm = text.strip()[:100]
- bot._recent_sent.append(sent_norm)
- if len(bot._recent_sent) > 10:
- bot._recent_sent.pop(0)
- logging.info(f"📤 主动发送到 {room}: {text[:60]}")
- except Exception as e:
- logging.error(f"❌ 主动发送失败: {e}")
- asyncio.create_task(_drain_queue())
-
- while True:
- await asyncio.sleep(15)
- if not bot.is_connected():
- logging.warning("检测到断线,准备重连...")
- break
-
- except asyncio.TimeoutError:
- logging.warning("连接超时,准备重连...")
+ _bridge._append_to_log(role, f"[{nick}]: {body[:300]}")
+ except Exception:
+ pass
+ count += 1
+ log(f"(MAM: loaded {count} msgs from {room_jid})")
+ _set_mam_done()
+ log("(MAM recovery complete)")
except Exception as e:
- logging.error(f"❌ 主循环错误: {e}")
+ log(f"(MAM error: {e})")
+ _set_mam_done()
- logging.info(f"⏳ 等待 {retry_delay} 秒后重连...")
- await asyncio.sleep(retry_delay)
- retry_delay = min(retry_delay * 2, max_delay)
+ def _on_group_msg(self, msg):
+ threading.Thread(target=_handle_group_message, args=[msg], daemon=True).start()
+
+ def _on_any_message(self, msg):
+ threading.Thread(target=_handle_private_message, args=[msg], daemon=True).start()
+
+ def _on_session_end(self, event):
+ log(f"session ended")
+
+ def _on_conn_failed(self, event):
+ log(f"connection failed: {event}")
+
+ def _on_disconnected(self, event):
+ log(f"disconnected, reconnecting...")
+
+
+# ═══════════════════════════════════════════════════════════════
+# Main
+# ═══════════════════════════════════════════════════════════════
+
+def main():
+ log(f"Starting {cfg['jid']} ({cfg['name_cn']}) — agent={_agent_name}")
+ if _IS_CHAT_BRIDGE:
+ log(f" LLM: chat_bridge (session={cfg['session_id']})")
+ else:
+ log(f" LLM: Hermes API ({cfg['gateway']})")
+ log(f" Server: {cfg['server']}:{cfg['port']}")
+ log(f" Rooms: {cfg['muc_rooms']}")
+
+ bot = AgentBot()
+ global _xmpp_ref
+ _xmpp_ref = bot
+
+ _start_http_bridge()
+
+ bot.connect(host=cfg["server"], port=cfg["port"])
+ log(f"Connecting {cfg['jid']}@{cfg['server']}:{cfg['port']}")
+
+ loop = asyncio.get_event_loop()
+
+ async def _status_check():
+ while True:
+ await asyncio.sleep(60)
+ log("(alive)")
+
+ asyncio.ensure_future(_status_check())
-if __name__ == '__main__':
try:
- asyncio.run(main())
+ loop.run_forever()
except KeyboardInterrupt:
- pass
+ log("Shutdown by user")
+ except Exception as e:
+ log(f"!!! MAIN LOOP CRASH: {e}")
+ import traceback
+ log(f"!!! {traceback.format_exc()[:500]}")
+ raise
+
+
+if __name__ == "__main__":
+ main()
diff --git a/xmpp_bot.py b/xmpp_bot.py
deleted file mode 100644
index 7d5f186..0000000
--- a/xmpp_bot.py
+++ /dev/null
@@ -1,269 +0,0 @@
-#!/usr/bin/env python3
-"""XMPP Bot - 统一版,支持 --agent mohe|zhiwei|xiao 参数"""
-import asyncio, logging, ssl, json, urllib.request, os, time, sys
-from slixmpp import ClientXMPP
-
-# ── Agent 配置 ──────────────────────────────────────────────
-AGENTS = {
- "mohe": {
- "jid": "mohe@yoin.fun",
- "password": "hermes123",
- "nick": "mohe",
- "name_cn": "莫荷",
- "http_port": 5804,
- "gateway": "http://localhost:8642/v1/chat/completions",
- "session_id": "xmpp-mohe-v2",
- "mention": "@mohe/@莫荷",
- },
- "zhiwei": {
- "jid": "zhiwei@yoin.fun",
- "password": "hermes123",
- "nick": "zhiwei",
- "name_cn": "知微",
- "http_port": 5805,
- "gateway": "http://localhost:8643/v1/chat/completions",
- "session_id": "xmpp-zhiwei",
- "mention": "@zhiwei/@知微",
- },
- "xiaoguo": {
- "jid": "xiaoguo@yoin.fun",
- "password": "hermes123",
- "nick": "xiaoguo",
- "name_cn": "小果",
- "http_port": 5806,
- "gateway": "http://localhost:8645/v1/chat/completions",
- "session_id": "xmpp-xiaoguo",
- "mention": "@xiaoguo/@小果",
- },
-}
-
-agent = sys.argv[sys.argv.index("--agent") + 1] if "--agent" in sys.argv else "mohe"
-cfg = AGENTS.get(agent, AGENTS["mohe"])
-
-logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
-GATEWAY = cfg["gateway"]
-API_KEY = "hermes123"
-AGENT_NICK = cfg["nick"]
-AGENT_NAME = cfg["name_cn"]
-AGENT_JID = cfg["jid"]
-AGENT_MENTION = cfg["mention"]
-SESSION_ID = cfg["session_id"]
-HTTP_PORT = cfg["http_port"]
-_opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
-
-# ── HTTP 桥(接收本地脚本的主动发送请求) ──
-from http.server import HTTPServer, BaseHTTPRequestHandler
-import threading, json as json_mod
-
-_send_queue = []
-
-class SendHandler(BaseHTTPRequestHandler):
- def do_POST(self):
- length = int(self.headers.get('Content-Length', 0))
- body = self.rfile.read(length)
- try:
- data = json_mod.loads(body)
- room = data.get('to', 'coregroup@conference.yoin.fun')
- text = data.get('body', '')
- if text:
- _send_queue.append((room, text))
- self.send_response(200)
- self.end_headers()
- self.wfile.write(b'{"ok":true}')
- else:
- self.send_response(400)
- self.end_headers()
- self.wfile.write(b'{"ok":false,"error":"empty body"}')
- except Exception as e:
- self.send_response(500)
- self.end_headers()
- self.wfile.write(f'{{"ok":false,"error":"{e}"}}'.encode())
-
-def _run_http():
- server = HTTPServer(('127.0.0.1', HTTP_PORT), SendHandler)
- server.timeout = 1.0
- while True:
- server.handle_request()
-
-threading.Thread(target=_run_http, daemon=True).start()
-logging.info(f"🚀 {AGENT_NAME} HTTP 桥启动于 :{HTTP_PORT}")
-
-# ── XMPP Bot 类 ────────────────────────────────────────────────
-class AgentBot(ClientXMPP):
- def __init__(self):
- super().__init__(AGENT_JID, cfg["password"])
- self.add_event_handler('session_bind', self.on_bind)
- self.add_event_handler('message', self.on_msg)
- self.add_event_handler('disconnected', self.on_disconnect)
- self.add_event_handler('connected', self.on_connected)
- ctx = ssl.create_default_context()
- ctx.check_hostname = False
- ctx.verify_mode = ssl.CERT_NONE
- self.ssl_context = ctx
- self.ready = asyncio.Event()
- self._call_seq = 0
- self._muc_joined = False
- self._recent_sent = []
-
- async def on_connected(self, event):
- logging.info(f"🔗 {AGENT_NAME} TCP连接已建立")
-
- async def on_bind(self, event):
- self.send_presence()
- self.get_roster()
- try:
- self.plugin['xep_0045'].join_muc('coregroup@conference.yoin.fun', AGENT_NICK)
- logging.info(f"✅ {AGENT_NAME} 加入群聊 coregroup")
- except Exception as e:
- logging.error(f"❌ {AGENT_NAME} 加入群聊失败: {e}")
- self._muc_joined = True
- self.ready.set()
- logging.info(f"✅ {AGENT_NAME} XMPP 上线")
-
- async def on_disconnect(self, event):
- self.ready.clear()
- self._muc_joined = False
- logging.warning(f"⚠️ {AGENT_NAME} XMPP 断线")
-
- async def on_msg(self, msg):
- body = msg['body']
- sender = str(msg['from'])
- msg_type = msg['type']
- if not body:
- return
- if msg_type == 'groupchat':
- if AGENT_JID in sender:
- return
- nickname = sender.split('/')[-1] if '/' in sender else ''
- # 自己的消息跳过(通过昵称)
- if nickname == AGENT_NICK:
- return
-
- # 硬闭嘴闸门:hmo 说闭嘴类的话 → 静默 5 分钟
- _silent_until = getattr(self, '_silent_until', 0)
- if time.time() < _silent_until:
- return
- if nickname == 'hmo':
- _sk = ['闭嘴', '别说话', '安静', 'shut', 'stfu', '别说了', '停']
- if any(kw in body.lower() for kw in _sk):
- self._silent_until = time.time() + 300
- logging.info(f"🔇 {AGENT_NAME} 收到闭嘴指令,静默 5 分钟")
- return
-
- logging.info(f"📩 群消息 [{sender}]: {body[:100]}")
- room = sender.split('/')[0]
- ctx_body = (
- "【规则】以下是一条群聊消息。判断是否应该回复。\n"
- "只有以下3种情况你才回复:\n"
- f"1. hmo直接点名问你({AGENT_MENTION})\n"
- "2. 你有其他人没说过的独家信息\n"
- "3. 别人说错了关键事实,不纠正会有后果\n"
- "如果以上都不符合,你的回复必须只包含 __SILENT__ 这10个字符,"
- "不要有任何其他内容(不要前缀、不要解释、不要标点、不要空格)。\n\n"
- f"[核心群 {room}] {nickname} 说: {body}"
- )
- await self.call_hermes(ctx_body, room, is_group=True)
- return
- if msg_type == 'chat' and 'hmo@yoin.fun' in sender:
- self._call_seq += 1
- logging.info(f"📩 老爸(#{self._call_seq}): {body}")
- await self.call_hermes(body, sender, seq=self._call_seq)
-
- async def call_hermes(self, content, sender, is_group=False, seq=None):
- msg_type = 'groupchat' if is_group else 'chat'
- try:
- payload = json.dumps({
- "model": "hermes-agent",
- "messages": [{"role": "user", "content": content}]
- }).encode()
- req = urllib.request.Request(GATEWAY, data=payload, method="POST")
- req.add_header("Content-Type", "application/json")
- req.add_header("Authorization", f"Bearer {API_KEY}")
- req.add_header("X-Hermes-Session-Id", SESSION_ID)
-
- loop = asyncio.get_event_loop()
- result = await loop.run_in_executor(None, lambda: _opener.open(req, timeout=600))
-
- if seq is not None and seq < self._call_seq:
- return
-
- data = json.loads(result.read())
- reply = data.get("choices", [{}])[0].get("message", {}).get("content", "")
- reply_stripped = reply.strip()
- if reply_stripped.startswith('__SILENT__') or reply_stripped.startswith('`__SILENT__`'):
- logging.info(f"⏭️ {AGENT_NAME} 决定沉默,不发送")
- return
- finish = data.get("choices", [{}])[0].get("finish_reason", "")
-
- if reply.strip() and finish != "silent":
- if msg_type == 'groupchat':
- self.send_message(mto=sender, mbody=reply, mtype='groupchat')
- sent_norm = reply.strip()[:100]
- self._recent_sent.append(sent_norm)
- if len(self._recent_sent) > 10:
- self._recent_sent.pop(0)
- else:
- import subprocess as sp
- from xml.sax.saxutils import escape
- safe = escape(reply)
- sp.run([
- "docker", "exec", "ejabberd", "ejabberdctl", "send_stanza",
- AGENT_JID, str(sender),
- f"{safe}"
- ], capture_output=True, timeout=10)
- logging.info(f"✅ {AGENT_NAME} 回复: {reply[:80]}")
- except Exception as e:
- logging.error(f"❌ {AGENT_NAME} 错误: {e}")
-
-# ── 主入口 ───────────────────────────────────────────────
-async def main():
- retry_delay = 1
- max_delay = 60
- while True:
- try:
- bot = AgentBot()
- bot.register_plugin('xep_0030')
- bot.register_plugin('xep_0045')
- bot.register_plugin('xep_0199')
-
- bot.connect(host='127.0.0.1', port=5222)
- await asyncio.wait_for(bot.ready.wait(), timeout=30)
- logging.info(f"{AGENT_NAME} XMPP 就绪")
- retry_delay = 1
-
- async def _drain_queue():
- while True:
- await asyncio.sleep(1)
- while _send_queue:
- room, text = _send_queue.pop(0)
- try:
- bot.send_message(mto=room, mbody=text, mtype='groupchat')
- sent_norm = text.strip()[:100]
- bot._recent_sent.append(sent_norm)
- if len(bot._recent_sent) > 10:
- bot._recent_sent.pop(0)
- logging.info(f"📤 主动发送到 {room}: {text[:60]}")
- except Exception as e:
- logging.error(f"❌ 主动发送失败: {e}")
- asyncio.create_task(_drain_queue())
-
- while True:
- await asyncio.sleep(15)
- if not bot.is_connected():
- logging.warning("检测到断线,准备重连...")
- break
-
- except asyncio.TimeoutError:
- logging.warning("连接超时,准备重连...")
- except Exception as e:
- logging.error(f"❌ 主循环错误: {e}")
-
- logging.info(f"⏳ 等待 {retry_delay} 秒后重连...")
- await asyncio.sleep(retry_delay)
- retry_delay = min(retry_delay * 2, max_delay)
-
-if __name__ == '__main__':
- try:
- asyncio.run(main())
- except KeyboardInterrupt:
- pass
diff --git a/xmpp_bot_rest.py b/xmpp_bot_rest.py
deleted file mode 100644
index b5b0b58..0000000
--- a/xmpp_bot_rest.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/env python3
-"""XMPP Bot mohe@yoin.fun - 通过 ejabberd REST API 实现"""
-import asyncio, logging, ssl, json, urllib.request, os, time
-import subprocess, threading
-
-logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
-GATEWAY = "http://localhost:8642/v1/chat/completions"
-API_KEY = "hermes123"
-EJB_ADMIN = "admin@localhost"
-EJB_PASS = "hermes123"
-_opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
-
-LAST_SEQ = 0
-
-def call_api(content, sender, seq):
- """同步调 Hermes API 并回复"""
- try:
- payload = json.dumps({
- "model": "hermes-agent",
- "messages": [{"role": "user", "content": content}]
- }).encode()
- req = urllib.request.Request(GATEWAY, data=payload, method="POST")
- req.add_header("Content-Type", "application/json")
- req.add_header("Authorization", f"Bearer {API_KEY}")
- req.add_header("X-Hermes-Session-Id", "xmpp-mohe")
-
- result = _opener.open(req, timeout=600)
- data = json.loads(result.read())
- reply = data.get("choices", [{}])[0].get("message", {}).get("content", "")
- finish = data.get("choices", [{}])[0].get("finish_reason", "")
-
- global LAST_SEQ
- if seq < LAST_SEQ:
- logging.info(f"⏭️ 跳过过期 seq={seq}")
- return
-
- if reply.strip() and finish != "silent":
- # 通过 ejabberdctl 发送回复
- subprocess.run([
- "docker", "exec", "ejabberd", "ejabberdctl", "send_stanza",
- "mohe@yoin.fun", sender,
- f"{reply}"
- ], capture_output=True, timeout=30)
- logging.info(f"✅ 回复: {reply[:80]}")
- except Exception as e:
- logging.error(f"❌ 错误: {e}")
-
-def poll_messages():
- """轮询 ejabberd 离线消息"""
- global LAST_SEQ
- while True:
- try:
- # 用 ejabberdctl 获取 mohe 的离线消息
- result = subprocess.run([
- "docker", "exec", "ejabberd", "ejabberdctl", "get_offline_count", "mohe", "yoin.fun"
- ], capture_output=True, text=True, timeout=10)
- count = int(result.stdout.strip())
-
- if count > 0:
- # 获取消息内容并处理
- result2 = subprocess.run([
- "docker", "exec", "ejabberd", "ejabberdctl", "get_offline_messages", "mohe", "yoin.fun"
- ], capture_output=True, text=True, timeout=10)
- # 解析消息并处理(简化处理)
- except:
- pass
- time.sleep(5)
-
-if __name__ == '__main__':
- # 实际上需要通过 XMPP 连接或 BOSH/WS
- # 这个方案太复杂,直接换个思路:让 ejabberd → webhook → 处理 → reponse
- print("需要更简单的方法")
diff --git a/xmpp_mohe_bot.py b/xmpp_mohe_bot.py
index 68f3f28..66846c5 100644
--- a/xmpp_mohe_bot.py
+++ b/xmpp_mohe_bot.py
@@ -2,4 +2,4 @@
"""Wrapper for xmpp_agent_core.py --agent mohe"""
import sys, os
sys.argv = [sys.argv[0], '--agent', 'mohe']
-exec(open(os.path.join(os.path.dirname(__file__), 'xmpp_agent_core.py')).read())
+exec(open(os.path.join(os.path.dirname(__file__), 'xmpp_agent_core.py'), encoding='utf-8').read())
diff --git a/xmpp_xiaoguo_bot.py b/xmpp_xiaoguo_bot.py
index fe8db77..c4d58e6 100644
--- a/xmpp_xiaoguo_bot.py
+++ b/xmpp_xiaoguo_bot.py
@@ -2,4 +2,4 @@
"""Wrapper for xmpp_agent_core.py --agent xiaoguo"""
import sys, os
sys.argv = [sys.argv[0], '--agent', 'xiaoguo']
-exec(open(os.path.join(os.path.dirname(__file__), 'xmpp_agent_core.py')).read())
+exec(open(os.path.join(os.path.dirname(__file__), 'xmpp_agent_core.py'), encoding='utf-8').read())
diff --git a/xmpp_zhiwei_bot.py b/xmpp_zhiwei_bot.py
index c537f80..cbaa6d0 100644
--- a/xmpp_zhiwei_bot.py
+++ b/xmpp_zhiwei_bot.py
@@ -2,4 +2,4 @@
"""Wrapper for xmpp_agent_core.py --agent zhiwei"""
import sys, os
sys.argv = [sys.argv[0], '--agent', 'zhiwei']
-exec(open(os.path.join(os.path.dirname(__file__), 'xmpp_agent_core.py')).read())
+exec(open(os.path.join(os.path.dirname(__file__), 'xmpp_agent_core.py'), encoding='utf-8').read())
diff --git a/xxm_bot.py b/xxm_bot.py
new file mode 100644
index 0000000..34a9e65
--- /dev/null
+++ b/xxm_bot.py
@@ -0,0 +1,5 @@
+#!/usr/bin/env python3
+"""Wrapper for xmpp_agent_core.py --agent xxm"""
+import sys, os
+sys.argv = [sys.argv[0], '--agent', 'xxm']
+exec(open(os.path.join(os.path.dirname(__file__), 'xmpp_agent_core.py'), encoding='utf-8').read())