Initial: multi-agent XMPP communication system with dashboard
- Platform-based architecture (Windows/Linux/Mac) - Agent instance registry (agents.yaml) - Management dashboard with cross-platform monitoring - xmpp_bot with HTTP bridge + health endpoints - wechat_agent with WeChat-Hermes bridging - Platform services: ProcessGuardian, HealthProbe, APIRouter, ChannelBridge - Deployment: systemd (Linux) + PowerShell (Windows) - Monitoring: SSH+ejabberdctl for cross-platform presence
This commit is contained in:
@@ -0,0 +1,150 @@
|
||||
"""
|
||||
xxm health check. Runs every 5 min via Task Scheduler.
|
||||
Checks:
|
||||
1. Is xmpp_bot process alive? If not → restart
|
||||
2. Is it receiving messages? If alive but no msgs for 10+ min → restart
|
||||
3. Possible stuck loop? (too many tool calls)
|
||||
"""
|
||||
import os, sys, time, subprocess
|
||||
|
||||
# Project root: the parent of the directory that contains this script.
PROJECT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Log written by this health check, and the bot's own log that it inspects.
LOG_FILE = os.path.join(PROJECT, "logs", "health_check.log")
BOT_LOG = os.path.join(PROJECT, "logs", "xmpp_bot.log")

# Interpreter used to launch the bot. The original deployment path is still
# preferred when it exists; on any other machine fall back to the interpreter
# that is running this script instead of pointing at a missing executable.
_PREFERRED_PYTHON = r"C:\Users\hmo\AppData\Local\Programs\Python\Python310\python.exe"
PYTHON = _PREFERRED_PYTHON if os.path.isfile(_PREFERRED_PYTHON) else sys.executable

BOT_SCRIPT = os.path.join(PROJECT, "scripts", "xmpp_bot.py")

# Make sure the log directory exists before anything is logged.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
|
||||
|
||||
def wlog(msg: str):
    """Append one timestamped ``[health]`` entry to the health-check log.

    Args:
        msg: Text to record; it is written on a single line after the
            local-time timestamp and the ``[health]`` tag.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    entry = f"{stamp} [health] {msg}\n"
    with open(LOG_FILE, "a", encoding="utf-8") as log:
        log.write(entry)
|
||||
|
||||
def find_bot_pid() -> int:
    """Return the PID of the first running xmpp_bot process, or 0 if none.

    Lists processes with ``tasklist`` and, for every ``python.exe`` entry,
    asks ``wmic`` for its command line. A process counts as the bot when the
    command line mentions ``xmpp_bot`` but neither ``watchdog`` nor
    ``health``.

    The lookup is best-effort: a missing tool, a timeout, or undecodable
    output is treated as "not found" rather than raised.

    Returns:
        The bot's process id, or 0 when no matching process was found.
    """
    # Failures that mean "could not probe", as opposed to programming errors.
    # ValueError covers output-decoding failures (UnicodeDecodeError).
    probe_errors = (OSError, subprocess.SubprocessError, ValueError)

    try:
        listing = subprocess.run(['tasklist', '/FO', 'CSV', '/NH'],
                                 capture_output=True, text=True, timeout=10)
    except probe_errors:
        return 0

    for line in listing.stdout.splitlines():
        parts = line.strip('"').split('","')
        # Image names are compared case-insensitively.
        if len(parts) < 2 or parts[0].lower() != 'python.exe':
            continue
        pid_str = parts[1].strip()
        try:
            wmi = subprocess.run(
                ['wmic', 'process', 'where', f'ProcessId={pid_str}',
                 'get', 'CommandLine', '/format:list'],
                capture_output=True, text=True, timeout=5)
        except probe_errors:
            continue
        cmdline = wmi.stdout
        if ('xmpp_bot' in cmdline and 'watchdog' not in cmdline
                and 'health' not in cmdline):
            try:
                return int(pid_str)
            except ValueError:
                # Unparseable PID column: keep scanning the other processes.
                continue
    return 0
|
||||
|
||||
def kill_all_bots():
    """Force-kill every running xmpp_bot.py process.

    Lists processes with ``tasklist``, reads the command line of each
    ``python.exe`` entry through ``wmic``, and runs ``taskkill /f`` on those
    that mention ``xmpp_bot`` but not ``watchdog``. The process running this
    function is never killed, even if its own command line matches (for
    example when the project path itself contains ``xmpp_bot``).

    The operation is best-effort: probe or kill failures are skipped so that
    one bad process entry does not stop the sweep.
    """
    # Failures that mean "could not probe/kill", not programming errors.
    # ValueError covers output-decoding failures (UnicodeDecodeError).
    probe_errors = (OSError, subprocess.SubprocessError, ValueError)
    own_pid = str(os.getpid())

    try:
        listing = subprocess.run(['tasklist', '/FO', 'CSV', '/NH'],
                                 capture_output=True, text=True, timeout=10)
    except probe_errors:
        return

    for line in listing.stdout.splitlines():
        parts = line.strip('"').split('","')
        # Image names are compared case-insensitively.
        if len(parts) < 2 or parts[0].lower() != 'python.exe':
            continue
        pid_str = parts[1].strip()
        if pid_str == own_pid:
            # Never terminate the health check itself.
            continue
        try:
            wmi = subprocess.run(
                ['wmic', 'process', 'where', f'ProcessId={pid_str}',
                 'get', 'CommandLine', '/format:list'],
                capture_output=True, text=True, timeout=5)
            if 'xmpp_bot' not in wmi.stdout or 'watchdog' in wmi.stdout:
                continue
            result = subprocess.run(['taskkill', '/f', '/pid', pid_str],
                                    capture_output=True, timeout=5)
            # Only report a kill that taskkill says actually happened.
            if result.returncode == 0:
                wlog(f"Killed old bot PID {pid_str}")
            else:
                wlog(f"Failed to kill bot PID {pid_str} "
                     f"(taskkill exit {result.returncode})")
        except probe_errors:
            continue
|
||||
|
||||
def start_bot():
    """Kill any existing bot processes, wait briefly, then launch the bot.

    The new process is started with ``PYTHON`` on ``BOT_SCRIPT``, with its
    stdout and stderr discarded and the ``CREATE_NO_WINDOW`` creation flag.
    """
    kill_all_bots()
    # Pause between the kill and the relaunch (presumably to let the old
    # processes finish exiting).
    time.sleep(3)

    launch_cmd = [PYTHON, BOT_SCRIPT]
    discard = subprocess.DEVNULL
    subprocess.Popen(launch_cmd, stdout=discard, stderr=discard,
                     creationflags=subprocess.CREATE_NO_WINDOW)
|
||||
|
||||
def parse_log_tail(path: str, n: int = 100) -> list[str]:
    """Return the last *n* lines of a text file, without line terminators.

    The file is read as UTF-8 with undecodable bytes replaced. Only the
    trailing *n* lines are kept in memory while reading, so large logs are
    not loaded in full.

    Args:
        path: File to read.
        n: Maximum number of trailing lines to return.

    Returns:
        Up to *n* lines in file order; an empty list when *n* is not
        positive or the file is missing or unreadable.
    """
    from collections import deque

    # A slice like lines[-0:] would return every line; a non-positive
    # request means "no lines".
    if n <= 0:
        return []
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            tail = deque(f, maxlen=n)
    except (OSError, ValueError):
        # Missing/unreadable file, or a path that cannot be opened at all.
        return []
    return [line.rstrip("\n\r") for line in tail]
|
||||
|
||||
def get_last_msg_time(lines: list[str]) -> float:
    """Return the approximate epoch time of the last received group message.

    Scans *lines* from newest to oldest for an entry containing both
    ``[Group]`` and ``batched`` that starts with an ``HH:MM:SS`` timestamp.
    Log lines carry no date, so the timestamp is placed on today's date; if
    that lands in the future, the entry is assumed to be from yesterday.

    Args:
        lines: Log lines in chronological order.

    Returns:
        Seconds since the epoch for the matching entry, or 0.0 when no
        matching entry with a valid timestamp exists.
    """
    import re

    stamp_re = re.compile(r'^(\d{2}):(\d{2}):(\d{2})')

    for line in reversed(lines):
        if '[Group]' not in line or 'batched' not in line:
            continue
        m = stamp_re.search(line)
        if not m:
            continue
        h, mi, s = (int(g) for g in m.groups())
        if h > 23 or mi > 59 or s > 59:
            # Not a real clock time; keep looking at older entries.
            continue
        now = time.localtime()
        # tm_isdst=-1 lets mktime decide DST for the log timestamp itself
        # instead of forcing the current moment's DST flag onto it.
        log_time = time.mktime(
            (now.tm_year, now.tm_mon, now.tm_mday, h, mi, s, 0, 0, -1))
        if log_time > time.time():  # wrap around midnight
            log_time -= 86400
        return log_time
    return 0.0
|
||||
|
||||
def main():
    """Run one health-check pass over the xmpp_bot process and its log.

    Steps:
      1. If no bot process is found, start one, re-check after 5 seconds,
         log the outcome and stop.
      2. Otherwise summarise the last 100 lines of the bot log (heartbeats,
         group messages, responses, tool calls) in one status line.
      3. If the bot is producing heartbeats but its most recent group
         message is more than 10 minutes old, and the tail shows no recent
         startup, restart it.
      4. Otherwise only log warnings for a silent log tail or for an
         unusually high number of tool calls.
    """
    wlog("=== health check ===")

    pid = find_bot_pid()
    if pid == 0:
        wlog("CRIT: bot not running, restarting...")
        start_bot()
        time.sleep(5)
        pid = find_bot_pid()
        if pid:
            wlog(f"OK: restarted (PID {pid})")
        else:
            wlog("FAIL: restart failed")
        wlog("=== end ===")
        return

    # Activity analysis
    recent = parse_log_tail(BOT_LOG, 100)
    alive = sum(1 for l in recent if "alive" in l)
    responses = sum(1 for l in recent if l.startswith("-> ") and "silent" not in l)
    silent = sum(1 for l in recent if "silent" in l)
    tool_calls = sum(1 for l in recent if "run_command" in l)
    group_msgs = sum(1 for l in recent if "[Group]" in l)

    last_msg = get_last_msg_time(recent)
    # 999 is a placeholder age used when the tail holds no timestamped
    # group message.
    last_msg_age = (time.time() - last_msg) / 60 if last_msg else 999

    wlog(f"PID={pid} alive={alive} grp={group_msgs} rsp={responses} sl={silent} tools={tool_calls} lastMsg={last_msg_age:.0f}min")

    # CRITICAL: bot is alive but receiving no messages → disconnect detected
    # Skip check if bot was just started (has "online" in recent logs)
    #
    # The age of the last group message is the signal here. Also requiring
    # group_msgs == 0 would make this branch unreachable, because
    # last_msg > 0 already implies a "[Group]" line exists in the same tail.
    recent_start = sum(1 for l in recent if "online" in l)
    if alive >= 3 and last_msg > 0 and last_msg_age > 10 and recent_start == 0:
        wlog(f"CRIT: bot alive but NO messages for {last_msg_age:.0f} min. Forcing restart.")
        # start_bot() kills existing bot processes and waits before
        # launching, so no separate kill is needed here.
        start_bot()
        wlog("=== end (restarted) ===")
        return

    # Process is running but the log tail shows neither heartbeats nor
    # group traffic.
    if alive == 0 and group_msgs == 0:
        wlog("WARN: no activity in last windows")

    if tool_calls >= 25:
        wlog(f"WARN: heavy tool calls ({tool_calls}), possible loop")

    wlog("=== end ===")
|
||||
|
||||
# Script entry point: run a single health-check pass when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user