Files
hmo 1b2b935832 Initial: multi-agent XMPP communication system with dashboard
- Platform-based architecture (Windows/Linux/Mac)
- Agent instance registry (agents.yaml)
- Management dashboard with cross-platform monitoring
- xmpp_bot with HTTP bridge + health endpoints
- wechat_agent with WeChat-Hermes bridging
- Platform services: ProcessGuardian, HealthProbe, APIRouter, ChannelBridge
- Deployment: systemd (Linux) + PowerShell (Windows)
- Monitoring: SSH+ejabberdctl for cross-platform presence
2026-06-12 21:51:36 +08:00

174 lines
5.8 KiB
Python

"""
xmpp_watchdog.py — monitors xmpp_bot, auto-restarts on crash, reports status.
Runs alongside xmpp_bot.py as a separate process.
"""
import os, sys, time, subprocess, json, threading
# Directory containing this watchdog script; xmpp_bot.py is expected next to it.
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
BOT_SCRIPT = os.path.join(PROJECT_ROOT, "xmpp_bot.py")

# Logs and PID files live in sibling directories one level above PROJECT_ROOT.
LOG_DIR = os.path.join(os.path.dirname(PROJECT_ROOT), "logs")
WATCHDOG_LOG = os.path.join(LOG_DIR, "watchdog.log")
PID_FILE = os.path.join(os.path.dirname(PROJECT_ROOT), "temp", ".xmpp_watchdog.pid")
BOT_PID_FILE = os.path.join(os.path.dirname(PROJECT_ROOT), "temp", ".xmpp_bot.pid")

# Preferred interpreter for launching the bot. The path is machine-specific, so
# fall back to the interpreter running this watchdog when it does not exist.
PYTHON = r"C:\Users\hmo\AppData\Local\Programs\Python\Python310\python.exe"
if not os.path.isfile(PYTHON) and sys.executable:
    PYTHON = sys.executable

CHECK_INTERVAL = 30  # seconds between health checks

# Create both runtime directories up front: the log directory and the directory
# that holds the PID files (writing a PID file fails if it is missing).
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(os.path.dirname(BOT_PID_FILE), exist_ok=True)
def wlog(msg: str):
    """Append a timestamped line to the watchdog log and echo it to stdout.

    The file line is ``"<YYYY-mm-dd HH:MM:SS> [watchdog] <msg>"``; the console
    line omits the timestamp. A failure to write the log file (for example the
    file is locked or the disk is full) is reported on stdout instead of being
    raised, so a logging problem cannot terminate the monitoring loop.
    """
    ts = time.strftime("%Y-%m-%d %H:%M:%S")
    try:
        with open(WATCHDOG_LOG, "a", encoding="utf-8") as f:
            f.write(f"{ts} [watchdog] {msg}\n")
    except OSError as exc:
        print(f"[watchdog] log write failed: {exc}", flush=True)
    print(f"[watchdog] {msg}", flush=True)
def rotate_log(path: str, max_bytes: int = 5 * 1024 * 1024):
    """Rotate the log file at *path* once it grows beyond *max_bytes*.

    Keeps the two most recent backups: ``<path>.1`` (newest) and ``<path>.2``.
    On rotation ``<path>.1`` replaces ``<path>.2`` and the live file becomes
    ``<path>.1``; the live file itself is not recreated here.

    Rotation is best-effort: a missing file, or a file that cannot be renamed
    (for example because another process holds it open), is silently skipped.
    """
    try:
        if os.path.getsize(path) <= max_bytes:
            return
        newest = f"{path}.1"
        oldest = f"{path}.2"
        # os.replace overwrites an existing destination on every platform, so
        # no separate "remove the oldest backup first" step is required.
        if os.path.exists(newest):
            os.replace(newest, oldest)
        os.replace(path, newest)
        wlog(f"Rotated: {os.path.basename(path)}")
    except OSError:
        # Only filesystem errors are ignored; KeyboardInterrupt/SystemExit and
        # programming errors are no longer swallowed.
        pass
def is_process_alive(pid: int) -> bool:
    """Return True if a process with the given PID currently exists.

    PIDs <= 0 are never considered alive.

    On Windows the check runs ``tasklist`` filtered by PID and looks for the
    PID as a whole whitespace-separated token in the output, so the digits
    cannot match by accident inside some other field or message.

    On other platforms the process is probed with signal 0. If *pid* is an
    exited child of this process it is reaped first (side effect), because an
    unreaped child would otherwise still answer the probe.

    Any failure to perform the check is reported as "not alive".
    """
    if pid <= 0:
        return False

    if os.name != "nt":
        try:
            reaped, _status = os.waitpid(pid, os.WNOHANG)
            if reaped == pid:
                return False  # our own child, and it has exited
        except ChildProcessError:
            pass  # not a child of this process; fall through to the probe
        except (OSError, OverflowError):
            return False
        try:
            os.kill(pid, 0)  # signal 0: existence check only, nothing is sent
        except ProcessLookupError:
            return False
        except PermissionError:
            return True  # exists, but is owned by another user
        except (OSError, OverflowError):
            return False
        return True

    try:
        proc = subprocess.run(
            ['tasklist', '/FI', f'PID eq {pid}', '/NH'],
            capture_output=True, text=True, timeout=5
        )
    except (OSError, subprocess.SubprocessError, UnicodeError):
        return False
    return str(pid) in proc.stdout.split()
def kill_bot():
    """Kill ALL existing xmpp_bot.py processes before starting a new one.

    Lists every ``python.exe`` process via ``tasklist``, reads each one's
    command line via ``wmic`` and force-kills (``taskkill /f``) those whose
    command line mentions ``xmpp_bot`` but not ``watchdog``. When at least one
    process was actually killed, waits 3 seconds for cleanup.

    The whole operation is best-effort: if a tool is missing, times out or
    produces undecodable output, the affected process (or the whole scan) is
    skipped without raising.

    NOTE(review): relies on Windows-only tools; ``wmic`` is reportedly
    deprecated on recent Windows releases - confirm it exists on target hosts.
    """
    import csv  # local import: used only for parsing the tasklist CSV output

    tool_errors = (OSError, subprocess.SubprocessError, UnicodeError)

    try:
        listing = subprocess.run(
            ['tasklist', '/FO', 'CSV', '/NH', '/FI', 'IMAGENAME eq python.exe'],
            capture_output=True, text=True, timeout=10
        )
    except tool_errors:
        return

    killed = 0
    for row in csv.reader(listing.stdout.splitlines()):
        # Expected row: "python.exe","<pid>",... ; anything else (blank lines,
        # the "INFO: No tasks..." message) is ignored.
        if len(row) < 2 or row[0].lower() != 'python.exe':
            continue
        pid_str = row[1].strip()
        try:
            wmi = subprocess.run(
                ['wmic', 'process', 'where', f'ProcessId={pid_str}',
                 'get', 'CommandLine', '/format:list'],
                capture_output=True, text=True, timeout=5
            )
            cmdline = wmi.stdout
            # Skip non-bot interpreters and never kill the watchdog itself.
            if 'xmpp_bot' not in cmdline or 'watchdog' in cmdline:
                continue
            result = subprocess.run(['taskkill', '/f', '/pid', pid_str],
                                    capture_output=True, timeout=5)
            # Count (and log) only kills that taskkill reports as successful.
            if result.returncode == 0:
                killed += 1
                wlog(f"Killed old bot (PID {pid_str})")
        except tool_errors:
            continue

    if killed > 0:
        time.sleep(3)  # wait for process cleanup
def start_bot() -> int:
    """Start xmpp_bot.py and return its PID. Kills old instances first.

    The bot is launched with ``PYTHON BOT_SCRIPT``, its stdout/stderr are
    discarded, and its PID is written to ``BOT_PID_FILE``.

    Raises:
        OSError: if the PID directory cannot be created, the interpreter
            cannot be launched, or the PID file cannot be written.
    """
    kill_bot()
    wlog("Starting xmpp_bot...")

    # Ensure the PID file's directory exists *before* launching, so a missing
    # directory cannot leave a freshly started bot running untracked.
    os.makedirs(os.path.dirname(BOT_PID_FILE), exist_ok=True)

    proc = subprocess.Popen(
        [PYTHON, BOT_SCRIPT],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        # CREATE_NO_WINDOW exists only on Windows; 0 means "no flags" elsewhere.
        creationflags=getattr(subprocess, "CREATE_NO_WINDOW", 0),
    )
    pid = proc.pid
    with open(BOT_PID_FILE, "w") as f:
        f.write(str(pid))
    wlog(f"xmpp_bot started (PID {pid})")
    return pid
def get_last_log_activity() -> float:
    """Return the modification time of ``xmpp_bot.log`` in ``LOG_DIR``.

    The value is a timestamp in seconds since the epoch, or ``0.0`` when the
    file does not exist or cannot be inspected.
    """
    log_file = os.path.join(LOG_DIR, "xmpp_bot.log")
    try:
        return os.path.getmtime(log_file)
    except OSError:
        return 0.0
def health_check(bot_pid: int, last_activity: float) -> tuple[bool, int, float]:
    """Check bot health, restarting the bot if its process is gone.

    Args:
        bot_pid: PID the bot is believed to be running under.
        last_activity: most recent known modification time of the bot log.

    Returns:
        ``(was_alive, pid, last_activity)`` where ``was_alive`` is False when
        the bot was found dead and had to be restarted, ``pid`` is the PID of
        the now-running bot, and ``last_activity`` is the refreshed log
        modification time.
    """
    if not is_process_alive(bot_pid):
        wlog(f"Bot PID {bot_pid} is DEAD. Restarting...")
        bot_pid = start_bot()
        time.sleep(5)  # give the new process a moment before sampling its log
        last_activity = get_last_log_activity()
        # Report the restart to the caller; returning True here made the
        # caller's "RESTARTED" status unreachable.
        return (False, bot_pid, last_activity)

    # The log's modification time is the activity signal; only move forward.
    current_activity = get_last_log_activity()
    if current_activity > last_activity:
        last_activity = current_activity

    # Alive but silent for more than 5 minutes: warn (repeats on every check
    # while the condition holds), but do not restart.
    if time.time() - last_activity > 300:
        wlog(f"WARNING: Bot PID {bot_pid} alive but no activity for 5+ min")
    return (True, bot_pid, last_activity)
def _read_saved_bot_pid() -> int:
    """Return the PID stored in BOT_PID_FILE if that process is alive, else 0."""
    try:
        with open(BOT_PID_FILE) as f:
            saved = int(f.read().strip())
    except (OSError, ValueError):
        # Missing, unreadable or malformed PID file: treat as "no bot running".
        return 0
    return saved if is_process_alive(saved) else 0


def main():
    """Run the watchdog: adopt or start the bot, then monitor it forever.

    Every CHECK_INTERVAL seconds the bot is health-checked (and restarted if
    dead). Every 30 checks the bot and bridge logs are rotated, and every
    5 minutes a status line is logged.
    """
    wlog("Watchdog started")

    # Start bot if not already running
    bot_pid = _read_saved_bot_pid()
    if bot_pid == 0:
        bot_pid = start_bot()
    last_activity = get_last_log_activity()
    wlog(f"Initial: bot PID {bot_pid}, log last activity: {time.ctime(last_activity)}")

    checks_since_rotation = 0
    # Elapsed-time bookkeeping for the status report. A monotonic clock is
    # used so the report can neither be skipped nor duplicated when the loop
    # period drifts from CHECK_INTERVAL or the wall clock is adjusted.
    status_interval = 300
    last_status = time.monotonic()
    restarted_since_status = False

    # Main monitoring loop
    while True:
        time.sleep(CHECK_INTERVAL)
        previous_pid = bot_pid
        alive, bot_pid, last_activity = health_check(bot_pid, last_activity)
        # A restart shows up as a "not alive" result or as a changed PID.
        if not alive or bot_pid != previous_pid:
            restarted_since_status = True

        # Log rotation (every 30 checks ≈ 15 min)
        checks_since_rotation += 1
        if checks_since_rotation >= 30:
            checks_since_rotation = 0
            rotate_log(os.path.join(LOG_DIR, "xmpp_bot.log"))
            rotate_log(os.path.join(LOG_DIR, "bridge.log"))

        # Every 5 minutes, report status
        if time.monotonic() - last_status >= status_interval:
            alive_str = "RESTARTED" if restarted_since_status else "ALIVE"
            wlog(f"Status: bot PID {bot_pid} [{alive_str}]")
            last_status = time.monotonic()
            restarted_since_status = False


if __name__ == "__main__":
    main()