Initial: multi-agent XMPP communication system with dashboard
- Platform-based architecture (Windows/Linux/Mac)
- Agent instance registry (agents.yaml)
- Management dashboard with cross-platform monitoring
- xmpp_bot with HTTP bridge + health endpoints
- wechat_agent with WeChat-Hermes bridging
- Platform services: ProcessGuardian, HealthProbe, APIRouter, ChannelBridge
- Deployment: systemd (Linux) + PowerShell (Windows)
- Monitoring: SSH + ejabberdctl for cross-platform presence
This commit is contained in:
@@ -0,0 +1,174 @@
|
||||
"""
|
||||
xmpp_watchdog.py — monitors xmpp_bot, auto-restarts on crash, reports status.
|
||||
Runs alongside xmpp_bot.py as a separate process.
|
||||
"""
|
||||
import os, sys, time, subprocess, json, threading
|
||||
|
||||
# Directory containing this script; the bot script is expected to sit next to it.
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
BOT_SCRIPT = os.path.join(PROJECT_ROOT, "xmpp_bot.py")

# Runtime directories ("logs" and "temp") live one level above PROJECT_ROOT.
LOG_DIR = os.path.join(os.path.dirname(PROJECT_ROOT), "logs")
WATCHDOG_LOG = os.path.join(LOG_DIR, "watchdog.log")
# NOTE(review): PID_FILE is defined but not written anywhere in the code
# visible here — confirm whether another component relies on it.
PID_FILE = os.path.join(os.path.dirname(PROJECT_ROOT), "temp", ".xmpp_watchdog.pid")
BOT_PID_FILE = os.path.join(os.path.dirname(PROJECT_ROOT), "temp", ".xmpp_bot.pid")

# NOTE(review): machine-specific interpreter path; it only resolves on the
# original author's Windows account. Consider sys.executable or a config value.
PYTHON = r"C:\Users\hmo\AppData\Local\Programs\Python\Python310\python.exe"
CHECK_INTERVAL = 30  # seconds between health checks

# Create both runtime directories up front so the first log line and the first
# PID-file write cannot fail just because a directory is missing.
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(os.path.dirname(BOT_PID_FILE), exist_ok=True)
|
||||
|
||||
def wlog(msg: str):
    """Append a timestamped entry to WATCHDOG_LOG and echo it to stdout.

    Args:
        msg: Text of the log entry (without timestamp or prefix).
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    entry = f"{stamp} [watchdog] {msg}\n"
    with open(WATCHDOG_LOG, "a", encoding="utf-8") as log_file:
        log_file.write(entry)
    # The console copy carries no timestamp; flush so it shows up immediately
    # even when stdout is redirected.
    print(f"[watchdog] {msg}", flush=True)
|
||||
|
||||
|
||||
def rotate_log(path: str, max_bytes: int = 5 * 1024 * 1024, backups: int = 2):
    """Rotate *path* once it has grown beyond *max_bytes*.

    The live file is renamed to ``<path>.1``, existing backups shift up by one
    and the oldest one (``<path>.<backups>``) is deleted, so at most *backups*
    rotated files are kept (two by default: ``.1`` and ``.2``).

    Rotation is best-effort: a log file that does not exist is ignored
    silently, and any other filesystem error is reported through ``wlog``
    instead of being raised.

    Args:
        path: Log file to rotate.
        max_bytes: Size threshold in bytes; files at or below it are left alone.
        backups: Number of rotated files to keep (values below 1 are treated as 1).
    """
    backups = max(1, backups)

    try:
        if os.path.getsize(path) <= max_bytes:
            return
    except FileNotFoundError:
        # A log that has not been created yet is normal; nothing to rotate.
        return
    except OSError as exc:
        wlog(f"Rotate skipped for {os.path.basename(path)}: {exc}")
        return

    try:
        # Drop the oldest backup first: os.rename() refuses to overwrite an
        # existing target on Windows.
        oldest = f"{path}.{backups}"
        if os.path.exists(oldest):
            os.remove(oldest)
        # Shift .N-1 -> .N, ..., .1 -> .2 (highest index first).
        for index in range(backups - 1, 0, -1):
            older = f"{path}.{index}"
            if os.path.exists(older):
                os.rename(older, f"{path}.{index + 1}")
        os.rename(path, f"{path}.1")
    except OSError as exc:
        wlog(f"Rotate failed for {os.path.basename(path)}: {exc}")
        return

    wlog(f"Rotated: {os.path.basename(path)}")
|
||||
|
||||
|
||||
def is_process_alive(pid: int) -> bool:
    """Return True if a process with the given PID currently exists.

    On Windows the check shells out to ``tasklist`` and looks for the PID as a
    whole whitespace-separated token in its output. On other platforms it
    first reaps the PID if it is an exited child of this process (otherwise a
    zombie would still look alive) and then probes it with signal 0.

    Args:
        pid: Process ID to probe. Non-positive values are never considered
            alive (PID 0 is the idle process on Windows, and non-positive PIDs
            address process groups in ``os.kill`` on POSIX).

    Returns:
        True when the process exists, False when it does not or when the
        check itself fails.
    """
    if pid <= 0:
        return False

    if os.name != "nt":
        try:
            reaped, _status = os.waitpid(pid, os.WNOHANG)
            if reaped == pid:
                # It was our child and it has exited.
                return False
        except OSError:
            # Not a child of this process (or already reaped); probe below.
            pass
        try:
            # Signal 0 performs the existence/permission check without
            # delivering a signal. Never do this on Windows, where os.kill()
            # with an arbitrary signal number terminates the target.
            os.kill(pid, 0)
        except PermissionError:
            return True  # exists, but belongs to another user
        except (OSError, OverflowError):
            return False
        return True

    try:
        proc = subprocess.run(
            ['tasklist', '/FI', f'PID eq {pid}', '/NH'],
            capture_output=True, text=True, errors="replace", timeout=5
        )
    except (OSError, subprocess.SubprocessError):
        return False
    # Compare whole tokens so PID 123 cannot match "1234" or a memory figure.
    return str(pid) in proc.stdout.split()
|
||||
|
||||
|
||||
def kill_bot():
    """Kill ALL existing xmpp_bot.py processes before starting a new one.

    Windows-only: lists ``python.exe`` processes with ``tasklist``, reads each
    command line with ``wmic`` and force-kills (``taskkill /f``) those whose
    command line mentions ``xmpp_bot`` but not ``watchdog``. The watchdog's own
    PID is always skipped.

    Best-effort: failures to list, inspect or kill a process are logged via
    ``wlog`` and never raised. If at least one process was killed, the
    function sleeps 3 seconds to let the OS finish cleanup.

    NOTE(review): ``wmic`` is deprecated on recent Windows releases — confirm
    it is available on the target hosts.
    """
    killed = 0
    try:
        listing = subprocess.run(
            ['tasklist', '/FO', 'CSV', '/NH', '/FI', 'IMAGENAME eq python.exe'],
            capture_output=True, text=True, errors="replace", timeout=10
        )
    except (OSError, subprocess.SubprocessError) as exc:
        wlog(f"Could not list python processes: {exc}")
        return

    own_pid = str(os.getpid())
    for line in listing.stdout.splitlines():
        # CSV row looks like "python.exe","1234","Console","1","12,345 K".
        parts = line.strip().strip('"').split('","')
        if len(parts) < 2 or parts[0].lower() != 'python.exe':
            continue
        pid_str = parts[1].strip()
        if pid_str == own_pid:
            continue  # never kill the watchdog itself

        try:
            wmi = subprocess.run(
                ['wmic', 'process', 'where', f'ProcessId={pid_str}',
                 'get', 'CommandLine', '/format:list'],
                capture_output=True, text=True, errors="replace", timeout=5
            )
            if 'xmpp_bot' not in wmi.stdout or 'watchdog' in wmi.stdout:
                continue
            result = subprocess.run(['taskkill', '/f', '/pid', pid_str],
                                    capture_output=True, timeout=5)
        except (OSError, subprocess.SubprocessError) as exc:
            wlog(f"Could not inspect/kill PID {pid_str}: {exc}")
            continue

        # Only count the kill when taskkill actually reported success.
        if result.returncode == 0:
            killed += 1
            wlog(f"Killed old bot (PID {pid_str})")
        else:
            wlog(f"taskkill failed for PID {pid_str} (exit {result.returncode})")

    if killed > 0:
        time.sleep(3)  # wait for process cleanup
|
||||
|
||||
def start_bot() -> int:
    """Start xmpp_bot.py and return its PID. Kills old instances first.

    The bot is launched detached from any console window (on Windows) with
    stdout/stderr discarded, and its PID is recorded in BOT_PID_FILE.

    Returns:
        PID of the newly started bot process.

    Raises:
        OSError: If the bot process cannot be spawned.
    """
    kill_bot()
    wlog("Starting xmpp_bot...")

    # Fall back to the interpreter running the watchdog when the configured
    # path does not exist on this machine, instead of crashing in Popen.
    interpreter = PYTHON if os.path.isfile(PYTHON) else sys.executable

    proc = subprocess.Popen(
        [interpreter, BOT_SCRIPT],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        # CREATE_NO_WINDOW only exists on Windows; 0 means "no flags" elsewhere.
        creationflags=getattr(subprocess, "CREATE_NO_WINDOW", 0),
    )
    pid = proc.pid

    # The bot is already running at this point, so a PID-file problem must not
    # take the watchdog down: log it and carry on with the in-memory PID.
    try:
        os.makedirs(os.path.dirname(BOT_PID_FILE), exist_ok=True)
        with open(BOT_PID_FILE, "w") as f:
            f.write(str(pid))
    except OSError as exc:
        wlog(f"Could not write {BOT_PID_FILE}: {exc}")

    wlog(f"xmpp_bot started (PID {pid})")
    return pid
|
||||
|
||||
|
||||
def get_last_log_activity() -> float:
    """Return the modification time of xmpp_bot.log in LOG_DIR.

    Returns:
        The file's mtime as seconds since the epoch, or 0.0 when the file is
        missing or cannot be stat'ed.
    """
    log_file = os.path.join(LOG_DIR, "xmpp_bot.log")
    try:
        return os.path.getmtime(log_file)
    except OSError:
        return 0.0
|
||||
|
||||
|
||||
def health_check(bot_pid: int, last_activity: float) -> tuple[bool, int, float]:
    """Check bot health and restart the bot if its process is gone.

    Args:
        bot_pid: PID the watchdog currently believes the bot runs under.
        last_activity: Most recent bot-log mtime seen so far (epoch seconds).

    Returns:
        ``(is_alive, pid, last_activity)`` where ``is_alive`` is False when the
        bot was found dead and had to be restarted during this check, ``pid``
        is the (possibly new) bot PID and ``last_activity`` is the updated
        log mtime.
    """
    if not is_process_alive(bot_pid):
        wlog(f"Bot PID {bot_pid} is DEAD. Restarting...")
        bot_pid = start_bot()
        time.sleep(5)  # give the new bot a moment to start writing its log
        # Report False so the caller can tell a restart happened on this check.
        return (False, bot_pid, get_last_log_activity())

    # Advance the high-water mark if the log was touched since the last check.
    current_activity = get_last_log_activity()
    if current_activity > last_activity:
        last_activity = current_activity

    # If no activity for 5 minutes but bot is alive, warn
    if time.time() - last_activity > 300:
        wlog(f"WARNING: Bot PID {bot_pid} alive but no activity for 5+ min")

    return (True, bot_pid, last_activity)
|
||||
|
||||
|
||||
def main():
    """Run the watchdog: adopt or start the bot, then monitor it forever.

    Every CHECK_INTERVAL seconds the bot is health-checked (and restarted if
    dead). Every 30 checks the bot and bridge logs are rotated, and roughly
    every 5 minutes a status line is logged. Ctrl+C stops the loop cleanly.
    """
    wlog("Watchdog started")

    # Adopt an already-running bot recorded in the PID file, if any.
    bot_pid = 0
    try:
        with open(BOT_PID_FILE) as f:
            bot_pid = int(f.read().strip())
    except (OSError, ValueError):
        # Missing, unreadable or corrupt PID file: treat as "no bot running".
        bot_pid = 0
    if bot_pid and not is_process_alive(bot_pid):
        bot_pid = 0

    if bot_pid == 0:
        bot_pid = start_bot()

    last_activity = get_last_log_activity()
    wlog(f"Initial: bot PID {bot_pid}, log last activity: {time.ctime(last_activity)}")

    log_rotate_counter = 0
    # Use elapsed monotonic time for the status report: each loop pass takes
    # longer than CHECK_INTERVAL, so a wall-clock modulo window can be missed.
    last_report = time.monotonic()
    restarted_since_report = False

    try:
        # Main monitoring loop
        while True:
            time.sleep(CHECK_INTERVAL)
            alive, bot_pid, last_activity = health_check(bot_pid, last_activity)
            if not alive:
                restarted_since_report = True

            # Log rotation (every 30 checks ≈ 15 min)
            log_rotate_counter += 1
            if log_rotate_counter >= 30:
                log_rotate_counter = 0
                rotate_log(os.path.join(LOG_DIR, "xmpp_bot.log"))
                rotate_log(os.path.join(LOG_DIR, "bridge.log"))

            # Every 5 minutes, report status
            if time.monotonic() - last_report >= 300:
                last_report = time.monotonic()
                alive_str = "RESTARTED" if restarted_since_report else "ALIVE"
                restarted_since_report = False
                wlog(f"Status: bot PID {bot_pid} [{alive_str}]")
    except KeyboardInterrupt:
        wlog("Watchdog stopped")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user