1b2b935832
- Platform-based architecture (Windows/Linux/Mac) - Agent instance registry (agents.yaml) - Management dashboard with cross-platform monitoring - xmpp_bot with HTTP bridge + health endpoints - wechat_agent with WeChat-Hermes bridging - Platform services: ProcessGuardian, HealthProbe, APIRouter, ChannelBridge - Deployment: systemd (Linux) + PowerShell (Windows) - Monitoring: SSH+ejabberdctl for cross-platform presence
174 lines
5.8 KiB
Python
174 lines
5.8 KiB
Python
"""
|
|
xmpp_watchdog.py — monitors xmpp_bot, auto-restarts on crash, reports status.
|
|
Runs alongside xmpp_bot.py as a separate process.
|
|
"""
|
|
import csv
import json
import os
import subprocess
import sys
import threading
import time
|
|
|
|
# Directory containing this script; the bot script is expected next to it.
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
BOT_SCRIPT = os.path.join(PROJECT_ROOT, "xmpp_bot.py")

# Logs and PID files live in sibling directories of PROJECT_ROOT.
LOG_DIR = os.path.join(os.path.dirname(PROJECT_ROOT), "logs")
WATCHDOG_LOG = os.path.join(LOG_DIR, "watchdog.log")
PID_FILE = os.path.join(os.path.dirname(PROJECT_ROOT), "temp", ".xmpp_watchdog.pid")
BOT_PID_FILE = os.path.join(os.path.dirname(PROJECT_ROOT), "temp", ".xmpp_bot.pid")

# Interpreter used to launch the bot. The pinned installation is used when it
# exists; otherwise fall back to the interpreter running this watchdog so the
# script still works on machines without that exact install path.
PYTHON = r"C:\Users\hmo\AppData\Local\Programs\Python\Python310\python.exe"
if not os.path.isfile(PYTHON):
    PYTHON = sys.executable

CHECK_INTERVAL = 30  # seconds between health checks

# The log directory must exist before the first wlog() call.
os.makedirs(LOG_DIR, exist_ok=True)
|
|
|
|
def wlog(msg: str):
    """Record a watchdog message in the log file and echo it to stdout.

    The file entry is prefixed with a local ``YYYY-MM-DD HH:MM:SS`` timestamp
    and appended to ``WATCHDOG_LOG``; the console copy carries no timestamp
    and is flushed immediately.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    entry = f"{stamp} [watchdog] {msg}\n"
    with open(WATCHDOG_LOG, "a", encoding="utf-8") as log_file:
        log_file.write(entry)
    print(f"[watchdog] {msg}", flush=True)
|
|
|
|
|
|
def rotate_log(path: str, max_bytes: int = 5 * 1024 * 1024):
    """Rotate *path* once it exceeds *max_bytes*, keeping two backups.

    Rotation shifts ``path.1`` to ``path.2`` (overwriting any previous
    ``path.2``) and then moves ``path`` to ``path.1``. Nothing is done when
    the file is missing, unreadable, or not larger than *max_bytes*.

    Args:
        path: Log file to rotate.
        max_bytes: Size threshold in bytes; rotation happens only above it.

    A failed rotation (for example a file that cannot be renamed) is reported
    through ``wlog`` instead of being silently discarded.
    """
    try:
        if os.path.getsize(path) <= max_bytes:
            return
    except OSError:
        # Missing or unreadable log file: nothing to rotate.
        return

    newest = f"{path}.1"
    oldest = f"{path}.2"
    try:
        # os.replace overwrites the destination, so no separate remove step
        # is needed before shifting the backups.
        if os.path.exists(newest):
            os.replace(newest, oldest)
        os.replace(path, newest)
    except OSError as exc:
        wlog(f"Rotate failed for {os.path.basename(path)}: {exc}")
        return

    wlog(f"Rotated: {os.path.basename(path)}")
|
|
|
|
|
|
def is_process_alive(pid: int) -> bool:
    """Return True when a process with the given PID is currently running.

    On Windows the check runs ``tasklist`` filtered by PID and looks for the
    PID in its output. On other platforms it reaps the process if it is an
    exited child of this process, then probes it with signal 0.

    Args:
        pid: Process ID to check. Non-positive values are never alive.

    Returns:
        True if the process appears to exist, False otherwise (including
        when the check itself cannot be performed).
    """
    if pid <= 0:
        return False

    if os.name != "nt":
        # If the PID is an exited child of this process it lingers as a
        # zombie that still accepts signal 0; reap it so it counts as dead.
        # NOTE: this consumes the exit status a Popen object for the same
        # child would otherwise collect.
        try:
            reaped_pid, _ = os.waitpid(pid, os.WNOHANG)
        except (OSError, OverflowError):
            pass  # not a child of this process (or not a valid PID)
        else:
            if reaped_pid == pid:
                return False
        try:
            os.kill(pid, 0)  # signal 0: existence/permission probe only
        except PermissionError:
            return True  # exists, but owned by another user
        except (OSError, OverflowError):
            return False
        return True

    try:
        proc = subprocess.run(
            ['tasklist', '/FI', f'PID eq {pid}', '/NH'],
            capture_output=True, text=True, errors='replace', timeout=5,
        )
    except (OSError, subprocess.SubprocessError):
        # tasklist missing or timed out: treat as "not alive", as before.
        return False
    return str(pid) in proc.stdout
|
|
|
|
|
|
def _process_command_line(pid_str: str) -> str:
    """Return the command-line text reported for process *pid_str*, or ""."""
    try:
        wmi = subprocess.run(
            ['wmic', 'process', 'where', f'ProcessId={pid_str}',
             'get', 'CommandLine', '/format:list'],
            capture_output=True, text=True, errors='replace', timeout=5,
        )
        return wmi.stdout
    except FileNotFoundError:
        # wmic is deprecated and absent on recent Windows builds; fall
        # through to the PowerShell/CIM query below.
        pass
    except (OSError, subprocess.SubprocessError):
        return ""

    try:
        ps = subprocess.run(
            ['powershell', '-NoProfile', '-NonInteractive', '-Command',
             f"(Get-CimInstance Win32_Process -Filter 'ProcessId={pid_str}').CommandLine"],
            capture_output=True, text=True, errors='replace', timeout=15,
        )
        return ps.stdout
    except (OSError, subprocess.SubprocessError):
        return ""


def kill_bot():
    """Kill ALL existing xmpp_bot.py processes before starting a new one.

    Lists every ``python.exe`` process via ``tasklist``, inspects each
    command line, and force-kills those that mention ``xmpp_bot`` but not
    ``watchdog``. The watchdog's own process is never targeted. If at least
    one process was killed, waits 3 seconds for cleanup.

    The whole operation is best-effort: when the Windows tools are missing
    or time out, nothing is killed and no exception is raised.
    """
    try:
        listing = subprocess.run(
            ['tasklist', '/FO', 'CSV', '/NH', '/FI', 'IMAGENAME eq python.exe'],
            capture_output=True, text=True, errors='replace', timeout=10,
        )
        rows = list(csv.reader(listing.stdout.splitlines()))
    except (OSError, subprocess.SubprocessError, csv.Error):
        return

    own_pid = str(os.getpid())
    killed = 0
    for row in rows:
        # Informational lines (e.g. "INFO: No tasks ...") have one column.
        if len(row) < 2 or row[0].lower() != 'python.exe':
            continue
        pid_str = row[1].strip()
        if not pid_str.isdigit() or pid_str == own_pid:
            continue

        cmdline = _process_command_line(pid_str)
        if 'xmpp_bot' not in cmdline or 'watchdog' in cmdline:
            continue

        try:
            result = subprocess.run(['taskkill', '/f', '/pid', pid_str],
                                    capture_output=True, timeout=5)
        except (OSError, subprocess.SubprocessError):
            continue
        if result.returncode == 0:
            killed += 1
            wlog(f"Killed old bot (PID {pid_str})")
        else:
            wlog(f"Failed to kill old bot (PID {pid_str})")

    if killed > 0:
        time.sleep(3)  # wait for process cleanup
|
|
|
|
def start_bot() -> int:
    """Start xmpp_bot.py and return its PID. Kills old instances first.

    The bot is launched with ``PYTHON`` on ``BOT_SCRIPT``, with stdout and
    stderr discarded and (on Windows) without a console window. The new PID
    is written to ``BOT_PID_FILE``.

    Returns:
        PID of the newly started bot process.

    Raises:
        OSError: If the bot process itself cannot be launched.
    """
    kill_bot()
    wlog("Starting xmpp_bot...")
    proc = subprocess.Popen(
        [PYTHON, BOT_SCRIPT],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        # CREATE_NO_WINDOW is only defined on Windows; 0 means "no flags".
        creationflags=getattr(subprocess, "CREATE_NO_WINDOW", 0),
    )
    pid = proc.pid

    # The bot is already running at this point, so a PID-file problem must
    # not abort the watchdog; report it and carry on.
    try:
        os.makedirs(os.path.dirname(BOT_PID_FILE), exist_ok=True)
        with open(BOT_PID_FILE, "w", encoding="utf-8") as f:
            f.write(str(pid))
    except OSError as exc:
        wlog(f"WARNING: could not write bot PID file: {exc}")

    wlog(f"xmpp_bot started (PID {pid})")
    return pid
|
|
|
|
|
|
def get_last_log_activity() -> float:
    """Get timestamp of last xmpp_bot.log modification.

    Returns:
        The modification time of ``xmpp_bot.log`` in ``LOG_DIR`` as seconds
        since the epoch, or ``0.0`` when the file is missing or unreadable.
    """
    log_file = os.path.join(LOG_DIR, "xmpp_bot.log")
    try:
        return os.path.getmtime(log_file)
    except OSError:
        return 0.0
|
|
|
|
|
|
def health_check(bot_pid: int, last_activity: float) -> tuple[bool, int, float]:
    """Check bot health and restart the bot if its process is gone.

    Args:
        bot_pid: PID of the bot process being monitored.
        last_activity: Most recent known modification time of the bot log.

    Returns:
        ``(was_alive, pid, last_activity)``:

        * ``was_alive`` is True when the bot was found running, and False
          when it was found dead and has just been restarted.
        * ``pid`` is the PID to monitor from now on (new PID after restart).
        * ``last_activity`` is the updated log modification time.
    """
    if not is_process_alive(bot_pid):
        wlog(f"Bot PID {bot_pid} is DEAD. Restarting...")
        bot_pid = start_bot()
        # Presumably gives the new bot time to write its first log lines
        # before the log timestamp is sampled.
        time.sleep(5)
        # Report False so the caller can distinguish "restarted" from "alive".
        return (False, bot_pid, get_last_log_activity())

    # Advance the activity marker only if the log was modified since last time.
    current_activity = get_last_log_activity()
    if current_activity > last_activity:
        last_activity = current_activity

    # If no activity for 5 minutes but bot is alive, warn (no restart).
    if time.time() - last_activity > 300:
        wlog(f"WARNING: Bot PID {bot_pid} alive but no activity for 5+ min")

    return (True, bot_pid, last_activity)
|
|
|
|
|
|
def _read_saved_bot_pid() -> int:
    """Return the PID stored in BOT_PID_FILE if that process is alive, else 0."""
    try:
        with open(BOT_PID_FILE, encoding="utf-8") as f:
            saved_pid = int(f.read().strip())
    except (OSError, ValueError):
        # Missing, unreadable, or corrupt PID file: no known bot.
        return 0
    return saved_pid if is_process_alive(saved_pid) else 0


def main():
    """Run the watchdog: ensure the bot is running, then monitor it forever.

    Every ``CHECK_INTERVAL`` seconds the bot is health-checked (and restarted
    if dead). Every 30 checks the bot and bridge logs are rotated, and about
    every 5 minutes a status line is logged.
    """
    wlog("Watchdog started")

    # Start bot if not already running
    bot_pid = _read_saved_bot_pid()
    if bot_pid == 0:
        bot_pid = start_bot()

    last_activity = get_last_log_activity()
    wlog(f"Initial: bot PID {bot_pid}, log last activity: {time.ctime(last_activity)}")

    # Counting checks instead of testing the wall clock guarantees exactly one
    # status report per window, even when an iteration takes longer than
    # CHECK_INTERVAL (e.g. during a restart).
    status_every = max(1, 300 // CHECK_INTERVAL)
    log_rotate_counter = 0
    status_counter = 0

    # Main monitoring loop
    while True:
        time.sleep(CHECK_INTERVAL)
        try:
            alive, bot_pid, last_activity = health_check(bot_pid, last_activity)
        except Exception as exc:
            # A watchdog must outlive transient failures (e.g. the bot failing
            # to launch); log and retry on the next cycle.
            wlog(f"ERROR: health check failed: {exc!r}")
            continue

        # Log rotation (every 30 checks ≈ 15 min)
        log_rotate_counter += 1
        if log_rotate_counter >= 30:
            log_rotate_counter = 0
            rotate_log(os.path.join(LOG_DIR, "xmpp_bot.log"))
            rotate_log(os.path.join(LOG_DIR, "bridge.log"))

        # Every 5 minutes, report status
        status_counter += 1
        if status_counter >= status_every:
            status_counter = 0
            alive_str = "ALIVE" if alive else "RESTARTED"
            wlog(f"Status: bot PID {bot_pid} [{alive_str}]")


if __name__ == "__main__":
    main()