"""
xxm health check. Runs every 5 min via Task Scheduler.

Checks:
  1. Is the xmpp_bot process alive? If not -> restart.
  2. Is it receiving messages? If alive but no msgs for 10+ min -> restart.
  3. Possible stuck loop? (too many tool calls) -> warning only.
"""
import collections
import json
import os
import re
import subprocess
import sys
import time
import urllib.request
from typing import Optional

PROJECT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
LOG_FILE = os.path.join(PROJECT, "logs", "health_check.log")
BOT_LOG = os.path.join(PROJECT, "logs", "xmpp_bot.log")
PYTHON = r"C:\Users\hmo\AppData\Local\Programs\Python\Python310\python.exe"
BOT_SCRIPT = os.path.join(PROJECT, "scripts", "xmpp_bot.py")

HEALTH_URL = "http://127.0.0.1:5802/health"
STALL_MINUTES = 10        # silence (in minutes) after which the bot is restarted
TOOL_CALL_WARN = 25       # "run_command" lines in the log tail that trigger a warning

# Failures expected from spawning/inspecting external processes: missing
# executable (OSError), timeout (SubprocessError), unparsable output (ValueError).
_PROC_ERRORS = (OSError, subprocess.SubprocessError, ValueError)

# Bot log lines are expected to start with an HH:MM:SS timestamp.
_TS_RE = re.compile(r'^(\d{2}):(\d{2}):(\d{2})')


def wlog(msg: str) -> None:
    """Append a timestamped line to the health-check log file."""
    ts = time.strftime("%Y-%m-%d %H:%M:%S")
    # Ensure the log directory exists right before writing, so importing this
    # module has no filesystem side effects.
    os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(f"{ts} [health] {msg}\n")


def _python_pids() -> list[str]:
    """Return the PIDs (as strings) of all python.exe processes from tasklist."""
    try:
        r = subprocess.run(['tasklist', '/FO', 'CSV', '/NH'],
                           capture_output=True, text=True, timeout=10)
    except _PROC_ERRORS:
        return []
    pids = []
    for line in r.stdout.splitlines():
        parts = line.strip('"').split('","')
        if len(parts) >= 2 and parts[0] == 'python.exe':
            pids.append(parts[1].strip())
    return pids


def _command_line(pid_str: str) -> str:
    """Return wmic's CommandLine output for a PID, or "" if the query fails."""
    try:
        wmi = subprocess.run(
            ['wmic', 'process', 'where', f'ProcessId={pid_str}',
             'get', 'CommandLine', '/format:list'],
            capture_output=True, text=True, timeout=5)
    except _PROC_ERRORS:
        return ""
    return wmi.stdout


def find_bot_pid() -> int:
    """Return the PID of the first running xmpp_bot process, or 0 if none.

    Processes whose command line mentions "watchdog" or "health" are ignored
    so helper scripts are not mistaken for the bot.
    """
    for pid_str in _python_pids():
        cmdline = _command_line(pid_str)
        if ('xmpp_bot' in cmdline and 'watchdog' not in cmdline
                and 'health' not in cmdline):
            try:
                return int(pid_str)
            except ValueError:
                continue
    return 0


def kill_all_bots():
    """Kill all xmpp_bot.py processes."""
    for pid_str in _python_pids():
        cmdline = _command_line(pid_str)
        if 'xmpp_bot' not in cmdline or 'watchdog' in cmdline:
            continue
        try:
            result = subprocess.run(['taskkill', '/f', '/pid', pid_str],
                                    capture_output=True, timeout=5)
        except _PROC_ERRORS:
            continue  # best effort: move on to the next candidate
        # Only claim success when taskkill actually reported it.
        if result.returncode == 0:
            wlog(f"Killed old bot PID {pid_str}")
        else:
            wlog(f"WARN: taskkill failed for PID {pid_str} (exit {result.returncode})")


def start_bot():
    """Kill any existing bot processes, wait briefly, then launch a new bot."""
    kill_all_bots()
    time.sleep(3)
    subprocess.Popen([PYTHON, BOT_SCRIPT],
                     stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                     creationflags=subprocess.CREATE_NO_WINDOW)


def parse_log_tail(path: str, n: int = 100) -> list[str]:
    """Return the last *n* lines of *path* without trailing newlines.

    Returns an empty list if the file does not exist, cannot be read,
    or *n* is not positive.
    """
    if n <= 0 or not os.path.exists(path):
        return []
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            # A bounded deque keeps only the tail instead of the whole file.
            tail = collections.deque(f, maxlen=n)
    except OSError:
        return []
    return [line.rstrip("\n\r") for line in tail]


def get_last_msg_time(lines: list[str]) -> float:
    """Get approximate time of last received group message from log lines.

    Scans from newest to oldest for a line containing both "[Group]" and
    "batched" that starts with an HH:MM:SS timestamp, and returns it as an
    epoch time on the current local date. Returns 0.0 if no such line exists.
    """
    now = time.time()
    today = time.localtime(now)
    for line in reversed(lines):
        if '[Group]' not in line or 'batched' not in line:
            continue
        m = _TS_RE.match(line)
        if not m:
            continue
        h, mi, s = (int(g) for g in m.groups())
        log_time = time.mktime(
            (today.tm_year, today.tm_mon, today.tm_mday, h, mi, s,
             today.tm_wday, today.tm_yday, today.tm_isdst))
        if log_time > now:
            # The log carries no date: a time "in the future" means the line
            # was written before midnight, i.e. yesterday.
            log_time -= 86400
        return log_time
    return 0.0


def _xmpp_connected() -> Optional[bool]:
    """Query the bot's /health endpoint.

    Returns the truthiness of its "xmpp_connected" field (True if the field
    is absent), or None if the endpoint could not be queried or parsed.
    """
    try:
        with urllib.request.urlopen(HEALTH_URL, timeout=5) as resp:
            status = json.loads(resp.read())
    except Exception as e:  # boundary: any fetch/parse failure is only a warning
        wlog(f"WARN: /health check failed: {e}")
        return None
    if not isinstance(status, dict):
        wlog("WARN: /health check failed: unexpected payload")
        return None
    return bool(status.get("xmpp_connected", True))


def main() -> None:
    """Run one health-check pass: restart the bot if dead, disconnected or stalled."""
    wlog("=== health check ===")

    pid = find_bot_pid()
    if pid == 0:
        wlog("CRIT: bot not running, restarting...")
        start_bot()
        time.sleep(5)
        pid = find_bot_pid()
        if pid:
            wlog(f"OK: restarted (PID {pid})")
        else:
            wlog("FAIL: restart failed")
        wlog("=== end ===")
        return

    # ── 0. Quick HTTP health check ──────────────────────────
    # If /health reports xmpp_connected=false, restart immediately.
    # This catches "process alive but XMPP dead".
    # An unreachable endpoint is only logged; the log analysis below still runs.
    if _xmpp_connected() is False:
        wlog("CRIT: xmpp_connected=false via /health — forcing restart")
        start_bot()  # start_bot() already kills old instances and waits
        wlog("=== end (restarted: XMPP disconnected) ===")
        return

    # Activity analysis over the tail of the bot log.
    recent = parse_log_tail(BOT_LOG, 100)
    alive = sum(1 for line in recent if "alive" in line)
    responses = sum(1 for line in recent
                    if line.startswith("-> ") and "silent" not in line)
    silent = sum(1 for line in recent if "silent" in line)
    tool_calls = sum(1 for line in recent if "run_command" in line)
    group_msgs = sum(1 for line in recent if "[Group]" in line)

    last_msg = get_last_msg_time(recent)
    last_msg_age = (time.time() - last_msg) / 60 if last_msg else 999

    wlog(f"PID={pid} alive={alive} grp={group_msgs} rsp={responses} sl={silent} tools={tool_calls} lastMsg={last_msg_age:.0f}min")

    # CRITICAL: bot is alive but receiving no messages → disconnect detected.
    # Skip the check if the bot was just started (has "online" in recent logs).
    # NOTE: this must not also require group_msgs == 0. last_msg is only
    # non-zero when a "[Group]" line is present in `recent`, so that extra
    # condition would make this restart unreachable.
    recent_start = sum(1 for line in recent if "online" in line)
    if (alive >= 3 and last_msg > 0 and last_msg_age > STALL_MINUTES
            and recent_start == 0):
        wlog(f"CRIT: bot alive but NO messages for {last_msg_age:.0f} min. Forcing restart.")
        start_bot()  # start_bot() already kills old instances and waits
        wlog("=== end (restarted) ===")
        return

    # No heartbeat and no group traffic in the log tail: warn only.
    if alive == 0 and group_msgs == 0:
        wlog("WARN: no activity in last windows")

    if tool_calls >= TOOL_CALL_WARN:
        wlog(f"WARN: heavy tool calls ({tool_calls}), possible loop")

    wlog("=== end ===")


if __name__ == "__main__":
    main()