Files
hmo 399c77ccbc fix: health_check now queries /health endpoint before log analysis
If xmpp_connected=false, force restart immediately instead of
waiting 10 minutes for message timeout. This catches the most
common failure mode: process alive but XMPP dead.
2026-06-13 02:22:36 +08:00

170 lines
6.7 KiB
Python

"""
xxm health check. Runs every 5 min via Task Scheduler.
Checks:
1. Is xmpp_bot process alive? If not → restart
2. Does the /health endpoint report xmpp_connected=false? If so → restart immediately
3. Is it receiving messages? If alive but no msgs for 10+ min → restart
4. Possible stuck loop? (too many tool calls) → logged as a warning only
"""
import collections
import os
import re
import subprocess
import sys
import time
# Project root: the parent of the directory that contains this script.
PROJECT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Log written by this health check (appended to by wlog).
LOG_FILE = os.path.join(PROJECT, "logs", "health_check.log")
# Bot log whose tail main() analyses for activity.
BOT_LOG = os.path.join(PROJECT, "logs", "xmpp_bot.log")
# Interpreter used to launch the bot. NOTE(review): hard-coded, machine-specific
# path — start_bot() will fail if this interpreter is moved or upgraded.
PYTHON = r"C:\Users\hmo\AppData\Local\Programs\Python\Python310\python.exe"
# Script that start_bot() launches with PYTHON.
BOT_SCRIPT = os.path.join(PROJECT, "scripts", "xmpp_bot.py")
# Import-time side effect: make sure the logs directory exists before the
# first wlog() call tries to append to LOG_FILE.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
def wlog(msg: str):
    """Append one timestamped ``[health]`` entry to the health-check log.

    Args:
        msg: Text to record; a local ``YYYY-MM-DD HH:MM:SS`` timestamp and the
            ``[health]`` tag are prepended and a newline is appended.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    entry = f"{stamp} [health] {msg}\n"
    with open(LOG_FILE, "a", encoding="utf-8") as log:
        log.write(entry)
def find_bot_pid() -> int:
    """Return the PID of the first running xmpp_bot process, or 0 if none is found.

    Lists ``python.exe`` processes with ``tasklist`` and inspects each one's
    command line with ``wmic``. A process counts as the bot when its command
    line mentions ``xmpp_bot`` but neither ``watchdog`` nor ``health`` (so the
    watchdog and this health check are never mistaken for the bot).

    Detection is best-effort: if a helper command cannot be launched, times
    out, or yields unparsable output, that process (or the whole listing) is
    skipped and 0 is returned when nothing matched.

    Returns:
        The bot's process id, or 0 when no bot process could be identified.
    """
    # Failures expected from launching/awaiting a helper command or parsing its
    # output. Unlike a bare ``except:``, this lets KeyboardInterrupt and
    # SystemExit propagate.
    expected_errors = (OSError, subprocess.SubprocessError, ValueError)

    try:
        listing = subprocess.run(['tasklist', '/FO', 'CSV', '/NH'],
                                 capture_output=True, text=True, timeout=10)
    except expected_errors:
        return 0

    for row in listing.stdout.splitlines():
        # CSV row looks like "image","pid",...; strip the outer quotes and
        # split on the quoted separators.
        fields = row.strip('"').split('","')
        if len(fields) < 2 or fields[0] != 'python.exe':
            continue
        pid_str = fields[1].strip()
        try:
            # NOTE(review): wmic is deprecated on recent Windows releases; if it
            # is missing, every lookup fails here and the function reports 0 —
            # confirm it is available on the target host.
            query = subprocess.run(
                ['wmic', 'process', 'where', f'ProcessId={pid_str}',
                 'get', 'CommandLine', '/format:list'],
                capture_output=True, text=True, timeout=5)
            cmdline = query.stdout
            if ('xmpp_bot' in cmdline and 'watchdog' not in cmdline
                    and 'health' not in cmdline):
                return int(pid_str)
        except expected_errors:
            # One process could not be inspected; keep scanning the others.
            continue
    return 0
def kill_all_bots():
    """Kill all xmpp_bot.py processes.

    Lists ``python.exe`` processes with ``tasklist``, inspects each command
    line with ``wmic`` and force-kills (``taskkill /f``) every process whose
    command line mentions ``xmpp_bot``. Processes whose command line mentions
    ``watchdog`` or ``health`` are spared, matching the filter used by
    ``find_bot_pid``; the current process is never killed either. Without
    these guards the health check could terminate itself when the project
    path happens to contain ``xmpp_bot``.

    The operation is best-effort: failures are logged through ``wlog`` and
    never raised to the caller.
    """
    # Failures expected from launching/awaiting a helper command or parsing its
    # output. Unlike a bare ``except:``, this lets KeyboardInterrupt and
    # SystemExit propagate.
    expected_errors = (OSError, subprocess.SubprocessError, ValueError)
    own_pid = str(os.getpid())

    try:
        listing = subprocess.run(['tasklist', '/FO', 'CSV', '/NH'],
                                 capture_output=True, text=True, timeout=10)
    except expected_errors as exc:
        wlog(f"WARN: could not list processes: {exc}")
        return

    for row in listing.stdout.splitlines():
        # CSV row looks like "image","pid",...; strip the outer quotes and
        # split on the quoted separators.
        fields = row.strip('"').split('","')
        if len(fields) < 2 or fields[0] != 'python.exe':
            continue
        pid_str = fields[1].strip()
        if pid_str == own_pid:
            continue
        try:
            query = subprocess.run(
                ['wmic', 'process', 'where', f'ProcessId={pid_str}',
                 'get', 'CommandLine', '/format:list'],
                capture_output=True, text=True, timeout=5)
            cmdline = query.stdout
            if ('xmpp_bot' not in cmdline or 'watchdog' in cmdline
                    or 'health' in cmdline):
                continue
            killed = subprocess.run(['taskkill', '/f', '/pid', pid_str],
                                    capture_output=True, timeout=5)
        except expected_errors as exc:
            wlog(f"WARN: could not inspect/kill PID {pid_str}: {exc}")
            continue
        # Only claim success when taskkill actually reported it.
        if killed.returncode == 0:
            wlog(f"Killed old bot PID {pid_str}")
        else:
            wlog(f"WARN: taskkill failed for PID {pid_str} "
                 f"(exit code {killed.returncode})")
def start_bot():
    """Replace any running bot with a freshly launched one.

    Kills existing bot processes via ``kill_all_bots``, pauses three seconds
    (presumably so the old processes can finish exiting — confirm), then
    starts ``BOT_SCRIPT`` with the ``PYTHON`` interpreter. The child runs
    without a console window and with its output discarded; it is not waited
    on.
    """
    kill_all_bots()
    time.sleep(3)
    launch_cmd = [PYTHON, BOT_SCRIPT]
    subprocess.Popen(
        launch_cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        creationflags=subprocess.CREATE_NO_WINDOW,
    )
def parse_log_tail(path: str, n: int = 100) -> list[str]:
    """Return the last *n* lines of the text file at *path*.

    The file is decoded as UTF-8 with undecodable bytes replaced, and trailing
    newline / carriage-return characters are stripped from every line. Only
    the final *n* lines are kept in memory while reading, so large logs are
    not loaded whole.

    Args:
        path: File to read.
        n: Maximum number of trailing lines to return. Values <= 0 yield an
            empty list (a plain ``[-n:]`` slice would return the whole file
            for ``n == 0``).

    Returns:
        Up to *n* lines in file order, or ``[]`` when the file is missing,
        unreadable, or *n* is not positive.
    """
    if n <= 0:
        return []
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            # A bounded deque discards older lines as newer ones arrive.
            tail = collections.deque(f, maxlen=n)
    except (OSError, ValueError):
        # Missing file, permission problem, directory, or invalid path.
        return []
    return [line.rstrip("\n\r") for line in tail]
# Leading HH:MM:SS timestamp of a log line; compiled once instead of
# re-importing ``re`` for every candidate line.
_LOG_TIME_RE = re.compile(r'^(\d{2}):(\d{2}):(\d{2})')


def get_last_msg_time(lines: list[str]) -> float:
    """Get approximate time of last received group message from log lines.

    Scans *lines* from newest to oldest for an entry that contains both
    ``[Group]`` and ``batched`` and starts with an ``HH:MM:SS`` timestamp.
    Because the log carries no date, the timestamp is assumed to belong to
    today; if that would place it in the future it is taken to be from
    yesterday instead.

    Args:
        lines: Log lines in chronological order (oldest first).

    Returns:
        Seconds since the epoch for the most recent matching entry, or
        ``0.0`` when no entry matches.
    """
    for line in reversed(lines):
        if '[Group]' not in line or 'batched' not in line:
            continue
        match = _LOG_TIME_RE.search(line)
        if match is None:
            continue
        hour, minute, second = (int(part) for part in match.groups())
        today = time.localtime()
        # isdst=-1 lets mktime decide whether DST applies at the logged time,
        # rather than assuming it matches the DST state of "now".
        stamp = time.mktime(
            (today.tm_year, today.tm_mon, today.tm_mday, hour, minute, second,
             today.tm_wday, today.tm_yday, -1))
        if stamp > time.time():  # wrap around midnight
            stamp -= 86400
        return stamp
    return 0.0
def _restart_bot() -> bool:
    """Restart the bot via start_bot(); log and return False if launching fails."""
    try:
        # start_bot() already kills old instances and waits before launching,
        # so no separate kill_all_bots()/sleep is needed at the call sites.
        start_bot()
    except (OSError, subprocess.SubprocessError) as exc:
        wlog(f"FAIL: could not launch bot: {exc}")
        return False
    return True


def _xmpp_reported_down() -> bool:
    """Return True only when /health explicitly reports xmpp_connected as falsy."""
    import json
    import urllib.request
    try:
        # ``with`` closes the HTTP response even if reading or parsing fails.
        with urllib.request.urlopen("http://127.0.0.1:5802/health",
                                    timeout=5) as resp:
            payload = json.loads(resp.read())
        # A missing key is treated as "connected".
        return not payload.get("xmpp_connected", True)
    except Exception as e:
        # If /health is unreachable or malformed the bot might be dead or
        # stuck; fall through to log analysis rather than restarting blindly.
        wlog(f"WARN: /health check failed: {e}")
        return False


def main():
    """Run one health-check pass and restart the bot when it looks unhealthy.

    Steps, in order (each restart ends the pass):
      1. No bot process found → restart and report whether a PID appeared.
      2. ``/health`` reports ``xmpp_connected`` false → restart immediately.
      3. Log analysis of the last 100 bot-log lines: if there are at least
         three ``alive`` lines, no ``online`` (fresh start) line, and the most
         recent batched group message is known and older than 10 minutes →
         restart.
      4. Otherwise only log warnings (no activity at all, or 25+ tool calls).

    The stall check in step 3 previously also required that the tail contain
    no ``[Group]`` line at all, which contradicts "last group message is
    known" (both are derived from the same tail), so it could never fire.
    """
    wlog("=== health check ===")
    pid = find_bot_pid()
    if pid == 0:
        wlog("CRIT: bot not running, restarting...")
        if _restart_bot():
            # Presumably gives the new process time to show up in tasklist.
            time.sleep(5)
            pid = find_bot_pid()
        if pid:
            wlog(f"OK: restarted (PID {pid})")
        else:
            wlog("FAIL: restart failed")
        wlog("=== end ===")
        return

    # ── 0. Quick HTTP health check ──────────────────────────
    # Catches "process alive but XMPP dead" without waiting for the
    # 10-minute message timeout below.
    if _xmpp_reported_down():
        wlog("CRIT: xmpp_connected=false via /health — forcing restart")
        _restart_bot()
        wlog("=== end (restarted: XMPP disconnected) ===")
        return

    # Activity analysis
    recent = parse_log_tail(BOT_LOG, 100)
    alive = sum(1 for line in recent if "alive" in line)
    responses = sum(1 for line in recent
                    if line.startswith("-> ") and "silent" not in line)
    silent = sum(1 for line in recent if "silent" in line)
    tool_calls = sum(1 for line in recent if "run_command" in line)
    group_msgs = sum(1 for line in recent if "[Group]" in line)
    last_msg = get_last_msg_time(recent)
    # 999 is a sentinel age meaning "no group message found in the tail".
    last_msg_age = (time.time() - last_msg) / 60 if last_msg else 999
    wlog(f"PID={pid} alive={alive} grp={group_msgs} rsp={responses} sl={silent} tools={tool_calls} lastMsg={last_msg_age:.0f}min")

    # CRITICAL: bot is alive but receiving no messages → disconnect detected
    # Skip check if bot was just started (has "online" in recent logs)
    recent_start = sum(1 for line in recent if "online" in line)
    stalled = (alive >= 3 and last_msg > 0 and last_msg_age > 10
               and recent_start == 0)
    if stalled:
        wlog(f"CRIT: bot alive but NO messages for {last_msg_age:.0f} min. Forcing restart.")
        _restart_bot()
        wlog("=== end (restarted) ===")
        return

    # Neither heartbeats nor group traffic in the tail: warn only, no restart.
    if alive == 0 and group_msgs == 0:
        wlog("WARN: no activity in last windows")
    # Many run_command entries in the tail may indicate a stuck tool loop.
    if tool_calls >= 25:
        wlog(f"WARN: heavy tool calls ({tool_calls}), possible loop")
    wlog("=== end ===")


if __name__ == "__main__":
    main()