399c77ccbc
If xmpp_connected=false, force restart immediately instead of waiting 10 minutes for message timeout. This catches the most common failure mode: process alive but XMPP dead.
170 lines
6.7 KiB
Python
170 lines
6.7 KiB
Python
"""
|
|
xxm health check. Runs every 5 min via Task Scheduler.

Checks:
  1. Is the xmpp_bot process alive? If not → restart.
  2. Does the bot's /health endpoint report xmpp_connected=false? If so → restart immediately.
  3. Is it receiving messages? If alive but no msgs for 10+ min → restart.
  4. Possible stuck loop? (too many tool calls → warning only)
|
|
"""
|
|
import os
import re
import subprocess
import sys
import time
from collections import deque
|
|
|
|
# Project root: two directory levels above this script's absolute path.
_SCRIPT_PATH = os.path.abspath(__file__)
PROJECT = os.path.dirname(os.path.dirname(_SCRIPT_PATH))

# Both log files live in <PROJECT>/logs.
_LOG_DIR = os.path.join(PROJECT, "logs")
LOG_FILE = os.path.join(_LOG_DIR, "health_check.log")  # this script's own log
BOT_LOG = os.path.join(_LOG_DIR, "xmpp_bot.log")  # the bot's log

# Interpreter and script used to launch the bot.
PYTHON = r"C:\Users\hmo\AppData\Local\Programs\Python\Python310\python.exe"
BOT_SCRIPT = os.path.join(PROJECT, "scripts", "xmpp_bot.py")

# Create the log directory up front so the first log write cannot fail on it.
os.makedirs(_LOG_DIR, exist_ok=True)
|
|
|
|
def wlog(msg: str):
    """Append one timestamped ``[health]`` line containing *msg* to LOG_FILE."""
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    entry = f"{stamp} [health] {msg}\n"
    with open(LOG_FILE, "a", encoding="utf-8") as log:
        log.write(entry)
|
|
|
|
# Failures that the process scan treats as "could not inspect, move on":
# missing executables / OS errors, subprocess timeouts, and undecodable or
# non-numeric output.
_SCAN_ERRORS = (OSError, subprocess.SubprocessError, ValueError)


def _python_processes(skip_pid: str = ""):
    """Yield ``(pid_str, command_line_output)`` for each running python.exe.

    Processes are listed with ``tasklist`` (CSV, no header) and the command
    line of each one is fetched with ``wmic``. Any failure while listing ends
    the scan; a failure while inspecting a single process skips that process.

    Args:
        skip_pid: PID (as a string) that should not be inspected or yielded.

    NOTE(review): ``wmic`` is deprecated on recent Windows releases; if it is
    absent every lookup fails and nothing is yielded.
    """
    try:
        listing = subprocess.run(['tasklist', '/FO', 'CSV', '/NH'],
                                 capture_output=True, text=True, timeout=10)
    except _SCAN_ERRORS:
        return

    for row in listing.stdout.splitlines():
        # A CSV row looks like "image","pid",... so stripping the outer
        # quotes and splitting on '","' yields the bare fields.
        fields = row.strip('"').split('","')
        if len(fields) < 2 or fields[0] != 'python.exe':
            continue
        pid_str = fields[1].strip()
        if pid_str == skip_pid:
            continue
        try:
            wmi = subprocess.run(
                ['wmic', 'process', 'where', f'ProcessId={pid_str}',
                 'get', 'CommandLine', '/format:list'],
                capture_output=True, text=True, timeout=5)
        except _SCAN_ERRORS:
            continue
        yield pid_str, wmi.stdout


def find_bot_pid() -> int:
    """Return the PID of the first running xmpp_bot process, or 0 if none.

    A python.exe process counts as the bot when its command line mentions
    ``xmpp_bot`` but neither ``watchdog`` nor ``health``.
    """
    for pid_str, cmdline in _python_processes():
        if ('xmpp_bot' in cmdline and 'watchdog' not in cmdline
                and 'health' not in cmdline):
            try:
                return int(pid_str)
            except ValueError:
                continue
    return 0


def kill_all_bots():
    """Kill all xmpp_bot.py processes.

    Every python.exe whose command line mentions ``xmpp_bot`` but not
    ``watchdog`` is force-killed with ``taskkill`` and the kill is logged.
    The current process is always skipped so this script cannot terminate
    itself when its own command line happens to contain ``xmpp_bot``.
    Failures are ignored (best effort).
    """
    own_pid = str(os.getpid())
    for pid_str, cmdline in _python_processes(skip_pid=own_pid):
        if 'xmpp_bot' not in cmdline or 'watchdog' in cmdline:
            continue
        try:
            subprocess.run(['taskkill', '/f', '/pid', pid_str],
                           capture_output=True, timeout=5)
            wlog(f"Killed old bot PID {pid_str}")
        except _SCAN_ERRORS:
            continue
|
|
|
|
def start_bot():
    """Kill any running bot instances, wait 3 seconds, then launch a new bot.

    The bot is started with the configured interpreter and script, with
    stdout/stderr discarded and without creating a console window.
    """
    kill_all_bots()
    time.sleep(3)

    launch_cmd = [PYTHON, BOT_SCRIPT]
    subprocess.Popen(
        launch_cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        creationflags=subprocess.CREATE_NO_WINDOW,
    )
|
|
|
|
def parse_log_tail(path: str, n: int = 100) -> list[str]:
    """Return the last *n* lines of a text file, without line terminators.

    Args:
        path: File to read. Decoded as UTF-8; undecodable bytes are replaced.
        n: Maximum number of trailing lines to return. Values <= 0 yield an
            empty list.

    Returns:
        Up to *n* lines in file order, or ``[]`` when the file is missing or
        cannot be read.
    """
    if n <= 0:
        return []
    try:
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            # A bounded deque keeps only the newest n lines while streaming,
            # so the whole log is never held in memory at once.
            tail = deque(f, maxlen=n)
    except (OSError, ValueError):
        return []
    return [line.rstrip("\n\r") for line in tail]
|
|
|
|
# Log lines are expected to begin with an HH:MM:SS clock time.
_LOG_CLOCK_RE = re.compile(r'^(\d{2}):(\d{2}):(\d{2})')


def get_last_msg_time(lines: list[str]) -> float:
    """Get approximate time of last received group message from log lines.

    Scans *lines* from newest to oldest for an entry containing both
    ``[Group]`` and ``batched`` that starts with an ``HH:MM:SS`` timestamp.

    Args:
        lines: Log lines in chronological order.

    Returns:
        The matching entry's time as seconds since the epoch (local time,
        assuming today's date), or ``0.0`` when no such entry exists.
    """
    for line in reversed(lines):
        if '[Group]' not in line or 'batched' not in line:
            continue
        m = _LOG_CLOCK_RE.match(line)
        if m is None:
            continue
        hour, minute, second = (int(g) for g in m.groups())
        # The log carries no date, so combine the clock time with today's date.
        now = time.localtime()
        log_time = time.mktime(
            (now.tm_year, now.tm_mon, now.tm_mday, hour, minute, second,
             now.tm_wday, now.tm_yday, now.tm_isdst))
        if log_time > time.time():
            # A time "in the future" must have been logged before midnight.
            log_time -= 86400
        return log_time
    return 0.0
|
|
|
|
# Local HTTP endpoint queried for the bot's connection status.
_HEALTH_URL = "http://127.0.0.1:5802/health"


def _xmpp_reported_down() -> bool:
    """Return True only when /health explicitly reports xmpp_connected=false.

    If the endpoint is unreachable or returns something unusable, a warning
    is logged and False is returned so the log-based checks still run.
    """
    import json
    import urllib.request

    try:
        # The context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(_HEALTH_URL, timeout=5) as resp:
            status = json.loads(resp.read())
        return not status.get("xmpp_connected", True)
    except Exception as e:  # best-effort probe: log and fall through
        wlog(f"WARN: /health check failed: {e}")
        return False


def main():
    """Run one health-check pass and restart the bot when it looks unhealthy.

    Steps, in order:
      1. No bot process found -> start it and log whether it came up.
      2. /health reports xmpp_connected=false -> restart immediately.
      3. Log tail shows heartbeats but the last group message is older than
         10 minutes (and the bot was not just started) -> restart.
      4. Otherwise only log warnings (no activity, heavy tool usage).
    """
    wlog("=== health check ===")

    pid = find_bot_pid()
    if pid == 0:
        wlog("CRIT: bot not running, restarting...")
        start_bot()
        time.sleep(5)
        pid = find_bot_pid()
        if pid:
            wlog(f"OK: restarted (PID {pid})")
        else:
            wlog("FAIL: restart failed")
        wlog("=== end ===")
        return

    # ── 0. Quick HTTP health check ──────────────────────────
    # Catches "process alive but XMPP dead" without waiting for the
    # message-timeout heuristic below.
    if _xmpp_reported_down():
        wlog("CRIT: xmpp_connected=false via /health — forcing restart")
        # start_bot() kills existing instances and waits before launching,
        # so no separate kill/sleep is needed here.
        start_bot()
        wlog("=== end (restarted: XMPP disconnected) ===")
        return

    # Activity analysis over the last 100 lines of the bot log.
    recent = parse_log_tail(BOT_LOG, 100)
    alive = sum(1 for l in recent if "alive" in l)
    responses = sum(1 for l in recent if l.startswith("-> ") and "silent" not in l)
    silent = sum(1 for l in recent if "silent" in l)
    tool_calls = sum(1 for l in recent if "run_command" in l)
    group_msgs = sum(1 for l in recent if "[Group]" in l)

    last_msg = get_last_msg_time(recent)
    # 999 is a placeholder age used when no group message is in the tail.
    last_msg_age = (time.time() - last_msg) / 60 if last_msg else 999

    wlog(f"PID={pid} alive={alive} grp={group_msgs} rsp={responses} sl={silent} tools={tool_calls} lastMsg={last_msg_age:.0f}min")

    # CRITICAL: bot is alive but receiving no messages → disconnect detected.
    # Skip the check if the bot was just started (has "online" in recent logs).
    # NOTE: this used to also require group_msgs == 0, which can never hold
    # together with last_msg > 0 (both are derived from "[Group]" lines in the
    # same tail), so the restart below was unreachable.
    recent_start = sum(1 for l in recent if "online" in l)
    if alive >= 3 and last_msg > 0 and last_msg_age > 10 and recent_start == 0:
        wlog(f"CRIT: bot alive but NO messages for {last_msg_age:.0f} min. Forcing restart.")
        # start_bot() already kills old instances and waits before launching.
        start_bot()
        wlog("=== end (restarted) ===")
        return

    # Neither heartbeats nor group traffic appear in the log tail.
    if alive == 0 and group_msgs == 0:
        wlog("WARN: no activity in last windows")

    if tool_calls >= 25:
        wlog(f"WARN: heavy tool calls ({tool_calls}), possible loop")

    wlog("=== end ===")


if __name__ == "__main__":
    main()
|