From 399c77ccbc1cd08509a256069a6946a9c95bec4a Mon Sep 17 00:00:00 2001 From: hmo Date: Sat, 13 Jun 2026 02:22:36 +0800 Subject: [PATCH] fix: health_check now queries /health endpoint before log analysis If xmpp_connected=false, force restart immediately instead of waiting 10 minutes for message timeout. This catches the most common failure mode: process alive but XMPP dead. --- gateway/scripts/health_check_xxm.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/gateway/scripts/health_check_xxm.py b/gateway/scripts/health_check_xxm.py index 7f8d272..a54fe98 100644 --- a/gateway/scripts/health_check_xxm.py +++ b/gateway/scripts/health_check_xxm.py @@ -112,6 +112,25 @@ def main(): wlog("=== end ===") return + # ── 0. Quick HTTP health check ────────────────────────── + # Check /health endpoint: if xmpp_connected=false, restart immediately. + # This catches "process alive but XMPP dead" — the most common failure mode. + try: + import urllib.request, json + req = urllib.request.Request("http://127.0.0.1:5802/health") + resp = json.loads(urllib.request.urlopen(req, timeout=5).read()) + if not resp.get("xmpp_connected", True): + wlog("CRIT: xmpp_connected=false via /health — forcing restart") + kill_all_bots() + time.sleep(3) + start_bot() + wlog("=== end (restarted: XMPP disconnected) ===") + return + except Exception as e: + wlog(f"WARN: /health check failed: {e}") + # If /health is unreachable, the bot process might be dead or stuck + pass + # Activity analysis recent = parse_log_tail(BOT_LOG, 100) alive = sum(1 for l in recent if "alive" in l)