fix: health_check now queries /health endpoint before log analysis

If /health reports xmpp_connected=false, force a restart immediately instead of
waiting 10 minutes for the message timeout. This catches the most
common failure mode: process alive but XMPP dead.
This commit is contained in:
hmo
2026-06-13 02:22:36 +08:00
parent 2e708359f1
commit 399c77ccbc
+19
View File
@@ -112,6 +112,25 @@ def main():
wlog("=== end ===")
return
# ── 0. Quick HTTP health check ──────────────────────────
# Check /health endpoint: if xmpp_connected=false, restart immediately.
# This catches "process alive but XMPP dead" — the most common failure mode.
try:
import urllib.request, json
req = urllib.request.Request("http://127.0.0.1:5802/health")
resp = json.loads(urllib.request.urlopen(req, timeout=5).read())
if not resp.get("xmpp_connected", True):
wlog("CRIT: xmpp_connected=false via /health — forcing restart")
kill_all_bots()
time.sleep(3)
start_bot()
wlog("=== end (restarted: XMPP disconnected) ===")
return
except Exception as e:
wlog(f"WARN: /health check failed: {e}")
# If /health is unreachable, the bot process might be dead or stuck
pass
# Activity analysis
recent = parse_log_tail(BOT_LOG, 100)
alive = sum(1 for l in recent if "alive" in l)