fix: health_check now queries /health endpoint before log analysis
If the /health response reports xmpp_connected=false, force a restart immediately instead of waiting 10 minutes for the message timeout. This catches the most common failure mode: the process is alive but the XMPP connection is dead.
This commit is contained in:
@@ -112,6 +112,25 @@ def main():
|
||||
wlog("=== end ===")
|
||||
return
|
||||
|
||||
# ── 0. Quick HTTP health check ──────────────────────────
|
||||
# Check /health endpoint: if xmpp_connected=false, restart immediately.
|
||||
# This catches "process alive but XMPP dead" — the most common failure mode.
|
||||
try:
|
||||
import urllib.request, json
|
||||
req = urllib.request.Request("http://127.0.0.1:5802/health")
|
||||
resp = json.loads(urllib.request.urlopen(req, timeout=5).read())
|
||||
if not resp.get("xmpp_connected", True):
|
||||
wlog("CRIT: xmpp_connected=false via /health — forcing restart")
|
||||
kill_all_bots()
|
||||
time.sleep(3)
|
||||
start_bot()
|
||||
wlog("=== end (restarted: XMPP disconnected) ===")
|
||||
return
|
||||
except Exception as e:
|
||||
wlog(f"WARN: /health check failed: {e}")
|
||||
# If /health is unreachable, the bot process might be dead or stuck
|
||||
pass
|
||||
|
||||
# Activity analysis
|
||||
recent = parse_log_tail(BOT_LOG, 100)
|
||||
alive = sum(1 for l in recent if "alive" in l)
|
||||
|
||||
Reference in New Issue
Block a user