fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
660 lines
24 KiB
Python
660 lines
24 KiB
Python
# This file runs a health check for the LLM, used on litellm/proxy
|
|
|
|
import asyncio
|
|
import logging
|
|
import random
|
|
import sys
|
|
import threading
|
|
import time
|
|
from collections.abc import Mapping
|
|
from typing import List, Optional
|
|
|
|
import litellm
|
|
|
|
logger = logging.getLogger(__name__)
|
|
from litellm.constants import (
|
|
BACKGROUND_HEALTH_CHECK_MAX_TOKENS,
|
|
BACKGROUND_HEALTH_CHECK_MAX_TOKENS_REASONING,
|
|
DEFAULT_HEALTH_CHECK_PROMPT,
|
|
HEALTH_CHECK_TIMEOUT_SECONDS,
|
|
)
|
|
|
|
# Keys stripped from endpoint data before it is displayed (request payloads,
# credentials, and internal bookkeeping entries).
ILLEGAL_DISPLAY_PARAMS = [
    "messages",
    "api_key",
    "prompt",
    "input",
    "vertex_credentials",
    "aws_access_key_id",
    "aws_secret_access_key",
    "exception",  # internal; not JSON-serializable, never for display
    "litellm_metadata",  # internal tracking metadata with auth objects; not for display
]

# Provider routing fields. Allowed for proxy admins so they can see which
# region/version a deployment is checking; gated at the endpoint layer for
# non-admin callers (see _strip_admin_only_fields_from_health_result).
ADMIN_ONLY_HEALTH_DISPLAY_PARAMS = ("api_base", "api_version")

# The only keys kept when a caller asks for the non-detailed view.
MINIMAL_DISPLAY_PARAMS = ["model", "mode_error"]

# Modes whose health-check probe is a chat-style completion call and
# therefore accept `max_tokens`. Other modes (embedding, image_generation,
# audio_*, rerank, video_generation, ocr, search, moderation, ...) hit
# endpoints that reject unknown fields with 400 "Unknown parameter:
# 'max_tokens'". Allow-list so new modes are safe by default.
# Per-deployment override: `model_info.health_check_supports_max_tokens`.
_MAX_TOKEN_SUPPORT_MODES: frozenset[str] = frozenset(
    ("chat", "completion", "responses")
)
|
|
|
|
|
|
def _resolve_health_check_mode(
|
|
model_info: Mapping[str, object], litellm_params: Mapping[str, object]
|
|
) -> str | None:
|
|
"""
|
|
Effective mode for a deployment's health-check probe.
|
|
|
|
Prefers operator-set `model_info.mode`; otherwise resolves it from the model
|
|
cost map, which understands `bedrock/` and cross-region inference-profile
|
|
prefixes (`us.`, `eu.`, `apac.`). Without this, non-chat Bedrock deployments
|
|
(e.g. embeddings) are probed as chat, so `max_tokens` is injected and the
|
|
request 400s on "extraneous key [max_tokens]".
|
|
"""
|
|
explicit_mode = model_info.get("mode")
|
|
if isinstance(explicit_mode, str):
|
|
return explicit_mode
|
|
model = litellm_params.get("model")
|
|
if not isinstance(model, str):
|
|
return None
|
|
try:
|
|
return litellm.get_model_info(model=model).get("mode")
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def _should_inject_health_check_max_tokens(
|
|
model_info: Mapping[str, object], mode: str | None
|
|
) -> bool:
|
|
"""
|
|
Whether the health-check probe should include `max_tokens`.
|
|
|
|
Order:
|
|
1. `model_info.health_check_supports_max_tokens` (operator override).
|
|
2. `_MAX_TOKEN_SUPPORT_MODES`. An unresolvable mode is treated as `chat`
|
|
for backward compatibility.
|
|
"""
|
|
explicit = model_info.get("health_check_supports_max_tokens")
|
|
if explicit is not None:
|
|
return bool(explicit)
|
|
return (mode or "chat") in _MAX_TOKEN_SUPPORT_MODES
|
|
|
|
|
|
# Health-check modes that forward `reasoning_effort` to the provider (chat-style calls).
|
|
_HEALTH_CHECK_MODES_SUPPORTING_REASONING_EFFORT = frozenset(
|
|
(None, "chat", "completion")
|
|
)
|
|
|
|
|
|
def _get_process_rss_mb() -> Optional[float]:
|
|
"""
|
|
Get process RSS memory in MB.
|
|
On Linux, ru_maxrss is in KB. On macOS, ru_maxrss is in bytes.
|
|
"""
|
|
try:
|
|
import resource
|
|
|
|
ru_maxrss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
|
if sys.platform == "darwin":
|
|
return float(ru_maxrss) / (1024 * 1024)
|
|
return float(ru_maxrss) / 1024
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def _rss_mb_for_log() -> str:
    """
    Render `_get_process_rss_mb()` for log lines.

    Returns the value with two decimals, or "unknown" when no measurement is
    available.
    """
    measured = _get_process_rss_mb()
    return "unknown" if measured is None else f"{measured:.2f}"
|
|
|
|
|
|
def _get_random_llm_message():
|
|
"""
|
|
Get a random message from the LLM.
|
|
"""
|
|
messages = ["Hey how's it going?", "What's 1 + 1?"]
|
|
|
|
return [{"role": "user", "content": random.choice(messages)}]
|
|
|
|
|
|
def _clean_endpoint_data(endpoint_data: dict, details: Optional[bool] = True):
|
|
"""
|
|
Clean the endpoint data for display to users.
|
|
"""
|
|
endpoint_data.pop("litellm_logging_obj", None)
|
|
return (
|
|
{k: v for k, v in endpoint_data.items() if k not in ILLEGAL_DISPLAY_PARAMS}
|
|
if details is not False
|
|
else {k: v for k, v in endpoint_data.items() if k in MINIMAL_DISPLAY_PARAMS}
|
|
)
|
|
|
|
|
|
def health_check_filter_kwargs_from_general_settings(
    general_settings: Optional[dict],
) -> dict:
    """
    Translate ``general_settings`` into keyword arguments for ``perform_health_check``.

    When ``health_check_skip_disabled_background_models`` is true, deployments with
    ``model_info.disable_background_health_check`` are omitted from health runs
    (including on-demand ``GET /health``), matching the background loop behavior.

    Args:
        general_settings: Proxy general settings; None or empty means defaults.

    Returns:
        A dict with the single key
        ``health_check_skip_disabled_background_models`` mapped to a bool.
    """
    settings = general_settings if general_settings else {}
    skip_disabled = settings.get(
        "health_check_skip_disabled_background_models", False
    )
    return {"health_check_skip_disabled_background_models": bool(skip_disabled)}
|
|
|
|
|
|
def filter_deployments_by_id(
    model_list: List,
) -> List:
    """
    Drop duplicate deployments, keeping the first one seen for each id.

    The id is read from ``deployment["model_info"]["id"]``. Deployments with no
    ``model_info`` or a missing/falsy id are omitted from the result entirely.

    Returns:
        Deployments in their original relative order, one per distinct id.
    """
    first_by_id: dict = {}
    for deployment in model_list:
        deployment_id = (deployment.get("model_info") or {}).get("id") or None
        if deployment_id is None:
            continue
        # setdefault keeps the earliest deployment registered for this id.
        first_by_id.setdefault(deployment_id, deployment)
    return list(first_by_id.values())
|
|
|
|
|
|
async def run_with_timeout(task, timeout):
    """
    Await `task`, giving up after `timeout` seconds.

    Returns:
        The awaited result, or on timeout a dict
        ``{"error": "Timeout exceeded", "exception": litellm.Timeout(...)}``.
        Exceptions other than the timeout propagate to the caller.
    """
    try:
        outcome = await asyncio.wait_for(task, timeout)
    except asyncio.TimeoutError:
        # `asyncio.wait_for()` already cancels only the awaited task on timeout.
        # Do not cancel unrelated sibling health check tasks.
        return {
            "error": "Timeout exceeded",
            "exception": litellm.Timeout(
                message="Health check timeout exceeded",
                model="",
                llm_provider="",
            ),
        }
    return outcome
|
|
|
|
|
|
async def _run_model_health_check(model: dict):
    """
    Run the health-check probe for a single deployment.

    Args:
        model: Deployment dict with a required "litellm_params" entry and an
            optional "model_info" entry.

    Returns:
        Whatever `run_with_timeout` returns for `litellm.ahealth_check`: the
        probe result, or the timeout error dict.
    """
    litellm_params = model["litellm_params"]
    # "model_info" may be absent or explicitly None; treat both as empty so the
    # lookups below do not fail (same convention as the rest of this module).
    model_info = model.get("model_info") or {}

    # Resolve the mode before the params are rewritten for the probe.
    mode = _resolve_health_check_mode(
        model_info, litellm_params  # any-ok: untyped router config dict
    )
    litellm_params = _update_litellm_params_for_health_check(model_info, litellm_params)
    # A missing or falsy per-deployment timeout falls back to the global default.
    timeout = model_info.get("health_check_timeout") or HEALTH_CHECK_TIMEOUT_SECONDS

    return await run_with_timeout(
        litellm.ahealth_check(
            litellm_params,
            mode=mode,
            prompt=DEFAULT_HEALTH_CHECK_PROMPT,
            input=["test from litellm"],
        ),
        timeout,
    )
|
|
|
|
|
|
async def _run_health_checks_with_bounded_concurrency(
|
|
models: list, concurrency_limit: int
|
|
) -> tuple[list, int]:
|
|
"""
|
|
Run health checks with at most `concurrency_limit` active tasks.
|
|
Preserves result ordering to match `models`.
|
|
"""
|
|
results: list = [None] * len(models)
|
|
tasks_to_index: dict[asyncio.Task, int] = {}
|
|
model_iter = iter(enumerate(models))
|
|
peak_in_flight = 0
|
|
|
|
def _schedule_next() -> bool:
|
|
nonlocal peak_in_flight
|
|
try:
|
|
idx, next_model = next(model_iter)
|
|
except StopIteration:
|
|
return False
|
|
task = asyncio.create_task(_run_model_health_check(next_model))
|
|
tasks_to_index[task] = idx
|
|
peak_in_flight = max(peak_in_flight, len(tasks_to_index))
|
|
return True
|
|
|
|
for _ in range(min(concurrency_limit, len(models))):
|
|
_schedule_next()
|
|
|
|
while tasks_to_index:
|
|
done, _ = await asyncio.wait(
|
|
set(tasks_to_index.keys()),
|
|
return_when=asyncio.FIRST_COMPLETED,
|
|
)
|
|
for task in done:
|
|
idx = tasks_to_index.pop(task)
|
|
try:
|
|
results[idx] = task.result()
|
|
except Exception as e:
|
|
results[idx] = e
|
|
_schedule_next()
|
|
|
|
return results, peak_in_flight
|
|
|
|
|
|
async def _perform_health_check(
|
|
model_list: list,
|
|
details: Optional[bool] = True,
|
|
max_concurrency: Optional[int] = None,
|
|
instrumentation_context: Optional[dict] = None,
|
|
):
|
|
"""
|
|
Perform a health check for each model in the list.
|
|
|
|
max_concurrency: Optional limit on concurrent health check requests.
|
|
"""
|
|
|
|
instrumentation_context = instrumentation_context or {}
|
|
instrumentation_enabled = bool(instrumentation_context.get("enabled", False))
|
|
cycle_id = instrumentation_context.get("cycle_id", "unknown")
|
|
source = instrumentation_context.get("source", "unknown")
|
|
|
|
dispatch_mode = "unbounded"
|
|
peak_in_flight = 0
|
|
if isinstance(max_concurrency, int) and max_concurrency > 0:
|
|
dispatch_mode = "bounded"
|
|
results, peak_in_flight = await _run_health_checks_with_bounded_concurrency(
|
|
model_list, max_concurrency
|
|
)
|
|
else:
|
|
tasks = [
|
|
asyncio.create_task(_run_model_health_check(model)) for model in model_list
|
|
]
|
|
peak_in_flight = len(tasks)
|
|
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
|
|
if instrumentation_enabled:
|
|
logger.debug(
|
|
"health_check_dispatch_summary source=%s cycle_id=%s mode=%s model_count=%d max_concurrency=%s peak_in_flight=%d thread_count=%d rss_mb=%s",
|
|
source,
|
|
cycle_id,
|
|
dispatch_mode,
|
|
len(model_list),
|
|
max_concurrency,
|
|
peak_in_flight,
|
|
threading.active_count(),
|
|
_rss_mb_for_log(),
|
|
)
|
|
|
|
healthy_endpoints = []
|
|
unhealthy_endpoints = []
|
|
# Exceptions keyed by model_id; returned separately so callers can use
|
|
# them for cooldown integration without risking JSON-serialization errors
|
|
# in the /health response.
|
|
exceptions_by_model_id: dict = {}
|
|
|
|
for is_healthy, model in zip(results, model_list):
|
|
litellm_params = model["litellm_params"]
|
|
_model_id = (model.get("model_info") or {}).get("id")
|
|
|
|
if isinstance(is_healthy, dict) and "error" not in is_healthy:
|
|
cleaned = _clean_endpoint_data({**litellm_params, **is_healthy}, details)
|
|
if _model_id:
|
|
cleaned["model_id"] = _model_id
|
|
healthy_endpoints.append(cleaned)
|
|
elif isinstance(is_healthy, dict):
|
|
cleaned = _clean_endpoint_data({**litellm_params, **is_healthy}, details)
|
|
if _model_id:
|
|
cleaned["model_id"] = _model_id
|
|
if "exception" in is_healthy:
|
|
exc = is_healthy["exception"]
|
|
exceptions_by_model_id[_model_id] = exc
|
|
# Store integer status code so shared-cache readers can
|
|
# reconstruct the transient-error filter without the exception object.
|
|
cleaned["exception_status"] = getattr(exc, "status_code", 500)
|
|
unhealthy_endpoints.append(cleaned)
|
|
else:
|
|
cleaned = _clean_endpoint_data(litellm_params, details)
|
|
if _model_id:
|
|
cleaned["model_id"] = _model_id
|
|
if isinstance(is_healthy, Exception):
|
|
exceptions_by_model_id[_model_id] = is_healthy
|
|
cleaned["exception_status"] = getattr(
|
|
is_healthy, "status_code", 500
|
|
)
|
|
unhealthy_endpoints.append(cleaned)
|
|
|
|
return healthy_endpoints, unhealthy_endpoints, exceptions_by_model_id
|
|
|
|
|
|
def build_deployment_health_states(
    healthy_endpoints: list,
    unhealthy_endpoints: list,
) -> dict:
    """
    Map deployment_id -> DeploymentHealthStateValue from health check results.

    Each endpoint dict is expected to carry a 'model_id' field (added by
    _perform_health_check) that maps back to the deployment's model_info.id;
    endpoints without a truthy 'model_id' are ignored. If an id appears in both
    lists, the unhealthy entry wins because it is applied last.

    Used by the background health check loop to feed health state into
    the router's DeploymentHealthCache for health-check-driven routing.

    Returns:
        Dict of model_id -> {"is_healthy", "timestamp", "reason"}; every entry
        from one call shares the same timestamp.
    """
    now = time.time()
    states: dict = {}

    groups = (
        (healthy_endpoints, True, ""),
        (unhealthy_endpoints, False, "background_health_check_failed"),
    )
    for endpoints, healthy, reason in groups:
        for endpoint in endpoints:
            model_id = endpoint.get("model_id")
            if not model_id:
                continue
            states[model_id] = {
                "is_healthy": healthy,
                "timestamp": now,
                "reason": reason,
            }

    return states
|
|
|
|
|
|
def _deployment_model_string_for_health_check(litellm_params: dict) -> str:
|
|
"""Deployment model from litellm_params (before Bedrock rewrite).
|
|
|
|
Used for reasoning vs non-reasoning max_tokens and wildcard detection only.
|
|
Does not use ``health_check_model``; that override applies later to the request.
|
|
"""
|
|
return litellm_params.get("model") or ""
|
|
|
|
|
|
def _health_check_deployment_is_wildcard(litellm_params: dict) -> bool:
|
|
return "*" in _deployment_model_string_for_health_check(litellm_params)
|
|
|
|
|
|
def _resolve_health_check_max_tokens(
|
|
model_info: dict, litellm_params: dict
|
|
) -> Optional[int]:
|
|
"""
|
|
Pick max_tokens for the health check request.
|
|
|
|
Priority:
|
|
1. model_info.health_check_max_tokens (explicit override)
|
|
2. For non-wildcard routes: health_check_max_tokens_reasoning / _non_reasoning
|
|
from model_info based on litellm.supports_reasoning(litellm_params["model"])
|
|
3. For non-wildcard reasoning routes: BACKGROUND_HEALTH_CHECK_MAX_TOKENS_REASONING
|
|
from env (if set)
|
|
4. BACKGROUND_HEALTH_CHECK_MAX_TOKENS (global, any route including wildcards)
|
|
5. Non-wildcard default: 16
|
|
6. Wildcard and nothing from (1)(4): leave unset (caller omits max_tokens)
|
|
"""
|
|
explicit = model_info.get("health_check_max_tokens", None)
|
|
if explicit is not None:
|
|
return int(explicit)
|
|
|
|
is_wildcard = _health_check_deployment_is_wildcard(litellm_params)
|
|
deployment_model = _deployment_model_string_for_health_check(litellm_params)
|
|
|
|
if not is_wildcard:
|
|
try:
|
|
is_reasoning = litellm.supports_reasoning(deployment_model)
|
|
except Exception:
|
|
is_reasoning = False
|
|
tokens_reasoning = model_info.get("health_check_max_tokens_reasoning", None)
|
|
tokens_non_reasoning = model_info.get(
|
|
"health_check_max_tokens_non_reasoning", None
|
|
)
|
|
if tokens_reasoning is not None or tokens_non_reasoning is not None:
|
|
if is_reasoning and tokens_reasoning is not None:
|
|
return int(tokens_reasoning)
|
|
if not is_reasoning and tokens_non_reasoning is not None:
|
|
return int(tokens_non_reasoning)
|
|
if is_reasoning and BACKGROUND_HEALTH_CHECK_MAX_TOKENS_REASONING is not None:
|
|
return int(BACKGROUND_HEALTH_CHECK_MAX_TOKENS_REASONING)
|
|
|
|
if BACKGROUND_HEALTH_CHECK_MAX_TOKENS is not None:
|
|
return int(BACKGROUND_HEALTH_CHECK_MAX_TOKENS)
|
|
|
|
if not is_wildcard:
|
|
return 16
|
|
|
|
return None
|
|
|
|
|
|
def _update_litellm_params_for_health_check(
    model_info: dict, litellm_params: dict
) -> dict:
    """
    Prepare `litellm_params` (mutated in place and returned) for a health check.

    - sets a short random `messages` payload
    - adds a bounded `max_tokens` when the deployment is a chat-style mode
      (`chat`, `completion`, `responses`) or the operator explicitly opts in
      via `model_info.health_check_supports_max_tokens`. Non-chat endpoints
      (image, embedding, audio_*, rerank, video, ocr, search, moderation, ...)
      reject unknown fields with 400 "Unknown parameter: 'max_tokens'".
    - forwards `model_info.health_check_reasoning_effort` for chat-style modes
    - replaces `model` with `health_check_model` if it exists
      Doc: https://docs.litellm.ai/docs/proxy/health#wildcard-routes
    - sets `voice` from `health_check_voice` (default "alloy") for
      `audio_speech` mode
      Doc: https://docs.litellm.ai/docs/proxy/health#text-to-speech-models
    - for Bedrock models with region routing (bedrock/region/model), strips the
      litellm routing prefix and region segments but preserves the model ID,
      and pins `custom_llm_provider` to `bedrock` (only when the deployment
      hasn't already set one, so an explicit `bedrock_converse` survives) so the
      bare model id still resolves to the provider (e.g. cross-region ids like
      `us.cohere.embed-v4:0`)
    """
    mode = _resolve_health_check_mode(
        model_info, litellm_params  # any-ok: untyped router config dict
    )
    litellm_params["messages"] = _get_random_llm_message()

    if _should_inject_health_check_max_tokens(
        model_info, mode  # any-ok: untyped router config dict
    ):
        # Resolved against the deployment's own model, before any override below.
        max_tokens = _resolve_health_check_max_tokens(model_info, litellm_params)
        if max_tokens is not None:
            litellm_params["max_tokens"] = max_tokens

    # Per-model reasoning effort for health checks only (e.g. reasoning_effort=none).
    if mode in _HEALTH_CHECK_MODES_SUPPORTING_REASONING_EFFORT:
        reasoning_effort = model_info.get("health_check_reasoning_effort", None)
        if reasoning_effort is not None:
            litellm_params["reasoning_effort"] = reasoning_effort

    override_model = model_info.get("health_check_model", None)
    if override_model is not None:
        litellm_params["model"] = override_model

    if mode == "audio_speech":
        litellm_params["voice"] = model_info.get("health_check_voice", "alloy")

    if not litellm_params["model"].startswith("bedrock/"):
        return litellm_params

    # Handle Bedrock region routing format: bedrock/region/model
    # This is needed because health checks bypass get_llm_provider() for the model param
    # Issue #15807: Without this, health checks send "region/model" as the model ID to AWS
    # which causes: "bedrock-runtime.../model/us-west-2/mistral.../invoke" (region in model ID)
    #
    # However, we must preserve cross-region inference profile prefixes like "us.", "eu.", etc.
    # Issue: Stripping these breaks AWS requirement for inference profile IDs
    #
    # Must also preserve route prefixes (converse/, invoke/) and handlers (llama/, deepseek_r1/, etc.)
    from litellm.llms.bedrock.common_utils import BedrockModelInfo

    # Strip only the bedrock/ prefix (preserve routes like converse/, invoke/)
    remainder = litellm_params["model"].removeprefix("bedrock/")

    # Drop path segments that are AWS regions and keep everything else:
    #   - "us-west-2/model" → "model"
    #   - "converse/us-west-2/model" → "converse/model"
    #   - "llama/arn:..." → "llama/arn:..." (preserve handler)
    kept_segments = [
        segment
        for segment in remainder.split("/")
        if segment not in BedrockModelInfo.all_global_regions
    ]
    litellm_params["model"] = "/".join(kept_segments)

    if not litellm_params.get("custom_llm_provider"):  # any-ok: untyped router dict
        litellm_params["custom_llm_provider"] = "bedrock"  # any-ok: untyped router dict

    return litellm_params
|
|
|
|
|
|
async def perform_health_check(
    model_list: list,
    model: Optional[str] = None,
    cli_model: Optional[str] = None,
    details: Optional[bool] = True,
    model_id: Optional[str] = None,
    max_concurrency: Optional[int] = None,
    instrumentation_context: Optional[dict] = None,
    health_check_skip_disabled_background_models: bool = False,
):
    """
    Select the deployments to probe and run a health check over them.

    When model_id is provided, only the deployment with that id is checked
    (so models that share the same name but have different ids are checked
    separately); if no deployment has that id the list is left unfiltered.
    When model (name) is provided, all deployments matching that name are
    checked, matching `litellm_params.model` first and `model_name` second.

    When ``health_check_skip_disabled_background_models`` is True (via
    ``general_settings.health_check_skip_disabled_background_models``), deployments
    with ``model_info.disable_background_health_check: true`` are omitted from
    this run (including targeted ``/health`` queries), consistent with the
    background health loop.

    If `model_list` is empty, a single entry is synthesized from `cli_model`
    when given. NOTE(review): that entry has no `model_info.id`, and
    `filter_deployments_by_id` skips deployments without an id - confirm the
    CLI path is still expected to probe something.

    Returns:
        `(healthy_endpoints, unhealthy_endpoints, exceptions_by_model_id)`;
        `([], [], {})` when there is nothing to check.

    Raises:
        Re-raises any exception from `_perform_health_check` (after logging it
        when instrumentation is enabled).
    """
    instrumentation_context = instrumentation_context or {}
    instrumentation_enabled = bool(instrumentation_context.get("enabled", False))
    cycle_id = instrumentation_context.get("cycle_id", "unknown")
    source = instrumentation_context.get("source", "unknown")

    if not model_list:
        if not cli_model:
            if instrumentation_enabled:
                logger.debug(
                    "health_check_cycle_skipped source=%s cycle_id=%s reason=no_models",
                    source,
                    cycle_id,
                )
            return [], [], {}
        model_list = [{"model_name": cli_model, "litellm_params": {"model": cli_model}}]

    cycle_start_time = time.monotonic()
    requested_model_count = len(model_list)

    # Filter by model_id first so a single deployment is checked when id is specified
    if model_id is not None:
        id_matches = [
            entry
            for entry in model_list
            if (entry.get("model_info") or {}).get("id") == model_id
        ]
        if id_matches:
            model_list = id_matches
    elif model is not None:
        name_matches = [
            entry for entry in model_list if entry["litellm_params"]["model"] == model
        ]
        if not name_matches:
            # Fall back to the public model_name (alias) when no params match.
            name_matches = [
                entry for entry in model_list if entry["model_name"] == model
            ]
        model_list = name_matches

    if health_check_skip_disabled_background_models:
        model_list = [
            entry
            for entry in model_list
            if not (entry.get("model_info") or {}).get(
                "disable_background_health_check", False
            )
        ]

    if not model_list:
        if instrumentation_enabled:
            logger.debug(
                "health_check_cycle_skipped source=%s cycle_id=%s reason=no_models_after_filter",
                source,
                cycle_id,
            )
        return [], [], {}

    post_filter_model_count = len(model_list)
    # filter duplicate deployments (e.g. when model alias'es are used)
    model_list = filter_deployments_by_id(model_list=model_list)
    deduped_model_count = len(model_list)

    if instrumentation_enabled:
        logger.debug(
            "health_check_cycle_start source=%s cycle_id=%s requested_model_count=%d post_model_filter_count=%d deduped_model_count=%d max_concurrency=%s thread_count=%d rss_mb=%s",
            source,
            cycle_id,
            requested_model_count,
            post_filter_model_count,
            deduped_model_count,
            max_concurrency,
            threading.active_count(),
            _rss_mb_for_log(),
        )

    try:
        outcome = await _perform_health_check(
            model_list,
            details,
            max_concurrency=max_concurrency,
            instrumentation_context=instrumentation_context,
        )
        healthy_endpoints, unhealthy_endpoints, exceptions_by_model_id = outcome
    except Exception:
        if instrumentation_enabled:
            logger.exception(
                "health_check_cycle_failed source=%s cycle_id=%s model_count=%d duration_ms=%.2f thread_count=%d rss_mb=%s",
                source,
                cycle_id,
                deduped_model_count,
                (time.monotonic() - cycle_start_time) * 1000,
                threading.active_count(),
                _rss_mb_for_log(),
            )
        raise

    if instrumentation_enabled:
        logger.debug(
            "health_check_cycle_complete source=%s cycle_id=%s model_count=%d healthy_count=%d unhealthy_count=%d duration_ms=%.2f thread_count=%d rss_mb=%s",
            source,
            cycle_id,
            deduped_model_count,
            len(healthy_endpoints),
            len(unhealthy_endpoints),
            (time.monotonic() - cycle_start_time) * 1000,
            threading.active_count(),
            _rss_mb_for_log(),
        )

    return healthy_endpoints, unhealthy_endpoints, exceptions_by_model_id
|