Files
MoFin/venv/lib/python3.12/site-packages/litellm/proxy/db/exception_handler.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

348 lines
14 KiB
Python

from typing import Any, Awaitable, Callable, Optional, Union
from litellm._logging import verbose_proxy_logger
from litellm.proxy._types import (
DB_CONNECTION_ERROR_TYPES,
ProxyErrorTypes,
ProxyException,
)
from litellm.secret_managers.main import str_to_bool
class PrismaDBExceptionHandler:
    """
    Classifies database exceptions (data-layer vs. connectivity) and applies
    the ``allow_requests_on_db_unavailable`` policy to them.
    """

    @staticmethod
    def should_allow_request_on_db_unavailable() -> bool:
        """
        Report whether requests may proceed even though the DB is unavailable.

        Reads ``allow_requests_on_db_unavailable`` from the proxy's
        ``general_settings`` (default ``False``). A bool is returned as-is;
        any other value goes through ``str_to_bool`` and only an explicit
        ``True`` result enables the fallback.
        """
        from litellm.proxy.proxy_server import general_settings

        configured: Union[bool, str] = general_settings.get(
            "allow_requests_on_db_unavailable", False
        )
        if isinstance(configured, bool):
            return configured
        return str_to_bool(configured) is True

    @staticmethod
    def is_database_connection_error(e: Exception) -> bool:
        """Decide whether ``e`` is (or might be) a DB-connectivity failure.

        A True result is what justifies the
        ``allow_requests_on_db_unavailable`` HA fallback.

        Known data-layer ``PrismaError`` subclasses (``UniqueViolationError``,
        ``RecordNotFoundError``, ...) are rejected first: the DB IS reachable
        in those cases, so a fallback that grants an anonymous token would be
        an auth bypass. Any other ``PrismaError`` (including a bare one)
        counts as a connection error so that genuine outages which do not
        map to a specific subclass still trigger the fallback.
        """
        import prisma

        prisma_errors = prisma.errors

        # The DB answered for all of these, so the fallback must NOT fire.
        reachable_db_errors = (
            prisma_errors.DataError,
            prisma_errors.UniqueViolationError,
            prisma_errors.ForeignKeyViolationError,
            prisma_errors.MissingRequiredValueError,
            prisma_errors.RawQueryError,
            prisma_errors.TableNotFoundError,
            prisma_errors.RecordNotFoundError,
        )
        if isinstance(e, reachable_db_errors):
            return False

        if isinstance(e, DB_CONNECTION_ERROR_TYPES) or isinstance(
            e, prisma_errors.PrismaError
        ):
            return True

        # The proxy's own "no DB connection" error also counts.
        return bool(
            isinstance(e, ProxyException)
            and e.type == ProxyErrorTypes.no_db_connection
        )

    @staticmethod
    def is_database_transport_error(e: Exception) -> bool:
        """
        Decide whether ``e`` is a transport/connectivity failure for which a
        reconnect attempt makes sense (DB unreachable, connection dropped).

        Intended for reconnect logic: data-layer errors such as
        ``UniqueViolationError`` mean the DB IS reachable, so reconnecting
        would be pointless.
        """
        import prisma

        if isinstance(e, DB_CONNECTION_ERROR_TYPES):
            return True

        client_closed_errors = (
            prisma.errors.ClientNotConnectedError,
            prisma.errors.HTTPClientClosedError,
        )
        if isinstance(e, client_closed_errors):
            return True

        if isinstance(e, prisma.errors.PrismaError):
            # Other Prisma errors are recognized by a connection-related
            # phrase in their lower-cased message.
            message = str(e).lower()
            for phrase in (
                "can't reach database server",
                "cannot reach database server",
                "can't connect",
                "cannot connect",
                "connection error",
                "connection closed",
                "timed out",
                "timeout",
                "connection refused",
                "network is unreachable",
                "no route to host",
                "broken pipe",
            ):
                if phrase in message:
                    return True

        return bool(
            isinstance(e, ProxyException)
            and e.type == ProxyErrorTypes.no_db_connection
        )

    @staticmethod
    def is_prisma_engine_internal_error(e: Exception) -> bool:
        """Detect a non-``PrismaError`` raised from inside prisma-client-py's
        query-engine layer.

        While a DB connection is being torn down, the query engine can return
        a malformed error payload (``user_facing_error.meta`` is ``null``).
        prisma-client-py's ``handle_response_errors`` then crashes with
        ``AttributeError: 'NoneType' object has no attribute 'get'`` before
        it can raise the proper P1001 "can't reach database server" error.
        That AttributeError carries no connection keyword, so it cannot be
        matched by message; it is identified by a traceback frame whose
        module name starts with ``prisma.engine`` instead.

        Recognized ``PrismaError`` subclasses always yield False here:
        connectivity ones are classified by type/keyword elsewhere, and
        data-layer ones (the DB IS reachable) must stay 401.
        """
        import prisma

        if isinstance(e, prisma.errors.PrismaError):
            return False

        # Walk the traceback chain looking for a frame executing in a
        # ``prisma.engine*`` module.
        entry = getattr(e, "__traceback__", None)
        while entry is not None:
            module_name = entry.tb_frame.f_globals.get("__name__", "")
            if module_name.startswith("prisma.engine"):
                return True
            entry = entry.tb_next
        return False

    @staticmethod
    def is_database_service_unavailable_error(e: Exception) -> bool:
        """Decide whether the database failed to answer at the infrastructure
        level (connection refused, socket/interface failure, timeout), as
        opposed to a genuine auth failure (key not found) or a data-layer
        error (the DB IS reachable and rejected the data).

        Auth must answer 401 only for a key the DB confirms is invalid. When
        the DB itself is unreachable the request has to surface as 503 so
        callers retry instead of treating valid keys as invalid during an
        outage.

        Notes:
            * prisma-client-py mislabels the P1001 "can't reach database
              server" connectivity failure as a ``DataError`` (a data-layer
              type), so a type-only check misses real outages.
              ``is_database_transport_error`` keyword-matches the connection
              message and catches that masquerade, while genuine data errors
              (no connection keyword) correctly stay 401.
            * The Postgres "cached plan must not change result type" error is
              matched here and not in ``is_database_transport_error``: it is
              a transient stale-DB-state condition (not an invalid key), but
              the connection is healthy so it must not trigger a reconnect.
            * A non-``PrismaError`` raised from inside the prisma query engine
              (e.g. the ``AttributeError`` from ``handle_response_errors``
              when the engine returns a malformed error payload
              mid-tear-down) is also treated as unavailable; see
              ``is_prisma_engine_internal_error``.
        """
        import asyncio

        handler = PrismaDBExceptionHandler
        if (
            handler.is_database_connection_error(e)
            or handler.is_database_transport_error(e)
            or handler.is_prisma_engine_internal_error(e)
        ):
            return True

        if "cached plan must not change result type" in str(e).lower():
            return True

        # OSError already covers ConnectionError and (Py3.3+) TimeoutError.
        # asyncio.TimeoutError is a distinct class before Py3.11.
        if isinstance(e, (OSError, asyncio.TimeoutError)):
            return True

        # asyncpg is optional; without it there is nothing further to match.
        try:
            import asyncpg
        except ImportError:
            return False

        asyncpg_unavailable_errors = (
            asyncpg.exceptions.PostgresConnectionError,
            asyncpg.exceptions.InterfaceError,
        )
        return isinstance(e, asyncpg_unavailable_errors)

    @staticmethod
    def handle_db_exception(e: Exception):
        """
        Primary handler for the `allow_requests_on_db_unavailable` flag.

        - If `e` is a DB connection error and the flag is enabled, the
          exception is swallowed and None is returned.
        - In every other case `e` is raised.
        """
        if PrismaDBExceptionHandler.is_database_connection_error(e):
            # The flag is consulted only for connection errors.
            if PrismaDBExceptionHandler.should_allow_request_on_db_unavailable():
                return None
        raise e
# Last-resort timeouts, in seconds, used by `call_with_db_reconnect_retry` when
# the caller passes no override and `prisma_client` does not expose a numeric
# `_db_auth_reconnect_timeout_seconds` / `_db_auth_reconnect_lock_timeout_seconds`.
# They match the auth path's existing defaults so behavior is uniform across read paths.
# Fallback for the reconnect cycle timeout (`timeout_seconds`).
_DEFAULT_RECONNECT_TIMEOUT_SECONDS = 2.0
# Fallback for how long to wait to acquire the reconnect lock (`lock_timeout_seconds`).
_DEFAULT_RECONNECT_LOCK_TIMEOUT_SECONDS = 0.1
def _coerce_timeout(value: Any, fallback: float) -> float:
    """Normalize a timeout setting to a float.

    `value` is accepted only when it is a genuine int/float; bools and
    everything else (None, strings, MagicMock slots left behind by tests
    that mock `prisma_client`, ...) yield `fallback` unchanged.
    """
    is_number = isinstance(value, (int, float))
    # bool is a subclass of int, so it has to be rejected explicitly.
    if not is_number or isinstance(value, bool):
        return fallback
    return float(value)
async def call_with_db_reconnect_retry(
    prisma_client: Any,
    coro_factory: Callable[[], Awaitable[Any]],
    *,
    reason: str,
    timeout_seconds: Optional[float] = None,
    lock_timeout_seconds: Optional[float] = None,
) -> Any:
    """Await a Prisma read, with a single reconnect-and-retry on transport failure.

    This is the shared "self-heal a transient DB transport blip" wrapper used
    by `PrismaClient.get_generic_data` and other read paths. It mirrors the
    inline pattern in `auth_checks._fetch_key_object_from_db_with_reconnect`
    so there is one implementation rather than three drifting copies.

    Flow:
        1. Await `coro_factory()`; a successful result is returned directly.
        2. If it raises something that `is_database_transport_error` does not
           recognize, the exception is re-raised. Data-layer errors such as
           `UniqueViolationError` mean the DB is reachable, so reconnecting
           would be pointless.
        3. If `prisma_client` has no `attempt_db_reconnect` attribute, the
           exception is re-raised. This guards against partial stand-ins /
           older clients in tests.
        4. `prisma_client.attempt_db_reconnect(reason=...)` is awaited. A
           falsy result (cooldown / lock contention / reconnect failure)
           re-raises the original exception.
        5. Otherwise `coro_factory()` is awaited a second time and its result
           or exception propagates. There is at most one retry by
           construction, so no infinite loop.

    `coro_factory` MUST be a zero-arg callable producing a fresh awaitable on
    every call. Handing in an already-awaited coroutine would fail on retry
    with `RuntimeError: cannot reuse already awaited coroutine`.

    `reason` should follow `<subsystem>_<operation>_<table>_failure` so
    telemetry can tell fan-out callers apart (e.g. `_update_config_from_db`
    issues four concurrent reads).

    Args:
        prisma_client: The `PrismaClient` (or stand-in) that owns
            `attempt_db_reconnect` and the `_db_auth_reconnect_*` defaults.
        coro_factory: Zero-arg callable returning the read awaitable.
        reason: Telemetry tag forwarded to `attempt_db_reconnect`.
        timeout_seconds: Optional override for the reconnect cycle timeout.
            Falls back to `prisma_client._db_auth_reconnect_timeout_seconds`,
            then to 2.0s.
        lock_timeout_seconds: Optional override for how long to wait for the
            reconnect lock. Falls back to
            `prisma_client._db_auth_reconnect_lock_timeout_seconds`, then to
            0.1s.

    Returns:
        Whatever `coro_factory()` returns (on the first or second attempt).

    Raises:
        Whatever `coro_factory()` raises when the failure is not a transport
        error, when the reconnect attempt does not succeed, or when the retry
        fails as well.
    """
    try:
        return await coro_factory()
    except Exception as first_exc:
        transport_failure = PrismaDBExceptionHandler.is_database_transport_error(
            first_exc
        )
        # Only a transport failure on a client that can reconnect is retried.
        if not (transport_failure and hasattr(prisma_client, "attempt_db_reconnect")):
            raise

        # Explicit argument wins, then the client's attribute; anything that
        # is not a real number falls back to the module default.
        if timeout_seconds is None:
            raw_timeout = getattr(
                prisma_client, "_db_auth_reconnect_timeout_seconds", None
            )
        else:
            raw_timeout = timeout_seconds
        reconnect_timeout = _coerce_timeout(
            raw_timeout, _DEFAULT_RECONNECT_TIMEOUT_SECONDS
        )

        if lock_timeout_seconds is None:
            raw_lock_timeout = getattr(
                prisma_client, "_db_auth_reconnect_lock_timeout_seconds", None
            )
        else:
            raw_lock_timeout = lock_timeout_seconds
        reconnect_lock_timeout = _coerce_timeout(
            raw_lock_timeout, _DEFAULT_RECONNECT_LOCK_TIMEOUT_SECONDS
        )

        verbose_proxy_logger.warning(
            "DB transport error on read; attempting reconnect-and-retry. reason=%s error=%s",
            reason,
            first_exc,
        )

        # Keep the original transport error as what callers and telemetry
        # see. If `attempt_db_reconnect` itself raises (e.g. lock
        # cancellation, timer error, unexpected internal failure), letting
        # that exception escape would mask the actual DB transport problem
        # in `failure_handler` / `db_exceptions` alerts. The reconnect error
        # is chained as the cause so it remains available for debugging.
        try:
            reconnected = await prisma_client.attempt_db_reconnect(
                reason=reason,
                timeout_seconds=reconnect_timeout,
                lock_timeout_seconds=reconnect_lock_timeout,
            )
        except Exception as reconnect_exc:
            verbose_proxy_logger.warning(
                "DB reconnect attempt raised; preserving original transport error. "
                "reason=%s reconnect_error=%s",
                reason,
                reconnect_exc,
            )
            raise first_exc from reconnect_exc

        if not reconnected:
            # Bare raise re-raises the original transport error.
            raise

    # Single retry only. If this also fails with a transport error it simply
    # propagates; repeated reconnect loops are the watchdog's job, not this
    # helper's.
    return await coro_factory()