fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
500 lines
19 KiB
Python
500 lines
19 KiB
Python
import os
|
|
import threading
|
|
from collections import OrderedDict
|
|
from typing import TYPE_CHECKING, Any, Optional, Tuple, Union
|
|
|
|
from litellm._logging import verbose_logger
|
|
from litellm.integrations.arize import _utils
|
|
from litellm.integrations.arize._utils import ArizeOTELAttributes
|
|
from litellm.types.integrations.arize_phoenix import ArizePhoenixConfig
|
|
|
|
# Static type checkers get the real OpenTelemetry / LiteLLM types. At runtime
# the same names are bound to ``Any`` placeholders, and only the LiteLLM
# OpenTelemetry integration is imported (guarded against ImportError).
if TYPE_CHECKING:
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import SpanProcessor
    from opentelemetry.trace import Span as _Span
    from opentelemetry.trace import SpanKind, Tracer

    from litellm.integrations.opentelemetry import OpenTelemetry as _OpenTelemetry
    from litellm.integrations.opentelemetry import (
        OpenTelemetryConfig as _OpenTelemetryConfig,
    )
    from litellm.types.integrations.arize import Protocol as _Protocol

    Protocol = _Protocol
    OpenTelemetryConfig = _OpenTelemetryConfig
    Span = Union[_Span, Any]
    OpenTelemetry = _OpenTelemetry
    LITELLM_TRACER_NAME: str
else:
    # Runtime placeholders for names that are only used in annotations.
    Protocol = Any
    OpenTelemetryConfig = Any
    Span = Any
    Tracer = Any
    TracerProvider = Any
    SpanKind = Any
    SpanProcessor = Any
    try:
        from litellm.integrations.opentelemetry import (
            LITELLM_TRACER_NAME,
            OpenTelemetry,
        )
    except ImportError:
        # Fall back to a literal tracer name and no base class when the
        # LiteLLM OpenTelemetry integration cannot be imported.
        LITELLM_TRACER_NAME = "litellm"
        OpenTelemetry = None  # type: ignore


# Trace endpoint of Arize-hosted Phoenix.
ARIZE_HOSTED_PHOENIX_ENDPOINT = "https://otlp.arize.com/v1/traces"
# Upper bound on cached per-project TracerProviders (LRU-evicted beyond this).
_MAX_PROJECT_PROVIDERS = 64
|
|
|
|
|
|
class ArizePhoenixLogger(OpenTelemetry): # type: ignore
|
|
"""
|
|
Arize Phoenix logger that sends traces to a Phoenix endpoint.
|
|
|
|
Creates its own dedicated TracerProvider so it can coexist with the
|
|
generic ``otel`` callback (or any other OTEL-based integration) without
|
|
fighting over the global ``opentelemetry.trace`` TracerProvider singleton.
|
|
"""
|
|
|
|
def _init_tracing(self, tracer_provider):
    """
    Set up tracing with per-project TracerProviders (LRU-cached) for Arize Phoenix.

    The base ``OpenTelemetry._init_tracing`` falls back to the global
    TracerProvider when one already exists. That causes whichever
    integration initialises second to silently reuse the first one's
    exporter, so spans only reach one destination.

    Args:
        tracer_provider: Optional externally supplied provider. When given,
            it is used as-is and no per-project cache is created.
    """
    from opentelemetry.trace import SpanKind

    self._use_injected_tracer_provider = tracer_provider is not None
    if self._use_injected_tracer_provider:
        # Caller owns the provider: no shared processor, no project cache.
        self._shared_span_processor = None
        self.tracer = tracer_provider.get_tracer(LITELLM_TRACER_NAME)
        self.span_kind = SpanKind
        return

    # Own mode: one span processor shared by every per-project provider.
    self._project_providers: OrderedDict[str, TracerProvider] = OrderedDict()
    self._project_providers_lock = threading.Lock()
    self._shared_span_processor = self._get_span_processor()
    self.span_kind = SpanKind

    # Resolve the project with empty kwargs (env or "default") and make its
    # tracer the logger's default tracer.
    fallback_project = self._resolve_project_name({})
    self.tracer = self._get_tracer_for(fallback_project)
    verbose_logger.debug(
        "ArizePhoenixLogger: Initialized per-project TracerProvider cache "
        "(default_project=%s, endpoint=%s, exporter=%s)",
        fallback_project,
        self.config.endpoint,
        self.config.exporter,
    )
|
|
|
|
def flush_tracer_providers(self) -> None:
|
|
"""
|
|
Flush all cached per-project providers and the shared span processor.
|
|
|
|
Call on graceful proxy shutdown. Do not call on LRU eviction — in-flight
|
|
spans may still reference evicted providers.
|
|
"""
|
|
if getattr(self, "_use_injected_tracer_provider", False):
|
|
return
|
|
|
|
shared_processor = getattr(self, "_shared_span_processor", None)
|
|
if shared_processor is not None:
|
|
try:
|
|
shared_processor.force_flush()
|
|
except Exception as e:
|
|
verbose_logger.debug(
|
|
"ArizePhoenixLogger: shared span processor force_flush failed: %s",
|
|
e,
|
|
)
|
|
|
|
with getattr(self, "_project_providers_lock", threading.Lock()):
|
|
providers = list(getattr(self, "_project_providers", {}).values())
|
|
|
|
for provider in providers:
|
|
try:
|
|
provider.force_flush()
|
|
except Exception as e:
|
|
verbose_logger.debug(
|
|
"ArizePhoenixLogger: TracerProvider force_flush failed: %s", e
|
|
)
|
|
|
|
def _get_litellm_resource_for_project(self, project_name: str):
    """
    Build an OTEL Resource whose project routing attributes override the env detector.

    Phoenix uses ``openinference.project.name``; Arize AX uses ``model_id`` and
    ``service.name``. Project attrs are merged last so OTEL_RESOURCE_ATTRIBUTES
    from init does not pin every provider to one project.

    Args:
        project_name: Project that spans from this resource are routed to.

    Returns:
        The environment-detected resource merged with the project attributes.
    """
    from opentelemetry.sdk.resources import OTELResourceDetector, Resource

    routing_attrs: dict[str, str] = {
        key: project_name
        for key in ("openinference.project.name", "model_id", "service.name")
    }
    environment = getattr(self.config, "deployment_environment", None)
    if environment is not None:
        routing_attrs["deployment.environment"] = environment

    # Detect first, then merge the project attributes on top of the result.
    detected = OTELResourceDetector().detect()
    return detected.merge(Resource.create(routing_attrs))  # type: ignore[arg-type]
|
|
|
|
def _build_tracer_provider_for_project(self, project_name: str) -> TracerProvider:
    """
    Create a new TracerProvider for *project_name*.

    The provider gets the project-specific resource and the logger's shared
    span processor. The caller holds no cache lock while this runs.
    """
    from opentelemetry.sdk.trace import TracerProvider

    project_resource = self._get_litellm_resource_for_project(project_name)
    new_provider = TracerProvider(resource=project_resource)
    new_provider.add_span_processor(self._shared_span_processor)
    return new_provider
|
|
|
|
def _get_tracer_for(self, project_name: str) -> Tracer:
    """
    Return a tracer for *project_name*, creating and caching a provider on a miss.

    With an injected TracerProvider the logger's single tracer is returned.
    Otherwise providers live in an LRU-ordered cache capped at
    ``_MAX_PROJECT_PROVIDERS``; the least recently used entry is dropped
    (without being flushed or shut down) when the cache is full.
    """
    if getattr(self, "_use_injected_tracer_provider", False):
        return self.tracer

    def _touch_cached_provider():
        # Caller must hold the cache lock. Marks a hit as most recently used.
        if project_name not in self._project_providers:
            return None
        self._project_providers.move_to_end(project_name)
        return self._project_providers[project_name]

    with self._project_providers_lock:
        hit = _touch_cached_provider()
        if hit is not None:
            return hit.get_tracer(LITELLM_TRACER_NAME)

    # OTELResourceDetector().detect() is synchronous; build outside the lock so
    # concurrent requests for other projects are not blocked on cache misses.
    candidate = self._build_tracer_provider_for_project(project_name)

    with self._project_providers_lock:
        # Another thread may have cached this project while we were building;
        # prefer its provider and discard ours.
        hit = _touch_cached_provider()
        if hit is not None:
            return hit.get_tracer(LITELLM_TRACER_NAME)

        if len(self._project_providers) >= _MAX_PROJECT_PROVIDERS:
            self._project_providers.popitem(last=False)

        self._project_providers[project_name] = candidate
        return candidate.get_tracer(LITELLM_TRACER_NAME)
|
|
|
|
def _resolve_tracer_for_kwargs(self, kwargs: dict) -> Tuple[str, Tracer]:
    """Resolve the request's project once; return ``(project_name, tracer)``."""
    resolved_project = self._resolve_project_name(kwargs)
    tracer = self._get_tracer_for(resolved_project)
    return resolved_project, tracer
|
|
|
|
def get_tracer_to_use_for_request(self, kwargs: dict) -> Tracer:
    """
    Return the tracer that guardrail/raw-request spans should use.

    These spans are routed to the same per-project tracer as the request,
    unless an injected TracerProvider is in use, in which case the logger's
    single tracer is returned.
    """
    if not getattr(self, "_use_injected_tracer_provider", False):
        _, request_tracer = self._resolve_tracer_for_kwargs(kwargs)
        return request_tracer
    return self.tracer
|
|
|
|
def _init_otel_logger_on_litellm_proxy(self):
|
|
"""
|
|
Override: Arize Phoenix should NOT overwrite the proxy's
|
|
``open_telemetry_logger``. That attribute is reserved for the
|
|
primary ``otel`` callback which handles proxy-level parent spans.
|
|
"""
|
|
pass
|
|
|
|
def set_attributes(self, span: Span, kwargs, response_obj: Optional[Any]):
    """Populate *span* with Arize Phoenix attributes for this request/response."""
    # Deliberately addressed through the class, not ``self``.
    ArizePhoenixLogger.set_arize_phoenix_attributes(span, kwargs, response_obj)
|
|
|
|
@staticmethod
def set_arize_phoenix_attributes(span: Span, kwargs, response_obj):
    """Delegate to ``_utils.set_attributes`` using ``ArizeOTELAttributes``."""
    _utils.set_attributes(span, kwargs, response_obj, ArizeOTELAttributes)
|
|
|
|
@staticmethod
|
|
def _normalize_project_name(name: Optional[str]) -> Optional[str]:
|
|
if name is None:
|
|
return None
|
|
normalized = str(name).strip()
|
|
return normalized if normalized else None
|
|
|
|
@staticmethod
|
|
def _iter_metadata_dicts_from_kwargs(kwargs: dict):
|
|
"""Yield request metadata dicts; standard_logging_object before litellm_params."""
|
|
for key in ("standard_logging_object", "litellm_params"):
|
|
found_key = kwargs.get(key)
|
|
if not isinstance(found_key, dict):
|
|
continue
|
|
metadata = found_key.get("metadata")
|
|
if isinstance(metadata, dict):
|
|
yield metadata
|
|
|
|
@staticmethod
|
|
def _is_proxy_request(kwargs: dict) -> bool:
|
|
"""True when the call is routed through the LiteLLM proxy.
|
|
|
|
Proxy mode is determined solely by the server-set ``proxy_server_request``
|
|
field in ``litellm_params``. Checking request metadata for
|
|
``user_api_key_auth_metadata`` is intentionally avoided: that field is
|
|
user-supplied and would let an authenticated caller fake proxy-mode
|
|
detection to route their telemetry into arbitrary Arize/Phoenix projects.
|
|
"""
|
|
litellm_params = kwargs.get("litellm_params")
|
|
return isinstance(litellm_params, dict) and bool(
|
|
litellm_params.get("proxy_server_request")
|
|
)
|
|
|
|
@staticmethod
|
|
def _project_from_metadata_dict(
|
|
metadata: dict, metadata_key: str, *, proxy_mode: bool
|
|
) -> Optional[str]:
|
|
"""
|
|
Read a Phoenix project field from proxy/SDK metadata.
|
|
|
|
On the proxy, only ``user_api_key_auth_metadata`` (team/key config) may
|
|
select the project. SDK callers may still set project fields directly on
|
|
``metadata``.
|
|
"""
|
|
auth_metadata = metadata.get("user_api_key_auth_metadata")
|
|
if isinstance(auth_metadata, dict):
|
|
project = ArizePhoenixLogger._normalize_project_name(
|
|
auth_metadata.get(metadata_key)
|
|
)
|
|
if project:
|
|
return project
|
|
|
|
if not proxy_mode:
|
|
return ArizePhoenixLogger._normalize_project_name(
|
|
metadata.get(metadata_key)
|
|
)
|
|
return None
|
|
|
|
@staticmethod
|
|
def _metadata_project_from_kwargs(kwargs: dict, metadata_key: str) -> Optional[str]:
|
|
proxy_mode = ArizePhoenixLogger._is_proxy_request(kwargs)
|
|
for metadata in ArizePhoenixLogger._iter_metadata_dicts_from_kwargs(kwargs):
|
|
project = ArizePhoenixLogger._project_from_metadata_dict(
|
|
metadata, metadata_key, proxy_mode=proxy_mode
|
|
)
|
|
if project:
|
|
return project
|
|
return None
|
|
|
|
@staticmethod
|
|
def _resolve_project_name(kwargs: dict) -> str:
|
|
"""
|
|
Resolve the target Phoenix/Arize project for this request.
|
|
|
|
Proxy priority: ``user_api_key_auth_metadata.phoenix_project_name_override``,
|
|
``user_api_key_auth_metadata.phoenix_project_name``, env, then ``default``.
|
|
SDK priority: request metadata fields, then env, then ``default``.
|
|
"""
|
|
override = ArizePhoenixLogger._metadata_project_from_kwargs(
|
|
kwargs, "phoenix_project_name_override"
|
|
)
|
|
if override:
|
|
return override
|
|
|
|
phoenix_name = ArizePhoenixLogger._metadata_project_from_kwargs(
|
|
kwargs, "phoenix_project_name"
|
|
)
|
|
if phoenix_name:
|
|
return phoenix_name
|
|
|
|
env_name = ArizePhoenixLogger._normalize_project_name(
|
|
os.environ.get("PHOENIX_PROJECT_NAME")
|
|
or os.environ.get("ARIZE_PROJECT_NAME")
|
|
)
|
|
if env_name:
|
|
return env_name
|
|
|
|
return "default"
|
|
|
|
def _get_phoenix_context(self, kwargs, tracer: Optional[Tracer] = None):
    """
    Build a trace context for Phoenix's dedicated TracerProvider.

    The base ``_get_span_context`` returns parent spans from the global
    TracerProvider (the ``otel`` callback). Those spans live on a
    *different* TracerProvider, so they won't appear in Phoenix — using
    them as parents just creates broken links.

    Instead we:
    1. Honour an incoming ``traceparent`` HTTP header (distributed tracing).
    2. In proxy mode, create our *own* parent span on Phoenix's tracer
       so the hierarchy is visible end-to-end inside Phoenix.
    3. In SDK (non-proxy) mode, create no parent span.

    Args:
        kwargs: LiteLLM logging kwargs for the request.
        tracer: Tracer to start the parent span on; resolved from *kwargs*
            when omitted.

    Returns:
        ``(context, parent_span)``. In SDK mode ``parent_span`` is ``None``
        and ``context`` is the traceparent context or ``None``.
    """
    from opentelemetry import trace

    if tracer is None:
        _, tracer = self._resolve_tracer_for_kwargs(kwargs)

    params = kwargs.get("litellm_params", {}) or {}
    proxy_request = params.get("proxy_server_request", {}) or {}
    request_headers = proxy_request.get("headers", {}) or {}

    incoming_ctx = None
    if request_headers.get("traceparent"):
        incoming_ctx = self.get_traceparent_from_header(headers=request_headers)

    # SDK mode: no proxy request recorded, so no synthetic parent span.
    if not proxy_request:
        return incoming_ctx, None

    started_at = kwargs.get("start_time", kwargs.get("api_call_start_time"))
    start_ns = None if started_at is None else self._to_ns(started_at)
    proxy_span = tracer.start_span(
        name="litellm_proxy_request",
        start_time=start_ns,
        context=incoming_ctx,
        kind=self.span_kind.SERVER,
    )
    return trace.set_span_in_context(proxy_span), proxy_span
|
|
|
|
def _handle_success(self, kwargs, response_obj, start_time, end_time):
|
|
self._handle_phoenix_trace(
|
|
kwargs, response_obj, start_time, end_time, success=True
|
|
)
|
|
|
|
def _handle_failure(self, kwargs, response_obj, start_time, end_time):
|
|
self._handle_phoenix_trace(
|
|
kwargs, response_obj, start_time, end_time, success=False
|
|
)
|
|
|
|
def _handle_phoenix_trace(
    self,
    kwargs,
    response_obj,
    start_time,
    end_time,
    *,
    success: bool,
):
    """
    Emit the Phoenix spans for one finished LLM call.

    Steps, in order: resolve the per-project tracer, build the Phoenix
    context (with an optional proxy parent span), create and end the request
    span, create the guardrail span, finish the parent span, and — for
    successful calls — record metrics and optionally emit semantic logs.

    Args:
        kwargs: LiteLLM logging kwargs for the request.
        response_obj: Response (or error payload) passed by the caller.
        start_time: Call start, converted with ``self._to_ns``.
        end_time: Call end, converted with ``self._to_ns``.
        success: Whether the call succeeded; selects span status and the
            success-only / failure-only steps.
    """
    from opentelemetry.trace import Status, StatusCode

    verbose_logger.debug(
        "ArizePhoenixLogger: %s - kwargs: %s, OTEL config settings=%s",
        "success" if success else "failure",
        kwargs,
        self.config,
    )

    _, tracer = self._resolve_tracer_for_kwargs(kwargs)
    ctx, parent_span = self._get_phoenix_context(kwargs, tracer=tracer)
    outcome_status = Status(StatusCode.OK if success else StatusCode.ERROR)

    # Request span.
    span = tracer.start_span(
        name=self._get_span_name(kwargs),
        start_time=self._to_ns(start_time),
        context=ctx,
    )
    span.set_status(outcome_status)
    self.set_attributes(span, kwargs, response_obj)
    if success:
        self._maybe_log_raw_request(
            kwargs, response_obj, start_time, end_time, span
        )
    else:
        self._record_exception_on_span(span=span, kwargs=kwargs)
    span.end(end_time=self._to_ns(end_time))

    self._create_guardrail_span(kwargs=kwargs, context=ctx)

    # Proxy parent span (only present in proxy mode) mirrors the request span.
    if parent_span is not None:
        parent_span.set_status(outcome_status)
        self.set_attributes(parent_span, kwargs, response_obj)
        if not success:
            self._record_exception_on_span(span=parent_span, kwargs=kwargs)
        parent_span.end(end_time=self._to_ns(end_time))

    if not success:
        return

    self._record_metrics(kwargs, response_obj, start_time, end_time)

    # NOTE(review): semantic logs are assumed to be success-only; the nesting
    # of this check was ambiguous in the reviewed copy — confirm.
    if self.config.enable_events:
        self._emit_semantic_logs(kwargs, response_obj, span)
|
|
|
|
@staticmethod
def get_arize_phoenix_config() -> ArizePhoenixConfig:
    """
    Build the Arize Phoenix configuration from environment variables.

    Environment variables read:
        PHOENIX_API_KEY: sent as ``Authorization=Bearer <key>`` when set.
        PHOENIX_COLLECTOR_HTTP_ENDPOINT: preferred collector endpoint.
        PHOENIX_COLLECTOR_ENDPOINT: used when the HTTP variable is unset/empty.
        PHOENIX_PROJECT_NAME: project name, ``"default"`` when unset/empty.

    Endpoint rules:
        * no endpoint -> ``http://localhost:6006/v1/traces`` over HTTP;
        * ``grpc://...`` or port ``:4317`` without ``/v1/traces`` -> gRPC, as given;
        * Phoenix Cloud hosts or URLs already containing ``/v1/traces`` -> as given;
        * anything else -> ``/v1/traces`` is appended. Trailing slashes are
          stripped first, so ``.../v1/`` yields ``.../v1/traces`` rather than
          ``.../v1/v1/traces``.

    Returns:
        ArizePhoenixConfig with auth headers, protocol, endpoint and project name.

    Raises:
        ValueError: Phoenix Cloud endpoint configured without PHOENIX_API_KEY.
    """
    api_key = os.environ.get("PHOENIX_API_KEY", None)

    # The HTTP-specific variable wins; otherwise use the generic collector one.
    collector_endpoint = os.environ.get(
        "PHOENIX_COLLECTOR_HTTP_ENDPOINT", None
    ) or os.environ.get("PHOENIX_COLLECTOR_ENDPOINT", None)

    endpoint = None
    protocol: Protocol = "otlp_http"

    if not collector_endpoint:
        endpoint = "http://localhost:6006/v1/traces"
        verbose_logger.debug(
            f"No PHOENIX_COLLECTOR_ENDPOINT found, using default local Phoenix endpoint: {endpoint}"
        )
    elif collector_endpoint.startswith("grpc://") or (
        ":4317" in collector_endpoint and "/v1/traces" not in collector_endpoint
    ):
        endpoint = collector_endpoint
        protocol = "otlp_grpc"
    elif (
        "app.phoenix.arize.com" in collector_endpoint
        or "/v1/traces" in collector_endpoint
    ):
        # Already a complete traces URL (or Phoenix Cloud): use verbatim.
        endpoint = collector_endpoint
    else:
        # Normalise trailing slashes before appending the traces path so
        # ".../v1/" does not turn into ".../v1/v1/traces".
        base = collector_endpoint.rstrip("/")
        if base.endswith("/v1"):
            endpoint = f"{base}/traces"
        else:
            endpoint = f"{base}/v1/traces"

    otlp_auth_headers = None
    if api_key is not None:
        otlp_auth_headers = f"Authorization=Bearer {api_key}"
    elif "app.phoenix.arize.com" in endpoint:
        raise ValueError(
            "PHOENIX_API_KEY must be set when using Phoenix Cloud (app.phoenix.arize.com)."
        )

    project_name = os.environ.get("PHOENIX_PROJECT_NAME") or "default"

    return ArizePhoenixConfig(
        otlp_auth_headers=otlp_auth_headers,
        protocol=protocol,
        endpoint=endpoint,
        project_name=project_name,
    )
|
|
|
|
async def async_health_check(self):
|
|
config = self.get_arize_phoenix_config()
|
|
|
|
if not config.otlp_auth_headers:
|
|
return {
|
|
"status": "unhealthy",
|
|
"error_message": "PHOENIX_API_KEY environment variable not set",
|
|
}
|
|
|
|
return {
|
|
"status": "healthy",
|
|
"message": "Arize-Phoenix credentials are configured properly",
|
|
}
|