Files
MoFin/venv/lib/python3.12/site-packages/litellm/llms/soniox/common_utils.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

275 lines
9.2 KiB
Python

"""
Shared utilities for the Soniox provider (https://soniox.com).
"""
from typing import Any, Dict, List, Optional
from litellm.llms.base_llm.chat.transformation import BaseLLMException
# Public Soniox API endpoint. `get_soniox_api_base` falls back to this value
# when neither its `api_base` argument nor the `SONIOX_API_BASE` secret is set.
SONIOX_API_BASE: str = "https://api.soniox.com"
# Default polling interval in seconds when waiting for an async transcription
# to finish. Mirrors the Soniox SDK default.
SONIOX_DEFAULT_POLL_INTERVAL: float = 1.0
# Minimum polling interval (in seconds) accepted from the caller-supplied
# `soniox_polling_interval` kwarg. Prevents an authenticated caller from
# forcing a worker into a tight poll loop with a zero/near-zero interval.
SONIOX_MIN_POLL_INTERVAL: float = 0.5
# Maximum polling interval (in seconds). Prevents a caller from setting an
# excessively large or non-finite interval that would keep a worker sleeping
# far longer than necessary between status checks.
SONIOX_MAX_POLL_INTERVAL: float = 60.0
# Default maximum number of polling attempts. With the default 1s interval
# that is 1800 * 1s ~= 30 minutes.
SONIOX_DEFAULT_MAX_POLL_ATTEMPTS: int = 1800
# Hard upper bound on polling attempts, so a caller cannot request an
# arbitrarily large attempt count. At the smallest allowed interval
# (`SONIOX_MIN_POLL_INTERVAL`) this is 6000 * 0.5s = 3000s (50 minutes).
# NOTE(review): at `SONIOX_MAX_POLL_INTERVAL` the same attempt count permits
# 6000 * 60s = 100 hours, so this constant alone does not bound wall-clock
# time -- confirm whether the polling loop enforces a separate total timeout.
SONIOX_MAX_POLL_ATTEMPTS: int = 6000
# Default cleanup behaviour: delete both the uploaded file (if any) and the
# transcription record after the transcript has been fetched.
# NOTE: this is a module-level mutable list; copy it rather than mutating it
# in place, otherwise the change is visible to every later user of the default.
SONIOX_DEFAULT_CLEANUP: List[str] = ["file", "transcription"]
# Body fields that may carry secrets and must be redacted before being
# forwarded to logging callbacks. Soniox accepts a webhook auth header value
# alongside the create-transcription request; that value lets the recipient
# authenticate webhook callbacks and must not leak into observability sinks.
SONIOX_SECRET_FIELDS: List[str] = ["webhook_auth_header_value"]
class SonioxException(BaseLLMException):
    """Exception type for the Soniox provider.

    Adds no behaviour of its own; it only gives Soniox errors a distinct
    class on top of ``BaseLLMException``.
    """
def get_soniox_api_key(api_key: Optional[str] = None) -> Optional[str]:
    """Return the Soniox API key to use.

    A non-empty ``api_key`` argument wins; otherwise the ``SONIOX_API_KEY``
    secret is looked up via ``get_secret_str`` (which may yield ``None``).
    """
    # Imported inside the function on purpose: litellm.secret_managers.main
    # imports from litellm at top level, so a module-level import here would
    # be circular.
    from litellm.secret_managers.main import get_secret_str

    if api_key:
        return api_key
    return get_secret_str("SONIOX_API_KEY")
def get_soniox_api_base(api_base: Optional[str] = None) -> str:
    """Return the Soniox API base URL without trailing slashes.

    Lookup order: the ``api_base`` argument, then the ``SONIOX_API_BASE``
    secret (via ``get_secret_str``), then the module default
    ``SONIOX_API_BASE``. Empty values fall through to the next source.
    """
    # Function-scope import, matching the other resolver in this module.
    from litellm.secret_managers.main import get_secret_str

    resolved = api_base
    if not resolved:
        resolved = get_secret_str("SONIOX_API_BASE")
    if not resolved:
        resolved = SONIOX_API_BASE
    return resolved.rstrip("/")
def render_soniox_tokens(tokens: List[Dict[str, Any]]) -> str:
    """
    Render a list of Soniox tokens to a readable transcript string.

    Mirrors the behaviour of the official Soniox SDK's `renderTokens` helper:
    - When the speaker changes, a `Speaker N:` tag is inserted (preceded by a
      blank line for every speaker after the first).
    - When the language changes, a `[lang]` (or `[Translation] [lang]`) tag is
      inserted on a new line, and the leading whitespace of the token that
      follows the tag is stripped.

    If neither speaker nor language information is present on any token (i.e.
    diarization and language identification are disabled), the function simply
    concatenates the token texts.

    Args:
        tokens: Token dicts. The keys read are ``text``, ``speaker``,
            ``language`` and ``translation_status``; all are optional.

    Returns:
        The rendered transcript, or ``""`` when ``tokens`` is empty.
    """
    if not tokens:
        return ""

    text_parts: List[str] = []
    current_speaker: Optional[Any] = None
    current_language: Optional[Any] = None

    for token in tokens:
        # `dict.get("text", "")` still yields None when the key is present
        # with a null value, and "".join() rejects non-strings. Normalise
        # here so a single odd token cannot abort the whole rendering.
        raw_text = token.get("text")
        if raw_text is None:
            text = ""
        elif isinstance(raw_text, str):
            text = raw_text
        else:
            text = str(raw_text)

        speaker = token.get("speaker")
        language = token.get("language")
        is_translation = token.get("translation_status") == "translation"

        # Speaker changed -> emit a speaker tag.
        if speaker is not None and speaker != current_speaker:
            if current_speaker is not None:
                text_parts.append("\n\n")
            current_speaker = speaker
            # Reset the language so the new speaker's block always starts
            # with its own language tag.
            current_language = None
            text_parts.append(f"Speaker {current_speaker}:")

        # Language changed -> emit a language (or translation) tag.
        if language is not None and language != current_language:
            current_language = language
            prefix = "[Translation] " if is_translation else ""
            text_parts.append(f"\n{prefix}[{current_language}] ")
            # The tag already ends with a space; avoid doubling it.
            text = text.lstrip()

        text_parts.append(text)

    return "".join(text_parts)
# ---------------------------------------------------------------------------
# SRT / VTT subtitle rendering
# ---------------------------------------------------------------------------
# Maximum number of tokens to group into a single subtitle cue.
# `_group_tokens_into_cues` starts a new cue when the current one already
# holds this many tokens.
_CUE_MAX_TOKENS: int = 15
# Maximum duration (in ms) for a single cue before forcing a break. Measured
# from the start of the cue's first token to the start of the incoming token.
_CUE_MAX_DURATION_MS: int = 5000
def _format_timestamp_srt(ms: int) -> str:
"""Format milliseconds as SRT timestamp: HH:MM:SS,mmm"""
if ms < 0:
ms = 0
hours = ms // 3_600_000
ms %= 3_600_000
minutes = ms // 60_000
ms %= 60_000
seconds = ms // 1_000
millis = ms % 1_000
return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
def _format_timestamp_vtt(ms: int) -> str:
"""Format milliseconds as VTT timestamp: HH:MM:SS.mmm"""
if ms < 0:
ms = 0
hours = ms // 3_600_000
ms %= 3_600_000
minutes = ms // 60_000
ms %= 60_000
seconds = ms // 1_000
millis = ms % 1_000
return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{millis:03d}"
def _group_tokens_into_cues(
tokens: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""
Group Soniox tokens into subtitle cues.
Each cue has:
- start_ms: int
- end_ms: int
- text: str
Grouping heuristics:
- A new cue starts when token count exceeds _CUE_MAX_TOKENS.
- A new cue starts when duration exceeds _CUE_MAX_DURATION_MS.
- A new cue starts when the speaker changes (if diarization is on).
- Tokens without timestamps are appended to the current cue.
"""
cues: List[Dict[str, Any]] = []
current_tokens: List[str] = []
current_start: Optional[int] = None
current_end: Optional[int] = None
current_speaker: Optional[Any] = None
def _flush() -> None:
if current_tokens and current_start is not None:
text = "".join(current_tokens).strip()
if text:
cues.append(
{
"start_ms": current_start,
"end_ms": (
current_end if current_end is not None else current_start
),
"text": text,
}
)
for token in tokens:
start_ms = token.get("start_ms")
end_ms = token.get("end_ms")
text = token.get("text", "")
speaker = token.get("speaker")
# Skip tokens with no timestamp data entirely if we have no cue started
if start_ms is None and current_start is None:
continue
# Speaker change forces a new cue
if speaker is not None and speaker != current_speaker:
_flush()
current_tokens = []
current_start = start_ms
current_end = end_ms
current_speaker = speaker
current_tokens.append(text)
continue
# Duration or token count exceeded -> flush
should_break = False
if len(current_tokens) >= _CUE_MAX_TOKENS:
should_break = True
elif (
current_start is not None
and start_ms is not None
and (start_ms - current_start) >= _CUE_MAX_DURATION_MS
):
should_break = True
if should_break:
_flush()
current_tokens = []
current_start = start_ms
current_end = end_ms
current_tokens.append(text)
else:
if current_start is None:
current_start = start_ms
if end_ms is not None:
current_end = end_ms
current_tokens.append(text)
_flush()
return cues
def render_soniox_tokens_as_srt(tokens: List[Dict[str, Any]]) -> str:
    """
    Render Soniox tokens as an SRT (SubRip) subtitle document.

    Each cue produced by ``_group_tokens_into_cues`` becomes a 1-based index
    line, a ``start --> end`` timing line, the cue text and a blank
    separator line.

    Returns an empty string when grouping yields no cues (for example when
    no token carries timestamp data).
    """
    cues = _group_tokens_into_cues(tokens)
    if not cues:
        return ""

    out: List[str] = []
    for number, cue in enumerate(cues, start=1):
        begin = _format_timestamp_srt(cue["start_ms"])
        finish = _format_timestamp_srt(cue["end_ms"])
        # The trailing "" produces the blank line that separates cues.
        out.extend((str(number), f"{begin} --> {finish}", cue["text"], ""))
    return "\n".join(out)
def render_soniox_tokens_as_vtt(tokens: List[Dict[str, Any]]) -> str:
    """
    Render Soniox tokens as a WebVTT subtitle document.

    The output always begins with the ``WEBVTT`` header followed by a blank
    line, even when grouping yields no cues. Each cue then contributes a
    ``start --> end`` timing line, the cue text and a blank separator line.
    """
    out: List[str] = ["WEBVTT", ""]
    for cue in _group_tokens_into_cues(tokens):
        begin = _format_timestamp_vtt(cue["start_ms"])
        finish = _format_timestamp_vtt(cue["end_ms"])
        # The trailing "" produces the blank line that separates cues.
        out.extend((f"{begin} --> {finish}", cue["text"], ""))
    return "\n".join(out)