# MoFin/venv/lib/python3.12/site-packages/litellm/llms/soniox/common_utils.py

"""
Shared utilities for the Soniox provider (https://soniox.com).
"""

from typing import Any, Dict, List, Optional

from litellm.llms.base_llm.chat.transformation import BaseLLMException

# Base URL of the public Soniox API. Used as the last-resort default when no
# explicit `api_base` argument or `SONIOX_API_BASE` secret is available.
SONIOX_API_BASE: str = "https://api.soniox.com"

# Default polling interval in seconds when waiting for an async transcription
# to finish. Mirrors the Soniox SDK default.
SONIOX_DEFAULT_POLL_INTERVAL: float = 1.0

# Minimum polling interval (in seconds) the server will accept from caller-
# supplied `soniox_polling_interval` kwargs. Prevents an authenticated caller
# from forcing a worker into a tight poll loop with a zero/near-zero interval.
SONIOX_MIN_POLL_INTERVAL: float = 0.5

# Maximum polling interval (in seconds). Prevents a caller from setting an
# excessively large or non-finite interval that would keep a worker sleeping
# far longer than necessary between status checks.
SONIOX_MAX_POLL_INTERVAL: float = 60.0

# Default maximum number of polling attempts. With the default 1s interval
# this is 1800 attempts * 1s ~= 30 minutes of polling.
SONIOX_DEFAULT_MAX_POLL_ATTEMPTS: int = 1800

# Hard upper bound on polling attempts, preventing a caller from pinning a
# worker indefinitely via a huge attempt count. At `SONIOX_MIN_POLL_INTERVAL`
# this corresponds to 6000 * 0.5s = ~3000s (50 minutes) of polling.
# NOTE(review): at `SONIOX_MAX_POLL_INTERVAL` the same attempt count allows
# 6000 * 60s = 100 hours, so this constant alone does not cap wall-clock time;
# confirm whether the polling loop enforces a separate overall timeout.
SONIOX_MAX_POLL_ATTEMPTS: int = 6000

# Default cleanup behaviour: delete both the uploaded file (if any) and the
# transcription record after the transcript has been fetched.
# NOTE: this is a module-level mutable list; copy it before modifying so the
# shared default is not changed for other callers.
SONIOX_DEFAULT_CLEANUP: List[str] = ["file", "transcription"]

# Body fields that may carry secrets and must be redacted before being
# forwarded to logging callbacks. Soniox accepts a webhook auth header value
# alongside the create-transcription request; that value lets the recipient
# authenticate webhook callbacks and must not leak into observability sinks.
SONIOX_SECRET_FIELDS: List[str] = ["webhook_auth_header_value"]


class SonioxException(BaseLLMException):
    """Exception type specific to the Soniox provider.

    Adds no attributes or behaviour on top of `BaseLLMException`; it exists so
    Soniox errors can be told apart from other providers' by class.
    """


def get_soniox_api_key(api_key: Optional[str] = None) -> Optional[str]:
    """Return the Soniox API key to use for a request.

    A non-empty `api_key` argument takes precedence. Otherwise the
    `SONIOX_API_KEY` secret is looked up through `get_secret_str`, and
    whatever that lookup yields is returned.
    """
    # Imported lazily: litellm.secret_managers.main imports from litellm at
    # module level, so importing it at the top of this file would be circular.
    from litellm.secret_managers.main import get_secret_str

    if api_key:
        return api_key
    return get_secret_str("SONIOX_API_KEY")


def get_soniox_api_base(api_base: Optional[str] = None) -> str:
    """Return the Soniox API base URL without any trailing slash.

    Resolution order: the `api_base` argument, then the `SONIOX_API_BASE`
    secret, then the module-level `SONIOX_API_BASE` default. Empty values are
    treated as missing at every step.
    """
    # Imported lazily to avoid a circular import with the litellm package.
    from litellm.secret_managers.main import get_secret_str

    resolved = api_base
    if not resolved:
        resolved = get_secret_str("SONIOX_API_BASE")
    if not resolved:
        resolved = SONIOX_API_BASE
    return resolved.rstrip("/")


def render_soniox_tokens(tokens: List[Dict[str, Any]]) -> str:
    """
    Render a list of Soniox tokens to a readable transcript string.

    Intended to mirror the official Soniox SDK's `renderTokens` helper:
    - When the speaker changes, a `Speaker N:` tag is inserted (preceded by a
      blank line for every speaker after the first).
    - When the language changes, a `[lang]` (or `[Translation] [lang]`) tag is
      inserted on a new line, and leading whitespace is stripped from the
      token text that follows the tag.

    If neither speaker nor language information is present on any token (i.e.
    diarization and language identification are disabled), the function simply
    concatenates the token texts.

    Args:
        tokens: Token dicts. The keys read are `text`, `speaker`, `language`
            and `translation_status`; all are optional.

    Returns:
        The rendered transcript, or `""` when `tokens` is empty.
    """
    if not tokens:
        return ""

    text_parts: List[str] = []
    current_speaker: Optional[Any] = None
    current_language: Optional[Any] = None

    for token in tokens:
        # `dict.get("text", "")` still returns None when the key is present
        # with a null value, and "".join() rejects anything that is not a
        # str. Normalise up front so a single odd token cannot crash rendering.
        text = token.get("text")
        if text is None:
            text = ""
        elif not isinstance(text, str):
            text = str(text)

        speaker = token.get("speaker")
        language = token.get("language")
        is_translation = token.get("translation_status") == "translation"

        # Speaker changed -> emit a speaker tag.
        if speaker is not None and speaker != current_speaker:
            if current_speaker is not None:
                text_parts.append("\n\n")
            current_speaker = speaker
            # Reset so the language tag is repeated under the new speaker.
            current_language = None
            text_parts.append(f"Speaker {current_speaker}:")

        # Language changed -> emit a language (or translation) tag.
        if language is not None and language != current_language:
            current_language = language
            prefix = "[Translation] " if is_translation else ""
            text_parts.append(f"\n{prefix}[{current_language}] ")
            # The tag already ends with a space; avoid doubling it.
            text = text.lstrip()

        text_parts.append(text)

    return "".join(text_parts)


# ---------------------------------------------------------------------------
# SRT / VTT subtitle rendering
# ---------------------------------------------------------------------------

# Maximum number of tokens to group into a single subtitle cue.
_CUE_MAX_TOKENS: int = 15

# Maximum duration (in ms) for a single cue before forcing a break.
_CUE_MAX_DURATION_MS: int = 5000


def _format_timestamp_srt(ms: int) -> str:
    """Return `ms` milliseconds as an SRT timestamp (`HH:MM:SS,mmm`).

    Negative inputs are clamped to zero. The hours field is at least two
    digits wide and grows as needed for very long durations.
    """
    remaining = 0 if ms < 0 else ms
    total_seconds, millis = divmod(remaining, 1_000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"


def _format_timestamp_vtt(ms: int) -> str:
    """Return `ms` milliseconds as a WebVTT timestamp (`HH:MM:SS.mmm`).

    Negative inputs are clamped to zero. The hours field is at least two
    digits wide and grows as needed for very long durations.
    """
    remaining = 0 if ms < 0 else ms
    total_seconds, millis = divmod(remaining, 1_000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{millis:03d}"


def _group_tokens_into_cues(
    tokens: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    Group Soniox tokens into subtitle cues.

    Each cue has:
      - start_ms: int
      - end_ms: int (falls back to start_ms when no end time was seen)
      - text: str (concatenated token texts, stripped)

    Grouping heuristics:
      - A new cue starts when the current cue already holds _CUE_MAX_TOKENS
        tokens.
      - A new cue starts when a token begins _CUE_MAX_DURATION_MS or more
        after the current cue's start.
      - A new cue starts when the speaker changes (if diarization is on).
      - Tokens without `start_ms` never open or split a cue: they are dropped
        while no cue is open, and otherwise appended to the current cue.

    Cues whose text is empty after stripping are not emitted.

    Args:
        tokens: Token dicts. The keys read are `text`, `start_ms`, `end_ms`
            and `speaker`; all are optional.

    Returns:
        The cues in input order.
    """
    cues: List[Dict[str, Any]] = []
    current_tokens: List[str] = []
    current_start: Optional[int] = None
    current_end: Optional[int] = None
    current_speaker: Optional[Any] = None

    def _flush() -> None:
        # Reads the enclosing cue state at call time and emits it as a cue.
        if current_tokens and current_start is not None:
            text = "".join(current_tokens).strip()
            if text:
                cues.append(
                    {
                        "start_ms": current_start,
                        "end_ms": (
                            current_end if current_end is not None else current_start
                        ),
                        "text": text,
                    }
                )

    for token in tokens:
        start_ms = token.get("start_ms")
        end_ms = token.get("end_ms")
        speaker = token.get("speaker")

        # A null or non-string text would make "".join() in _flush raise.
        text = token.get("text")
        if text is None:
            text = ""
        elif not isinstance(text, str):
            text = str(text)

        if start_ms is None:
            # Without a start time the token cannot anchor a cue. Letting it
            # trigger a break would open a cue whose start is None, which
            # _flush discards, silently losing the text.
            if current_start is None:
                continue  # no cue open yet: nothing to attach to
            if end_ms is not None:
                current_end = end_ms
            current_tokens.append(text)
            continue

        speaker_changed = speaker is not None and speaker != current_speaker
        too_many_tokens = len(current_tokens) >= _CUE_MAX_TOKENS
        too_long = (
            current_start is not None
            and (start_ms - current_start) >= _CUE_MAX_DURATION_MS
        )

        if speaker_changed or too_many_tokens or too_long:
            # Close the running cue and open a new one at this token.
            _flush()
            current_tokens = [text]
            current_start = start_ms
            current_end = end_ms
            if speaker_changed:
                current_speaker = speaker
            continue

        if current_start is None:
            current_start = start_ms
        if end_ms is not None:
            current_end = end_ms
        current_tokens.append(text)

    _flush()
    return cues


def render_soniox_tokens_as_srt(tokens: List[Dict[str, Any]]) -> str:
    """
    Render Soniox tokens as an SRT (SubRip) subtitle document.

    Tokens are grouped into cues by `_group_tokens_into_cues`. Each cue is
    written as a 1-based sequence number, a `start --> end` timing line, the
    cue text, and a trailing blank line.

    Returns an empty string when grouping produces no cues.
    """
    grouped = _group_tokens_into_cues(tokens)
    if not grouped:
        return ""

    out: List[str] = []
    number = 0
    for cue in grouped:
        number += 1
        begin = _format_timestamp_srt(cue["start_ms"])
        finish = _format_timestamp_srt(cue["end_ms"])
        # The empty entry yields the blank separator line after each cue.
        out += [str(number), f"{begin} --> {finish}", cue["text"], ""]

    return "\n".join(out)


def render_soniox_tokens_as_vtt(tokens: List[Dict[str, Any]]) -> str:
    """
    Render Soniox tokens as a WebVTT subtitle document.

    The output always begins with the `WEBVTT` header line followed by a
    blank line, so a header-only document is returned when grouping produces
    no cues. Each cue is written as a `start --> end` timing line, the cue
    text, and a trailing blank line.
    """
    out: List[str] = ["WEBVTT", ""]
    for cue in _group_tokens_into_cues(tokens):
        begin = _format_timestamp_vtt(cue["start_ms"])
        finish = _format_timestamp_vtt(cue["end_ms"])
        # The empty entry yields the blank separator line after each cue.
        out.extend((f"{begin} --> {finish}", cue["text"], ""))

    return "\n".join(out)