MoFin/venv/lib/python3.12/site-packages/litellm/llms/anthropic/chat/handler.py

"""
Calling + translation logic for anthropic's `/v1/messages` endpoint
"""

import copy
import json
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    List,
    Literal,
    Optional,
    Tuple,
    Union,
    cast,
)

import httpx  # type: ignore

import litellm
import litellm.litellm_core_utils
import litellm.types
import litellm.types.utils
from litellm.anthropic_beta_headers_manager import (
    update_request_with_filtered_beta,
)
from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.custom_httpx.http_handler import (
    AsyncHTTPHandler,
    HTTPHandler,
    _get_httpx_client,
    get_async_httpx_client,
)
from litellm.types.llms.anthropic import (
    ContentBlockDelta,
    ContentBlockStart,
    ContentBlockStop,
    MessageBlockDelta,
    MessageStartBlock,
    UsageDelta,
)
from litellm.types.llms.openai import (
    ChatCompletionRedactedThinkingBlock,
    ChatCompletionThinkingBlock,
    ChatCompletionToolCallChunk,
    ChatCompletionToolCallFunctionChunk,
)
from litellm.types.responses.main import (
    OutputCodeInterpreterCall,
    build_code_interpreter_log_outputs,
)
from litellm.types.utils import (
    Delta,
    GenericStreamingChunk,
    LlmProviders,
    ModelResponse,
    ModelResponseStream,
    StreamingChoices,
    Usage,
    _generate_id,
)

from ...base import BaseLLM
from ..common_utils import AnthropicError, process_anthropic_headers
from .transformation import ANTHROPIC_TOOL_NAME_REVERSE_MAP_KEY, AnthropicConfig

if TYPE_CHECKING:
    from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
    from litellm.llms.base_llm.chat.transformation import BaseConfig


async def make_call(
    client: Optional[AsyncHTTPHandler],
    api_base: str,
    headers: dict,
    data: str,
    model: str,
    messages: list,
    logging_obj,
    timeout: Optional[Union[float, httpx.Timeout]],
    json_mode: bool,
    speed: Optional[str] = None,
    tool_name_reverse_map: Optional[Dict[str, str]] = None,
) -> Tuple[Any, httpx.Headers]:
    if client is None:
        client = litellm.module_level_aclient

    try:
        response = await client.post(
            api_base,
            headers=headers,
            data=data,
            stream=True,
            timeout=timeout,
            logging_obj=logging_obj,
        )
    except httpx.HTTPStatusError as e:
        error_headers = getattr(e, "headers", None)
        error_response = getattr(e, "response", None)
        if error_headers is None and error_response:
            error_headers = getattr(error_response, "headers", None)
        raise AnthropicError(
            status_code=e.response.status_code,
            message=await e.response.aread(),
            headers=error_headers,
        )
    except Exception as e:
        for exception in litellm.LITELLM_EXCEPTION_TYPES:
            if isinstance(e, exception):
                raise e
        raise AnthropicError(status_code=500, message=str(e))

    completion_stream = ModelResponseIterator(
        streaming_response=response.aiter_lines(),
        sync_stream=False,
        json_mode=json_mode,
        speed=speed,
        tool_name_reverse_map=tool_name_reverse_map,
    )

    # LOGGING
    logging_obj.post_call(
        input=messages,
        api_key="",
        original_response=completion_stream,  # Pass the completion stream for logging
        additional_args={"complete_input_dict": data},
    )

    return completion_stream, response.headers


def make_sync_call(
    client: Optional[HTTPHandler],
    api_base: str,
    headers: dict,
    data: str,
    model: str,
    messages: list,
    logging_obj,
    timeout: Optional[Union[float, httpx.Timeout]],
    json_mode: bool,
    speed: Optional[str] = None,
    tool_name_reverse_map: Optional[Dict[str, str]] = None,
) -> Tuple[Any, httpx.Headers]:
    if client is None:
        client = litellm.module_level_client  # re-use a module level client

    try:
        response = client.post(
            api_base,
            headers=headers,
            data=data,
            stream=True,
            timeout=timeout,
            logging_obj=logging_obj,
        )
    except httpx.HTTPStatusError as e:
        error_headers = getattr(e, "headers", None)
        error_response = getattr(e, "response", None)
        if error_headers is None and error_response:
            error_headers = getattr(error_response, "headers", None)
        raise AnthropicError(
            status_code=e.response.status_code,
            message=e.response.read(),
            headers=error_headers,
        )
    except Exception as e:
        for exception in litellm.LITELLM_EXCEPTION_TYPES:
            if isinstance(e, exception):
                raise e
        raise AnthropicError(status_code=500, message=str(e))

    if response.status_code != 200:
        response_headers = getattr(response, "headers", None)
        raise AnthropicError(
            status_code=response.status_code,
            message=response.read(),
            headers=response_headers,
        )

    completion_stream = ModelResponseIterator(
        streaming_response=response.iter_lines(),
        sync_stream=True,
        json_mode=json_mode,
        speed=speed,
        tool_name_reverse_map=tool_name_reverse_map,
    )

    # LOGGING
    logging_obj.post_call(
        input=messages,
        api_key="",
        original_response="first stream response received",
        additional_args={"complete_input_dict": data},
    )

    return completion_stream, response.headers


class AnthropicChatCompletion(BaseLLM):
    def __init__(self) -> None:
        super().__init__()

    async def acompletion_stream_function(
        self,
        model: str,
        messages: list,
        api_base: str,
        custom_prompt_dict: dict,
        model_response: ModelResponse,
        print_verbose: Callable,
        timeout: Union[float, httpx.Timeout],
        client: Optional[AsyncHTTPHandler],
        encoding,
        api_key,
        logging_obj,
        stream,
        _is_function_call,
        data: dict,
        json_mode: bool,
        optional_params=None,
        litellm_params=None,
        logger_fn=None,
        headers={},
    ):
        from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper

        data["stream"] = True

        completion_stream, headers = await make_call(
            client=client,
            api_base=api_base,
            headers=headers,
            data=json.dumps(data),
            model=model,
            messages=messages,
            logging_obj=logging_obj,
            timeout=timeout,
            json_mode=json_mode,
            speed=optional_params.get("speed") if optional_params else None,
            tool_name_reverse_map=(
                litellm_params.get(ANTHROPIC_TOOL_NAME_REVERSE_MAP_KEY)
                if isinstance(litellm_params, dict)
                else None
            ),
        )
        streamwrapper = CustomStreamWrapper(
            completion_stream=completion_stream,
            model=model,
            custom_llm_provider="anthropic",
            logging_obj=logging_obj,
            _response_headers=process_anthropic_headers(headers),
        )
        return streamwrapper

    async def acompletion_function(
        self,
        model: str,
        messages: list,
        api_base: str,
        custom_prompt_dict: dict,
        model_response: ModelResponse,
        print_verbose: Callable,
        timeout: Union[float, httpx.Timeout],
        encoding,
        api_key,
        logging_obj,
        stream,
        _is_function_call,
        data: dict,
        optional_params: dict,
        json_mode: bool,
        litellm_params: dict,
        provider_config: "BaseConfig",
        logger_fn=None,
        headers={},
        client: Optional[AsyncHTTPHandler] = None,
    ) -> Union[ModelResponse, "CustomStreamWrapper"]:
        async_handler = client or get_async_httpx_client(
            llm_provider=litellm.LlmProviders.ANTHROPIC
        )

        try:
            response = await async_handler.post(
                api_base,
                headers=headers,
                json=data,
                timeout=timeout,
                logging_obj=logging_obj,
            )
        except Exception as e:
            ## LOGGING
            logging_obj.post_call(
                input=messages,
                api_key=api_key,
                original_response=str(e),
                additional_args={"complete_input_dict": data},
            )
            status_code = getattr(e, "status_code", 500)
            error_headers = getattr(e, "headers", None)
            error_text = getattr(e, "text", str(e))
            error_response = getattr(e, "response", None)
            if error_headers is None and error_response:
                error_headers = getattr(error_response, "headers", None)
            if error_response and hasattr(error_response, "text"):
                error_text = getattr(error_response, "text", error_text)
            raise AnthropicError(
                message=error_text,
                status_code=status_code,
                headers=error_headers,
            )

        return provider_config.transform_response(
            model=model,
            raw_response=response,
            model_response=model_response,
            logging_obj=logging_obj,
            api_key=api_key,
            request_data=data,
            messages=messages,
            optional_params=optional_params,
            litellm_params=litellm_params,
            encoding=encoding,
            json_mode=json_mode,
        )

    def completion(
        self,
        model: str,
        messages: list,
        api_base: str,
        custom_llm_provider: str,
        custom_prompt_dict: dict,
        model_response: ModelResponse,
        print_verbose: Callable,
        encoding,
        api_key,
        logging_obj,
        optional_params: dict,
        timeout: Union[float, httpx.Timeout],
        litellm_params: dict,
        acompletion=None,
        logger_fn=None,
        headers={},
        client=None,
    ):
        from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
        from litellm.utils import ProviderConfigManager

        optional_params = copy.deepcopy(optional_params)
        stream = optional_params.pop("stream", None)
        json_mode: bool = optional_params.pop("json_mode", False)
        is_vertex_request: bool = optional_params.pop("is_vertex_request", False)
        optional_params.pop("vertex_count_tokens_location", None)
        _is_function_call = False
        messages = copy.deepcopy(messages)
        headers = AnthropicConfig().validate_environment(
            api_key=api_key,
            headers=headers,
            model=model,
            messages=messages,
            optional_params={**optional_params, "is_vertex_request": is_vertex_request},
            litellm_params=litellm_params,
        )

        config = ProviderConfigManager.get_provider_chat_config(
            model=model,
            provider=LlmProviders(custom_llm_provider),
        )
        if config is None:
            raise ValueError(
                f"Provider config not found for model: {model} and provider: {custom_llm_provider}"
            )

        data = config.transform_request(
            model=model,
            messages=messages,
            optional_params={**optional_params, "is_vertex_request": is_vertex_request},
            litellm_params=litellm_params,
            headers=headers,
        )

        headers, data = update_request_with_filtered_beta(
            headers=headers,
            request_data=data,
            provider=custom_llm_provider,
        )

        ## LOGGING
        logging_obj.pre_call(
            input=messages,
            api_key=api_key,
            additional_args={
                "complete_input_dict": data,
                "api_base": api_base,
                "headers": headers,
            },
        )
        print_verbose(f"_is_function_call: {_is_function_call}")
        if acompletion is True:
            if (
                stream is True
            ):  # if function call - fake the streaming (need complete blocks for output parsing in openai format)
                print_verbose("makes async anthropic streaming POST request")
                data["stream"] = stream
                return self.acompletion_stream_function(
                    model=model,
                    messages=messages,
                    data=data,
                    api_base=api_base,
                    custom_prompt_dict=custom_prompt_dict,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    encoding=encoding,
                    api_key=api_key,
                    logging_obj=logging_obj,
                    optional_params=optional_params,
                    stream=stream,
                    _is_function_call=_is_function_call,
                    json_mode=json_mode,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    headers=headers,
                    timeout=timeout,
                    client=(
                        client
                        if client is not None and isinstance(client, AsyncHTTPHandler)
                        else None
                    ),
                )
            else:
                return self.acompletion_function(
                    model=model,
                    messages=messages,
                    data=data,
                    api_base=api_base,
                    custom_prompt_dict=custom_prompt_dict,
                    model_response=model_response,
                    print_verbose=print_verbose,
                    encoding=encoding,
                    api_key=api_key,
                    provider_config=config,
                    logging_obj=logging_obj,
                    optional_params=optional_params,
                    stream=stream,
                    _is_function_call=_is_function_call,
                    litellm_params=litellm_params,
                    logger_fn=logger_fn,
                    headers=headers,
                    client=client,
                    json_mode=json_mode,
                    timeout=timeout,
                )
        else:
            ## COMPLETION CALL
            if (
                stream is True
            ):  # if function call - fake the streaming (need complete blocks for output parsing in openai format)
                data["stream"] = stream
                completion_stream, headers = make_sync_call(
                    client=client,
                    api_base=api_base,
                    headers=headers,  # type: ignore
                    data=json.dumps(data),
                    model=model,
                    messages=messages,
                    logging_obj=logging_obj,
                    timeout=timeout,
                    json_mode=json_mode,
                    speed=optional_params.get("speed") if optional_params else None,
                    tool_name_reverse_map=(
                        litellm_params.get(ANTHROPIC_TOOL_NAME_REVERSE_MAP_KEY)
                        if isinstance(litellm_params, dict)
                        else None
                    ),
                )
                return CustomStreamWrapper(
                    completion_stream=completion_stream,
                    model=model,
                    custom_llm_provider="anthropic",
                    logging_obj=logging_obj,
                    _response_headers=process_anthropic_headers(headers),
                )

            else:
                if client is None or not isinstance(client, HTTPHandler):
                    client = _get_httpx_client(params={"timeout": timeout})
                else:
                    client = client

                try:
                    response = client.post(
                        api_base,
                        headers=headers,
                        data=json.dumps(data),
                        timeout=timeout,
                        logging_obj=logging_obj,
                    )
                except Exception as e:
                    status_code = getattr(e, "status_code", 500)
                    error_headers = getattr(e, "headers", None)
                    error_text = getattr(e, "text", str(e))
                    error_response = getattr(e, "response", None)
                    if error_headers is None and error_response:
                        error_headers = getattr(error_response, "headers", None)
                    if error_response and hasattr(error_response, "text"):
                        error_text = getattr(error_response, "text", error_text)
                    raise AnthropicError(
                        message=error_text,
                        status_code=status_code,
                        headers=error_headers,
                    )

        return config.transform_response(
            model=model,
            raw_response=response,
            model_response=model_response,
            logging_obj=logging_obj,
            api_key=api_key,
            request_data=data,
            messages=messages,
            optional_params=optional_params,
            litellm_params=litellm_params,
            encoding=encoding,
            json_mode=json_mode,
        )

    def embedding(self):
        # logic for parsing in - calling - parsing out model embedding calls
        pass


class ModelResponseIterator:
    def __init__(
        self,
        streaming_response,
        sync_stream: bool,
        json_mode: Optional[bool] = False,
        speed: Optional[str] = None,
        tool_name_reverse_map: Optional[Dict[str, str]] = None,
    ):
        self.streaming_response = streaming_response
        self.response_iterator = self.streaming_response
        self.content_blocks: List[ContentBlockDelta] = []
        self.tool_index = -1
        self.json_mode = json_mode
        self.speed = speed
        # rewritten-name -> caller's original. Built per-request from the
        # forward map in AnthropicConfig._build_request_tool_name_maps; only
        # contains entries we actually rewrote, so a tool legitimately named
        # `foo_bar` is *not* reverse-mapped just because some other tool was
        # rewritten to `foo_bar` in a different request. Empty/None is the
        # common case (no '/' or other invalid chars in any tool name).
        self.tool_name_reverse_map: Dict[str, str] = tool_name_reverse_map or {}
        # Generate response ID once per stream to match OpenAI-compatible behavior
        self.response_id = _generate_id()

        # Track if we're currently streaming a response_format tool
        self.is_response_format_tool: bool = False
        # Track if we've converted any response_format tools (affects finish_reason)
        self.converted_response_format_tool: bool = False

        # For handling partial JSON chunks from fragmentation
        # See: https://github.com/BerriAI/litellm/issues/17473
        self.accumulated_json: str = ""
        self.chunk_type: Literal["valid_json", "accumulated_json"] = "valid_json"

        # Track current content block type to avoid emitting tool calls for non-tool blocks
        # See: https://github.com/BerriAI/litellm/issues/17254
        self.current_content_block_type: Optional[str] = None

        # Accumulate web_search_tool_result blocks for multi-turn reconstruction
        # See: https://github.com/BerriAI/litellm/issues/17737
        self.web_search_results: List[Dict[str, Any]] = []

        # Accumulate compaction blocks for multi-turn reconstruction
        self.compaction_blocks: List[Dict[str, Any]] = []

        # Accumulate streamed thinking text so final usage can split reasoning
        # tokens from regular output tokens.
        self.reasoning_content_chunks: List[str] = []

        # Track server tool use inputs and results for code_interpreter_results
        self._server_tool_inputs: Dict[str, Any] = {}
        self.tool_results: List[Dict[str, Any]] = []
        self._current_server_tool_id: Optional[str] = None
        self._container_id: Optional[str] = None

    def check_empty_tool_call_args(self) -> bool:
        """
        Check if the tool call block so far has been an empty string
        """
        args = ""
        # if text content block -> skip
        if len(self.content_blocks) == 0:
            return False

        if (
            self.content_blocks[0]["delta"]["type"] == "text_delta"
            or self.content_blocks[0]["delta"]["type"] == "thinking_delta"
        ):
            return False

        for block in self.content_blocks:
            if block["delta"]["type"] == "input_json_delta":
                args += block["delta"].get("partial_json", "")  # type: ignore

        if len(args) == 0:
            return True
        return False

    def _handle_usage(self, anthropic_usage_chunk: Union[dict, UsageDelta]) -> Usage:
        reasoning_content = (
            "".join(self.reasoning_content_chunks)
            if self.reasoning_content_chunks
            else None
        )
        return AnthropicConfig().calculate_usage(
            usage_object=cast(dict, anthropic_usage_chunk),
            reasoning_content=reasoning_content,
            speed=self.speed,
        )

    def _content_block_delta_helper(self, chunk: dict) -> Tuple[
        str,
        Optional[ChatCompletionToolCallChunk],
        List[Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]],
        Dict[str, Any],
    ]:
        """
        Helper function to handle the content block delta
        """
        text = ""
        tool_use: Optional[ChatCompletionToolCallChunk] = None
        provider_specific_fields = {}
        content_block = ContentBlockDelta(**chunk)  # type: ignore
        thinking_blocks: List[
            Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
        ] = []

        self.content_blocks.append(content_block)
        if "text" in content_block["delta"]:
            text = content_block["delta"]["text"]
        elif "partial_json" in content_block["delta"]:
            # Only emit tool calls if we're in a tool_use or server_tool_use block
            # web_search_tool_result blocks also have input_json_delta but should not be treated as tool calls
            # See: https://github.com/BerriAI/litellm/issues/17254
            if self.current_content_block_type in ("tool_use", "server_tool_use"):
                tool_use = cast(
                    ChatCompletionToolCallChunk,
                    {
                        "id": None,
                        "type": "function",
                        "function": {
                            "name": None,
                            "arguments": content_block["delta"]["partial_json"],
                        },
                        "index": self.tool_index,
                    },
                )
        elif "citation" in content_block["delta"]:
            provider_specific_fields["citation"] = content_block["delta"]["citation"]
        elif (
            "thinking" in content_block["delta"]
            or "signature" in content_block["delta"]
        ):
            thinking_content = content_block["delta"].get("thinking")
            if isinstance(thinking_content, str) and thinking_content:
                self.reasoning_content_chunks.append(thinking_content)
            thinking_blocks = [
                ChatCompletionThinkingBlock(
                    type="thinking",
                    thinking=thinking_content or "",
                    signature=str(content_block["delta"].get("signature") or ""),
                )
            ]
            provider_specific_fields["thinking_blocks"] = thinking_blocks
        elif (
            "content" in content_block["delta"]
            and content_block["delta"].get("type") == "compaction_delta"
        ):
            # Handle compaction delta
            provider_specific_fields["compaction_delta"] = {
                "type": "compaction_delta",
                "content": content_block["delta"]["content"],
            }

        return text, tool_use, thinking_blocks, provider_specific_fields

    def _handle_reasoning_content(
        self,
        thinking_blocks: List[
            Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]
        ],
    ) -> Optional[str]:
        """
        Handle the reasoning content
        """
        reasoning_content = None
        for block in thinking_blocks:
            thinking_content = cast(Optional[str], block.get("thinking"))
            if reasoning_content is None:
                reasoning_content = ""
            if thinking_content is not None:
                reasoning_content += thinking_content
        return reasoning_content

    def _handle_redacted_thinking_content(
        self,
        content_block_start: ContentBlockStart,
        provider_specific_fields: Dict[str, Any],
    ) -> Tuple[List[ChatCompletionRedactedThinkingBlock], Dict[str, Any]]:
        """
        Handle the redacted thinking content
        """
        thinking_blocks = [
            ChatCompletionRedactedThinkingBlock(
                type="redacted_thinking",
                data=content_block_start["content_block"]["data"],  # type: ignore
            )
        ]
        provider_specific_fields["thinking_blocks"] = thinking_blocks

        return thinking_blocks, provider_specific_fields

    def get_content_block_start(self, chunk: dict) -> ContentBlockStart:
        from litellm.types.llms.anthropic import (
            ContentBlockStartText,
            ContentBlockStartToolUse,
        )

        if chunk.get("content_block", {}).get("type") == "tool_use":
            content_block_start = ContentBlockStartToolUse(**chunk)  # type: ignore
        else:
            content_block_start = ContentBlockStartText(**chunk)  # type: ignore

        return content_block_start

    def _build_code_interpreter_results(self) -> list:
        """Convert accumulated tool_results to OutputCodeInterpreterCall objects.

        Called during streaming to produce provider-neutral code_interpreter_results
        alongside the raw tool_results, so the Responses API layer doesn't need
        Anthropic-specific knowledge.

        Returns the full cumulative list each time (not incremental), matching
        how web_search_results works.  stream_chunk_builder uses "last value
        wins" for list-valued provider_specific_fields keys, so the last
        emission must contain every result.
        """
        results = []
        for tr in self.tool_results:
            if tr.get("type") != "bash_code_execution_tool_result":
                continue
            call_id = tr.get("tool_use_id", "")
            content = tr.get("content", {})
            log_outputs = build_code_interpreter_log_outputs(content)
            tool_input = self._server_tool_inputs.get(call_id, {})
            code = tool_input.get("command", "") if isinstance(tool_input, dict) else ""
            results.append(
                OutputCodeInterpreterCall(
                    type="code_interpreter_call",
                    id=call_id,
                    code=code,
                    container_id=self._container_id,
                    status="completed",
                    outputs=log_outputs,
                )
            )
        return results

    def chunk_parser(self, chunk: dict) -> ModelResponseStream:
        try:
            type_chunk = chunk.get("type", "") or ""

            text = ""
            tool_use: Optional[ChatCompletionToolCallChunk] = None
            finish_reason = ""
            usage: Optional[Usage] = None
            provider_specific_fields: Dict[str, Any] = {}
            reasoning_content: Optional[str] = None
            thinking_blocks: Optional[
                List[
                    Union[
                        ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock
                    ]
                ]
            ] = None

            # Always use index=0 for OpenAI choice format (fixes multi-choice errors)
            index = 0
            if type_chunk == "content_block_delta":
                """
                Anthropic content chunk
                chunk = {'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': 'Hello'}}
                """
                (
                    text,
                    tool_use,
                    thinking_blocks,
                    provider_specific_fields,
                ) = self._content_block_delta_helper(chunk=chunk)
                if thinking_blocks:
                    reasoning_content = self._handle_reasoning_content(
                        thinking_blocks=thinking_blocks
                    )
            elif type_chunk == "content_block_start":
                """
                event: content_block_start
                data: {"type":"content_block_start","index":1,"content_block":{"type":"tool_use","id":"toolu_01T1x1fJ34qAmk2tNTrN7Up6","name":"get_weather","input":{}}}
                """

                content_block_start = self.get_content_block_start(chunk=chunk)
                self.content_blocks = []  # reset content blocks when new block starts
                # Track current content block type for filtering deltas
                self.current_content_block_type = content_block_start["content_block"][
                    "type"
                ]
                if content_block_start["content_block"]["type"] == "text":
                    text = content_block_start["content_block"]["text"]
                elif (
                    content_block_start["content_block"]["type"] == "tool_use"
                    or content_block_start["content_block"]["type"] == "server_tool_use"
                ):
                    self.tool_index += 1
                    # Reverse-map the (sanitized) tool name back to the
                    # caller's original. No-op when the map is empty.
                    _stream_tool_name = content_block_start["content_block"]["name"]
                    if (
                        self.tool_name_reverse_map
                        and _stream_tool_name in self.tool_name_reverse_map
                    ):
                        _stream_tool_name = self.tool_name_reverse_map[
                            _stream_tool_name
                        ]
                    # Use empty string for arguments in content_block_start - actual arguments
                    # come in subsequent content_block_delta chunks and get accumulated.
                    # Using str(input) here would prepend '{}' causing invalid JSON accumulation.
                    tool_use = ChatCompletionToolCallChunk(
                        id=content_block_start["content_block"]["id"],
                        type="function",
                        function=ChatCompletionToolCallFunctionChunk(
                            name=_stream_tool_name,
                            arguments="",
                        ),
                        index=self.tool_index,
                    )
                    # Track server tool use inputs for code_interpreter_results.
                    # The initial input in content_block_start is typically {}
                    # for streaming; the full input arrives via input_json_delta
                    # and is assembled at content_block_stop.
                    if (
                        content_block_start["content_block"]["type"]
                        == "server_tool_use"
                    ):
                        self._current_server_tool_id = content_block_start[
                            "content_block"
                        ]["id"]
                        tool_input = content_block_start["content_block"].get(
                            "input", {}
                        )
                        self._server_tool_inputs[self._current_server_tool_id] = (
                            tool_input
                        )
                    # Include caller information if present (for programmatic tool calling)
                    if "caller" in content_block_start["content_block"]:
                        caller_data = content_block_start["content_block"]["caller"]
                        if caller_data:
                            tool_use["caller"] = cast(Dict[str, Any], caller_data)  # type: ignore[typeddict-item]
                elif (
                    content_block_start["content_block"]["type"] == "redacted_thinking"
                ):
                    (
                        thinking_blocks,
                        provider_specific_fields,
                    ) = self._handle_redacted_thinking_content(  # type: ignore
                        content_block_start=content_block_start,
                        provider_specific_fields=provider_specific_fields,
                    )

                elif content_block_start["content_block"]["type"] == "compaction":
                    # Handle compaction blocks
                    # The full content comes in content_block_start
                    self.compaction_blocks.append(content_block_start["content_block"])
                    provider_specific_fields["compaction_blocks"] = (
                        self.compaction_blocks
                    )
                    provider_specific_fields["compaction_start"] = {
                        "type": "compaction",
                        "content": content_block_start["content_block"].get(
                            "content", ""
                        ),
                    }

                elif content_block_start["content_block"]["type"].endswith(
                    "_tool_result"
                ):
                    # Handle all tool result types (web_search, bash_code_execution, text_editor, etc.)
                    content_type = content_block_start["content_block"]["type"]

                    # Special handling for web_search_tool_result for backwards compatibility
                    if content_type == "web_search_tool_result":
                        # Capture web_search_tool_result for multi-turn reconstruction
                        # The full content comes in content_block_start, not in deltas
                        # See: https://github.com/BerriAI/litellm/issues/17737
                        self.web_search_results.append(
                            content_block_start["content_block"]
                        )
                        provider_specific_fields["web_search_results"] = (
                            self.web_search_results
                        )
                    elif content_type == "web_fetch_tool_result":
                        # Capture web_fetch_tool_result for multi-turn reconstruction
                        # The full content comes in content_block_start, not in deltas
                        # Fixes: https://github.com/BerriAI/litellm/issues/18137
                        self.web_search_results.append(
                            content_block_start["content_block"]
                        )
                        provider_specific_fields["web_search_results"] = (
                            self.web_search_results
                        )
                    elif content_type != "tool_search_tool_result":
                        # Handle other tool results (code execution, etc.)
                        # Skip tool_search_tool_result as it's internal metadata
                        self.tool_results.append(content_block_start["content_block"])
                        provider_specific_fields["tool_results"] = self.tool_results
                        # Convert to provider-neutral code_interpreter_results
                        provider_specific_fields["code_interpreter_results"] = (
                            self._build_code_interpreter_results()
                        )

            elif type_chunk == "content_block_stop":
                ContentBlockStop(**chunk)  # type: ignore
                # check if tool call content block - only for tool_use and server_tool_use blocks
                if self.current_content_block_type in ("tool_use", "server_tool_use"):
                    is_empty = self.check_empty_tool_call_args()
                    if is_empty:
                        tool_use = ChatCompletionToolCallChunk(
                            id=None,  # type: ignore[typeddict-item]
                            type="function",
                            function=ChatCompletionToolCallFunctionChunk(
                                name=None,  # type: ignore[typeddict-item]
                                arguments="{}",
                            ),
                            index=self.tool_index,
                        )
                    # Update server_tool_inputs with fully assembled input
                    # from input_json_delta chunks (content_block_start has {})
                    if (
                        self.current_content_block_type == "server_tool_use"
                        and self._current_server_tool_id
                    ):
                        args = ""
                        for block in self.content_blocks:
                            if block["delta"]["type"] == "input_json_delta":
                                partial_json = block["delta"].get("partial_json")
                                if isinstance(partial_json, str):
                                    args += partial_json
                        if args:
                            try:
                                self._server_tool_inputs[
                                    self._current_server_tool_id
                                ] = json.loads(args)
                            except (json.JSONDecodeError, TypeError):
                                pass
                        self._current_server_tool_id = None
                # Reset response_format tool tracking when block stops
                self.is_response_format_tool = False
                # Reset current content block type
                self.current_content_block_type = None
            elif type_chunk == "tool_result":
                # Handle tool_result blocks (for tool search results with tool_reference)
                # These are automatically handled by Anthropic API, we just pass them through
                pass
            elif type_chunk == "message_delta":
                finish_reason, usage, container = self._handle_message_delta(chunk)
                if container:
                    provider_specific_fields["container"] = container
                    # Store container_id and re-emit code_interpreter_results
                    # so stream_chunk_builder's last-value-wins picks up the
                    # version with container_id populated.
                    container_id = (
                        container.get("id") if isinstance(container, dict) else None
                    )
                    if container_id and self.tool_results:
                        self._container_id = container_id
                        provider_specific_fields["code_interpreter_results"] = (
                            self._build_code_interpreter_results()
                        )
            elif type_chunk == "message_start":
                """
                Anthropic
                chunk = {
                    "type": "message_start",
                    "message": {
                        "id": "msg_vrtx_011PqREFEMzd3REdCoUFAmdG",
                        "type": "message",
                        "role": "assistant",
                        "model": "claude-3-sonnet-20240229",
                        "content": [],
                        "stop_reason": null,
                        "stop_sequence": null,
                        "usage": {
                            "input_tokens": 270,
                            "output_tokens": 1
                        }
                    }
                }
                """
                message_start_block = MessageStartBlock(**chunk)  # type: ignore
                if "usage" in message_start_block["message"]:
                    usage = self._handle_usage(
                        anthropic_usage_chunk=message_start_block["message"]["usage"]
                    )
            elif type_chunk == "error":
                """
                {"type":"error","error":{"details":null,"type":"api_error","message":"Internal server error"}      }
                """
                _error_dict = chunk.get("error", {}) or {}
                message = _error_dict.get("message", None) or str(chunk)
                raise AnthropicError(
                    message=message,
                    status_code=500,  # it looks like Anthropic API does not return a status code in the chunk error - default to 500
                )

            text, tool_use = self._handle_json_mode_chunk(text=text, tool_use=tool_use)

            returned_chunk = ModelResponseStream(
                choices=[
                    StreamingChoices(
                        index=index,
                        delta=Delta(
                            content=text,
                            tool_calls=[tool_use] if tool_use is not None else None,
                            provider_specific_fields=(
                                provider_specific_fields
                                if provider_specific_fields
                                else None
                            ),
                            thinking_blocks=(
                                thinking_blocks if thinking_blocks else None
                            ),
                            reasoning_content=reasoning_content,
                        ),
                        finish_reason=finish_reason,
                    )
                ],
                usage=usage,
                id=self.response_id,
            )

            return returned_chunk

        except json.JSONDecodeError:
            raise ValueError(f"Failed to decode JSON from chunk: {chunk}")

    def _handle_json_mode_chunk(
        self, text: str, tool_use: Optional[ChatCompletionToolCallChunk]
    ) -> Tuple[str, Optional[ChatCompletionToolCallChunk]]:
        """
        If JSON mode is enabled, convert the tool call to a message.

        Anthropic returns the JSON schema as part of the tool call
        OpenAI returns the JSON schema as part of the content, this handles placing it in the content

        Tool streaming follows Anthropic's fine-grained streaming pattern:
        - content_block_start: Contains complete tool info (id, name, empty arguments)
        - content_block_delta: Contains argument deltas (partial_json)
        - content_block_stop: Signals end of tool

        Reference: https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/fine-grained-tool-streaming

        Args:
            text: str
            tool_use: Optional[ChatCompletionToolCallChunk]
        Returns:
            Tuple[str, Optional[ChatCompletionToolCallChunk]]

            text: The text to use in the content
            tool_use: The ChatCompletionToolCallChunk to use in the chunk response
        """
        if not self.json_mode or tool_use is None:
            return text, tool_use

        # Check if this is a new tool call (has id)
        if tool_use.get("id") is not None:
            # New tool call from content_block_start - tool name is always complete here
            # (per Anthropic's fine-grained streaming pattern)
            tool_name = tool_use.get("function", {}).get("name", "")
            self.is_response_format_tool = tool_name == RESPONSE_FORMAT_TOOL_NAME

        # Convert tool to content if we're tracking a response_format tool
        if self.is_response_format_tool:
            message = AnthropicConfig._convert_tool_response_to_message(
                tool_calls=[tool_use]
            )
            if message is not None:
                text = message.content or ""
                tool_use = None
                # Track that we converted a response_format tool
                self.converted_response_format_tool = True

        return text, tool_use

    def _handle_message_delta(
        self, chunk: dict
    ) -> Tuple[str, Optional[Usage], Optional[Dict[str, Any]]]:
        """
        Handle message_delta event for finish_reason, usage, and container.

        Args:
            chunk: The message_delta chunk

        Returns:
            Tuple of (finish_reason, usage, container)
        """
        message_delta = MessageBlockDelta(**chunk)  # type: ignore
        finish_reason = map_finish_reason(
            finish_reason=message_delta["delta"].get("stop_reason", "stop") or "stop"
        )
        # Override finish_reason to "stop" if we converted response_format tools
        # (matches OpenAI behavior and non-streaming Anthropic implementation)
        if self.converted_response_format_tool:
            finish_reason = "stop"
        usage = self._handle_usage(anthropic_usage_chunk=message_delta["usage"])
        container = message_delta["delta"].get("container")
        return finish_reason, usage, container

    def _handle_accumulated_json_chunk(
        self, data_str: str
    ) -> Optional[ModelResponseStream]:
        """
        Handle partial JSON chunks by accumulating them until valid JSON is received.

        This fixes network fragmentation issues where SSE data chunks may be split
        across TCP packets. See: https://github.com/BerriAI/litellm/issues/17473

        Args:
            data_str: The JSON string to parse (without "data:" prefix)

        Returns:
            ModelResponseStream if JSON is complete, None if still accumulating
        """
        # Accumulate JSON data
        self.accumulated_json += data_str

        # Try to parse the accumulated JSON
        try:
            data_json = json.loads(self.accumulated_json)
            self.accumulated_json = ""  # Reset after successful parsing
            return self.chunk_parser(chunk=data_json)
        except json.JSONDecodeError:
            # If it's not valid JSON yet, continue to the next chunk
            return None

    def _parse_sse_data(self, str_line: str) -> Optional[ModelResponseStream]:
        """
        Parse SSE data line, handling both complete and partial JSON chunks.

        Args:
            str_line: The SSE line starting with "data:"

        Returns:
            ModelResponseStream if parsing succeeded, None if accumulating partial JSON
        """
        data_str = str_line[5:]  # Remove "data:" prefix

        if self.chunk_type == "accumulated_json":
            # Already in accumulation mode, keep accumulating
            return self._handle_accumulated_json_chunk(data_str)

        # Try to parse as valid JSON first
        try:
            data_json = json.loads(data_str)
            return self.chunk_parser(chunk=data_json)
        except json.JSONDecodeError:
            # Switch to accumulation mode and start accumulating
            self.chunk_type = "accumulated_json"
            return self._handle_accumulated_json_chunk(data_str)

    # Sync iterator
    def __iter__(self):
        return self

    def __next__(self):
        while True:
            try:
                chunk = self.response_iterator.__next__()
            except StopIteration:
                # If we have accumulated JSON when stream ends, try to parse it
                if self.accumulated_json:
                    try:
                        data_json = json.loads(self.accumulated_json)
                        self.accumulated_json = ""
                        return self.chunk_parser(chunk=data_json)
                    except json.JSONDecodeError:
                        pass
                raise StopIteration
            except ValueError as e:
                raise RuntimeError(f"Error receiving chunk from stream: {e}")

            try:
                str_line = chunk
                if isinstance(chunk, bytes):  # Handle binary data
                    str_line = chunk.decode("utf-8")  # Convert bytes to string
                    index = str_line.find("data:")
                    if index != -1:
                        str_line = str_line[index:]

                if str_line.startswith("data:"):
                    result = self._parse_sse_data(str_line)
                    if result is not None:
                        return result
                    # If None, continue loop to get more chunks for accumulation
                else:
                    return GenericStreamingChunk(
                        text="",
                        is_finished=False,
                        finish_reason="",
                        usage=None,
                        index=0,
                        tool_use=None,
                    )
            except StopIteration:
                raise StopIteration
            except ValueError as e:
                raise RuntimeError(
                    f"Error parsing chunk: {e},\nReceived chunk: {chunk}"
                )

    # Async iterator
    def __aiter__(self):
        self.async_response_iterator = self.streaming_response.__aiter__()
        return self

    async def __anext__(self):
        while True:
            try:
                chunk = await self.async_response_iterator.__anext__()
            except StopAsyncIteration:
                # If we have accumulated JSON when stream ends, try to parse it
                if self.accumulated_json:
                    try:
                        data_json = json.loads(self.accumulated_json)
                        self.accumulated_json = ""
                        return self.chunk_parser(chunk=data_json)
                    except json.JSONDecodeError:
                        pass
                raise StopAsyncIteration
            except ValueError as e:
                raise RuntimeError(f"Error receiving chunk from stream: {e}")

            try:
                str_line = chunk
                if isinstance(chunk, bytes):  # Handle binary data
                    str_line = chunk.decode("utf-8")  # Convert bytes to string
                    index = str_line.find("data:")
                    if index != -1:
                        str_line = str_line[index:]

                if str_line.startswith("data:"):
                    result = self._parse_sse_data(str_line)
                    if result is not None:
                        return result
                    # If None, continue loop to get more chunks for accumulation
                else:
                    return GenericStreamingChunk(
                        text="",
                        is_finished=False,
                        finish_reason="",
                        usage=None,
                        index=0,
                        tool_use=None,
                    )
            except StopAsyncIteration:
                raise StopAsyncIteration
            except ValueError as e:
                raise RuntimeError(
                    f"Error parsing chunk: {e},\nReceived chunk: {chunk}"
                )

    def convert_str_chunk_to_generic_chunk(self, chunk: str) -> ModelResponseStream:
        """
        Convert a string chunk to a GenericStreamingChunk

        Note: This is used for Anthropic pass through streaming logging

        We can move __anext__, and __next__ to use this function since it's common logic.
        Did not migrate them to minmize changes made in 1 PR.
        """
        str_line = chunk
        if isinstance(chunk, bytes):  # Handle binary data
            str_line = chunk.decode("utf-8")  # Convert bytes to string

        # Extract the data line from SSE format
        # SSE events can be: "event: X\ndata: {...}\n\n" or just "data: {...}\n\n"
        index = str_line.find("data:")
        if index != -1:
            str_line = str_line[index:]

        if str_line.startswith("data:"):
            data_json = json.loads(str_line[5:])
            return self.chunk_parser(chunk=data_json)
        else:
            return ModelResponseStream(id=self.response_id)