fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
160 lines
5.1 KiB
Python
160 lines
5.1 KiB
Python
import base64
|
|
import binascii
|
|
from collections import defaultdict
|
|
from typing import TYPE_CHECKING, Any, Dict, List, NoReturn, Optional, Tuple
|
|
|
|
from litellm.constants import request_timeout
|
|
|
|
REDUCTO_API_BASE = "https://platform.reducto.ai"
|
|
REDUCTO_ID_PREFIX = "reducto://"
|
|
|
|
if TYPE_CHECKING:
|
|
from litellm.llms.base_llm.ocr.transformation import OCRPage
|
|
|
|
|
|
def _normalize_api_base(api_base: Optional[str]) -> str:
|
|
return (api_base or REDUCTO_API_BASE).rstrip("/")
|
|
|
|
|
|
def _raise_bad_request(message: str, model: str) -> NoReturn:
|
|
import litellm
|
|
|
|
raise litellm.BadRequestError(
|
|
message=message,
|
|
model=model,
|
|
llm_provider="reducto",
|
|
)
|
|
|
|
|
|
def extract_file_id_or_bytes(
|
|
source_url: str,
|
|
model: str,
|
|
) -> Tuple[Optional[str], Optional[bytes], Optional[str]]:
|
|
if source_url.startswith(REDUCTO_ID_PREFIX):
|
|
return source_url, None, None
|
|
|
|
if source_url.startswith("http://") or source_url.startswith("https://"):
|
|
_raise_bad_request(
|
|
"Reducto requires type='file' (auto-uploaded) or a reducto:// id. Plain http(s) URLs are not supported; upload the file first.",
|
|
model=model,
|
|
)
|
|
|
|
if not source_url.startswith("data:"):
|
|
_raise_bad_request(
|
|
"Reducto requires a reducto:// id or a base64 data URI after OCR preprocessing.",
|
|
model=model,
|
|
)
|
|
|
|
try:
|
|
header, encoded = source_url.split(",", 1)
|
|
except ValueError:
|
|
_raise_bad_request("Invalid Reducto data URI provided.", model=model)
|
|
|
|
if ";base64" not in header:
|
|
_raise_bad_request(
|
|
"Reducto only supports base64-encoded data URIs.", model=model
|
|
)
|
|
|
|
mime = header.removeprefix("data:").split(";")[0] or "application/octet-stream"
|
|
try:
|
|
raw_bytes = base64.b64decode(encoded, validate=True)
|
|
except (binascii.Error, ValueError):
|
|
_raise_bad_request("Invalid Reducto base64 payload provided.", model=model)
|
|
|
|
return None, raw_bytes, mime
|
|
|
|
|
|
def _extract_file_id_from_upload_response(response: Any) -> str:
|
|
try:
|
|
payload = response.json()
|
|
except ValueError as exc:
|
|
raise ValueError(
|
|
"Reducto /upload returned a non-JSON 200 response: {}".format(response.text)
|
|
) from exc
|
|
file_id = (payload or {}).get("file_id") if isinstance(payload, dict) else None
|
|
if not isinstance(file_id, str) or not file_id:
|
|
raise ValueError(
|
|
"Reducto /upload returned 200 without a file_id; got payload={}".format(
|
|
payload
|
|
)
|
|
)
|
|
return file_id
|
|
|
|
|
|
def upload_bytes_sync(
|
|
raw_bytes: bytes,
|
|
mime: Optional[str],
|
|
api_key: str,
|
|
api_base: Optional[str],
|
|
) -> str:
|
|
import litellm
|
|
|
|
response = litellm.module_level_client.post(
|
|
url="{}{}".format(_normalize_api_base(api_base), "/upload"),
|
|
headers={"Authorization": f"Bearer {api_key}"},
|
|
files={"file": ("document", raw_bytes, mime or "application/octet-stream")},
|
|
timeout=request_timeout,
|
|
)
|
|
response.raise_for_status()
|
|
return _extract_file_id_from_upload_response(response)
|
|
|
|
|
|
async def upload_bytes_async(
|
|
raw_bytes: bytes,
|
|
mime: Optional[str],
|
|
api_key: str,
|
|
api_base: Optional[str],
|
|
) -> str:
|
|
import litellm
|
|
|
|
response = await litellm.module_level_aclient.post(
|
|
url="{}{}".format(_normalize_api_base(api_base), "/upload"),
|
|
headers={"Authorization": f"Bearer {api_key}"},
|
|
files={"file": ("document", raw_bytes, mime or "application/octet-stream")},
|
|
timeout=request_timeout,
|
|
)
|
|
response.raise_for_status()
|
|
return _extract_file_id_from_upload_response(response)
|
|
|
|
|
|
def build_pages_from_reducto(result: Dict[str, Any]) -> List["OCRPage"]:
|
|
from litellm.llms.base_llm.ocr.transformation import OCRPage
|
|
|
|
chunks = result.get("chunks", []) or []
|
|
blocks_by_page: Dict[int, List[Dict[str, Any]]] = defaultdict(list)
|
|
|
|
for chunk in chunks:
|
|
for block in chunk.get("blocks", []) or []:
|
|
page_no = (block.get("bbox") or {}).get("page")
|
|
if page_no is None:
|
|
continue
|
|
try:
|
|
normalized_page = int(page_no)
|
|
except (TypeError, ValueError):
|
|
continue
|
|
blocks_by_page[normalized_page].append(block)
|
|
|
|
if not blocks_by_page:
|
|
fallback_markdown = "\n\n".join(
|
|
chunk.get("content", "") for chunk in chunks if chunk.get("content")
|
|
)
|
|
if fallback_markdown == "":
|
|
return []
|
|
return [OCRPage(index=0, markdown=fallback_markdown)]
|
|
|
|
pages: List["OCRPage"] = []
|
|
for page_no, blocks in sorted(blocks_by_page.items()):
|
|
markdown = "\n\n".join(
|
|
block.get("content", "") for block in blocks if block.get("content")
|
|
)
|
|
page_index = max(page_no - 1, 0)
|
|
page = OCRPage(
|
|
index=page_index,
|
|
markdown=markdown,
|
|
)
|
|
# OCRPage accepts extra keys at runtime; assign blocks after construction
|
|
# so static typing does not reject provider-specific metadata.
|
|
setattr(page, "blocks", blocks)
|
|
pages.append(page)
|
|
return pages
|