fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
173 lines
5.5 KiB
Python
173 lines
5.5 KiB
Python
# Copyright 2026 The HuggingFace Team. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
import json
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
from dataclasses import dataclass
|
|
from typing import TYPE_CHECKING, Any, Union
|
|
|
|
from . import constants
|
|
from .utils import get_token
|
|
|
|
|
|
if TYPE_CHECKING:
|
|
import duckdb
|
|
|
|
|
|
@dataclass(frozen=True)
class DatasetParquetEntry:
    """Immutable description of one parquet file exposed for a Hub dataset.

    Instances are frozen: attributes cannot be reassigned after construction.
    The class stores the values as given and performs no validation.
    """

    # Dataset configuration the parquet file belongs to.
    config: str
    # Dataset split the parquet file belongs to.
    split: str
    # Location of the parquet file.
    url: str
    # Size of the parquet file (presumably in bytes -- confirm with the producer).
    size: int
|
|
|
|
|
|
def execute_raw_sql_query(sql_query: str, *, token: str | bool | None = None) -> list[dict[str, Any]]:
    """Run a SQL query through DuckDB and return the rows as dictionaries.

    The query is stripped of surrounding whitespace and of any trailing
    semicolons, validated with `_raise_on_forbidden_query`, and then executed on
    a connection obtained from `_get_duckdb_connection`. The connection is always
    closed before returning, whether the query succeeded or not.

    Args:
        sql_query: The SQL text to execute.
        token: Forwarded unchanged to `_get_duckdb_connection`.

    Returns:
        One dictionary per row. With the DuckDB Python API the keys are the
        column names reported by the relation's `description`; with the CLI
        fallback the value is whatever `_DuckDBCliRelation.execute` returns.

    Raises:
        ValueError: If validation rejects the query, or if the connection's
            `sql()` call returns `None` (the statement produced no relation).
    """
    statement = sql_query.strip().rstrip(";").strip()
    _raise_on_forbidden_query(statement)

    # Nothing to clean up if acquiring the connection itself fails.
    backend = _get_duckdb_connection(token=token)
    try:
        result = backend.sql(statement)
        if result is None:
            raise ValueError("SQL query must return rows.")

        # CLI fallback: the relation runs the binary and parses its JSON output.
        if isinstance(result, _DuckDBCliRelation):
            return result.execute()

        # Python API: pair every fetched row with the column names.
        names = tuple(col[0] for col in result.description)
        return [dict(zip(names, tuple(record))) for record in result.fetchall()]
    finally:
        backend.close()
|
|
|
|
|
|
def _raise_on_forbidden_query(query: str) -> None:
    """Reject queries that are empty or that contain DuckDB CLI meta-commands.

    Args:
        query: The SQL text to validate.

    Raises:
        ValueError: If `query` is empty or contains only whitespace, or if any
            line starts (after optional leading whitespace) with a dot followed
            by a letter.
    """
    # A whitespace-only query is as empty as "" and must not reach DuckDB.
    if not query.strip():
        raise ValueError("SQL query cannot be empty.")

    # DuckDB CLI meta-commands are dot-prefixed words (e.g. `.shell`, `.output`).
    # Forbid them for now, but allow SQL expressions like `.5` that can
    # legitimately start a line.
    for line in query.splitlines():
        stripped = line.lstrip()
        # `stripped[1:2]` is "" when nothing follows the dot; "".isalpha() is False.
        if stripped.startswith(".") and stripped[1:2].isalpha():
            raise ValueError("DuckDB CLI meta-commands are not allowed in SQL queries.")
|
|
|
|
|
|
def _get_duckdb_connection(
    token: str | bool | None,
) -> Union["duckdb.DuckDBPyConnection", "_DuckDBCliConnection"]:
    """Open a DuckDB connection, preferring the Python package over the CLI.

    If `duckdb` can be imported, a fresh connection is created and every
    statement from `_build_duckdb_secret_statements(token)` is executed on it.
    Otherwise a `duckdb` executable is looked up on the PATH and wrapped in a
    `_DuckDBCliConnection`.

    Args:
        token: Passed to `_build_duckdb_secret_statements` (Python API) or to
            `_DuckDBCliConnection` (CLI fallback).

    Returns:
        A DuckDB Python connection, or a `_DuckDBCliConnection` when only the
        CLI binary is available.

    Raises:
        ImportError: If neither the Python package nor the CLI binary is found.
    """
    try:
        # Prefer the Python package whenever it is installed.
        import duckdb
    except ImportError as error:
        # Fall back to the DuckDB CLI binary.
        cli_path = shutil.which("duckdb")
        if cli_path is not None:
            return _DuckDBCliConnection(binary_path=cli_path, token=token)
        raise ImportError(
            "DuckDB is required for `hf datasets sql`. Install the Python package with `pip install duckdb` or "
            "install the DuckDB CLI binary (for example `brew install duckdb`)."
        ) from error

    # Python API: open a new connection and register the secrets on it.
    session = duckdb.connect()
    try:
        for secret_sql in _build_duckdb_secret_statements(token):
            session.execute(secret_sql)
    except Exception:
        # Do not hand out (or leak) a connection whose setup failed.
        session.close()
        raise
    return session
|
|
|
|
|
|
@dataclass
|
|
class _DuckDBCliConnection:
|
|
"""DuckDB connection.
|
|
|
|
Mimics the DuckDB Python API, but runs the queries via the DuckDB CLI binary.
|
|
"""
|
|
|
|
binary_path: str
|
|
token: str | bool | None
|
|
|
|
def __post_init__(self) -> None:
|
|
self._setup_statements = _build_duckdb_secret_statements(self.token)
|
|
|
|
def sql(self, query: str) -> "_DuckDBCliRelation":
|
|
return _DuckDBCliRelation(binary_path=self.binary_path, setup_statements=self._setup_statements, query=query)
|
|
|
|
def close(self) -> None:
|
|
pass
|
|
|
|
|
|
@dataclass
class _DuckDBCliRelation:
    """DuckDB relation.

    Mimics the DuckDB Python API, but runs the queries via the DuckDB CLI binary.
    """

    # Path of the DuckDB CLI executable to run.
    binary_path: str
    # SQL statements executed before the query, with their output discarded.
    setup_statements: list[str]
    # The SQL query whose JSON output is returned by `execute()`.
    query: str

    def __repr__(self) -> str:
        """Return a representation that omits the text of the setup statements.

        The setup statements may embed credentials, so only their count is shown
        instead of the dataclass-generated field dump.
        """
        return (
            f"{type(self).__name__}(binary_path={self.binary_path!r}, "
            f"setup_statements=<{len(self.setup_statements)} hidden>, query={self.query!r})"
        )

    def execute(self) -> list[dict[str, Any]]:
        """Run the query with the DuckDB CLI in JSON mode and parse its output.

        Returns:
            The value decoded from the JSON text the CLI wrote to stdout.

        Raises:
            RuntimeError: If the CLI exits with a non-zero status. The message is
                stderr, else stdout, else a generic fallback.
            ValueError: If the CLI succeeded but printed nothing (the statement
                produced no result set), or if stdout is not valid JSON
                (`json.JSONDecodeError` is a `ValueError` subclass).
        """
        # Build the DuckDB CLI input.
        script_lines: list[str] = []
        if self.setup_statements:
            # Send the output of the setup statements to the null device so it
            # cannot mix with the JSON of the query; the bare `.output` that
            # follows is expected to switch back to stdout.
            script_lines.append(f".output {os.devnull}")
            script_lines.extend(f"{stmt};" for stmt in self.setup_statements)
            script_lines.append(".output")
        script_lines.append(f"{self.query};")

        # Run DuckDB binary
        result = subprocess.run(
            [self.binary_path, "-json"],
            input="\n".join(script_lines),
            capture_output=True,
            text=True,
            check=False,
        )
        if result.returncode != 0:
            error_message = result.stderr.strip() or result.stdout.strip() or "DuckDB CLI command failed."
            raise RuntimeError(error_message)

        # Parse JSON output and return. Empty output would make `json.loads`
        # fail with an opaque "Expecting value" error, so report it explicitly.
        payload = result.stdout.strip()
        if not payload:
            raise ValueError("SQL query must return rows.")
        return json.loads(payload)
|
|
|
|
|
|
def _build_duckdb_secret_statements(token: str | bool | None) -> list[str]:
|
|
if token is None or token is True:
|
|
token = get_token()
|
|
|
|
if not token:
|
|
return []
|
|
|
|
escaped_token = token.replace("'", "''")
|
|
escaped_endpoint = constants.ENDPOINT.replace("'", "''")
|
|
return [
|
|
f"CREATE OR REPLACE SECRET hf_hub_token (TYPE HTTP, BEARER_TOKEN '{escaped_token}', SCOPE '{escaped_endpoint}')",
|
|
f"CREATE OR REPLACE SECRET hf_token (TYPE HUGGINGFACE, TOKEN '{escaped_token}')",
|
|
]
|