Files
MoFin/venv/lib/python3.12/site-packages/huggingface_hub/_upload_pipeline.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

683 lines
28 KiB
Python

# coding=utf-8
# Copyright 2026-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Streamed, multi-commit upload of a folder on top of the Xet upload protocol.
How it works:
- The **coordinator** (caller's thread) walks the list of files and asks the Hub, 256 files at a
time, what each file is (regular git blob, xet file, ignored). Regular files are accumulated
directly; xet files are registered into a `XetSession` upload-commit, which chunks, deduplicates,
retries and uploads them in the background while the coordinator keeps going. No Python-side
sha256 computation: `hf_xet` computes it during chunking (single read pass over each file).
- Whenever enough files have accumulated (adaptive batch size), the batch is handed over to the
**committer** thread which joins the xet uploads, drops unchanged files, and creates a git
commit for the batch. While a batch is being committed, the coordinator is already uploading
the next one.
- Interrupted uploads are resumable by simply re-running the same call: already-committed files
are dropped (no-op detection against the remote oid) and already-uploaded chunks are
deduplicated by the xet storage backend, transferring ~0 bytes.
"""
import queue
import shutil
import sys
import threading
import time
from typing import TYPE_CHECKING, Any, Callable
from urllib.parse import quote
from . import constants
from ._commit_api import (
CommitOperationAdd,
CommitOperationDelete,
_fetch_upload_modes,
_send_commit,
_warn_on_overwriting_operations,
)
from .errors import RepositoryNotFoundError
from .utils import are_progress_bars_disabled, logging
from .utils._xet import (
XetTokenType,
abort_xet_session,
get_xet_session,
xet_connection_info_refresh_url,
xet_headers_without_auth,
)
if TYPE_CHECKING:
from .hf_api import CommitInfo, HfApi
logger = logging.get_logger(__name__)
# Number of files sent to the "preupload" endpoint per call (server-side limit).
PREUPLOAD_BATCH_SIZE = 256
# Files per git commit: adaptive. `_CommitPacer` moves one step up this scale after a fast commit
# of a full batch, and one step down after a slow or failed commit.
COMMIT_SIZE_SCALE = [20, 50, 75, 100, 125, 200, 250, 400, 600, 1000]
INITIAL_COMMIT_SIZE_INDEX = 6 # start at COMMIT_SIZE_SCALE[6] == 250 files per commit
TARGET_COMMIT_DURATION = 40.0 # seconds; scale up if a full batch commits faster than this, down if slower
MAX_COMMIT_INTERVAL = 5 * 60.0 # seconds; force a commit if the current batch is older than this
# Budget of regular-file content per commit (regular files are base64-encoded in the payload).
REGULAR_CONTENT_BYTES_BUDGET = 100 * 1024 * 1024
_SENTINEL = object() # Sentinel value for the batch queue to indicate the end of the upload
# Live display tuning
_BAR_WIDTH = 20 # default number of characters in a progress bar
_REFRESH_INTERVAL = 0.5 # seconds between redraws on a TTY
_NON_TTY_LOG_INTERVAL = 30.0 # seconds between summary logs when stderr is not a TTY
def _bar(current: float, total: float, width: int = _BAR_WIDTH) -> str:
    """Render a fixed-width text progress bar.

    Args:
        current: Progress made so far.
        total: Value of ``current`` that corresponds to a full bar.
        width: Number of characters in the returned bar.

    Returns:
        A string of exactly ``width`` characters: filled blocks followed by light-shade
        blocks. When ``total`` is zero or negative the bar is rendered empty.
    """
    if total <= 0:
        return "░" * width
    # Clamp to [0, 1] so that the bar is always exactly `width` characters long, even if
    # `current` overshoots `total` or is negative.
    ratio = min(max(current / total, 0.0), 1.0)
    filled = int(ratio * width)
    return "█" * filled + "░" * (width - filled)
def _format_bytes(n: float) -> str:
    """Format a byte count with a decimal (power-of-1000) unit and about three significant digits.

    Args:
        n: Number of bytes (or bytes per second when used for a rate).

    Returns:
        A compact string such as ``"3.80GB"``, ``"19.7MB"`` or ``"999B"``. Values of 1000 TB
        and above are expressed in PB with one decimal.
    """
    for unit in ("B", "kB", "MB", "GB", "TB"):
        # Use the magnitude for both the unit and the precision so that negative values are
        # formatted with the same number of digits as their positive counterpart.
        magnitude = abs(n)
        if magnitude < 1000:
            if magnitude < 10:
                return f"{n:.2f}{unit}"
            if magnitude < 100:
                return f"{n:.1f}{unit}"
            return f"{n:.0f}{unit}"
        n /= 1000
    return f"{n:.1f}PB"
class _LiveDisplay:
    """Three-line live progress display on stderr::

        Preparing ████████████████████ 11,100 / 11,100 ✓
        Uploading ██████████████░░░░░░ 580 / 603 files 3.8GB · 19.7MB/s
        Committing ██████████████████░░ 10,800 / 11,100 14 commits

    A small renderer thread redraws the three lines in-place every ~0.5 s on a TTY
    (worker threads only update counters under a lock). When stderr is not a TTY,
    it falls back to a periodic ``logger.info`` summary instead.

    Disabling progress bars (e.g. agent output mode) only turns off the TTY renderer:
    the non-TTY log summaries are gated by the logger verbosity alone, so consumers
    tailing stderr during a long upload still see periodic progress.
    """

    _N_LINES = 3

    def __init__(self, total_files: int, enabled: bool = True) -> None:
        """Set up the counters; nothing is drawn or logged until `start()` is called.

        Args:
            total_files: Total number of files the upload was asked to handle.
            enabled: When False, the in-place TTY renderer is turned off.
        """
        self._total = total_files
        self._tty = enabled and sys.stderr.isatty()
        # Active if we can draw on a TTY, or if INFO logs are emitted (periodic summaries).
        self._active = self._tty or logger.isEnabledFor(logging.INFO)
        self._lock = threading.Lock()
        self._drawn = False
        self._stop_event = threading.Event()
        self._thread: threading.Thread | None = None
        # Counters (written by coordinator/committer threads, read by the renderer)
        self._prepared = 0
        self._ignored = 0
        self._xet_total = 0
        self._xet_done: set[str] = set()  # item names; unique across batches
        self._committed = 0  # committed or skipped-as-unchanged
        self._nb_commits = 0
        # Xet transfer bytes, summed across (possibly concurrent) upload-commits
        self._xet_bytes = 0
        self._speed_ema = 0.0
        self._prev_bytes = 0
        self._prev_time: float | None = None

    # -- lifecycle (main thread) ------------------------------------------------
    def start(self) -> None:
        """Announce the number of files and start the renderer thread (no-op when inactive)."""
        if not self._active:
            return
        if self._tty:
            sys.stderr.write(f"Found {self._total:,} files to upload\n")
            sys.stderr.flush()
        else:
            logger.info(f"Found {self._total:,} files to upload")
        self._thread = threading.Thread(target=self._render_loop, name="hf-upload-display", daemon=True)
        self._thread.start()

    def close(self) -> None:
        """Stop the renderer thread and, on a TTY, draw the final state once."""
        if self._thread is not None:
            self._stop_event.set()
            self._thread.join()
        if self._tty:
            with self._lock:
                self._redraw()  # final state

    # -- counter updates (coordinator / committer / xet callback threads) --------
    def notify_prepared(self, n: int) -> None:
        """Record that `n` more files went through the preupload check."""
        with self._lock:
            self._prepared += n

    def notify_ignored(self, n: int) -> None:
        """Record that `n` more files were ignored and will never be committed."""
        with self._lock:
            self._ignored += n

    def notify_xet_registered(self, n: int) -> None:
        """Record that `n` more files were registered for xet upload."""
        with self._lock:
            self._xet_total += n

    def notify_xet_uploaded(self, names: list[str]) -> None:
        """Mark the given item names as uploaded (names already marked are not counted twice)."""
        with self._lock:
            self._xet_done.update(names)

    def notify_skipped(self, n: int) -> None:
        """Record that `n` files were skipped as unchanged; they count towards commit progress."""
        with self._lock:
            self._committed += n

    def notify_commit(self, n_files: int) -> None:
        """Record one new git commit containing `n_files` files."""
        with self._lock:
            self._committed += n_files
            self._nb_commits += 1

    def new_xet_callback(self) -> "Callable | None":
        """Progress callback for one ``new_upload_commit``.

        The byte counters in ``group_report`` are cumulative *per upload-commit* and several
        upload-commits can be in flight at once (one finalizing, one filling), so each commit
        gets its own closure tracking its own previous value; increments are summed globally.

        Returns:
            The callback, or None when the display is inactive.
        """
        if not self._active:
            return None
        prev = 0

        def callback(group_report: Any, item_reports: Any) -> None:
            nonlocal prev
            with self._lock:
                completed = group_report.total_transfer_bytes_completed
                # Never let a counter that goes backwards decrease the global total.
                self._xet_bytes += max(0, completed - prev)
                prev = completed
                for item in item_reports.values():
                    if item.total_bytes > 0 and item.bytes_completed == item.total_bytes:
                        self._xet_done.add(item.item_name)

        return callback

    # -- rendering (display thread) ----------------------------------------------
    def _render_loop(self) -> None:
        """Refresh the display every `_REFRESH_INTERVAL` seconds until `close()` is called."""
        last_log = 0.0
        while not self._stop_event.wait(_REFRESH_INTERVAL):
            with self._lock:
                self._update_speed()
                if self._tty:
                    self._redraw()
                elif time.monotonic() - last_log >= _NON_TTY_LOG_INTERVAL:
                    logger.info(self._summary())
                    last_log = time.monotonic()

    def _update_speed(self) -> None:
        """Update the exponential moving average of the transfer rate (bytes per second)."""
        now = time.monotonic()
        if self._prev_time is not None and now > self._prev_time:
            rate = (self._xet_bytes - self._prev_bytes) / (now - self._prev_time)
            # Seed the average with the first sample, then smooth subsequent ones.
            self._speed_ema = rate if self._speed_ema == 0 else 0.3 * rate + 0.7 * self._speed_ema
        self._prev_time = now
        self._prev_bytes = self._xet_bytes

    def _redraw(self) -> None:
        """Rewrite the three progress lines in place on stderr."""
        if self._drawn:
            # Move the cursor back up to the first line of the previous frame.
            sys.stderr.write(f"\033[{self._N_LINES}A")
        width = shutil.get_terminal_size().columns
        for line in (self._line_preparing(), self._line_uploading(), self._line_committing()):
            # Keep each line shorter than the terminal so that it never wraps, which would
            # break the fixed line count the cursor movement above relies on.
            truncated = line[: width - 4] + "..." if len(line) > width - 1 else line
            sys.stderr.write(f"\r\033[K{truncated}\n")
        sys.stderr.flush()
        self._drawn = True

    def _line_preparing(self) -> str:
        """Build the "Preparing" line; a check mark is appended once every file was checked."""
        done = " ✓" if self._prepared >= self._total else ""
        return f" Preparing {_bar(self._prepared, self._total)} {self._prepared:,} / {self._total:,}{done}"

    def _line_uploading(self) -> str:
        """Build the "Uploading" line: xet files done, bytes transferred and current speed."""
        if self._xet_total == 0:
            # No xet file registered (yet): full bar once preparation is over, empty before.
            bar = _bar(1, 1) if self._prepared >= self._total else _bar(0, 1)
            return f" Uploading {bar} -"
        n_done = len(self._xet_done)
        parts = []
        if self._xet_bytes > 0:
            parts.append(_format_bytes(self._xet_bytes))
        if self._speed_ema > 0:
            parts.append(f"{_format_bytes(self._speed_ema)}/s")
        extra = f" {' · '.join(parts)}" if parts else ""
        # Only final once no more files can be registered and all registered ones are done.
        done = " ✓" if self._prepared >= self._total and n_done >= self._xet_total else ""
        return f" Uploading {_bar(n_done, self._xet_total)} {n_done:,} / {self._xet_total:,} files{extra}{done}"

    def _line_committing(self) -> str:
        """Build the "Committing" line; ignored files are excluded from the total."""
        effective = self._total - self._ignored
        commits_str = f" {self._nb_commits} commits" if self._nb_commits > 1 else ""
        done = " ✓" if self._committed >= effective > 0 else ""
        return (
            f" Committing {_bar(self._committed, effective)} {self._committed:,} / {effective:,}{commits_str}{done}"
        )

    def _summary(self) -> str:
        """Build the one-line progress summary logged when stderr is not a TTY."""
        return (
            f"Uploading... {self._prepared:,}/{self._total:,} files checked, "
            f"{len(self._xet_done):,}/{self._xet_total:,} uploaded ({_format_bytes(self._xet_bytes)} transferred), "
            f"{self._committed:,} committed in {self._nb_commits} commit(s)"
        )
class _CommitPacer:
    """Adaptive number of files per commit, to stay below server-side commit timeouts.

    The pacer is a cursor into `COMMIT_SIZE_SCALE`: it moves one step towards larger batches
    after a fast commit of a full batch, and one step towards smaller batches after a slow or
    failed commit. The cursor never leaves the bounds of the scale.
    """

    def __init__(self) -> None:
        self._index = INITIAL_COMMIT_SIZE_INDEX

    @property
    def target(self) -> int:
        """Number of files the next commit should contain."""
        return COMMIT_SIZE_SCALE[self._index]

    def _step_down(self) -> None:
        # Floor at the smallest batch size of the scale.
        self._index = max(self._index - 1, 0)

    def _step_up(self) -> None:
        # Cap at the largest batch size of the scale.
        last_index = len(COMMIT_SIZE_SCALE) - 1
        self._index = min(self._index + 1, last_index)

    def record_success(self, duration: float, nb_files: int) -> None:
        """Adapt the target after a commit of `nb_files` files that took `duration` seconds."""
        if duration > TARGET_COMMIT_DURATION:
            self._step_down()
            return
        # A fast commit only justifies a bigger batch if the batch was actually full.
        batch_was_full = nb_files >= self.target
        if duration < TARGET_COMMIT_DURATION and batch_was_full:
            self._step_up()

    def record_failure(self) -> None:
        """Shrink the target after a failed commit."""
        self._step_down()
class _Batch:
    """A group of files destined to a single git commit, with their in-flight xet uploads."""

    def __init__(self) -> None:
        # Creation time on the monotonic clock.
        self.created_at: float = time.monotonic()
        # Add operations that belong to this batch.
        self.ops: list[CommitOperationAdd] = []
        # Cumulated size of the regular-mode files of the batch.
        self.regular_bytes: int = 0
        # XetUploadCommit, opened lazily (stays None while the batch has no xet file).
        self.xet_commit: Any = None
        # (op, XetFileUpload) pairs, one per xet upload started for this batch.
        self.handles: list[tuple[CommitOperationAdd, Any]] = []
class _UploadPipeline:
    """Streamed multi-commit upload: a coordinator (caller's thread) feeding a committer thread.

    The coordinator classifies files through the preupload endpoint, starts xet uploads and groups
    files into batches. Batches are handed over through a single-slot queue to the committer
    thread, which waits for the xet uploads of the batch, drops unchanged files and creates the
    git commit(s).
    """

    def __init__(
        self,
        api: "HfApi",
        *,
        repo_id: str,
        repo_type: str,
        add_operations: list[CommitOperationAdd],
        delete_operations: list[CommitOperationDelete],
        commit_message: str,
        commit_description: str | None,
        token: str | bool | None,
        revision: str | None,
        create_pr: bool,
        parent_commit: str | None,
    ) -> None:
        """Store the upload parameters and prepare the shared state; no thread is started here.

        Args:
            api: Client used to build headers, create the pull request and fetch repo info.
            repo_id: Target repository.
            repo_type: Type of the target repository.
            add_operations: Files to upload.
            delete_operations: Deletions, sent along with the first commit only.
            commit_message: Message of the first commit; next ones get a " (part N)" suffix.
            commit_description: Description of the commits and of the pull request, if any.
            token: Token forwarded to the API calls.
            revision: Target revision; falls back to `constants.DEFAULT_REVISION`.
            create_pr: Whether to push the commits to a newly created pull request.
            parent_commit: Expected parent, sent along with the first commit only.
        """
        self.api = api
        self.repo_id = repo_id
        self.repo_type = repo_type
        self.add_operations = add_operations
        self.delete_operations = delete_operations
        self.commit_message = commit_message
        self.commit_description = commit_description
        self.token = token
        self.headers = api._build_hf_headers(token=token)
        self.revision = revision or constants.DEFAULT_REVISION
        self.create_pr = create_pr
        self.parent_commit = parent_commit
        # The base revision is used by the coordinator for ALL preupload calls and the xet token
        # refresh URL, with the `create_pr` flag — exactly like `create_commit` does. It never
        # changes during the run, even after a PR has been created.
        self.base_revision_quoted = quote(self.revision, safe="")
        # Committer state (mutated by the committer thread only)
        self.commit_revision_quoted = self.base_revision_quoted  # switched to the PR ref once created
        self.pr_url: str | None = None
        self.pr_revision: str | None = None
        self.nb_commits = 0
        self.last_commit_info: "CommitInfo | None" = None
        self.pacer = _CommitPacer()
        # Pipeline plumbing
        self.batch_queue: queue.Queue = queue.Queue(maxsize=1)
        self.errors: list[BaseException] = []
        self.abort_event = threading.Event()
        self.display = _LiveDisplay(total_files=len(add_operations), enabled=not are_progress_bars_disabled())
        # All xet uploads share the same token refresh URL. With `create_pr`, the final ref is not
        # known in advance: `?create_pr=1` makes the server grant a token valid for PR refs.
        refresh_url = xet_connection_info_refresh_url(
            token_type=XetTokenType.WRITE,
            repo_id=repo_id,
            repo_type=repo_type,
            revision=self.base_revision_quoted,
            endpoint=api.endpoint,
        )
        if create_pr:
            refresh_url += "?create_pr=1"
        self.xet_session = get_xet_session()
        self.xet_commit_kwargs = {
            "token_refresh_url": refresh_url,
            "token_refresh_headers": self.headers,
            "custom_headers": xet_headers_without_auth(self.headers),
        }
        # `.gitignore` rules are enforced server-side: forward the local one if it's being uploaded.
        self.gitignore_content: str | None = None
        for op in add_operations:
            if op.path_in_repo == ".gitignore":
                with op.as_file() as f:
                    self.gitignore_content = f.read().decode()
                break

    def run(self) -> "CommitInfo":
        """Run the whole upload and return info about the last commit.

        Returns:
            The info of the last commit created, or info about the latest commit of the target
            revision when nothing had to be committed.

        Raises:
            BaseException: Whatever interrupted the coordinator (re-raised as-is), or the first
                error recorded by the committer thread.
        """
        _warn_on_overwriting_operations([*self.delete_operations, *self.add_operations])
        committer = threading.Thread(target=self._committer_loop, name="hf-upload-committer", daemon=True)
        committer.start()
        self.display.start()
        try:
            self._coordinator_loop()
        except BaseException:
            self.abort_event.set()
            abort_xet_session()
            raise
        finally:
            if self.abort_event.is_set():
                # The committer exits on its own once the queue is drained (see `_committer_loop`).
                # Bound the wait so a xet call blocked on the (aborted) session can never hang the
                # shutdown — the committer is a daemon thread.
                committer.join(timeout=10)
            else:
                self.batch_queue.put(_SENTINEL)
                committer.join()
            self.display.close()
            if self.abort_event.is_set() and self.pr_revision is not None:
                logger.warning(
                    f"Upload to pull request {self.pr_url} did not complete. To resume into the"
                    f' same PR, re-run with `revision="{self.pr_revision}"` (without `create_pr=True`). Re-running'
                    " with `create_pr=True` would open a new pull request."
                )
        if self.errors:
            raise self.errors[0]
        return self._final_commit_info()

    # ---------------------------------------------------------------- coordinator
    def _coordinator_loop(self) -> None:
        """Classify files, start xet uploads and hand batches over to the committer.

        Runs in the caller's thread. Returns early, after aborting the batch being filled, when
        the abort event is set.
        """
        import hf_xet

        batch = _Batch()
        for start in range(0, len(self.add_operations), PREUPLOAD_BATCH_SIZE):
            if self.abort_event.is_set():
                self._abort_batch(batch)
                return
            chunk = self.add_operations[start : start + PREUPLOAD_BATCH_SIZE]
            try:
                _fetch_upload_modes(
                    additions=chunk,
                    repo_type=self.repo_type,
                    repo_id=self.repo_id,
                    headers=self.headers,
                    revision=self.base_revision_quoted,
                    endpoint=self.api.endpoint,
                    create_pr=self.create_pr,
                    gitignore_content=self.gitignore_content,
                )
            except RepositoryNotFoundError as e:
                from .hf_api import _CREATE_COMMIT_NO_REPO_ERROR_MESSAGE

                e.append_to_message(_CREATE_COMMIT_NO_REPO_ERROR_MESSAGE)
                raise
            self.display.notify_prepared(len(chunk))
            for op in chunk:
                if op._should_ignore:
                    logger.debug(f"Skipping upload for '{op.path_in_repo}' (ignored by gitignore rules).")
                    self.display.notify_ignored(1)
                    continue
                if op._upload_mode == "regular":
                    batch.regular_bytes += op.upload_info.size
                else:
                    if batch.xet_commit is None:
                        batch.xet_commit = self.xet_session.new_upload_commit(
                            progress_callback=self.display.new_xet_callback(), **self.xet_commit_kwargs
                        )
                    # Upload starts immediately in the background. sha256 is computed by hf_xet
                    # while chunking, unless already known (e.g. resumed operations).
                    sha_arg = op.upload_info.sha256.hex() if op.upload_info.is_hashed else hf_xet.COMPUTE_SHA256
                    if isinstance(op.path_or_fileobj, bytes):
                        handle = batch.xet_commit.start_upload_bytes(
                            op.path_or_fileobj, sha256=sha_arg, name=op.path_in_repo
                        )
                    else:
                        handle = batch.xet_commit.start_upload_file(str(op.path_or_fileobj), sha256=sha_arg)
                    batch.handles.append((op, handle))
                    self.display.notify_xet_registered(1)
                batch.ops.append(op)
                # NOTE(review): the flush thresholds are evaluated after every registered file;
                # confirm that per-file (rather than per preupload chunk) granularity is intended.
                if (
                    len(batch.ops) >= self.pacer.target
                    or batch.regular_bytes >= REGULAR_CONTENT_BYTES_BUDGET
                    or (time.monotonic() - batch.created_at > MAX_COMMIT_INTERVAL and len(batch.ops) > 0)
                ):
                    self._enqueue(batch)
                    batch = _Batch()
        self._enqueue(batch)

    def _enqueue(self, batch: _Batch) -> None:
        """Hand a batch over to the committer, or abort it if the pipeline is aborting.

        An empty batch is dropped, unless no commit was made yet and deletions are pending: in
        that case it is still enqueued so that the deletions can be committed.
        """
        if len(batch.ops) == 0 and not (self.nb_commits == 0 and len(self.delete_operations) > 0):
            return
        # Blocks if a batch is already waiting: natural backpressure on scanning/uploading.
        while not self.abort_event.is_set():
            try:
                self.batch_queue.put(batch, timeout=1.0)
                return
            except queue.Full:
                continue
        self._abort_batch(batch)

    def _abort_batch(self, batch: _Batch) -> None:
        """Abort the xet upload-commit of a batch, if one was opened. Never raises `Exception`."""
        if batch.xet_commit is not None:
            try:
                batch.xet_commit.abort()
            except Exception:
                # Best-effort cleanup: a failed abort must not mask the error that triggered
                # it, but keep a trace of it for debugging.
                logger.debug("Failed to abort xet upload-commit (ignored).", exc_info=True)

    # ---------------------------------------------------------------- committer
    def _committer_loop(self) -> None:
        """Process batches from the queue until the sentinel is received or the pipeline aborts.

        Runs in the committer thread. The first failure is recorded in `self.errors` and sets the
        abort event; batches received afterwards are aborted instead of being processed.
        """
        while True:
            try:
                batch = self.batch_queue.get(timeout=0.5)
            except queue.Empty:
                if self.abort_event.is_set():
                    return  # aborted: exit once the queue is drained, no sentinel needed
                continue
            if batch is _SENTINEL:
                return
            try:
                if not self.abort_event.is_set():
                    self._process_batch(batch)
                else:
                    self._abort_batch(batch)
            except BaseException as e:
                self._abort_batch(batch)
                self.errors.append(e)
                self.abort_event.set()

    def _process_batch(self, batch: _Batch) -> None:
        """Finalize the xet uploads of a batch, then commit the files that changed."""
        # 1. Wait for all xet uploads of this batch and finalize them (atomic xet commit). Files
        # can only be referenced by a git commit once their xet upload-commit is finalized.
        if batch.xet_commit is not None:
            batch.xet_commit.wait_to_finish()
            for op, handle in batch.handles:
                if not op.upload_info.is_hashed:
                    op.upload_info.sha256 = bytes.fromhex(handle.result().xet_info.sha256)
                op._is_uploaded = True
            # Files whose last progress tick was missed are still done at this point.
            self.display.notify_xet_uploaded(
                [
                    str(op.path_or_fileobj) if not isinstance(op.path_or_fileobj, bytes) else op.path_in_repo
                    for op, _ in batch.handles
                ]
            )
        # 2. Drop files that have not changed compared to the remote (prevents empty commits).
        # Their chunks were deduplicated anyway (~0 bytes transferred).
        ops_to_commit = []
        for op in batch.ops:
            if op._remote_oid is not None and op._remote_oid == op._local_oid:
                logger.debug(f"Skipping commit for '{op.path_in_repo}' (file unchanged).")
                self.display.notify_skipped(1)
                continue
            ops_to_commit.append(op)
        # 3. Create the git commit(s). On failure, scale down and split the batch.
        if len(ops_to_commit) > 0 or (self.nb_commits == 0 and len(self.delete_operations) > 0):
            self._commit_with_split(ops_to_commit)

    def _commit_with_split(self, ops: list[CommitOperationAdd]) -> None:
        """Commit `ops`; on failure, retry recursively in strictly smaller chunks.

        Raises:
            Exception: The commit error, once a batch of at most `COMMIT_SIZE_SCALE[0]` files
                fails (it cannot be split any further).
        """
        try:
            self._do_commit(ops)
        except Exception as e:
            self.pacer.record_failure()
            if len(ops) <= COMMIT_SIZE_SCALE[0]:
                raise
            # Keep scaling down until a chunk is strictly smaller than the failed batch:
            # otherwise the "split" would re-send the exact same payload. This terminates
            # because `len(ops) > COMMIT_SIZE_SCALE[0]` at this point.
            while self.pacer.target >= len(ops):
                self.pacer.record_failure()
            logger.warning(f"Commit of {len(ops)} files failed ({e!r}). Retrying in smaller chunks.")
            target = self.pacer.target
            for start in range(0, len(ops), target):
                self._commit_with_split(ops[start : start + target])

    def _do_commit(self, ops: list[CommitOperationAdd]) -> None:
        """Create one git commit for `ops`, creating the pull request first when needed."""
        if self.create_pr and self.pr_revision is None:
            # Create the (draft) pull request explicitly and push every commit to its ref. Committing
            # with `?create_pr=1` instead would risk opening a second PR if the commit POST is retried
            # after a lost response. Created lazily so that a fully-unchanged upload opens no PR.
            # Note: PRs created this way are always opened against the default branch, hence the
            # `create_pr` + `revision` combination being rejected in `upload_folder`.
            pr = self.api.create_pull_request(
                repo_id=self.repo_id,
                title=self.commit_message,
                token=self.token,
                description=self.commit_description,
                repo_type=self.repo_type,
            )
            if pr.git_reference is None:
                raise ValueError("Server did not return a git reference for the created pull request.")
            self.pr_url = pr.url
            self.pr_revision = pr.git_reference
            self.commit_revision_quoted = quote(pr.git_reference, safe="")
        operations: list[Any] = list(ops)
        if self.nb_commits == 0:
            # Deletions and `parent_commit` ride the first commit.
            operations = list(self.delete_operations) + operations
        commit_message = (
            self.commit_message if self.nb_commits == 0 else f"{self.commit_message} (part {self.nb_commits + 1})"
        )
        t0 = time.monotonic()
        # Retried with backoff on transient errors: safe because the commit targets an explicit
        # ref (`?create_pr=1` is never used, see above).
        self.last_commit_info = _send_commit(
            operations=operations,
            files_to_copy={},
            commit_message=commit_message,
            commit_description=self.commit_description or "",
            repo_type=self.repo_type,
            repo_id=self.repo_id,
            headers=self.headers,
            revision=self.commit_revision_quoted,
            endpoint=self.api.endpoint,
            parent_commit=self.parent_commit if self.nb_commits == 0 else None,
            retry_on_error=True,
        )
        duration = time.monotonic() - t0
        self.pacer.record_success(duration, len(ops))
        self.nb_commits += 1
        for op in ops:
            op._is_committed = True
        self.display.notify_commit(len(ops))
        logger.debug(f"Committed {len(ops)} file(s) in {duration:.1f}s: {self.last_commit_info.commit_url}")

    # ---------------------------------------------------------------- result
    def _final_commit_info(self) -> "CommitInfo":
        """Build the `CommitInfo` returned to the caller once the upload is complete."""
        from .hf_api import CommitInfo

        if self.last_commit_info is None:
            # Nothing was committed (everything unchanged/ignored): mimic `create_commit` and
            # return info about the latest commit on the target revision.
            logger.warning("No files have been modified since last commit. Skipping to prevent empty commit.")
            # Forward the token, as done for every other API call of the pipeline: the caller
            # may have passed one that differs from the client's default.
            info = self.api.repo_info(
                repo_id=self.repo_id,
                repo_type=self.repo_type,
                revision=self.revision,
                token=self.token,
            )
            url_prefix = self.api.endpoint
            if self.repo_type != constants.REPO_TYPE_MODEL:
                url_prefix = f"{url_prefix}/{self.repo_type}s"
            return CommitInfo(
                commit_url=f"{url_prefix}/{self.repo_id}/commit/{info.sha}",
                commit_message=self.commit_message,
                commit_description=self.commit_description or "",
                oid=info.sha,  # type: ignore
                _endpoint=self.api.endpoint,
            )
        if self.nb_commits > 1:
            logger.info(f"Upload completed in {self.nb_commits} commits.")
        if self.pr_url is not None:
            # PR upload: attach the PR info (commit responses don't carry it; the PR is created separately).
            return CommitInfo(
                commit_url=self.last_commit_info.commit_url,
                commit_message=self.last_commit_info.commit_message,
                commit_description=self.last_commit_info.commit_description,
                oid=self.last_commit_info.oid,
                pr_url=self.pr_url,
                _endpoint=self.api.endpoint,
            )
        return self.last_commit_info
def pipelined_upload(
    api: "HfApi",
    *,
    repo_id: str,
    repo_type: str,
    add_operations: list[CommitOperationAdd],
    delete_operations: list[CommitOperationDelete],
    commit_message: str,
    commit_description: str | None = None,
    token: str | bool | None = None,
    revision: str | None = None,
    create_pr: bool = False,
    parent_commit: str | None = None,
) -> "CommitInfo":
    """Upload a prepared list of operations through the streamed multi-commit pipeline.

    Requires `hf_xet` to be installed. See module docstring for the architecture.

    All arguments are forwarded unchanged to `_UploadPipeline`, which is run right away.

    Returns:
        The `CommitInfo` returned by `_UploadPipeline.run()`.
    """
    pipeline = _UploadPipeline(
        api,
        repo_id=repo_id,
        repo_type=repo_type,
        add_operations=add_operations,
        delete_operations=delete_operations,
        commit_message=commit_message,
        commit_description=commit_description,
        token=token,
        revision=revision,
        create_pr=create_pr,
        parent_commit=parent_commit,
    )
    return pipeline.run()