# coding=utf-8 # Copyright 2026-present, the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Streamed, multi-commit upload of a folder on top of the Xet upload protocol. How it works: - The **coordinator** (caller's thread) walks the list of files and asks the Hub, 256 files at a time, what each file is (regular git blob, xet file, ignored). Regular files are accumulated directly; xet files are registered into a `XetSession` upload-commit, which chunks, deduplicates, retries and uploads them in the background while the coordinator keeps going. No Python-side sha256 computation: `hf_xet` computes it during chunking (single read pass over each file). - Whenever enough files have accumulated (adaptive batch size), the batch is handed over to the **committer** thread which joins the xet uploads, drops unchanged files, and creates a git commit for the batch. While a batch is being committed, the coordinator is already uploading the next one. - Interrupted uploads are resumable by simply re-running the same call: already-committed files are dropped (no-op detection against the remote oid) and already-uploaded chunks are deduplicated by the xet storage backend, transferring ~0 bytes. """ import queue import shutil import sys import threading import time from typing import TYPE_CHECKING, Any, Callable from urllib.parse import quote from . import constants from ._commit_api import ( CommitOperationAdd, CommitOperationDelete, _fetch_upload_modes, _send_commit, _warn_on_overwriting_operations, ) from .errors import RepositoryNotFoundError from .utils import are_progress_bars_disabled, logging from .utils._xet import ( XetTokenType, abort_xet_session, get_xet_session, xet_connection_info_refresh_url, xet_headers_without_auth, ) if TYPE_CHECKING: from .hf_api import CommitInfo, HfApi logger = logging.get_logger(__name__) # Number of files sent to the "preupload" endpoint per call (server-side limit). PREUPLOAD_BATCH_SIZE = 256 # Files per git commit: adaptive, scaled up after fast commits and down after failures. COMMIT_SIZE_SCALE = [20, 50, 75, 100, 125, 200, 250, 400, 600, 1000] INITIAL_COMMIT_SIZE_INDEX = 6 # start at 256 files per commit TARGET_COMMIT_DURATION = 40.0 # seconds; scale up batch size if commits are faster than this MAX_COMMIT_INTERVAL = 5 * 60.0 # seconds; force a commit if the current batch is older than this # Budget of regular-file content per commit (regular files are base64-encoded in the payload). REGULAR_CONTENT_BYTES_BUDGET = 100 * 1024 * 1024 _SENTINEL = object() # Sentinel value for the batch queue to indicate the end of the upload # Live display tuning _BAR_WIDTH = 20 _REFRESH_INTERVAL = 0.5 # seconds between redraws on a TTY _NON_TTY_LOG_INTERVAL = 30.0 # seconds between summary logs when stderr is not a TTY def _bar(current: float, total: float, width: int = _BAR_WIDTH) -> str: if total <= 0: return "░" * width filled = int(min(current / total, 1.0) * width) return "█" * filled + "░" * (width - filled) def _format_bytes(n: float) -> str: for unit in ("B", "kB", "MB", "GB", "TB"): if abs(n) < 1000: if n < 10: return f"{n:.2f}{unit}" elif n < 100: return f"{n:.1f}{unit}" return f"{n:.0f}{unit}" n /= 1000 return f"{n:.1f}PB" class _LiveDisplay: """Three-line live progress display on stderr:: Preparing ████████████████████ 11,100 / 11,100 ✓ Uploading ██████████████░░░░░░ 580 / 603 files 3.8GB · 19.7MB/s Committing ██████████████████░░ 10,800 / 11,100 14 commits A small renderer thread redraws the three lines in-place every ~0.5 s on a TTY (worker threads only update counters under a lock). When stderr is not a TTY, it falls back to a periodic ``logger.info`` summary instead. Disabling progress bars (e.g. agent output mode) only turns off the TTY renderer: the non-TTY log summaries are gated by the logger verbosity alone, so consumers tailing stderr during a long upload still see periodic progress. """ _N_LINES = 3 def __init__(self, total_files: int, enabled: bool = True) -> None: self._total = total_files self._tty = enabled and sys.stderr.isatty() self._active = self._tty or logger.isEnabledFor(logging.INFO) self._lock = threading.Lock() self._drawn = False self._stop_event = threading.Event() self._thread: threading.Thread | None = None # Counters (written by coordinator/committer threads, read by the renderer) self._prepared = 0 self._ignored = 0 self._xet_total = 0 self._xet_done: set[str] = set() # item names; unique across batches self._committed = 0 # committed or skipped-as-unchanged self._nb_commits = 0 # Xet transfer bytes, summed across (possibly concurrent) upload-commits self._xet_bytes = 0 self._speed_ema = 0.0 self._prev_bytes = 0 self._prev_time: float | None = None # -- lifecycle (main thread) ------------------------------------------------ def start(self) -> None: if not self._active: return if self._tty: sys.stderr.write(f"Found {self._total:,} files to upload\n") sys.stderr.flush() else: logger.info(f"Found {self._total:,} files to upload") self._thread = threading.Thread(target=self._render_loop, name="hf-upload-display", daemon=True) self._thread.start() def close(self) -> None: if self._thread is not None: self._stop_event.set() self._thread.join() if self._tty: with self._lock: self._redraw() # final state # -- counter updates (coordinator / committer / xet callback threads) -------- def notify_prepared(self, n: int) -> None: with self._lock: self._prepared += n def notify_ignored(self, n: int) -> None: with self._lock: self._ignored += n def notify_xet_registered(self, n: int) -> None: with self._lock: self._xet_total += n def notify_xet_uploaded(self, names: list[str]) -> None: with self._lock: self._xet_done.update(names) def notify_skipped(self, n: int) -> None: with self._lock: self._committed += n def notify_commit(self, n_files: int) -> None: with self._lock: self._committed += n_files self._nb_commits += 1 def new_xet_callback(self) -> "Callable | None": """Progress callback for one ``new_upload_commit``. The byte counters in ``group_report`` are cumulative *per upload-commit* and several upload-commits can be in flight at once (one finalizing, one filling), so each commit gets its own closure tracking its own previous value; increments are summed globally. """ if not self._active: return None prev = 0 def callback(group_report: Any, item_reports: Any) -> None: nonlocal prev with self._lock: completed = group_report.total_transfer_bytes_completed self._xet_bytes += max(0, completed - prev) prev = completed for item in item_reports.values(): if item.total_bytes > 0 and item.bytes_completed == item.total_bytes: self._xet_done.add(item.item_name) return callback # -- rendering (display thread) ---------------------------------------------- def _render_loop(self) -> None: last_log = 0.0 while not self._stop_event.wait(_REFRESH_INTERVAL): with self._lock: self._update_speed() if self._tty: self._redraw() elif time.monotonic() - last_log >= _NON_TTY_LOG_INTERVAL: logger.info(self._summary()) last_log = time.monotonic() def _update_speed(self) -> None: now = time.monotonic() if self._prev_time is not None and now > self._prev_time: rate = (self._xet_bytes - self._prev_bytes) / (now - self._prev_time) self._speed_ema = rate if self._speed_ema == 0 else 0.3 * rate + 0.7 * self._speed_ema self._prev_time = now self._prev_bytes = self._xet_bytes def _redraw(self) -> None: if self._drawn: sys.stderr.write(f"\033[{self._N_LINES}A") width = shutil.get_terminal_size().columns for line in (self._line_preparing(), self._line_uploading(), self._line_committing()): truncated = line[: width - 4] + "..." if len(line) > width - 1 else line sys.stderr.write(f"\r\033[K{truncated}\n") sys.stderr.flush() self._drawn = True def _line_preparing(self) -> str: done = " ✓" if self._prepared >= self._total else "" return f" Preparing {_bar(self._prepared, self._total)} {self._prepared:,} / {self._total:,}{done}" def _line_uploading(self) -> str: if self._xet_total == 0: bar = _bar(1, 1) if self._prepared >= self._total else _bar(0, 1) return f" Uploading {bar} -" n_done = len(self._xet_done) parts = [] if self._xet_bytes > 0: parts.append(_format_bytes(self._xet_bytes)) if self._speed_ema > 0: parts.append(f"{_format_bytes(self._speed_ema)}/s") extra = f" {' · '.join(parts)}" if parts else "" done = " ✓" if self._prepared >= self._total and n_done >= self._xet_total else "" return f" Uploading {_bar(n_done, self._xet_total)} {n_done:,} / {self._xet_total:,} files{extra}{done}" def _line_committing(self) -> str: effective = self._total - self._ignored commits_str = f" {self._nb_commits} commits" if self._nb_commits > 1 else "" done = " ✓" if self._committed >= effective > 0 else "" return ( f" Committing {_bar(self._committed, effective)} {self._committed:,} / {effective:,}{commits_str}{done}" ) def _summary(self) -> str: return ( f"Uploading... {self._prepared:,}/{self._total:,} files checked, " f"{len(self._xet_done):,}/{self._xet_total:,} uploaded ({_format_bytes(self._xet_bytes)} transferred), " f"{self._committed:,} committed in {self._nb_commits} commit(s)" ) class _CommitPacer: """Adaptive number of files per commit, to stay below server-side commit timeouts.""" def __init__(self) -> None: self._index = INITIAL_COMMIT_SIZE_INDEX @property def target(self) -> int: return COMMIT_SIZE_SCALE[self._index] def record_success(self, duration: float, nb_files: int) -> None: if duration < TARGET_COMMIT_DURATION and nb_files >= self.target: self._index = min(self._index + 1, len(COMMIT_SIZE_SCALE) - 1) elif duration > TARGET_COMMIT_DURATION: self._index = max(self._index - 1, 0) def record_failure(self) -> None: self._index = max(self._index - 1, 0) class _Batch: """A group of files destined to a single git commit, with their in-flight xet uploads.""" def __init__(self) -> None: self.ops: list[CommitOperationAdd] = [] self.regular_bytes: int = 0 self.xet_commit: Any = None # XetUploadCommit, opened lazily self.handles: list[tuple[CommitOperationAdd, Any]] = [] # (op, XetFileUpload) self.created_at: float = time.monotonic() class _UploadPipeline: def __init__( self, api: "HfApi", *, repo_id: str, repo_type: str, add_operations: list[CommitOperationAdd], delete_operations: list[CommitOperationDelete], commit_message: str, commit_description: str | None, token: str | bool | None, revision: str | None, create_pr: bool, parent_commit: str | None, ) -> None: self.api = api self.repo_id = repo_id self.repo_type = repo_type self.add_operations = add_operations self.delete_operations = delete_operations self.commit_message = commit_message self.commit_description = commit_description self.token = token self.headers = api._build_hf_headers(token=token) self.revision = revision or constants.DEFAULT_REVISION self.create_pr = create_pr self.parent_commit = parent_commit # The base revision is used by the coordinator for ALL preupload calls and the xet token # refresh URL, with the `create_pr` flag — exactly like `create_commit` does. It never # changes during the run, even after a PR has been created. self.base_revision_quoted = quote(self.revision, safe="") # Committer state (mutated by the committer thread only) self.commit_revision_quoted = self.base_revision_quoted # switched to the PR ref once created self.pr_url: str | None = None self.pr_revision: str | None = None self.nb_commits = 0 self.last_commit_info: "CommitInfo | None" = None self.pacer = _CommitPacer() # Pipeline plumbing self.batch_queue: queue.Queue = queue.Queue(maxsize=1) self.errors: list[BaseException] = [] self.abort_event = threading.Event() self.display = _LiveDisplay(total_files=len(add_operations), enabled=not are_progress_bars_disabled()) # All xet uploads share the same token refresh URL. With `create_pr`, the final ref is not # known in advance: `?create_pr=1` makes the server grant a token valid for PR refs. refresh_url = xet_connection_info_refresh_url( token_type=XetTokenType.WRITE, repo_id=repo_id, repo_type=repo_type, revision=self.base_revision_quoted, endpoint=api.endpoint, ) if create_pr: refresh_url += "?create_pr=1" self.xet_session = get_xet_session() self.xet_commit_kwargs = { "token_refresh_url": refresh_url, "token_refresh_headers": self.headers, "custom_headers": xet_headers_without_auth(self.headers), } # `.gitignore` rules are enforced server-side: forward the local one if it's being uploaded. self.gitignore_content: str | None = None for op in add_operations: if op.path_in_repo == ".gitignore": with op.as_file() as f: self.gitignore_content = f.read().decode() break def run(self) -> "CommitInfo": _warn_on_overwriting_operations([*self.delete_operations, *self.add_operations]) committer = threading.Thread(target=self._committer_loop, name="hf-upload-committer", daemon=True) committer.start() self.display.start() try: self._coordinator_loop() except BaseException: self.abort_event.set() abort_xet_session() raise finally: if self.abort_event.is_set(): # The committer exits on its own once the queue is drained (see `_committer_loop`). # Bound the wait so a xet call blocked on the (aborted) session can never hang the # shutdown — the committer is a daemon thread. committer.join(timeout=10) else: self.batch_queue.put(_SENTINEL) committer.join() self.display.close() if self.abort_event.is_set() and self.pr_revision is not None: logger.warning( f"Upload to pull request {self.pr_url} did not complete. To resume into the" f' same PR, re-run with `revision="{self.pr_revision}"` (without `create_pr=True`). Re-running' " with `create_pr=True` would open a new pull request." ) if self.errors: raise self.errors[0] return self._final_commit_info() # ---------------------------------------------------------------- coordinator def _coordinator_loop(self) -> None: import hf_xet batch = _Batch() for start in range(0, len(self.add_operations), PREUPLOAD_BATCH_SIZE): if self.abort_event.is_set(): self._abort_batch(batch) return chunk = self.add_operations[start : start + PREUPLOAD_BATCH_SIZE] try: _fetch_upload_modes( additions=chunk, repo_type=self.repo_type, repo_id=self.repo_id, headers=self.headers, revision=self.base_revision_quoted, endpoint=self.api.endpoint, create_pr=self.create_pr, gitignore_content=self.gitignore_content, ) except RepositoryNotFoundError as e: from .hf_api import _CREATE_COMMIT_NO_REPO_ERROR_MESSAGE e.append_to_message(_CREATE_COMMIT_NO_REPO_ERROR_MESSAGE) raise self.display.notify_prepared(len(chunk)) for op in chunk: if op._should_ignore: logger.debug(f"Skipping upload for '{op.path_in_repo}' (ignored by gitignore rules).") self.display.notify_ignored(1) continue if op._upload_mode == "regular": batch.regular_bytes += op.upload_info.size else: if batch.xet_commit is None: batch.xet_commit = self.xet_session.new_upload_commit( progress_callback=self.display.new_xet_callback(), **self.xet_commit_kwargs ) # Upload starts immediately in the background. sha256 is computed by hf_xet # while chunking, unless already known (e.g. resumed operations). sha_arg = op.upload_info.sha256.hex() if op.upload_info.is_hashed else hf_xet.COMPUTE_SHA256 if isinstance(op.path_or_fileobj, bytes): handle = batch.xet_commit.start_upload_bytes( op.path_or_fileobj, sha256=sha_arg, name=op.path_in_repo ) else: handle = batch.xet_commit.start_upload_file(str(op.path_or_fileobj), sha256=sha_arg) batch.handles.append((op, handle)) self.display.notify_xet_registered(1) batch.ops.append(op) if ( len(batch.ops) >= self.pacer.target or batch.regular_bytes >= REGULAR_CONTENT_BYTES_BUDGET or (time.monotonic() - batch.created_at > MAX_COMMIT_INTERVAL and len(batch.ops) > 0) ): self._enqueue(batch) batch = _Batch() self._enqueue(batch) def _enqueue(self, batch: _Batch) -> None: if len(batch.ops) == 0 and not (self.nb_commits == 0 and len(self.delete_operations) > 0): return # Blocks if a batch is already waiting: natural backpressure on scanning/uploading. while not self.abort_event.is_set(): try: self.batch_queue.put(batch, timeout=1.0) return except queue.Full: continue self._abort_batch(batch) def _abort_batch(self, batch: _Batch) -> None: if batch.xet_commit is not None: try: batch.xet_commit.abort() except Exception: pass # ---------------------------------------------------------------- committer def _committer_loop(self) -> None: while True: try: batch = self.batch_queue.get(timeout=0.5) except queue.Empty: if self.abort_event.is_set(): return # aborted: exit once the queue is drained, no sentinel needed continue if batch is _SENTINEL: return try: if not self.abort_event.is_set(): self._process_batch(batch) else: self._abort_batch(batch) except BaseException as e: self._abort_batch(batch) self.errors.append(e) self.abort_event.set() def _process_batch(self, batch: _Batch) -> None: # 1. Wait for all xet uploads of this batch and finalize them (atomic xet commit). Files # can only be referenced by a git commit once their xet upload-commit is finalized. if batch.xet_commit is not None: batch.xet_commit.wait_to_finish() for op, handle in batch.handles: if not op.upload_info.is_hashed: op.upload_info.sha256 = bytes.fromhex(handle.result().xet_info.sha256) op._is_uploaded = True # Files whose last progress tick was missed are still done at this point. self.display.notify_xet_uploaded( [ str(op.path_or_fileobj) if not isinstance(op.path_or_fileobj, bytes) else op.path_in_repo for op, _ in batch.handles ] ) # 2. Drop files that have not changed compared to the remote (prevents empty commits). # Their chunks were deduplicated anyway (~0 bytes transferred). ops_to_commit = [] for op in batch.ops: if op._remote_oid is not None and op._remote_oid == op._local_oid: logger.debug(f"Skipping commit for '{op.path_in_repo}' (file unchanged).") self.display.notify_skipped(1) continue ops_to_commit.append(op) # 3. Create the git commit(s). On failure, scale down and split the batch. if len(ops_to_commit) > 0 or (self.nb_commits == 0 and len(self.delete_operations) > 0): self._commit_with_split(ops_to_commit) def _commit_with_split(self, ops: list[CommitOperationAdd]) -> None: try: self._do_commit(ops) except Exception as e: self.pacer.record_failure() if len(ops) <= COMMIT_SIZE_SCALE[0]: raise logger.warning(f"Commit of {len(ops)} files failed ({e!r}). Retrying in smaller chunks.") target = self.pacer.target for start in range(0, len(ops), target): self._commit_with_split(ops[start : start + target]) def _do_commit(self, ops: list[CommitOperationAdd]) -> None: if self.create_pr and self.pr_revision is None: # Create the (draft) pull request explicitly and push every commit to its ref. Committing # with `?create_pr=1` instead would risk opening a second PR if the commit POST is retried # after a lost response. Created lazily so that a fully-unchanged upload opens no PR. # Note: PRs created this way are always opened against the default branch, hence the # `create_pr` + `revision` combination being rejected in `upload_folder`. pr = self.api.create_pull_request( repo_id=self.repo_id, title=self.commit_message, token=self.token, description=self.commit_description, repo_type=self.repo_type, ) if pr.git_reference is None: raise ValueError("Server did not return a git reference for the created pull request.") self.pr_url = pr.url self.pr_revision = pr.git_reference self.commit_revision_quoted = quote(pr.git_reference, safe="") operations: list[Any] = list(ops) if self.nb_commits == 0: # Deletions and `parent_commit` ride the first commit. operations = list(self.delete_operations) + operations commit_message = ( self.commit_message if self.nb_commits == 0 else f"{self.commit_message} (part {self.nb_commits + 1})" ) t0 = time.monotonic() # Retried with backoff on transient errors: safe because the commit targets an explicit # ref (`?create_pr=1` is never used, see above). self.last_commit_info = _send_commit( operations=operations, files_to_copy={}, commit_message=commit_message, commit_description=self.commit_description or "", repo_type=self.repo_type, repo_id=self.repo_id, headers=self.headers, revision=self.commit_revision_quoted, endpoint=self.api.endpoint, parent_commit=self.parent_commit if self.nb_commits == 0 else None, retry_on_error=True, ) duration = time.monotonic() - t0 self.pacer.record_success(duration, len(ops)) self.nb_commits += 1 for op in ops: op._is_committed = True self.display.notify_commit(len(ops)) logger.debug(f"Committed {len(ops)} file(s) in {duration:.1f}s: {self.last_commit_info.commit_url}") # ---------------------------------------------------------------- result def _final_commit_info(self) -> "CommitInfo": from .hf_api import CommitInfo if self.last_commit_info is None: # Nothing was committed (everything unchanged/ignored): mimic `create_commit` and # return info about the latest commit on the target revision. logger.warning("No files have been modified since last commit. Skipping to prevent empty commit.") info = self.api.repo_info(repo_id=self.repo_id, repo_type=self.repo_type, revision=self.revision) url_prefix = self.api.endpoint if self.repo_type != constants.REPO_TYPE_MODEL: url_prefix = f"{url_prefix}/{self.repo_type}s" return CommitInfo( commit_url=f"{url_prefix}/{self.repo_id}/commit/{info.sha}", commit_message=self.commit_message, commit_description=self.commit_description or "", oid=info.sha, # type: ignore _endpoint=self.api.endpoint, ) if self.nb_commits > 1: logger.info(f"Upload completed in {self.nb_commits} commits.") if self.pr_url is not None: # PR upload: attach the PR info (commit responses don't carry it; the PR is created separately). return CommitInfo( commit_url=self.last_commit_info.commit_url, commit_message=self.last_commit_info.commit_message, commit_description=self.last_commit_info.commit_description, oid=self.last_commit_info.oid, pr_url=self.pr_url, _endpoint=self.api.endpoint, ) return self.last_commit_info def pipelined_upload( api: "HfApi", *, repo_id: str, repo_type: str, add_operations: list[CommitOperationAdd], delete_operations: list[CommitOperationDelete], commit_message: str, commit_description: str | None = None, token: str | bool | None = None, revision: str | None = None, create_pr: bool = False, parent_commit: str | None = None, ) -> "CommitInfo": """Upload a prepared list of operations through the streamed multi-commit pipeline. Requires `hf_xet` to be installed. See module docstring for the architecture. """ return _UploadPipeline( api, repo_id=repo_id, repo_type=repo_type, add_operations=add_operations, delete_operations=delete_operations, commit_message=commit_message, commit_description=commit_description, token=token, revision=revision, create_pr=create_pr, parent_commit=parent_commit, ).run()