Files
MoFin/venv/lib/python3.12/site-packages/huggingface_hub/cli/_cp.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

253 lines
10 KiB
Python

# Copyright 2026-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shared ``cp`` command to copy files between local paths, repositories and buckets.
This single command backs three identical CLI entry points: ``hf cp`` (top-level),
``hf repos cp`` and ``hf buckets cp``. It supports any source/destination combination
of local file, repo/bucket ``hf://`` URI, and ``-`` (stdin/stdout), with two exceptions:

- bucket-to-repo copies are not supported (server limitation), and
- local-to-local copies are not supported (use a regular ``cp`` for that).
"""
import os
import sys
from dataclasses import replace
from typing import Annotated, Literal
import typer
from huggingface_hub import HfApi
from huggingface_hub.errors import CLIError
from huggingface_hub.utils import HfUri, SoftTemporaryDirectory, disable_progress_bars, is_hf_uri, parse_hf_uri
from ._cli_utils import TokenOpt, get_hf_api
from ._output import out
# Example invocations of the ``cp`` command, grouped by transfer direction.
CP_EXAMPLES = (
    # Download (repo or bucket -> local / stdout)
    [
        "hf cp hf://username/my-model/config.json",
        "hf cp hf://username/my-model/config.json ./config.json",
        "hf cp hf://datasets/username/my-dataset/data.csv ./data/",
        "hf cp hf://buckets/username/my-bucket/config.json -",
    ]
    # Upload (local / stdin -> repo or bucket)
    + [
        "hf cp ./model.safetensors hf://username/my-model/model.safetensors",
        "hf cp ./config.json hf://buckets/username/my-bucket/logs/",
        "hf cp - hf://buckets/username/my-bucket/config.json",
    ]
    # Remote to remote (repo/bucket -> repo/bucket, server-side when possible)
    + [
        "hf cp hf://username/source-model/ hf://username/dest-model/",
        "hf cp hf://datasets/username/my-dataset/processed/ hf://buckets/username/my-bucket/processed/",
        "hf cp hf://buckets/username/my-bucket/logs/ hf://buckets/username/archive-bucket/ # copies contents only",
    ]
)

# Name of the alias that registered the command. It is only used to restrict which kind of
# remote endpoint (repository or bucket) the command accepts (see `_enforce_context`).
CpContext = Literal["repos", "buckets"]
def make_cp(context: CpContext | None = None):
    """Build the ``cp`` command function for a given alias.

    The three entry points (`hf cp`, `hf repos cp`, `hf buckets cp`) share the exact same logic;
    'context' only adds a guardrail on the remote endpoint type (see `_enforce_context`).

    Args:
        context: ``"repos"`` or ``"buckets"`` to restrict the remote endpoint to that kind, or
            ``None`` (the default) to accept any combination.

    Returns:
        The ``cp`` function, which closes over ``context``. It validates the endpoints with
        `_enforce_context` and then performs the copy with `_run_cp`.
    """

    # NOTE(review): the signature and docstring of `cp` are presumably introspected by typer to
    # build the CLI arguments and help text -- treat both as user-facing; confirm against the
    # place where the command is registered.
    def cp(
        src: Annotated[
            str,
            typer.Argument(help="Source: local file, hf:// URI (repo or bucket), or - for stdin."),
        ],
        dst: Annotated[
            str | None,
            typer.Argument(help="Destination: local path, hf:// URI (repo or bucket), or - for stdout."),
        ] = None,
        token: TokenOpt = None,
    ) -> None:
        """Copy files between local paths, repositories, and buckets.
        Handles uploads (local/stdin -> repo/bucket), downloads (repo/bucket -> local/stdout) and
        remote-to-remote copies (repo/bucket -> repo/bucket). Bucket-to-repo and local-to-local
        copies are not supported. For directories, use `hf upload`/`hf download` (repos) or
        `hf buckets sync` (buckets). Remote-to-remote copies only work within the same storage
        region (https://huggingface.co/docs/hub/storage-regions).
        """
        # Reject endpoints that do not match the alias before any transfer is attempted.
        _enforce_context(context, src, dst)
        _run_cp(src, dst, token)

    return cp
def _enforce_context(context: CpContext | None, src: str, dst: str | None) -> None:
    """Reject endpoints that do not match the alias the command was invoked through.

    `hf repos cp` and `hf buckets cp` run the same code as `hf cp`, so without this check the
    repo alias could operate on a bucket and the bucket alias on a repository. Only one side is
    inspected: the destination when it is an ``hf://`` URI (uploads and remote-to-remote
    copies), otherwise the source (downloads to a local path or stdout). When neither side is
    an ``hf://`` URI nothing is checked here.

    Args:
        context: ``"repos"``, ``"buckets"``, or ``None`` for the unrestricted top-level `hf cp`.
        src: Source argument as typed by the user.
        dst: Destination argument as typed by the user, if any.

    Raises:
        CLIError: If the inspected endpoint is a bucket under ``"repos"``, or is not a bucket
            under ``"buckets"``.
    """
    if context is None:
        return

    # Pick the endpoint to validate: destination first, source as the fallback.
    if dst is not None and is_hf_uri(dst):
        endpoint = dst
    elif is_hf_uri(src):
        endpoint = src
    else:
        return

    if context == "repos":
        if parse_hf_uri(endpoint).is_bucket:
            raise CLIError("`hf repos cp` only works with repositories. Use `hf cp` or `hf buckets cp` for buckets.")
    elif context == "buckets":
        if not parse_hf_uri(endpoint).is_bucket:
            raise CLIError("`hf buckets cp` only works with buckets. Use `hf cp` or `hf repos cp` for repositories.")
def _run_cp(src: str, dst: str | None, token: str | None) -> None:
    """Dispatch a copy to the matching transfer routine.

    The direction is derived from which sides are ``hf://`` URIs:

    - both sides remote: `api.copy_files`;
    - only the source remote: download to stdout (``dst == "-"``) or to a local path;
    - only the destination remote: upload from a local file or stdin (``src == "-"``).

    Args:
        src: Source argument (local path, ``hf://`` URI or ``-``).
        dst: Destination argument (local path, ``hf://`` URI, ``-``) or ``None``.
        token: Token forwarded to `get_hf_api`.

    Raises:
        typer.BadParameter: If neither side is an ``hf://`` URI.
    """
    api = get_hf_api(token=token)

    remote_source = is_hf_uri(src)
    remote_target = dst is not None and is_hf_uri(dst)

    # At least one side has to be remote (rules out local->local, stdin->local, etc.).
    if not (remote_source or remote_target):
        if dst is None:
            raise typer.BadParameter("Missing destination. Provide a repo or bucket hf:// URI as DST.")
        raise typer.BadParameter(
            "One of SRC or DST must be a repo (hf://username/...) or bucket (hf://buckets/...) URI."
        )

    # Remote to remote: repo/bucket -> repo/bucket.
    if remote_source and remote_target:
        assert dst is not None  # implied by remote_target
        api.copy_files(src, dst)
        out.result("Copied", src=src, dst=dst)
        return

    # Download: repo/bucket -> stdout or local file.
    if remote_source:
        if dst == "-":
            _download_file_to_stdout(api, src)
        else:
            _download_file_to_local(api, src, dst)
        return

    # Upload: local file or stdin -> repo/bucket.
    assert dst is not None  # only remote_target can be true at this point
    _upload_file_to_remote(api, src, dst, src_is_stdin=(src == "-"))
def _download_file_to_stdout(api: HfApi, src: str) -> None:
    """Download one remote file and write its bytes to standard output.

    The file is first downloaded into a temporary directory and then copied to
    ``sys.stdout.buffer`` in chunks.

    Args:
        api: Client used for the download.
        src: ``hf://`` URI of the file; it must name a file (see `_source_filename`).
    """
    uri = parse_hf_uri(src)
    basename = _source_filename(uri, src)

    # Progress bars stay disabled for the whole operation so they cannot pollute piped output.
    with disable_progress_bars(), SoftTemporaryDirectory() as scratch_dir:
        staged_path = os.path.join(scratch_dir, basename)
        _download_single(api, uri, staged_path)
        with open(staged_path, "rb") as staged:
            # Copy in 32MB chunks; `read` returns b"" at end of file, which ends the loop.
            for block in iter(lambda: staged.read(32_000_000), b""):
                sys.stdout.buffer.write(block)
def _download_file_to_local(api: HfApi, src: str, dst: str | None) -> None:
    """Download one remote file to the local filesystem and report the result.

    The local target is resolved as follows:

    - no ``dst``: the remote file name, relative to the current directory;
    - ``dst`` is an existing directory or ends with a path separator: the remote file name
      inside ``dst``;
    - anything else: ``dst`` itself.

    Missing parent directories of the target are created.

    Args:
        api: Client used for the download.
        src: ``hf://`` URI of the file; it must name a file (see `_source_filename`).
        dst: Local destination, or ``None``.
    """
    uri = parse_hf_uri(src)
    basename = _source_filename(uri, src)

    if dst is None:
        target = basename
    else:
        into_directory = os.path.isdir(dst) or dst.endswith((os.sep, "/"))
        target = os.path.join(dst, basename) if into_directory else dst

    target_parent = os.path.dirname(target)
    if target_parent:
        os.makedirs(target_parent, exist_ok=True)

    _download_single(api, uri, target)
    out.result("Downloaded", src=src, dst=target)
def _download_single(api: HfApi, uri: HfUri, local_path: str) -> None:
    """Fetch one file, from a repository or a bucket, into ``local_path``.

    Shared by `_download_file_to_local` and `_download_file_to_stdout`.

    Args:
        api: Client used for the download.
        uri: Parsed ``hf://`` URI of the file.
        local_path: Path the file must end up at.
    """
    if uri.is_bucket:
        api.download_bucket_files(uri.id, [(uri.path_in_repo, local_path)], raise_on_missing_files=True)
        return

    # Repository file: stage it in a temporary folder created next to the destination instead
    # of the shared cache, so the final move stays on one filesystem and is instant. The
    # folder is removed automatically when the `with` block exits.
    staging_root = os.path.dirname(local_path) or "."
    with SoftTemporaryDirectory(prefix=".tmp", dir=staging_root) as staging_dir:
        fetched_path = api.hf_hub_download(
            repo_id=uri.id,
            repo_type=uri.type,
            filename=uri.path_in_repo,
            revision=uri.revision,
            local_dir=staging_dir,
        )
        os.replace(fetched_path, local_path)
def _source_filename(uri: HfUri, src: str) -> str:
    """Return the last path component of the remote source.

    Args:
        uri: Parsed form of ``src``.
        src: Source URI as typed by the user; a trailing ``/`` marks it as a directory.

    Returns:
        Everything after the last ``/`` of ``uri.path_in_repo``.

    Raises:
        typer.BadParameter: If the URI has no path inside the repo/bucket, or ``src`` ends
            with ``/``.
    """
    names_a_file = uri.path_in_repo != "" and not src.endswith("/")
    if names_a_file:
        return uri.path_in_repo.rpartition("/")[2]

    raise typer.BadParameter(
        "Source path must include a file name, not just a repo/bucket or directory path."
        " Use `hf download` or `hf buckets sync` to copy directories."
    )
def _upload_file_to_remote(api: HfApi, src: str, dst: str, *, src_is_stdin: bool) -> None:
    """Upload one local file, or the whole of stdin, to a repository or bucket.

    For a local file the remote path is resolved as follows:

    - ``dst`` has no path inside the repo/bucket: the local file name;
    - ``dst`` ends with ``/``: the local file name under that path;
    - anything else: the path given in ``dst``.

    Args:
        api: Client used for the upload.
        src: Local file path (ignored when ``src_is_stdin`` is true).
        dst: Destination ``hf://`` URI.
        src_is_stdin: Read the payload from ``sys.stdin.buffer`` instead of ``src``.

    Raises:
        typer.BadParameter: If a stdin upload has no full destination file path, if ``src``
            is a directory, or if ``src`` is not an existing file.
    """
    uri = parse_hf_uri(dst)

    if src_is_stdin:
        # Stdin has no file name to fall back on, so the destination must spell it out.
        has_filename = uri.path_in_repo != "" and not dst.endswith("/")
        if not has_filename:
            raise typer.BadParameter("Stdin upload requires a full destination path including filename.")
        payload = sys.stdin.buffer.read()
        _upload_single(api, uri, payload, uri.path_in_repo)
        out.result("Uploaded", src="stdin", dst=uri.to_uri())
        return

    if os.path.isdir(src):
        raise typer.BadParameter(
            "Source must be a file, not a directory. Use `hf upload` or `hf buckets sync` for directories."
        )
    if not os.path.isfile(src):
        raise typer.BadParameter(f"Source file not found: {src}")

    # Start from the path given in the URI and complete it with the local file name if needed.
    remote_path = uri.path_in_repo
    if remote_path == "":
        remote_path = os.path.basename(src)
    elif dst.endswith("/"):
        remote_path = f"{remote_path}/{os.path.basename(src)}"

    _upload_single(api, uri, src, remote_path)
    uploaded_uri = replace(uri, path_in_repo=remote_path)
    out.result("Uploaded", src=src, dst=uploaded_uri.to_uri())
def _upload_single(api: HfApi, uri: HfUri, source: str | bytes, remote_path: str) -> None:
    """Send one payload to ``remote_path`` in the repository or bucket designated by ``uri``.

    Args:
        api: Client used for the upload.
        uri: Parsed destination URI; selects the bucket or repository API.
        source: Local file path, or raw bytes.
        remote_path: Path of the file inside the repository or bucket.
    """
    if uri.is_bucket:
        api.batch_bucket_files(uri.id, add=[(source, remote_path)])
        return

    api.upload_file(
        path_or_fileobj=source,
        path_in_repo=remote_path,
        repo_id=uri.id,
        repo_type=uri.type,
        revision=uri.revision,
    )