Files
MoFin/venv/lib/python3.12/site-packages/huggingface_hub/cli/jobs.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

1309 lines
44 KiB
Python

# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains commands to interact with jobs on the Hugging Face Hub.
Usage:
# run a job
hf jobs run <image> <command>
# List running or completed jobs
hf jobs ls [-a] [-f key=value]
# Print logs from a job (non-blocking)
hf jobs logs <job-id>
# Stream logs from a job (blocking, like `docker logs -f`)
hf jobs logs -f <job-id>
# Stream resources usage stats and metrics from a job
hf jobs stats <job-id>
# Inspect detailed information about a job
hf jobs inspect <job-id>
# Cancel a running job
hf jobs cancel <job-id>
# Wait until one or more jobs finish
hf jobs wait <job-id> [<job-id>...]
# List available hardware options
hf jobs hardware
# Run a UV script
hf jobs uv run <script>
# Schedule a job
hf jobs scheduled run <schedule> <image> <command>
# List scheduled jobs
hf jobs scheduled ls [-a] [-f key=value]
# Inspect a scheduled job
hf jobs scheduled inspect <scheduled_job_id>
# Suspend a scheduled job
hf jobs scheduled suspend <scheduled_job_id>
# Resume a scheduled job
hf jobs scheduled resume <scheduled_job_id>
# Delete a scheduled job
hf jobs scheduled delete <scheduled_job_id>
"""
import itertools
import multiprocessing
import multiprocessing.pool
import shutil
import time
from collections.abc import Callable, Iterable
from fnmatch import fnmatch
from queue import Empty, Queue
from typing import Annotated, Any, TypeVar
from urllib.parse import urlsplit
import typer
from huggingface_hub import HfApi, JobHardware, JobInfo, JobStage
from huggingface_hub._jobs_api import TERMINAL_JOB_STAGES
from huggingface_hub.errors import CLIError
from huggingface_hub.utils import logging
from huggingface_hub.utils._cache_manager import _format_size
from huggingface_hub.utils._parsing import format_duration, parse_duration
from ._cli_utils import (
EnvFileOpt,
EnvOpt,
SecretsFileOpt,
SecretsOpt,
SoftChoice,
SshDryRunOpt,
SshIdentityFileOpt,
TokenOpt,
VolumesOpt,
exec_ssh,
get_hf_api,
parse_env_map,
parse_volumes,
typer_factory,
)
from ._output import _dataclass_to_dict, out
logger = logging.get_logger(__name__)
def _parse_namespace_from_job_id(job_id: str, namespace: str | None) -> tuple[str, str | None]:
"""Extract namespace from job_id if provided in 'namespace/job_id' format.
Allows users to pass job IDs copied from the Hub UI (e.g. 'username/job_id')
instead of only bare job IDs. If the namespace is also provided explicitly via
--namespace and conflicts, a CLIError is raised.
"""
if not job_id:
raise CLIError("Job ID cannot be empty.")
if job_id.count("/") > 1:
raise CLIError(f"Job ID must be in the form 'job_id' or 'namespace/job_id': '{job_id}'.")
if "/" not in job_id:
return job_id, namespace
extracted_namespace, parsed_job_id = job_id.split("/", 1)
if not extracted_namespace or not parsed_job_id:
raise CLIError(f"Job ID must be in the form 'job_id' or 'namespace/job_id': '{job_id}'.")
if namespace is not None and namespace != extracted_namespace:
raise CLIError(
f"Conflicting namespace: got --namespace='{namespace}' but job ID implies namespace='{extracted_namespace}'"
)
return parsed_job_id, extracted_namespace
# Minimum delay, in seconds, between two redraws of the `hf jobs stats` table.
STATS_UPDATE_MIN_INTERVAL = 0.1  # we set a limit here since there is one update per second per job

# Common job-related options.
# Each alias bundles a parameter type with its Typer argument/option declaration so that the
# command signatures in this module can share the same flags and help texts.

# Positional Docker image.
ImageArg = Annotated[
    str,
    typer.Argument(
        help="The Docker image to use.",
    ),
]
# Optional image override (the help text requires `uv` to be installed in it).
ImageOpt = Annotated[
    str | None,
    typer.Option(
        help="Use a custom Docker image with `uv` installed.",
    ),
]
# Hardware flavor; `SoftChoice(JobHardware)` is the click type used for parsing/completion.
FlavorOpt = Annotated[
    str | None,
    typer.Option(
        help="Flavor for the hardware. Run 'hf jobs hardware' to list available flavors. Defaults to `cpu-basic`.",
        click_type=SoftChoice(JobHardware),
    ),
]
# Repeatable `-l/--label` flag.
LabelsOpt = Annotated[
    list[str] | None,
    typer.Option(
        "-l",
        "--label",
        help="Set labels. E.g. --label KEY=VALUE or --label LABEL",
    ),
]
# Max duration, kept as a string here and forwarded as-is by the run commands.
TimeoutOpt = Annotated[
    str | None,
    typer.Option(
        help="Max duration: int/float with s (seconds, default), m (minutes), h (hours) or d (days).",
    ),
]
# `-d/--detach`: return right after the job is started instead of streaming its logs.
DetachOpt = Annotated[
    bool,
    typer.Option(
        "-d",
        "--detach",
        help="Run the Job in the background and print the Job ID.",
    ),
]
# Namespace owning the job.
NamespaceOpt = Annotated[
    str | None,
    typer.Option(
        help="The namespace where the job will be running. Defaults to the current user's namespace.",
    ),
]
# Repeatable `--expose` flag listing container ports.
ExposeOpt = Annotated[
    list[int] | None,
    typer.Option(
        "--expose",
        help="Expose a container port through the jobs proxy. Repeat the flag for multiple ports (e.g. `--expose 8000 --expose 8001`). Each exposed port is reachable on the public jobs domain; access requires an HF token with read access to the job's namespace.",
    ),
]
# `--ssh` switch.
SshEnabledOpt = Annotated[
    bool,
    typer.Option(
        "--ssh",
        help="Make the job's container reachable over SSH. Connect with `hf jobs ssh <job_id>`. Requires an SSH public key registered on https://huggingface.co/settings/keys.",
    ),
]
# Repeatable `--with` flag (extra packages for UV runs).
WithOpt = Annotated[
    list[str] | None,
    typer.Option(
        "--with",
        help="Run with the given packages installed",
    ),
]
# `-p/--python` interpreter selection for UV runs.
PythonOpt = Annotated[
    str | None,
    typer.Option(
        "-p",
        "--python",
        help="The Python interpreter to use for the run environment",
    ),
]
# Tri-state flag (None means "not specified") for scheduled jobs.
SuspendOpt = Annotated[
    bool | None,
    typer.Option(
        help="Suspend (pause) the scheduled Job",
    ),
]
# Tri-state flag (None means "not specified") for scheduled jobs.
ConcurrencyOpt = Annotated[
    bool | None,
    typer.Option(
        help="Allow multiple instances of this Job to run concurrently",
    ),
]
# Positional schedule expression.
ScheduleArg = Annotated[
    str,
    typer.Argument(
        help="One of annually, yearly, monthly, weekly, daily, hourly, or a CRON schedule expression.",
    ),
]
# Positional UV script location.
ScriptArg = Annotated[
    str,
    typer.Argument(
        help="UV script to run (local file or URL)",
    ),
]
# Remaining positional arguments forwarded to the UV script.
ScriptArgsArg = Annotated[
    list[str] | None,
    typer.Argument(
        help="Arguments for the script",
    ),
]
# Remaining positional arguments forming the command to run.
CommandArg = Annotated[
    list[str],
    typer.Argument(
        help="The command to run.",
    ),
]
# Single job reference.
JobIdArg = Annotated[
    str,
    typer.Argument(
        help="Job ID (or 'namespace/job_id')",
    ),
]
# Optional list of job references.
JobIdsArg = Annotated[
    list[str] | None,
    typer.Argument(
        help="Job IDs (or 'namespace/job_id')",
    ),
]
# Single scheduled-job reference.
ScheduledJobIdArg = Annotated[
    str,
    typer.Argument(
        help="Scheduled Job ID (or 'namespace/scheduled_job_id')",
    ),
]

# Command group on which the job commands of this module register themselves.
jobs_cli = typer_factory(help="Run and manage Jobs on the Hub.")
def _stream_logs_and_check_status(api: HfApi, job: JobInfo) -> None:
    """Print a Job's logs as they arrive, then report how the Job ended.

    Once the log stream is exhausted, the Job is waited on to obtain its final state.

    Raises:
        CLIError: if the final stage is anything other than `JobStage.COMPLETED`.
    """
    owner = job.owner.name
    for line in api.fetch_job_logs(job_id=job.id, namespace=owner, follow=True):
        out.text(line)

    # The log stream can end while the Job is still scheduling or shutting down: settle the final state.
    final = api.wait_for_job(job_id=job.id, namespace=owner)
    stage = final.status.stage
    if stage == JobStage.COMPLETED:
        out.text(f"Job {final.id} completed")
        return

    detail = final.status.message
    suffix = f": {detail}" if detail else ""
    raise CLIError(f"Job {final.id} finished with stage '{stage}'{suffix}")
@jobs_cli.command(
    "run",
    context_settings={"ignore_unknown_options": True},
    examples=[
        "hf jobs run python:3.12 python -c 'print(\"Hello!\")'",
        "hf jobs run --detach python:3.12 python script.py",
        "hf jobs run -e FOO=foo python:3.12 python script.py",
        "hf jobs run --secrets HF_TOKEN python:3.12 python script.py",
        "hf jobs run -v hf://org/my-model:/data -v hf://buckets/org/b:/mnt python:3.12 python script.py",
    ],
)
def jobs_run(
    image: ImageArg,
    command: CommandArg,
    env: EnvOpt = None,
    secrets: SecretsOpt = None,
    label: LabelsOpt = None,
    volume: VolumesOpt = None,
    env_file: EnvFileOpt = None,
    secrets_file: SecretsFileOpt = None,
    flavor: FlavorOpt = None,
    timeout: TimeoutOpt = None,
    detach: DetachOpt = False,
    expose: ExposeOpt = None,
    ssh: SshEnabledOpt = False,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Run a Job."""
    # Turn the raw CLI values into the structures expected by the API client.
    environment = parse_env_map(env, env_file)
    secret_values = parse_env_map(secrets, secrets_file)
    client = get_hf_api(token=token)
    job_labels = _parse_labels_map(label)
    mounts = parse_volumes(volume)

    launched = client.run_job(
        image=image,
        command=command,
        env=environment,
        secrets=secret_values,
        labels=job_labels,
        volumes=mounts,
        flavor=flavor,
        timeout=timeout,
        expose=expose,
        ssh=ssh,
        namespace=namespace,
    )
    out.result("Job started", id=launched.id, url=launched.url)

    # Follow-up hints, shown only for the features the started job reports.
    exposed = launched.status.expose_urls
    if isinstance(exposed, list):
        urls = "\n".join(f" {url}" for url in exposed)
        out.hint(f"Exposed ports are reachable at (requires an HF token with read access to the job):\n{urls}")
    if isinstance(launched.status.ssh_url, str):
        out.hint(f"Use `hf jobs ssh {launched.owner.name}/{launched.id}` to open an SSH session into the job.")

    if not detach:
        # Foreground mode: stream logs and fail the command if the job does not complete.
        _stream_logs_and_check_status(client, launched)
        return

    job_ref = f"{launched.owner.name}/{launched.id}"
    out.hint(f"Use `hf jobs logs -f {job_ref}` to stream logs, or `hf jobs inspect {job_ref}` to check status.")
@jobs_cli.command(
    "logs",
    examples=[
        "hf jobs logs <job_id>",
        "hf jobs logs -f <job_id>",
        "hf jobs logs --tail 20 <job_id>",
        "hf jobs logs -f --tail 100 <job_id>",
    ],
)
def jobs_logs(
    job_id: JobIdArg,
    follow: Annotated[
        bool,
        typer.Option(
            "-f",
            "--follow",
            help="Follow log output (stream until the job completes). Without this flag, only currently available logs are printed.",
        ),
    ] = False,
    tail: Annotated[
        int | None,
        typer.Option(
            "-n",
            "--tail",
            help="Number of lines to show from the end of the logs. When combined with --follow, starts streaming from the last N lines.",
        ),
    ] = None,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Fetch the logs of a Job.
    By default, prints currently available logs and exits (non-blocking).
    Use --follow/-f to stream logs in real-time until the job completes.
    Use --tail/-n to limit the number of lines returned (server-side when supported).
    Note: following exits when the log stream ends, regardless of whether the Job
    succeeded or failed. Run `hf jobs inspect <job_id>` to check the final status.
    """
    bare_id, resolved_ns = _parse_namespace_from_job_id(job_id, namespace)
    client = get_hf_api(token=token)

    for line in client.fetch_job_logs(job_id=bare_id, namespace=resolved_ns, follow=follow, tail=tail):
        out.text(line)

    if not follow:
        return

    # The end of the stream says nothing about success: point the user at `inspect`.
    job_ref = bare_id if not resolved_ns else f"{resolved_ns}/{bare_id}"
    out.hint(f"Stream ended. Run `hf jobs inspect {job_ref}` to check the final status (e.g. COMPLETED or ERROR).")
def _matches_filters(job_properties: dict[str, str], filters: list[tuple[str, str, str]]) -> bool:
    """Tell whether a set of job properties satisfies every filter.

    Each filter is a `(key, operator, pattern)` triple. The property value is compared to the
    pattern case-insensitively with shell-style wildcards (`fnmatch`). Operator "=" requires a
    match and "!=" requires a mismatch. A property that is missing only satisfies "!=".
    Any other operator places no constraint on a present property.
    """

    def _satisfied(key: str, op_str: str, pattern: str) -> bool:
        value = job_properties.get(key)
        if value is None:
            return op_str == "!="
        hit = fnmatch(value.lower(), pattern.lower())
        if op_str == "=":
            return hit
        if op_str == "!=":
            return not hit
        return True

    return all(_satisfied(*spec) for spec in filters)
def _clear_line(n: int) -> None:
    """Emit, `n` times, the ANSI "cursor up one line" sequence followed by "erase line"."""
    wipe_previous_line = "\033[1A" + "\x1b[2K"
    for _ in range(n):
        print(wipe_previous_line, end="")
def _get_jobs_stats_rows(
    job_id: str, metrics_stream: Iterable[dict[str, Any]], table_headers: list[str]
) -> Iterable[tuple[bool, str, list[list[str | int]]]]:
    """Convert a job's metrics stream into table rows.

    Args:
        job_id: ID shown in the first column of the job's first row.
        metrics_stream: iterable of metrics dicts. The keys read here are `cpu_usage_pct`,
            `cpu_millicores`, `memory_used_bytes`, `memory_total_bytes`, `rx_bps`, `tx_bps` and
            `gpus` (when truthy and a dict: GPU id -> dict with `utilization`,
            `memory_used_bytes`, `memory_total_bytes`).
        table_headers: table headers, used to pad rows with "N/A" when there is no GPU data.

    Yields:
        `(False, job_id, rows)` for every metrics update, where `rows` holds one row per GPU
        (a single row when there is no GPU data), then `(True, job_id, [])` once the stream
        is exhausted.
    """
    for metrics in metrics_stream:
        base_row = [
            job_id,
            f"{metrics['cpu_usage_pct']}%",
            round(metrics["cpu_millicores"] / 1000.0, 1),
            f"{round(100 * metrics['memory_used_bytes'] / metrics['memory_total_bytes'], 2)}%",
            f"{_format_size(metrics['memory_used_bytes'])}B / {_format_size(metrics['memory_total_bytes'])}B",
            f"{_format_size(metrics['rx_bps'])}bps / {_format_size(metrics['tx_bps'])}bps",
        ]
        gpus = metrics["gpus"]
        if gpus and isinstance(gpus, dict):
            # The first GPU shares the job's row; every other GPU gets its own blank-prefixed row.
            # Each blank row must be a distinct list: `[[...]] * k` would repeat one list object,
            # and the in-place `+=` below would then pile every GPU's columns onto that same list.
            blank_width = len(base_row)
            rows = [base_row] + [[""] * blank_width for _ in range(len(gpus) - 1)]
            for gpu_row, gpu_id in zip(rows, sorted(gpus)):
                gpu = gpus[gpu_id]
                gpu_row += [
                    f"{gpu['utilization']}%",
                    f"{round(100 * gpu['memory_used_bytes'] / gpu['memory_total_bytes'], 2)}%",
                    f"{_format_size(gpu['memory_used_bytes'])}B / {_format_size(gpu['memory_total_bytes'])}B",
                ]
        else:
            # No GPU data: fill the remaining columns so the row matches the header width.
            base_row += ["N/A"] * (len(table_headers) - len(base_row))
            rows = [base_row]
        yield False, job_id, rows
    # Signal the consumer that this job's stream is over.
    yield True, job_id, []
@jobs_cli.command("stats", examples=["hf jobs stats <job_id>"])
def jobs_stats(
    job_ids: JobIdsArg = None,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Fetch the resource usage statistics and metrics of Jobs"""
    # Accept 'namespace/job_id' references. The namespace extracted from one ID is passed to the
    # next parse call, so IDs that imply different namespaces are rejected as conflicting.
    if job_ids is not None:
        parsed_ids = []
        for job_id in job_ids:
            job_id, namespace = _parse_namespace_from_job_id(job_id, namespace)
            parsed_ids.append(job_id)
        job_ids = parsed_ids
    api = get_hf_api(token=token)
    # Default to the authenticated user's namespace.
    if namespace is None:
        namespace = api.whoami()["name"]
    # No explicit IDs: monitor every job of the namespace whose stage is RUNNING or UPDATING.
    if job_ids is None:
        job_ids = [
            job.id
            for job in api.list_jobs(namespace=namespace)
            if (job.status.stage if job.status else "UNKNOWN") in ("RUNNING", "UPDATING")
        ]
    if len(job_ids) == 0:
        out.text("No running jobs found")
        return
    table_headers = [
        "JOB ID",
        "CPU %",
        "NUM CPU",
        "MEM %",
        "MEM USAGE",
        "NET I/O",
        "GPU UTIL %",
        "GPU MEM %",
        "GPU MEM USAGE",
    ]
    # One worker thread per job, so that every metrics stream can be consumed concurrently.
    with multiprocessing.pool.ThreadPool(len(job_ids)) as pool:
        # Start with a placeholder row per job until its first metrics arrive.
        rows_per_job_id: dict[str, list[list[str | int]]] = {}
        for job_id in job_ids:
            row: list[str | int] = [job_id]
            row += ["-- / --" if ("/" in header or "USAGE" in header) else "--" for header in table_headers[1:]]
            rows_per_job_id[job_id] = [row]
        last_update_time = time.time()
        total_rows = [row for job_id in rows_per_job_id for row in rows_per_job_id[job_id]]
        # In-place refresh (cursor-up + clear) requires a fixed line count and layout —
        # `out.table`'s mode-dependent formatting would break it.
        print(_tabulate(total_rows, headers=table_headers))
        kwargs_list = [
            {
                "job_id": job_id,
                "metrics_stream": api.fetch_job_metrics(job_id=job_id, namespace=namespace),
                "table_headers": table_headers,
            }
            for job_id in job_ids
        ]
        # NOTE(review): `iflatmap_unordered` and `_tabulate` are defined outside this section;
        # presumably the former yields the generators' items as they become available — confirm.
        for done, job_id, rows in iflatmap_unordered(pool, _get_jobs_stats_rows, kwargs_list=kwargs_list):
            if done:
                # The job's metrics stream ended: drop its rows from the table.
                rows_per_job_id.pop(job_id, None)
            else:
                rows_per_job_id[job_id] = rows
            now = time.time()
            # Throttle redraws to at most one per STATS_UPDATE_MIN_INTERVAL seconds.
            if now - last_update_time >= STATS_UPDATE_MIN_INTERVAL:
                # Erase the previously printed table before printing the new one; the extra 2 lines
                # presumably account for the header and separator lines — confirm against `_tabulate`.
                _clear_line(2 + len(total_rows))
                total_rows = [row for job_id in rows_per_job_id for row in rows_per_job_id[job_id]]
                print(_tabulate(total_rows, headers=table_headers))
                last_update_time = now
@jobs_cli.command(
    "list | ls | ps",
    examples=[
        "hf jobs ls",
        "hf jobs ls -a",
        "hf jobs ls --status running,scheduling",
        "hf jobs ls --label env=prod --label team=ml",
        "hf jobs ls --all --label hf-sandbox=1",
    ],
)
def jobs_ps(
    all: Annotated[
        bool,
        typer.Option(
            "-a",
            "--all",
            help="Show all Jobs (default shows running and scheduling). Cannot be combined with --status.",
        ),
    ] = False,
    status: Annotated[
        list[str] | None,
        typer.Option(
            "--status",
            click_type=SoftChoice(JobStage),
            help="Only show Jobs with the given status. Comma-separated or repeated, e.g. `--status running,scheduling`.",
        ),
    ] = None,
    label: Annotated[
        list[str] | None,
        typer.Option(
            "-l",
            "--label",
            help="Only show Jobs with the given `key=value` label. Repeat to require several labels, e.g. `--label env=prod --label team=ml`.",
        ),
    ] = None,
    limit: Annotated[
        int,
        typer.Option(
            "--limit",
            help="Maximum number of Jobs to display. Set to 0 to show all (no limit).",
        ),
    ] = 100,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
    filter: Annotated[
        list[str] | None,
        typer.Option(
            "-f",
            "--filter",
            help="(Deprecated) Use `--status` and `--label` instead.",
        ),
    ] = None,
) -> None:
    """List Jobs.
    Use `--status` to filter by status (see [`JobStage`] for possible values) and `--label` to filter by `key=value`
    labels. A Job must match every filter to be listed.
    """
    api = get_hf_api(token=token)
    # The legacy `-f/--filter` flag is accepted but has no effect besides this warning.
    if filter:
        out.warning(
            f"Ignoring filter '{filter}'."
            " `-f`/`--filter` is deprecated and will be removed in a future release. Use `--status`/`--label`."
        )
    if all and status:
        raise CLIError("`-a`/`--all` cannot be combined with `--status`.")

    # Status filtering (default to active Jobs, unless `--all` or `--status` is provided).
    # Each `--status` value may itself be a comma-separated list; empty parts are dropped.
    raw_statuses: list[str] = []
    for value in status or []:
        raw_statuses.extend(part.strip() for part in value.split(",") if part.strip())
    server_statuses: list[str] | None
    if raw_statuses:
        server_statuses = raw_statuses
    elif all:
        server_statuses = None
    else:
        server_statuses = [JobStage.RUNNING.value, JobStage.SCHEDULING.value]

    # Labels filtering
    labels: dict[str, str] = {}
    for item in label or []:
        if "=" not in item:
            raise CLIError(f"Invalid label filter '{item}': must be in the form 'key=value'")
        # Split on the first "=" only: a value may itself contain "=" (e.g. `--label expr=a=b`),
        # and an unbounded split would make the tuple unpacking raise ValueError.
        key, value = item.split("=", 1)
        labels[key] = value

    jobs_iter = api.list_jobs(namespace=namespace, status=server_statuses, labels=labels or None)

    # Apply the display limit. Fetch one extra Job to detect (and warn about) truncation.
    truncated = False
    if limit > 0:
        jobs = list(itertools.islice(jobs_iter, limit + 1))
        if len(jobs) > limit:
            truncated = True
            jobs = jobs[:limit]
    else:
        jobs = list(jobs_iter)

    # Build display items. Augment the raw api dict with curated, table-friendly columns.
    job_items: list[dict[str, Any]] = []
    for job in jobs:
        job_item = _dataclass_to_dict(job)
        durations = job_item.get("durations") or {}
        cmd = job_item.get("command") or []
        job_item["job_id"] = job_item.get("id", "")
        job_item["image/space"] = job_item.get("docker_image") or "N/A"
        job_item["command"] = " ".join(cmd) if cmd else "N/A"
        # Keep the first 19 characters of the timestamp and swap the "T" separator for a space.
        job_item["created"] = job_item["created_at"][:19].replace("T", " ") if job_item.get("created_at") else "N/A"
        job_item["status"] = (job_item.get("status") or {}).get("stage", "UNKNOWN")
        job_item["runtime"] = format_duration(durations.get("running_secs"))
        job_items.append(job_item)

    out.table(
        job_items,
        headers=["job_id", "image/space", "command", "created", "status", "runtime"],
        id_key="job_id",
    )
    if truncated:
        out.hint(f"Output truncated to {limit} Jobs. Use `--limit 0` to show all (or `--limit N`).")

    # Explain an empty listing: either the filters matched nothing, or only finished jobs exist.
    if not job_items:
        if raw_statuses or labels:
            filters_msg = ", ".join(
                [*(f"status={s}" for s in raw_statuses), *(f"label={k}={v}" for k, v in labels.items())]
            )
            out.text(f"No jobs matched filters: {filters_msg}")
        elif not all:
            out.hint("No running jobs. Use `-a`/`--all` to include finished (and failed) jobs.")
@jobs_cli.command("hardware", examples=["hf jobs hardware"])
def jobs_hardware() -> None:
    """List available hardware options for Jobs"""
    client = get_hf_api()
    rows = []
    for hw in client.list_jobs_hardware():
        # Accelerator summary, left empty for flavors without one.
        if hw.accelerator:
            accelerator_info = f"{hw.accelerator.quantity}x {hw.accelerator.model} ({hw.accelerator.vram})"
        else:
            accelerator_info = ""

        # A falsy unit cost is displayed as "free"; the hourly figure is 60 times the unit cost.
        if hw.unit_cost_usd:
            cost_min = f"${hw.unit_cost_usd:.4f}"
            cost_hour = f"${hw.unit_cost_usd * 60:.2f}"
        else:
            cost_min = cost_hour = "free"

        rows.append(
            {
                "name": hw.name,
                "pretty name": hw.pretty_name,
                "cpu": hw.cpu,
                "ram": hw.ram,
                "storage": hw.ephemeral_storage,
                "accelerator": accelerator_info,
                "cost/min": cost_min,
                "cost/hour": cost_hour,
            }
        )
    out.table(rows)
    out.hint("Use `hf jobs run --flavor <name> ...` to request a specific hardware flavor.")
@jobs_cli.command("inspect", examples=["hf jobs inspect <job_id>"])
def jobs_inspect(
    job_ids: Annotated[
        list[str],
        typer.Argument(
            help="Job IDs to inspect (or 'namespace/job_id')",
        ),
    ],
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Display detailed information on one or more Jobs"""
    # Resolve every reference; the namespace found so far is passed on to the next one,
    # which makes references to different namespaces fail as conflicting.
    resolved_ns = namespace
    bare_ids = []
    for reference in job_ids:
        bare_id, resolved_ns = _parse_namespace_from_job_id(reference, resolved_ns)
        bare_ids.append(bare_id)

    client = get_hf_api(token=token)
    inspected = []
    for bare_id in bare_ids:
        inspected.append(client.inspect_job(job_id=bare_id, namespace=resolved_ns))
    out.table([_dataclass_to_dict(info) for info in inspected])
@jobs_cli.command("cancel", examples=["hf jobs cancel <job_id>"])
def jobs_cancel(
    job_id: JobIdArg,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Cancel a Job"""
    bare_id, resolved_ns = _parse_namespace_from_job_id(job_id, namespace)
    client = get_hf_api(token=token)
    client.cancel_job(job_id=bare_id, namespace=resolved_ns)
    # Report the bare ID (without any 'namespace/' prefix).
    out.result("Job cancelled", id=bare_id)
@jobs_cli.command(
    "wait",
    examples=[
        "hf jobs wait <job_id>",
        "hf jobs wait <job_id_1> <job_id_2>",
        "hf jobs ls -q | xargs hf jobs wait",
    ],
)
def jobs_wait(
    job_ids: Annotated[
        list[str],
        typer.Argument(
            help="Job IDs to wait for (or 'namespace/job_id').",
        ),
    ],
    timeout: Annotated[
        str | None,
        typer.Option(
            help="Max time to wait: int/float with s (seconds, default), m (minutes), h (hours) or d (days).",
        ),
    ] = None,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Wait for one or more Jobs to reach a terminal state.
    Blocks until every Job has finished, then exits with code 0 if all Jobs completed
    successfully, or a non-zero exit code if any Job was canceled, errored or deleted.
    All Jobs must belong to the same namespace.
    """
    # Resolve each reference independently against the explicit --namespace.
    resolved = [_parse_namespace_from_job_id(reference, namespace) for reference in job_ids]
    bare_ids = [bare_id for bare_id, _ in resolved]
    seen_namespaces = {ns for _, ns in resolved}
    if len(seen_namespaces) > 1:
        listing = ", ".join(str(ns) for ns in sorted(seen_namespaces, key=str))
        raise CLIError("All Job IDs must be in the same namespace, got: " + listing)
    common_ns = seen_namespaces.pop()

    timeout_secs = None if timeout is None else parse_duration(timeout)
    client = get_hf_api(token=token)

    progress = out.status(f"Waiting for {len(bare_ids)} Job(s) to finish...")
    try:
        finished = client.wait_for_job(bare_ids, timeout=timeout_secs, namespace=common_ns)
    except TimeoutError:
        progress.done("Timed out.")
        raise CLIError(f"Timed out after {timeout} waiting for Job(s) to finish.") from None
    progress.done(f"{len(finished)} Job(s) finished.")

    out.table([{"id": job.id, "stage": str(job.status.stage), "message": job.status.message} for job in finished])

    # Any stage other than COMPLETED turns the whole command into a failure.
    unsuccessful = [job for job in finished if job.status.stage != JobStage.COMPLETED]
    if not unsuccessful:
        return
    details = ", ".join(f"{job.id} ({job.status.stage})" for job in unsuccessful)
    raise CLIError(f"{len(unsuccessful)} of {len(finished)} Job(s) did not complete successfully: " + details)
@jobs_cli.command(
    "labels",
    examples=[
        "hf jobs labels <job_id> --label env=prod --label team=ml",
        "hf jobs labels <job_id> --clear",
    ],
)
def jobs_labels(
    job_id: JobIdArg,
    label: LabelsOpt = None,
    clear: Annotated[bool, typer.Option("--clear", help="Remove all labels from the job.")] = False,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Update labels on a Job. Replaces all existing labels."""
    # Exactly one of --label / --clear must be used.
    has_labels = bool(label)
    if has_labels and clear:
        raise CLIError(
            "Cannot set labels and clear them at the same time. Please use either --label or --clear, not both."
        )
    if not (has_labels or clear):
        raise CLIError("Please set at least one label with --label. To remove all labels, pass --clear.")

    bare_id, resolved_ns = _parse_namespace_from_job_id(job_id, namespace)
    # With --clear there are no labels to parse, so an empty mapping is sent.
    new_labels = _parse_labels_map(label) or {}
    client = get_hf_api(token=token)
    updated = client.update_job_labels(job_id=bare_id, labels=new_labels, namespace=resolved_ns)
    out.result("Labels updated", id=updated.id)
@jobs_cli.command(
    "ssh",
    examples=[
        "hf jobs ssh <job_id>",
        "hf jobs ssh <job_id> --dry-run",
        "hf jobs ssh <job_id> -i ~/.ssh/id_ed25519",
    ],
)
def jobs_ssh(
    job_id: JobIdArg,
    identity_file: SshIdentityFileOpt = None,
    dry_run: SshDryRunOpt = False,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """SSH into a running Job.
    If the Job is not yet running, waits until it reaches the RUNNING state before
    connecting. Requires the Job to be started with SSH enabled (`hf jobs run --ssh ...`)
    and your SSH public key to be registered at https://huggingface.co/settings/keys.
    """
    bare_id, resolved_ns = _parse_namespace_from_job_id(job_id, namespace)
    client = get_hf_api(token=token)
    job = client.inspect_job(job_id=bare_id, namespace=resolved_ns)

    # Reject jobs that can never be connected to.
    if job.status.ssh_url is None:
        raise CLIError("SSH is not enabled on this job. Start a job with SSH support using `hf jobs run --ssh ...`.")
    if job.status.stage in TERMINAL_JOB_STAGES:
        raise CLIError(f"Cannot SSH into job '{job.id}': job has already finished (stage: '{job.status.stage}').")

    if job.status.stage != JobStage.RUNNING:
        # Not running yet: wait, then make sure the job actually reached RUNNING.
        progress = out.status(f"Waiting for job '{job.id}' to be running (stage: '{job.status.stage}')...")
        job = client.wait_for_job(job_id=job.id, namespace=resolved_ns, stages=[JobStage.RUNNING])
        if job.status.stage == JobStage.RUNNING:
            progress.done("Job is running.")
        else:
            progress.done("Job finished.")
            raise CLIError(
                f"Cannot SSH into job '{job.id}': job finished before reaching RUNNING (stage: '{job.status.stage}')."
            )

    # The SSH URL carries the user, host and port to connect to.
    target = urlsplit(job.status.ssh_url)
    exec_ssh(
        f"{target.username}@{target.hostname}",  # type: ignore
        port=target.port,
        identity_file=identity_file,
        dry_run=dry_run,
    )
# Sub-command group mounted on `jobs_cli` under the name "uv" (i.e. `hf jobs uv ...`).
uv_app = typer_factory(help="Run UV scripts (Python with inline dependencies) on HF infrastructure.")
jobs_cli.add_typer(uv_app, name="uv")
@uv_app.command(
    "run",
    context_settings={"ignore_unknown_options": True},
    examples=[
        "hf jobs uv run my_script.py",
        "hf jobs uv run --detach my_script.py",
        "hf jobs uv run ml_training.py --flavor a10g-small",
        "hf jobs uv run --with transformers train.py",
        "hf jobs uv run -v hf://org/my-model:/data -v hf://buckets/org/b:/mnt script.py",
    ],
)
def jobs_uv_run(
    script: ScriptArg,
    script_args: ScriptArgsArg = None,
    image: ImageOpt = None,
    flavor: FlavorOpt = None,
    env: EnvOpt = None,
    secrets: SecretsOpt = None,
    label: LabelsOpt = None,
    volume: VolumesOpt = None,
    env_file: EnvFileOpt = None,
    secrets_file: SecretsFileOpt = None,
    timeout: TimeoutOpt = None,
    detach: DetachOpt = False,
    expose: ExposeOpt = None,
    ssh: SshEnabledOpt = False,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
    with_: WithOpt = None,
    python: PythonOpt = None,
) -> None:
    """Run a UV script (local file or URL) on HF infrastructure"""
    # Turn the raw CLI values into the structures expected by the API client.
    environment = parse_env_map(env, env_file)
    secret_values = parse_env_map(secrets, secrets_file)
    client = get_hf_api(token=token)
    forwarded_args = script_args or []
    job_labels = _parse_labels_map(label)
    mounts = parse_volumes(volume)

    launched = client.run_uv_job(
        script=script,
        script_args=forwarded_args,
        dependencies=with_,
        python=python,
        image=image,
        env=environment,
        secrets=secret_values,
        labels=job_labels,
        volumes=mounts,
        flavor=flavor,
        timeout=timeout,
        expose=expose,
        ssh=ssh,
        namespace=namespace,
    )
    out.result("Job started", id=launched.id, url=launched.url)

    # Follow-up hints, shown only for the features the started job reports.
    exposed = launched.status.expose_urls
    if isinstance(exposed, list):
        urls = "\n".join(f" {url}" for url in exposed)
        out.hint(f"Exposed ports are reachable at (requires an HF token with read access to the job):\n{urls}")
    if isinstance(launched.status.ssh_url, str):
        out.hint(f"Use `hf jobs ssh {launched.owner.name}/{launched.id}` to open an SSH session into the job.")

    if not detach:
        # Foreground mode: stream logs and fail the command if the job does not complete.
        _stream_logs_and_check_status(client, launched)
        return

    job_ref = f"{launched.owner.name}/{launched.id}"
    out.hint(f"Use `hf jobs logs -f {job_ref}` to stream logs, or `hf jobs inspect {job_ref}` to check status.")
# Sub-command group mounted on `jobs_cli` under the name "scheduled" (i.e. `hf jobs scheduled ...`).
scheduled_app = typer_factory(help="Create and manage scheduled Jobs on the Hub.")
jobs_cli.add_typer(scheduled_app, name="scheduled")
@scheduled_app.command(
    "run",
    context_settings={"ignore_unknown_options": True},
    examples=['hf jobs scheduled run "0 0 * * *" python:3.12 python script.py'],
)
def scheduled_run(
    schedule: ScheduleArg,
    image: ImageArg,
    command: CommandArg,
    suspend: SuspendOpt = None,
    concurrency: ConcurrencyOpt = None,
    env: EnvOpt = None,
    secrets: SecretsOpt = None,
    label: LabelsOpt = None,
    volume: VolumesOpt = None,
    env_file: EnvFileOpt = None,
    secrets_file: SecretsFileOpt = None,
    flavor: FlavorOpt = None,
    timeout: TimeoutOpt = None,
    expose: ExposeOpt = None,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Schedule a Job."""
    # Turn the raw CLI values into the structures expected by the API client.
    environment = parse_env_map(env, env_file)
    secret_values = parse_env_map(secrets, secrets_file)
    client = get_hf_api(token=token)
    job_labels = _parse_labels_map(label)
    mounts = parse_volumes(volume)

    created = client.create_scheduled_job(
        image=image,
        command=command,
        schedule=schedule,
        suspend=suspend,
        concurrency=concurrency,
        env=environment,
        secrets=secret_values,
        labels=job_labels,
        volumes=mounts,
        flavor=flavor,
        timeout=timeout,
        expose=expose,
        namespace=namespace,
    )
    out.result("Scheduled Job created", id=created.id)
    out.hint(f"Use `hf jobs scheduled inspect {created.id}` to view its details.")
@scheduled_app.command("list | ls | ps", examples=["hf jobs scheduled ls"])
def scheduled_ps(
    all: Annotated[
        bool,
        typer.Option(
            "-a",
            "--all",
            help="Show all scheduled Jobs (default hides suspended)",
        ),
    ] = False,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
    filter: Annotated[
        list[str] | None,
        typer.Option(
            "-f",
            "--filter",
            help="Filter output based on conditions provided (format: key=value)",
        ),
    ] = None,
) -> None:
    """List scheduled Jobs"""

    def _short_timestamp(stamp: Any) -> str:
        # First 19 characters of the timestamp, with the "T" separator replaced by a space.
        return stamp[:19].replace("T", " ") if stamp else "N/A"

    client = get_hf_api(token=token)
    scheduled_jobs = client.list_scheduled_jobs(namespace=namespace)

    # Parse `key=value` / `key!=value` filters; anything without "=" is reported and skipped.
    filters: list[tuple[str, str, str]] = []
    for spec in filter or []:
        key, equals, value = spec.partition("=")
        if not equals:
            out.warning(f"Ignoring invalid filter format '{spec}'. Use key=value format.")
            continue
        op = "="
        if key.endswith("!"):
            # `key!=value` arrives as key "key!" once split on "=": negate the predicate.
            op, key = "!=", key[:-1]
        filters.append((key.lower(), op, value.lower()))

    # Filter scheduled jobs (operating on ScheduledJobInfo objects to preserve existing filter behavior)
    visible = []
    for scheduled_job in scheduled_jobs:
        suspended = scheduled_job.suspend or False
        if suspended and not all:
            continue
        argv = scheduled_job.job_spec.command or []
        props = {
            "id": scheduled_job.id,
            "image": scheduled_job.job_spec.docker_image or "N/A",
            "suspend": str(suspended),
            "command": " ".join(argv) if argv else "N/A",
        }
        if _matches_filters(props, filters):
            visible.append(scheduled_job)

    # Build display items. Augment with curated columns.
    items: list[dict[str, Any]] = []
    for entry in visible:
        item = _dataclass_to_dict(entry)
        spec_dict = item.get("job_spec") or {}
        status_dict = item.get("status") or {}
        last_job = status_dict.get("last_job")
        argv = spec_dict.get("command") or []
        item["image/space"] = spec_dict.get("docker_image") or "N/A"
        item["command"] = " ".join(argv) if argv else "N/A"
        item["last_run"] = _short_timestamp(last_job.get("at") if last_job else None)
        item["next_run"] = _short_timestamp(status_dict.get("next_job_run_at"))
        item["suspend"] = item.get("suspend") or False
        items.append(item)

    out.table(
        items,
        headers=["id", "schedule", "image/space", "command", "last_run", "next_run", "suspend"],
        id_key="id",
    )
    if filters and not items:
        filters_msg = ", ".join(f"{k}{o}{v}" for k, o, v in filters)
        out.text(f"No scheduled jobs matched filters: {filters_msg}")
@scheduled_app.command("inspect", examples=["hf jobs scheduled inspect <id>"])
def scheduled_inspect(
    scheduled_job_ids: Annotated[
        list[str],
        typer.Argument(
            help="Scheduled Job IDs to inspect (or 'namespace/scheduled_job_id')",
        ),
    ],
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Display detailed information on one or more scheduled Jobs"""
    # Resolve every reference; the namespace found so far is passed on to the next one,
    # which makes references to different namespaces fail as conflicting.
    resolved_ns = namespace
    bare_ids = []
    for reference in scheduled_job_ids:
        bare_id, resolved_ns = _parse_namespace_from_job_id(reference, resolved_ns)
        bare_ids.append(bare_id)

    client = get_hf_api(token=token)
    inspected = []
    for bare_id in bare_ids:
        inspected.append(client.inspect_scheduled_job(scheduled_job_id=bare_id, namespace=resolved_ns))
    out.table([_dataclass_to_dict(info) for info in inspected])
@scheduled_app.command("delete", examples=["hf jobs scheduled delete <id>"])
def scheduled_delete(
    scheduled_job_id: ScheduledJobIdArg,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Delete a scheduled Job."""
    bare_id, resolved_ns = _parse_namespace_from_job_id(scheduled_job_id, namespace)
    client = get_hf_api(token=token)
    client.delete_scheduled_job(scheduled_job_id=bare_id, namespace=resolved_ns)
    # Report the bare ID (without any 'namespace/' prefix).
    out.result("Scheduled Job deleted", id=bare_id)
@scheduled_app.command("suspend", examples=["hf jobs scheduled suspend <id>"])
def scheduled_suspend(
    scheduled_job_id: ScheduledJobIdArg,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Suspend (pause) a scheduled Job."""
    bare_id, resolved_ns = _parse_namespace_from_job_id(scheduled_job_id, namespace)
    client = get_hf_api(token=token)
    client.suspend_scheduled_job(scheduled_job_id=bare_id, namespace=resolved_ns)
    out.result("Scheduled Job suspended", id=bare_id)
    # The hint uses the bare ID (without any 'namespace/' prefix).
    out.hint(f"Use `hf jobs scheduled resume {bare_id}` to resume it.")
@scheduled_app.command("resume", examples=["hf jobs scheduled resume <id>"])
def scheduled_resume(
    scheduled_job_id: ScheduledJobIdArg,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Resume (unpause) a scheduled Job."""
    # Resolve the (ID, namespace) pair through the shared helper before touching the API.
    bare_id, target_namespace = _parse_namespace_from_job_id(scheduled_job_id, namespace)

    client = get_hf_api(token=token)
    client.resume_scheduled_job(scheduled_job_id=bare_id, namespace=target_namespace)

    # Report the parsed ID, not the raw argument the user typed.
    out.result("Scheduled Job resumed", id=bare_id)
@scheduled_app.command(
    "labels",
    examples=[
        "hf jobs scheduled labels <id> --label env=prod --label team=ml",
        "hf jobs scheduled labels <id> --clear",
    ],
)
def scheduled_labels(
    scheduled_job_id: ScheduledJobIdArg,
    label: LabelsOpt = None,
    clear: Annotated[bool, typer.Option("--clear", help="Remove all labels from the scheduled job.")] = False,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
) -> None:
    """Update labels on a scheduled Job. Replaces all existing labels."""
    # Exactly one of --label / --clear must be given.
    if clear:
        if label:
            raise CLIError(
                "Cannot set labels and clear them at the same time. Please use either --label or --clear, not both."
            )
    elif not label:
        raise CLIError("Please set at least one label with --label. To remove all labels, pass --clear.")

    bare_id, target_namespace = _parse_namespace_from_job_id(scheduled_job_id, namespace)

    # With --clear there are no labels to parse, so an empty mapping is sent instead of None.
    new_labels = _parse_labels_map(label) or {}

    client = get_hf_api(token=token)
    updated = client.update_scheduled_job_labels(
        scheduled_job_id=bare_id,
        labels=new_labels,
        namespace=target_namespace,
    )
    out.result("Labels updated", id=updated.id)
# Sub-application holding the `hf jobs scheduled uv ...` commands; it is mounted on the
# scheduled-jobs app under the name "uv".
scheduled_uv_app = typer_factory(help="Schedule UV scripts on HF infrastructure.")
scheduled_app.add_typer(scheduled_uv_app, name="uv")
@scheduled_uv_app.command(
    "run",
    context_settings={"ignore_unknown_options": True},
    examples=[
        'hf jobs scheduled uv run "0 0 * * *" script.py',
        'hf jobs scheduled uv run "0 0 * * *" script.py --with pandas',
    ],
)
def scheduled_uv_run(
    schedule: ScheduleArg,
    script: ScriptArg,
    script_args: ScriptArgsArg = None,
    suspend: SuspendOpt = None,
    concurrency: ConcurrencyOpt = None,
    image: ImageOpt = None,
    flavor: FlavorOpt = None,
    env: EnvOpt = None,
    secrets: SecretsOpt = None,
    label: LabelsOpt = None,
    volume: VolumesOpt = None,
    env_file: EnvFileOpt = None,
    secrets_file: SecretsFileOpt = None,
    timeout: TimeoutOpt = None,
    expose: ExposeOpt = None,
    namespace: NamespaceOpt = None,
    token: TokenOpt = None,
    with_: WithOpt = None,
    python: PythonOpt = None,
) -> None:
    """Run a UV script (local file or URL) on HF infrastructure"""
    # NOTE(review): the summary above says "Run", but this command creates a *scheduled*
    # UV job (it calls `create_scheduled_uv_job` with `schedule`) — consider rewording it.

    # Environment variables and secrets are each built from inline values plus an optional file.
    env_vars = parse_env_map(env, env_file)
    secret_vars = parse_env_map(secrets, secrets_file)

    client = get_hf_api(token=token)

    # Remaining CLI values that need conversion before being handed to the API.
    label_map = _parse_labels_map(label)
    mounted_volumes = parse_volumes(volume)
    forwarded_args = script_args or []

    created = client.create_scheduled_uv_job(
        script=script,
        script_args=forwarded_args,
        schedule=schedule,
        suspend=suspend,
        concurrency=concurrency,
        dependencies=with_,
        python=python,
        image=image,
        env=env_vars,
        secrets=secret_vars,
        labels=label_map,
        volumes=mounted_volumes,
        flavor=flavor,
        timeout=timeout,
        expose=expose,
        namespace=namespace,
    )

    out.result("Scheduled Job created", id=created.id)
    out.hint(f"Use `hf jobs scheduled inspect {created.id}` to view its details.")
### UTILS
def _parse_labels_map(labels: list[str] | None) -> dict[str, str] | None:
"""Parse label key-value pairs from CLI arguments.
Args:
labels: List of label strings in KEY=VALUE format. If KEY only, then VALUE is set to empty string.
Returns:
Dictionary mapping label keys to values, or None if no labels provided.
"""
if not labels:
return None
labels_map: dict[str, str] = {}
for label_var in labels:
key, value = label_var.split("=", 1) if "=" in label_var else (label_var, "")
labels_map[key] = value
return labels_map
def _tabulate(rows: list[list[str | int]], headers: list[str]) -> str:
"""
Inspired by:
- stackoverflow.com/a/8356620/593036
- stackoverflow.com/questions/9535954/printing-lists-as-tabular-data
"""
col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
terminal_width = max(shutil.get_terminal_size().columns, len(headers) * 12)
while len(headers) + sum(col_widths) > terminal_width:
col_to_minimize = col_widths.index(max(col_widths))
col_widths[col_to_minimize] //= 2
if len(headers) + sum(col_widths) <= terminal_width:
col_widths[col_to_minimize] = terminal_width - sum(col_widths) - len(headers) + col_widths[col_to_minimize]
row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
lines = []
lines.append(row_format.format(*headers))
lines.append(row_format.format(*["-" * w for w in col_widths]))
for row in rows:
row_format_args = [
str(x)[: col_width - 3] + "..." if len(str(x)) > col_width else str(x)
for x, col_width in zip(row, col_widths)
]
lines.append(row_format.format(*row_format_args))
return "\n".join(lines)
T = TypeVar("T")
def _write_generator_to_queue(queue: Queue[T], func: Callable[..., Iterable[T]], kwargs: dict) -> None:
for result in func(**kwargs):
queue.put(result)
def iflatmap_unordered(
    pool: multiprocessing.pool.ThreadPool,
    func: Callable[..., Iterable[T]],
    *,
    kwargs_list: list[dict],
) -> Iterable[T]:
    """Run ``func`` once per entry of ``kwargs_list`` on ``pool`` and yield items as they arrive.

    Every call ``func(**kwargs)`` returns an iterable. The items of all those iterables
    are yielded as one flat stream, in arrival order rather than submission order.
    It combines three `map()` variants:
      * `imap()`: returns an iterable instead of a list of results
      * `imap_unordered()`: output follows time of arrival
      * `flatmap()`: the per-call outputs are concatenated into one flat sequence

    A `KeyboardInterrupt` raised while streaming ends the iteration quietly. On exit,
    the result of each task is fetched so that an error raised inside a task surfaces
    to the caller.
    """
    results_queue: Queue[T] = Queue()
    pending = []
    for call_kwargs in kwargs_list:
        pending.append(pool.apply_async(_write_generator_to_queue, (results_queue, func, call_kwargs)))

    try:
        while True:
            try:
                # Short timeout so the loop regularly re-checks whether the tasks have finished.
                yield results_queue.get(timeout=0.05)
            except Empty:
                # Readiness is evaluated before emptiness: a task finishes only after its
                # last `put`, so nothing is left behind once both conditions hold.
                tasks_done = all(handle.ready() for handle in pending)
                if tasks_done and results_queue.empty():
                    break
    except KeyboardInterrupt:
        pass
    finally:
        # we get the result in case there's an error to raise
        try:
            for handle in pending:
                handle.get(timeout=0.05)
        except multiprocessing.TimeoutError:
            # A task is still running (e.g. iteration stopped early): give up waiting.
            pass