1344 lines
47 KiB
Python
Executable File
1344 lines
47 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
run-recipe.py - One-click model deployment using YAML recipes
|
|
|
|
This script provides a high-level interface for deploying models with
|
|
pre-configured settings. It handles:
|
|
- Model download from HuggingFace (optional)
|
|
- Container building and distribution to worker nodes
|
|
- Mod application
|
|
- Launch script generation
|
|
- Both solo (single node) and cluster deployments
|
|
|
|
Usage:
|
|
./run-recipe.py recipes/glm-4.7-nvfp4.yaml
|
|
./run-recipe.py glm-4.7-nvfp4 --port 9000 --solo
|
|
./run-recipe.py minimax-m2-awq --setup # Full setup: build + download + run
|
|
./run-recipe.py --list
|
|
|
|
================================================================================
|
|
ARCHITECTURE OVERVIEW (for developers extending this script)
|
|
================================================================================
|
|
|
|
DEPLOYMENT PIPELINE:
|
|
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
│ CLI Args → Load Recipe → Resolve Nodes → Build → Download → Run │
|
|
└─────────────────────────────────────────────────────────────────────────┘
|
|
|
|
KEY ABSTRACTIONS:
|
|
- Recipe (YAML): Declarative model configuration (see load_recipe docstring)
|
|
- Phases: Build, Download, Run - each can run independently (--build-only, etc.)
|
|
- Nodes: Head (first) + Workers (rest) - images/models copied to workers
|
|
|
|
EXTENSION POINTS:
|
|
|
|
1. ADD NEW RECIPE FIELDS:
|
|
- Update load_recipe() to validate/set defaults
|
|
- Use the field in generate_launch_script() or main()
|
|
- Document in recipe YAML schema below
|
|
|
|
2. ADD NEW CLI OPTIONS:
|
|
- Add to appropriate argument group in main()
|
|
- Handle in the corresponding phase (build/download/run)
|
|
- Pass to generate_launch_script() via overrides dict if needed
|
|
|
|
3. ADD NEW DEPLOYMENT PHASES:
|
|
- Follow the pattern: check if needed → dry-run print → execute
|
|
- Insert between existing phases in main()
|
|
- Add corresponding --phase-only flag
|
|
|
|
4. SUPPORT NEW MODEL SOURCES:
|
|
- Add detection logic in download_model() or check_model_exists()
|
|
- Create new download script or handle inline
|
|
|
|
5. SUPPORT NEW CONTAINER RUNTIMES:
|
|
- Modify check_image_exists() and build_image()
|
|
- May need to update launch-cluster.sh as well
|
|
|
|
RECIPE YAML SCHEMA:
|
|
name: str # Required: Human-readable name
|
|
recipe_version: str # Required: Recipe schema version (e.g., '1'). Used by run-recipe.py
|
|
# to check compatibility and available features.
|
|
container: str # Required: Docker image tag
|
|
command: str # Required: vLLM serve command with {placeholders}
|
|
description: str # Optional: Brief description
|
|
model: str # Optional: HuggingFace model ID for --setup
|
|
mods: list[str] # Optional: Mod directories to apply
|
|
defaults: dict # Optional: Default values for command placeholders
|
|
env: dict # Optional: Environment variables
|
|
build_args: list[str] # Optional: Args for build-and-copy.sh
|
|
cluster_only: bool # Optional: Require cluster mode (default: false)
|
|
solo_only: bool # Optional: Require solo mode (default: false)
|
|
|
|
RECIPE VERSION HISTORY:
|
|
Version 1 (default): Initial schema with all fields above supported.
|
|
|
|
RELATED FILES:
|
|
- run-recipe.sh: Bash wrapper that ensures Python deps are installed
|
|
- recipes/*.yaml: Recipe definitions
|
|
- examples/: Example launch scripts for direct use with launch-cluster.sh
|
|
- launch-cluster.sh: Low-level container orchestration
|
|
- build-and-copy.sh: Docker build and distribution
|
|
- hf-download.sh: HuggingFace model download and sync
|
|
- autodiscover.sh: Network topology detection
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import subprocess
|
|
import shlex
|
|
import sys
|
|
import tempfile
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
try:
|
|
import yaml
|
|
except ImportError:
|
|
print("Error: PyYAML is required. Install with: pip install pyyaml")
|
|
sys.exit(1)
|
|
|
|
|
|
# Absolute path of the directory that contains this script. Every helper
# script and the recipes/ directory are located relative to it.
SCRIPT_DIR = Path(__file__).parent.resolve()
# Directory scanned for recipe YAML files (see load_recipe / list_recipes).
RECIPES_DIR = SCRIPT_DIR / "recipes"
# Helper shell scripts this runner delegates to.
LAUNCH_SCRIPT = SCRIPT_DIR / "launch-cluster.sh"
BUILD_SCRIPT = SCRIPT_DIR / "build-and-copy.sh"
DOWNLOAD_SCRIPT = SCRIPT_DIR / "hf-download.sh"
AUTODISCOVER_SCRIPT = SCRIPT_DIR / "autodiscover.sh"
# Path of the .env configuration file. It stays None until main() assigns it
# from --config, or falls back to SCRIPT_DIR / ".env".
ENV_FILE = None  # Will be set from CLI argument or default
|
|
|
|
|
|
def load_recipe(recipe_path: Path) -> dict[str, Any]:
    """
    Load and validate a recipe YAML file.

    Resolves the recipe from several candidate locations, checks that the
    required fields are present and fills in defaults for optional ones.

    EXTENSIBILITY:
    - To add new required fields: Add to the 'required' list below
    - To add new optional fields with defaults: Add to 'optional_defaults' below

    RECIPE SEARCH ORDER (first existing path wins):
    1. recipe_path exactly as given
    2. recipe_path with '.yaml' / '.yml' appended to the file name
    3. recipe_path with its suffix replaced by '.yaml' / '.yml'
    4. recipes/<name>, recipes/<name>.yaml, recipes/<name>.yml,
       recipes/<stem>.yaml

    RECIPE SCHEMA:
    name (str, required): Human-readable name for the recipe
    recipe_version (str, required): Schema version for compatibility checking.
        Current version: '1'. Bump when adding new recipe fields.
    container (str, required): Docker image tag to use (e.g., 'vllm-node-mxfp4')
    command (str, required): vLLM serve command template with {placeholders}
    description (str, optional): Brief description shown in --list
    model (str, optional): HuggingFace model ID for --setup downloads
    mods (list[str], optional): List of mod directories to apply (e.g., 'mods/fix-glm')
    defaults (dict, optional): Default values for command placeholders
    env (dict, optional): Environment variables to export before running
    build_args (list[str], optional): Extra args for build-and-copy.sh
        (e.g., ['-f', 'Dockerfile.mxfp4'])
    cluster_only (bool, optional): If True, recipe cannot run in solo mode
    solo_only (bool, optional): If True, recipe cannot run in cluster mode

    Args:
        recipe_path: Path object pointing to YAML file or just recipe name

    Returns:
        Validated recipe dictionary with all fields populated (defaults applied)

    Raises:
        SystemExit: If recipe not found or validation fails
    """
    if not recipe_path.exists():
        name = recipe_path.name
        candidates = [
            # Append the extension first: with_suffix() would treat the tail
            # of a dotted name such as "glm-4.7-nvfp4" as a suffix and turn
            # it into "glm-4.yaml".
            recipe_path.with_name(f"{name}.yaml"),
            recipe_path.with_name(f"{name}.yml"),
            # Suffix replacement is kept for inputs like "foo.yml" -> "foo.yaml".
            recipe_path.with_suffix(".yaml"),
            recipe_path.with_suffix(".yml"),
            # Fall back to the flat recipes/ directory (for bare recipe names).
            RECIPES_DIR / name,
            RECIPES_DIR / f"{name}.yaml",
            RECIPES_DIR / f"{name}.yml",
            RECIPES_DIR / f"{recipe_path.stem}.yaml",
        ]
        for candidate in candidates:
            if candidate.exists():
                recipe_path = candidate
                break
        else:
            print(f"Error: Recipe not found: {recipe_path}")
            print(f"Searched in: {recipe_path}, {RECIPES_DIR}")
            sys.exit(1)

    with open(recipe_path, encoding="utf-8") as f:
        recipe = yaml.safe_load(f)

    # safe_load() returns None for an empty file and may return a list or a
    # scalar; only a mapping can be a recipe.
    if not isinstance(recipe, dict):
        print(f"Error: Recipe must be a YAML mapping: {recipe_path}")
        sys.exit(1)

    # Validate required fields
    required = ["name", "recipe_version", "container", "command"]
    for field in required:
        if field not in recipe:
            print(f"Error: Recipe missing required field: {field}")
            sys.exit(1)

    # Set defaults for optional fields. A key that is present but null in the
    # YAML (e.g. "defaults:") is treated like a missing key so later code can
    # rely on the container types. The literal is rebuilt on every call, so
    # the mutable defaults are never shared between recipes.
    optional_defaults: dict[str, Any] = {
        "description": "",
        "model": None,
        "mods": [],
        "defaults": {},
        "env": {},
        "build_args": [],
        "cluster_only": False,
        "solo_only": False,
    }
    for key, default in optional_defaults.items():
        if recipe.get(key) is None:
            recipe[key] = default

    # Validate recipe version compatibility
    # EXTENSIBILITY: When adding new schema versions, update SUPPORTED_VERSIONS
    # and add migration/compatibility logic below
    SUPPORTED_VERSIONS = ["1"]
    recipe_ver = str(recipe["recipe_version"])
    if recipe_ver not in SUPPORTED_VERSIONS:
        print(
            f"Warning: Recipe uses schema version '{recipe_ver}', but this run-recipe.py supports: {SUPPORTED_VERSIONS}"
        )
        print("Some features may not work correctly. Consider updating run-recipe.py.")

    return recipe
|
|
|
|
|
|
def list_recipes() -> None:
    """
    List all available recipes with their metadata.

    Scans the recipes/ directory for *.yaml and *.yml files and prints the
    key fields of each one. Used by the --list CLI option.

    A recipe that fails to load is reported inline and skipped, so a single
    broken file does not hide the remaining recipes.

    EXTENSIBILITY:
    - To show additional fields: Add them to the print statements in the loop
    - To support different output formats (e.g., JSON): Add a format parameter
    - Recipe directory is defined by RECIPES_DIR constant at module level
    """
    if not RECIPES_DIR.exists():
        print("No recipes directory found.")
        return

    # load_recipe() accepts both extensions, so list both.
    recipes = sorted([*RECIPES_DIR.glob("*.yaml"), *RECIPES_DIR.glob("*.yml")])
    if not recipes:
        print("No recipes found in recipes/ directory.")
        return

    print("Available recipes:\n")
    for recipe_path in recipes:
        try:
            recipe = load_recipe(recipe_path)
            name = recipe.get("name", recipe_path.stem)
            desc = recipe.get("description", "")
            container = recipe.get("container", "vllm-node")
            build_args = recipe.get("build_args", [])
            model = recipe.get("model", "")
            mods = recipe.get("mods", [])
            cluster_only = recipe.get("cluster_only", False)
            solo_only = recipe.get("solo_only", False)

            print(f" {recipe_path.name}")
            print(f" Name: {name}")
            if desc:
                print(f" Description: {desc}")
            if model:
                print(f" Model: {model}")
            if cluster_only:
                print(" Cluster only: Yes")
            if solo_only:
                print(" Solo only: Yes")
            print(f" Container: {container}")
            if build_args:
                # YAML scalars are not guaranteed to be strings.
                print(f" Build args: {' '.join(map(str, build_args))}")
            if mods:
                print(f" Mods: {', '.join(map(str, mods))}")
            print()
        except SystemExit:
            # load_recipe() prints its own error and calls sys.exit() on
            # validation failure. SystemExit is not an Exception subclass, so
            # it must be caught explicitly to keep listing the other recipes.
            print(f" {recipe_path.name} (invalid recipe, skipped)")
            print()
        except Exception as e:
            print(f" {recipe_path.name} (error loading: {e})")
            print()
|
|
|
|
|
|
def check_image_exists(image: str, host: str | None = None) -> bool:
    """
    Check if a Docker image exists locally or on a remote host.

    Runs 'docker image inspect' (over ssh when a host is given) and treats a
    zero exit status as "image present". Output is captured and discarded.

    EXTENSIBILITY:
    - To support other container runtimes (podman): Modify the docker command
    - To add image version/digest checking: Parse 'docker image inspect' JSON output
    - For custom SSH options: Modify the ssh command array

    Args:
        image: Docker image tag to check (e.g., 'vllm-node-mxfp4')
        host: Optional remote hostname/IP. If None, checks locally.

    Returns:
        True if the inspect command exited with status 0, False otherwise
    """
    if host:
        # ssh hands the final argument to the remote shell as a single command
        # line, so the tag has to be shell-quoted; wrapping it in '...' by
        # hand breaks as soon as the tag itself contains a single quote.
        cmd = [
            "ssh",
            "-o",
            "BatchMode=yes",
            "-o",
            "StrictHostKeyChecking=no",
            host,
            f"docker image inspect {shlex.quote(image)}",
        ]
    else:
        cmd = ["docker", "image", "inspect", image]

    result = subprocess.run(cmd, capture_output=True)
    return result.returncode == 0
|
|
|
|
|
|
def build_image(
    image: str, copy_to: list[str] | None = None, build_args: list[str] | None = None
) -> bool:
    """
    Build the container image using build-and-copy.sh.

    Invokes BUILD_SCRIPT with '-t <image>', followed by the recipe's build
    args and, when worker nodes are given, '--copy-to <a,b,...> --copy-parallel'.

    EXTENSIBILITY:
    - To add new build options: Add them to build_args in the recipe's build_args field
    - To support different Dockerfiles: Use build_args = ['-f', 'Dockerfile.custom']
    - To add build-time secrets: Modify cmd array to include --secret flags
    - To add progress callbacks: Capture subprocess output line-by-line

    BUILD_ARGS EXAMPLES:
    ['-f', 'Dockerfile.mxfp4'] - Use alternate Dockerfile
    ['--no-cache'] - Force full rebuild
    ['--build-arg', 'VAR=value'] - Pass build-time variables

    Args:
        image: Target image tag
        copy_to: List of worker hostnames to copy image to after build
        build_args: Extra arguments passed to build-and-copy.sh

    Returns:
        True if the build script exited with status 0, False otherwise
        (including when the build script is missing)
    """
    if not BUILD_SCRIPT.exists():
        print(f"Error: Build script not found: {BUILD_SCRIPT}")
        return False

    # build_args comes from YAML, where a bare scalar such as 8 is parsed as
    # an int; both str.join() and subprocess need strings.
    extra = [str(arg) for arg in build_args] if build_args else []

    cmd = [str(BUILD_SCRIPT), "-t", image, *extra]
    if copy_to:
        cmd.extend(["--copy-to", ",".join(copy_to), "--copy-parallel"])

    print(f"Building image '{image}'...")
    if extra:
        print(f"Build args: {' '.join(extra)}")
    if copy_to:
        print(f"Will copy to: {', '.join(copy_to)}")

    result = subprocess.run(cmd)
    return result.returncode == 0
|
|
|
|
|
|
def download_model(model: str, copy_to: list[str] | None = None) -> bool:
    """
    Fetch a model by running the hf-download.sh helper script.

    The helper is called as '<DOWNLOAD_SCRIPT> <model>'; when worker nodes
    are supplied, '--copy-to <a,b,...> --copy-parallel' is appended.

    Args:
        model: HuggingFace model ID (e.g., 'Salyut1/GLM-4.7-NVFP4')
        copy_to: List of worker hostnames to copy model cache to

    Returns:
        True if the helper exited with status 0, False otherwise
        (including when the helper script does not exist)
    """
    if not DOWNLOAD_SCRIPT.exists():
        print(f"Error: Download script not found: {DOWNLOAD_SCRIPT}")
        return False

    # Assemble the full argument vector before printing anything.
    worker_flags = (
        ["--copy-to", ",".join(copy_to), "--copy-parallel"] if copy_to else []
    )
    argv = [str(DOWNLOAD_SCRIPT), model, *worker_flags]

    print(f"Downloading model '{model}'...")
    if copy_to:
        print(f"Will copy to: {', '.join(copy_to)}")

    completed = subprocess.run(argv)
    return completed.returncode == 0
|
|
|
|
|
|
def check_model_exists(model: str) -> bool:
    """
    Report whether a model is present in the HuggingFace cache.

    Looks under ~/.cache/huggingface/hub for the model's cache directory
    ('org/name' becomes 'models--org--name') and requires its 'snapshots'
    entry to exist and contain at least one item.

    Args:
        model: HuggingFace model ID (e.g., 'org/model-name')

    Returns:
        True if a non-empty snapshots directory was found, False otherwise
    """
    # e.g. "Salyut1/GLM-4.7-NVFP4" -> "models--Salyut1--GLM-4.7-NVFP4"
    repo_dir = "models--" + model.replace("/", "--")
    hub_root = Path.home() / ".cache" / "huggingface" / "hub"
    snapshot_root = hub_root / repo_dir / "snapshots"

    # A missing model directory implies a missing snapshots path as well,
    # so a single existence check covers both cases.
    if not snapshot_root.exists():
        return False
    return any(snapshot_root.iterdir())
|
|
|
|
|
|
def generate_launch_script(
|
|
recipe: dict[str, Any],
|
|
overrides: dict[str, Any],
|
|
is_solo: bool = False,
|
|
extra_args: list[str] | None = None,
|
|
no_ray: bool = False,
|
|
) -> str:
|
|
"""
|
|
Generate a bash launch script from the recipe.
|
|
|
|
Creates a self-contained bash script that runs inside the container.
|
|
Handles template substitution, environment variables, and solo mode adjustments.
|
|
|
|
EXTENSIBILITY:
|
|
- To add new template variables: Add them to recipe['defaults'] or CLI overrides
|
|
- To add pre/post hooks: Add 'pre_command'/'post_command' fields to recipe schema
|
|
- To add conditional logic: Use Jinja2 templating instead of str.format()
|
|
- To support GPU selection: Add CUDA_VISIBLE_DEVICES to env handling
|
|
|
|
TEMPLATE VARIABLES (use {variable_name} in recipe command):
|
|
port: API server port (default from recipe)
|
|
host: API server bind address
|
|
tensor_parallel: Number of GPUs for tensor parallelism
|
|
gpu_memory_utilization: Fraction of GPU memory to use
|
|
max_model_len: Maximum sequence length
|
|
(custom variables can be added via recipe defaults)
|
|
|
|
SOLO MODE BEHAVIOR:
|
|
- Removes '--distributed-executor-backend ray' lines
|
|
- Typically sets tensor_parallel=1 (handled by caller)
|
|
|
|
EXTRA ARGS:
|
|
- Appended verbatim to the end of the vLLM command
|
|
- Allows passing any vLLM argument not covered by template variables
|
|
- vLLM uses "last wins" semantics for duplicate arguments
|
|
|
|
Args:
|
|
recipe: Loaded recipe dictionary
|
|
overrides: CLI-provided parameter overrides (take precedence over defaults)
|
|
is_solo: If True, strip distributed executor configuration
|
|
extra_args: Additional arguments to append to vLLM command (after --)
|
|
|
|
Returns:
|
|
Complete bash script content as string
|
|
|
|
Raises:
|
|
SystemExit: If required template variables are missing
|
|
"""
|
|
# Merge defaults with overrides
|
|
params = {**recipe.get("defaults", {}), **overrides}
|
|
|
|
# Build the script
|
|
lines = ["#!/bin/bash", f"# Generated from recipe: {recipe['name']}", ""]
|
|
|
|
# Add environment variables
|
|
env_vars = recipe.get("env", {})
|
|
if env_vars:
|
|
lines.append("# Environment variables")
|
|
for key, value in env_vars.items():
|
|
lines.append(f'export {key}="{value}"')
|
|
lines.append("")
|
|
|
|
# Format the command with parameters
|
|
command = recipe["command"]
|
|
try:
|
|
command = command.format(**params)
|
|
except KeyError as e:
|
|
print(f"Error: Missing parameter in recipe command: {e}")
|
|
print(f"Available parameters: {list(params.keys())}")
|
|
sys.exit(1)
|
|
|
|
# In solo or no-ray mode, remove --distributed-executor-backend
|
|
# (not needed for solo; no-ray uses PyTorch distributed instead)
|
|
if is_solo or no_ray:
|
|
import re
|
|
|
|
# Remove just the flag and its value, not the whole line
|
|
command = re.sub(r"--distributed-executor-backend\s+\S+", "", command)
|
|
# Remove lines that are now empty or just a backslash continuation
|
|
lines_list = command.split("\n")
|
|
filtered_lines = [line for line in lines_list if line.strip() not in ("", "\\")]
|
|
command = "\n".join(filtered_lines)
|
|
|
|
# Remove trailing backslash if present
|
|
command = command.rstrip()
|
|
if command.endswith("\\"):
|
|
command = command.rstrip("\\\n").rstrip()
|
|
|
|
# Append extra args if provided (after --)
|
|
if extra_args:
|
|
# Join extra args and append to command
|
|
extra_args_str = " ".join(shlex.quote(a) for a in extra_args)
|
|
command = command + " " + extra_args_str
|
|
|
|
lines.append("# Run the model")
|
|
lines.append(command.strip())
|
|
lines.append("")
|
|
|
|
return "\n".join(lines)
|
|
|
|
|
|
def parse_nodes(nodes_arg: str | None) -> list[str]:
|
|
"""
|
|
Parse comma-separated node list.
|
|
|
|
Simple utility to split node specifications. The first node is
|
|
always treated as the head node for cluster deployments.
|
|
|
|
Args:
|
|
nodes_arg: Comma-separated string like '192.168.1.1,192.168.1.2'
|
|
|
|
Returns:
|
|
List of stripped node identifiers, empty list if input is None/empty
|
|
"""
|
|
if not nodes_arg:
|
|
return []
|
|
return [n.strip() for n in nodes_arg.split(",") if n.strip()]
|
|
|
|
|
|
def get_worker_nodes(nodes: list[str]) -> list[str]:
    """
    Return every node after the first one.

    The first entry of the list is the head node; the remaining entries are
    the workers.

    Args:
        nodes: Full list of nodes (head first, then workers)

    Returns:
        List of worker nodes (excluding head), empty if there is at most one node
    """
    has_workers = len(nodes) > 1
    return nodes[1:] if has_workers else []
|
|
|
|
|
|
def load_env_file() -> dict[str, str]:
    """
    Load environment variables from the .env file.

    Reads the file referenced by the module-level ENV_FILE and parses simple
    KEY=VALUE lines. Blank lines, lines starting with '#' and lines without
    '=' are ignored. A leading 'export ' on the key and surrounding quotes on
    the value are removed.

    EXTENSIBILITY:
    - To add validation: Check for required keys after loading

    KEYS READ ELSEWHERE IN THIS SCRIPT:
    CLUSTER_NODES: Comma-separated list of node IPs
    ETH_IF: Ethernet interface name
    IB_IF: InfiniBand interface name

    Returns:
        Dictionary of key=value pairs from the .env file; empty if ENV_FILE
        has not been set yet or does not point to a regular file
    """
    env: dict[str, str] = {}

    # ENV_FILE is None until main() has processed --config; treat that the
    # same as "no configuration available" instead of failing on None.exists().
    if ENV_FILE is None:
        return env

    env_path = Path(ENV_FILE)
    # is_file() also rejects a directory at that path, which open() cannot read.
    if not env_path.is_file():
        return env

    with open(env_path, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            # Accept shell-style "export KEY=value" lines as well.
            key = key.strip().removeprefix("export ").strip()
            # Remove quotes if present
            value = value.strip().strip('"').strip("'")
            env[key] = value
    return env
|
|
|
|
|
|
def run_autodiscover() -> dict[str, str] | None:
    """
    Run autodiscover.sh interactively and return discovered configuration.

    Sources AUTODISCOVER_SCRIPT in a bash subprocess and calls its
    run_autodiscover function with stdin/stdout inherited, so any prompts
    reach the user. CONFIG_FILE is exported so the script knows where to
    save the configuration; afterwards that file is read back with
    load_env_file().

    Returns:
        Dictionary with the configuration read from the .env file, or None if
        the script is missing, exits non-zero, or CLUSTER_NODES is absent/empty
    """
    if not AUTODISCOVER_SCRIPT.exists():
        print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}")
        return None

    print("Running autodiscover...")
    print()

    # Pass CONFIG_FILE so autodiscover.sh knows where to save the config.
    # Do NOT set CONFIG_FILE_SET=true — that would cause an error if the file
    # doesn't exist yet (it's the file we're about to create).
    env_vars = os.environ.copy()
    env_vars["CONFIG_FILE"] = str(ENV_FILE)
    env_vars["FORCE_DISCOVER"] = "true"
    env_vars.pop("CONFIG_FILE_SET", None)

    # Run autodiscover interactively so its prompts are shown to the user.
    # The script path is shell-quoted because it is spliced into a bash
    # command string; hand-written '...' quoting breaks when the install
    # directory contains a single quote.
    script = (
        f"source {shlex.quote(str(AUTODISCOVER_SCRIPT))}\n"
        "run_autodiscover\n"
    )

    result = subprocess.run(["bash", "-c", script], env=env_vars)

    if result.returncode != 0:
        print("Error: Autodiscover failed")
        return None

    # Read configuration from the .env file that autodiscover.sh wrote
    env = load_env_file()
    if not env.get("CLUSTER_NODES"):
        print("Autodiscover completed but no CLUSTER_NODES found in .env")
        return None

    return env
|
|
|
|
|
|
def main():
|
|
"""
|
|
Main entry point for the recipe runner.
|
|
|
|
Orchestrates the full deployment pipeline:
|
|
1. Parse CLI arguments and load recipe
|
|
2. Resolve cluster nodes (CLI -> .env -> autodiscover)
|
|
3. Build phase: Build container if missing, copy to workers
|
|
4. Download phase: Download model if missing, copy to workers
|
|
5. Run phase: Generate launch script and execute via launch-cluster.sh
|
|
|
|
EXTENSIBILITY:
|
|
- To add new CLI options: Add to the appropriate argument group
|
|
- To add new phases: Insert between existing phases with similar pattern
|
|
- To add pre/post hooks: Add hook execution before/after subprocess calls
|
|
- To add logging: Replace print() with logging module calls
|
|
- To add config file support: Load defaults from ~/.config/vllm-recipes.yaml
|
|
|
|
EXIT CODES:
|
|
0: Success
|
|
1: Error (recipe not found, build failed, validation error, etc.)
|
|
|
|
Returns:
|
|
Exit code for sys.exit()
|
|
"""
|
|
parser = argparse.ArgumentParser(
|
|
description="Run a model using a YAML recipe",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog="""
|
|
Examples:
|
|
# Basic usage
|
|
%(prog)s glm-4.7-nvfp4
|
|
%(prog)s glm-4.7-nvfp4 --port 9000 --solo
|
|
|
|
# Full setup (build container + download model + run)
|
|
%(prog)s glm-4.7-nvfp4 --setup
|
|
|
|
# Cluster deployment (manual)
|
|
%(prog)s glm-4.7-nvfp4 -n 192.168.1.1,192.168.1.2 --setup
|
|
|
|
# Cluster deployment (auto-discover)
|
|
%(prog)s --discover # Detect nodes and save to .env
|
|
%(prog)s glm-4.7-nvfp4 --setup # Uses nodes from .env
|
|
|
|
# Just build/download without running
|
|
%(prog)s glm-4.7-nvfp4 --build-only
|
|
%(prog)s glm-4.7-nvfp4 --download-only
|
|
|
|
# Pass extra arguments to vLLM (after --)
|
|
%(prog)s glm-4.7-nvfp4 --solo -- --load-format safetensors
|
|
%(prog)s glm-4.7-nvfp4 --solo -- --served-model-name my-api
|
|
|
|
# List available recipes
|
|
%(prog)s --list
|
|
|
|
# Show current .env configuration
|
|
%(prog)s --show-env
|
|
""",
|
|
)
|
|
|
|
parser.add_argument(
|
|
"recipe",
|
|
nargs="?",
|
|
help="Path to recipe YAML file (or just the name without .yaml)",
|
|
)
|
|
parser.add_argument(
|
|
"--list", "-l", action="store_true", help="List available recipes"
|
|
)
|
|
|
|
# Setup options
|
|
setup_group = parser.add_argument_group("Setup options")
|
|
setup_group.add_argument(
|
|
"--setup",
|
|
action="store_true",
|
|
help="Full setup: build container (if missing) + download model (if missing) + run",
|
|
)
|
|
setup_group.add_argument(
|
|
"--build-only",
|
|
action="store_true",
|
|
help="Only build/copy the container image, don't run",
|
|
)
|
|
setup_group.add_argument(
|
|
"--download-only",
|
|
action="store_true",
|
|
help="Only download/copy the model, don't run",
|
|
)
|
|
setup_group.add_argument(
|
|
"--force-build", action="store_true", help="Force rebuild even if image exists"
|
|
)
|
|
setup_group.add_argument(
|
|
"--force-download",
|
|
action="store_true",
|
|
help="Force re-download even if model exists",
|
|
)
|
|
|
|
parser.add_argument(
|
|
"--dry-run",
|
|
action="store_true",
|
|
help="Show what would be executed without running",
|
|
)
|
|
|
|
# Override options
|
|
override_group = parser.add_argument_group("Recipe overrides")
|
|
override_group.add_argument("--port", type=int, help="Override port")
|
|
override_group.add_argument("--host", help="Override host")
|
|
override_group.add_argument(
|
|
"--tensor-parallel",
|
|
"--tp",
|
|
type=int,
|
|
dest="tensor_parallel",
|
|
help="Override tensor parallelism",
|
|
)
|
|
override_group.add_argument(
|
|
"--gpu-memory-utilization",
|
|
"--gpu-mem",
|
|
type=float,
|
|
dest="gpu_memory_utilization",
|
|
help="Override GPU memory utilization",
|
|
)
|
|
override_group.add_argument(
|
|
"--max-model-len",
|
|
type=int,
|
|
dest="max_model_len",
|
|
help="Override max model length",
|
|
)
|
|
|
|
# Launch options (passed to launch-cluster.sh)
|
|
launch_group = parser.add_argument_group(
|
|
"Launch options (passed to launch-cluster.sh)"
|
|
)
|
|
launch_group.add_argument(
|
|
"--solo", action="store_true", help="Run in solo mode (single node, no Ray)"
|
|
)
|
|
launch_group.add_argument(
|
|
"-n", "--nodes", help="Comma-separated list of node IPs (first is head node)"
|
|
)
|
|
launch_group.add_argument(
|
|
"-d", "--daemon", action="store_true", help="Run in daemon mode"
|
|
)
|
|
launch_group.add_argument(
|
|
"-t",
|
|
"--container",
|
|
dest="container_override",
|
|
help="Override container image from recipe",
|
|
)
|
|
launch_group.add_argument(
|
|
"--nccl-debug",
|
|
choices=["VERSION", "WARN", "INFO", "TRACE"],
|
|
help="NCCL debug level",
|
|
)
|
|
launch_group.add_argument(
|
|
"-e",
|
|
"--env",
|
|
action="append",
|
|
dest="env_vars",
|
|
default=[],
|
|
metavar="VAR=VALUE",
|
|
help="Environment variable to pass to container (e.g. -e HF_TOKEN=xxx). Can be used multiple times.",
|
|
)
|
|
launch_group.add_argument(
|
|
"--no-ray",
|
|
action="store_true",
|
|
dest="no_ray",
|
|
help="No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)",
|
|
)
|
|
launch_group.add_argument(
|
|
"--master-port",
|
|
"--head-port",
|
|
type=int,
|
|
dest="master_port",
|
|
help="Port for cluster coordination (Ray head port or PyTorch distributed master port, default: 29501)",
|
|
)
|
|
launch_group.add_argument(
|
|
"--name",
|
|
dest="container_name",
|
|
help="Override container name (default: vllm_node)",
|
|
)
|
|
launch_group.add_argument(
|
|
"--eth-if",
|
|
dest="eth_if",
|
|
help="Ethernet interface (overrides .env and auto-detection)",
|
|
)
|
|
launch_group.add_argument(
|
|
"--ib-if",
|
|
dest="ib_if",
|
|
help="InfiniBand interface (overrides .env and auto-detection)",
|
|
)
|
|
launch_group.add_argument(
|
|
"-j",
|
|
dest="build_jobs",
|
|
type=int,
|
|
metavar="N",
|
|
help="Number of parallel build jobs inside container",
|
|
)
|
|
launch_group.add_argument(
|
|
"--no-cache-dirs",
|
|
action="store_true",
|
|
dest="no_cache_dirs",
|
|
help="Do not mount ~/.cache/vllm, ~/.cache/flashinfer, ~/.triton",
|
|
)
|
|
launch_group.add_argument(
|
|
"--non-privileged",
|
|
action="store_true",
|
|
dest="non_privileged",
|
|
help="Run in non-privileged mode (removes --privileged and --ipc=host)",
|
|
)
|
|
launch_group.add_argument(
|
|
"--mem-limit-gb",
|
|
type=int,
|
|
dest="mem_limit_gb",
|
|
help="Memory limit in GB (only with --non-privileged)",
|
|
)
|
|
launch_group.add_argument(
|
|
"--mem-swap-limit-gb",
|
|
type=int,
|
|
dest="mem_swap_limit_gb",
|
|
help="Memory+swap limit in GB (only with --non-privileged)",
|
|
)
|
|
launch_group.add_argument(
|
|
"--pids-limit",
|
|
type=int,
|
|
dest="pids_limit",
|
|
help="Process limit (only with --non-privileged, default: 4096)",
|
|
)
|
|
launch_group.add_argument(
|
|
"--shm-size-gb",
|
|
type=int,
|
|
dest="shm_size_gb",
|
|
help="Shared memory size in GB (only with --non-privileged, default: 64)",
|
|
)
|
|
|
|
# Config file option
|
|
parser.add_argument(
|
|
"--config",
|
|
dest="config_file",
|
|
metavar="FILE",
|
|
help="Path to .env configuration file (default: .env in script directory)",
|
|
)
|
|
|
|
# Cluster discovery options
|
|
discover_group = parser.add_argument_group("Cluster discovery")
|
|
discover_group.add_argument(
|
|
"--discover",
|
|
action="store_true",
|
|
help="Auto-detect cluster nodes and save to .env file",
|
|
)
|
|
discover_group.add_argument(
|
|
"--show-env", action="store_true", help="Show current .env configuration"
|
|
)
|
|
|
|
# Use parse_known_args to allow extra vLLM arguments after --
|
|
args, extra_args = parser.parse_known_args()
|
|
|
|
# Set .env file path (use default if not specified)
|
|
global ENV_FILE
|
|
if args.config_file:
|
|
ENV_FILE = Path(args.config_file).resolve()
|
|
else:
|
|
ENV_FILE = SCRIPT_DIR / ".env"
|
|
|
|
# Filter out the -- separator if present
|
|
if extra_args and extra_args[0] == "--":
|
|
extra_args = extra_args[1:]
|
|
|
|
# Handle --discover (can be run with or without a recipe)
|
|
if args.discover:
|
|
env = run_autodiscover()
|
|
if env is None:
|
|
return 1
|
|
|
|
print("Discovered configuration:")
|
|
for key, value in sorted(env.items()):
|
|
print(f" {key}={value}")
|
|
print()
|
|
|
|
if not args.recipe:
|
|
return 0
|
|
|
|
# Handle --show-env
|
|
if args.show_env:
|
|
env = load_env_file()
|
|
if env:
|
|
print(f"Current .env configuration ({ENV_FILE}):")
|
|
for key, value in sorted(env.items()):
|
|
print(f" {key}={value}")
|
|
else:
|
|
print(f"No .env file found at {ENV_FILE}")
|
|
print("Run with --discover to auto-detect cluster nodes.")
|
|
|
|
if not args.recipe:
|
|
return 0
|
|
print()
|
|
|
|
if args.list:
|
|
list_recipes()
|
|
return 0
|
|
|
|
if not args.recipe:
|
|
parser.print_help()
|
|
return 1
|
|
|
|
# Load recipe
|
|
recipe_path = Path(args.recipe)
|
|
recipe = load_recipe(recipe_path)
|
|
|
|
print(f"Recipe: {recipe['name']}")
|
|
if recipe.get("description"):
|
|
print(f" {recipe['description']}")
|
|
print()
|
|
|
|
# Determine container image
|
|
container = args.container_override or recipe["container"]
|
|
model = recipe.get("model")
|
|
build_args = recipe.get("build_args", [])
|
|
|
|
# Parse nodes - check command line first, then .env file, then autodiscover
|
|
nodes = parse_nodes(args.nodes) if not args.solo else []
|
|
nodes_from_env = False
|
|
eth_if = None
|
|
ib_if = None
|
|
|
|
if not args.solo:
|
|
# Try to load from .env file
|
|
env = load_env_file()
|
|
if not nodes:
|
|
if env.get("CLUSTER_NODES"):
|
|
nodes = parse_nodes(env["CLUSTER_NODES"])
|
|
nodes_from_env = True
|
|
if nodes:
|
|
print(f"Using cluster nodes from .env: {', '.join(nodes)}")
|
|
print()
|
|
else:
|
|
# No nodes specified and no .env - run autodiscover
|
|
print("No cluster nodes configured. Running autodiscover...")
|
|
print()
|
|
|
|
discovered_env = run_autodiscover()
|
|
if discovered_env and discovered_env.get("CLUSTER_NODES"):
|
|
env = discovered_env # use freshly loaded env from autodiscover
|
|
nodes = parse_nodes(discovered_env["CLUSTER_NODES"])
|
|
nodes_from_env = True
|
|
|
|
# Resolve network interfaces: CLI > .env > auto-detect by launch-cluster.sh
|
|
eth_if = args.eth_if or None
|
|
ib_if = args.ib_if or None
|
|
if not eth_if or not ib_if:
|
|
if not eth_if and env.get("ETH_IF"):
|
|
eth_if = env["ETH_IF"]
|
|
if not ib_if and env.get("IB_IF"):
|
|
ib_if = env["IB_IF"]
|
|
|
|
worker_nodes = get_worker_nodes(nodes) if nodes else []
|
|
is_cluster = len(nodes) > 1
|
|
|
|
# Check if recipe requires cluster mode
|
|
cluster_only = recipe.get("cluster_only", False)
|
|
solo_only = recipe.get("solo_only", False)
|
|
is_solo = args.solo or not is_cluster
|
|
|
|
if getattr(args, "no_ray", False) and is_solo:
|
|
print(
|
|
"Error: --no-ray is incompatible with --solo. Solo mode already runs without Ray."
|
|
)
|
|
return 1
|
|
|
|
if cluster_only and is_solo:
|
|
print(f"Error: Recipe '{recipe['name']}' requires cluster mode.")
|
|
print(f"This model is too large to run on a single node.")
|
|
print()
|
|
print("Options:")
|
|
print(
|
|
f" 1. Specify nodes directly: {sys.argv[0]} {args.recipe} -n node1,node2"
|
|
)
|
|
print(f" 2. Auto-discover and save: {sys.argv[0]} --discover")
|
|
print(f" Then run: {sys.argv[0]} {args.recipe}")
|
|
return 1
|
|
if solo_only and not is_solo:
|
|
print(f"Error: Recipe '{recipe['name']}' requires solo mode.")
|
|
print("This recipe is intended to run on a single node only.")
|
|
print()
|
|
print("Options:")
|
|
print(f" 1. Run solo: {sys.argv[0]} {args.recipe} --solo")
|
|
print(f" 2. Remove nodes from .env: {sys.argv[0]} --show-env")
|
|
return 1
|
|
|
|
# Determine copy targets for build/model distribution.
|
|
# Prefer COPY_HOSTS from .env (may differ from CLUSTER_NODES in mesh mode),
|
|
# fall back to worker_nodes derived from CLUSTER_NODES.
|
|
if is_cluster:
|
|
copy_hosts_str = env.get("COPY_HOSTS")
|
|
if copy_hosts_str:
|
|
copy_targets = [h.strip() for h in copy_hosts_str.split(",") if h.strip()]
|
|
else:
|
|
copy_targets = worker_nodes
|
|
else:
|
|
copy_targets = None
|
|
|
|
if args.dry_run:
|
|
print("=== Dry Run ===")
|
|
print(f"Container: {container}")
|
|
if build_args:
|
|
print(f"Build args: {' '.join(build_args)}")
|
|
if model:
|
|
print(f"Model: {model}")
|
|
if cluster_only:
|
|
print("Cluster only: Yes (model too large for single node)")
|
|
if solo_only:
|
|
print("Solo only: Yes (single node only)")
|
|
if nodes:
|
|
source = "(from .env)" if nodes_from_env else ""
|
|
print(f"Nodes: {', '.join(nodes)} {source}".strip())
|
|
print(f" Head: {nodes[0]}")
|
|
if worker_nodes:
|
|
print(f" Workers: {', '.join(worker_nodes)}")
|
|
print(f"Solo mode: {is_solo}")
|
|
if eth_if:
|
|
print(
|
|
f"Ethernet interface: {eth_if}{' (from .env)' if not args.eth_if else ''}"
|
|
)
|
|
if ib_if:
|
|
print(
|
|
f"InfiniBand interface: {ib_if}{' (from .env)' if not args.ib_if else ''}"
|
|
)
|
|
if args.container_name:
|
|
print(f"Container name: {args.container_name}")
|
|
if args.non_privileged:
|
|
print("Non-privileged mode: Yes")
|
|
print()
|
|
|
|
# --- Build Phase ---
|
|
if args.build_only or args.setup or args.force_build:
|
|
if args.dry_run:
|
|
image_exists = check_image_exists(container)
|
|
if args.force_build or not image_exists:
|
|
print(f"Would build container: {container}")
|
|
if copy_targets:
|
|
print(f" Would copy to: {', '.join(copy_targets)}")
|
|
else:
|
|
print(f"Container '{container}' already exists locally.")
|
|
if copy_targets:
|
|
print(f" Would check/copy to workers: {', '.join(copy_targets)}")
|
|
print()
|
|
else:
|
|
image_exists = check_image_exists(container)
|
|
|
|
if args.force_build or not image_exists:
|
|
print("=== Building Container ===")
|
|
if not build_image(container, copy_targets, build_args):
|
|
print("Error: Failed to build container")
|
|
return 1
|
|
print()
|
|
else:
|
|
print(f"Container '{container}' already exists locally.")
|
|
# Check worker nodes in cluster mode
|
|
if copy_targets:
|
|
missing_on = []
|
|
for worker in copy_targets:
|
|
if not check_image_exists(container, worker):
|
|
missing_on.append(worker)
|
|
if missing_on:
|
|
print(f"Container missing on workers: {', '.join(missing_on)}")
|
|
print("Building and copying...")
|
|
if not build_image(container, missing_on, build_args):
|
|
print("Error: Failed to build/copy container")
|
|
return 1
|
|
print()
|
|
|
|
if args.build_only:
|
|
print("Build complete." if not args.dry_run else "")
|
|
return 0
|
|
|
|
# --- Download Phase ---
|
|
if model and (args.download_only or args.setup or args.force_download):
|
|
if args.dry_run:
|
|
model_exists = check_model_exists(model)
|
|
if args.force_download or not model_exists:
|
|
print(f"Would download model: {model}")
|
|
if copy_targets:
|
|
print(f" Would copy to: {', '.join(copy_targets)}")
|
|
else:
|
|
print(f"Model '{model}' already exists in cache.")
|
|
print()
|
|
else:
|
|
model_exists = check_model_exists(model)
|
|
|
|
if args.force_download or not model_exists:
|
|
print("=== Downloading Model ===")
|
|
if not download_model(model, copy_targets):
|
|
print("Error: Failed to download model")
|
|
return 1
|
|
print()
|
|
else:
|
|
print(f"Model '{model}' already exists in cache.")
|
|
print()
|
|
|
|
if args.download_only:
|
|
print("Download complete." if not args.dry_run else "")
|
|
return 0
|
|
|
|
# --- Run Phase ---
|
|
if args.build_only or args.download_only:
|
|
return 0
|
|
|
|
# Check if image exists (if not using --setup)
|
|
if not args.dry_run and not args.setup and not check_image_exists(container):
|
|
print(f"Container image '{container}' not found locally.")
|
|
print()
|
|
print("Options:")
|
|
print(f" 1. Use --setup to build and run")
|
|
print(f" 2. Build manually: ./build-and-copy.sh -t {container}")
|
|
print()
|
|
response = input("Build now? [y/N] ").strip().lower()
|
|
if response == "y":
|
|
if not build_image(container, copy_targets, build_args):
|
|
print("Error: Failed to build image")
|
|
return 1
|
|
else:
|
|
print("Aborting.")
|
|
return 1
|
|
|
|
# Build overrides from CLI args
|
|
overrides = {}
|
|
for key in [
|
|
"port",
|
|
"host",
|
|
"tensor_parallel",
|
|
"gpu_memory_utilization",
|
|
"max_model_len",
|
|
]:
|
|
value = getattr(args, key, None)
|
|
if value is not None:
|
|
overrides[key] = value
|
|
|
|
# In solo mode, default tensor_parallel to 1 (unless user explicitly set --tp)
|
|
if is_solo and "tensor_parallel" not in overrides:
|
|
overrides["tensor_parallel"] = 1
|
|
|
|
# Check for duplicate arguments (warn if extra_args duplicate CLI overrides)
|
|
if extra_args:
|
|
# Map vLLM flags to our override keys
|
|
flag_to_override = {
|
|
"--port": "port",
|
|
"--host": "host",
|
|
"--tensor-parallel-size": "tensor_parallel",
|
|
"-tp": "tensor_parallel",
|
|
"--gpu-memory-utilization": "gpu_memory_utilization",
|
|
"--max-model-len": "max_model_len",
|
|
}
|
|
for i, arg in enumerate(extra_args):
|
|
# Check both exact flag and =value syntax
|
|
flag = arg.split("=")[0] if "=" in arg else arg
|
|
if flag in flag_to_override:
|
|
override_key = flag_to_override[flag]
|
|
if override_key in overrides:
|
|
print(
|
|
f"Warning: '{arg}' in extra args duplicates --{override_key.replace('_', '-')} override"
|
|
)
|
|
print(
|
|
f" vLLM uses last value; extra args appear after template substitution"
|
|
)
|
|
|
|
# Generate launch script
|
|
script_content = generate_launch_script(
|
|
recipe,
|
|
overrides,
|
|
is_solo=is_solo,
|
|
extra_args=extra_args,
|
|
no_ray=getattr(args, "no_ray", False),
|
|
)
|
|
|
|
if args.dry_run:
|
|
print("=== Generated Launch Script ===")
|
|
print(script_content)
|
|
print("=== What would be executed ===")
|
|
print()
|
|
print("1. The above script is saved to a temporary file")
|
|
print()
|
|
print("2. launch-cluster.sh is called with:")
|
|
cmd_parts = [" ./launch-cluster.sh", "-t", container]
|
|
for mod in recipe.get("mods", []):
|
|
cmd_parts.extend(["--apply-mod", mod])
|
|
if args.solo:
|
|
cmd_parts.append("--solo")
|
|
elif not is_cluster:
|
|
cmd_parts.append("--solo")
|
|
if args.daemon:
|
|
cmd_parts.append("-d")
|
|
if getattr(args, "no_ray", False):
|
|
cmd_parts.append("--no-ray")
|
|
if nodes:
|
|
cmd_parts.extend(["-n", ",".join(nodes)])
|
|
if args.nccl_debug:
|
|
cmd_parts.extend(["--nccl-debug", args.nccl_debug])
|
|
for env_var in args.env_vars:
|
|
cmd_parts.extend(["-e", env_var])
|
|
if args.master_port:
|
|
cmd_parts.extend(["--master-port", str(args.master_port)])
|
|
if args.container_name:
|
|
cmd_parts.extend(["--name", args.container_name])
|
|
if eth_if:
|
|
cmd_parts.extend(["--eth-if", eth_if])
|
|
if ib_if:
|
|
cmd_parts.extend(["--ib-if", ib_if])
|
|
if args.build_jobs:
|
|
cmd_parts.extend(["-j", str(args.build_jobs)])
|
|
if args.no_cache_dirs:
|
|
cmd_parts.append("--no-cache-dirs")
|
|
if args.non_privileged:
|
|
cmd_parts.append("--non-privileged")
|
|
if args.mem_limit_gb:
|
|
cmd_parts.extend(["--mem-limit-gb", str(args.mem_limit_gb)])
|
|
if args.mem_swap_limit_gb:
|
|
cmd_parts.extend(["--mem-swap-limit-gb", str(args.mem_swap_limit_gb)])
|
|
if args.pids_limit:
|
|
cmd_parts.extend(["--pids-limit", str(args.pids_limit)])
|
|
if args.shm_size_gb:
|
|
cmd_parts.extend(["--shm-size-gb", str(args.shm_size_gb)])
|
|
if args.config_file:
|
|
cmd_parts.extend(["--config", args.config_file])
|
|
cmd_parts.extend(["\\", "\n --launch-script", "/tmp/tmpXXXXXX.sh"])
|
|
print(" ".join(cmd_parts))
|
|
print()
|
|
print("3. The launch script runs inside the container")
|
|
return 0
|
|
|
|
# Write temporary launch script
|
|
with tempfile.NamedTemporaryFile(mode="w", suffix=".sh", delete=False) as f:
|
|
f.write(script_content)
|
|
temp_script = f.name
|
|
|
|
try:
|
|
os.chmod(temp_script, 0o755)
|
|
|
|
# Build launch-cluster.sh command
|
|
cmd = [str(LAUNCH_SCRIPT), "-t", container]
|
|
|
|
# Add mods
|
|
for mod in recipe.get("mods", []):
|
|
mod_path = SCRIPT_DIR / mod
|
|
if not mod_path.exists():
|
|
print(f"Warning: Mod path not found: {mod_path}")
|
|
cmd.extend(["--apply-mod", str(mod_path)])
|
|
|
|
# Add launch options
|
|
if args.solo:
|
|
cmd.append("--solo")
|
|
elif not is_cluster:
|
|
# Auto-enable solo mode if no cluster nodes specified
|
|
cmd.append("--solo")
|
|
|
|
if args.daemon:
|
|
cmd.append("-d")
|
|
|
|
if getattr(args, "no_ray", False):
|
|
cmd.append("--no-ray")
|
|
|
|
# Pass nodes to launch-cluster.sh (from command line, .env, or autodiscover)
|
|
if nodes:
|
|
cmd.extend(["-n", ",".join(nodes)])
|
|
|
|
if args.nccl_debug:
|
|
cmd.extend(["--nccl-debug", args.nccl_debug])
|
|
|
|
for env_var in args.env_vars:
|
|
cmd.extend(["-e", env_var])
|
|
|
|
if args.master_port:
|
|
cmd.extend(["--master-port", str(args.master_port)])
|
|
if args.container_name:
|
|
cmd.extend(["--name", args.container_name])
|
|
if eth_if:
|
|
cmd.extend(["--eth-if", eth_if])
|
|
if ib_if:
|
|
cmd.extend(["--ib-if", ib_if])
|
|
if args.build_jobs:
|
|
cmd.extend(["-j", str(args.build_jobs)])
|
|
if args.no_cache_dirs:
|
|
cmd.append("--no-cache-dirs")
|
|
if args.non_privileged:
|
|
cmd.append("--non-privileged")
|
|
if args.mem_limit_gb:
|
|
cmd.extend(["--mem-limit-gb", str(args.mem_limit_gb)])
|
|
if args.mem_swap_limit_gb:
|
|
cmd.extend(["--mem-swap-limit-gb", str(args.mem_swap_limit_gb)])
|
|
if args.pids_limit:
|
|
cmd.extend(["--pids-limit", str(args.pids_limit)])
|
|
if args.shm_size_gb:
|
|
cmd.extend(["--shm-size-gb", str(args.shm_size_gb)])
|
|
|
|
if args.config_file:
|
|
cmd.extend(["--config", args.config_file])
|
|
|
|
# Add launch script
|
|
cmd.extend(["--launch-script", temp_script])
|
|
|
|
print(f"=== Launching ===")
|
|
print(f"Container: {container}")
|
|
if recipe.get("mods"):
|
|
print(f"Mods: {', '.join(recipe['mods'])}")
|
|
if is_cluster:
|
|
print(f"Cluster: {len(nodes)} nodes")
|
|
else:
|
|
print("Mode: Solo")
|
|
print()
|
|
|
|
# Execute
|
|
result = subprocess.run(cmd)
|
|
return result.returncode
|
|
|
|
finally:
|
|
# Cleanup temp script
|
|
try:
|
|
os.unlink(temp_script)
|
|
except OSError:
|
|
pass
|
|
|
|
|
|
# Script entry point: only runs when the file is executed directly, not when
# it is imported. Whatever main() returns (0, 1, or the launch-cluster.sh
# return code) becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|