From 30f16f1d4e8db252d4cd95f88588d39d291f87fc Mon Sep 17 00:00:00 2001 From: Raphael Amorim Date: Tue, 3 Feb 2026 15:32:28 -0500 Subject: [PATCH] feat: Add recipe-based one-click model deployment system Introduces a YAML recipe system for simplified model deployment: - run-recipe.py: Main script handling build, download, and launch - run-recipe.sh: Bash wrapper for dependency management - recipes/: Pre-configured recipes for common models - glm-4.7-flash-awq.yaml: GLM-4.7-Flash with AWQ quantization - glm-4.7-nvfp4.yaml: GLM-4.7 with NVFP4 (cluster-only) - minimax-m2-awq.yaml: MiniMax M2 with AWQ - openai-gpt-oss-120b.yaml: OpenAI GPT-OSS 120B with MXFP4 Key features: - Auto-discover cluster nodes with --discover, saves to .env - Load nodes from .env automatically on subsequent runs - cluster_only flag for models requiring multi-node setup - build_args field for Dockerfile selection (--pre-tf, --exp-mxfp4) - Solo mode auto-strips --distributed-executor-backend ray - --setup flag for full build + download + run workflow - --dry-run to preview execution without running Usage: ./run-recipe.sh --discover # Find and save cluster nodes ./run-recipe.sh glm-4.7-flash-awq --solo --setup ./run-recipe.sh glm-4.7-nvfp4 --setup # Uses nodes from .env --- recipes/README.md | 266 +++++++ recipes/glm-4.7-flash-awq.yaml | 64 ++ recipes/minimax-m2-awq.yaml | 40 ++ recipes/openai-gpt-oss-120b.yaml | 52 ++ run-recipe.py | 1123 ++++++++++++++++++++++++++++++ run-recipe.sh | 42 ++ 6 files changed, 1587 insertions(+) create mode 100644 recipes/README.md create mode 100644 recipes/glm-4.7-flash-awq.yaml create mode 100644 recipes/minimax-m2-awq.yaml create mode 100644 recipes/openai-gpt-oss-120b.yaml create mode 100755 run-recipe.py create mode 100755 run-recipe.sh diff --git a/recipes/README.md b/recipes/README.md new file mode 100644 index 0000000..836ec03 --- /dev/null +++ b/recipes/README.md @@ -0,0 +1,266 @@ +# Recipes + +Recipes provide a **one-click solution** for deploying models with pre-configured settings. Each recipe is a YAML file that specifies: + +- HuggingFace model to download +- Container image and build arguments +- Required mods/patches +- Default parameters (port, host, tensor parallelism, etc.) +- Environment variables +- The vLLM serve command + +## Quick Start + +```bash +# List available recipes +./run-recipe.sh --list + +# Run a recipe in solo mode (single node) +./run-recipe.sh glm-4.7-flash-awq --solo + +# Full setup: build container + download model + run +./run-recipe.sh glm-4.7-flash-awq --solo --setup + +# Run with overrides +./run-recipe.sh glm-4.7-flash-awq --solo --port 9000 --gpu-mem 0.8 + +# Cluster deployment +./run-recipe.sh glm-4.7-nvfp4 -n 192.168.1.10,192.168.1.11 --setup +``` + +## Cluster Node Discovery + +The recipe runner can automatically discover cluster nodes: + +```bash +# Auto-discover nodes and save to .env +./run-recipe.sh --discover + +# Show current .env configuration +./run-recipe.sh --show-env + +# Run recipe (uses nodes from .env automatically) +./run-recipe.sh glm-4.7-nvfp4 --setup +``` + +When you run `--discover`, it: +1. Scans the network for nodes with SSH access +2. Prompts you to select which nodes to include +3. Saves the configuration to `.env` + +Future recipe runs will automatically use nodes from `.env` unless you specify `-n` or `--solo`. 
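+
+The saved `.env` is plain `KEY=value` text, so you can also edit it by hand. A typical file looks like this (addresses and interface names are illustrative; the keys are the ones `--discover` writes):
+
+```
+# Auto-generated by run-recipe.py --discover
+
+CLUSTER_NODES="192.168.1.10,192.168.1.11"
+ETH_IF=eth0
+IB_IF=ibp12s0
+LOCAL_IP=192.168.1.10
+```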
+
+## Workflow Modes
+
+### Solo Mode (Single Node)
+```bash
+# Explicitly run in solo mode
+./run-recipe.sh glm-4.7-flash-awq --solo
+
+# If no nodes configured, defaults to solo
+./run-recipe.sh minimax-m2-awq
+```
+
+### Cluster Mode (Multiple Nodes)
+```bash
+# Specify nodes directly (first IP is head node)
+./run-recipe.sh glm-4.7-nvfp4 -n 192.168.1.10,192.168.1.11 --setup
+
+# Or use auto-discovered nodes from .env
+./run-recipe.sh --discover              # First time only
+./run-recipe.sh glm-4.7-nvfp4 --setup
+```
+
+When using cluster mode with `--setup`:
+- Container is built locally and copied to all worker nodes
+- Model is downloaded locally and copied to all worker nodes
+
+### Cluster-Only Recipes
+
+Some models are too large to run on a single node. These recipes have `cluster_only: true` and will fail with a helpful error if you try to run them in solo mode:
+
+```bash
+$ ./run-recipe.sh glm-4.7-nvfp4 --solo
+Error: Recipe 'GLM-4.7-NVFP4' requires cluster mode.
+This model is too large to run on a single node.
+
+Options:
+  1. Specify nodes directly: ./run-recipe.sh glm-4.7-nvfp4 -n node1,node2
+  2. Auto-discover and save: ./run-recipe.sh --discover
+     Then run: ./run-recipe.sh glm-4.7-nvfp4
+```
+
+## Setup Options
+
+| Flag | Description |
+|------|-------------|
+| `--setup` | Full setup: build (if missing) + download (if missing) + run |
+| `--build-only` | Only build/copy the container, don't run |
+| `--download-only` | Only download/copy the model, don't run |
+| `--force-build` | Rebuild even if container exists |
+| `--force-download` | Re-download even if model exists |
+| `--dry-run` | Show what would happen without executing |
+
+## Recipe Format
+
+```yaml
+# Required fields
+recipe_version: "1"            # Recipe schema version (currently "1")
+name: Human-readable name
+container: docker-image-name
+command: |
+  vllm serve model/name \
+    --port {port} \
+    --host {host}
+
+# Optional fields
+description: What this recipe does
+model: org/model-name          # HuggingFace model ID for --setup downloads
+cluster_only: false            # Set to true if model requires cluster mode
+build_args:                    # Extra args for build-and-copy.sh
+  - --pre-tf                   # e.g., for transformers 5.0
+  - --exp-mxfp4                # e.g., for MXFP4 Dockerfile
+mods:
+  - mods/some-patch
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  tensor_parallel: 2
+  gpu_memory_utilization: 0.85
+  max_model_len: 32000
+env:
+  SOME_VAR: "value"
+```
+
+### Build Arguments
+
+The `build_args` field passes flags to `build-and-copy.sh`:
+
+| Flag | Description |
+|------|-------------|
+| `--pre-tf` | Use transformers 5.0 (required for GLM-4.7 models) |
+| `--exp-mxfp4` | Use MXFP4 Dockerfile (for MXFP4 quantized models) |
+| `--use-wheels` | Use pre-built wheels instead of building from source |
+
+### Parameter Substitution
+
+Use `{param_name}` in the command to substitute values from defaults or CLI overrides:
+
+```yaml
+defaults:
+  port: 8000
+  tensor_parallel: 2
+
+command: |
+  vllm serve my/model \
+    --port {port} \
+    -tp {tensor_parallel}
+```
+
+Override at runtime:
+```bash
+./run-recipe.sh my-recipe --port 9000 --tp 4
+```
+
+## CLI Reference
+
+```
+Usage: ./run-recipe.sh [OPTIONS] [RECIPE]
+
+Cluster discovery:
+  --discover                  Auto-detect cluster nodes and save to .env
+  --show-env                  Show current .env configuration
+
+Recipe overrides:
+  --port PORT                 Override port
+  --host HOST                 Override host
+  --tensor-parallel, --tp N   Override tensor parallelism
+  --gpu-memory-utilization, --gpu-mem N
+                              Override GPU memory utilization
+  --max-model-len N           Override max model length
+
+Setup options:
+  --setup                     Full setup: build + download + run
+  --build-only                Only build/copy container, don't run
+  --download-only             Only download/copy model, don't run
+  --force-build               Rebuild even if container exists
+  --force-download            Re-download even if model exists
+
+Launch options:
+  --solo                      Run in solo mode (single node, no Ray)
+  -n, --nodes IPS             Comma-separated node IPs (first = head)
+  -d, --daemon                Run in daemon mode
+  -t, --container IMAGE       Override container from recipe
+  --nccl-debug LEVEL          NCCL debug level (VERSION, WARN, INFO, TRACE)
+
+Other:
+  --dry-run                   Show what would be executed
+  --list, -l                  List available recipes
+```
+
+## Creating a Recipe
+
+1. Create a new `.yaml` file in `recipes/`
+2. Specify required fields: `name`, `recipe_version`, `container`, `command`
+3. Add `build_args` if your model needs special build options
+4. Add `mods` if your model needs patches
+5. Set `cluster_only: true` if the model is too large for a single node
+6. Set sensible `defaults`
+7. Add `env` variables if needed
+
+Example:
+```yaml
+recipe_version: "1"
+name: My Model
+description: My custom model setup
+container: vllm-node-tf5
+
+build_args:
+  - --pre-tf
+
+mods:
+  - mods/my-fix
+
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  tensor_parallel: 1
+  gpu_memory_utilization: 0.85
+
+command: |
+  vllm serve org/my-model \
+    --port {port} \
+    --host {host} \
+    -tp {tensor_parallel} \
+    --gpu-memory-utilization {gpu_memory_utilization}
+```
+
+## Architecture
+
+```
+┌─────────────────────────────────────────────────────────┐
+│ run-recipe.sh / run-recipe.py                           │
+│ - Parses YAML recipe                                    │
+│ - Auto-discovers cluster nodes (--discover)             │
+│ - Loads nodes from .env                                 │
+│ - Handles --setup (build + download + run)              │
+│ - Generates launch script from template                 │
+│ - Applies CLI overrides                                 │
+└──────────┬────────────────────────┬─────────────────────┘
+           │ calls (for build)      │ calls (for download)
+           ▼                        ▼
+┌──────────────────────┐  ┌───────────────────────────────┐
+│ build-and-copy.sh    │  │ hf-download.sh                │
+│ - Docker build       │  │ - HuggingFace model download  │
+│ - Copy to workers    │  │ - Rsync to workers            │
+└──────────────────────┘  └───────────────────────────────┘
+                            │
+                            │ then calls (for run)
+                            ▼
+┌─────────────────────────────────────────────────────────┐
+│ launch-cluster.sh                                       │
+│ - Cluster orchestration                                 │
+│ - Container lifecycle                                   │
+│ - Mod application                                       │
+│ - Launch script execution                               │
+└─────────────────────────────────────────────────────────┘
+```
+
+This separation follows the Unix philosophy: `run-recipe.sh` provides convenience, while the underlying scripts remain focused on their specific tasks.
diff --git a/recipes/glm-4.7-flash-awq.yaml b/recipes/glm-4.7-flash-awq.yaml
new file mode 100644
index 0000000..b0acb2f
--- /dev/null
+++ b/recipes/glm-4.7-flash-awq.yaml
@@ -0,0 +1,64 @@
+# Recipe: GLM-4.7-Flash-AWQ-4bit
+# cyankiwi's AWQ-quantized GLM-4.7-Flash model
+# Requires a patch for inference speed optimization
+#
+# NOTE: the vLLM implementation is suboptimal even with the patch.
+# Performance is still significantly slower than it should be for a
+# model with this number of active parameters. Running in a cluster
+# improves prompt processing speed, but not token generation.
+# Expect ~40 t/s generation in both single-node and cluster setups.
+
+recipe_version: "1"
+name: GLM-4.7-Flash-AWQ
+description: vLLM serving cyankiwi/GLM-4.7-Flash-AWQ-4bit with speed optimization patch
+
+# HuggingFace model to download
+model: cyankiwi/GLM-4.7-Flash-AWQ-4bit
+
+# This model can run on a single node (solo) or a cluster
+cluster_only: false
+
+# Container image to use
+container: vllm-node-tf5
+
+# Build arguments for build-and-copy.sh
+# tf5 = transformers 5.0 (required for GLM-4.7)
+build_args:
+  - --pre-tf
+
+# Mods to apply before running (paths relative to repo root)
+# This mod prevents severe inference speed degradation
+mods:
+  - mods/fix-glm-4.7-flash-AWQ
+
+# Default settings (can be overridden via CLI)
+defaults:
+  port: 8888
+  host: 0.0.0.0
+  tensor_parallel: 1
+  gpu_memory_utilization: 0.7
+  max_model_len: 202752
+  max_num_batched_tokens: 4096
+  max_num_seqs: 64
+  served_model_name: glm-4.7-flash
+
+# Environment variables to set in the container
+# (add any required env vars here)
+env: {}
+
+# The vLLM serve command template
+# Use {var_name} for substitution from defaults/overrides
+# In solo mode, any --distributed-executor-backend ray line is stripped
+# and tensor_parallel defaults to 1; override -tp for cluster runs
+command: |
+  vllm serve cyankiwi/GLM-4.7-Flash-AWQ-4bit \
+    --tool-call-parser glm47 \
+    --reasoning-parser glm45 \
+    --enable-auto-tool-choice \
+    --served-model-name {served_model_name} \
+    --max-model-len {max_model_len} \
+    --max-num-batched-tokens {max_num_batched_tokens} \
+    --max-num-seqs {max_num_seqs} \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    -tp {tensor_parallel} \
+    --host {host} \
+    --port {port}
diff --git a/recipes/minimax-m2-awq.yaml b/recipes/minimax-m2-awq.yaml
new file mode 100644
index 0000000..8d584c7
--- /dev/null
+++ b/recipes/minimax-m2-awq.yaml
@@ -0,0 +1,40 @@
+# Recipe: MiniMax-M2-AWQ
+# MiniMax M2 model with AWQ quantization
+
+recipe_version: "1"
+name: MiniMax-M2-AWQ
+description: vLLM serving MiniMax-M2-AWQ with Ray distributed backend
+
+# HuggingFace model to download (optional, used by --setup / --download-only)
+model: QuantTrio/MiniMax-M2-AWQ
+
+# Container image to use
+container: vllm-node
+
+# No mods required
+mods: []
+
+# Default settings (can be overridden via CLI)
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  tensor_parallel: 2
+  gpu_memory_utilization: 0.7
+  max_model_len: 128000
+
+# Environment variables
+env: {}
+
+# The vLLM serve command template
+command: |
+  vllm serve QuantTrio/MiniMax-M2-AWQ \
+    --port {port} \
+    --host {host} \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    -tp {tensor_parallel} \
+    --distributed-executor-backend ray \
+    --max-model-len {max_model_len} \
+    --load-format fastsafetensors \
+    --enable-auto-tool-choice \
+    --tool-call-parser minimax_m2 \
+    --reasoning-parser minimax_m2_append_think
diff --git a/recipes/openai-gpt-oss-120b.yaml b/recipes/openai-gpt-oss-120b.yaml
new file mode 100644
index 0000000..0e56aab
--- /dev/null
+++ b/recipes/openai-gpt-oss-120b.yaml
@@ -0,0 +1,52 @@
+# Recipe: OpenAI GPT-OSS 120B
+# OpenAI's open-source 120B MoE model with MXFP4 quantization support
+
+recipe_version: "1"
+name: OpenAI GPT-OSS 120B
+description: vLLM serving openai/gpt-oss-120b with MXFP4 quantization and FlashInfer
+
+# HuggingFace model to download (optional, used by --setup / --download-only)
+model: openai/gpt-oss-120b
+
+# Container image to use
+container: vllm-node-mxfp4
+
+# Build arguments for build-and-copy.sh
+build_args:
+  - --exp-mxfp4
+
+# No mods required for this model
+mods: []
+
+# Default settings (can be overridden via CLI)
+defaults:
+  port: 8888
+  host: 0.0.0.0
+  tensor_parallel: 2
+
gpu_memory_utilization: 0.70 + max_num_batched_tokens: 8192 + +# Environment variables to set in the container +env: + VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: "1" + +# The vLLM serve command template +# Uses MXFP4 quantization for memory efficiency +command: | + vllm serve openai/gpt-oss-120b \ + --tool-call-parser openai \ + --reasoning-parser openai_gptoss \ + --enable-auto-tool-choice \ + --tensor-parallel-size {tensor_parallel} \ + --distributed-executor-backend ray \ + --gpu-memory-utilization {gpu_memory_utilization} \ + --enable-prefix-caching \ + --load-format fastsafetensors \ + --quantization mxfp4 \ + --mxfp4-backend CUTLASS \ + --mxfp4-layers moe,qkv,o,lm_head \ + --attention-backend FLASHINFER \ + --kv-cache-dtype fp8 \ + --max-num-batched-tokens {max_num_batched_tokens} \ + --host {host} \ + --port {port} diff --git a/run-recipe.py b/run-recipe.py new file mode 100755 index 0000000..16038ee --- /dev/null +++ b/run-recipe.py @@ -0,0 +1,1123 @@ +#!/usr/bin/env python3 +""" +run-recipe.py - One-click model deployment using YAML recipes + +This script provides a high-level interface for deploying models with +pre-configured settings. It handles: +- Model download from HuggingFace (optional) +- Container building and distribution to worker nodes +- Mod application +- Launch script generation +- Both solo (single node) and cluster deployments + +Usage: + ./run-recipe.py recipes/glm-4.7-nvfp4.yaml + ./run-recipe.py glm-4.7-nvfp4 --port 9000 --solo + ./run-recipe.py minimax-m2-awq --setup # Full setup: build + download + run + ./run-recipe.py --list + +================================================================================ +ARCHITECTURE OVERVIEW (for developers extending this script) +================================================================================ + +DEPLOYMENT PIPELINE: + ┌─────────────────────────────────────────────────────────────────────────┐ + │ CLI Args → Load Recipe → Resolve Nodes → Build → Download → Run │ + └─────────────────────────────────────────────────────────────────────────┘ + +KEY ABSTRACTIONS: + - Recipe (YAML): Declarative model configuration (see load_recipe docstring) + - Phases: Build, Download, Run - each can run independently (--build-only, etc.) + - Nodes: Head (first) + Workers (rest) - images/models copied to workers + +EXTENSION POINTS: + + 1. ADD NEW RECIPE FIELDS: + - Update load_recipe() to validate/set defaults + - Use the field in generate_launch_script() or main() + - Document in recipe YAML schema below + + 2. ADD NEW CLI OPTIONS: + - Add to appropriate argument group in main() + - Handle in the corresponding phase (build/download/run) + - Pass to generate_launch_script() via overrides dict if needed + + 3. ADD NEW DEPLOYMENT PHASES: + - Follow the pattern: check if needed → dry-run print → execute + - Insert between existing phases in main() + - Add corresponding --phase-only flag + + 4. SUPPORT NEW MODEL SOURCES: + - Add detection logic in download_model() or check_model_exists() + - Create new download script or handle inline + + 5. SUPPORT NEW CONTAINER RUNTIMES: + - Modify check_image_exists() and build_image() + - May need to update launch-cluster.sh as well + +RECIPE YAML SCHEMA: + name: str # Required: Human-readable name + recipe_version: str # Required: Recipe schema version (e.g., '1'). Used by run-recipe.py + # to check compatibility and available features. 
+ container: str # Required: Docker image tag + command: str # Required: vLLM serve command with {placeholders} + description: str # Optional: Brief description + model: str # Optional: HuggingFace model ID for --setup + mods: list[str] # Optional: Mod directories to apply + defaults: dict # Optional: Default values for command placeholders + env: dict # Optional: Environment variables + build_args: list[str] # Optional: Args for build-and-copy.sh + cluster_only: bool # Optional: Require cluster mode (default: false) + +RECIPE VERSION HISTORY: + Version 1 (default): Initial schema with all fields above supported. + +RELATED FILES: + - run-recipe.sh: Bash wrapper that ensures Python deps are installed + - recipes/*.yaml: Recipe definitions + - launch-cluster.sh: Low-level container orchestration + - build-and-copy.sh: Docker build and distribution + - hf-download.sh: HuggingFace model download and sync + - autodiscover.sh: Network topology detection +""" + +import argparse +import os +import subprocess +import sys +import tempfile +from pathlib import Path +from typing import Any + +try: + import yaml +except ImportError: + print("Error: PyYAML is required. Install with: pip install pyyaml") + sys.exit(1) + + +SCRIPT_DIR = Path(__file__).parent.resolve() +RECIPES_DIR = SCRIPT_DIR / "recipes" +LAUNCH_SCRIPT = SCRIPT_DIR / "launch-cluster.sh" +BUILD_SCRIPT = SCRIPT_DIR / "build-and-copy.sh" +DOWNLOAD_SCRIPT = SCRIPT_DIR / "hf-download.sh" +AUTODISCOVER_SCRIPT = SCRIPT_DIR / "autodiscover.sh" +ENV_FILE = SCRIPT_DIR / ".env" + + +def load_recipe(recipe_path: Path) -> dict[str, Any]: + """ + Load and validate a recipe YAML file. + + This function handles recipe resolution from multiple locations and validates + required fields. Recipes are the core configuration format for deployments. + + EXTENSIBILITY: + - To add new required fields: Add to the 'required' list below + - To add new optional fields with defaults: Add to the setdefault() calls at the end + - Recipe search order: exact path -> recipes/ dir -> with .yaml -> with .yml + + RECIPE SCHEMA: + name (str, required): Human-readable name for the recipe + recipe_version (str, required): Schema version for compatibility checking. + Used by run-recipe.py to determine which features are available. + Current version: '1'. Bump when adding new recipe fields. 
container (str, required): Docker image tag to use (e.g., 'vllm-node-mxfp4')
+        command (str, required): vLLM serve command template with {placeholders}
+        description (str, optional): Brief description shown in --list
+        model (str, optional): HuggingFace model ID for --setup downloads
+        mods (list[str], optional): List of mod directories to apply (e.g., 'mods/fix-glm')
+        defaults (dict, optional): Default values for command placeholders
+        env (dict, optional): Environment variables to export before running
+        build_args (list[str], optional): Extra args for build-and-copy.sh (e.g., ['-f', 'Dockerfile.mxfp4'])
+        cluster_only (bool, optional): If True, recipe cannot run in solo mode
+
+    Args:
+        recipe_path: Path object pointing to YAML file or just recipe name
+
+    Returns:
+        Validated recipe dictionary with all fields populated (defaults applied)
+
+    Raises:
+        SystemExit: If recipe not found, empty, or validation fails
+    """
+    if not recipe_path.exists():
+        # Try recipes directory with various extensions
+        candidates = [
+            RECIPES_DIR / recipe_path.name,
+            RECIPES_DIR / f"{recipe_path.name}.yaml",
+            RECIPES_DIR / f"{recipe_path.name}.yml",
+            RECIPES_DIR / f"{recipe_path.stem}.yaml",
+        ]
+        for candidate in candidates:
+            if candidate.exists():
+                recipe_path = candidate
+                break
+        else:
+            print(f"Error: Recipe not found: {recipe_path}")
+            print(f"Searched in: {recipe_path}, {RECIPES_DIR}")
+            sys.exit(1)
+
+    with open(recipe_path) as f:
+        recipe = yaml.safe_load(f)
+
+    # An empty or non-mapping file loads as None (or a scalar) and would
+    # crash the field checks below, so fail with a clear message instead
+    if not isinstance(recipe, dict):
+        print(f"Error: Recipe is empty or not a YAML mapping: {recipe_path}")
+        sys.exit(1)
+
+    # Validate required fields
+    required = ["name", "recipe_version", "container", "command"]
+    for field in required:
+        if field not in recipe:
+            print(f"Error: Recipe missing required field: {field}")
+            sys.exit(1)
+
+    # Set defaults for optional fields. A key that is present but empty
+    # (e.g. an `env:` section holding only comments) parses as None, so
+    # normalize those to their defaults as well.
+    optional_defaults = {
+        "description": "",
+        "model": None,
+        "mods": [],
+        "defaults": {},
+        "env": {},
+        "build_args": [],
+        "cluster_only": False,
+    }
+    for key, default in optional_defaults.items():
+        if recipe.get(key) is None:
+            recipe[key] = default
+
+    # Validate recipe version compatibility
+    # EXTENSIBILITY: When adding new schema versions, update SUPPORTED_VERSIONS
+    # and add migration/compatibility logic below
+    SUPPORTED_VERSIONS = ["1"]
+    recipe_ver = str(recipe["recipe_version"])
+    if recipe_ver not in SUPPORTED_VERSIONS:
+        print(f"Warning: Recipe uses schema version '{recipe_ver}', but this run-recipe.py supports: {SUPPORTED_VERSIONS}")
+        print("Some features may not work correctly. Consider updating run-recipe.py.")
+
+    return recipe
+
+
+def list_recipes() -> None:
+    """
+    List all available recipes with their metadata.
+
+    Scans the recipes/ directory for YAML files and displays key information.
+    Used by the --list CLI option.
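+
+    EXAMPLE OUTPUT (abridged sketch; fields and order follow the print
+    calls in the loop below):
+
+        glm-4.7-flash-awq.yaml
+          Name: GLM-4.7-Flash-AWQ
+          Description: vLLM serving cyankiwi/GLM-4.7-Flash-AWQ-4bit with speed optimization patch
+          Model: cyankiwi/GLM-4.7-Flash-AWQ-4bit
+          Container: vllm-node-tf5
+          Build args: --pre-tf
+          Mods: mods/fix-glm-4.7-flash-AWQ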
+
+    EXTENSIBILITY:
+    - To show additional fields: Add them to the print statements in the loop
+    - To support different output formats (e.g., JSON): Add a format parameter
+    - Recipe directory is defined by RECIPES_DIR constant at module level
+    """
+    if not RECIPES_DIR.exists():
+        print("No recipes directory found.")
+        return
+
+    recipes = sorted(RECIPES_DIR.glob("*.yaml"))
+    if not recipes:
+        print("No recipes found in recipes/ directory.")
+        return
+
+    print("Available recipes:\n")
+    for recipe_path in recipes:
+        try:
+            recipe = load_recipe(recipe_path)
+            name = recipe.get("name", recipe_path.stem)
+            desc = recipe.get("description", "")
+            container = recipe.get("container", "vllm-node")
+            build_args = recipe.get("build_args", [])
+            model = recipe.get("model", "")
+            mods = recipe.get("mods", [])
+            cluster_only = recipe.get("cluster_only", False)
+
+            print(f"  {recipe_path.name}")
+            print(f"    Name: {name}")
+            if desc:
+                print(f"    Description: {desc}")
+            if model:
+                print(f"    Model: {model}")
+            if cluster_only:
+                print("    Cluster only: Yes")
+            print(f"    Container: {container}")
+            if build_args:
+                print(f"    Build args: {' '.join(build_args)}")
+            if mods:
+                print(f"    Mods: {', '.join(mods)}")
+            print()
+        except (Exception, SystemExit) as e:
+            # load_recipe() calls sys.exit() on invalid recipes; catch
+            # SystemExit too so one broken recipe doesn't abort the listing
+            print(f"  {recipe_path.name} (error loading: {e})")
+            print()
+
+
+def check_image_exists(image: str, host: str | None = None) -> bool:
+    """
+    Check if a Docker image exists locally or on a remote host.
+
+    Used to avoid redundant builds and to verify cluster nodes have the image.
+
+    EXTENSIBILITY:
+    - To support other container runtimes (podman): Modify the docker command
+    - To add image version/digest checking: Parse 'docker image inspect' JSON output
+    - For custom SSH options: Modify the ssh command array
+
+    Args:
+        image: Docker image tag to check (e.g., 'vllm-node-mxfp4')
+        host: Optional remote hostname/IP. If None, checks locally.
+
+    Returns:
+        True if image exists, False otherwise
+    """
+    if host:
+        result = subprocess.run(
+            ["ssh", "-o", "BatchMode=yes", "-o", "StrictHostKeyChecking=no",
+             host, f"docker image inspect '{image}'"],
+            capture_output=True
+        )
+    else:
+        result = subprocess.run(
+            ["docker", "image", "inspect", image],
+            capture_output=True
+        )
+    return result.returncode == 0
+
+
+def build_image(image: str, copy_to: list[str] | None = None, build_args: list[str] | None = None) -> bool:
+    """
+    Build the container image using build-and-copy.sh.
+
+    Delegates to the build-and-copy.sh script which handles multi-stage builds,
+    cache optimization, and distribution to worker nodes.
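+
+    EXAMPLE (hypothetical tag and worker address; mirrors the cmd
+    construction in the function body):
+
+        build_image("vllm-node-tf5", copy_to=["192.168.1.11"], build_args=["--pre-tf"])
+        # runs: build-and-copy.sh -t vllm-node-tf5 --pre-tf --copy-to 192.168.1.11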
+ + EXTENSIBILITY: + - To add new build options: Add them to build_args in the recipe's build_args field + - To support different Dockerfiles: Use build_args = ['-f', 'Dockerfile.custom'] + - To add build-time secrets: Modify cmd array to include --secret flags + - To add progress callbacks: Capture subprocess output line-by-line + + BUILD_ARGS EXAMPLES: + ['-f', 'Dockerfile.mxfp4'] - Use alternate Dockerfile + ['--no-cache'] - Force full rebuild + ['--build-arg', 'VAR=value'] - Pass build-time variables + + Args: + image: Target image tag + copy_to: List of worker hostnames to copy image to after build + build_args: Extra arguments passed to build-and-copy.sh + + Returns: + True if build (and copy) succeeded, False otherwise + """ + if not BUILD_SCRIPT.exists(): + print(f"Error: Build script not found: {BUILD_SCRIPT}") + return False + + cmd = [str(BUILD_SCRIPT), "-t", image] + if build_args: + cmd.extend(build_args) + if copy_to: + cmd.extend(["--copy-to", ",".join(copy_to)]) + + print(f"Building image '{image}'...") + if build_args: + print(f"Build args: {' '.join(build_args)}") + if copy_to: + print(f"Will copy to: {', '.join(copy_to)}") + + result = subprocess.run(cmd) + return result.returncode == 0 + + +def download_model(model: str, copy_to: list[str] | None = None) -> bool: + """ + Download model from HuggingFace using hf-download.sh. + + Delegates to hf-download.sh which handles HF authentication, caching, + and rsync to worker nodes. + + EXTENSIBILITY: + - To support other model sources: Create a new download script and switch based on model URL + - To add download progress: Capture subprocess output + - To support private models: hf-download.sh uses HF_TOKEN env var + - To add model verification: Check sha256 of downloaded files + + Args: + model: HuggingFace model ID (e.g., 'Salyut1/GLM-4.7-NVFP4') + copy_to: List of worker hostnames to copy model cache to + + Returns: + True if download (and copy) succeeded, False otherwise + """ + if not DOWNLOAD_SCRIPT.exists(): + print(f"Error: Download script not found: {DOWNLOAD_SCRIPT}") + return False + + cmd = [str(DOWNLOAD_SCRIPT), model] + if copy_to: + cmd.extend(["--copy-to", ",".join(copy_to)]) + + print(f"Downloading model '{model}'...") + if copy_to: + print(f"Will copy to: {', '.join(copy_to)}") + + result = subprocess.run(cmd) + return result.returncode == 0 + + +def check_model_exists(model: str) -> bool: + """ + Check if a model exists in the HuggingFace cache. + + Checks the standard HF cache location for completed downloads. + + EXTENSIBILITY: + - To support custom cache locations: Add HF_HOME env var support + - To verify model integrity: Check for complete snapshot with config.json + - To support other model sources: Add URL/path prefix detection + + Args: + model: HuggingFace model ID (e.g., 'org/model-name') + + Returns: + True if model appears to be fully downloaded, False otherwise + """ + # Convert model name to cache directory format + # e.g., "Salyut1/GLM-4.7-NVFP4" -> "models--Salyut1--GLM-4.7-NVFP4" + cache_name = f"models--{model.replace('/', '--')}" + cache_path = Path.home() / ".cache" / "huggingface" / "hub" / cache_name + + if cache_path.exists(): + # Check for snapshots directory which indicates complete download + snapshots = cache_path / "snapshots" + if snapshots.exists() and any(snapshots.iterdir()): + return True + return False + + +def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False) -> str: + """ + Generate a bash launch script from the recipe. 
+ + Creates a self-contained bash script that runs inside the container. + Handles template substitution, environment variables, and solo mode adjustments. + + EXTENSIBILITY: + - To add new template variables: Add them to recipe['defaults'] or CLI overrides + - To add pre/post hooks: Add 'pre_command'/'post_command' fields to recipe schema + - To add conditional logic: Use Jinja2 templating instead of str.format() + - To support GPU selection: Add CUDA_VISIBLE_DEVICES to env handling + + TEMPLATE VARIABLES (use {variable_name} in recipe command): + port: API server port (default from recipe) + host: API server bind address + tensor_parallel: Number of GPUs for tensor parallelism + gpu_memory_utilization: Fraction of GPU memory to use + max_model_len: Maximum sequence length + (custom variables can be added via recipe defaults) + + SOLO MODE BEHAVIOR: + - Removes '--distributed-executor-backend ray' lines + - Typically sets tensor_parallel=1 (handled by caller) + + Args: + recipe: Loaded recipe dictionary + overrides: CLI-provided parameter overrides (take precedence over defaults) + is_solo: If True, strip distributed executor configuration + + Returns: + Complete bash script content as string + + Raises: + SystemExit: If required template variables are missing + """ + # Merge defaults with overrides + params = {**recipe.get("defaults", {}), **overrides} + + # Build the script + lines = ["#!/bin/bash", f"# Generated from recipe: {recipe['name']}", ""] + + # Add environment variables + env_vars = recipe.get("env", {}) + if env_vars: + lines.append("# Environment variables") + for key, value in env_vars.items(): + lines.append(f"export {key}=\"{value}\"") + lines.append("") + + # Format the command with parameters + command = recipe["command"] + try: + command = command.format(**params) + except KeyError as e: + print(f"Error: Missing parameter in recipe command: {e}") + print(f"Available parameters: {list(params.keys())}") + sys.exit(1) + + # In solo mode, remove --distributed-executor-backend ray + # (it's not needed and can cause issues on single node) + if is_solo: + import re + # Remove the entire line containing --distributed-executor-backend + # This handles multi-line commands with backslash continuations + lines_list = command.split('\n') + filtered_lines = [ + line for line in lines_list + if '--distributed-executor-backend' not in line + ] + command = '\n'.join(filtered_lines) + + lines.append("# Run the model") + lines.append(command.strip()) + lines.append("") + + return "\n".join(lines) + + +def parse_nodes(nodes_arg: str | None) -> list[str]: + """ + Parse comma-separated node list. + + Simple utility to split node specifications. The first node is + always treated as the head node for cluster deployments. + + Args: + nodes_arg: Comma-separated string like '192.168.1.1,192.168.1.2' + + Returns: + List of stripped node identifiers, empty list if input is None/empty + """ + if not nodes_arg: + return [] + return [n.strip() for n in nodes_arg.split(",") if n.strip()] + + +def get_worker_nodes(nodes: list[str]) -> list[str]: + """ + Get worker nodes (all nodes except the first/head node). + + In a Ray cluster, the first node runs the head process. + Workers are all subsequent nodes that join the cluster. 
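+
+    EXAMPLE (head node is simply dropped):
+        >>> get_worker_nodes(["192.168.1.10", "192.168.1.11", "192.168.1.12"])
+        ['192.168.1.11', '192.168.1.12']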
+ + Args: + nodes: Full list of nodes (head first, then workers) + + Returns: + List of worker nodes (excluding head), empty if single node + """ + if len(nodes) <= 1: + return [] + return nodes[1:] + + +def load_env_file() -> dict[str, str]: + """ + Load environment variables from .env file. + + Reads the .env file created by --discover for persistent cluster configuration. + + EXTENSIBILITY: + - To add new persistent settings: Just add them to save_env_file() + - To support multiple .env files: Add a --env-file CLI argument + - To add validation: Check for required keys after loading + + SUPPORTED KEYS (set by --discover): + CLUSTER_NODES: Comma-separated list of node IPs + LOCAL_IP: This machine's IP address + ETH_IF: Ethernet interface name + IB_IF: InfiniBand interface name (if available) + + Returns: + Dictionary of key=value pairs from .env file + """ + env = {} + if ENV_FILE.exists(): + with open(ENV_FILE) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#") and "=" in line: + key, _, value = line.partition("=") + # Remove quotes if present + value = value.strip().strip('"').strip("'") + env[key.strip()] = value + return env + + +def save_env_file(env: dict[str, str]) -> None: + """ + Save environment variables to .env file. + + Persists cluster configuration discovered by autodiscover.sh. + Values are properly quoted if they contain spaces or commas. + + EXTENSIBILITY: + - To add new persistent settings: Just add them to the env dict before calling + - To add timestamps/metadata: Add comment lines to the output + - To support append mode: Read existing, merge, then write + + Args: + env: Dictionary of key=value pairs to save + """ + lines = ["# Auto-generated by run-recipe.py --discover", ""] + for key, value in sorted(env.items()): + # Quote values with spaces + if " " in value or "," in value: + lines.append(f'{key}="{value}"') + else: + lines.append(f"{key}={value}") + lines.append("") + + with open(ENV_FILE, "w") as f: + f.write("\n".join(lines)) + + print(f"Saved to {ENV_FILE}") + + +def run_autodiscover() -> dict[str, str] | None: + """ + Run autodiscover.sh and return discovered configuration. + + Executes the autodiscover.sh script to detect cluster topology, + then presents an interactive node selection menu. 
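+
+    EXAMPLE RETURN VALUE (addresses illustrative; keys match DISCOVERED
+    VARIABLES below):
+
+        {"CLUSTER_NODES": "192.168.1.10,192.168.1.11",
+         "LOCAL_IP": "192.168.1.10",
+         "ETH_IF": "eth0",
+         "IB_IF": "ibp12s0"}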
+ + EXTENSIBILITY: + - To add new discovery methods: Extend autodiscover.sh or add Python detection here + - To add GPU detection: Add nvidia-smi parsing to discovered env + - To skip interactive selection: Add a --non-interactive flag + - To add node health checks: Ping/SSH test each discovered node + + DISCOVERED VARIABLES: + CLUSTER_NODES: Comma-separated list of node IPs (user-selected) + LOCAL_IP: This machine's IP address + ETH_IF: Ethernet interface name (e.g., 'eth0') + IB_IF: InfiniBand interface name (e.g., 'ibp12s0') if available + + Returns: + Dictionary with discovered configuration, or None if discovery failed + """ + if not AUTODISCOVER_SCRIPT.exists(): + print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}") + return None + + print("Running autodiscover...") + print() + + # Run autodiscover in a subshell and capture the variables + # We source the script and print the variables we care about + script = f""" + source '{AUTODISCOVER_SCRIPT}' + detect_interfaces + detect_local_ip + detect_nodes + echo "CLUSTER_NODES=$NODES_ARG" + echo "LOCAL_IP=$LOCAL_IP" + echo "ETH_IF=$ETH_IF" + echo "IB_IF=$IB_IF" + """ + + result = subprocess.run( + ["bash", "-c", script], + capture_output=True, + text=True + ) + + if result.returncode != 0: + print("Autodiscover output:") + print(result.stdout) + if result.stderr: + print(result.stderr) + print("Error: Autodiscover failed") + return None + + # Print the autodiscover output (excluding the final variable lines) + output_lines = result.stdout.strip().split("\n") + env = {} + for line in output_lines: + if "=" in line and any(line.startswith(k) for k in ["CLUSTER_NODES=", "LOCAL_IP=", "ETH_IF=", "IB_IF="]): + key, _, value = line.partition("=") + env[key] = value + else: + print(line) + + print() + + # Interactive node selection + if env.get("CLUSTER_NODES"): + all_nodes = [n.strip() for n in env["CLUSTER_NODES"].split(",") if n.strip()] + local_ip = env.get("LOCAL_IP", "") + + if len(all_nodes) > 1: + print("Select which nodes to include in the cluster:") + print() + + selected_nodes = [] + for node in all_nodes: + is_local = node == local_ip + label = f"{node} (this machine)" if is_local else node + + # Default to yes for all nodes + while True: + response = input(f" Include {label}? [Y/n]: ").strip().lower() + if response in ("", "y", "yes"): + selected_nodes.append(node) + break + elif response in ("n", "no"): + break + else: + print(" Please enter 'y' or 'n'") + + print() + + if not selected_nodes: + print("No nodes selected. Aborting.") + return None + + if len(selected_nodes) == 1: + print(f"Only one node selected: {selected_nodes[0]}") + print("This will run in solo mode (single node).") + else: + print(f"Selected {len(selected_nodes)} nodes: {', '.join(selected_nodes)}") + + env["CLUSTER_NODES"] = ",".join(selected_nodes) + print() + + return env + + +def main(): + """ + Main entry point for the recipe runner. + + Orchestrates the full deployment pipeline: + 1. Parse CLI arguments and load recipe + 2. Resolve cluster nodes (CLI -> .env -> autodiscover) + 3. Build phase: Build container if missing, copy to workers + 4. Download phase: Download model if missing, copy to workers + 5. 
Run phase: Generate launch script and execute via launch-cluster.sh + + EXTENSIBILITY: + - To add new CLI options: Add to the appropriate argument group + - To add new phases: Insert between existing phases with similar pattern + - To add pre/post hooks: Add hook execution before/after subprocess calls + - To add logging: Replace print() with logging module calls + - To add config file support: Load defaults from ~/.config/vllm-recipes.yaml + + EXIT CODES: + 0: Success + 1: Error (recipe not found, build failed, validation error, etc.) + + Returns: + Exit code for sys.exit() + """ + parser = argparse.ArgumentParser( + description="Run a model using a YAML recipe", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Basic usage + %(prog)s glm-4.7-nvfp4 + %(prog)s glm-4.7-nvfp4 --port 9000 --solo + + # Full setup (build container + download model + run) + %(prog)s glm-4.7-nvfp4 --setup + + # Cluster deployment (manual) + %(prog)s glm-4.7-nvfp4 -n 192.168.1.1,192.168.1.2 --setup + + # Cluster deployment (auto-discover) + %(prog)s --discover # Detect nodes and save to .env + %(prog)s glm-4.7-nvfp4 --setup # Uses nodes from .env + + # Just build/download without running + %(prog)s glm-4.7-nvfp4 --build-only + %(prog)s glm-4.7-nvfp4 --download-only + + # List available recipes + %(prog)s --list + + # Show current .env configuration + %(prog)s --show-env + """ + ) + + parser.add_argument( + "recipe", + nargs="?", + help="Path to recipe YAML file (or just the name without .yaml)" + ) + parser.add_argument( + "--list", "-l", + action="store_true", + help="List available recipes" + ) + + # Setup options + setup_group = parser.add_argument_group("Setup options") + setup_group.add_argument( + "--setup", + action="store_true", + help="Full setup: build container (if missing) + download model (if missing) + run" + ) + setup_group.add_argument( + "--build-only", + action="store_true", + help="Only build/copy the container image, don't run" + ) + setup_group.add_argument( + "--download-only", + action="store_true", + help="Only download/copy the model, don't run" + ) + setup_group.add_argument( + "--force-build", + action="store_true", + help="Force rebuild even if image exists" + ) + setup_group.add_argument( + "--force-download", + action="store_true", + help="Force re-download even if model exists" + ) + + parser.add_argument( + "--dry-run", + action="store_true", + help="Show what would be executed without running" + ) + + # Override options + override_group = parser.add_argument_group("Recipe overrides") + override_group.add_argument("--port", type=int, help="Override port") + override_group.add_argument("--host", help="Override host") + override_group.add_argument("--tensor-parallel", "--tp", type=int, dest="tensor_parallel", help="Override tensor parallelism") + override_group.add_argument("--gpu-memory-utilization", "--gpu-mem", type=float, dest="gpu_memory_utilization", help="Override GPU memory utilization") + override_group.add_argument("--max-model-len", type=int, dest="max_model_len", help="Override max model length") + + # Launch options (passed to launch-cluster.sh) + launch_group = parser.add_argument_group("Launch options (passed to launch-cluster.sh)") + launch_group.add_argument("--solo", action="store_true", help="Run in solo mode (single node, no Ray)") + launch_group.add_argument("-n", "--nodes", help="Comma-separated list of node IPs (first is head node)") + launch_group.add_argument("-d", "--daemon", action="store_true", help="Run in daemon mode") + 
launch_group.add_argument("-t", "--container", dest="container_override", help="Override container image from recipe") + launch_group.add_argument("--nccl-debug", choices=["VERSION", "WARN", "INFO", "TRACE"], help="NCCL debug level") + + # Cluster discovery options + discover_group = parser.add_argument_group("Cluster discovery") + discover_group.add_argument( + "--discover", + action="store_true", + help="Auto-detect cluster nodes and save to .env file" + ) + discover_group.add_argument( + "--show-env", + action="store_true", + help="Show current .env configuration" + ) + + args = parser.parse_args() + + # Handle --discover (can be run with or without a recipe) + if args.discover: + env = run_autodiscover() + if env is None: + return 1 + + print("Discovered configuration:") + for key, value in sorted(env.items()): + print(f" {key}={value}") + print() + + save_env_file(env) + + if not args.recipe: + return 0 + + # Handle --show-env + if args.show_env: + env = load_env_file() + if env: + print(f"Current .env configuration ({ENV_FILE}):") + for key, value in sorted(env.items()): + print(f" {key}={value}") + else: + print(f"No .env file found at {ENV_FILE}") + print("Run with --discover to auto-detect cluster nodes.") + + if not args.recipe: + return 0 + print() + + if args.list: + list_recipes() + return 0 + + if not args.recipe: + parser.print_help() + return 1 + + # Load recipe + recipe_path = Path(args.recipe) + recipe = load_recipe(recipe_path) + + print(f"Recipe: {recipe['name']}") + if recipe.get("description"): + print(f" {recipe['description']}") + print() + + # Determine container image + container = args.container_override or recipe["container"] + model = recipe.get("model") + build_args = recipe.get("build_args", []) + + # Parse nodes - check command line first, then .env file, then autodiscover + nodes = parse_nodes(args.nodes) + nodes_from_env = False + + if not nodes and not args.solo: + # Try to load from .env file + env = load_env_file() + if env.get("CLUSTER_NODES"): + nodes = parse_nodes(env["CLUSTER_NODES"]) + nodes_from_env = True + if nodes: + print(f"Using cluster nodes from .env: {', '.join(nodes)}") + print() + else: + # No nodes specified and no .env - run autodiscover + print("No cluster nodes configured. Running autodiscover...") + print() + + discovered_env = run_autodiscover() + if discovered_env and discovered_env.get("CLUSTER_NODES"): + nodes = parse_nodes(discovered_env["CLUSTER_NODES"]) + nodes_from_env = True + + if nodes: + # Ask if user wants to save to .env + print() + response = input("Save this configuration to .env for future use? [Y/n]: ").strip().lower() + if response in ("", "y", "yes"): + save_env_file(discovered_env) + print() + + worker_nodes = get_worker_nodes(nodes) if nodes else [] + is_cluster = len(nodes) > 1 + + # Check if recipe requires cluster mode + cluster_only = recipe.get("cluster_only", False) + is_solo = args.solo or not is_cluster + + if cluster_only and is_solo: + print(f"Error: Recipe '{recipe['name']}' requires cluster mode.") + print(f"This model is too large to run on a single node.") + print() + print("Options:") + print(f" 1. Specify nodes directly: {sys.argv[0]} {args.recipe} -n node1,node2") + print(f" 2. 
Auto-discover and save: {sys.argv[0]} --discover") + print(f" Then run: {sys.argv[0]} {args.recipe}") + return 1 + + # Determine copy targets for cluster deployments + copy_targets = worker_nodes if is_cluster else None + + if args.dry_run: + print("=== Dry Run ===") + print(f"Container: {container}") + if build_args: + print(f"Build args: {' '.join(build_args)}") + if model: + print(f"Model: {model}") + if cluster_only: + print(f"Cluster only: Yes (model too large for single node)") + if nodes: + source = "(from .env)" if nodes_from_env else "" + print(f"Nodes: {', '.join(nodes)} {source}".strip()) + print(f" Head: {nodes[0]}") + if worker_nodes: + print(f" Workers: {', '.join(worker_nodes)}") + print(f"Solo mode: {is_solo}") + print() + + # --- Build Phase --- + if args.build_only or args.setup or args.force_build: + if args.dry_run: + image_exists = check_image_exists(container) + if args.force_build or not image_exists: + print(f"Would build container: {container}") + if copy_targets: + print(f" Would copy to: {', '.join(copy_targets)}") + else: + print(f"Container '{container}' already exists locally.") + if copy_targets: + print(f" Would check/copy to workers: {', '.join(copy_targets)}") + print() + else: + image_exists = check_image_exists(container) + + if args.force_build or not image_exists: + print("=== Building Container ===") + if not build_image(container, copy_targets, build_args): + print("Error: Failed to build container") + return 1 + print() + else: + print(f"Container '{container}' already exists locally.") + # Check worker nodes in cluster mode + if copy_targets: + missing_on = [] + for worker in copy_targets: + if not check_image_exists(container, worker): + missing_on.append(worker) + if missing_on: + print(f"Container missing on workers: {', '.join(missing_on)}") + print("Building and copying...") + if not build_image(container, missing_on, build_args): + print("Error: Failed to build/copy container") + return 1 + print() + + if args.build_only: + print("Build complete." if not args.dry_run else "") + return 0 + + # --- Download Phase --- + if model and (args.download_only or args.setup or args.force_download): + if args.dry_run: + model_exists = check_model_exists(model) + if args.force_download or not model_exists: + print(f"Would download model: {model}") + if copy_targets: + print(f" Would copy to: {', '.join(copy_targets)}") + else: + print(f"Model '{model}' already exists in cache.") + print() + else: + model_exists = check_model_exists(model) + + if args.force_download or not model_exists: + print("=== Downloading Model ===") + if not download_model(model, copy_targets): + print("Error: Failed to download model") + return 1 + print() + else: + print(f"Model '{model}' already exists in cache.") + print() + + if args.download_only: + print("Download complete." if not args.dry_run else "") + return 0 + + # --- Run Phase --- + if args.build_only or args.download_only: + return 0 + + # Check if image exists (if not using --setup) + if not args.dry_run and not args.setup and not check_image_exists(container): + print(f"Container image '{container}' not found locally.") + print() + print("Options:") + print(f" 1. Use --setup to build and run") + print(f" 2. Build manually: ./build-and-copy.sh -t {container}") + print() + response = input("Build now? 
[y/N] ").strip().lower() + if response == 'y': + if not build_image(container, copy_targets, build_args): + print("Error: Failed to build image") + return 1 + else: + print("Aborting.") + return 1 + + # Build overrides from CLI args + overrides = {} + for key in ["port", "host", "tensor_parallel", "gpu_memory_utilization", "max_model_len"]: + value = getattr(args, key, None) + if value is not None: + overrides[key] = value + + # In solo mode, default tensor_parallel to 1 (unless user explicitly set --tp) + if is_solo and "tensor_parallel" not in overrides: + overrides["tensor_parallel"] = 1 + + # Generate launch script + script_content = generate_launch_script(recipe, overrides, is_solo=is_solo) + + if args.dry_run: + print("=== Generated Launch Script ===") + print(script_content) + print("=== What would be executed ===") + print() + print("1. The above script is saved to a temporary file") + print() + print("2. launch-cluster.sh is called with:") + cmd_parts = [" ./launch-cluster.sh", "-t", container] + for mod in recipe.get("mods", []): + cmd_parts.extend(["--apply-mod", mod]) + if args.solo: + cmd_parts.append("--solo") + elif not is_cluster: + cmd_parts.append("--solo") + if args.daemon: + cmd_parts.append("-d") + if nodes: + cmd_parts.extend(["-n", ",".join(nodes)]) + if args.nccl_debug: + cmd_parts.extend(["--nccl-debug", args.nccl_debug]) + cmd_parts.extend(["\\", "\n --launch-script", "/tmp/tmpXXXXXX.sh"]) + print(" ".join(cmd_parts)) + print() + print("3. The launch script runs inside the container") + return 0 + + # Write temporary launch script + with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f: + f.write(script_content) + temp_script = f.name + + try: + os.chmod(temp_script, 0o755) + + # Build launch-cluster.sh command + cmd = [str(LAUNCH_SCRIPT), "-t", container] + + # Add mods + for mod in recipe.get("mods", []): + mod_path = SCRIPT_DIR / mod + if not mod_path.exists(): + print(f"Warning: Mod path not found: {mod_path}") + cmd.extend(["--apply-mod", str(mod_path)]) + + # Add launch options + if args.solo: + cmd.append("--solo") + elif not is_cluster: + # Auto-enable solo mode if no cluster nodes specified + cmd.append("--solo") + + if args.daemon: + cmd.append("-d") + + # Pass nodes to launch-cluster.sh (from command line, .env, or autodiscover) + if nodes: + cmd.extend(["-n", ",".join(nodes)]) + + if args.nccl_debug: + cmd.extend(["--nccl-debug", args.nccl_debug]) + + # Add launch script + cmd.extend(["--launch-script", temp_script]) + + print(f"=== Launching ===") + print(f"Container: {container}") + if recipe.get("mods"): + print(f"Mods: {', '.join(recipe['mods'])}") + if is_cluster: + print(f"Cluster: {len(nodes)} nodes") + else: + print("Mode: Solo") + print() + + # Execute + result = subprocess.run(cmd) + return result.returncode + + finally: + # Cleanup temp script + try: + os.unlink(temp_script) + except OSError: + pass + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/run-recipe.sh b/run-recipe.sh new file mode 100755 index 0000000..aeb82b2 --- /dev/null +++ b/run-recipe.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# +# run-recipe.sh - Wrapper for run-recipe.py +# +# Ensures Python dependencies are available and runs the recipe runner. +# + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +RECIPE_SCRIPT="$SCRIPT_DIR/run-recipe.py" + +# Check for Python 3.10+ +if command -v python3 &>/dev/null; then + PYTHON=python3 +elif command -v python &>/dev/null; then + PYTHON=python +else + echo "Error: Python 3 not found. 
Please install Python 3.10 or later." + exit 1 +fi + +# Verify version +PY_VERSION=$($PYTHON -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') +PY_MAJOR=$($PYTHON -c 'import sys; print(sys.version_info.major)') +PY_MINOR=$($PYTHON -c 'import sys; print(sys.version_info.minor)') + +if [[ "$PY_MAJOR" -lt 3 ]] || [[ "$PY_MAJOR" -eq 3 && "$PY_MINOR" -lt 10 ]]; then + echo "Error: Python 3.10+ required, found $PY_VERSION" + exit 1 +fi + +# Check for PyYAML and install if missing +if ! $PYTHON -c "import yaml" 2>/dev/null; then + echo "Installing PyYAML..." + $PYTHON -m pip install --quiet pyyaml + if [[ $? -ne 0 ]]; then + echo "Error: Failed to install PyYAML. Try: pip install pyyaml" + exit 1 + fi +fi + +# Run the recipe script +exec $PYTHON "$RECIPE_SCRIPT" "$@"