From 751bc5a47aef4268a1cc17abb325068991e01397 Mon Sep 17 00:00:00 2001 From: Raphael Amorim Date: Sun, 25 Jan 2026 21:22:45 -0500 Subject: [PATCH 1/7] Adding sample profile and profile loader --- README.md | 65 ++++++++-- launch-cluster.sh | 97 ++++++++++++++ profiles/README.md | 184 +++++++++++++++++++++++++++ profiles/example-vllm-minimax.sh | 15 +++ profiles/vllm-glm-4.7-nvfp4.sh | 17 +++ profiles/vllm-openai-gpt-oss-120b.sh | 20 +++ 6 files changed, 390 insertions(+), 8 deletions(-) create mode 100644 profiles/README.md create mode 100644 profiles/example-vllm-minimax.sh create mode 100644 profiles/vllm-glm-4.7-nvfp4.sh create mode 100644 profiles/vllm-openai-gpt-oss-120b.sh diff --git a/README.md b/README.md index 97a400f..da2580c 100644 --- a/README.md +++ b/README.md @@ -16,10 +16,11 @@ While it was primarily developed to support multi-node inference, it works just - [4. Using `run-cluster-node.sh` (Internal)](#4-using-run-cluster-nodesh-internal) - [5. Configuration Details](#5-configuration-details) - [6. Mods and Patches](#6-mods-and-patches) -- [7. Using cluster mode for inference](#7-using-cluster-mode-for-inference) -- [8. Fastsafetensors](#8-fastsafetensors) -- [9. Benchmarking](#9-benchmarking) -- [10. Downloading Models](#10-downloading-models) +- [7. Launch Scripts](#7-launch-scripts) +- [8. Using cluster mode for inference](#8-using-cluster-mode-for-inference) +- [9. Fastsafetensors](#9-fastsafetensors) +- [10. Benchmarking](#10-benchmarking) +- [11. Downloading Models](#11-downloading-models) ## DISCLAIMER @@ -770,7 +771,55 @@ Mods can be used for: - Customizing vLLM behavior for specific workloads - Rapid iteration on development without rebuilding the entire image -## 7\. Using cluster mode for inference +## 7\. Launch Scripts + +Launch scripts provide a simple way to define reusable model configurations. Instead of passing long command lines, you can create a bash script that is copied into the container and executed directly. + +### Basic Usage + +```bash +# Use a launch script by name (looks in profiles/ directory) +./launch-cluster.sh --launch-script example-vllm-minimax + +# Use with explicit nodes +./launch-cluster.sh -n 192.168.1.1,192.168.1.2 --launch-script vllm-openai-gpt-oss-120b.sh + +# Combine with mods for models requiring patches +./launch-cluster.sh --launch-script vllm-glm-4.7-nvfp4.sh --apply-mod mods/fix-Salyut1-GLM-4.7-NVFP4 +``` + +### Script Format + +Launch scripts are simple bash files that run directly inside the container: + +```bash +#!/bin/bash +# PROFILE: OpenAI GPT-OSS 120B +# DESCRIPTION: vLLM serving openai/gpt-oss-120b with FlashInfer MOE optimization + +# Set environment variables if needed +export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 + +# Run your command +vllm serve openai/gpt-oss-120b \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size 2 \ + --distributed-executor-backend ray \ + --enable-auto-tool-choice +``` + +### Available Launch Scripts + +The `profiles/` directory contains ready-to-use launch scripts: + +- **example-vllm-minimax.sh** - MiniMax-M2-AWQ with Ray distributed backend +- **vllm-openai-gpt-oss-120b.sh** - OpenAI GPT-OSS 120B with FlashInfer MOE +- **vllm-glm-4.7-nvfp4.sh** - GLM-4.7-NVFP4 (requires the glm4_moe patch mod) + +See [profiles/README.md](profiles/README.md) for detailed documentation and more examples. + +## 8\. Using cluster mode for inference First, start follow the instructions above to start the head container on your first Spark, and node container on the second Spark. 
Then, on the first Spark, run vllm like this: @@ -787,7 +836,7 @@ docker exec -it vllm_node And execute vllm command inside. -## 8\. Fastsafetensors +## 9\. Fastsafetensors This build includes support for fastsafetensors loading which significantly improves loading speeds, especially on DGX Spark where MMAP performance is very poor currently. [Fasttensors](https://github.com/foundation-model-stack/fastsafetensors/) solve this issue by using more efficient multi-threaded loading while avoiding mmap. @@ -801,11 +850,11 @@ To use this method, simply include `--load-format fastsafetensors` when running HF_HUB_OFFLINE=1 vllm serve openai/gpt-oss-120b --port 8888 --host 0.0.0.0 --trust_remote_code --swap-space 16 --gpu-memory-utilization 0.7 -tp 2 --distributed-executor-backend ray --load-format fastsafetensors ``` -## 9\. Benchmarking +## 10\. Benchmarking I recommend using [llama-benchy](https://github.com/eugr/llama-benchy) - a new benchmarking tool that delivers results in the same format as llama-bench from llama.cpp suite. -## 10\. Downloading Models +## 11\. Downloading Models The `hf-download.sh` script provides a convenient way to download models from HuggingFace and distribute them across your cluster nodes. It uses Huggingface CLI via `uvx` for fast downloads and `rsync` for distribution across the cluster. diff --git a/launch-cluster.sh b/launch-cluster.sh index f3e645d..2851701 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -26,6 +26,8 @@ ACTION="start" CLUSTER_WAS_RUNNING="false" MOD_PATHS=() MOD_TYPES=() +LAUNCH_SCRIPT_PATH="" +SCRIPT_DIR="$(dirname "$(realpath "$0")")" ACTIONS_ARG="" SOLO_MODE="false" @@ -41,11 +43,16 @@ usage() { echo " -e, --env Environment variable to pass to container (e.g. -e VAR=val)" echo " --nccl-debug NCCL debug level (Optional, one of: VERSION, WARN, INFO, TRACE). If no level is provided, defaults to INFO." 
echo " --apply-mod Path to directory or zip file containing run.sh to apply before launch (Can be specified multiple times)" + echo " --launch-script Path to bash script to execute in the container (from profiles/ directory or absolute path)" echo " --check-config Check configuration and auto-detection without launching" echo " --solo Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster" echo " -d Daemon mode (only for 'start' action)" echo " action start | stop | status | exec (Default: start)" echo " command Command to run (only for 'exec' action)" + echo "" + echo "Launch Script Usage:" + echo " $0 --launch-script profiles/my-script.sh # Script copied to container and executed" + echo " $0 --launch-script /path/to/script.sh # Uses absolute path to script" exit 1 } @@ -59,6 +66,7 @@ while [[ "$#" -gt 0 ]]; do --ib-if) IB_IF="$2"; shift ;; -e|--env) DOCKER_ARGS="$DOCKER_ARGS -e $2"; shift ;; --apply-mod) MOD_PATHS+=("$2"); shift ;; + --launch-script) LAUNCH_SCRIPT_PATH="$2"; shift ;; --nccl-debug) if [[ -n "$2" && "$2" =~ ^(VERSION|WARN|INFO|TRACE)$ ]]; then NCCL_DEBUG_VAL="$2" @@ -107,6 +115,37 @@ if [[ -n "$NCCL_DEBUG_VAL" ]]; then esac fi +# Resolve launch script path if specified +if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then + # Check if it's an absolute path or relative path that exists + if [[ -f "$LAUNCH_SCRIPT_PATH" ]]; then + LAUNCH_SCRIPT_PATH=$(realpath "$LAUNCH_SCRIPT_PATH") + # Check if it's just a filename, look in profiles/ directory + elif [[ -f "$SCRIPT_DIR/profiles/$LAUNCH_SCRIPT_PATH" ]]; then + LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/profiles/$LAUNCH_SCRIPT_PATH" + # Check if it's a name without .sh extension + elif [[ -f "$SCRIPT_DIR/profiles/${LAUNCH_SCRIPT_PATH}.sh" ]]; then + LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/profiles/${LAUNCH_SCRIPT_PATH}.sh" + else + echo "Error: Launch script '$LAUNCH_SCRIPT_PATH' not found." + echo "Searched in:" + echo " - $LAUNCH_SCRIPT_PATH" + echo " - $SCRIPT_DIR/profiles/$LAUNCH_SCRIPT_PATH" + echo " - $SCRIPT_DIR/profiles/${LAUNCH_SCRIPT_PATH}.sh" + exit 1 + fi + + echo "Using launch script: $LAUNCH_SCRIPT_PATH" + + # Set command to run the copied script (use absolute path since docker exec may not be in /workspace) + COMMAND_TO_RUN="/workspace/exec-script.sh" + + # If launch script is specified, default action to exec unless explicitly set to stop/status + if [[ "$ACTION" == "start" ]]; then + ACTION="exec" + fi +fi + # Validate MOD_PATHS if set for i in "${!MOD_PATHS[@]}"; do mod_path="${MOD_PATHS[$i]}" @@ -426,6 +465,51 @@ apply_mod_to_container() { fi } +# Copy Launch Script to Container Function +copy_launch_script_to_container() { + local node_ip="$1" + local container="$2" + local is_local="$3" # true/false + local script_path="$4" + + echo "Copying launch script to $node_ip..." + + # Command prefix for remote vs local + local cmd_prefix="" + if [[ "$is_local" == "false" ]]; then + cmd_prefix="ssh -o BatchMode=yes -o StrictHostKeyChecking=no $node_ip" + fi + + local target_script_path="$script_path" + local remote_cleanup_path="" + + # Copy script to remote node first if needed + if [[ "$is_local" == "false" ]]; then + local remote_tmp="/tmp/exec_script_$(date +%s)_$RANDOM.sh" + echo " Copying script to $node_ip:$remote_tmp..." + if ! 
scp -o BatchMode=yes -o StrictHostKeyChecking=no "$script_path" "$node_ip:$remote_tmp"; then + echo "Error: Failed to copy launch script to $node_ip" + exit 1 + fi + target_script_path="$remote_tmp" + remote_cleanup_path="$remote_tmp" + fi + + # Copy script into container as /workspace/exec-script.sh + echo " Copying script into container..." + $cmd_prefix docker cp "$target_script_path" "$container:/workspace/exec-script.sh" + + # Make executable + $cmd_prefix docker exec "$container" chmod +x /workspace/exec-script.sh + + # Cleanup remote temp + if [[ -n "$remote_cleanup_path" ]]; then + ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$node_ip" "rm -f $remote_cleanup_path" + fi + + echo " Launch script copied to $node_ip" +} + # Start Cluster Function start_cluster() { check_cluster_running @@ -494,6 +578,19 @@ start_cluster() { done fi + # Copy launch script if specified + if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then + echo "Copying launch script to cluster nodes..." + + # Copy to Head + copy_launch_script_to_container "$HEAD_IP" "$CONTAINER_NAME" "true" "$LAUNCH_SCRIPT_PATH" + + # Copy to Workers + for worker in "${PEER_NODES[@]}"; do + copy_launch_script_to_container "$worker" "$CONTAINER_NAME" "false" "$LAUNCH_SCRIPT_PATH" + done + fi + if [[ "$SOLO_MODE" == "false" ]]; then wait_for_cluster else diff --git a/profiles/README.md b/profiles/README.md new file mode 100644 index 0000000..470e9f8 --- /dev/null +++ b/profiles/README.md @@ -0,0 +1,184 @@ +# Launch Scripts + +This directory contains bash scripts that can be executed in the container using the `--launch-script` option. Launch scripts are simple, executable bash files that run directly inside the container. + +## Why Launch Scripts? + +- **Simple** - Just write a bash script that runs your command +- **Flexible** - Use any bash features: environment variables, conditionals, loops +- **Standalone** - Each script can be tested directly on a head node +- **No magic** - What you see is what gets executed + +## Usage + +```bash +# Use a launch script by name (looks in profiles/ directory) +./launch-cluster.sh --launch-script example-vllm-minimax + +# Use a launch script by filename +./launch-cluster.sh --launch-script example-vllm-minimax.sh + +# Use a launch script with absolute path +./launch-cluster.sh --launch-script /path/to/my-script.sh + +# Combine with mods if needed +./launch-cluster.sh --launch-script my-script.sh --apply-mod mods/my-patch + +# Combine with other options +./launch-cluster.sh -n 192.168.1.1,192.168.1.2 --launch-script my-model.sh -d +``` + +When using `--launch-script`, the `exec` action is automatically implied if no action is specified. + +## Script Structure + +Launch scripts are simple bash scripts. The script is copied into the container at `/workspace/exec-script.sh` and executed. 
+ +```bash +#!/bin/bash +# PROFILE: Human-readable name +# DESCRIPTION: What this script does + +# Optional: Set environment variables +export MY_VAR="value" + +# Run your command +vllm serve org/model-name \ + --port 8000 \ + --host 0.0.0.0 \ + --gpu-memory-utilization 0.7 +``` + +### Metadata Comments + +The `# PROFILE:` and `# DESCRIPTION:` comments are optional but recommended for documentation: + +```bash +#!/bin/bash +# PROFILE: MiniMax-M2-AWQ Example +# DESCRIPTION: vLLM serving MiniMax-M2-AWQ with Ray distributed backend +``` + +## Examples + +### Basic vLLM Serving + +```bash +#!/bin/bash +# PROFILE: MiniMax-M2-AWQ +# DESCRIPTION: vLLM serving MiniMax-M2-AWQ with Ray distributed backend + +vllm serve QuantTrio/MiniMax-M2-AWQ \ + --port 8000 \ + --host 0.0.0.0 \ + --gpu-memory-utilization 0.7 \ + -tp 2 \ + --distributed-executor-backend ray \ + --max-model-len 128000 \ + --load-format fastsafetensors \ + --enable-auto-tool-choice \ + --tool-call-parser minimax_m2 +``` + +### With Environment Variables + +```bash +#!/bin/bash +# PROFILE: OpenAI GPT-OSS 120B +# DESCRIPTION: vLLM serving openai/gpt-oss-120b with FlashInfer MOE optimization + +# Enable FlashInfer MOE with MXFP4/MXFP8 quantization +export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 + +vllm serve openai/gpt-oss-120b \ + --tool-call-parser openai \ + --enable-auto-tool-choice \ + --tensor-parallel-size 2 \ + --distributed-executor-backend ray \ + --host 0.0.0.0 \ + --port 8000 +``` + +### With Conditional Logic + +```bash +#!/bin/bash +# PROFILE: Adaptive Model Server +# DESCRIPTION: Adjusts settings based on available GPUs + +GPU_COUNT=$(nvidia-smi -L | wc -l) +echo "Detected $GPU_COUNT GPUs" + +if [[ $GPU_COUNT -ge 4 ]]; then + TP_SIZE=4 + MEM_UTIL=0.9 +else + TP_SIZE=2 + MEM_UTIL=0.7 +fi + +vllm serve meta-llama/Llama-3.1-70B-Instruct \ + --port 8000 \ + --host 0.0.0.0 \ + -tp $TP_SIZE \ + --gpu-memory-utilization $MEM_UTIL \ + --distributed-executor-backend ray +``` + +### SGLang + +```bash +#!/bin/bash +# PROFILE: SGLang Llama 3.1 +# DESCRIPTION: SGLang runtime with Llama 3.1 + +sglang launch meta-llama/Llama-3.1-8B-Instruct \ + --port 8000 \ + --host 0.0.0.0 \ + --tp 2 +``` + +### With Model Requiring Patches + +If your model requires patches, use `--apply-mod` alongside `--launch-script`: + +```bash +# Script: vllm-glm-4.7-nvfp4.sh +#!/bin/bash +# PROFILE: Salyut1/GLM-4.7-NVFP4 +# DESCRIPTION: vLLM serving GLM-4.7-NVFP4 +# NOTE: Requires --apply-mod mods/fix-Salyut1-GLM-4.7-NVFP4 + +vllm serve Salyut1/GLM-4.7-NVFP4 \ + --attention-config.backend flashinfer \ + --tool-call-parser glm47 \ + -tp 2 \ + --host 0.0.0.0 \ + --port 8000 +``` + +Usage: +```bash +./launch-cluster.sh --launch-script vllm-glm-4.7-nvfp4.sh --apply-mod mods/fix-Salyut1-GLM-4.7-NVFP4 exec +``` + +## Creating a New Launch Script + +1. Create a new `.sh` file in this directory +2. Add the shebang `#!/bin/bash` +3. Add `# PROFILE:` and `# DESCRIPTION:` comments +4. Write your command (e.g., `vllm serve ...`) +5. Run with `./launch-cluster.sh --launch-script my-script.sh exec` + +## Testing Scripts + +Since launch scripts are standard bash files, you can test them directly: + +```bash +# Inside a running container or on a head node with the runtime installed +cd profiles +./my-script.sh +``` + +This makes development and debugging much easier than complex configuration systems. 
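You can also re-run a script that `--launch-script` has already copied into a running container, without restarting the cluster. A minimal sketch, assuming the container name `vllm_node` shown in the main README and that the script has already been copied to `/workspace/exec-script.sh` by `launch-cluster.sh`:

```bash
# Re-run the copied launch script inside the running head container.
# Assumes the container is named vllm_node (as in the main README) and that
# launch-cluster.sh has already copied the script to /workspace/exec-script.sh
# and marked it executable.
docker exec -it vllm_node /workspace/exec-script.sh
```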
+ diff --git a/profiles/example-vllm-minimax.sh b/profiles/example-vllm-minimax.sh new file mode 100644 index 0000000..c9e30ff --- /dev/null +++ b/profiles/example-vllm-minimax.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# PROFILE: MiniMax-M2-AWQ Example +# DESCRIPTION: vLLM serving MiniMax-M2-AWQ with Ray distributed backend + +vllm serve QuantTrio/MiniMax-M2-AWQ \ + --port 8000 \ + --host 0.0.0.0 \ + --gpu-memory-utilization 0.7 \ + -tp 2 \ + --distributed-executor-backend ray \ + --max-model-len 128000 \ + --load-format fastsafetensors \ + --enable-auto-tool-choice \ + --tool-call-parser minimax_m2 \ + --reasoning-parser minimax_m2_append_think diff --git a/profiles/vllm-glm-4.7-nvfp4.sh b/profiles/vllm-glm-4.7-nvfp4.sh new file mode 100644 index 0000000..b2a037c --- /dev/null +++ b/profiles/vllm-glm-4.7-nvfp4.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# PROFILE: Salyut1/GLM-4.7-NVFP4 +# DESCRIPTION: vLLM serving GLM-4.7-NVFP4 +# NOTE: This profile requires --apply-mod mods/fix-Salyut1-GLM-4.7-NVFP4 to fix k/v scales incompatibility +# See: https://huggingface.co/Salyut1/GLM-4.7-NVFP4/discussions/3#694ab9b6e2efa04b7ecb0c4b + +vllm serve Salyut1/GLM-4.7-NVFP4 \ + --attention-config.backend flashinfer \ + --tool-call-parser glm47 \ + --reasoning-parser glm45 \ + --enable-auto-tool-choice \ + -tp 2 \ + --gpu-memory-utilization 0.88 \ + --max-model-len 32000 \ + --distributed-executor-backend ray \ + --host 0.0.0.0 \ + --port 8000 diff --git a/profiles/vllm-openai-gpt-oss-120b.sh b/profiles/vllm-openai-gpt-oss-120b.sh new file mode 100644 index 0000000..e9283ec --- /dev/null +++ b/profiles/vllm-openai-gpt-oss-120b.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# PROFILE: OpenAI GPT-OSS 120B +# DESCRIPTION: vLLM serving openai/gpt-oss-120b with FlashInfer MOE optimization + +# Enable FlashInfer MOE with MXFP4/MXFP8 quantization +export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 + +vllm serve openai/gpt-oss-120b \ + --tool-call-parser openai \ + --enable-auto-tool-choice \ + --tensor-parallel-size 2 \ + --distributed-executor-backend ray \ + --kv-cache-dtype fp8 \ + --gpu-memory-utilization 0.70 \ + --max-model-len 128000 \ + --max-num-batched-tokens 4096 \ + --max-num-seqs 8 \ + --enable-prefix-caching \ + --host 0.0.0.0 \ + --port 8000 From 30f16f1d4e8db252d4cd95f88588d39d291f87fc Mon Sep 17 00:00:00 2001 From: Raphael Amorim Date: Tue, 3 Feb 2026 15:32:28 -0500 Subject: [PATCH 2/7] feat: Add recipe-based one-click model deployment system Introduces a YAML recipe system for simplified model deployment: - run-recipe.py: Main script handling build, download, and launch - run-recipe.sh: Bash wrapper for dependency management - recipes/: Pre-configured recipes for common models - glm-4.7-flash-awq.yaml: GLM-4.7-Flash with AWQ quantization - glm-4.7-nvfp4.yaml: GLM-4.7 with NVFP4 (cluster-only) - minimax-m2-awq.yaml: MiniMax M2 with AWQ - openai-gpt-oss-120b.yaml: OpenAI GPT-OSS 120B with MXFP4 Key features: - Auto-discover cluster nodes with --discover, saves to .env - Load nodes from .env automatically on subsequent runs - cluster_only flag for models requiring multi-node setup - build_args field for Dockerfile selection (--pre-tf, --exp-mxfp4) - Solo mode auto-strips --distributed-executor-backend ray - --setup flag for full build + download + run workflow - --dry-run to preview execution without running Usage: ./run-recipe.sh --discover # Find and save cluster nodes ./run-recipe.sh glm-4.7-flash-awq --solo --setup ./run-recipe.sh glm-4.7-nvfp4 --setup # Uses nodes from .env --- recipes/README.md | 266 +++++++ 
recipes/glm-4.7-flash-awq.yaml | 64 ++ recipes/minimax-m2-awq.yaml | 40 ++ recipes/openai-gpt-oss-120b.yaml | 52 ++ run-recipe.py | 1123 ++++++++++++++++++++++++++++++ run-recipe.sh | 42 ++ 6 files changed, 1587 insertions(+) create mode 100644 recipes/README.md create mode 100644 recipes/glm-4.7-flash-awq.yaml create mode 100644 recipes/minimax-m2-awq.yaml create mode 100644 recipes/openai-gpt-oss-120b.yaml create mode 100755 run-recipe.py create mode 100755 run-recipe.sh diff --git a/recipes/README.md b/recipes/README.md new file mode 100644 index 0000000..836ec03 --- /dev/null +++ b/recipes/README.md @@ -0,0 +1,266 @@ +# Recipes + +Recipes provide a **one-click solution** for deploying models with pre-configured settings. Each recipe is a YAML file that specifies: + +- HuggingFace model to download +- Container image and build arguments +- Required mods/patches +- Default parameters (port, host, tensor parallelism, etc.) +- Environment variables +- The vLLM serve command + +## Quick Start + +```bash +# List available recipes +./run-recipe.sh --list + +# Run a recipe in solo mode (single node) +./run-recipe.sh glm-4.7-flash-awq --solo + +# Full setup: build container + download model + run +./run-recipe.sh glm-4.7-flash-awq --solo --setup + +# Run with overrides +./run-recipe.sh glm-4.7-flash-awq --solo --port 9000 --gpu-mem 0.8 + +# Cluster deployment +./run-recipe.sh glm-4.7-nvfp4 -n 192.168.1.10,192.168.1.11 --setup +``` + +## Cluster Node Discovery + +The recipe runner can automatically discover cluster nodes: + +```bash +# Auto-discover nodes and save to .env +./run-recipe.sh --discover + +# Show current .env configuration +./run-recipe.sh --show-env + +# Run recipe (uses nodes from .env automatically) +./run-recipe.sh glm-4.7-nvfp4 --setup +``` + +When you run `--discover`, it: +1. Scans the network for nodes with SSH access +2. Prompts you to select which nodes to include +3. Saves the configuration to `.env` + +Future recipe runs will automatically use nodes from `.env` unless you specify `-n` or `--solo`. + +## Workflow Modes + +### Solo Mode (Single Node) +```bash +# Explicitly run in solo mode +./run-recipe.sh glm-4.7-flash-awq --solo + +# If no nodes configured, defaults to solo +./run-recipe.sh minimax-m2-awq +``` + +### Cluster Mode (Multiple Nodes) +```bash +# Specify nodes directly (first IP is head node) +./run-recipe.sh glm-4.7-nvfp4 -n 192.168.1.10,192.168.1.11 --setup + +# Or use auto-discovered nodes from .env +./run-recipe.sh --discover # First time only +./run-recipe.sh glm-4.7-nvfp4 --setup +``` + +When using cluster mode with `--setup`: +- Container is built locally and copied to all worker nodes +- Model is downloaded locally and copied to all worker nodes + +### Cluster-Only Recipes + +Some models are too large to run on a single node. These recipes have `cluster_only: true` and will fail with a helpful error if you try to run them in solo mode: + +```bash +$ ./run-recipe.sh glm-4.7-nvfp4 --solo +Error: Recipe 'GLM-4.7-NVFP4' requires cluster mode. +This model is too large to run on a single node. + +Options: + 1. Specify nodes directly: ./run-recipe.sh glm-4.7-nvfp4 -n node1,node2 + 2. 
Auto-discover and save: ./run-recipe.sh --discover + Then run: ./run-recipe.sh glm-4.7-nvfp4 +``` + +## Setup Options + +| Flag | Description | +|------|-------------| +| `--setup` | Full setup: build (if missing) + download (if missing) + run | +| `--build-only` | Only build/copy the container, don't run | +| `--download-only` | Only download/copy the model, don't run | +| `--force-build` | Rebuild even if container exists | +| `--force-download` | Re-download even if model exists | +| `--dry-run` | Show what would happen without executing | + +## Recipe Format + +```yaml +# Required fields +name: Human-readable name +container: docker-image-name +command: | + vllm serve model/name \ + --port {port} \ + --host {host} + +# Optional fields +description: What this recipe does +model: org/model-name # HuggingFace model ID for --setup downloads +cluster_only: false # Set to true if model requires cluster mode +build_args: # Extra args for build-and-copy.sh + - --pre-tf # e.g., for transformers 5.0 + - --exp-mxfp4 # e.g., for MXFP4 Dockerfile +mods: + - mods/some-patch +defaults: + port: 8000 + host: 0.0.0.0 + tensor_parallel: 2 + gpu_memory_utilization: 0.85 + max_model_len: 32000 +env: + SOME_VAR: "value" +``` + +### Build Arguments + +The `build_args` field passes flags to `build-and-copy.sh`: + +| Flag | Description | +|------|-------------| +| `--pre-tf` | Use transformers 5.0 (required for GLM-4.7 models) | +| `--exp-mxfp4` | Use MXFP4 Dockerfile (for MXFP4 quantized models) | +| `--use-wheels` | Use pre-built wheels instead of building from source | + +### Parameter Substitution + +Use `{param_name}` in the command to substitute values from defaults or CLI overrides: + +```yaml +defaults: + port: 8000 + tensor_parallel: 2 + +command: | + vllm serve my/model \ + --port {port} \ + -tp {tensor_parallel} +``` + +Override at runtime: +```bash +./run-recipe.sh my-recipe --port 9000 --tp 4 +``` + +## CLI Reference + +``` +Usage: ./run-recipe.sh [OPTIONS] [RECIPE] + +Cluster discovery: + --discover Auto-detect cluster nodes and save to .env + --show-env Show current .env configuration + +Recipe overrides: + --port PORT Override port + --host HOST Override host + --tensor-parallel, --tp N Override tensor parallelism + --gpu-memory-utilization N Override GPU memory utilization (--gpu-mem) + --max-model-len N Override max model length + +Setup options: + --setup Full setup: build + download + run + --build-only Only build/copy container, don't run + --download-only Only download/copy model, don't run + --force-build Rebuild even if container exists + --force-download Re-download even if model exists + +Launch options: + --solo Run in solo mode (single node, no Ray) + -n, --nodes IPS Comma-separated node IPs (first = head) + -d, --daemon Run in daemon mode + -t, --container IMAGE Override container from recipe + --nccl-debug LEVEL NCCL debug level (VERSION, WARN, INFO, TRACE) + +Other: + --dry-run Show what would be executed + --list, -l List available recipes +``` + +## Creating a Recipe + +1. Create a new `.yaml` file in `recipes/` +2. Specify required fields: `name`, `container`, `command` +3. Add `build_args` if your model needs special build options +4. Add `mods` if your model needs patches +5. Set `cluster_only: true` if model is too large for single node +6. Set sensible `defaults` +7. 
Add `env` variables if needed + +Example: +```yaml +name: My Model +description: My custom model setup +container: vllm-node-tf5 + +build_args: + - --pre-tf + +mods: + - mods/my-fix + +defaults: + port: 8000 + host: 0.0.0.0 + tensor_parallel: 1 + gpu_memory_utilization: 0.85 + +command: | + vllm serve org/my-model \ + --port {port} \ + --host {host} \ + -tp {tensor_parallel} \ + --gpu-memory-utilization {gpu_memory_utilization} +``` + +## Architecture + +``` +┌─────────────────────────────────────────────────────────┐ +│ run-recipe.sh / run-recipe.py │ +│ - Parses YAML recipe │ +│ - Auto-discovers cluster nodes (--discover) │ +│ - Loads nodes from .env │ +│ - Handles --setup (build + download + run) │ +│ - Generates launch script from template │ +│ - Applies CLI overrides │ +└──────────┬────────────────────────┬─────────────────────┘ + │ calls (for build) │ calls (for download) + ▼ ▼ +┌──────────────────────┐ ┌───────────────────────────────┐ +│ build-and-copy.sh │ │ hf-download.sh │ +│ - Docker build │ │ - HuggingFace model download │ +│ - Copy to workers │ │ - Rsync to workers │ +└──────────────────────┘ └───────────────────────────────┘ + │ + │ then calls (for run) + ▼ +┌─────────────────────────────────────────────────────────┐ +│ launch-cluster.sh │ +│ - Cluster orchestration │ +│ - Container lifecycle │ +│ - Mod application │ +│ - Launch script execution │ +└─────────────────────────────────────────────────────────┘ +``` + +This separation follows the Unix philosophy: `run-recipe.sh` provides convenience, while the underlying scripts remain focused on their specific tasks. diff --git a/recipes/glm-4.7-flash-awq.yaml b/recipes/glm-4.7-flash-awq.yaml new file mode 100644 index 0000000..b0acb2f --- /dev/null +++ b/recipes/glm-4.7-flash-awq.yaml @@ -0,0 +1,64 @@ +# Recipe: GLM-4.7-Flash-AWQ-4bit +# cyankiwi's AWQ quantized GLM-4.7-Flash model +# Requires a patch for inference speed optimization +# +# NOTE: vLLM implementation is suboptimal even with the patch. +# The model performance is still significantly slower than it should be +# for a model with this number of active parameters. Running in cluster +# increases prompt processing performance, but not token generation. +# Expect ~40 t/s generation speed in both single node and cluster. 
+ +recipe_version: "1" +name: GLM-4.7-Flash-AWQ +description: vLLM serving cyankiwi/GLM-4.7-Flash-AWQ-4bit with speed optimization patch + +# HuggingFace model to download +model: cyankiwi/GLM-4.7-Flash-AWQ-4bit + +# This model can run on single node (solo) or cluster +cluster_only: false + +# Container image to use +container: vllm-node-tf5 + +# Build arguments for build-and-copy.sh +# tf5 = transformers 5.0 (required for GLM-4.7) +build_args: + - --pre-tf + +# Mods to apply before running (paths relative to repo root) +# This mod prevents severe inference speed degradation +mods: + - mods/fix-glm-4.7-flash-AWQ + +# Default settings (can be overridden via CLI) +defaults: + port: 8888 + host: 0.0.0.0 + tensor_parallel: 1 + gpu_memory_utilization: 0.7 + max_model_len: 202752 + max_num_batched_tokens: 4096 + max_num_seqs: 64 + served_model_name: glm-4.7-flash + +# Environment variables to set in the container +env: + # Add any required env vars here + +# The vLLM serve command template +# Use {var_name} for substitution from defaults/overrides +# In cluster mode, --distributed-executor-backend ray and -tp 2 are added +command: | + vllm serve cyankiwi/GLM-4.7-Flash-AWQ-4bit \ + --tool-call-parser glm47 \ + --reasoning-parser glm45 \ + --enable-auto-tool-choice \ + --served-model-name {served_model_name} \ + --max-model-len {max_model_len} \ + --max-num-batched-tokens {max_num_batched_tokens} \ + --max-num-seqs {max_num_seqs} \ + --gpu-memory-utilization {gpu_memory_utilization} \ + -tp {tensor_parallel} \ + --host {host} \ + --port {port} diff --git a/recipes/minimax-m2-awq.yaml b/recipes/minimax-m2-awq.yaml new file mode 100644 index 0000000..8d584c7 --- /dev/null +++ b/recipes/minimax-m2-awq.yaml @@ -0,0 +1,40 @@ +# Recipe: MiniMax-M2-AWQ +# MiniMax M2 model with AWQ quantization + +recipe_version: "1" +name: MiniMax-M2-AWQ +description: vLLM serving MiniMax-M2-AWQ with Ray distributed backend + +# HuggingFace model to download (optional, for --download-model) +model: QuantTrio/MiniMax-M2-AWQ + +# Container image to use +container: vllm-node + +# No mods required +mods: [] + +# Default settings (can be overridden via CLI) +defaults: + port: 8000 + host: 0.0.0.0 + tensor_parallel: 2 + gpu_memory_utilization: 0.7 + max_model_len: 128000 + +# Environment variables +env: {} + +# The vLLM serve command template +command: | + vllm serve QuantTrio/MiniMax-M2-AWQ \ + --port {port} \ + --host {host} \ + --gpu-memory-utilization {gpu_memory_utilization} \ + -tp {tensor_parallel} \ + --distributed-executor-backend ray \ + --max-model-len {max_model_len} \ + --load-format fastsafetensors \ + --enable-auto-tool-choice \ + --tool-call-parser minimax_m2 \ + --reasoning-parser minimax_m2_append_think diff --git a/recipes/openai-gpt-oss-120b.yaml b/recipes/openai-gpt-oss-120b.yaml new file mode 100644 index 0000000..0e56aab --- /dev/null +++ b/recipes/openai-gpt-oss-120b.yaml @@ -0,0 +1,52 @@ +# Recipe: OpenAI GPT-OSS 120B +# OpenAI's open source 120B MoE model with MXFP4 quantization support + +recipe_version: "1" +name: OpenAI GPT-OSS 120B +description: vLLM serving openai/gpt-oss-120b with MXFP4 quantization and FlashInfer + +# HuggingFace model to download (optional, for --download-model) +model: openai/gpt-oss-120b + +# Container image to use +container: vllm-node-mxfp4 + +# Build arguments for build-and-copy.sh +build_args: + - --exp-mxfp4 + +# No mods required for this model +mods: [] + +# Default settings (can be overridden via CLI) +defaults: + port: 8888 + host: 0.0.0.0 + tensor_parallel: 2 + 
gpu_memory_utilization: 0.70 + max_num_batched_tokens: 8192 + +# Environment variables to set in the container +env: + VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: "1" + +# The vLLM serve command template +# Uses MXFP4 quantization for memory efficiency +command: | + vllm serve openai/gpt-oss-120b \ + --tool-call-parser openai \ + --reasoning-parser openai_gptoss \ + --enable-auto-tool-choice \ + --tensor-parallel-size {tensor_parallel} \ + --distributed-executor-backend ray \ + --gpu-memory-utilization {gpu_memory_utilization} \ + --enable-prefix-caching \ + --load-format fastsafetensors \ + --quantization mxfp4 \ + --mxfp4-backend CUTLASS \ + --mxfp4-layers moe,qkv,o,lm_head \ + --attention-backend FLASHINFER \ + --kv-cache-dtype fp8 \ + --max-num-batched-tokens {max_num_batched_tokens} \ + --host {host} \ + --port {port} diff --git a/run-recipe.py b/run-recipe.py new file mode 100755 index 0000000..16038ee --- /dev/null +++ b/run-recipe.py @@ -0,0 +1,1123 @@ +#!/usr/bin/env python3 +""" +run-recipe.py - One-click model deployment using YAML recipes + +This script provides a high-level interface for deploying models with +pre-configured settings. It handles: +- Model download from HuggingFace (optional) +- Container building and distribution to worker nodes +- Mod application +- Launch script generation +- Both solo (single node) and cluster deployments + +Usage: + ./run-recipe.py recipes/glm-4.7-nvfp4.yaml + ./run-recipe.py glm-4.7-nvfp4 --port 9000 --solo + ./run-recipe.py minimax-m2-awq --setup # Full setup: build + download + run + ./run-recipe.py --list + +================================================================================ +ARCHITECTURE OVERVIEW (for developers extending this script) +================================================================================ + +DEPLOYMENT PIPELINE: + ┌─────────────────────────────────────────────────────────────────────────┐ + │ CLI Args → Load Recipe → Resolve Nodes → Build → Download → Run │ + └─────────────────────────────────────────────────────────────────────────┘ + +KEY ABSTRACTIONS: + - Recipe (YAML): Declarative model configuration (see load_recipe docstring) + - Phases: Build, Download, Run - each can run independently (--build-only, etc.) + - Nodes: Head (first) + Workers (rest) - images/models copied to workers + +EXTENSION POINTS: + + 1. ADD NEW RECIPE FIELDS: + - Update load_recipe() to validate/set defaults + - Use the field in generate_launch_script() or main() + - Document in recipe YAML schema below + + 2. ADD NEW CLI OPTIONS: + - Add to appropriate argument group in main() + - Handle in the corresponding phase (build/download/run) + - Pass to generate_launch_script() via overrides dict if needed + + 3. ADD NEW DEPLOYMENT PHASES: + - Follow the pattern: check if needed → dry-run print → execute + - Insert between existing phases in main() + - Add corresponding --phase-only flag + + 4. SUPPORT NEW MODEL SOURCES: + - Add detection logic in download_model() or check_model_exists() + - Create new download script or handle inline + + 5. SUPPORT NEW CONTAINER RUNTIMES: + - Modify check_image_exists() and build_image() + - May need to update launch-cluster.sh as well + +RECIPE YAML SCHEMA: + name: str # Required: Human-readable name + recipe_version: str # Required: Recipe schema version (e.g., '1'). Used by run-recipe.py + # to check compatibility and available features. 
+ container: str # Required: Docker image tag + command: str # Required: vLLM serve command with {placeholders} + description: str # Optional: Brief description + model: str # Optional: HuggingFace model ID for --setup + mods: list[str] # Optional: Mod directories to apply + defaults: dict # Optional: Default values for command placeholders + env: dict # Optional: Environment variables + build_args: list[str] # Optional: Args for build-and-copy.sh + cluster_only: bool # Optional: Require cluster mode (default: false) + +RECIPE VERSION HISTORY: + Version 1 (default): Initial schema with all fields above supported. + +RELATED FILES: + - run-recipe.sh: Bash wrapper that ensures Python deps are installed + - recipes/*.yaml: Recipe definitions + - launch-cluster.sh: Low-level container orchestration + - build-and-copy.sh: Docker build and distribution + - hf-download.sh: HuggingFace model download and sync + - autodiscover.sh: Network topology detection +""" + +import argparse +import os +import subprocess +import sys +import tempfile +from pathlib import Path +from typing import Any + +try: + import yaml +except ImportError: + print("Error: PyYAML is required. Install with: pip install pyyaml") + sys.exit(1) + + +SCRIPT_DIR = Path(__file__).parent.resolve() +RECIPES_DIR = SCRIPT_DIR / "recipes" +LAUNCH_SCRIPT = SCRIPT_DIR / "launch-cluster.sh" +BUILD_SCRIPT = SCRIPT_DIR / "build-and-copy.sh" +DOWNLOAD_SCRIPT = SCRIPT_DIR / "hf-download.sh" +AUTODISCOVER_SCRIPT = SCRIPT_DIR / "autodiscover.sh" +ENV_FILE = SCRIPT_DIR / ".env" + + +def load_recipe(recipe_path: Path) -> dict[str, Any]: + """ + Load and validate a recipe YAML file. + + This function handles recipe resolution from multiple locations and validates + required fields. Recipes are the core configuration format for deployments. + + EXTENSIBILITY: + - To add new required fields: Add to the 'required' list below + - To add new optional fields with defaults: Add to the setdefault() calls at the end + - Recipe search order: exact path -> recipes/ dir -> with .yaml -> with .yml + + RECIPE SCHEMA: + name (str, required): Human-readable name for the recipe + recipe_version (str, required): Schema version for compatibility checking. + Used by run-recipe.py to determine which features are available. + Current version: '1'. Bump when adding new recipe fields. 
+ container (str, required): Docker image tag to use (e.g., 'vllm-node-mxfp4') + command (str, required): vLLM serve command template with {placeholders} + description (str, optional): Brief description shown in --list + model (str, optional): HuggingFace model ID for --setup downloads + mods (list[str], optional): List of mod directories to apply (e.g., 'mods/fix-glm') + defaults (dict, optional): Default values for command placeholders + env (dict, optional): Environment variables to export before running + build_args (list[str], optional): Extra args for build-and-copy.sh (e.g., ['-f', 'Dockerfile.mxfp4']) + cluster_only (bool, optional): If True, recipe cannot run in solo mode + + Args: + recipe_path: Path object pointing to YAML file or just recipe name + + Returns: + Validated recipe dictionary with all fields populated (defaults applied) + + Raises: + SystemExit: If recipe not found or validation fails + """ + if not recipe_path.exists(): + # Try recipes directory with various extensions + candidates = [ + RECIPES_DIR / recipe_path.name, + RECIPES_DIR / f"{recipe_path.name}.yaml", + RECIPES_DIR / f"{recipe_path.name}.yml", + RECIPES_DIR / f"{recipe_path.stem}.yaml", + ] + for candidate in candidates: + if candidate.exists(): + recipe_path = candidate + break + else: + print(f"Error: Recipe not found: {recipe_path}") + print(f"Searched in: {recipe_path}, {RECIPES_DIR}") + sys.exit(1) + + with open(recipe_path) as f: + recipe = yaml.safe_load(f) + + # Validate required fields + required = ["name", "recipe_version", "container", "command"] + for field in required: + if field not in recipe: + print(f"Error: Recipe missing required field: {field}") + sys.exit(1) + + # Set defaults for optional fields + recipe.setdefault("description", "") + recipe.setdefault("model", None) + recipe.setdefault("mods", []) + recipe.setdefault("defaults", {}) + recipe.setdefault("env", {}) + + # Validate recipe version compatibility + # EXTENSIBILITY: When adding new schema versions, update SUPPORTED_VERSIONS + # and add migration/compatibility logic below + SUPPORTED_VERSIONS = ["1"] + recipe_ver = str(recipe["recipe_version"]) + if recipe_ver not in SUPPORTED_VERSIONS: + print(f"Warning: Recipe uses schema version '{recipe_ver}', but this run-recipe.py supports: {SUPPORTED_VERSIONS}") + print("Some features may not work correctly. Consider updating run-recipe.py.") + + return recipe + + +def list_recipes() -> None: + """ + List all available recipes with their metadata. + + Scans the recipes/ directory for YAML files and displays key information. + Used by the --list CLI option. 
+ + EXTENSIBILITY: + - To show additional fields: Add them to the print statements in the loop + - To support different output formats (e.g., JSON): Add a format parameter + - Recipe directory is defined by RECIPES_DIR constant at module level + """ + if not RECIPES_DIR.exists(): + print("No recipes directory found.") + return + + recipes = sorted(RECIPES_DIR.glob("*.yaml")) + if not recipes: + print("No recipes found in recipes/ directory.") + return + + print("Available recipes:\n") + for recipe_path in recipes: + try: + recipe = load_recipe(recipe_path) + name = recipe.get("name", recipe_path.stem) + recipe_version = recipe.get("recipe_version", "1") + desc = recipe.get("description", "") + container = recipe.get("container", "vllm-node") + build_args = recipe.get("build_args", []) + model = recipe.get("model", "") + mods = recipe.get("mods", []) + cluster_only = recipe.get("cluster_only", False) + + print(f" {recipe_path.name}") + print(f" Name: {name}") + if desc: + print(f" Description: {desc}") + if model: + print(f" Model: {model}") + if cluster_only: + print(f" Cluster only: Yes") + print(f" Container: {container}") + if build_args: + print(f" Build args: {' '.join(build_args)}") + if mods: + print(f" Mods: {', '.join(mods)}") + print() + except Exception as e: + print(f" {recipe_path.name} (error loading: {e})") + print() + + +def check_image_exists(image: str, host: str | None = None) -> bool: + """ + Check if a Docker image exists locally or on a remote host. + + Used to avoid redundant builds and to verify cluster nodes have the image. + + EXTENSIBILITY: + - To support other container runtimes (podman): Modify the docker command + - To add image version/digest checking: Parse 'docker image inspect' JSON output + - For custom SSH options: Modify the ssh command array + + Args: + image: Docker image tag to check (e.g., 'vllm-node-mxfp4') + host: Optional remote hostname/IP. If None, checks locally. + + Returns: + True if image exists, False otherwise + """ + if host: + result = subprocess.run( + ["ssh", "-o", "BatchMode=yes", "-o", "StrictHostKeyChecking=no", + host, f"docker image inspect '{image}'"], + capture_output=True + ) + else: + result = subprocess.run( + ["docker", "image", "inspect", image], + capture_output=True + ) + return result.returncode == 0 + + +def build_image(image: str, copy_to: list[str] | None = None, build_args: list[str] | None = None) -> bool: + """ + Build the container image using build-and-copy.sh. + + Delegates to the build-and-copy.sh script which handles multi-stage builds, + cache optimization, and distribution to worker nodes. 
+ + EXTENSIBILITY: + - To add new build options: Add them to build_args in the recipe's build_args field + - To support different Dockerfiles: Use build_args = ['-f', 'Dockerfile.custom'] + - To add build-time secrets: Modify cmd array to include --secret flags + - To add progress callbacks: Capture subprocess output line-by-line + + BUILD_ARGS EXAMPLES: + ['-f', 'Dockerfile.mxfp4'] - Use alternate Dockerfile + ['--no-cache'] - Force full rebuild + ['--build-arg', 'VAR=value'] - Pass build-time variables + + Args: + image: Target image tag + copy_to: List of worker hostnames to copy image to after build + build_args: Extra arguments passed to build-and-copy.sh + + Returns: + True if build (and copy) succeeded, False otherwise + """ + if not BUILD_SCRIPT.exists(): + print(f"Error: Build script not found: {BUILD_SCRIPT}") + return False + + cmd = [str(BUILD_SCRIPT), "-t", image] + if build_args: + cmd.extend(build_args) + if copy_to: + cmd.extend(["--copy-to", ",".join(copy_to)]) + + print(f"Building image '{image}'...") + if build_args: + print(f"Build args: {' '.join(build_args)}") + if copy_to: + print(f"Will copy to: {', '.join(copy_to)}") + + result = subprocess.run(cmd) + return result.returncode == 0 + + +def download_model(model: str, copy_to: list[str] | None = None) -> bool: + """ + Download model from HuggingFace using hf-download.sh. + + Delegates to hf-download.sh which handles HF authentication, caching, + and rsync to worker nodes. + + EXTENSIBILITY: + - To support other model sources: Create a new download script and switch based on model URL + - To add download progress: Capture subprocess output + - To support private models: hf-download.sh uses HF_TOKEN env var + - To add model verification: Check sha256 of downloaded files + + Args: + model: HuggingFace model ID (e.g., 'Salyut1/GLM-4.7-NVFP4') + copy_to: List of worker hostnames to copy model cache to + + Returns: + True if download (and copy) succeeded, False otherwise + """ + if not DOWNLOAD_SCRIPT.exists(): + print(f"Error: Download script not found: {DOWNLOAD_SCRIPT}") + return False + + cmd = [str(DOWNLOAD_SCRIPT), model] + if copy_to: + cmd.extend(["--copy-to", ",".join(copy_to)]) + + print(f"Downloading model '{model}'...") + if copy_to: + print(f"Will copy to: {', '.join(copy_to)}") + + result = subprocess.run(cmd) + return result.returncode == 0 + + +def check_model_exists(model: str) -> bool: + """ + Check if a model exists in the HuggingFace cache. + + Checks the standard HF cache location for completed downloads. + + EXTENSIBILITY: + - To support custom cache locations: Add HF_HOME env var support + - To verify model integrity: Check for complete snapshot with config.json + - To support other model sources: Add URL/path prefix detection + + Args: + model: HuggingFace model ID (e.g., 'org/model-name') + + Returns: + True if model appears to be fully downloaded, False otherwise + """ + # Convert model name to cache directory format + # e.g., "Salyut1/GLM-4.7-NVFP4" -> "models--Salyut1--GLM-4.7-NVFP4" + cache_name = f"models--{model.replace('/', '--')}" + cache_path = Path.home() / ".cache" / "huggingface" / "hub" / cache_name + + if cache_path.exists(): + # Check for snapshots directory which indicates complete download + snapshots = cache_path / "snapshots" + if snapshots.exists() and any(snapshots.iterdir()): + return True + return False + + +def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False) -> str: + """ + Generate a bash launch script from the recipe. 
+ + Creates a self-contained bash script that runs inside the container. + Handles template substitution, environment variables, and solo mode adjustments. + + EXTENSIBILITY: + - To add new template variables: Add them to recipe['defaults'] or CLI overrides + - To add pre/post hooks: Add 'pre_command'/'post_command' fields to recipe schema + - To add conditional logic: Use Jinja2 templating instead of str.format() + - To support GPU selection: Add CUDA_VISIBLE_DEVICES to env handling + + TEMPLATE VARIABLES (use {variable_name} in recipe command): + port: API server port (default from recipe) + host: API server bind address + tensor_parallel: Number of GPUs for tensor parallelism + gpu_memory_utilization: Fraction of GPU memory to use + max_model_len: Maximum sequence length + (custom variables can be added via recipe defaults) + + SOLO MODE BEHAVIOR: + - Removes '--distributed-executor-backend ray' lines + - Typically sets tensor_parallel=1 (handled by caller) + + Args: + recipe: Loaded recipe dictionary + overrides: CLI-provided parameter overrides (take precedence over defaults) + is_solo: If True, strip distributed executor configuration + + Returns: + Complete bash script content as string + + Raises: + SystemExit: If required template variables are missing + """ + # Merge defaults with overrides + params = {**recipe.get("defaults", {}), **overrides} + + # Build the script + lines = ["#!/bin/bash", f"# Generated from recipe: {recipe['name']}", ""] + + # Add environment variables + env_vars = recipe.get("env", {}) + if env_vars: + lines.append("# Environment variables") + for key, value in env_vars.items(): + lines.append(f"export {key}=\"{value}\"") + lines.append("") + + # Format the command with parameters + command = recipe["command"] + try: + command = command.format(**params) + except KeyError as e: + print(f"Error: Missing parameter in recipe command: {e}") + print(f"Available parameters: {list(params.keys())}") + sys.exit(1) + + # In solo mode, remove --distributed-executor-backend ray + # (it's not needed and can cause issues on single node) + if is_solo: + import re + # Remove the entire line containing --distributed-executor-backend + # This handles multi-line commands with backslash continuations + lines_list = command.split('\n') + filtered_lines = [ + line for line in lines_list + if '--distributed-executor-backend' not in line + ] + command = '\n'.join(filtered_lines) + + lines.append("# Run the model") + lines.append(command.strip()) + lines.append("") + + return "\n".join(lines) + + +def parse_nodes(nodes_arg: str | None) -> list[str]: + """ + Parse comma-separated node list. + + Simple utility to split node specifications. The first node is + always treated as the head node for cluster deployments. + + Args: + nodes_arg: Comma-separated string like '192.168.1.1,192.168.1.2' + + Returns: + List of stripped node identifiers, empty list if input is None/empty + """ + if not nodes_arg: + return [] + return [n.strip() for n in nodes_arg.split(",") if n.strip()] + + +def get_worker_nodes(nodes: list[str]) -> list[str]: + """ + Get worker nodes (all nodes except the first/head node). + + In a Ray cluster, the first node runs the head process. + Workers are all subsequent nodes that join the cluster. 
+ + Args: + nodes: Full list of nodes (head first, then workers) + + Returns: + List of worker nodes (excluding head), empty if single node + """ + if len(nodes) <= 1: + return [] + return nodes[1:] + + +def load_env_file() -> dict[str, str]: + """ + Load environment variables from .env file. + + Reads the .env file created by --discover for persistent cluster configuration. + + EXTENSIBILITY: + - To add new persistent settings: Just add them to save_env_file() + - To support multiple .env files: Add a --env-file CLI argument + - To add validation: Check for required keys after loading + + SUPPORTED KEYS (set by --discover): + CLUSTER_NODES: Comma-separated list of node IPs + LOCAL_IP: This machine's IP address + ETH_IF: Ethernet interface name + IB_IF: InfiniBand interface name (if available) + + Returns: + Dictionary of key=value pairs from .env file + """ + env = {} + if ENV_FILE.exists(): + with open(ENV_FILE) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#") and "=" in line: + key, _, value = line.partition("=") + # Remove quotes if present + value = value.strip().strip('"').strip("'") + env[key.strip()] = value + return env + + +def save_env_file(env: dict[str, str]) -> None: + """ + Save environment variables to .env file. + + Persists cluster configuration discovered by autodiscover.sh. + Values are properly quoted if they contain spaces or commas. + + EXTENSIBILITY: + - To add new persistent settings: Just add them to the env dict before calling + - To add timestamps/metadata: Add comment lines to the output + - To support append mode: Read existing, merge, then write + + Args: + env: Dictionary of key=value pairs to save + """ + lines = ["# Auto-generated by run-recipe.py --discover", ""] + for key, value in sorted(env.items()): + # Quote values with spaces + if " " in value or "," in value: + lines.append(f'{key}="{value}"') + else: + lines.append(f"{key}={value}") + lines.append("") + + with open(ENV_FILE, "w") as f: + f.write("\n".join(lines)) + + print(f"Saved to {ENV_FILE}") + + +def run_autodiscover() -> dict[str, str] | None: + """ + Run autodiscover.sh and return discovered configuration. + + Executes the autodiscover.sh script to detect cluster topology, + then presents an interactive node selection menu. 
+ + EXTENSIBILITY: + - To add new discovery methods: Extend autodiscover.sh or add Python detection here + - To add GPU detection: Add nvidia-smi parsing to discovered env + - To skip interactive selection: Add a --non-interactive flag + - To add node health checks: Ping/SSH test each discovered node + + DISCOVERED VARIABLES: + CLUSTER_NODES: Comma-separated list of node IPs (user-selected) + LOCAL_IP: This machine's IP address + ETH_IF: Ethernet interface name (e.g., 'eth0') + IB_IF: InfiniBand interface name (e.g., 'ibp12s0') if available + + Returns: + Dictionary with discovered configuration, or None if discovery failed + """ + if not AUTODISCOVER_SCRIPT.exists(): + print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}") + return None + + print("Running autodiscover...") + print() + + # Run autodiscover in a subshell and capture the variables + # We source the script and print the variables we care about + script = f""" + source '{AUTODISCOVER_SCRIPT}' + detect_interfaces + detect_local_ip + detect_nodes + echo "CLUSTER_NODES=$NODES_ARG" + echo "LOCAL_IP=$LOCAL_IP" + echo "ETH_IF=$ETH_IF" + echo "IB_IF=$IB_IF" + """ + + result = subprocess.run( + ["bash", "-c", script], + capture_output=True, + text=True + ) + + if result.returncode != 0: + print("Autodiscover output:") + print(result.stdout) + if result.stderr: + print(result.stderr) + print("Error: Autodiscover failed") + return None + + # Print the autodiscover output (excluding the final variable lines) + output_lines = result.stdout.strip().split("\n") + env = {} + for line in output_lines: + if "=" in line and any(line.startswith(k) for k in ["CLUSTER_NODES=", "LOCAL_IP=", "ETH_IF=", "IB_IF="]): + key, _, value = line.partition("=") + env[key] = value + else: + print(line) + + print() + + # Interactive node selection + if env.get("CLUSTER_NODES"): + all_nodes = [n.strip() for n in env["CLUSTER_NODES"].split(",") if n.strip()] + local_ip = env.get("LOCAL_IP", "") + + if len(all_nodes) > 1: + print("Select which nodes to include in the cluster:") + print() + + selected_nodes = [] + for node in all_nodes: + is_local = node == local_ip + label = f"{node} (this machine)" if is_local else node + + # Default to yes for all nodes + while True: + response = input(f" Include {label}? [Y/n]: ").strip().lower() + if response in ("", "y", "yes"): + selected_nodes.append(node) + break + elif response in ("n", "no"): + break + else: + print(" Please enter 'y' or 'n'") + + print() + + if not selected_nodes: + print("No nodes selected. Aborting.") + return None + + if len(selected_nodes) == 1: + print(f"Only one node selected: {selected_nodes[0]}") + print("This will run in solo mode (single node).") + else: + print(f"Selected {len(selected_nodes)} nodes: {', '.join(selected_nodes)}") + + env["CLUSTER_NODES"] = ",".join(selected_nodes) + print() + + return env + + +def main(): + """ + Main entry point for the recipe runner. + + Orchestrates the full deployment pipeline: + 1. Parse CLI arguments and load recipe + 2. Resolve cluster nodes (CLI -> .env -> autodiscover) + 3. Build phase: Build container if missing, copy to workers + 4. Download phase: Download model if missing, copy to workers + 5. 
Run phase: Generate launch script and execute via launch-cluster.sh + + EXTENSIBILITY: + - To add new CLI options: Add to the appropriate argument group + - To add new phases: Insert between existing phases with similar pattern + - To add pre/post hooks: Add hook execution before/after subprocess calls + - To add logging: Replace print() with logging module calls + - To add config file support: Load defaults from ~/.config/vllm-recipes.yaml + + EXIT CODES: + 0: Success + 1: Error (recipe not found, build failed, validation error, etc.) + + Returns: + Exit code for sys.exit() + """ + parser = argparse.ArgumentParser( + description="Run a model using a YAML recipe", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Basic usage + %(prog)s glm-4.7-nvfp4 + %(prog)s glm-4.7-nvfp4 --port 9000 --solo + + # Full setup (build container + download model + run) + %(prog)s glm-4.7-nvfp4 --setup + + # Cluster deployment (manual) + %(prog)s glm-4.7-nvfp4 -n 192.168.1.1,192.168.1.2 --setup + + # Cluster deployment (auto-discover) + %(prog)s --discover # Detect nodes and save to .env + %(prog)s glm-4.7-nvfp4 --setup # Uses nodes from .env + + # Just build/download without running + %(prog)s glm-4.7-nvfp4 --build-only + %(prog)s glm-4.7-nvfp4 --download-only + + # List available recipes + %(prog)s --list + + # Show current .env configuration + %(prog)s --show-env + """ + ) + + parser.add_argument( + "recipe", + nargs="?", + help="Path to recipe YAML file (or just the name without .yaml)" + ) + parser.add_argument( + "--list", "-l", + action="store_true", + help="List available recipes" + ) + + # Setup options + setup_group = parser.add_argument_group("Setup options") + setup_group.add_argument( + "--setup", + action="store_true", + help="Full setup: build container (if missing) + download model (if missing) + run" + ) + setup_group.add_argument( + "--build-only", + action="store_true", + help="Only build/copy the container image, don't run" + ) + setup_group.add_argument( + "--download-only", + action="store_true", + help="Only download/copy the model, don't run" + ) + setup_group.add_argument( + "--force-build", + action="store_true", + help="Force rebuild even if image exists" + ) + setup_group.add_argument( + "--force-download", + action="store_true", + help="Force re-download even if model exists" + ) + + parser.add_argument( + "--dry-run", + action="store_true", + help="Show what would be executed without running" + ) + + # Override options + override_group = parser.add_argument_group("Recipe overrides") + override_group.add_argument("--port", type=int, help="Override port") + override_group.add_argument("--host", help="Override host") + override_group.add_argument("--tensor-parallel", "--tp", type=int, dest="tensor_parallel", help="Override tensor parallelism") + override_group.add_argument("--gpu-memory-utilization", "--gpu-mem", type=float, dest="gpu_memory_utilization", help="Override GPU memory utilization") + override_group.add_argument("--max-model-len", type=int, dest="max_model_len", help="Override max model length") + + # Launch options (passed to launch-cluster.sh) + launch_group = parser.add_argument_group("Launch options (passed to launch-cluster.sh)") + launch_group.add_argument("--solo", action="store_true", help="Run in solo mode (single node, no Ray)") + launch_group.add_argument("-n", "--nodes", help="Comma-separated list of node IPs (first is head node)") + launch_group.add_argument("-d", "--daemon", action="store_true", help="Run in daemon mode") + 
launch_group.add_argument("-t", "--container", dest="container_override", help="Override container image from recipe") + launch_group.add_argument("--nccl-debug", choices=["VERSION", "WARN", "INFO", "TRACE"], help="NCCL debug level") + + # Cluster discovery options + discover_group = parser.add_argument_group("Cluster discovery") + discover_group.add_argument( + "--discover", + action="store_true", + help="Auto-detect cluster nodes and save to .env file" + ) + discover_group.add_argument( + "--show-env", + action="store_true", + help="Show current .env configuration" + ) + + args = parser.parse_args() + + # Handle --discover (can be run with or without a recipe) + if args.discover: + env = run_autodiscover() + if env is None: + return 1 + + print("Discovered configuration:") + for key, value in sorted(env.items()): + print(f" {key}={value}") + print() + + save_env_file(env) + + if not args.recipe: + return 0 + + # Handle --show-env + if args.show_env: + env = load_env_file() + if env: + print(f"Current .env configuration ({ENV_FILE}):") + for key, value in sorted(env.items()): + print(f" {key}={value}") + else: + print(f"No .env file found at {ENV_FILE}") + print("Run with --discover to auto-detect cluster nodes.") + + if not args.recipe: + return 0 + print() + + if args.list: + list_recipes() + return 0 + + if not args.recipe: + parser.print_help() + return 1 + + # Load recipe + recipe_path = Path(args.recipe) + recipe = load_recipe(recipe_path) + + print(f"Recipe: {recipe['name']}") + if recipe.get("description"): + print(f" {recipe['description']}") + print() + + # Determine container image + container = args.container_override or recipe["container"] + model = recipe.get("model") + build_args = recipe.get("build_args", []) + + # Parse nodes - check command line first, then .env file, then autodiscover + nodes = parse_nodes(args.nodes) + nodes_from_env = False + + if not nodes and not args.solo: + # Try to load from .env file + env = load_env_file() + if env.get("CLUSTER_NODES"): + nodes = parse_nodes(env["CLUSTER_NODES"]) + nodes_from_env = True + if nodes: + print(f"Using cluster nodes from .env: {', '.join(nodes)}") + print() + else: + # No nodes specified and no .env - run autodiscover + print("No cluster nodes configured. Running autodiscover...") + print() + + discovered_env = run_autodiscover() + if discovered_env and discovered_env.get("CLUSTER_NODES"): + nodes = parse_nodes(discovered_env["CLUSTER_NODES"]) + nodes_from_env = True + + if nodes: + # Ask if user wants to save to .env + print() + response = input("Save this configuration to .env for future use? [Y/n]: ").strip().lower() + if response in ("", "y", "yes"): + save_env_file(discovered_env) + print() + + worker_nodes = get_worker_nodes(nodes) if nodes else [] + is_cluster = len(nodes) > 1 + + # Check if recipe requires cluster mode + cluster_only = recipe.get("cluster_only", False) + is_solo = args.solo or not is_cluster + + if cluster_only and is_solo: + print(f"Error: Recipe '{recipe['name']}' requires cluster mode.") + print(f"This model is too large to run on a single node.") + print() + print("Options:") + print(f" 1. Specify nodes directly: {sys.argv[0]} {args.recipe} -n node1,node2") + print(f" 2. 
Auto-discover and save: {sys.argv[0]} --discover") + print(f" Then run: {sys.argv[0]} {args.recipe}") + return 1 + + # Determine copy targets for cluster deployments + copy_targets = worker_nodes if is_cluster else None + + if args.dry_run: + print("=== Dry Run ===") + print(f"Container: {container}") + if build_args: + print(f"Build args: {' '.join(build_args)}") + if model: + print(f"Model: {model}") + if cluster_only: + print(f"Cluster only: Yes (model too large for single node)") + if nodes: + source = "(from .env)" if nodes_from_env else "" + print(f"Nodes: {', '.join(nodes)} {source}".strip()) + print(f" Head: {nodes[0]}") + if worker_nodes: + print(f" Workers: {', '.join(worker_nodes)}") + print(f"Solo mode: {is_solo}") + print() + + # --- Build Phase --- + if args.build_only or args.setup or args.force_build: + if args.dry_run: + image_exists = check_image_exists(container) + if args.force_build or not image_exists: + print(f"Would build container: {container}") + if copy_targets: + print(f" Would copy to: {', '.join(copy_targets)}") + else: + print(f"Container '{container}' already exists locally.") + if copy_targets: + print(f" Would check/copy to workers: {', '.join(copy_targets)}") + print() + else: + image_exists = check_image_exists(container) + + if args.force_build or not image_exists: + print("=== Building Container ===") + if not build_image(container, copy_targets, build_args): + print("Error: Failed to build container") + return 1 + print() + else: + print(f"Container '{container}' already exists locally.") + # Check worker nodes in cluster mode + if copy_targets: + missing_on = [] + for worker in copy_targets: + if not check_image_exists(container, worker): + missing_on.append(worker) + if missing_on: + print(f"Container missing on workers: {', '.join(missing_on)}") + print("Building and copying...") + if not build_image(container, missing_on, build_args): + print("Error: Failed to build/copy container") + return 1 + print() + + if args.build_only: + print("Build complete." if not args.dry_run else "") + return 0 + + # --- Download Phase --- + if model and (args.download_only or args.setup or args.force_download): + if args.dry_run: + model_exists = check_model_exists(model) + if args.force_download or not model_exists: + print(f"Would download model: {model}") + if copy_targets: + print(f" Would copy to: {', '.join(copy_targets)}") + else: + print(f"Model '{model}' already exists in cache.") + print() + else: + model_exists = check_model_exists(model) + + if args.force_download or not model_exists: + print("=== Downloading Model ===") + if not download_model(model, copy_targets): + print("Error: Failed to download model") + return 1 + print() + else: + print(f"Model '{model}' already exists in cache.") + print() + + if args.download_only: + print("Download complete." if not args.dry_run else "") + return 0 + + # --- Run Phase --- + if args.build_only or args.download_only: + return 0 + + # Check if image exists (if not using --setup) + if not args.dry_run and not args.setup and not check_image_exists(container): + print(f"Container image '{container}' not found locally.") + print() + print("Options:") + print(f" 1. Use --setup to build and run") + print(f" 2. Build manually: ./build-and-copy.sh -t {container}") + print() + response = input("Build now? 
[y/N] ").strip().lower() + if response == 'y': + if not build_image(container, copy_targets, build_args): + print("Error: Failed to build image") + return 1 + else: + print("Aborting.") + return 1 + + # Build overrides from CLI args + overrides = {} + for key in ["port", "host", "tensor_parallel", "gpu_memory_utilization", "max_model_len"]: + value = getattr(args, key, None) + if value is not None: + overrides[key] = value + + # In solo mode, default tensor_parallel to 1 (unless user explicitly set --tp) + if is_solo and "tensor_parallel" not in overrides: + overrides["tensor_parallel"] = 1 + + # Generate launch script + script_content = generate_launch_script(recipe, overrides, is_solo=is_solo) + + if args.dry_run: + print("=== Generated Launch Script ===") + print(script_content) + print("=== What would be executed ===") + print() + print("1. The above script is saved to a temporary file") + print() + print("2. launch-cluster.sh is called with:") + cmd_parts = [" ./launch-cluster.sh", "-t", container] + for mod in recipe.get("mods", []): + cmd_parts.extend(["--apply-mod", mod]) + if args.solo: + cmd_parts.append("--solo") + elif not is_cluster: + cmd_parts.append("--solo") + if args.daemon: + cmd_parts.append("-d") + if nodes: + cmd_parts.extend(["-n", ",".join(nodes)]) + if args.nccl_debug: + cmd_parts.extend(["--nccl-debug", args.nccl_debug]) + cmd_parts.extend(["\\", "\n --launch-script", "/tmp/tmpXXXXXX.sh"]) + print(" ".join(cmd_parts)) + print() + print("3. The launch script runs inside the container") + return 0 + + # Write temporary launch script + with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f: + f.write(script_content) + temp_script = f.name + + try: + os.chmod(temp_script, 0o755) + + # Build launch-cluster.sh command + cmd = [str(LAUNCH_SCRIPT), "-t", container] + + # Add mods + for mod in recipe.get("mods", []): + mod_path = SCRIPT_DIR / mod + if not mod_path.exists(): + print(f"Warning: Mod path not found: {mod_path}") + cmd.extend(["--apply-mod", str(mod_path)]) + + # Add launch options + if args.solo: + cmd.append("--solo") + elif not is_cluster: + # Auto-enable solo mode if no cluster nodes specified + cmd.append("--solo") + + if args.daemon: + cmd.append("-d") + + # Pass nodes to launch-cluster.sh (from command line, .env, or autodiscover) + if nodes: + cmd.extend(["-n", ",".join(nodes)]) + + if args.nccl_debug: + cmd.extend(["--nccl-debug", args.nccl_debug]) + + # Add launch script + cmd.extend(["--launch-script", temp_script]) + + print(f"=== Launching ===") + print(f"Container: {container}") + if recipe.get("mods"): + print(f"Mods: {', '.join(recipe['mods'])}") + if is_cluster: + print(f"Cluster: {len(nodes)} nodes") + else: + print("Mode: Solo") + print() + + # Execute + result = subprocess.run(cmd) + return result.returncode + + finally: + # Cleanup temp script + try: + os.unlink(temp_script) + except OSError: + pass + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/run-recipe.sh b/run-recipe.sh new file mode 100755 index 0000000..aeb82b2 --- /dev/null +++ b/run-recipe.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# +# run-recipe.sh - Wrapper for run-recipe.py +# +# Ensures Python dependencies are available and runs the recipe runner. +# + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +RECIPE_SCRIPT="$SCRIPT_DIR/run-recipe.py" + +# Check for Python 3.10+ +if command -v python3 &>/dev/null; then + PYTHON=python3 +elif command -v python &>/dev/null; then + PYTHON=python +else + echo "Error: Python 3 not found. 
Please install Python 3.10 or later." + exit 1 +fi + +# Verify version +PY_VERSION=$($PYTHON -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') +PY_MAJOR=$($PYTHON -c 'import sys; print(sys.version_info.major)') +PY_MINOR=$($PYTHON -c 'import sys; print(sys.version_info.minor)') + +if [[ "$PY_MAJOR" -lt 3 ]] || [[ "$PY_MAJOR" -eq 3 && "$PY_MINOR" -lt 10 ]]; then + echo "Error: Python 3.10+ required, found $PY_VERSION" + exit 1 +fi + +# Check for PyYAML and install if missing +if ! $PYTHON -c "import yaml" 2>/dev/null; then + echo "Installing PyYAML..." + $PYTHON -m pip install --quiet pyyaml + if [[ $? -ne 0 ]]; then + echo "Error: Failed to install PyYAML. Try: pip install pyyaml" + exit 1 + fi +fi + +# Run the recipe script +exec $PYTHON "$RECIPE_SCRIPT" "$@" From 28ba6090fcc86ce53583d87e55e6d88a4bf6afc6 Mon Sep 17 00:00:00 2001 From: Raphael Amorim Date: Tue, 3 Feb 2026 17:32:59 -0500 Subject: [PATCH 3/7] Adding suggestions from Eugr and unit tests --- .github/workflows/test-recipes.yml | 59 ++ {profiles => examples}/README.md | 8 +- .../example-vllm-minimax.sh | 0 {profiles => examples}/vllm-glm-4.7-nvfp4.sh | 0 .../vllm-openai-gpt-oss-120b.sh | 0 launch-cluster.sh | 29 +- run-recipe.py | 1 + tests/expected_commands.sh | 89 ++ tests/test_recipes.sh | 859 ++++++++++++++++++ 9 files changed, 1024 insertions(+), 21 deletions(-) create mode 100644 .github/workflows/test-recipes.yml rename {profiles => examples}/README.md (90%) rename {profiles => examples}/example-vllm-minimax.sh (100%) rename {profiles => examples}/vllm-glm-4.7-nvfp4.sh (100%) rename {profiles => examples}/vllm-openai-gpt-oss-120b.sh (100%) create mode 100644 tests/expected_commands.sh create mode 100755 tests/test_recipes.sh diff --git a/.github/workflows/test-recipes.yml b/.github/workflows/test-recipes.yml new file mode 100644 index 0000000..0a0a0e1 --- /dev/null +++ b/.github/workflows/test-recipes.yml @@ -0,0 +1,59 @@ +name: Recipe Tests + +on: + push: + branches: [ main, profiles ] + paths: + - 'run-recipe.py' + - 'run-recipe.sh' + - 'launch-cluster.sh' + - 'recipes/**' + - 'tests/**' + - '.github/workflows/test-recipes.yml' + pull_request: + branches: [ main, profiles ] + paths: + - 'run-recipe.py' + - 'run-recipe.sh' + - 'launch-cluster.sh' + - 'recipes/**' + - 'tests/**' + - '.github/workflows/test-recipes.yml' + +jobs: + test: + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: ['3.10', '3.11', '3.12'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pyyaml + + - name: Make scripts executable + run: | + chmod +x run-recipe.py run-recipe.sh launch-cluster.sh + chmod +x tests/test_recipes.sh + + - name: Run recipe integration tests + run: | + ./tests/test_recipes.sh -v + + - name: Verify all recipes with dry-run + run: | + for recipe in recipes/*.yaml; do + name=$(basename "$recipe" .yaml) + echo "Testing recipe: $name" + ./run-recipe.py "$name" --dry-run --solo || exit 1 + done diff --git a/profiles/README.md b/examples/README.md similarity index 90% rename from profiles/README.md rename to examples/README.md index 470e9f8..4d84af9 100644 --- a/profiles/README.md +++ b/examples/README.md @@ -1,6 +1,8 @@ -# Launch Scripts +# Example Launch Scripts -This directory contains bash scripts that can be executed in the container using the `--launch-script` 
option. Launch scripts are simple, executable bash files that run directly inside the container. +This directory contains example bash scripts that demonstrate how to use the `--launch-script` option directly with `launch-cluster.sh`. + +**Note:** For most use cases, the recipe system (`./run-recipe.sh`) is the recommended approach. These examples are provided for reference and for advanced users who need direct control over the launch process. ## Why Launch Scripts? @@ -12,7 +14,7 @@ This directory contains bash scripts that can be executed in the container using ## Usage ```bash -# Use a launch script by name (looks in profiles/ directory) +# Use a launch script by name (looks in examples/ directory) ./launch-cluster.sh --launch-script example-vllm-minimax # Use a launch script by filename diff --git a/profiles/example-vllm-minimax.sh b/examples/example-vllm-minimax.sh similarity index 100% rename from profiles/example-vllm-minimax.sh rename to examples/example-vllm-minimax.sh diff --git a/profiles/vllm-glm-4.7-nvfp4.sh b/examples/vllm-glm-4.7-nvfp4.sh similarity index 100% rename from profiles/vllm-glm-4.7-nvfp4.sh rename to examples/vllm-glm-4.7-nvfp4.sh diff --git a/profiles/vllm-openai-gpt-oss-120b.sh b/examples/vllm-openai-gpt-oss-120b.sh similarity index 100% rename from profiles/vllm-openai-gpt-oss-120b.sh rename to examples/vllm-openai-gpt-oss-120b.sh diff --git a/launch-cluster.sh b/launch-cluster.sh index 2851701..a49cbe1 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -43,7 +43,7 @@ usage() { echo " -e, --env Environment variable to pass to container (e.g. -e VAR=val)" echo " --nccl-debug NCCL debug level (Optional, one of: VERSION, WARN, INFO, TRACE). If no level is provided, defaults to INFO." echo " --apply-mod Path to directory or zip file containing run.sh to apply before launch (Can be specified multiple times)" - echo " --launch-script Path to bash script to execute in the container (from profiles/ directory or absolute path)" + echo " --launch-script Path to bash script to execute in the container (from examples/ directory or absolute path)" echo " --check-config Check configuration and auto-detection without launching" echo " --solo Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster" echo " -d Daemon mode (only for 'start' action)" @@ -51,7 +51,7 @@ usage() { echo " command Command to run (only for 'exec' action)" echo "" echo "Launch Script Usage:" - echo " $0 --launch-script profiles/my-script.sh # Script copied to container and executed" + echo " $0 --launch-script examples/my-script.sh # Script copied to container and executed" echo " $0 --launch-script /path/to/script.sh # Uses absolute path to script" exit 1 } @@ -120,18 +120,18 @@ if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then # Check if it's an absolute path or relative path that exists if [[ -f "$LAUNCH_SCRIPT_PATH" ]]; then LAUNCH_SCRIPT_PATH=$(realpath "$LAUNCH_SCRIPT_PATH") - # Check if it's just a filename, look in profiles/ directory - elif [[ -f "$SCRIPT_DIR/profiles/$LAUNCH_SCRIPT_PATH" ]]; then - LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/profiles/$LAUNCH_SCRIPT_PATH" + # Check if it's just a filename, look in examples/ directory + elif [[ -f "$SCRIPT_DIR/examples/$LAUNCH_SCRIPT_PATH" ]]; then + LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/examples/$LAUNCH_SCRIPT_PATH" # Check if it's a name without .sh extension - elif [[ -f "$SCRIPT_DIR/profiles/${LAUNCH_SCRIPT_PATH}.sh" ]]; then - LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/profiles/${LAUNCH_SCRIPT_PATH}.sh" + elif [[ -f 
"$SCRIPT_DIR/examples/${LAUNCH_SCRIPT_PATH}.sh" ]]; then + LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/examples/${LAUNCH_SCRIPT_PATH}.sh" else echo "Error: Launch script '$LAUNCH_SCRIPT_PATH' not found." echo "Searched in:" echo " - $LAUNCH_SCRIPT_PATH" - echo " - $SCRIPT_DIR/profiles/$LAUNCH_SCRIPT_PATH" - echo " - $SCRIPT_DIR/profiles/${LAUNCH_SCRIPT_PATH}.sh" + echo " - $SCRIPT_DIR/examples/$LAUNCH_SCRIPT_PATH" + echo " - $SCRIPT_DIR/examples/${LAUNCH_SCRIPT_PATH}.sh" exit 1 fi @@ -578,17 +578,10 @@ start_cluster() { done fi - # Copy launch script if specified + # Copy launch script to head node only (workers don't need it - they just run Ray) if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then - echo "Copying launch script to cluster nodes..." - - # Copy to Head + echo "Copying launch script to head node..." copy_launch_script_to_container "$HEAD_IP" "$CONTAINER_NAME" "true" "$LAUNCH_SCRIPT_PATH" - - # Copy to Workers - for worker in "${PEER_NODES[@]}"; do - copy_launch_script_to_container "$worker" "$CONTAINER_NAME" "false" "$LAUNCH_SCRIPT_PATH" - done fi if [[ "$SOLO_MODE" == "false" ]]; then diff --git a/run-recipe.py b/run-recipe.py index 16038ee..43b4f2a 100755 --- a/run-recipe.py +++ b/run-recipe.py @@ -75,6 +75,7 @@ RECIPE VERSION HISTORY: RELATED FILES: - run-recipe.sh: Bash wrapper that ensures Python deps are installed - recipes/*.yaml: Recipe definitions + - examples/: Example launch scripts for direct use with launch-cluster.sh - launch-cluster.sh: Low-level container orchestration - build-and-copy.sh: Docker build and distribution - hf-download.sh: HuggingFace model download and sync diff --git a/tests/expected_commands.sh b/tests/expected_commands.sh new file mode 100644 index 0000000..7ee60f1 --- /dev/null +++ b/tests/expected_commands.sh @@ -0,0 +1,89 @@ +# Expected vLLM serve arguments for each recipe +# This file is used by test_recipes.sh to verify recipes match README documentation +# +# Format: Each recipe has a section with expected arguments +# Tests will verify these arguments appear in the dry-run output +# +# IMPORTANT: Keep this in sync with README.md documentation +# When updating recipes, update both README.md and this file + +# ============================================================================== +# glm-4.7-flash-awq +# README Reference: Lines 186-198 (solo) and 203-218 (cluster) +# ============================================================================== +GLM_FLASH_AWQ_MODEL="cyankiwi/GLM-4.7-Flash-AWQ-4bit" +GLM_FLASH_AWQ_CONTAINER="vllm-node-tf5" +GLM_FLASH_AWQ_MOD="mods/fix-glm-4.7-flash-AWQ" +GLM_FLASH_AWQ_ARGS=( + "--tool-call-parser glm47" + "--reasoning-parser glm45" + "--enable-auto-tool-choice" + "--served-model-name glm-4.7-flash" + "--max-model-len 202752" + "--max-num-batched-tokens 4096" + "--max-num-seqs 64" + "--gpu-memory-utilization 0.7" + "--port 8888" + "--host 0.0.0.0" +) + +# ============================================================================== +# openai-gpt-oss-120b +# README Reference: Lines 244-257 (solo) and 264-280 (cluster) +# ============================================================================== +GPT_OSS_MODEL="openai/gpt-oss-120b" +GPT_OSS_CONTAINER="vllm-node-mxfp4" +GPT_OSS_ARGS=( + "--port 8888" + "--host 0.0.0.0" + "--enable-auto-tool-choice" + "--tool-call-parser openai" + "--reasoning-parser openai_gptoss" + "--gpu-memory-utilization 0.7" + "--enable-prefix-caching" + "--load-format fastsafetensors" + "--quantization mxfp4" + "--mxfp4-backend CUTLASS" + "--mxfp4-layers moe,qkv,o,lm_head" + "--attention-backend 
FLASHINFER" + "--kv-cache-dtype fp8" + "--max-num-batched-tokens 8192" +) + +# ============================================================================== +# minimax-m2-awq +# README Reference: Not explicitly documented, but based on model requirements +# ============================================================================== +MINIMAX_MODEL="QuantTrio/MiniMax-M2-AWQ" +MINIMAX_CONTAINER="vllm-node" +MINIMAX_ARGS=( + "--port 8000" + "--host 0.0.0.0" + "--gpu-memory-utilization 0.7" + "--max-model-len 128000" + "--load-format fastsafetensors" + "--enable-auto-tool-choice" + "--tool-call-parser minimax_m2" + "--reasoning-parser minimax_m2_append_think" +) + +# ============================================================================== +# Cluster Mode Expected Arguments +# These are arguments that should appear ONLY in cluster mode +# Note: Tests use 2 nodes, so tensor_parallel = 2 (1 GPU per node) +# ============================================================================== + +# glm-4.7-flash-awq cluster mode (no distributed backend - single GPU model) +GLM_FLASH_AWQ_CLUSTER_TP="1" + +# openai-gpt-oss-120b cluster mode (2 nodes = tp 2) +GPT_OSS_CLUSTER_TP="2" +GPT_OSS_CLUSTER_ARGS=( + "--distributed-executor-backend ray" +) + +# minimax-m2-awq cluster mode (2 nodes = tp 2) +MINIMAX_CLUSTER_TP="2" +MINIMAX_CLUSTER_ARGS=( + "--distributed-executor-backend ray" +) diff --git a/tests/test_recipes.sh b/tests/test_recipes.sh new file mode 100755 index 0000000..6e44e26 --- /dev/null +++ b/tests/test_recipes.sh @@ -0,0 +1,859 @@ +#!/bin/bash +# +# test_recipes.sh - Integration tests for run-recipe.py and launch-cluster.sh +# +# These tests use --dry-run mode to verify compatibility without actually +# running containers. Suitable for CI/CD pipelines. +# +# Usage: +# ./tests/test_recipes.sh # Run all tests +# ./tests/test_recipes.sh -v # Verbose output +# + +set -e + +SCRIPT_DIR="$(dirname "$(realpath "$0")")" +PROJECT_DIR="$(dirname "$SCRIPT_DIR")" +VERBOSE="${1:-}" + +# Load expected commands for README verification +source "$SCRIPT_DIR/expected_commands.sh" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Test counters +TESTS_PASSED=0 +TESTS_FAILED=0 +TESTS_SKIPPED=0 + +# Helper functions +log_test() { + echo -e "${YELLOW}[TEST]${NC} $1" +} + +log_pass() { + echo -e "${GREEN}[PASS]${NC} $1" + TESTS_PASSED=$((TESTS_PASSED + 1)) +} + +log_fail() { + echo -e "${RED}[FAIL]${NC} $1" + TESTS_FAILED=$((TESTS_FAILED + 1)) +} + +log_skip() { + echo -e "${YELLOW}[SKIP]${NC} $1" + TESTS_SKIPPED=$((TESTS_SKIPPED + 1)) +} + +log_verbose() { + if [[ "$VERBOSE" == "-v" ]]; then + echo " $1" + fi +} + +# Check prerequisites +check_prerequisites() { + log_test "Checking prerequisites..." + + if ! command -v python3 &> /dev/null; then + log_fail "python3 not found" + exit 1 + fi + + # Check Python version + python_version=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') + if [[ $(echo "$python_version < 3.10" | bc -l) -eq 1 ]]; then + log_fail "Python 3.10+ required, found $python_version" + exit 1 + fi + + # Check PyYAML + if ! 
python3 -c "import yaml" 2>/dev/null; then + log_fail "PyYAML not installed" + exit 1 + fi + + log_pass "Prerequisites OK (Python $python_version with PyYAML)" +} + +# Test: run-recipe.py exists and is executable +test_run_recipe_exists() { + log_test "run-recipe.py exists and is executable" + + if [[ -x "$PROJECT_DIR/run-recipe.py" ]]; then + log_pass "run-recipe.py is executable" + else + log_fail "run-recipe.py not found or not executable" + fi +} + +# Test: launch-cluster.sh exists and is executable +test_launch_cluster_exists() { + log_test "launch-cluster.sh exists and is executable" + + if [[ -x "$PROJECT_DIR/launch-cluster.sh" ]]; then + log_pass "launch-cluster.sh is executable" + else + log_fail "launch-cluster.sh not found or not executable" + fi +} + +# Test: run-recipe.py --list works +test_list_recipes() { + log_test "run-recipe.py --list" + + output=$("$PROJECT_DIR/run-recipe.py" --list 2>&1) + + if [[ $? -eq 0 ]] && echo "$output" | grep -q "Available recipes"; then + log_pass "--list shows available recipes" + log_verbose "Found recipes in output" + else + log_fail "--list failed or no recipes found" + log_verbose "$output" + fi +} + +# Test: All recipes have required recipe_version field +test_recipe_version_required() { + log_test "All recipes have required recipe_version field" + + local all_valid=true + for recipe in "$PROJECT_DIR/recipes/"*.yaml; do + if [[ -f "$recipe" ]]; then + recipe_name=$(basename "$recipe") + if ! grep -q "^recipe_version:" "$recipe"; then + log_verbose "$recipe_name missing recipe_version" + all_valid=false + fi + fi + done + + if [[ "$all_valid" == "true" ]]; then + log_pass "All recipes have recipe_version field" + else + log_fail "Some recipes missing recipe_version field" + fi +} + +# Test: All recipes load without errors +test_all_recipes_load() { + log_test "All recipes load without errors" + + local all_valid=true + for recipe in "$PROJECT_DIR/recipes/"*.yaml; do + if [[ -f "$recipe" ]]; then + recipe_name=$(basename "$recipe" .yaml) + # Try to load recipe with --dry-run (will fail early if recipe is invalid) + if ! 
"$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1 | grep -q "Error:"; then + log_verbose "$recipe_name loads OK" + else + log_verbose "$recipe_name failed to load" + all_valid=false + fi + fi + done + + if [[ "$all_valid" == "true" ]]; then + log_pass "All recipes load successfully" + else + log_fail "Some recipes failed to load" + fi +} + +# Test: Dry-run generates valid launch script +test_dry_run_generates_script() { + log_test "Dry-run generates valid launch script" + + # Find first available recipe + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1) + + if echo "$output" | grep -q "#!/bin/bash" && echo "$output" | grep -q "vllm serve"; then + log_pass "Dry-run generates bash script with vllm serve command" + else + log_fail "Dry-run output doesn't contain expected content" + log_verbose "$output" + fi +} + +# Test: Solo mode sets tensor_parallel=1 +test_solo_mode_tp1() { + log_test "Solo mode sets tensor_parallel=1" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1) + + # Check that -tp 1 is in the output (solo mode should set tp=1) + if echo "$output" | grep -q "\-tp 1"; then + log_pass "Solo mode correctly sets -tp 1" + else + log_fail "Solo mode did not set -tp 1" + log_verbose "$output" + fi +} + +# Test: Solo mode removes --distributed-executor-backend ray +test_solo_mode_removes_ray() { + log_test "Solo mode removes --distributed-executor-backend ray" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1) + + # Check that --distributed-executor-backend is NOT in the output + if ! echo "$output" | grep -q "\-\-distributed-executor-backend"; then + log_pass "Solo mode correctly removes --distributed-executor-backend" + else + log_fail "Solo mode did not remove --distributed-executor-backend" + log_verbose "$output" + fi +} + +# Test: Cluster mode preserves --distributed-executor-backend ray +test_cluster_mode_keeps_ray() { + log_test "Cluster mode preserves --distributed-executor-backend ray" + + # Use minimax-m2-awq which explicitly has --distributed-executor-backend ray + if [[ ! 
-f "$PROJECT_DIR/recipes/minimax-m2-awq.yaml" ]]; then + log_skip "minimax-m2-awq.yaml not found" + return + fi + + output=$("$PROJECT_DIR/run-recipe.py" minimax-m2-awq --dry-run -n "192.168.1.1,192.168.1.2" 2>&1) + + # Check that --distributed-executor-backend IS in the output for cluster mode + if echo "$output" | grep -q "\-\-distributed-executor-backend ray"; then + log_pass "Cluster mode correctly preserves --distributed-executor-backend ray" + else + log_fail "Cluster mode did not preserve --distributed-executor-backend" + log_verbose "$output" + fi +} + +# Test: CLI overrides work (--port) +test_cli_override_port() { + log_test "CLI override --port works" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --port 9999 2>&1) + + if echo "$output" | grep -q "\-\-port 9999"; then + log_pass "--port override correctly applied" + else + log_fail "--port override not found in output" + log_verbose "$output" + fi +} + +# Test: launch-cluster.sh --help works +test_launch_cluster_help() { + log_test "launch-cluster.sh --help" + + output=$("$PROJECT_DIR/launch-cluster.sh" --help 2>&1 || true) + + if echo "$output" | grep -q "Usage:"; then + log_pass "--help shows usage information" + else + log_fail "--help did not show usage" + log_verbose "$output" + fi +} + +# Test: launch-cluster.sh references examples/ not profiles/ +test_launch_cluster_examples_path() { + log_test "launch-cluster.sh references examples/ directory" + + if grep -q "examples/" "$PROJECT_DIR/launch-cluster.sh"; then + log_pass "launch-cluster.sh references examples/" + else + log_fail "launch-cluster.sh does not reference examples/" + fi + + if grep -q "profiles/" "$PROJECT_DIR/launch-cluster.sh"; then + log_fail "launch-cluster.sh still references profiles/" + fi +} + +# Test: Unsupported recipe version shows warning +test_unsupported_recipe_version() { + log_test "Unsupported recipe_version shows warning" + + # Create a temporary recipe with unsupported version + temp_recipe=$(mktemp) + cat > "$temp_recipe" << 'EOF' +recipe_version: "999" +name: Test Recipe +container: test-container +command: echo "test" +EOF + + output=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run --solo 2>&1) + rm -f "$temp_recipe" + + if echo "$output" | grep -q "Warning.*schema version"; then + log_pass "Unsupported recipe_version shows warning" + else + log_fail "No warning for unsupported recipe_version" + log_verbose "$output" + fi +} + +# Test: Missing recipe_version fails +test_missing_recipe_version_fails() { + log_test "Missing recipe_version field fails" + + # Create a temporary recipe without recipe_version + temp_recipe=$(mktemp) + cat > "$temp_recipe" << 'EOF' +name: Test Recipe +container: test-container +command: echo "test" +EOF + + output=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run --solo 2>&1 || true) + rm -f "$temp_recipe" + + if echo "$output" | grep -q "Error.*recipe_version"; then + log_pass "Missing recipe_version correctly fails" + else + log_fail "Missing recipe_version did not fail as expected" + log_verbose "$output" + fi +} + +# Test: cluster_only recipe fails in solo mode +test_cluster_only_fails_solo() { + log_test "cluster_only recipe fails in solo mode" + + # Create a temporary cluster_only recipe + temp_recipe=$(mktemp) + cat > "$temp_recipe" << 'EOF' +recipe_version: "1" 
+name: Cluster Only Test +container: test-container +cluster_only: true +command: echo "test" +EOF + + output=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run --solo 2>&1 || true) + exit_code=$? + rm -f "$temp_recipe" + + if echo "$output" | grep -q "requires cluster mode"; then + log_pass "cluster_only recipe correctly fails in solo mode" + else + log_fail "cluster_only recipe did not fail in solo mode" + log_verbose "$output" + fi +} + +# ============================================================================== +# Launch-cluster.sh Command Line Verification Tests +# ============================================================================== +# These tests verify that the dry-run output contains the expected +# launch-cluster.sh command line arguments matching the recipe configuration. + +# Helper: Extract launch-cluster command from dry-run output +extract_launch_cmd() { + echo "$1" | grep -A5 "launch-cluster.sh is called with:" | grep -v "launch-cluster.sh is called with:" | tr '\n' ' ' +} + +# Test: Solo mode generates --solo flag in launch-cluster command +test_launch_cmd_solo_flag() { + log_test "Launch command includes --solo flag in solo mode" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "\-\-solo"; then + log_pass "Launch command includes --solo flag" + else + log_fail "Launch command missing --solo flag" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: Cluster mode generates -n flag with nodes +test_launch_cmd_nodes_flag() { + log_test "Launch command includes -n flag with nodes in cluster mode" + + output=$("$PROJECT_DIR/run-recipe.py" minimax-m2-awq --dry-run -n "10.0.0.1,10.0.0.2" 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "\-n 10.0.0.1,10.0.0.2"; then + log_pass "Launch command includes -n with correct nodes" + else + log_fail "Launch command missing or incorrect -n flag" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: Container image from recipe is passed to launch-cluster +test_launch_cmd_container_image() { + log_test "Launch command includes correct container image (-t)" + + # Use openai-gpt-oss-120b which has a specific container name + if [[ ! -f "$PROJECT_DIR/recipes/openai-gpt-oss-120b.yaml" ]]; then + log_skip "openai-gpt-oss-120b.yaml not found" + return + fi + + output=$("$PROJECT_DIR/run-recipe.py" openai-gpt-oss-120b --dry-run --solo 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + # Check the container is vllm-node-mxfp4 (from the recipe) + if echo "$launch_cmd" | grep -q "\-t vllm-node-mxfp4"; then + log_pass "Launch command includes correct container image" + else + log_fail "Launch command has wrong container image" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: Mods from recipe are passed as --apply-mod +test_launch_cmd_mods() { + log_test "Launch command includes --apply-mod for recipe mods" + + # Use glm-4.7-flash-awq which has a mod + if [[ ! 
-f "$PROJECT_DIR/recipes/glm-4.7-flash-awq.yaml" ]]; then + log_skip "glm-4.7-flash-awq.yaml not found" + return + fi + + output=$("$PROJECT_DIR/run-recipe.py" glm-4.7-flash-awq --dry-run --solo 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "\-\-apply-mod"; then + log_pass "Launch command includes --apply-mod for mods" + else + log_fail "Launch command missing --apply-mod" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: Daemon mode flag is passed through +test_launch_cmd_daemon_flag() { + log_test "Launch command includes -d flag in daemon mode" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -d 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "\-d"; then + log_pass "Launch command includes -d flag" + else + log_fail "Launch command missing -d flag" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: NCCL debug level is passed through +test_launch_cmd_nccl_debug() { + log_test "Launch command includes --nccl-debug when specified" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --nccl-debug INFO 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "\-\-nccl-debug INFO"; then + log_pass "Launch command includes --nccl-debug INFO" + else + log_fail "Launch command missing --nccl-debug" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: --launch-script is always included +test_launch_cmd_launch_script() { + log_test "Launch command includes --launch-script" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "\-\-launch-script"; then + log_pass "Launch command includes --launch-script" + else + log_fail "Launch command missing --launch-script" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: Container override (-t CLI) takes precedence +test_launch_cmd_container_override() { + log_test "CLI container override (-t) takes precedence" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -t my-custom-image 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "\-t my-custom-image"; then + log_pass "Container override correctly applied" + else + log_fail "Container override not applied" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: Cluster mode does NOT include --solo flag +test_launch_cmd_no_solo_in_cluster() { + log_test "Launch command does NOT include --solo in cluster mode" + + output=$("$PROJECT_DIR/run-recipe.py" minimax-m2-awq --dry-run -n "10.0.0.1,10.0.0.2" 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo 
"$launch_cmd" | grep -qv "\-\-solo" || ! echo "$launch_cmd" | grep -q "\-\-solo"; then + log_pass "Cluster mode correctly omits --solo flag" + else + log_fail "Cluster mode incorrectly includes --solo flag" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# ============================================================================== +# README Documentation Verification Tests +# ============================================================================== +# These tests verify that recipe dry-run output matches the expected commands +# documented in README.md. Expected values are defined in expected_commands.sh + +# Helper: Extract the generated launch script from dry-run output +extract_vllm_command() { + # Extract lines between "Generated Launch Script" and "What would be executed" + echo "$1" | sed -n '/=== Generated Launch Script ===/,/=== What would be executed ===/p' | grep -v "===" | grep -v "^#" | grep -v "^$" +} + +# Helper: Verify a recipe contains all expected arguments +verify_recipe_args() { + local recipe_name="$1" + local expected_model="$2" + local expected_container="$3" + shift 3 + local expected_args=("$@") + + log_test "README match: $recipe_name" + + if [[ ! -f "$PROJECT_DIR/recipes/${recipe_name}.yaml" ]]; then + log_skip "${recipe_name}.yaml not found" + return + fi + + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1) + vllm_cmd=$(extract_vllm_command "$output") + launch_cmd=$(extract_launch_cmd "$output") + + local all_passed=true + local missing_args=() + + # Check model name + if ! echo "$vllm_cmd" | grep -q "$expected_model"; then + missing_args+=("model: $expected_model") + all_passed=false + fi + + # Check container + if ! echo "$launch_cmd" | grep -q "\-t $expected_container"; then + missing_args+=("container: $expected_container") + all_passed=false + fi + + # Check each expected argument + for arg in "${expected_args[@]}"; do + # Handle arguments that may have slight formatting differences + # Extract the flag and value separately for flexible matching + local flag=$(echo "$arg" | awk '{print $1}') + local value=$(echo "$arg" | cut -d' ' -f2-) + + # Use grep -F for fixed string matching (avoids -- being treated as grep options) + if ! echo "$vllm_cmd" | grep -qF -- "$flag"; then + missing_args+=("$arg") + all_passed=false + elif [[ -n "$value" ]] && [[ "$value" != "$flag" ]]; then + # Check if value is present (might be on next line due to formatting) + if ! 
echo "$vllm_cmd" | grep -qF -- "$value"; then + missing_args+=("$arg (flag present, value mismatch)") + all_passed=false + fi + fi + done + + if [[ "$all_passed" == "true" ]]; then + log_pass "README match: $recipe_name - all expected arguments present" + else + log_fail "README match: $recipe_name - missing arguments" + for missing in "${missing_args[@]}"; do + log_verbose " Missing: $missing" + done + log_verbose " vLLM command: $vllm_cmd" + fi +} + +# Test: glm-4.7-flash-awq matches README documentation +test_readme_glm_flash_awq() { + verify_recipe_args "glm-4.7-flash-awq" \ + "$GLM_FLASH_AWQ_MODEL" \ + "$GLM_FLASH_AWQ_CONTAINER" \ + "${GLM_FLASH_AWQ_ARGS[@]}" +} + +# Test: openai-gpt-oss-120b matches README documentation +test_readme_gpt_oss() { + verify_recipe_args "openai-gpt-oss-120b" \ + "$GPT_OSS_MODEL" \ + "$GPT_OSS_CONTAINER" \ + "${GPT_OSS_ARGS[@]}" +} + +# Test: minimax-m2-awq matches expected configuration +test_readme_minimax() { + verify_recipe_args "minimax-m2-awq" \ + "$MINIMAX_MODEL" \ + "$MINIMAX_CONTAINER" \ + "${MINIMAX_ARGS[@]}" +} + +# Test: glm-4.7-flash-awq includes correct mod +test_readme_glm_flash_mod() { + log_test "README match: glm-4.7-flash-awq mod path" + + if [[ ! -f "$PROJECT_DIR/recipes/glm-4.7-flash-awq.yaml" ]]; then + log_skip "glm-4.7-flash-awq.yaml not found" + return + fi + + output=$("$PROJECT_DIR/run-recipe.py" glm-4.7-flash-awq --dry-run --solo 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "$GLM_FLASH_AWQ_MOD"; then + log_pass "README match: glm-4.7-flash-awq has correct mod path" + else + log_fail "README match: glm-4.7-flash-awq missing expected mod: $GLM_FLASH_AWQ_MOD" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Helper: Verify cluster mode specific arguments +verify_cluster_args() { + local recipe_name="$1" + local expected_tp="$2" + shift 2 + local expected_args=("$@") + + log_test "README match (cluster): $recipe_name" + + if [[ ! -f "$PROJECT_DIR/recipes/${recipe_name}.yaml" ]]; then + log_skip "${recipe_name}.yaml not found" + return + fi + + # Use fake nodes for cluster mode + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run -n "10.0.0.1,10.0.0.2" 2>&1) + vllm_cmd=$(extract_vllm_command "$output") + + local all_passed=true + local missing_args=() + + # Check tensor parallel + if ! echo "$vllm_cmd" | grep -qE "(--tensor-parallel-size|-tp) $expected_tp"; then + missing_args+=("tensor_parallel: $expected_tp") + all_passed=false + fi + + # Check cluster-specific arguments + for arg in "${expected_args[@]}"; do + if ! 
echo "$vllm_cmd" | grep -qF -- "$arg"; then + missing_args+=("$arg") + all_passed=false + fi + done + + if [[ "$all_passed" == "true" ]]; then + log_pass "README match (cluster): $recipe_name - cluster args correct" + else + log_fail "README match (cluster): $recipe_name - missing cluster arguments" + for missing in "${missing_args[@]}"; do + log_verbose " Missing: $missing" + done + log_verbose " vLLM command: $vllm_cmd" + fi +} + +# Test: openai-gpt-oss-120b cluster mode has correct tensor_parallel and ray backend +test_readme_gpt_oss_cluster() { + verify_cluster_args "openai-gpt-oss-120b" \ + "$GPT_OSS_CLUSTER_TP" \ + "${GPT_OSS_CLUSTER_ARGS[@]}" +} + +# Test: minimax-m2-awq cluster mode has correct tensor_parallel and ray backend +test_readme_minimax_cluster() { + verify_cluster_args "minimax-m2-awq" \ + "$MINIMAX_CLUSTER_TP" \ + "${MINIMAX_CLUSTER_ARGS[@]}" +} + +# Test: glm-4.7-flash-awq cluster mode stays at tp=1 (single GPU model) +test_readme_glm_flash_cluster() { + log_test "README match (cluster): glm-4.7-flash-awq stays tp=1" + + if [[ ! -f "$PROJECT_DIR/recipes/glm-4.7-flash-awq.yaml" ]]; then + log_skip "glm-4.7-flash-awq.yaml not found" + return + fi + + # Even in cluster mode, this model uses tp=1 + output=$("$PROJECT_DIR/run-recipe.py" glm-4.7-flash-awq --dry-run -n "10.0.0.1,10.0.0.2" 2>&1) + vllm_cmd=$(extract_vllm_command "$output") + + if echo "$vllm_cmd" | grep -qE "(--tensor-parallel-size|-tp) 1"; then + log_pass "README match (cluster): glm-4.7-flash-awq correctly keeps tp=1" + else + log_fail "README match (cluster): glm-4.7-flash-awq should have tp=1" + log_verbose " vLLM command: $vllm_cmd" + fi +} + +# Run all tests +main() { + echo "==============================================" + echo " run-recipe.py Integration Tests" + echo "==============================================" + echo "" + + cd "$PROJECT_DIR" + + check_prerequisites + echo "" + + # File existence tests + test_run_recipe_exists + test_launch_cluster_exists + echo "" + + # Basic functionality tests + test_list_recipes + test_recipe_version_required + test_all_recipes_load + echo "" + + # Dry-run tests + test_dry_run_generates_script + test_solo_mode_tp1 + test_solo_mode_removes_ray + test_cluster_mode_keeps_ray + test_cli_override_port + echo "" + + # launch-cluster.sh command line verification tests + echo "--- Launch Command Verification ---" + test_launch_cmd_solo_flag + test_launch_cmd_nodes_flag + test_launch_cmd_container_image + test_launch_cmd_mods + test_launch_cmd_daemon_flag + test_launch_cmd_nccl_debug + test_launch_cmd_launch_script + test_launch_cmd_container_override + test_launch_cmd_no_solo_in_cluster + echo "" + + # README documentation verification tests + echo "--- README Documentation Verification (Solo Mode) ---" + test_readme_glm_flash_awq + test_readme_gpt_oss + test_readme_minimax + test_readme_glm_flash_mod + echo "" + + # Cluster mode documentation verification tests + echo "--- README Documentation Verification (Cluster Mode) ---" + test_readme_gpt_oss_cluster + test_readme_minimax_cluster + test_readme_glm_flash_cluster + echo "" + + # launch-cluster.sh tests + test_launch_cluster_help + test_launch_cluster_examples_path + echo "" + + # Validation tests + test_unsupported_recipe_version + test_missing_recipe_version_fails + test_cluster_only_fails_solo + echo "" + + # Summary + echo "==============================================" + echo " Test Summary" + echo "==============================================" + echo -e " ${GREEN}Passed:${NC} $TESTS_PASSED" + echo -e " 
${RED}Failed:${NC} $TESTS_FAILED" + echo -e " ${YELLOW}Skipped:${NC} $TESTS_SKIPPED" + echo "==============================================" + + if [[ $TESTS_FAILED -gt 0 ]]; then + exit 1 + fi + exit 0 +} + +main "$@" From b1516f688a0adc5c2c3d6f1034fc5594e0aab5ae Mon Sep 17 00:00:00 2001 From: Raphael Amorim Date: Tue, 3 Feb 2026 17:35:33 -0500 Subject: [PATCH 4/7] fix: Allow PR tests from any branch and add manual trigger --- .github/workflows/test-recipes.yml | 2 +- run-recipe.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-recipes.yml b/.github/workflows/test-recipes.yml index 0a0a0e1..23863de 100644 --- a/.github/workflows/test-recipes.yml +++ b/.github/workflows/test-recipes.yml @@ -11,7 +11,6 @@ on: - 'tests/**' - '.github/workflows/test-recipes.yml' pull_request: - branches: [ main, profiles ] paths: - 'run-recipe.py' - 'run-recipe.sh' @@ -19,6 +18,7 @@ on: - 'recipes/**' - 'tests/**' - '.github/workflows/test-recipes.yml' + workflow_dispatch: jobs: test: diff --git a/run-recipe.py b/run-recipe.py index 43b4f2a..dd0fcaf 100755 --- a/run-recipe.py +++ b/run-recipe.py @@ -21,7 +21,7 @@ ARCHITECTURE OVERVIEW (for developers extending this script) ================================================================================ DEPLOYMENT PIPELINE: - ┌─────────────────────────────────────────────────────────────────────────┐ + ┌─────────────────────────────────────────────────────────────────────────────┐ │ CLI Args → Load Recipe → Resolve Nodes → Build → Download → Run │ └─────────────────────────────────────────────────────────────────────────┘ From f7830636af628d540bdfd0c7225a0cd75c29b0b1 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Wed, 4 Feb 2026 11:36:55 -0800 Subject: [PATCH 5/7] Cleaning up launch-cluster changes --- launch-cluster.sh | 59 ++++++++++++++++++----------------------------- 1 file changed, 22 insertions(+), 37 deletions(-) diff --git a/launch-cluster.sh b/launch-cluster.sh index a49cbe1..0b71250 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -43,12 +43,12 @@ usage() { echo " -e, --env Environment variable to pass to container (e.g. -e VAR=val)" echo " --nccl-debug NCCL debug level (Optional, one of: VERSION, WARN, INFO, TRACE). If no level is provided, defaults to INFO." echo " --apply-mod Path to directory or zip file containing run.sh to apply before launch (Can be specified multiple times)" - echo " --launch-script Path to bash script to execute in the container (from examples/ directory or absolute path)" + echo " --launch-script Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted." echo " --check-config Check configuration and auto-detection without launching" echo " --solo Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster" echo " -d Daemon mode (only for 'start' action)" - echo " action start | stop | status | exec (Default: start)" - echo " command Command to run (only for 'exec' action)" + echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script." + echo " command Command to run (only for 'exec' action). Not compatible with --launch-script." 
echo "" echo "Launch Script Usage:" echo " $0 --launch-script examples/my-script.sh # Script copied to container and executed" @@ -80,9 +80,17 @@ while [[ "$#" -gt 0 ]]; do -d) DAEMON_MODE="true" ;; -h|--help) usage ;; start|stop|status) + if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then + echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script." + exit 1 + fi ACTION="$1" ;; exec) + if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then + echo "Error: Action 'exec' is not compatible with --launch-script. Please omit the action or not use --launch-script." + exit 1 + fi ACTION="exec" shift COMMAND_TO_RUN="$@" @@ -93,6 +101,10 @@ while [[ "$#" -gt 0 ]]; do # unless it's the default 'start' implied. # However, to support "omitted" = start, we need to be careful. # If the arg looks like a command, it's exec. + if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then + echo "Error: Command is not compatible with --launch-script. Please omit the command or not use --launch-script." + exit 1 + fi ACTION="exec" COMMAND_TO_RUN="$@" break @@ -467,47 +479,21 @@ apply_mod_to_container() { # Copy Launch Script to Container Function copy_launch_script_to_container() { - local node_ip="$1" - local container="$2" - local is_local="$3" # true/false - local script_path="$4" + local container="$1" + local script_path="$2" - echo "Copying launch script to $node_ip..." - - # Command prefix for remote vs local - local cmd_prefix="" - if [[ "$is_local" == "false" ]]; then - cmd_prefix="ssh -o BatchMode=yes -o StrictHostKeyChecking=no $node_ip" - fi + echo "Copying launch script to head node..." local target_script_path="$script_path" - local remote_cleanup_path="" - - # Copy script to remote node first if needed - if [[ "$is_local" == "false" ]]; then - local remote_tmp="/tmp/exec_script_$(date +%s)_$RANDOM.sh" - echo " Copying script to $node_ip:$remote_tmp..." - if ! scp -o BatchMode=yes -o StrictHostKeyChecking=no "$script_path" "$node_ip:$remote_tmp"; then - echo "Error: Failed to copy launch script to $node_ip" - exit 1 - fi - target_script_path="$remote_tmp" - remote_cleanup_path="$remote_tmp" - fi # Copy script into container as /workspace/exec-script.sh echo " Copying script into container..." - $cmd_prefix docker cp "$target_script_path" "$container:/workspace/exec-script.sh" + docker cp "$target_script_path" "$container:/workspace/exec-script.sh" # Make executable - $cmd_prefix docker exec "$container" chmod +x /workspace/exec-script.sh + docker exec "$container" chmod +x /workspace/exec-script.sh - # Cleanup remote temp - if [[ -n "$remote_cleanup_path" ]]; then - ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$node_ip" "rm -f $remote_cleanup_path" - fi - - echo " Launch script copied to $node_ip" + echo " Launch script copied to head node" } # Start Cluster Function @@ -580,8 +566,7 @@ start_cluster() { # Copy launch script to head node only (workers don't need it - they just run Ray) if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then - echo "Copying launch script to head node..." 
-        copy_launch_script_to_container "$HEAD_IP" "$CONTAINER_NAME" "true" "$LAUNCH_SCRIPT_PATH"
+        copy_launch_script_to_container "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH"
     fi
 
     if [[ "$SOLO_MODE" == "false" ]]; then

From ec987259a04c187328c2c63f7a2d19c25d886132 Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Wed, 4 Feb 2026 12:01:53 -0800
Subject: [PATCH 6/7] Recipes and Launch Script support

---
 README.md                        | 57 ++++++++++++++++++++++++++++++--
 recipes/glm-4.7-flash-awq.yaml   |  2 +-
 recipes/openai-gpt-oss-120b.yaml |  2 +-
 3 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index f8b6ac3..66b06fb 100644
--- a/README.md
+++ b/README.md
@@ -159,6 +159,58 @@ Don't do it every time you rebuild, because it will slow down compilation times.
 
 For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
 
+### 2026-02-04
+
+#### Recipes support
+
+A major contribution from @raphaelamorim - model recipes.
+Recipes let you launch models with preconfigured settings with a single command.
+
+Example:
+
+```bash
+# List available recipes
+./run-recipe.sh --list
+
+# Run a recipe in solo mode (single node)
+./run-recipe.sh glm-4.7-flash-awq --solo
+
+# Full setup: build container + download model + run
+./run-recipe.sh glm-4.7-flash-awq --solo --setup
+
+# Run with overrides
+./run-recipe.sh glm-4.7-flash-awq --solo --port 9000 --gpu-mem 0.8
+
+# Cluster deployment
+./run-recipe.sh glm-4.7-nvfp4 --setup
+```
+
+Please refer to the [documentation](recipes/README.md) for details.
+
+#### Launch script option
+
+You can now specify a launch script to execute on the head node instead of specifying a command directly via the `exec` action.
+Example:
+
+```bash
+./launch-cluster.sh --launch-script examples/vllm-openai-gpt-oss-120b.sh
+```
+
+Thanks @raphaelamorim for the contribution!
+
+
+#### Ability to apply vLLM PRs during build
+
+`./build-and-copy.sh` now supports applying vLLM PRs to builds. The PR is applied to the most recent vLLM commit (or a specific vllm-ref if set). This does NOT apply to the wheels build or the MXFP4 special build!
+
+To use, just specify `--apply-vllm-pr ` in the arguments. Please note that it may fail if the PR needs a rebase against the specified vLLM reference or main branch. Use with caution!
+
+Example:
+
+```bash
+./build-and-copy.sh -t vllm-node-20260204-pr31740 --apply-vllm-pr 31740 -c
+```
+
 ### 2026-02-02
 
 #### Nemotron Nano mod
@@ -671,6 +723,7 @@ You can override the auto-detected values if needed:
 | `--nccl-debug` | NCCL debug level (e.g., INFO, WARN). Defaults to INFO if flag is present but value is omitted. |
 | `--check-config` | Check configuration and auto-detection without launching. |
 | `--solo` | Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster |
+| `--launch-script` | Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted. |
 | `-d` | Run in daemon mode (detached). |
 
 ## 3\. 
Running the Container (Manual) @@ -887,13 +940,13 @@ vllm serve openai/gpt-oss-120b \ ### Available Launch Scripts -The `profiles/` directory contains ready-to-use launch scripts: +The `examples/` directory contains ready-to-use launch scripts: - **example-vllm-minimax.sh** - MiniMax-M2-AWQ with Ray distributed backend - **vllm-openai-gpt-oss-120b.sh** - OpenAI GPT-OSS 120B with FlashInfer MOE - **vllm-glm-4.7-nvfp4.sh** - GLM-4.7-NVFP4 (requires the glm4_moe patch mod) -See [profiles/README.md](profiles/README.md) for detailed documentation and more examples. +See [examples/README.md](examples/README.md) for detailed documentation and more examples. ## 8\. Using cluster mode for inference diff --git a/recipes/glm-4.7-flash-awq.yaml b/recipes/glm-4.7-flash-awq.yaml index b0acb2f..9cf4ae0 100644 --- a/recipes/glm-4.7-flash-awq.yaml +++ b/recipes/glm-4.7-flash-awq.yaml @@ -33,7 +33,7 @@ mods: # Default settings (can be overridden via CLI) defaults: - port: 8888 + port: 8000 host: 0.0.0.0 tensor_parallel: 1 gpu_memory_utilization: 0.7 diff --git a/recipes/openai-gpt-oss-120b.yaml b/recipes/openai-gpt-oss-120b.yaml index 0e56aab..09cfa52 100644 --- a/recipes/openai-gpt-oss-120b.yaml +++ b/recipes/openai-gpt-oss-120b.yaml @@ -20,7 +20,7 @@ mods: [] # Default settings (can be overridden via CLI) defaults: - port: 8888 + port: 8000 host: 0.0.0.0 tensor_parallel: 2 gpu_memory_utilization: 0.70 From f139c4b55db2f404ac21e4504d4a799a664f558d Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Wed, 4 Feb 2026 12:06:30 -0800 Subject: [PATCH 7/7] Updated tests --- tests/expected_commands.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/expected_commands.sh b/tests/expected_commands.sh index 7ee60f1..5139038 100644 --- a/tests/expected_commands.sh +++ b/tests/expected_commands.sh @@ -23,7 +23,7 @@ GLM_FLASH_AWQ_ARGS=( "--max-num-batched-tokens 4096" "--max-num-seqs 64" "--gpu-memory-utilization 0.7" - "--port 8888" + "--port 8000" "--host 0.0.0.0" ) @@ -34,7 +34,7 @@ GLM_FLASH_AWQ_ARGS=( GPT_OSS_MODEL="openai/gpt-oss-120b" GPT_OSS_CONTAINER="vllm-node-mxfp4" GPT_OSS_ARGS=( - "--port 8888" + "--port 8000" "--host 0.0.0.0" "--enable-auto-tool-choice" "--tool-call-parser openai"