Merge branch 'main' into pytorch-base
59  .github/workflows/test-recipes.yml  (vendored, new file)
@@ -0,0 +1,59 @@
name: Recipe Tests

on:
  push:
    branches: [ main, profiles ]
    paths:
      - 'run-recipe.py'
      - 'run-recipe.sh'
      - 'launch-cluster.sh'
      - 'recipes/**'
      - 'tests/**'
      - '.github/workflows/test-recipes.yml'
  pull_request:
    paths:
      - 'run-recipe.py'
      - 'run-recipe.sh'
      - 'launch-cluster.sh'
      - 'recipes/**'
      - 'tests/**'
      - '.github/workflows/test-recipes.yml'
  workflow_dispatch:

jobs:
  test:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python-version: ['3.10', '3.11', '3.12']

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pyyaml

      - name: Make scripts executable
        run: |
          chmod +x run-recipe.py run-recipe.sh launch-cluster.sh
          chmod +x tests/test_recipes.sh

      - name: Run recipe integration tests
        run: |
          ./tests/test_recipes.sh -v

      - name: Verify all recipes with dry-run
        run: |
          for recipe in recipes/*.yaml; do
            name=$(basename "$recipe" .yaml)
            echo "Testing recipe: $name"
            ./run-recipe.py "$name" --dry-run --solo || exit 1
          done
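For reference, the same checks the workflow runs can be executed locally from the repository root before pushing (a sketch assembled from the steps above; assumes Python 3.10+ with PyYAML available):

```bash
pip install pyyaml                      # the only dependency the workflow installs
chmod +x run-recipe.py run-recipe.sh launch-cluster.sh tests/test_recipes.sh
./tests/test_recipes.sh -v              # integration tests
for recipe in recipes/*.yaml; do        # dry-run every recipe, as in CI
  ./run-recipe.py "$(basename "$recipe" .yaml)" --dry-run --solo || exit 1
done
```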
118  README.md
@@ -16,10 +16,11 @@ While it was primarily developed to support multi-node inference, it works just
 - [4. Using `run-cluster-node.sh` (Internal)](#4-using-run-cluster-nodesh-internal)
 - [5. Configuration Details](#5-configuration-details)
 - [6. Mods and Patches](#6-mods-and-patches)
-- [7. Using cluster mode for inference](#7-using-cluster-mode-for-inference)
-- [8. Fastsafetensors](#8-fastsafetensors)
-- [9. Benchmarking](#9-benchmarking)
-- [10. Downloading Models](#10-downloading-models)
+- [7. Launch Scripts](#7-launch-scripts)
+- [8. Using cluster mode for inference](#8-using-cluster-mode-for-inference)
+- [9. Fastsafetensors](#9-fastsafetensors)
+- [10. Benchmarking](#10-benchmarking)
+- [11. Downloading Models](#11-downloading-models)
 
 ## DISCLAIMER
 
@@ -158,6 +159,58 @@ Don't do it every time you rebuild, because it will slow down compilation times.
 
 For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
 
+### 2026-02-04
+
+#### Recipes support
+
+A major contribution from @raphaelamorim - model recipes.
+Recipes let you launch models with preconfigured settings in a single command.
+
+Example:
+
+```bash
+# List available recipes
+./run-recipe.sh --list
+
+# Run a recipe in solo mode (single node)
+./run-recipe.sh glm-4.7-flash-awq --solo
+
+# Full setup: build container + download model + run
+./run-recipe.sh glm-4.7-flash-awq --solo --setup
+
+# Run with overrides
+./run-recipe.sh glm-4.7-flash-awq --solo --port 9000 --gpu-mem 0.8
+
+# Cluster deployment
+./run-recipe.sh glm-4.7-nvfp4 --setup
+```
+
+Please refer to the [documentation](recipes/README.md) for details.
+
+#### Launch script option
+
+You can now specify a launch script to execute on the head node instead of passing a command directly via the `exec` action.
+Example:
+
+```bash
+./launch-cluster.sh --launch-script examples/vllm-openai-gpt-oss-120b.sh
+```
+
+Thanks @raphaelamorim for the contribution!
+
+#### Ability to apply vLLM PRs during build
+
+`./build-and-copy.sh` now supports applying vLLM PRs to builds. The PR is applied to the most recent vLLM commit (or to a specific vllm-ref if set). This does NOT apply to the wheels build or the MXFP4 special build!
+
+To use it, specify `--apply-vllm-pr <pr_num>` in the arguments. Note that it may fail if the PR needs a rebase against the specified vLLM reference or main branch. Use with caution!
+
+Example:
+
+```bash
+./build-and-copy.sh -t vllm-node-20260204-pr31740 --apply-vllm-pr 31740 -c
+```
+
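For context, fetching and applying a GitHub PR to a local checkout generally looks like the sketch below; this illustrates the mechanism only, not necessarily what `build-and-copy.sh` does internally (the PR number is the one from the example above):

```bash
# Inside the vLLM source checkout used for the build
git fetch origin pull/31740/head:pr-31740   # GitHub exposes PRs as pull/<num>/head
git merge --no-edit pr-31740                # fails here if the PR needs a rebase
```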
 ### 2026-02-02
 
 #### Nemotron Nano mod
@@ -670,6 +723,7 @@ You can override the auto-detected values if needed:
 | `--nccl-debug` | NCCL debug level (e.g., INFO, WARN). Defaults to INFO if flag is present but value is omitted. |
 | `--check-config` | Check configuration and auto-detection without launching. |
 | `--solo` | Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster |
+| `--launch-script` | Path to a bash script to execute in the container (from the `examples/` directory or an absolute path). If a launch script is specified, the action should be omitted. |
 | `-d` | Run in daemon mode (detached). |
 
 ## 3\. Running the Container (Manual)
@@ -846,7 +900,55 @@ Mods can be used for:
 - Customizing vLLM behavior for specific workloads
 - Rapid iteration on development without rebuilding the entire image
 
-## 7\. Using cluster mode for inference
+## 7\. Launch Scripts
+
+Launch scripts provide a simple way to define reusable model configurations. Instead of passing long command lines, you can create a bash script that is copied into the container and executed directly.
+
+### Basic Usage
+
+```bash
+# Use a launch script by name (looks in the examples/ directory)
+./launch-cluster.sh --launch-script example-vllm-minimax
+
+# Use with explicit nodes
+./launch-cluster.sh -n 192.168.1.1,192.168.1.2 --launch-script vllm-openai-gpt-oss-120b.sh
+
+# Combine with mods for models requiring patches
+./launch-cluster.sh --launch-script vllm-glm-4.7-nvfp4.sh --apply-mod mods/fix-Salyut1-GLM-4.7-NVFP4
+```
+
+### Script Format
+
+Launch scripts are simple bash files that run directly inside the container:
+
+```bash
+#!/bin/bash
+# PROFILE: OpenAI GPT-OSS 120B
+# DESCRIPTION: vLLM serving openai/gpt-oss-120b with FlashInfer MOE optimization
+
+# Set environment variables if needed
+export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1
+
+# Run your command
+vllm serve openai/gpt-oss-120b \
+  --host 0.0.0.0 \
+  --port 8000 \
+  --tensor-parallel-size 2 \
+  --distributed-executor-backend ray \
+  --enable-auto-tool-choice
+```
+
+### Available Launch Scripts
+
+The `examples/` directory contains ready-to-use launch scripts:
+
+- **example-vllm-minimax.sh** - MiniMax-M2-AWQ with Ray distributed backend
+- **vllm-openai-gpt-oss-120b.sh** - OpenAI GPT-OSS 120B with FlashInfer MOE
+- **vllm-glm-4.7-nvfp4.sh** - GLM-4.7-NVFP4 (requires the glm4_moe patch mod)
+
+See [examples/README.md](examples/README.md) for detailed documentation and more examples.
+
+## 8\. Using cluster mode for inference
+
 First, follow the instructions above to start the head container on your first Spark, and the node container on the second Spark.
 Then, on the first Spark, run vllm like this:
||||||
@@ -863,7 +965,7 @@ docker exec -it vllm_node
|
|||||||
|
|
||||||
And execute vllm command inside.
|
And execute vllm command inside.
|
||||||
|
|
||||||
## 8\. Fastsafetensors
|
## 9\. Fastsafetensors
|
||||||
|
|
||||||
This build includes support for fastsafetensors loading which significantly improves loading speeds, especially on DGX Spark where MMAP performance is very poor currently.
|
This build includes support for fastsafetensors loading which significantly improves loading speeds, especially on DGX Spark where MMAP performance is very poor currently.
|
||||||
[Fasttensors](https://github.com/foundation-model-stack/fastsafetensors/) solve this issue by using more efficient multi-threaded loading while avoiding mmap.
|
[Fasttensors](https://github.com/foundation-model-stack/fastsafetensors/) solve this issue by using more efficient multi-threaded loading while avoiding mmap.
|
||||||
@@ -877,11 +979,11 @@ To use this method, simply include `--load-format fastsafetensors` when running
|
|||||||
HF_HUB_OFFLINE=1 vllm serve openai/gpt-oss-120b --port 8888 --host 0.0.0.0 --trust_remote_code --swap-space 16 --gpu-memory-utilization 0.7 -tp 2 --distributed-executor-backend ray --load-format fastsafetensors
|
HF_HUB_OFFLINE=1 vllm serve openai/gpt-oss-120b --port 8888 --host 0.0.0.0 --trust_remote_code --swap-space 16 --gpu-memory-utilization 0.7 -tp 2 --distributed-executor-backend ray --load-format fastsafetensors
|
||||||
```
|
```
|
||||||
|
|
||||||
## 9\. Benchmarking
|
## 10\. Benchmarking
|
||||||
|
|
||||||
I recommend using [llama-benchy](https://github.com/eugr/llama-benchy) - a new benchmarking tool that delivers results in the same format as llama-bench from llama.cpp suite.
|
I recommend using [llama-benchy](https://github.com/eugr/llama-benchy) - a new benchmarking tool that delivers results in the same format as llama-bench from llama.cpp suite.
|
||||||
|
|
||||||
## 10\. Downloading Models
|
## 11\. Downloading Models
|
||||||
|
|
||||||
The `hf-download.sh` script provides a convenient way to download models from HuggingFace and distribute them across your cluster nodes. It uses Huggingface CLI via `uvx` for fast downloads and `rsync` for distribution across the cluster.
|
The `hf-download.sh` script provides a convenient way to download models from HuggingFace and distribute them across your cluster nodes. It uses Huggingface CLI via `uvx` for fast downloads and `rsync` for distribution across the cluster.
|
||||||
|
|
||||||
|
|||||||
186  examples/README.md  (new file)
@@ -0,0 +1,186 @@
# Example Launch Scripts

This directory contains example bash scripts that demonstrate how to use the `--launch-script` option directly with `launch-cluster.sh`.

**Note:** For most use cases, the recipe system (`./run-recipe.sh`) is the recommended approach. These examples are provided for reference and for advanced users who need direct control over the launch process.

## Why Launch Scripts?

- **Simple** - Just write a bash script that runs your command
- **Flexible** - Use any bash features: environment variables, conditionals, loops
- **Standalone** - Each script can be tested directly on a head node
- **No magic** - What you see is what gets executed

## Usage

```bash
# Use a launch script by name (looks in the examples/ directory)
./launch-cluster.sh --launch-script example-vllm-minimax

# Use a launch script by filename
./launch-cluster.sh --launch-script example-vllm-minimax.sh

# Use a launch script with an absolute path
./launch-cluster.sh --launch-script /path/to/my-script.sh

# Combine with mods if needed
./launch-cluster.sh --launch-script my-script.sh --apply-mod mods/my-patch

# Combine with other options
./launch-cluster.sh -n 192.168.1.1,192.168.1.2 --launch-script my-model.sh -d
```

When using `--launch-script`, the `exec` action is automatically implied; do not pass an action explicitly.

## Script Structure

Launch scripts are simple bash scripts. The script is copied into the container at `/workspace/exec-script.sh` and executed.

```bash
#!/bin/bash
# PROFILE: Human-readable name
# DESCRIPTION: What this script does

# Optional: Set environment variables
export MY_VAR="value"

# Run your command
vllm serve org/model-name \
  --port 8000 \
  --host 0.0.0.0 \
  --gpu-memory-utilization 0.7
```

### Metadata Comments

The `# PROFILE:` and `# DESCRIPTION:` comments are optional but recommended for documentation:

```bash
#!/bin/bash
# PROFILE: MiniMax-M2-AWQ Example
# DESCRIPTION: vLLM serving MiniMax-M2-AWQ with Ray distributed backend
```

## Examples

### Basic vLLM Serving

```bash
#!/bin/bash
# PROFILE: MiniMax-M2-AWQ
# DESCRIPTION: vLLM serving MiniMax-M2-AWQ with Ray distributed backend

vllm serve QuantTrio/MiniMax-M2-AWQ \
  --port 8000 \
  --host 0.0.0.0 \
  --gpu-memory-utilization 0.7 \
  -tp 2 \
  --distributed-executor-backend ray \
  --max-model-len 128000 \
  --load-format fastsafetensors \
  --enable-auto-tool-choice \
  --tool-call-parser minimax_m2
```

### With Environment Variables

```bash
#!/bin/bash
# PROFILE: OpenAI GPT-OSS 120B
# DESCRIPTION: vLLM serving openai/gpt-oss-120b with FlashInfer MOE optimization

# Enable FlashInfer MOE with MXFP4/MXFP8 quantization
export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1

vllm serve openai/gpt-oss-120b \
  --tool-call-parser openai \
  --enable-auto-tool-choice \
  --tensor-parallel-size 2 \
  --distributed-executor-backend ray \
  --host 0.0.0.0 \
  --port 8000
```

### With Conditional Logic

```bash
#!/bin/bash
# PROFILE: Adaptive Model Server
# DESCRIPTION: Adjusts settings based on available GPUs

GPU_COUNT=$(nvidia-smi -L | wc -l)
echo "Detected $GPU_COUNT GPUs"

if [[ $GPU_COUNT -ge 4 ]]; then
  TP_SIZE=4
  MEM_UTIL=0.9
else
  TP_SIZE=2
  MEM_UTIL=0.7
fi

vllm serve meta-llama/Llama-3.1-70B-Instruct \
  --port 8000 \
  --host 0.0.0.0 \
  -tp $TP_SIZE \
  --gpu-memory-utilization $MEM_UTIL \
  --distributed-executor-backend ray
```

### SGLang

```bash
#!/bin/bash
# PROFILE: SGLang Llama 3.1
# DESCRIPTION: SGLang runtime with Llama 3.1

sglang launch meta-llama/Llama-3.1-8B-Instruct \
  --port 8000 \
  --host 0.0.0.0 \
  --tp 2
```

### With Model Requiring Patches

If your model requires patches, use `--apply-mod` alongside `--launch-script`:

```bash
# Script: vllm-glm-4.7-nvfp4.sh
#!/bin/bash
# PROFILE: Salyut1/GLM-4.7-NVFP4
# DESCRIPTION: vLLM serving GLM-4.7-NVFP4
# NOTE: Requires --apply-mod mods/fix-Salyut1-GLM-4.7-NVFP4

vllm serve Salyut1/GLM-4.7-NVFP4 \
  --attention-config.backend flashinfer \
  --tool-call-parser glm47 \
  -tp 2 \
  --host 0.0.0.0 \
  --port 8000
```

Usage:
```bash
./launch-cluster.sh --launch-script vllm-glm-4.7-nvfp4.sh --apply-mod mods/fix-Salyut1-GLM-4.7-NVFP4
```

## Creating a New Launch Script

1. Create a new `.sh` file in this directory
2. Add the shebang `#!/bin/bash`
3. Add `# PROFILE:` and `# DESCRIPTION:` comments
4. Write your command (e.g., `vllm serve ...`)
5. Run with `./launch-cluster.sh --launch-script my-script.sh`
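Putting the steps together, a minimal new script could look like the sketch below (the model name and filename are placeholders):

```bash
#!/bin/bash
# PROFILE: My Model
# DESCRIPTION: Minimal launch script template (placeholder model)

vllm serve org/my-model \
  --host 0.0.0.0 \
  --port 8000
```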
## Testing Scripts

Since launch scripts are standard bash files, you can test them directly:

```bash
# Inside a running container or on a head node with the runtime installed
cd examples
./my-script.sh
```

This makes development and debugging much easier than complex configuration systems.
15  examples/example-vllm-minimax.sh  (new file)
@@ -0,0 +1,15 @@
#!/bin/bash
# PROFILE: MiniMax-M2-AWQ Example
# DESCRIPTION: vLLM serving MiniMax-M2-AWQ with Ray distributed backend

vllm serve QuantTrio/MiniMax-M2-AWQ \
  --port 8000 \
  --host 0.0.0.0 \
  --gpu-memory-utilization 0.7 \
  -tp 2 \
  --distributed-executor-backend ray \
  --max-model-len 128000 \
  --load-format fastsafetensors \
  --enable-auto-tool-choice \
  --tool-call-parser minimax_m2 \
  --reasoning-parser minimax_m2_append_think
17  examples/vllm-glm-4.7-nvfp4.sh  (new file)
@@ -0,0 +1,17 @@
#!/bin/bash
# PROFILE: Salyut1/GLM-4.7-NVFP4
# DESCRIPTION: vLLM serving GLM-4.7-NVFP4
# NOTE: This profile requires --apply-mod mods/fix-Salyut1-GLM-4.7-NVFP4 to fix k/v scales incompatibility
# See: https://huggingface.co/Salyut1/GLM-4.7-NVFP4/discussions/3#694ab9b6e2efa04b7ecb0c4b

vllm serve Salyut1/GLM-4.7-NVFP4 \
  --attention-config.backend flashinfer \
  --tool-call-parser glm47 \
  --reasoning-parser glm45 \
  --enable-auto-tool-choice \
  -tp 2 \
  --gpu-memory-utilization 0.88 \
  --max-model-len 32000 \
  --distributed-executor-backend ray \
  --host 0.0.0.0 \
  --port 8000
20  examples/vllm-openai-gpt-oss-120b.sh  (new file)
@@ -0,0 +1,20 @@
#!/bin/bash
# PROFILE: OpenAI GPT-OSS 120B
# DESCRIPTION: vLLM serving openai/gpt-oss-120b with FlashInfer MOE optimization

# Enable FlashInfer MOE with MXFP4/MXFP8 quantization
export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1

vllm serve openai/gpt-oss-120b \
  --tool-call-parser openai \
  --enable-auto-tool-choice \
  --tensor-parallel-size 2 \
  --distributed-executor-backend ray \
  --kv-cache-dtype fp8 \
  --gpu-memory-utilization 0.70 \
  --max-model-len 128000 \
  --max-num-batched-tokens 4096 \
  --max-num-seqs 8 \
  --enable-prefix-caching \
  --host 0.0.0.0 \
  --port 8000
launch-cluster.sh
@@ -26,6 +26,8 @@ ACTION="start"
 CLUSTER_WAS_RUNNING="false"
 MOD_PATHS=()
 MOD_TYPES=()
+LAUNCH_SCRIPT_PATH=""
+SCRIPT_DIR="$(dirname "$(realpath "$0")")"
 
 ACTIONS_ARG=""
 SOLO_MODE="false"
@@ -41,11 +43,16 @@ usage() {
   echo "  -e, --env         Environment variable to pass to container (e.g. -e VAR=val)"
   echo "  --nccl-debug      NCCL debug level (Optional, one of: VERSION, WARN, INFO, TRACE). If no level is provided, defaults to INFO."
   echo "  --apply-mod       Path to directory or zip file containing run.sh to apply before launch (Can be specified multiple times)"
+  echo "  --launch-script   Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted."
   echo "  --check-config    Check configuration and auto-detection without launching"
   echo "  --solo            Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster"
   echo "  -d                Daemon mode (only for 'start' action)"
-  echo "  action            start | stop | status | exec (Default: start)"
-  echo "  command           Command to run (only for 'exec' action)"
+  echo "  action            start | stop | status | exec (Default: start). Not compatible with --launch-script."
+  echo "  command           Command to run (only for 'exec' action). Not compatible with --launch-script."
+  echo ""
+  echo "Launch Script Usage:"
+  echo "  $0 --launch-script examples/my-script.sh   # Script copied to container and executed"
+  echo "  $0 --launch-script /path/to/script.sh      # Uses absolute path to script"
   exit 1
 }
|
|
||||||
@@ -59,6 +66,7 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
--ib-if) IB_IF="$2"; shift ;;
|
--ib-if) IB_IF="$2"; shift ;;
|
||||||
-e|--env) DOCKER_ARGS="$DOCKER_ARGS -e $2"; shift ;;
|
-e|--env) DOCKER_ARGS="$DOCKER_ARGS -e $2"; shift ;;
|
||||||
--apply-mod) MOD_PATHS+=("$2"); shift ;;
|
--apply-mod) MOD_PATHS+=("$2"); shift ;;
|
||||||
|
--launch-script) LAUNCH_SCRIPT_PATH="$2"; shift ;;
|
||||||
--nccl-debug)
|
--nccl-debug)
|
||||||
if [[ -n "$2" && "$2" =~ ^(VERSION|WARN|INFO|TRACE)$ ]]; then
|
if [[ -n "$2" && "$2" =~ ^(VERSION|WARN|INFO|TRACE)$ ]]; then
|
||||||
NCCL_DEBUG_VAL="$2"
|
NCCL_DEBUG_VAL="$2"
|
||||||
@@ -72,9 +80,17 @@ while [[ "$#" -gt 0 ]]; do
     -d) DAEMON_MODE="true" ;;
     -h|--help) usage ;;
     start|stop|status)
+      if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
+        echo "Error: Action '$1' is not compatible with --launch-script. Specify either an action or --launch-script, not both."
+        exit 1
+      fi
       ACTION="$1"
       ;;
     exec)
+      if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
+        echo "Error: Action 'exec' is not compatible with --launch-script. Specify either an action or --launch-script, not both."
+        exit 1
+      fi
       ACTION="exec"
       shift
       COMMAND_TO_RUN="$@"
@@ -85,6 +101,10 @@ while [[ "$#" -gt 0 ]]; do
       # unless it's the default 'start' implied.
       # However, to support "omitted" = start, we need to be careful.
       # If the arg looks like a command, it's exec.
+      if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
+        echo "Error: Command is not compatible with --launch-script. Specify either a command or --launch-script, not both."
+        exit 1
+      fi
       ACTION="exec"
       COMMAND_TO_RUN="$@"
       break
@@ -107,6 +127,37 @@ if [[ -n "$NCCL_DEBUG_VAL" ]]; then
   esac
 fi
 
+# Resolve launch script path if specified
+if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
+  # Check if it's an absolute path or a relative path that exists
+  if [[ -f "$LAUNCH_SCRIPT_PATH" ]]; then
+    LAUNCH_SCRIPT_PATH=$(realpath "$LAUNCH_SCRIPT_PATH")
+  # Check if it's just a filename; look in the examples/ directory
+  elif [[ -f "$SCRIPT_DIR/examples/$LAUNCH_SCRIPT_PATH" ]]; then
+    LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/examples/$LAUNCH_SCRIPT_PATH"
+  # Check if it's a name without the .sh extension
+  elif [[ -f "$SCRIPT_DIR/examples/${LAUNCH_SCRIPT_PATH}.sh" ]]; then
+    LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/examples/${LAUNCH_SCRIPT_PATH}.sh"
+  else
+    echo "Error: Launch script '$LAUNCH_SCRIPT_PATH' not found."
+    echo "Searched in:"
+    echo "  - $LAUNCH_SCRIPT_PATH"
+    echo "  - $SCRIPT_DIR/examples/$LAUNCH_SCRIPT_PATH"
+    echo "  - $SCRIPT_DIR/examples/${LAUNCH_SCRIPT_PATH}.sh"
+    exit 1
+  fi
+
+  echo "Using launch script: $LAUNCH_SCRIPT_PATH"
+
+  # Set command to run the copied script (use absolute path since docker exec may not be in /workspace)
+  COMMAND_TO_RUN="/workspace/exec-script.sh"
+
+  # If a launch script is specified, default the action to exec unless explicitly set to stop/status
+  if [[ "$ACTION" == "start" ]]; then
+    ACTION="exec"
+  fi
+fi
+
 # Validate MOD_PATHS if set
 for i in "${!MOD_PATHS[@]}"; do
   mod_path="${MOD_PATHS[$i]}"
@@ -427,6 +478,25 @@ apply_mod_to_container() {
   fi
 }
 
+# Copy Launch Script to Container Function
+copy_launch_script_to_container() {
+  local container="$1"
+  local script_path="$2"
+
+  echo "Copying launch script to head node..."
+
+  local target_script_path="$script_path"
+
+  # Copy script into container as /workspace/exec-script.sh
+  echo "  Copying script into container..."
+  docker cp "$target_script_path" "$container:/workspace/exec-script.sh"
+
+  # Make executable
+  docker exec "$container" chmod +x /workspace/exec-script.sh
+
+  echo "  Launch script copied to head node"
+}
+
 # Start Cluster Function
 start_cluster() {
   check_cluster_running
@@ -495,6 +565,11 @@ start_cluster() {
     done
   fi
 
+  # Copy launch script to head node only (workers don't need it - they just run Ray)
+  if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
+    copy_launch_script_to_container "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH"
+  fi
+
   if [[ "$SOLO_MODE" == "false" ]]; then
     wait_for_cluster
   else
266  recipes/README.md  (new file)
@@ -0,0 +1,266 @@
# Recipes

Recipes provide a **one-click solution** for deploying models with pre-configured settings. Each recipe is a YAML file that specifies:

- HuggingFace model to download
- Container image and build arguments
- Required mods/patches
- Default parameters (port, host, tensor parallelism, etc.)
- Environment variables
- The vLLM serve command

## Quick Start

```bash
# List available recipes
./run-recipe.sh --list

# Run a recipe in solo mode (single node)
./run-recipe.sh glm-4.7-flash-awq --solo

# Full setup: build container + download model + run
./run-recipe.sh glm-4.7-flash-awq --solo --setup

# Run with overrides
./run-recipe.sh glm-4.7-flash-awq --solo --port 9000 --gpu-mem 0.8

# Cluster deployment
./run-recipe.sh glm-4.7-nvfp4 -n 192.168.1.10,192.168.1.11 --setup
```

## Cluster Node Discovery

The recipe runner can automatically discover cluster nodes:

```bash
# Auto-discover nodes and save to .env
./run-recipe.sh --discover

# Show current .env configuration
./run-recipe.sh --show-env

# Run recipe (uses nodes from .env automatically)
./run-recipe.sh glm-4.7-nvfp4 --setup
```

When you run `--discover`, it:
1. Scans the network for nodes with SSH access
2. Prompts you to select which nodes to include
3. Saves the configuration to `.env`

Future recipe runs will automatically use nodes from `.env` unless you specify `-n` or `--solo`.
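For illustration, the saved `.env` might look like the sketch below; the exact variable name is an assumption, since the file is generated by `--discover`:

```bash
# Hypothetical .env written by ./run-recipe.sh --discover
# (variable name is an assumption; first IP is the head node)
NODES=192.168.1.10,192.168.1.11
```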
## Workflow Modes

### Solo Mode (Single Node)
```bash
# Explicitly run in solo mode
./run-recipe.sh glm-4.7-flash-awq --solo

# If no nodes configured, defaults to solo
./run-recipe.sh minimax-m2-awq
```

### Cluster Mode (Multiple Nodes)
```bash
# Specify nodes directly (first IP is head node)
./run-recipe.sh glm-4.7-nvfp4 -n 192.168.1.10,192.168.1.11 --setup

# Or use auto-discovered nodes from .env
./run-recipe.sh --discover   # First time only
./run-recipe.sh glm-4.7-nvfp4 --setup
```

When using cluster mode with `--setup`:
- Container is built locally and copied to all worker nodes
- Model is downloaded locally and copied to all worker nodes

### Cluster-Only Recipes

Some models are too large to run on a single node. These recipes have `cluster_only: true` and will fail with a helpful error if you try to run them in solo mode:

```bash
$ ./run-recipe.sh glm-4.7-nvfp4 --solo
Error: Recipe 'GLM-4.7-NVFP4' requires cluster mode.
This model is too large to run on a single node.

Options:
  1. Specify nodes directly:  ./run-recipe.sh glm-4.7-nvfp4 -n node1,node2
  2. Auto-discover and save:  ./run-recipe.sh --discover
     Then run:                ./run-recipe.sh glm-4.7-nvfp4
```

## Setup Options

| Flag | Description |
|------|-------------|
| `--setup` | Full setup: build (if missing) + download (if missing) + run |
| `--build-only` | Only build/copy the container, don't run |
| `--download-only` | Only download/copy the model, don't run |
| `--force-build` | Rebuild even if container exists |
| `--force-download` | Re-download even if model exists |
| `--dry-run` | Show what would happen without executing |

## Recipe Format

```yaml
# Required fields
name: Human-readable name
container: docker-image-name
command: |
  vllm serve model/name \
    --port {port} \
    --host {host}

# Optional fields
description: What this recipe does
model: org/model-name        # HuggingFace model ID for --setup downloads
cluster_only: false          # Set to true if model requires cluster mode
build_args:                  # Extra args for build-and-copy.sh
  - --pre-tf                 # e.g., for transformers 5.0
  - --exp-mxfp4              # e.g., for MXFP4 Dockerfile
mods:
  - mods/some-patch
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.85
  max_model_len: 32000
env:
  SOME_VAR: "value"
```

### Build Arguments

The `build_args` field passes flags to `build-and-copy.sh`:

| Flag | Description |
|------|-------------|
| `--pre-tf` | Use transformers 5.0 (required for GLM-4.7 models) |
| `--exp-mxfp4` | Use the MXFP4 Dockerfile (for MXFP4-quantized models) |
| `--use-wheels` | Use pre-built wheels instead of building from source |

### Parameter Substitution

Use `{param_name}` in the command to substitute values from defaults or CLI overrides:

```yaml
defaults:
  port: 8000
  tensor_parallel: 2

command: |
  vllm serve my/model \
    --port {port} \
    -tp {tensor_parallel}
```

Override at runtime:
```bash
./run-recipe.sh my-recipe --port 9000 --tp 4
```
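Mechanically, the substitution amounts to replacing each `{param}` token with its resolved value. A minimal sketch of the idea in bash (an illustration only, not the actual run-recipe.py implementation):

```bash
PORT=9000; TP=4   # resolved from defaults, then CLI overrides
sed -e "s/{port}/$PORT/g" -e "s/{tensor_parallel}/$TP/g" <<'EOF'
vllm serve my/model \
  --port {port} \
  -tp {tensor_parallel}
EOF
```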
## CLI Reference

```
Usage: ./run-recipe.sh [OPTIONS] [RECIPE]

Cluster discovery:
  --discover                   Auto-detect cluster nodes and save to .env
  --show-env                   Show current .env configuration

Recipe overrides:
  --port PORT                  Override port
  --host HOST                  Override host
  --tensor-parallel, --tp N    Override tensor parallelism
  --gpu-memory-utilization N   Override GPU memory utilization (--gpu-mem)
  --max-model-len N            Override max model length

Setup options:
  --setup                      Full setup: build + download + run
  --build-only                 Only build/copy container, don't run
  --download-only              Only download/copy model, don't run
  --force-build                Rebuild even if container exists
  --force-download             Re-download even if model exists

Launch options:
  --solo                       Run in solo mode (single node, no Ray)
  -n, --nodes IPS              Comma-separated node IPs (first = head)
  -d, --daemon                 Run in daemon mode
  -t, --container IMAGE        Override container from recipe
  --nccl-debug LEVEL           NCCL debug level (VERSION, WARN, INFO, TRACE)

Other:
  --dry-run                    Show what would be executed
  --list, -l                   List available recipes
```

## Creating a Recipe

1. Create a new `.yaml` file in `recipes/`
2. Specify the required fields: `name`, `container`, `command`
3. Add `build_args` if your model needs special build options
4. Add `mods` if your model needs patches
5. Set `cluster_only: true` if the model is too large for a single node
6. Set sensible `defaults`
7. Add `env` variables if needed

Example:
```yaml
name: My Model
description: My custom model setup
container: vllm-node-tf5

build_args:
  - --pre-tf

mods:
  - mods/my-fix

defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 1
  gpu_memory_utilization: 0.85

command: |
  vllm serve org/my-model \
    --port {port} \
    --host {host} \
    -tp {tensor_parallel} \
    --gpu-memory-utilization {gpu_memory_utilization}
```

## Architecture

```
┌─────────────────────────────────────────────────────────┐
│ run-recipe.sh / run-recipe.py                           │
│ - Parses YAML recipe                                    │
│ - Auto-discovers cluster nodes (--discover)             │
│ - Loads nodes from .env                                 │
│ - Handles --setup (build + download + run)              │
│ - Generates launch script from template                 │
│ - Applies CLI overrides                                 │
└──────────┬────────────────────────┬─────────────────────┘
           │ calls (for build)      │ calls (for download)
           ▼                        ▼
┌──────────────────────┐  ┌───────────────────────────────┐
│ build-and-copy.sh    │  │ hf-download.sh                │
│ - Docker build       │  │ - HuggingFace model download  │
│ - Copy to workers    │  │ - Rsync to workers            │
└──────────────────────┘  └───────────────────────────────┘
           │
           │ then calls (for run)
           ▼
┌─────────────────────────────────────────────────────────┐
│ launch-cluster.sh                                       │
│ - Cluster orchestration                                 │
│ - Container lifecycle                                   │
│ - Mod application                                       │
│ - Launch script execution                               │
└─────────────────────────────────────────────────────────┘
```

This separation follows the Unix philosophy: `run-recipe.sh` provides convenience, while the underlying scripts remain focused on their specific tasks.
64  recipes/glm-4.7-flash-awq.yaml  (new file)
@@ -0,0 +1,64 @@
# Recipe: GLM-4.7-Flash-AWQ-4bit
# cyankiwi's AWQ-quantized GLM-4.7-Flash model
# Requires a patch for inference speed optimization
#
# NOTE: The vLLM implementation is suboptimal even with the patch.
# Model performance is still significantly slower than it should be
# for a model with this number of active parameters. Running in a cluster
# increases prompt processing performance, but not token generation.
# Expect ~40 t/s generation speed on both a single node and a cluster.

recipe_version: "1"
name: GLM-4.7-Flash-AWQ
description: vLLM serving cyankiwi/GLM-4.7-Flash-AWQ-4bit with speed optimization patch

# HuggingFace model to download
model: cyankiwi/GLM-4.7-Flash-AWQ-4bit

# This model can run on a single node (solo) or a cluster
cluster_only: false

# Container image to use
container: vllm-node-tf5

# Build arguments for build-and-copy.sh
# tf5 = transformers 5.0 (required for GLM-4.7)
build_args:
  - --pre-tf

# Mods to apply before running (paths relative to repo root)
# This mod prevents severe inference speed degradation
mods:
  - mods/fix-glm-4.7-flash-AWQ

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 1
  gpu_memory_utilization: 0.7
  max_model_len: 202752
  max_num_batched_tokens: 4096
  max_num_seqs: 64
  served_model_name: glm-4.7-flash

# Environment variables to set in the container
env:
  # Add any required env vars here

# The vLLM serve command template
# Use {var_name} for substitution from defaults/overrides
# In cluster mode, --distributed-executor-backend ray and -tp 2 are added
command: |
  vllm serve cyankiwi/GLM-4.7-Flash-AWQ-4bit \
    --tool-call-parser glm47 \
    --reasoning-parser glm45 \
    --enable-auto-tool-choice \
    --served-model-name {served_model_name} \
    --max-model-len {max_model_len} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --max-num-seqs {max_num_seqs} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    -tp {tensor_parallel} \
    --host {host} \
    --port {port}
40  recipes/minimax-m2-awq.yaml  (new file)
@@ -0,0 +1,40 @@
# Recipe: MiniMax-M2-AWQ
# MiniMax M2 model with AWQ quantization

recipe_version: "1"
name: MiniMax-M2-AWQ
description: vLLM serving MiniMax-M2-AWQ with Ray distributed backend

# HuggingFace model to download (optional, for --download-model)
model: QuantTrio/MiniMax-M2-AWQ

# Container image to use
container: vllm-node

# No mods required
mods: []

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 128000

# Environment variables
env: {}

# The vLLM serve command template
command: |
  vllm serve QuantTrio/MiniMax-M2-AWQ \
    --port {port} \
    --host {host} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray \
    --max-model-len {max_model_len} \
    --load-format fastsafetensors \
    --enable-auto-tool-choice \
    --tool-call-parser minimax_m2 \
    --reasoning-parser minimax_m2_append_think
52  recipes/openai-gpt-oss-120b.yaml  (new file)
@@ -0,0 +1,52 @@
# Recipe: OpenAI GPT-OSS 120B
# OpenAI's open-source 120B MoE model with MXFP4 quantization support

recipe_version: "1"
name: OpenAI GPT-OSS 120B
description: vLLM serving openai/gpt-oss-120b with MXFP4 quantization and FlashInfer

# HuggingFace model to download (optional, for --download-model)
model: openai/gpt-oss-120b

# Container image to use
container: vllm-node-mxfp4

# Build arguments for build-and-copy.sh
build_args:
  - --exp-mxfp4

# No mods required for this model
mods: []

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.70
  max_num_batched_tokens: 8192

# Environment variables to set in the container
env:
  VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: "1"

# The vLLM serve command template
# Uses MXFP4 quantization for memory efficiency
command: |
  vllm serve openai/gpt-oss-120b \
    --tool-call-parser openai \
    --reasoning-parser openai_gptoss \
    --enable-auto-tool-choice \
    --tensor-parallel-size {tensor_parallel} \
    --distributed-executor-backend ray \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --enable-prefix-caching \
    --load-format fastsafetensors \
    --quantization mxfp4 \
    --mxfp4-backend CUTLASS \
    --mxfp4-layers moe,qkv,o,lm_head \
    --attention-backend FLASHINFER \
    --kv-cache-dtype fp8 \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --host {host} \
    --port {port}
1124  run-recipe.py  (new executable file)
(File diff suppressed because it is too large.)
42  run-recipe.sh  (new executable file)
@@ -0,0 +1,42 @@
#!/bin/bash
#
# run-recipe.sh - Wrapper for run-recipe.py
#
# Ensures Python dependencies are available and runs the recipe runner.
#

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
RECIPE_SCRIPT="$SCRIPT_DIR/run-recipe.py"

# Check for Python 3.10+
if command -v python3 &>/dev/null; then
    PYTHON=python3
elif command -v python &>/dev/null; then
    PYTHON=python
else
    echo "Error: Python 3 not found. Please install Python 3.10 or later."
    exit 1
fi

# Verify version
PY_VERSION=$($PYTHON -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
PY_MAJOR=$($PYTHON -c 'import sys; print(sys.version_info.major)')
PY_MINOR=$($PYTHON -c 'import sys; print(sys.version_info.minor)')

if [[ "$PY_MAJOR" -lt 3 ]] || [[ "$PY_MAJOR" -eq 3 && "$PY_MINOR" -lt 10 ]]; then
    echo "Error: Python 3.10+ required, found $PY_VERSION"
    exit 1
fi

# Check for PyYAML and install if missing
if ! $PYTHON -c "import yaml" 2>/dev/null; then
    echo "Installing PyYAML..."
    $PYTHON -m pip install --quiet pyyaml
    if [[ $? -ne 0 ]]; then
        echo "Error: Failed to install PyYAML. Try: pip install pyyaml"
        exit 1
    fi
fi

# Run the recipe script
exec $PYTHON "$RECIPE_SCRIPT" "$@"
89  tests/expected_commands.sh  (new file)
@@ -0,0 +1,89 @@
# Expected vLLM serve arguments for each recipe
# This file is used by test_recipes.sh to verify recipes match the README documentation
#
# Format: Each recipe has a section with expected arguments
# Tests will verify these arguments appear in the dry-run output
#
# IMPORTANT: Keep this in sync with the README.md documentation
# When updating recipes, update both README.md and this file

# ==============================================================================
# glm-4.7-flash-awq
# README Reference: Lines 186-198 (solo) and 203-218 (cluster)
# ==============================================================================
GLM_FLASH_AWQ_MODEL="cyankiwi/GLM-4.7-Flash-AWQ-4bit"
GLM_FLASH_AWQ_CONTAINER="vllm-node-tf5"
GLM_FLASH_AWQ_MOD="mods/fix-glm-4.7-flash-AWQ"
GLM_FLASH_AWQ_ARGS=(
    "--tool-call-parser glm47"
    "--reasoning-parser glm45"
    "--enable-auto-tool-choice"
    "--served-model-name glm-4.7-flash"
    "--max-model-len 202752"
    "--max-num-batched-tokens 4096"
    "--max-num-seqs 64"
    "--gpu-memory-utilization 0.7"
    "--port 8000"
    "--host 0.0.0.0"
)

# ==============================================================================
# openai-gpt-oss-120b
# README Reference: Lines 244-257 (solo) and 264-280 (cluster)
# ==============================================================================
GPT_OSS_MODEL="openai/gpt-oss-120b"
GPT_OSS_CONTAINER="vllm-node-mxfp4"
GPT_OSS_ARGS=(
    "--port 8000"
    "--host 0.0.0.0"
    "--enable-auto-tool-choice"
    "--tool-call-parser openai"
    "--reasoning-parser openai_gptoss"
    "--gpu-memory-utilization 0.7"
    "--enable-prefix-caching"
    "--load-format fastsafetensors"
    "--quantization mxfp4"
    "--mxfp4-backend CUTLASS"
    "--mxfp4-layers moe,qkv,o,lm_head"
    "--attention-backend FLASHINFER"
    "--kv-cache-dtype fp8"
    "--max-num-batched-tokens 8192"
)

# ==============================================================================
# minimax-m2-awq
# README Reference: Not explicitly documented, but based on model requirements
# ==============================================================================
MINIMAX_MODEL="QuantTrio/MiniMax-M2-AWQ"
MINIMAX_CONTAINER="vllm-node"
MINIMAX_ARGS=(
    "--port 8000"
    "--host 0.0.0.0"
    "--gpu-memory-utilization 0.7"
    "--max-model-len 128000"
    "--load-format fastsafetensors"
    "--enable-auto-tool-choice"
    "--tool-call-parser minimax_m2"
    "--reasoning-parser minimax_m2_append_think"
)

# ==============================================================================
# Cluster Mode Expected Arguments
# These are arguments that should appear ONLY in cluster mode
# Note: Tests use 2 nodes, so tensor_parallel = 2 (1 GPU per node)
# ==============================================================================

# glm-4.7-flash-awq cluster mode (no distributed backend - single GPU model)
GLM_FLASH_AWQ_CLUSTER_TP="1"

# openai-gpt-oss-120b cluster mode (2 nodes = tp 2)
GPT_OSS_CLUSTER_TP="2"
GPT_OSS_CLUSTER_ARGS=(
    "--distributed-executor-backend ray"
)

# minimax-m2-awq cluster mode (2 nodes = tp 2)
MINIMAX_CLUSTER_TP="2"
MINIMAX_CLUSTER_ARGS=(
    "--distributed-executor-backend ray"
)
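As a sketch of how these arrays can be consumed (the real checks live in tests/test_recipes.sh below; this standalone loop just illustrates the pattern, run from the repo root):

```bash
source tests/expected_commands.sh
output=$(./run-recipe.py glm-4.7-flash-awq --dry-run --solo 2>&1)
for arg in "${GLM_FLASH_AWQ_ARGS[@]}"; do
    echo "$output" | grep -qF -- "$arg" || echo "missing: $arg"
done
```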
859  tests/test_recipes.sh  (new executable file)
@@ -0,0 +1,859 @@
#!/bin/bash
#
# test_recipes.sh - Integration tests for run-recipe.py and launch-cluster.sh
#
# These tests use --dry-run mode to verify compatibility without actually
# running containers. Suitable for CI/CD pipelines.
#
# Usage:
#   ./tests/test_recipes.sh       # Run all tests
#   ./tests/test_recipes.sh -v    # Verbose output
#

set -e

SCRIPT_DIR="$(dirname "$(realpath "$0")")"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
VERBOSE="${1:-}"

# Load expected commands for README verification
source "$SCRIPT_DIR/expected_commands.sh"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Test counters
TESTS_PASSED=0
TESTS_FAILED=0
TESTS_SKIPPED=0

# Helper functions
log_test() {
    echo -e "${YELLOW}[TEST]${NC} $1"
}

log_pass() {
    echo -e "${GREEN}[PASS]${NC} $1"
    TESTS_PASSED=$((TESTS_PASSED + 1))
}

log_fail() {
    echo -e "${RED}[FAIL]${NC} $1"
    TESTS_FAILED=$((TESTS_FAILED + 1))
}

log_skip() {
    echo -e "${YELLOW}[SKIP]${NC} $1"
    TESTS_SKIPPED=$((TESTS_SKIPPED + 1))
}

log_verbose() {
    if [[ "$VERBOSE" == "-v" ]]; then
        echo "    $1"
    fi
}

# Check prerequisites
check_prerequisites() {
    log_test "Checking prerequisites..."

    if ! command -v python3 &> /dev/null; then
        log_fail "python3 not found"
        exit 1
    fi

    # Check Python version (integer comparison; bc would mis-parse "3.10" as 3.1)
    python_version=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
    py_major=$(python3 -c 'import sys; print(sys.version_info.major)')
    py_minor=$(python3 -c 'import sys; print(sys.version_info.minor)')
    if [[ "$py_major" -lt 3 ]] || [[ "$py_major" -eq 3 && "$py_minor" -lt 10 ]]; then
        log_fail "Python 3.10+ required, found $python_version"
        exit 1
    fi

    # Check PyYAML
    if ! python3 -c "import yaml" 2>/dev/null; then
        log_fail "PyYAML not installed"
        exit 1
    fi

    log_pass "Prerequisites OK (Python $python_version with PyYAML)"
}

# Test: run-recipe.py exists and is executable
test_run_recipe_exists() {
    log_test "run-recipe.py exists and is executable"

    if [[ -x "$PROJECT_DIR/run-recipe.py" ]]; then
        log_pass "run-recipe.py is executable"
    else
        log_fail "run-recipe.py not found or not executable"
    fi
}

# Test: launch-cluster.sh exists and is executable
test_launch_cluster_exists() {
    log_test "launch-cluster.sh exists and is executable"

    if [[ -x "$PROJECT_DIR/launch-cluster.sh" ]]; then
        log_pass "launch-cluster.sh is executable"
    else
        log_fail "launch-cluster.sh not found or not executable"
    fi
}

# Test: run-recipe.py --list works
test_list_recipes() {
    log_test "run-recipe.py --list"

    output=$("$PROJECT_DIR/run-recipe.py" --list 2>&1)

    if [[ $? -eq 0 ]] && echo "$output" | grep -q "Available recipes"; then
        log_pass "--list shows available recipes"
        log_verbose "Found recipes in output"
    else
        log_fail "--list failed or no recipes found"
        log_verbose "$output"
    fi
}

# Test: All recipes have required recipe_version field
test_recipe_version_required() {
    log_test "All recipes have required recipe_version field"

    local all_valid=true
    for recipe in "$PROJECT_DIR/recipes/"*.yaml; do
        if [[ -f "$recipe" ]]; then
            recipe_name=$(basename "$recipe")
            if ! grep -q "^recipe_version:" "$recipe"; then
                log_verbose "$recipe_name missing recipe_version"
                all_valid=false
            fi
        fi
    done

    if [[ "$all_valid" == "true" ]]; then
        log_pass "All recipes have recipe_version field"
    else
        log_fail "Some recipes missing recipe_version field"
    fi
}

# Test: All recipes load without errors
test_all_recipes_load() {
    log_test "All recipes load without errors"

    local all_valid=true
    for recipe in "$PROJECT_DIR/recipes/"*.yaml; do
        if [[ -f "$recipe" ]]; then
            recipe_name=$(basename "$recipe" .yaml)
            # Try to load recipe with --dry-run (will fail early if recipe is invalid)
            if ! "$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1 | grep -q "Error:"; then
                log_verbose "$recipe_name loads OK"
            else
                log_verbose "$recipe_name failed to load"
                all_valid=false
            fi
        fi
    done

    if [[ "$all_valid" == "true" ]]; then
        log_pass "All recipes load successfully"
    else
        log_fail "Some recipes failed to load"
    fi
}

# Test: Dry-run generates valid launch script
test_dry_run_generates_script() {
    log_test "Dry-run generates valid launch script"

    # Find first available recipe
    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)

    if echo "$output" | grep -q "#!/bin/bash" && echo "$output" | grep -q "vllm serve"; then
        log_pass "Dry-run generates bash script with vllm serve command"
    else
        log_fail "Dry-run output doesn't contain expected content"
        log_verbose "$output"
    fi
}

# Test: Solo mode sets tensor_parallel=1
test_solo_mode_tp1() {
    log_test "Solo mode sets tensor_parallel=1"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)

    # Check that -tp 1 is in the output (solo mode should set tp=1)
    if echo "$output" | grep -q "\-tp 1"; then
        log_pass "Solo mode correctly sets -tp 1"
    else
        log_fail "Solo mode did not set -tp 1"
        log_verbose "$output"
    fi
}

# Test: Solo mode removes --distributed-executor-backend ray
test_solo_mode_removes_ray() {
    log_test "Solo mode removes --distributed-executor-backend ray"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)

    # Check that --distributed-executor-backend is NOT in the output
    if ! echo "$output" | grep -q "\-\-distributed-executor-backend"; then
        log_pass "Solo mode correctly removes --distributed-executor-backend"
    else
        log_fail "Solo mode did not remove --distributed-executor-backend"
        log_verbose "$output"
    fi
}

# Test: Cluster mode preserves --distributed-executor-backend ray
test_cluster_mode_keeps_ray() {
    log_test "Cluster mode preserves --distributed-executor-backend ray"

    # Use minimax-m2-awq which explicitly has --distributed-executor-backend ray
    if [[ ! -f "$PROJECT_DIR/recipes/minimax-m2-awq.yaml" ]]; then
        log_skip "minimax-m2-awq.yaml not found"
        return
    fi

    output=$("$PROJECT_DIR/run-recipe.py" minimax-m2-awq --dry-run -n "192.168.1.1,192.168.1.2" 2>&1)

    # Check that --distributed-executor-backend IS in the output for cluster mode
    if echo "$output" | grep -q "\-\-distributed-executor-backend ray"; then
        log_pass "Cluster mode correctly preserves --distributed-executor-backend ray"
    else
        log_fail "Cluster mode did not preserve --distributed-executor-backend"
        log_verbose "$output"
    fi
}

# Test: CLI overrides work (--port)
test_cli_override_port() {
    log_test "CLI override --port works"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --port 9999 2>&1)

    if echo "$output" | grep -q "\-\-port 9999"; then
        log_pass "--port override correctly applied"
    else
        log_fail "--port override not found in output"
        log_verbose "$output"
    fi
}

# Test: launch-cluster.sh --help works
test_launch_cluster_help() {
    log_test "launch-cluster.sh --help"

    output=$("$PROJECT_DIR/launch-cluster.sh" --help 2>&1 || true)

    if echo "$output" | grep -q "Usage:"; then
        log_pass "--help shows usage information"
    else
        log_fail "--help did not show usage"
        log_verbose "$output"
    fi
}

# Test: launch-cluster.sh references examples/ not profiles/
test_launch_cluster_examples_path() {
    log_test "launch-cluster.sh references examples/ directory"

    if grep -q "examples/" "$PROJECT_DIR/launch-cluster.sh"; then
        log_pass "launch-cluster.sh references examples/"
    else
        log_fail "launch-cluster.sh does not reference examples/"
    fi

    if grep -q "profiles/" "$PROJECT_DIR/launch-cluster.sh"; then
        log_fail "launch-cluster.sh still references profiles/"
    fi
}

# Test: Unsupported recipe version shows warning
test_unsupported_recipe_version() {
    log_test "Unsupported recipe_version shows warning"

    # Create a temporary recipe with unsupported version
    temp_recipe=$(mktemp)
    cat > "$temp_recipe" << 'EOF'
recipe_version: "999"
name: Test Recipe
container: test-container
command: echo "test"
EOF

    output=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run --solo 2>&1)
    rm -f "$temp_recipe"

    if echo "$output" | grep -q "Warning.*schema version"; then
        log_pass "Unsupported recipe_version shows warning"
    else
        log_fail "No warning for unsupported recipe_version"
        log_verbose "$output"
    fi
}

# Test: Missing recipe_version fails
test_missing_recipe_version_fails() {
    log_test "Missing recipe_version field fails"

    # Create a temporary recipe without recipe_version
    temp_recipe=$(mktemp)
    cat > "$temp_recipe" << 'EOF'
name: Test Recipe
container: test-container
command: echo "test"
EOF

    output=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run --solo 2>&1 || true)
    rm -f "$temp_recipe"

    if echo "$output" | grep -q "Error.*recipe_version"; then
        log_pass "Missing recipe_version correctly fails"
    else
        log_fail "Missing recipe_version did not fail as expected"
        log_verbose "$output"
    fi
}

# Test: cluster_only recipe fails in solo mode
test_cluster_only_fails_solo() {
    log_test "cluster_only recipe fails in solo mode"

    # Create a temporary cluster_only recipe
    temp_recipe=$(mktemp)
    cat > "$temp_recipe" << 'EOF'
recipe_version: "1"
name: Cluster Only Test
container: test-container
cluster_only: true
command: echo "test"
EOF

    output=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run --solo 2>&1 || true)
    rm -f "$temp_recipe"

    if echo "$output" | grep -q "requires cluster mode"; then
        log_pass "cluster_only recipe correctly fails in solo mode"
    else
        log_fail "cluster_only recipe did not fail in solo mode"
        log_verbose "$output"
    fi
}

# ==============================================================================
# Launch-cluster.sh Command Line Verification Tests
# ==============================================================================
# These tests verify that the dry-run output contains the expected
# launch-cluster.sh command line arguments matching the recipe configuration.

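# For orientation: extract_launch_cmd below pulls whatever follows the
# "launch-cluster.sh is called with:" marker in the dry-run output. An
# illustrative (not exact) shape of that command, composed only of the flags
# asserted in the tests below:
#   ./launch-cluster.sh --solo -t <container-image> --apply-mod <mod-path> --launch-script <tmp-script>
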
# Helper: Extract launch-cluster command from dry-run output
extract_launch_cmd() {
    echo "$1" | grep -A5 "launch-cluster.sh is called with:" | grep -v "launch-cluster.sh is called with:" | tr '\n' ' '
}

# Test: Solo mode generates --solo flag in launch-cluster command
test_launch_cmd_solo_flag() {
    log_test "Launch command includes --solo flag in solo mode"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q "\-\-solo"; then
        log_pass "Launch command includes --solo flag"
    else
        log_fail "Launch command missing --solo flag"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: Cluster mode generates -n flag with nodes
test_launch_cmd_nodes_flag() {
    log_test "Launch command includes -n flag with nodes in cluster mode"

    output=$("$PROJECT_DIR/run-recipe.py" minimax-m2-awq --dry-run -n "10.0.0.1,10.0.0.2" 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q "\-n 10.0.0.1,10.0.0.2"; then
        log_pass "Launch command includes -n with correct nodes"
    else
        log_fail "Launch command missing or incorrect -n flag"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: Container image from recipe is passed to launch-cluster
test_launch_cmd_container_image() {
    log_test "Launch command includes correct container image (-t)"

    # Use openai-gpt-oss-120b which has a specific container name
    if [[ ! -f "$PROJECT_DIR/recipes/openai-gpt-oss-120b.yaml" ]]; then
        log_skip "openai-gpt-oss-120b.yaml not found"
        return
    fi

    output=$("$PROJECT_DIR/run-recipe.py" openai-gpt-oss-120b --dry-run --solo 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    # Check the container is vllm-node-mxfp4 (from the recipe)
    if echo "$launch_cmd" | grep -q "\-t vllm-node-mxfp4"; then
        log_pass "Launch command includes correct container image"
    else
        log_fail "Launch command has wrong container image"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: Mods from recipe are passed as --apply-mod
test_launch_cmd_mods() {
    log_test "Launch command includes --apply-mod for recipe mods"

    # Use glm-4.7-flash-awq which has a mod
    if [[ ! -f "$PROJECT_DIR/recipes/glm-4.7-flash-awq.yaml" ]]; then
        log_skip "glm-4.7-flash-awq.yaml not found"
        return
    fi

    output=$("$PROJECT_DIR/run-recipe.py" glm-4.7-flash-awq --dry-run --solo 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q "\-\-apply-mod"; then
        log_pass "Launch command includes --apply-mod for mods"
    else
        log_fail "Launch command missing --apply-mod"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: Daemon mode flag is passed through
test_launch_cmd_daemon_flag() {
    log_test "Launch command includes -d flag in daemon mode"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -d 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q "\-d"; then
        log_pass "Launch command includes -d flag"
    else
        log_fail "Launch command missing -d flag"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: NCCL debug level is passed through
test_launch_cmd_nccl_debug() {
    log_test "Launch command includes --nccl-debug when specified"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --nccl-debug INFO 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q "\-\-nccl-debug INFO"; then
        log_pass "Launch command includes --nccl-debug INFO"
    else
        log_fail "Launch command missing --nccl-debug"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: --launch-script is always included
test_launch_cmd_launch_script() {
    log_test "Launch command includes --launch-script"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q "\-\-launch-script"; then
        log_pass "Launch command includes --launch-script"
    else
        log_fail "Launch command missing --launch-script"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: Container override (-t CLI) takes precedence
test_launch_cmd_container_override() {
    log_test "CLI container override (-t) takes precedence"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -t my-custom-image 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q "\-t my-custom-image"; then
        log_pass "Container override correctly applied"
    else
        log_fail "Container override not applied"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: Cluster mode does NOT include --solo flag
test_launch_cmd_no_solo_in_cluster() {
    log_test "Launch command does NOT include --solo in cluster mode"

    output=$("$PROJECT_DIR/run-recipe.py" minimax-m2-awq --dry-run -n "10.0.0.1,10.0.0.2" 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    # A single negative check is enough: the flag must simply be absent
    if ! echo "$launch_cmd" | grep -q "\-\-solo"; then
        log_pass "Cluster mode correctly omits --solo flag"
    else
        log_fail "Cluster mode incorrectly includes --solo flag"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# ==============================================================================
# README Documentation Verification Tests
# ==============================================================================
# These tests verify that recipe dry-run output matches the expected commands
# documented in README.md. Expected values are defined in expected_commands.sh

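# Note: expected_commands.sh is assumed (hypothetical shape shown here; the real
# file is the source of truth) to define one variable set per recipe, e.g.:
#   GLM_FLASH_AWQ_MODEL="<model-id>"
#   GLM_FLASH_AWQ_CONTAINER="<container-image>"
#   GLM_FLASH_AWQ_ARGS=("<vllm-flag> <value>" "<vllm-flag> <value>")
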
# Helper: Extract the generated launch script from dry-run output
extract_vllm_command() {
    # Extract lines between "Generated Launch Script" and "What would be executed"
    echo "$1" | sed -n '/=== Generated Launch Script ===/,/=== What would be executed ===/p' | grep -v "===" | grep -v "^#" | grep -v "^$"
}

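# Assumed layout of the dry-run output that both extractors rely on
# (illustrative only; the markers are taken from the greps in this file):
#   === Generated Launch Script ===
#   #!/bin/bash
#   vllm serve ...
#   === What would be executed ===
#   launch-cluster.sh is called with:
#   ...
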
# Helper: Verify a recipe contains all expected arguments
verify_recipe_args() {
    local recipe_name="$1"
    local expected_model="$2"
    local expected_container="$3"
    shift 3
    local expected_args=("$@")

    log_test "README match: $recipe_name"

    if [[ ! -f "$PROJECT_DIR/recipes/${recipe_name}.yaml" ]]; then
        log_skip "${recipe_name}.yaml not found"
        return
    fi

    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)
    vllm_cmd=$(extract_vllm_command "$output")
    launch_cmd=$(extract_launch_cmd "$output")

    local all_passed=true
    local missing_args=()

    # Check model name
    if ! echo "$vllm_cmd" | grep -q "$expected_model"; then
        missing_args+=("model: $expected_model")
        all_passed=false
    fi

    # Check container
    if ! echo "$launch_cmd" | grep -q "\-t $expected_container"; then
        missing_args+=("container: $expected_container")
        all_passed=false
    fi

    # Check each expected argument
    for arg in "${expected_args[@]}"; do
        # Handle arguments that may have slight formatting differences
        # Extract the flag and value separately for flexible matching
        local flag=$(echo "$arg" | awk '{print $1}')
        local value=$(echo "$arg" | cut -d' ' -f2-)

        # Use grep -F for fixed string matching (avoids -- being treated as grep options)
        if ! echo "$vllm_cmd" | grep -qF -- "$flag"; then
            missing_args+=("$arg")
            all_passed=false
        elif [[ -n "$value" ]] && [[ "$value" != "$flag" ]]; then
            # Check if value is present (might be on next line due to formatting)
            if ! echo "$vllm_cmd" | grep -qF -- "$value"; then
                missing_args+=("$arg (flag present, value mismatch)")
                all_passed=false
            fi
        fi
    done

    if [[ "$all_passed" == "true" ]]; then
        log_pass "README match: $recipe_name - all expected arguments present"
    else
        log_fail "README match: $recipe_name - missing arguments"
        for missing in "${missing_args[@]}"; do
            log_verbose "  Missing: $missing"
        done
        log_verbose "  vLLM command: $vllm_cmd"
    fi
}

# Test: glm-4.7-flash-awq matches README documentation
test_readme_glm_flash_awq() {
    verify_recipe_args "glm-4.7-flash-awq" \
        "$GLM_FLASH_AWQ_MODEL" \
        "$GLM_FLASH_AWQ_CONTAINER" \
        "${GLM_FLASH_AWQ_ARGS[@]}"
}

# Test: openai-gpt-oss-120b matches README documentation
test_readme_gpt_oss() {
    verify_recipe_args "openai-gpt-oss-120b" \
        "$GPT_OSS_MODEL" \
        "$GPT_OSS_CONTAINER" \
        "${GPT_OSS_ARGS[@]}"
}

# Test: minimax-m2-awq matches expected configuration
test_readme_minimax() {
    verify_recipe_args "minimax-m2-awq" \
        "$MINIMAX_MODEL" \
        "$MINIMAX_CONTAINER" \
        "${MINIMAX_ARGS[@]}"
}

# Test: glm-4.7-flash-awq includes correct mod
test_readme_glm_flash_mod() {
    log_test "README match: glm-4.7-flash-awq mod path"

    if [[ ! -f "$PROJECT_DIR/recipes/glm-4.7-flash-awq.yaml" ]]; then
        log_skip "glm-4.7-flash-awq.yaml not found"
        return
    fi

    output=$("$PROJECT_DIR/run-recipe.py" glm-4.7-flash-awq --dry-run --solo 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q "$GLM_FLASH_AWQ_MOD"; then
        log_pass "README match: glm-4.7-flash-awq has correct mod path"
    else
        log_fail "README match: glm-4.7-flash-awq missing expected mod: $GLM_FLASH_AWQ_MOD"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Helper: Verify cluster mode specific arguments
verify_cluster_args() {
    local recipe_name="$1"
    local expected_tp="$2"
    shift 2
    local expected_args=("$@")

    log_test "README match (cluster): $recipe_name"

    if [[ ! -f "$PROJECT_DIR/recipes/${recipe_name}.yaml" ]]; then
        log_skip "${recipe_name}.yaml not found"
        return
    fi

    # Use fake nodes for cluster mode
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run -n "10.0.0.1,10.0.0.2" 2>&1)
    vllm_cmd=$(extract_vllm_command "$output")

    local all_passed=true
    local missing_args=()

    # Check tensor parallel
    if ! echo "$vllm_cmd" | grep -qE "(--tensor-parallel-size|-tp) $expected_tp"; then
        missing_args+=("tensor_parallel: $expected_tp")
        all_passed=false
    fi

    # Check cluster-specific arguments
    for arg in "${expected_args[@]}"; do
        if ! echo "$vllm_cmd" | grep -qF -- "$arg"; then
            missing_args+=("$arg")
            all_passed=false
        fi
    done

    if [[ "$all_passed" == "true" ]]; then
        log_pass "README match (cluster): $recipe_name - cluster args correct"
    else
        log_fail "README match (cluster): $recipe_name - missing cluster arguments"
        for missing in "${missing_args[@]}"; do
            log_verbose "  Missing: $missing"
        done
        log_verbose "  vLLM command: $vllm_cmd"
    fi
}

# Test: openai-gpt-oss-120b cluster mode has correct tensor_parallel and ray backend
test_readme_gpt_oss_cluster() {
    verify_cluster_args "openai-gpt-oss-120b" \
        "$GPT_OSS_CLUSTER_TP" \
        "${GPT_OSS_CLUSTER_ARGS[@]}"
}

# Test: minimax-m2-awq cluster mode has correct tensor_parallel and ray backend
test_readme_minimax_cluster() {
    verify_cluster_args "minimax-m2-awq" \
        "$MINIMAX_CLUSTER_TP" \
        "${MINIMAX_CLUSTER_ARGS[@]}"
}

# Test: glm-4.7-flash-awq cluster mode stays at tp=1 (single GPU model)
test_readme_glm_flash_cluster() {
    log_test "README match (cluster): glm-4.7-flash-awq stays tp=1"

    if [[ ! -f "$PROJECT_DIR/recipes/glm-4.7-flash-awq.yaml" ]]; then
        log_skip "glm-4.7-flash-awq.yaml not found"
        return
    fi

    # Even in cluster mode, this model uses tp=1
    output=$("$PROJECT_DIR/run-recipe.py" glm-4.7-flash-awq --dry-run -n "10.0.0.1,10.0.0.2" 2>&1)
    vllm_cmd=$(extract_vllm_command "$output")

    if echo "$vllm_cmd" | grep -qE "(--tensor-parallel-size|-tp) 1"; then
        log_pass "README match (cluster): glm-4.7-flash-awq correctly keeps tp=1"
    else
        log_fail "README match (cluster): glm-4.7-flash-awq should have tp=1"
        log_verbose "  vLLM command: $vllm_cmd"
    fi
}

# Run all tests
main() {
    echo "=============================================="
    echo " run-recipe.py Integration Tests"
    echo "=============================================="
    echo ""

    cd "$PROJECT_DIR"

    check_prerequisites
    echo ""

    # File existence tests
    test_run_recipe_exists
    test_launch_cluster_exists
    echo ""

    # Basic functionality tests
    test_list_recipes
    test_recipe_version_required
    test_all_recipes_load
    echo ""

    # Dry-run tests
    test_dry_run_generates_script
    test_solo_mode_tp1
    test_solo_mode_removes_ray
    test_cluster_mode_keeps_ray
    test_cli_override_port
    echo ""

    # launch-cluster.sh command line verification tests
    echo "--- Launch Command Verification ---"
    test_launch_cmd_solo_flag
    test_launch_cmd_nodes_flag
    test_launch_cmd_container_image
    test_launch_cmd_mods
    test_launch_cmd_daemon_flag
    test_launch_cmd_nccl_debug
    test_launch_cmd_launch_script
    test_launch_cmd_container_override
    test_launch_cmd_no_solo_in_cluster
    echo ""

    # README documentation verification tests
    echo "--- README Documentation Verification (Solo Mode) ---"
    test_readme_glm_flash_awq
    test_readme_gpt_oss
    test_readme_minimax
    test_readme_glm_flash_mod
    echo ""

    # Cluster mode documentation verification tests
    echo "--- README Documentation Verification (Cluster Mode) ---"
    test_readme_gpt_oss_cluster
    test_readme_minimax_cluster
    test_readme_glm_flash_cluster
    echo ""

    # launch-cluster.sh tests
    test_launch_cluster_help
    test_launch_cluster_examples_path
    echo ""

    # Validation tests
    test_unsupported_recipe_version
    test_missing_recipe_version_fails
    test_cluster_only_fails_solo
    echo ""

    # Summary
    echo "=============================================="
    echo " Test Summary"
    echo "=============================================="
    echo -e " ${GREEN}Passed:${NC} $TESTS_PASSED"
    echo -e " ${RED}Failed:${NC} $TESTS_FAILED"
    echo -e " ${YELLOW}Skipped:${NC} $TESTS_SKIPPED"
    echo "=============================================="

    if [[ $TESTS_FAILED -gt 0 ]]; then
        exit 1
    fi
    exit 0
}

main "$@"