From 28ba6090fcc86ce53583d87e55e6d88a4bf6afc6 Mon Sep 17 00:00:00 2001 From: Raphael Amorim Date: Tue, 3 Feb 2026 17:32:59 -0500 Subject: [PATCH] Adding suggestions from Eugr and unit tests --- .github/workflows/test-recipes.yml | 59 ++ {profiles => examples}/README.md | 8 +- .../example-vllm-minimax.sh | 0 {profiles => examples}/vllm-glm-4.7-nvfp4.sh | 0 .../vllm-openai-gpt-oss-120b.sh | 0 launch-cluster.sh | 29 +- run-recipe.py | 1 + tests/expected_commands.sh | 89 ++ tests/test_recipes.sh | 859 ++++++++++++++++++ 9 files changed, 1024 insertions(+), 21 deletions(-) create mode 100644 .github/workflows/test-recipes.yml rename {profiles => examples}/README.md (90%) rename {profiles => examples}/example-vllm-minimax.sh (100%) rename {profiles => examples}/vllm-glm-4.7-nvfp4.sh (100%) rename {profiles => examples}/vllm-openai-gpt-oss-120b.sh (100%) create mode 100644 tests/expected_commands.sh create mode 100755 tests/test_recipes.sh diff --git a/.github/workflows/test-recipes.yml b/.github/workflows/test-recipes.yml new file mode 100644 index 0000000..0a0a0e1 --- /dev/null +++ b/.github/workflows/test-recipes.yml @@ -0,0 +1,59 @@ +name: Recipe Tests + +on: + push: + branches: [ main, profiles ] + paths: + - 'run-recipe.py' + - 'run-recipe.sh' + - 'launch-cluster.sh' + - 'recipes/**' + - 'tests/**' + - '.github/workflows/test-recipes.yml' + pull_request: + branches: [ main, profiles ] + paths: + - 'run-recipe.py' + - 'run-recipe.sh' + - 'launch-cluster.sh' + - 'recipes/**' + - 'tests/**' + - '.github/workflows/test-recipes.yml' + +jobs: + test: + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: ['3.10', '3.11', '3.12'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pyyaml + + - name: Make scripts executable + run: | + chmod 
+x run-recipe.py run-recipe.sh launch-cluster.sh + chmod +x tests/test_recipes.sh + + - name: Run recipe integration tests + run: | + ./tests/test_recipes.sh -v + + - name: Verify all recipes with dry-run + run: | + for recipe in recipes/*.yaml; do + name=$(basename "$recipe" .yaml) + echo "Testing recipe: $name" + ./run-recipe.py "$name" --dry-run --solo || exit 1 + done diff --git a/profiles/README.md b/examples/README.md similarity index 90% rename from profiles/README.md rename to examples/README.md index 470e9f8..4d84af9 100644 --- a/profiles/README.md +++ b/examples/README.md @@ -1,6 +1,8 @@ -# Launch Scripts +# Example Launch Scripts -This directory contains bash scripts that can be executed in the container using the `--launch-script` option. Launch scripts are simple, executable bash files that run directly inside the container. +This directory contains example bash scripts that demonstrate how to use the `--launch-script` option directly with `launch-cluster.sh`. + +**Note:** For most use cases, the recipe system (`./run-recipe.sh`) is the recommended approach. These examples are provided for reference and for advanced users who need direct control over the launch process. ## Why Launch Scripts? 
@@ -12,7 +14,7 @@ This directory contains bash scripts that can be executed in the container using ## Usage ```bash -# Use a launch script by name (looks in profiles/ directory) +# Use a launch script by name (looks in examples/ directory) ./launch-cluster.sh --launch-script example-vllm-minimax # Use a launch script by filename diff --git a/profiles/example-vllm-minimax.sh b/examples/example-vllm-minimax.sh similarity index 100% rename from profiles/example-vllm-minimax.sh rename to examples/example-vllm-minimax.sh diff --git a/profiles/vllm-glm-4.7-nvfp4.sh b/examples/vllm-glm-4.7-nvfp4.sh similarity index 100% rename from profiles/vllm-glm-4.7-nvfp4.sh rename to examples/vllm-glm-4.7-nvfp4.sh diff --git a/profiles/vllm-openai-gpt-oss-120b.sh b/examples/vllm-openai-gpt-oss-120b.sh similarity index 100% rename from profiles/vllm-openai-gpt-oss-120b.sh rename to examples/vllm-openai-gpt-oss-120b.sh diff --git a/launch-cluster.sh b/launch-cluster.sh index 2851701..a49cbe1 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -43,7 +43,7 @@ usage() { echo " -e, --env Environment variable to pass to container (e.g. -e VAR=val)" echo " --nccl-debug NCCL debug level (Optional, one of: VERSION, WARN, INFO, TRACE). If no level is provided, defaults to INFO." 
echo " --apply-mod Path to directory or zip file containing run.sh to apply before launch (Can be specified multiple times)" - echo " --launch-script Path to bash script to execute in the container (from profiles/ directory or absolute path)" + echo " --launch-script Path to bash script to execute in the container (from examples/ directory or absolute path)" echo " --check-config Check configuration and auto-detection without launching" echo " --solo Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster" echo " -d Daemon mode (only for 'start' action)" @@ -51,7 +51,7 @@ usage() { echo " command Command to run (only for 'exec' action)" echo "" echo "Launch Script Usage:" - echo " $0 --launch-script profiles/my-script.sh # Script copied to container and executed" + echo " $0 --launch-script examples/my-script.sh # Script copied to container and executed" echo " $0 --launch-script /path/to/script.sh # Uses absolute path to script" exit 1 } @@ -120,18 +120,18 @@ if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then # Check if it's an absolute path or relative path that exists if [[ -f "$LAUNCH_SCRIPT_PATH" ]]; then LAUNCH_SCRIPT_PATH=$(realpath "$LAUNCH_SCRIPT_PATH") - # Check if it's just a filename, look in profiles/ directory - elif [[ -f "$SCRIPT_DIR/profiles/$LAUNCH_SCRIPT_PATH" ]]; then - LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/profiles/$LAUNCH_SCRIPT_PATH" + # Check if it's just a filename, look in examples/ directory + elif [[ -f "$SCRIPT_DIR/examples/$LAUNCH_SCRIPT_PATH" ]]; then + LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/examples/$LAUNCH_SCRIPT_PATH" # Check if it's a name without .sh extension - elif [[ -f "$SCRIPT_DIR/profiles/${LAUNCH_SCRIPT_PATH}.sh" ]]; then - LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/profiles/${LAUNCH_SCRIPT_PATH}.sh" + elif [[ -f "$SCRIPT_DIR/examples/${LAUNCH_SCRIPT_PATH}.sh" ]]; then + LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/examples/${LAUNCH_SCRIPT_PATH}.sh" else echo "Error: Launch script '$LAUNCH_SCRIPT_PATH' not found." 
echo "Searched in:" echo " - $LAUNCH_SCRIPT_PATH" - echo " - $SCRIPT_DIR/profiles/$LAUNCH_SCRIPT_PATH" - echo " - $SCRIPT_DIR/profiles/${LAUNCH_SCRIPT_PATH}.sh" + echo " - $SCRIPT_DIR/examples/$LAUNCH_SCRIPT_PATH" + echo " - $SCRIPT_DIR/examples/${LAUNCH_SCRIPT_PATH}.sh" exit 1 fi @@ -578,17 +578,10 @@ start_cluster() { done fi - # Copy launch script if specified + # Copy launch script to head node only (workers don't need it - they just run Ray) if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then - echo "Copying launch script to cluster nodes..." - - # Copy to Head + echo "Copying launch script to head node..." copy_launch_script_to_container "$HEAD_IP" "$CONTAINER_NAME" "true" "$LAUNCH_SCRIPT_PATH" - - # Copy to Workers - for worker in "${PEER_NODES[@]}"; do - copy_launch_script_to_container "$worker" "$CONTAINER_NAME" "false" "$LAUNCH_SCRIPT_PATH" - done fi if [[ "$SOLO_MODE" == "false" ]]; then diff --git a/run-recipe.py b/run-recipe.py index 16038ee..43b4f2a 100755 --- a/run-recipe.py +++ b/run-recipe.py @@ -75,6 +75,7 @@ RECIPE VERSION HISTORY: RELATED FILES: - run-recipe.sh: Bash wrapper that ensures Python deps are installed - recipes/*.yaml: Recipe definitions + - examples/: Example launch scripts for direct use with launch-cluster.sh - launch-cluster.sh: Low-level container orchestration - build-and-copy.sh: Docker build and distribution - hf-download.sh: HuggingFace model download and sync diff --git a/tests/expected_commands.sh b/tests/expected_commands.sh new file mode 100644 index 0000000..7ee60f1 --- /dev/null +++ b/tests/expected_commands.sh @@ -0,0 +1,89 @@ +# Expected vLLM serve arguments for each recipe +# This file is used by test_recipes.sh to verify recipes match README documentation +# +# Format: Each recipe has a section with expected arguments +# Tests will verify these arguments appear in the dry-run output +# +# IMPORTANT: Keep this in sync with README.md documentation +# When updating recipes, update both README.md and this file + +# 
============================================================================== +# glm-4.7-flash-awq +# README Reference: Lines 186-198 (solo) and 203-218 (cluster) +# ============================================================================== +GLM_FLASH_AWQ_MODEL="cyankiwi/GLM-4.7-Flash-AWQ-4bit" +GLM_FLASH_AWQ_CONTAINER="vllm-node-tf5" +GLM_FLASH_AWQ_MOD="mods/fix-glm-4.7-flash-AWQ" +GLM_FLASH_AWQ_ARGS=( + "--tool-call-parser glm47" + "--reasoning-parser glm45" + "--enable-auto-tool-choice" + "--served-model-name glm-4.7-flash" + "--max-model-len 202752" + "--max-num-batched-tokens 4096" + "--max-num-seqs 64" + "--gpu-memory-utilization 0.7" + "--port 8888" + "--host 0.0.0.0" +) + +# ============================================================================== +# openai-gpt-oss-120b +# README Reference: Lines 244-257 (solo) and 264-280 (cluster) +# ============================================================================== +GPT_OSS_MODEL="openai/gpt-oss-120b" +GPT_OSS_CONTAINER="vllm-node-mxfp4" +GPT_OSS_ARGS=( + "--port 8888" + "--host 0.0.0.0" + "--enable-auto-tool-choice" + "--tool-call-parser openai" + "--reasoning-parser openai_gptoss" + "--gpu-memory-utilization 0.7" + "--enable-prefix-caching" + "--load-format fastsafetensors" + "--quantization mxfp4" + "--mxfp4-backend CUTLASS" + "--mxfp4-layers moe,qkv,o,lm_head" + "--attention-backend FLASHINFER" + "--kv-cache-dtype fp8" + "--max-num-batched-tokens 8192" +) + +# ============================================================================== +# minimax-m2-awq +# README Reference: Not explicitly documented, but based on model requirements +# ============================================================================== +MINIMAX_MODEL="QuantTrio/MiniMax-M2-AWQ" +MINIMAX_CONTAINER="vllm-node" +MINIMAX_ARGS=( + "--port 8000" + "--host 0.0.0.0" + "--gpu-memory-utilization 0.7" + "--max-model-len 128000" + "--load-format fastsafetensors" + "--enable-auto-tool-choice" + "--tool-call-parser minimax_m2" + 
"--reasoning-parser minimax_m2_append_think" +) + +# ============================================================================== +# Cluster Mode Expected Arguments +# These are arguments that should appear ONLY in cluster mode +# Note: Tests use 2 nodes, so tensor_parallel = 2 (1 GPU per node) +# ============================================================================== + +# glm-4.7-flash-awq cluster mode (no distributed backend - single GPU model) +GLM_FLASH_AWQ_CLUSTER_TP="1" + +# openai-gpt-oss-120b cluster mode (2 nodes = tp 2) +GPT_OSS_CLUSTER_TP="2" +GPT_OSS_CLUSTER_ARGS=( + "--distributed-executor-backend ray" +) + +# minimax-m2-awq cluster mode (2 nodes = tp 2) +MINIMAX_CLUSTER_TP="2" +MINIMAX_CLUSTER_ARGS=( + "--distributed-executor-backend ray" +) diff --git a/tests/test_recipes.sh b/tests/test_recipes.sh new file mode 100755 index 0000000..6e44e26 --- /dev/null +++ b/tests/test_recipes.sh @@ -0,0 +1,859 @@ +#!/bin/bash +# +# test_recipes.sh - Integration tests for run-recipe.py and launch-cluster.sh +# +# These tests use --dry-run mode to verify compatibility without actually +# running containers. Suitable for CI/CD pipelines. 
+#
+# Usage:
+#   ./tests/test_recipes.sh      # Run all tests
+#   ./tests/test_recipes.sh -v   # Verbose output
+#
+
+set -e
+
+SCRIPT_DIR="$(dirname "$(realpath "$0")")"
+PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
+VERBOSE="${1:-}"
+
+# Load expected commands for README verification
+source "$SCRIPT_DIR/expected_commands.sh"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Test counters
+TESTS_PASSED=0
+TESTS_FAILED=0
+TESTS_SKIPPED=0
+
+# Helper functions
+log_test() {
+    echo -e "${YELLOW}[TEST]${NC} $1"
+}
+
+log_pass() {
+    echo -e "${GREEN}[PASS]${NC} $1"
+    TESTS_PASSED=$((TESTS_PASSED + 1))
+}
+
+log_fail() {
+    echo -e "${RED}[FAIL]${NC} $1"
+    TESTS_FAILED=$((TESTS_FAILED + 1))
+}
+
+log_skip() {
+    echo -e "${YELLOW}[SKIP]${NC} $1"
+    TESTS_SKIPPED=$((TESTS_SKIPPED + 1))
+}
+
+log_verbose() {
+    if [[ "$VERBOSE" == "-v" ]]; then
+        echo "    $1"
+    fi
+}
+
+# Check prerequisites
+check_prerequisites() {
+    log_test "Checking prerequisites..."
+
+    if ! command -v python3 &> /dev/null; then
+        log_fail "python3 not found"
+        exit 1
+    fi
+
+    # Check Python version (tuple compare; `bc` would read "3.10" as 3.1)
+    python_version=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
+    if ! python3 -c 'import sys; sys.exit(0 if sys.version_info >= (3, 10) else 1)'; then
+        log_fail "Python 3.10+ required, found $python_version"
+        exit 1
+    fi
+
+    # Check PyYAML
+    if ! 
python3 -c "import yaml" 2>/dev/null; then
+        log_fail "PyYAML not installed"
+        exit 1
+    fi
+
+    log_pass "Prerequisites OK (Python $python_version with PyYAML)"
+}
+
+# Test: run-recipe.py exists and is executable
+test_run_recipe_exists() {
+    log_test "run-recipe.py exists and is executable"
+
+    if [[ -x "$PROJECT_DIR/run-recipe.py" ]]; then
+        log_pass "run-recipe.py is executable"
+    else
+        log_fail "run-recipe.py not found or not executable"
+    fi
+}
+
+# Test: launch-cluster.sh exists and is executable
+test_launch_cluster_exists() {
+    log_test "launch-cluster.sh exists and is executable"
+
+    if [[ -x "$PROJECT_DIR/launch-cluster.sh" ]]; then
+        log_pass "launch-cluster.sh is executable"
+    else
+        log_fail "launch-cluster.sh not found or not executable"
+    fi
+}
+
+# Test: run-recipe.py --list works
+test_list_recipes() {
+    log_test "run-recipe.py --list"
+
+    # `if` guard keeps set -e from aborting the suite when --list fails
+    if output=$("$PROJECT_DIR/run-recipe.py" --list 2>&1) &&
+        echo "$output" | grep -q "Available recipes"; then
+        log_pass "--list shows available recipes"
+        log_verbose "Found recipes in output"
+    else
+        log_fail "--list failed or no recipes found"
+        log_verbose "$output"
+    fi
+}
+
+# Test: All recipes have required recipe_version field
+test_recipe_version_required() {
+    log_test "All recipes have required recipe_version field"
+
+    local all_valid=true
+    for recipe in "$PROJECT_DIR/recipes/"*.yaml; do
+        if [[ -f "$recipe" ]]; then
+            recipe_name=$(basename "$recipe")
+            if ! 
grep -q "^recipe_version:" "$recipe"; then + log_verbose "$recipe_name missing recipe_version" + all_valid=false + fi + fi + done + + if [[ "$all_valid" == "true" ]]; then + log_pass "All recipes have recipe_version field" + else + log_fail "Some recipes missing recipe_version field" + fi +} + +# Test: All recipes load without errors +test_all_recipes_load() { + log_test "All recipes load without errors" + + local all_valid=true + for recipe in "$PROJECT_DIR/recipes/"*.yaml; do + if [[ -f "$recipe" ]]; then + recipe_name=$(basename "$recipe" .yaml) + # Try to load recipe with --dry-run (will fail early if recipe is invalid) + if ! "$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1 | grep -q "Error:"; then + log_verbose "$recipe_name loads OK" + else + log_verbose "$recipe_name failed to load" + all_valid=false + fi + fi + done + + if [[ "$all_valid" == "true" ]]; then + log_pass "All recipes load successfully" + else + log_fail "Some recipes failed to load" + fi +} + +# Test: Dry-run generates valid launch script +test_dry_run_generates_script() { + log_test "Dry-run generates valid launch script" + + # Find first available recipe + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1) + + if echo "$output" | grep -q "#!/bin/bash" && echo "$output" | grep -q "vllm serve"; then + log_pass "Dry-run generates bash script with vllm serve command" + else + log_fail "Dry-run output doesn't contain expected content" + log_verbose "$output" + fi +} + +# Test: Solo mode sets tensor_parallel=1 +test_solo_mode_tp1() { + log_test "Solo mode sets tensor_parallel=1" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + 
recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1) + + # Check that -tp 1 is in the output (solo mode should set tp=1) + if echo "$output" | grep -q "\-tp 1"; then + log_pass "Solo mode correctly sets -tp 1" + else + log_fail "Solo mode did not set -tp 1" + log_verbose "$output" + fi +} + +# Test: Solo mode removes --distributed-executor-backend ray +test_solo_mode_removes_ray() { + log_test "Solo mode removes --distributed-executor-backend ray" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1) + + # Check that --distributed-executor-backend is NOT in the output + if ! echo "$output" | grep -q "\-\-distributed-executor-backend"; then + log_pass "Solo mode correctly removes --distributed-executor-backend" + else + log_fail "Solo mode did not remove --distributed-executor-backend" + log_verbose "$output" + fi +} + +# Test: Cluster mode preserves --distributed-executor-backend ray +test_cluster_mode_keeps_ray() { + log_test "Cluster mode preserves --distributed-executor-backend ray" + + # Use minimax-m2-awq which explicitly has --distributed-executor-backend ray + if [[ ! 
-f "$PROJECT_DIR/recipes/minimax-m2-awq.yaml" ]]; then + log_skip "minimax-m2-awq.yaml not found" + return + fi + + output=$("$PROJECT_DIR/run-recipe.py" minimax-m2-awq --dry-run -n "192.168.1.1,192.168.1.2" 2>&1) + + # Check that --distributed-executor-backend IS in the output for cluster mode + if echo "$output" | grep -q "\-\-distributed-executor-backend ray"; then + log_pass "Cluster mode correctly preserves --distributed-executor-backend ray" + else + log_fail "Cluster mode did not preserve --distributed-executor-backend" + log_verbose "$output" + fi +} + +# Test: CLI overrides work (--port) +test_cli_override_port() { + log_test "CLI override --port works" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --port 9999 2>&1) + + if echo "$output" | grep -q "\-\-port 9999"; then + log_pass "--port override correctly applied" + else + log_fail "--port override not found in output" + log_verbose "$output" + fi +} + +# Test: launch-cluster.sh --help works +test_launch_cluster_help() { + log_test "launch-cluster.sh --help" + + output=$("$PROJECT_DIR/launch-cluster.sh" --help 2>&1 || true) + + if echo "$output" | grep -q "Usage:"; then + log_pass "--help shows usage information" + else + log_fail "--help did not show usage" + log_verbose "$output" + fi +} + +# Test: launch-cluster.sh references examples/ not profiles/ +test_launch_cluster_examples_path() { + log_test "launch-cluster.sh references examples/ directory" + + if grep -q "examples/" "$PROJECT_DIR/launch-cluster.sh"; then + log_pass "launch-cluster.sh references examples/" + else + log_fail "launch-cluster.sh does not reference examples/" + fi + + if grep -q "profiles/" "$PROJECT_DIR/launch-cluster.sh"; then + log_fail "launch-cluster.sh still references profiles/" + fi +} + +# Test: 
Unsupported recipe version shows warning +test_unsupported_recipe_version() { + log_test "Unsupported recipe_version shows warning" + + # Create a temporary recipe with unsupported version + temp_recipe=$(mktemp) + cat > "$temp_recipe" << 'EOF' +recipe_version: "999" +name: Test Recipe +container: test-container +command: echo "test" +EOF + + output=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run --solo 2>&1) + rm -f "$temp_recipe" + + if echo "$output" | grep -q "Warning.*schema version"; then + log_pass "Unsupported recipe_version shows warning" + else + log_fail "No warning for unsupported recipe_version" + log_verbose "$output" + fi +} + +# Test: Missing recipe_version fails +test_missing_recipe_version_fails() { + log_test "Missing recipe_version field fails" + + # Create a temporary recipe without recipe_version + temp_recipe=$(mktemp) + cat > "$temp_recipe" << 'EOF' +name: Test Recipe +container: test-container +command: echo "test" +EOF + + output=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run --solo 2>&1 || true) + rm -f "$temp_recipe" + + if echo "$output" | grep -q "Error.*recipe_version"; then + log_pass "Missing recipe_version correctly fails" + else + log_fail "Missing recipe_version did not fail as expected" + log_verbose "$output" + fi +} + +# Test: cluster_only recipe fails in solo mode +test_cluster_only_fails_solo() { + log_test "cluster_only recipe fails in solo mode" + + # Create a temporary cluster_only recipe + temp_recipe=$(mktemp) + cat > "$temp_recipe" << 'EOF' +recipe_version: "1" +name: Cluster Only Test +container: test-container +cluster_only: true +command: echo "test" +EOF + + output=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run --solo 2>&1 || true) + exit_code=$? 
+ rm -f "$temp_recipe" + + if echo "$output" | grep -q "requires cluster mode"; then + log_pass "cluster_only recipe correctly fails in solo mode" + else + log_fail "cluster_only recipe did not fail in solo mode" + log_verbose "$output" + fi +} + +# ============================================================================== +# Launch-cluster.sh Command Line Verification Tests +# ============================================================================== +# These tests verify that the dry-run output contains the expected +# launch-cluster.sh command line arguments matching the recipe configuration. + +# Helper: Extract launch-cluster command from dry-run output +extract_launch_cmd() { + echo "$1" | grep -A5 "launch-cluster.sh is called with:" | grep -v "launch-cluster.sh is called with:" | tr '\n' ' ' +} + +# Test: Solo mode generates --solo flag in launch-cluster command +test_launch_cmd_solo_flag() { + log_test "Launch command includes --solo flag in solo mode" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "\-\-solo"; then + log_pass "Launch command includes --solo flag" + else + log_fail "Launch command missing --solo flag" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: Cluster mode generates -n flag with nodes +test_launch_cmd_nodes_flag() { + log_test "Launch command includes -n flag with nodes in cluster mode" + + output=$("$PROJECT_DIR/run-recipe.py" minimax-m2-awq --dry-run -n "10.0.0.1,10.0.0.2" 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "\-n 10.0.0.1,10.0.0.2"; then + log_pass "Launch command includes -n with correct nodes" + else + log_fail "Launch command missing or incorrect -n 
flag" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: Container image from recipe is passed to launch-cluster +test_launch_cmd_container_image() { + log_test "Launch command includes correct container image (-t)" + + # Use openai-gpt-oss-120b which has a specific container name + if [[ ! -f "$PROJECT_DIR/recipes/openai-gpt-oss-120b.yaml" ]]; then + log_skip "openai-gpt-oss-120b.yaml not found" + return + fi + + output=$("$PROJECT_DIR/run-recipe.py" openai-gpt-oss-120b --dry-run --solo 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + # Check the container is vllm-node-mxfp4 (from the recipe) + if echo "$launch_cmd" | grep -q "\-t vllm-node-mxfp4"; then + log_pass "Launch command includes correct container image" + else + log_fail "Launch command has wrong container image" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: Mods from recipe are passed as --apply-mod +test_launch_cmd_mods() { + log_test "Launch command includes --apply-mod for recipe mods" + + # Use glm-4.7-flash-awq which has a mod + if [[ ! 
-f "$PROJECT_DIR/recipes/glm-4.7-flash-awq.yaml" ]]; then + log_skip "glm-4.7-flash-awq.yaml not found" + return + fi + + output=$("$PROJECT_DIR/run-recipe.py" glm-4.7-flash-awq --dry-run --solo 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "\-\-apply-mod"; then + log_pass "Launch command includes --apply-mod for mods" + else + log_fail "Launch command missing --apply-mod" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: Daemon mode flag is passed through +test_launch_cmd_daemon_flag() { + log_test "Launch command includes -d flag in daemon mode" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -d 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "\-d"; then + log_pass "Launch command includes -d flag" + else + log_fail "Launch command missing -d flag" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: NCCL debug level is passed through +test_launch_cmd_nccl_debug() { + log_test "Launch command includes --nccl-debug when specified" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --nccl-debug INFO 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "\-\-nccl-debug INFO"; then + log_pass "Launch command includes --nccl-debug INFO" + else + log_fail "Launch command missing --nccl-debug" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: --launch-script is always included +test_launch_cmd_launch_script() { + log_test "Launch command includes --launch-script" + + first_recipe=$(ls 
"$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "\-\-launch-script"; then + log_pass "Launch command includes --launch-script" + else + log_fail "Launch command missing --launch-script" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: Container override (-t CLI) takes precedence +test_launch_cmd_container_override() { + log_test "CLI container override (-t) takes precedence" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -t my-custom-image 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "\-t my-custom-image"; then + log_pass "Container override correctly applied" + else + log_fail "Container override not applied" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Test: Cluster mode does NOT include --solo flag +test_launch_cmd_no_solo_in_cluster() { + log_test "Launch command does NOT include --solo in cluster mode" + + output=$("$PROJECT_DIR/run-recipe.py" minimax-m2-awq --dry-run -n "10.0.0.1,10.0.0.2" 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -qv "\-\-solo" || ! 
echo "$launch_cmd" | grep -q "\-\-solo"; then + log_pass "Cluster mode correctly omits --solo flag" + else + log_fail "Cluster mode incorrectly includes --solo flag" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# ============================================================================== +# README Documentation Verification Tests +# ============================================================================== +# These tests verify that recipe dry-run output matches the expected commands +# documented in README.md. Expected values are defined in expected_commands.sh + +# Helper: Extract the generated launch script from dry-run output +extract_vllm_command() { + # Extract lines between "Generated Launch Script" and "What would be executed" + echo "$1" | sed -n '/=== Generated Launch Script ===/,/=== What would be executed ===/p' | grep -v "===" | grep -v "^#" | grep -v "^$" +} + +# Helper: Verify a recipe contains all expected arguments +verify_recipe_args() { + local recipe_name="$1" + local expected_model="$2" + local expected_container="$3" + shift 3 + local expected_args=("$@") + + log_test "README match: $recipe_name" + + if [[ ! -f "$PROJECT_DIR/recipes/${recipe_name}.yaml" ]]; then + log_skip "${recipe_name}.yaml not found" + return + fi + + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1) + vllm_cmd=$(extract_vllm_command "$output") + launch_cmd=$(extract_launch_cmd "$output") + + local all_passed=true + local missing_args=() + + # Check model name + if ! echo "$vllm_cmd" | grep -q "$expected_model"; then + missing_args+=("model: $expected_model") + all_passed=false + fi + + # Check container + if ! 
echo "$launch_cmd" | grep -q "\-t $expected_container"; then + missing_args+=("container: $expected_container") + all_passed=false + fi + + # Check each expected argument + for arg in "${expected_args[@]}"; do + # Handle arguments that may have slight formatting differences + # Extract the flag and value separately for flexible matching + local flag=$(echo "$arg" | awk '{print $1}') + local value=$(echo "$arg" | cut -d' ' -f2-) + + # Use grep -F for fixed string matching (avoids -- being treated as grep options) + if ! echo "$vllm_cmd" | grep -qF -- "$flag"; then + missing_args+=("$arg") + all_passed=false + elif [[ -n "$value" ]] && [[ "$value" != "$flag" ]]; then + # Check if value is present (might be on next line due to formatting) + if ! echo "$vllm_cmd" | grep -qF -- "$value"; then + missing_args+=("$arg (flag present, value mismatch)") + all_passed=false + fi + fi + done + + if [[ "$all_passed" == "true" ]]; then + log_pass "README match: $recipe_name - all expected arguments present" + else + log_fail "README match: $recipe_name - missing arguments" + for missing in "${missing_args[@]}"; do + log_verbose " Missing: $missing" + done + log_verbose " vLLM command: $vllm_cmd" + fi +} + +# Test: glm-4.7-flash-awq matches README documentation +test_readme_glm_flash_awq() { + verify_recipe_args "glm-4.7-flash-awq" \ + "$GLM_FLASH_AWQ_MODEL" \ + "$GLM_FLASH_AWQ_CONTAINER" \ + "${GLM_FLASH_AWQ_ARGS[@]}" +} + +# Test: openai-gpt-oss-120b matches README documentation +test_readme_gpt_oss() { + verify_recipe_args "openai-gpt-oss-120b" \ + "$GPT_OSS_MODEL" \ + "$GPT_OSS_CONTAINER" \ + "${GPT_OSS_ARGS[@]}" +} + +# Test: minimax-m2-awq matches expected configuration +test_readme_minimax() { + verify_recipe_args "minimax-m2-awq" \ + "$MINIMAX_MODEL" \ + "$MINIMAX_CONTAINER" \ + "${MINIMAX_ARGS[@]}" +} + +# Test: glm-4.7-flash-awq includes correct mod +test_readme_glm_flash_mod() { + log_test "README match: glm-4.7-flash-awq mod path" + + if [[ ! 
-f "$PROJECT_DIR/recipes/glm-4.7-flash-awq.yaml" ]]; then + log_skip "glm-4.7-flash-awq.yaml not found" + return + fi + + output=$("$PROJECT_DIR/run-recipe.py" glm-4.7-flash-awq --dry-run --solo 2>&1) + launch_cmd=$(extract_launch_cmd "$output") + + if echo "$launch_cmd" | grep -q "$GLM_FLASH_AWQ_MOD"; then + log_pass "README match: glm-4.7-flash-awq has correct mod path" + else + log_fail "README match: glm-4.7-flash-awq missing expected mod: $GLM_FLASH_AWQ_MOD" + log_verbose "Launch cmd: $launch_cmd" + fi +} + +# Helper: Verify cluster mode specific arguments +verify_cluster_args() { + local recipe_name="$1" + local expected_tp="$2" + shift 2 + local expected_args=("$@") + + log_test "README match (cluster): $recipe_name" + + if [[ ! -f "$PROJECT_DIR/recipes/${recipe_name}.yaml" ]]; then + log_skip "${recipe_name}.yaml not found" + return + fi + + # Use fake nodes for cluster mode + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run -n "10.0.0.1,10.0.0.2" 2>&1) + vllm_cmd=$(extract_vllm_command "$output") + + local all_passed=true + local missing_args=() + + # Check tensor parallel + if ! echo "$vllm_cmd" | grep -qE "(--tensor-parallel-size|-tp) $expected_tp"; then + missing_args+=("tensor_parallel: $expected_tp") + all_passed=false + fi + + # Check cluster-specific arguments + for arg in "${expected_args[@]}"; do + if ! 
echo "$vllm_cmd" | grep -qF -- "$arg"; then
+            # Fixed-string match; -- guards args that start with dashes.
+            missing_args+=("$arg")
+            all_passed=false
+        fi
+    done
+
+    if [[ "$all_passed" == "true" ]]; then
+        log_pass "README match (cluster): $recipe_name - cluster args correct"
+    else
+        log_fail "README match (cluster): $recipe_name - missing cluster arguments"
+        for missing in "${missing_args[@]}"; do
+            log_verbose " Missing: $missing"
+        done
+        log_verbose " vLLM command: $vllm_cmd"
+    fi
+}
+
+# Test: openai-gpt-oss-120b cluster mode has correct tensor_parallel and ray backend
+test_readme_gpt_oss_cluster() {
+    verify_cluster_args "openai-gpt-oss-120b" \
+        "$GPT_OSS_CLUSTER_TP" \
+        "${GPT_OSS_CLUSTER_ARGS[@]}"
+}
+
+# Test: minimax-m2-awq cluster mode has correct tensor_parallel and ray backend
+test_readme_minimax_cluster() {
+    verify_cluster_args "minimax-m2-awq" \
+        "$MINIMAX_CLUSTER_TP" \
+        "${MINIMAX_CLUSTER_ARGS[@]}"
+}
+
+# Test: glm-4.7-flash-awq cluster mode stays at tp=1 (single GPU model)
+test_readme_glm_flash_cluster() {
+    log_test "README match (cluster): glm-4.7-flash-awq stays tp=1"
+
+    # Skip (not fail) when the recipe file is absent from the checkout.
+    if [[ !
-f "$PROJECT_DIR/recipes/glm-4.7-flash-awq.yaml" ]]; then
+        log_skip "glm-4.7-flash-awq.yaml not found"
+        return
+    fi
+
+    # Even in cluster mode, this model uses tp=1
+    output=$("$PROJECT_DIR/run-recipe.py" glm-4.7-flash-awq --dry-run -n "10.0.0.1,10.0.0.2" 2>&1)
+    vllm_cmd=$(extract_vllm_command "$output")
+
+    # Long or short flag spelling accepted, same as verify_cluster_args.
+    if echo "$vllm_cmd" | grep -qE "(--tensor-parallel-size|-tp) 1"; then
+        log_pass "README match (cluster): glm-4.7-flash-awq correctly keeps tp=1"
+    else
+        log_fail "README match (cluster): glm-4.7-flash-awq should have tp=1"
+        log_verbose " vLLM command: $vllm_cmd"
+    fi
+}
+
+# Run all tests
+# Entry point: runs prerequisite checks, then every test group in order,
+# prints a summary, and exits 1 if any test failed.
+main() {
+    echo "=============================================="
+    echo " run-recipe.py Integration Tests"
+    echo "=============================================="
+    echo ""
+
+    cd "$PROJECT_DIR"
+
+    check_prerequisites
+    echo ""
+
+    # File existence tests
+    test_run_recipe_exists
+    test_launch_cluster_exists
+    echo ""
+
+    # Basic functionality tests
+    test_list_recipes
+    test_recipe_version_required
+    test_all_recipes_load
+    echo ""
+
+    # Dry-run tests
+    test_dry_run_generates_script
+    test_solo_mode_tp1
+    test_solo_mode_removes_ray
+    test_cluster_mode_keeps_ray
+    test_cli_override_port
+    echo ""
+
+    # launch-cluster.sh command line verification tests
+    echo "--- Launch Command Verification ---"
+    test_launch_cmd_solo_flag
+    test_launch_cmd_nodes_flag
+    test_launch_cmd_container_image
+    test_launch_cmd_mods
+    test_launch_cmd_daemon_flag
+    test_launch_cmd_nccl_debug
+    test_launch_cmd_launch_script
+    test_launch_cmd_container_override
+    test_launch_cmd_no_solo_in_cluster
+    echo ""
+
+    # README documentation verification tests
+    echo "--- README Documentation Verification (Solo Mode) ---"
+    test_readme_glm_flash_awq
+    test_readme_gpt_oss
+    test_readme_minimax
+    test_readme_glm_flash_mod
+    echo ""
+
+    # Cluster mode documentation verification tests
+    echo "--- README Documentation Verification (Cluster Mode) ---"
+    test_readme_gpt_oss_cluster
+    test_readme_minimax_cluster
+    
test_readme_glm_flash_cluster
+    echo ""
+
+    # launch-cluster.sh tests
+    test_launch_cluster_help
+    test_launch_cluster_examples_path
+    echo ""
+
+    # Validation tests
+    test_unsupported_recipe_version
+    test_missing_recipe_version_fails
+    test_cluster_only_fails_solo
+    echo ""
+
+    # Summary
+    # Counters are maintained by the log_pass/log_fail/log_skip helpers.
+    echo "=============================================="
+    echo " Test Summary"
+    echo "=============================================="
+    echo -e " ${GREEN}Passed:${NC} $TESTS_PASSED"
+    echo -e " ${RED}Failed:${NC} $TESTS_FAILED"
+    echo -e " ${YELLOW}Skipped:${NC} $TESTS_SKIPPED"
+    echo "=============================================="
+
+    # Non-zero exit so CI marks the job failed when any test failed.
+    if [[ $TESTS_FAILED -gt 0 ]]; then
+        exit 1
+    fi
+    exit 0
+}
+
+main "$@"