Adding suggestions from Eugr and unit tests
This commit is contained in:
59
.github/workflows/test-recipes.yml
vendored
Normal file
59
.github/workflows/test-recipes.yml
vendored
Normal file
@@ -0,0 +1,59 @@
# CI workflow: run the recipe integration tests and dry-run every recipe
# whenever the recipe tooling, recipe definitions, or tests change.
name: Recipe Tests

on:
  push:
    branches: [ main, profiles ]
    paths:
      - 'run-recipe.py'
      - 'run-recipe.sh'
      - 'launch-cluster.sh'
      - 'recipes/**'
      - 'tests/**'
      - '.github/workflows/test-recipes.yml'
  pull_request:
    branches: [ main, profiles ]
    paths:
      - 'run-recipe.py'
      - 'run-recipe.sh'
      - 'launch-cluster.sh'
      - 'recipes/**'
      - 'tests/**'
      - '.github/workflows/test-recipes.yml'
  # Allow manual runs from the Actions tab (backward-compatible addition).
  workflow_dispatch:

jobs:
  test:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        # Minimum supported version is 3.10 (enforced by tests/test_recipes.sh).
        python-version: ['3.10', '3.11', '3.12']

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pyyaml

      - name: Make scripts executable
        run: |
          chmod +x run-recipe.py run-recipe.sh launch-cluster.sh
          chmod +x tests/test_recipes.sh

      - name: Run recipe integration tests
        run: |
          ./tests/test_recipes.sh -v

      - name: Verify all recipes with dry-run
        run: |
          for recipe in recipes/*.yaml; do
            name=$(basename "$recipe" .yaml)
            echo "Testing recipe: $name"
            ./run-recipe.py "$name" --dry-run --solo || exit 1
          done
@@ -1,6 +1,8 @@
|
||||
# Launch Scripts
|
||||
# Example Launch Scripts
|
||||
|
||||
This directory contains bash scripts that can be executed in the container using the `--launch-script` option. Launch scripts are simple, executable bash files that run directly inside the container.
|
||||
This directory contains example bash scripts that demonstrate how to use the `--launch-script` option directly with `launch-cluster.sh`.
|
||||
|
||||
**Note:** For most use cases, the recipe system (`./run-recipe.sh`) is the recommended approach. These examples are provided for reference and for advanced users who need direct control over the launch process.
|
||||
|
||||
## Why Launch Scripts?
|
||||
|
||||
@@ -12,7 +14,7 @@ This directory contains bash scripts that can be executed in the container using
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Use a launch script by name (looks in profiles/ directory)
|
||||
# Use a launch script by name (looks in examples/ directory)
|
||||
./launch-cluster.sh --launch-script example-vllm-minimax
|
||||
|
||||
# Use a launch script by filename
|
||||
@@ -43,7 +43,7 @@ usage() {
|
||||
echo " -e, --env Environment variable to pass to container (e.g. -e VAR=val)"
|
||||
echo " --nccl-debug NCCL debug level (Optional, one of: VERSION, WARN, INFO, TRACE). If no level is provided, defaults to INFO."
|
||||
echo " --apply-mod Path to directory or zip file containing run.sh to apply before launch (Can be specified multiple times)"
|
||||
echo " --launch-script Path to bash script to execute in the container (from profiles/ directory or absolute path)"
|
||||
echo " --launch-script Path to bash script to execute in the container (from examples/ directory or absolute path)"
|
||||
echo " --check-config Check configuration and auto-detection without launching"
|
||||
echo " --solo Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster"
|
||||
echo " -d Daemon mode (only for 'start' action)"
|
||||
@@ -51,7 +51,7 @@ usage() {
|
||||
echo " command Command to run (only for 'exec' action)"
|
||||
echo ""
|
||||
echo "Launch Script Usage:"
|
||||
echo " $0 --launch-script profiles/my-script.sh # Script copied to container and executed"
|
||||
echo " $0 --launch-script examples/my-script.sh # Script copied to container and executed"
|
||||
echo " $0 --launch-script /path/to/script.sh # Uses absolute path to script"
|
||||
exit 1
|
||||
}
|
||||
@@ -120,18 +120,18 @@ if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
||||
# Check if it's an absolute path or relative path that exists
|
||||
if [[ -f "$LAUNCH_SCRIPT_PATH" ]]; then
|
||||
LAUNCH_SCRIPT_PATH=$(realpath "$LAUNCH_SCRIPT_PATH")
|
||||
# Check if it's just a filename, look in profiles/ directory
|
||||
elif [[ -f "$SCRIPT_DIR/profiles/$LAUNCH_SCRIPT_PATH" ]]; then
|
||||
LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/profiles/$LAUNCH_SCRIPT_PATH"
|
||||
# Check if it's just a filename, look in examples/ directory
|
||||
elif [[ -f "$SCRIPT_DIR/examples/$LAUNCH_SCRIPT_PATH" ]]; then
|
||||
LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/examples/$LAUNCH_SCRIPT_PATH"
|
||||
# Check if it's a name without .sh extension
|
||||
elif [[ -f "$SCRIPT_DIR/profiles/${LAUNCH_SCRIPT_PATH}.sh" ]]; then
|
||||
LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/profiles/${LAUNCH_SCRIPT_PATH}.sh"
|
||||
elif [[ -f "$SCRIPT_DIR/examples/${LAUNCH_SCRIPT_PATH}.sh" ]]; then
|
||||
LAUNCH_SCRIPT_PATH="$SCRIPT_DIR/examples/${LAUNCH_SCRIPT_PATH}.sh"
|
||||
else
|
||||
echo "Error: Launch script '$LAUNCH_SCRIPT_PATH' not found."
|
||||
echo "Searched in:"
|
||||
echo " - $LAUNCH_SCRIPT_PATH"
|
||||
echo " - $SCRIPT_DIR/profiles/$LAUNCH_SCRIPT_PATH"
|
||||
echo " - $SCRIPT_DIR/profiles/${LAUNCH_SCRIPT_PATH}.sh"
|
||||
echo " - $SCRIPT_DIR/examples/$LAUNCH_SCRIPT_PATH"
|
||||
echo " - $SCRIPT_DIR/examples/${LAUNCH_SCRIPT_PATH}.sh"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -578,17 +578,10 @@ start_cluster() {
|
||||
done
|
||||
fi
|
||||
|
||||
# Copy launch script if specified
|
||||
# Copy launch script to head node only (workers don't need it - they just run Ray)
|
||||
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
||||
echo "Copying launch script to cluster nodes..."
|
||||
|
||||
# Copy to Head
|
||||
echo "Copying launch script to head node..."
|
||||
copy_launch_script_to_container "$HEAD_IP" "$CONTAINER_NAME" "true" "$LAUNCH_SCRIPT_PATH"
|
||||
|
||||
# Copy to Workers
|
||||
for worker in "${PEER_NODES[@]}"; do
|
||||
copy_launch_script_to_container "$worker" "$CONTAINER_NAME" "false" "$LAUNCH_SCRIPT_PATH"
|
||||
done
|
||||
fi
|
||||
|
||||
if [[ "$SOLO_MODE" == "false" ]]; then
|
||||
|
||||
@@ -75,6 +75,7 @@ RECIPE VERSION HISTORY:
|
||||
RELATED FILES:
|
||||
- run-recipe.sh: Bash wrapper that ensures Python deps are installed
|
||||
- recipes/*.yaml: Recipe definitions
|
||||
- examples/: Example launch scripts for direct use with launch-cluster.sh
|
||||
- launch-cluster.sh: Low-level container orchestration
|
||||
- build-and-copy.sh: Docker build and distribution
|
||||
- hf-download.sh: HuggingFace model download and sync
|
||||
|
||||
89
tests/expected_commands.sh
Normal file
89
tests/expected_commands.sh
Normal file
@@ -0,0 +1,89 @@
# Expected vLLM serve arguments for each recipe
# This file is used by test_recipes.sh to verify recipes match README documentation
#
# Format: Each recipe has a section with expected arguments
# Tests will verify these arguments appear in the dry-run output
#
# IMPORTANT: Keep this in sync with README.md documentation
# When updating recipes, update both README.md and this file

# ==============================================================================
# glm-4.7-flash-awq
# README Reference: Lines 186-198 (solo) and 203-218 (cluster)
# ==============================================================================
GLM_FLASH_AWQ_MODEL="cyankiwi/GLM-4.7-Flash-AWQ-4bit"
GLM_FLASH_AWQ_CONTAINER="vllm-node-tf5"
GLM_FLASH_AWQ_MOD="mods/fix-glm-4.7-flash-AWQ"
GLM_FLASH_AWQ_ARGS=(
    "--tool-call-parser glm47"
    "--reasoning-parser glm45"
    "--enable-auto-tool-choice"
    "--served-model-name glm-4.7-flash"
    "--max-model-len 202752"
    "--max-num-batched-tokens 4096"
    "--max-num-seqs 64"
    "--gpu-memory-utilization 0.7"
    "--port 8888"
    "--host 0.0.0.0"
)

# ==============================================================================
# openai-gpt-oss-120b
# README Reference: Lines 244-257 (solo) and 264-280 (cluster)
# ==============================================================================
GPT_OSS_MODEL="openai/gpt-oss-120b"
GPT_OSS_CONTAINER="vllm-node-mxfp4"
GPT_OSS_ARGS=(
    "--port 8888"
    "--host 0.0.0.0"
    "--enable-auto-tool-choice"
    "--tool-call-parser openai"
    "--reasoning-parser openai_gptoss"
    "--gpu-memory-utilization 0.7"
    "--enable-prefix-caching"
    "--load-format fastsafetensors"
    "--quantization mxfp4"
    "--mxfp4-backend CUTLASS"
    "--mxfp4-layers moe,qkv,o,lm_head"
    "--attention-backend FLASHINFER"
    "--kv-cache-dtype fp8"
    "--max-num-batched-tokens 8192"
)

# ==============================================================================
# minimax-m2-awq
# README Reference: Not explicitly documented, but based on model requirements
# ==============================================================================
MINIMAX_MODEL="QuantTrio/MiniMax-M2-AWQ"
MINIMAX_CONTAINER="vllm-node"
MINIMAX_ARGS=(
    "--port 8000"
    "--host 0.0.0.0"
    "--gpu-memory-utilization 0.7"
    "--max-model-len 128000"
    "--load-format fastsafetensors"
    "--enable-auto-tool-choice"
    "--tool-call-parser minimax_m2"
    "--reasoning-parser minimax_m2_append_think"
)

# ==============================================================================
# Cluster Mode Expected Arguments
# These are arguments that should appear ONLY in cluster mode
# Note: Tests use 2 nodes, so tensor_parallel = 2 (1 GPU per node)
# ==============================================================================

# glm-4.7-flash-awq cluster mode (no distributed backend - single GPU model)
GLM_FLASH_AWQ_CLUSTER_TP="1"

# openai-gpt-oss-120b cluster mode (2 nodes = tp 2)
GPT_OSS_CLUSTER_TP="2"
GPT_OSS_CLUSTER_ARGS=(
    "--distributed-executor-backend ray"
)

# minimax-m2-awq cluster mode (2 nodes = tp 2)
MINIMAX_CLUSTER_TP="2"
MINIMAX_CLUSTER_ARGS=(
    "--distributed-executor-backend ray"
)
859
tests/test_recipes.sh
Executable file
859
tests/test_recipes.sh
Executable file
@@ -0,0 +1,859 @@
#!/bin/bash
#
# test_recipes.sh - Integration tests for run-recipe.py and launch-cluster.sh
#
# These tests use --dry-run mode to verify compatibility without actually
# running containers. Suitable for CI/CD pipelines.
#
# Usage:
#   ./tests/test_recipes.sh      # Run all tests
#   ./tests/test_recipes.sh -v   # Verbose output
#

set -e

# Resolve the tests/ directory and the project root above it.
SCRIPT_DIR="$(dirname "$(realpath "$0")")"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
VERBOSE="${1:-}"

# Load expected commands for README verification
source "$SCRIPT_DIR/expected_commands.sh"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Test counters (incremented by the log_* helpers below)
TESTS_PASSED=0
TESTS_FAILED=0
TESTS_SKIPPED=0
# Helper functions

# Print a test header line.
log_test() {
    echo -e "${YELLOW}[TEST]${NC} $1"
}

# Report a passing test and bump the pass counter.
log_pass() {
    echo -e "${GREEN}[PASS]${NC} $1"
    TESTS_PASSED=$((TESTS_PASSED + 1))
}

# Report a failing test and bump the fail counter.
log_fail() {
    echo -e "${RED}[FAIL]${NC} $1"
    TESTS_FAILED=$((TESTS_FAILED + 1))
}

# Report a skipped test and bump the skip counter.
log_skip() {
    echo -e "${YELLOW}[SKIP]${NC} $1"
    TESTS_SKIPPED=$((TESTS_SKIPPED + 1))
}

# Print extra detail, but only when the suite was invoked with -v.
log_verbose() {
    if [[ "$VERBOSE" == "-v" ]]; then
        echo "  $1"
    fi
}
# Check prerequisites

# Return 0 if a "major.minor" Python version string is >= 3.10.
# Pure-bash integer compare: the previous `echo "$v < 3.10" | bc -l`
# compared versions as floats, so 3.9 was treated as NEWER than 3.10
# (3.9 > 3.10 numerically) and it also required bc to be installed.
_python_version_ok() {
    local major minor
    major=${1%%.*}
    minor=${1#*.}
    minor=${minor%%.*}
    (( major > 3 || (major == 3 && minor >= 10) ))
}

check_prerequisites() {
    log_test "Checking prerequisites..."

    if ! command -v python3 &> /dev/null; then
        log_fail "python3 not found"
        exit 1
    fi

    # Check Python version
    python_version=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
    if ! _python_version_ok "$python_version"; then
        log_fail "Python 3.10+ required, found $python_version"
        exit 1
    fi

    # Check PyYAML
    if ! python3 -c "import yaml" 2>/dev/null; then
        log_fail "PyYAML not installed"
        exit 1
    fi

    log_pass "Prerequisites OK (Python $python_version with PyYAML)"
}
# Shared helper: assert that a file exists and is executable.
# $1 - path to check, $2 - human-readable label used in log messages.
_check_executable() {
    local path="$1" label="$2"
    log_test "$label exists and is executable"

    if [[ -x "$path" ]]; then
        log_pass "$label is executable"
    else
        log_fail "$label not found or not executable"
    fi
}

# Test: run-recipe.py exists and is executable
test_run_recipe_exists() {
    _check_executable "$PROJECT_DIR/run-recipe.py" "run-recipe.py"
}

# Test: launch-cluster.sh exists and is executable
test_launch_cluster_exists() {
    _check_executable "$PROJECT_DIR/launch-cluster.sh" "launch-cluster.sh"
}
# Test: run-recipe.py --list works
test_list_recipes() {
    log_test "run-recipe.py --list"

    # Capture output and exit status without tripping `set -e`; the old
    # `output=$(cmd); [[ $? -eq 0 ]]` form aborted the whole suite when
    # the command failed, before the status was ever inspected.
    local output rc=0
    output=$("$PROJECT_DIR/run-recipe.py" --list 2>&1) || rc=$?

    if [[ $rc -eq 0 ]] && echo "$output" | grep -q "Available recipes"; then
        log_pass "--list shows available recipes"
        log_verbose "Found recipes in output"
    else
        log_fail "--list failed or no recipes found"
        log_verbose "$output"
    fi
}

# Test: All recipes have required recipe_version field
test_recipe_version_required() {
    log_test "All recipes have required recipe_version field"

    local all_valid=true
    for recipe in "$PROJECT_DIR/recipes/"*.yaml; do
        if [[ -f "$recipe" ]]; then
            recipe_name=$(basename "$recipe")
            if ! grep -q "^recipe_version:" "$recipe"; then
                log_verbose "$recipe_name missing recipe_version"
                all_valid=false
            fi
        fi
    done

    if [[ "$all_valid" == "true" ]]; then
        log_pass "All recipes have recipe_version field"
    else
        log_fail "Some recipes missing recipe_version field"
    fi
}

# Test: All recipes load without errors
test_all_recipes_load() {
    log_test "All recipes load without errors"

    local all_valid=true
    for recipe in "$PROJECT_DIR/recipes/"*.yaml; do
        if [[ -f "$recipe" ]]; then
            recipe_name=$(basename "$recipe" .yaml)
            # A recipe counts as broken when the dry-run prints "Error:".
            # The pipeline's status is grep's, so a non-zero exit from
            # run-recipe.py does not abort the suite under `set -e`.
            if ! "$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1 | grep -q "Error:"; then
                log_verbose "$recipe_name loads OK"
            else
                log_verbose "$recipe_name failed to load"
                all_valid=false
            fi
        fi
    done

    if [[ "$all_valid" == "true" ]]; then
        log_pass "All recipes load successfully"
    else
        log_fail "Some recipes failed to load"
    fi
}
# Test: Dry-run generates valid launch script
test_dry_run_generates_script() {
    log_test "Dry-run generates valid launch script"

    # Find first available recipe
    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)

    if echo "$output" | grep -q "#!/bin/bash" && echo "$output" | grep -q "vllm serve"; then
        log_pass "Dry-run generates bash script with vllm serve command"
    else
        log_fail "Dry-run output doesn't contain expected content"
        log_verbose "$output"
    fi
}

# Test: Solo mode sets tensor_parallel=1
test_solo_mode_tp1() {
    log_test "Solo mode sets tensor_parallel=1"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)

    # Check that -tp 1 is in the output (solo mode should set tp=1)
    if echo "$output" | grep -q "\-tp 1"; then
        log_pass "Solo mode correctly sets -tp 1"
    else
        log_fail "Solo mode did not set -tp 1"
        log_verbose "$output"
    fi
}

# Test: Solo mode removes --distributed-executor-backend ray
test_solo_mode_removes_ray() {
    log_test "Solo mode removes --distributed-executor-backend ray"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)

    # Check that --distributed-executor-backend is NOT in the output
    if ! echo "$output" | grep -q -- "--distributed-executor-backend"; then
        log_pass "Solo mode correctly removes --distributed-executor-backend"
    else
        log_fail "Solo mode did not remove --distributed-executor-backend"
        log_verbose "$output"
    fi
}

# Test: Cluster mode preserves --distributed-executor-backend ray
test_cluster_mode_keeps_ray() {
    log_test "Cluster mode preserves --distributed-executor-backend ray"

    # Use minimax-m2-awq which explicitly has --distributed-executor-backend ray
    if [[ ! -f "$PROJECT_DIR/recipes/minimax-m2-awq.yaml" ]]; then
        log_skip "minimax-m2-awq.yaml not found"
        return
    fi

    output=$("$PROJECT_DIR/run-recipe.py" minimax-m2-awq --dry-run -n "192.168.1.1,192.168.1.2" 2>&1)

    # Check that --distributed-executor-backend IS in the output for cluster mode
    if echo "$output" | grep -q -- "--distributed-executor-backend ray"; then
        log_pass "Cluster mode correctly preserves --distributed-executor-backend ray"
    else
        log_fail "Cluster mode did not preserve --distributed-executor-backend"
        log_verbose "$output"
    fi
}
# Test: CLI overrides work (--port)
test_cli_override_port() {
    log_test "CLI override --port works"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --port 9999 2>&1)

    if echo "$output" | grep -q -- "--port 9999"; then
        log_pass "--port override correctly applied"
    else
        log_fail "--port override not found in output"
        log_verbose "$output"
    fi
}

# Test: launch-cluster.sh --help works
test_launch_cluster_help() {
    log_test "launch-cluster.sh --help"

    # usage() exits non-zero, so tolerate the failure with `|| true`.
    output=$("$PROJECT_DIR/launch-cluster.sh" --help 2>&1 || true)

    if echo "$output" | grep -q "Usage:"; then
        log_pass "--help shows usage information"
    else
        log_fail "--help did not show usage"
        log_verbose "$output"
    fi
}

# Test: launch-cluster.sh references examples/ not profiles/
test_launch_cluster_examples_path() {
    log_test "launch-cluster.sh references examples/ directory"

    if grep -q "examples/" "$PROJECT_DIR/launch-cluster.sh"; then
        log_pass "launch-cluster.sh references examples/"
    else
        log_fail "launch-cluster.sh does not reference examples/"
    fi

    if grep -q "profiles/" "$PROJECT_DIR/launch-cluster.sh"; then
        log_fail "launch-cluster.sh still references profiles/"
    fi
}

# Test: Unsupported recipe version shows warning
test_unsupported_recipe_version() {
    log_test "Unsupported recipe_version shows warning"

    # Create a temporary recipe with unsupported version
    temp_recipe=$(mktemp)
    cat > "$temp_recipe" << 'EOF'
recipe_version: "999"
name: Test Recipe
container: test-container
command: echo "test"
EOF

    # `|| true` so a non-zero exit from run-recipe.py does not abort the
    # suite under `set -e` before we can inspect the warning text.
    output=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run --solo 2>&1 || true)
    rm -f "$temp_recipe"

    if echo "$output" | grep -q "Warning.*schema version"; then
        log_pass "Unsupported recipe_version shows warning"
    else
        log_fail "No warning for unsupported recipe_version"
        log_verbose "$output"
    fi
}

# Test: Missing recipe_version fails
test_missing_recipe_version_fails() {
    log_test "Missing recipe_version field fails"

    # Create a temporary recipe without recipe_version
    temp_recipe=$(mktemp)
    cat > "$temp_recipe" << 'EOF'
name: Test Recipe
container: test-container
command: echo "test"
EOF

    output=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run --solo 2>&1 || true)
    rm -f "$temp_recipe"

    if echo "$output" | grep -q "Error.*recipe_version"; then
        log_pass "Missing recipe_version correctly fails"
    else
        log_fail "Missing recipe_version did not fail as expected"
        log_verbose "$output"
    fi
}

# Test: cluster_only recipe fails in solo mode
test_cluster_only_fails_solo() {
    log_test "cluster_only recipe fails in solo mode"

    # Create a temporary cluster_only recipe
    temp_recipe=$(mktemp)
    cat > "$temp_recipe" << 'EOF'
recipe_version: "1"
name: Cluster Only Test
container: test-container
cluster_only: true
command: echo "test"
EOF

    # NOTE: the verdict is based on the error message alone; a previous
    # `exit_code=$?` captured after `|| true` was always 0 and unused.
    output=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run --solo 2>&1 || true)
    rm -f "$temp_recipe"

    if echo "$output" | grep -q "requires cluster mode"; then
        log_pass "cluster_only recipe correctly fails in solo mode"
    else
        log_fail "cluster_only recipe did not fail in solo mode"
        log_verbose "$output"
    fi
}
# ==============================================================================
|
||||
# Launch-cluster.sh Command Line Verification Tests
|
||||
# ==============================================================================
|
||||
# These tests verify that the dry-run output contains the expected
|
||||
# launch-cluster.sh command line arguments matching the recipe configuration.
|
||||
|
||||
# Helper: Extract launch-cluster command from dry-run output
|
||||
extract_launch_cmd() {
|
||||
echo "$1" | grep -A5 "launch-cluster.sh is called with:" | grep -v "launch-cluster.sh is called with:" | tr '\n' ' '
|
||||
}
|
||||
|
||||
# Test: Solo mode generates --solo flag in launch-cluster command
test_launch_cmd_solo_flag() {
    log_test "Launch command includes --solo flag in solo mode"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q -- "--solo"; then
        log_pass "Launch command includes --solo flag"
    else
        log_fail "Launch command missing --solo flag"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: Cluster mode generates -n flag with nodes
test_launch_cmd_nodes_flag() {
    log_test "Launch command includes -n flag with nodes in cluster mode"

    output=$("$PROJECT_DIR/run-recipe.py" minimax-m2-awq --dry-run -n "10.0.0.1,10.0.0.2" 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q -- "-n 10.0.0.1,10.0.0.2"; then
        log_pass "Launch command includes -n with correct nodes"
    else
        log_fail "Launch command missing or incorrect -n flag"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: Container image from recipe is passed to launch-cluster
test_launch_cmd_container_image() {
    log_test "Launch command includes correct container image (-t)"

    # Use openai-gpt-oss-120b which has a specific container name
    if [[ ! -f "$PROJECT_DIR/recipes/openai-gpt-oss-120b.yaml" ]]; then
        log_skip "openai-gpt-oss-120b.yaml not found"
        return
    fi

    output=$("$PROJECT_DIR/run-recipe.py" openai-gpt-oss-120b --dry-run --solo 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    # Check the container is vllm-node-mxfp4 (from the recipe)
    if echo "$launch_cmd" | grep -q -- "-t vllm-node-mxfp4"; then
        log_pass "Launch command includes correct container image"
    else
        log_fail "Launch command has wrong container image"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: Mods from recipe are passed as --apply-mod
test_launch_cmd_mods() {
    log_test "Launch command includes --apply-mod for recipe mods"

    # Use glm-4.7-flash-awq which has a mod
    if [[ ! -f "$PROJECT_DIR/recipes/glm-4.7-flash-awq.yaml" ]]; then
        log_skip "glm-4.7-flash-awq.yaml not found"
        return
    fi

    output=$("$PROJECT_DIR/run-recipe.py" glm-4.7-flash-awq --dry-run --solo 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q -- "--apply-mod"; then
        log_pass "Launch command includes --apply-mod for mods"
    else
        log_fail "Launch command missing --apply-mod"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: Daemon mode flag is passed through
test_launch_cmd_daemon_flag() {
    log_test "Launch command includes -d flag in daemon mode"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -d 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    # Match -d as a standalone word; a plain substring match on "-d"
    # also matched flags like "--nccl-debug" and gave false positives.
    if echo "$launch_cmd" | grep -qE '(^| )-d( |$)'; then
        log_pass "Launch command includes -d flag"
    else
        log_fail "Launch command missing -d flag"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: NCCL debug level is passed through
test_launch_cmd_nccl_debug() {
    log_test "Launch command includes --nccl-debug when specified"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --nccl-debug INFO 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q -- "--nccl-debug INFO"; then
        log_pass "Launch command includes --nccl-debug INFO"
    else
        log_fail "Launch command missing --nccl-debug"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: --launch-script is always included
test_launch_cmd_launch_script() {
    log_test "Launch command includes --launch-script"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q -- "--launch-script"; then
        log_pass "Launch command includes --launch-script"
    else
        log_fail "Launch command missing --launch-script"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: Container override (-t CLI) takes precedence
test_launch_cmd_container_override() {
    log_test "CLI container override (-t) takes precedence"

    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
    if [[ -z "$first_recipe" ]]; then
        log_skip "No recipes found"
        return
    fi

    recipe_name=$(basename "$first_recipe" .yaml)
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -t my-custom-image 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q -- "-t my-custom-image"; then
        log_pass "Container override correctly applied"
    else
        log_fail "Container override not applied"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: Cluster mode does NOT include --solo flag
test_launch_cmd_no_solo_in_cluster() {
    log_test "Launch command does NOT include --solo in cluster mode"

    output=$("$PROJECT_DIR/run-recipe.py" minimax-m2-awq --dry-run -n "10.0.0.1,10.0.0.2" 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    # BUGFIX: the old condition `grep -qv "\-\-solo" || ! grep -q ...`
    # was a tautology (grep -v matches any line NOT containing --solo),
    # so this test could never fail. Simply require --solo to be absent.
    if ! echo "$launch_cmd" | grep -q -- "--solo"; then
        log_pass "Cluster mode correctly omits --solo flag"
    else
        log_fail "Cluster mode incorrectly includes --solo flag"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}
# ==============================================================================
# README Documentation Verification Tests
# ==============================================================================
# These tests verify that recipe dry-run output matches the expected commands
# documented in README.md. Expected values are defined in expected_commands.sh

# Helper: Extract the generated launch script from dry-run output.
# $1 - full dry-run output. Prints the script body between the
# "Generated Launch Script" and "What would be executed" markers, with
# the markers, comment lines, and blank lines stripped.
extract_vllm_command() {
    echo "$1" | sed -n '/=== Generated Launch Script ===/,/=== What would be executed ===/p' | grep -v "===" | grep -v "^#" | grep -v "^$"
}
# Helper: Verify a recipe contains all expected arguments.
# $1 - recipe name (without .yaml), $2 - expected model id,
# $3 - expected container image, remaining args - expected "--flag value"
# strings that must appear in the generated vllm serve command.
verify_recipe_args() {
    local recipe_name="$1"
    local expected_model="$2"
    local expected_container="$3"
    shift 3
    local expected_args=("$@")

    log_test "README match: $recipe_name"

    if [[ ! -f "$PROJECT_DIR/recipes/${recipe_name}.yaml" ]]; then
        log_skip "${recipe_name}.yaml not found"
        return
    fi

    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)
    vllm_cmd=$(extract_vllm_command "$output")
    launch_cmd=$(extract_launch_cmd "$output")

    local all_passed=true
    local missing_args=()

    # Check model name
    if ! echo "$vllm_cmd" | grep -q "$expected_model"; then
        missing_args+=("model: $expected_model")
        all_passed=false
    fi

    # Check container
    if ! echo "$launch_cmd" | grep -q -- "-t $expected_container"; then
        missing_args+=("container: $expected_container")
        all_passed=false
    fi

    # Check each expected argument
    for arg in "${expected_args[@]}"; do
        # Handle arguments that may have slight formatting differences:
        # check the flag and its value separately for flexible matching.
        # Declarations are split from assignments so command failures
        # are not masked by `local`.
        local flag value
        flag=$(echo "$arg" | awk '{print $1}')
        value=$(echo "$arg" | cut -d' ' -f2-)

        # Use grep -F for fixed string matching (avoids -- being treated as grep options)
        if ! echo "$vllm_cmd" | grep -qF -- "$flag"; then
            missing_args+=("$arg")
            all_passed=false
        elif [[ -n "$value" ]] && [[ "$value" != "$flag" ]]; then
            # Check if value is present (might be on next line due to formatting)
            if ! echo "$vllm_cmd" | grep -qF -- "$value"; then
                missing_args+=("$arg (flag present, value mismatch)")
                all_passed=false
            fi
        fi
    done

    if [[ "$all_passed" == "true" ]]; then
        log_pass "README match: $recipe_name - all expected arguments present"
    else
        log_fail "README match: $recipe_name - missing arguments"
        for missing in "${missing_args[@]}"; do
            log_verbose "  Missing: $missing"
        done
        log_verbose "  vLLM command: $vllm_cmd"
    fi
}
# Test: glm-4.7-flash-awq matches README documentation
test_readme_glm_flash_awq() {
    # Delegate to the generic checker with this recipe's expected values.
    verify_recipe_args "glm-4.7-flash-awq" \
        "$GLM_FLASH_AWQ_MODEL" "$GLM_FLASH_AWQ_CONTAINER" \
        "${GLM_FLASH_AWQ_ARGS[@]}"
}
|
||||
|
||||
# Test: openai-gpt-oss-120b matches README documentation
test_readme_gpt_oss() {
    # Delegate to the generic checker with this recipe's expected values.
    verify_recipe_args "openai-gpt-oss-120b" \
        "$GPT_OSS_MODEL" "$GPT_OSS_CONTAINER" \
        "${GPT_OSS_ARGS[@]}"
}
|
||||
|
||||
# Test: minimax-m2-awq matches expected configuration
test_readme_minimax() {
    # Delegate to the generic checker with this recipe's expected values.
    verify_recipe_args "minimax-m2-awq" \
        "$MINIMAX_MODEL" "$MINIMAX_CONTAINER" \
        "${MINIMAX_ARGS[@]}"
}
|
||||
|
||||
# Test: glm-4.7-flash-awq includes correct mod
test_readme_glm_flash_mod() {
    log_test "README match: glm-4.7-flash-awq mod path"

    if [[ ! -f "$PROJECT_DIR/recipes/glm-4.7-flash-awq.yaml" ]]; then
        log_skip "glm-4.7-flash-awq.yaml not found"
        return
    fi

    output=$("$PROJECT_DIR/run-recipe.py" glm-4.7-flash-awq --dry-run --solo 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    # Bug fix: match the mod path as a fixed string (-F) — it contains '.'
    # and '/' which the old bare `grep -q "$GLM_FLASH_AWQ_MOD"` treated as
    # regex metacharacters, allowing false positives.
    if echo "$launch_cmd" | grep -qF -- "$GLM_FLASH_AWQ_MOD"; then
        log_pass "README match: glm-4.7-flash-awq has correct mod path"
    else
        log_fail "README match: glm-4.7-flash-awq missing expected mod: $GLM_FLASH_AWQ_MOD"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}
|
||||
|
||||
# Helper: Verify cluster mode specific arguments.
# Arguments:
#   $1 - recipe name (basename of recipes/<name>.yaml)
#   $2 - expected tensor-parallel size in cluster mode
#   $@ - remaining: cluster-specific vLLM arguments that must be present
# Skips when the recipe file is absent; otherwise logs pass/fail.
verify_cluster_args() {
    local recipe_name="$1"
    local expected_tp="$2"
    shift 2
    local expected_args=("$@")

    log_test "README match (cluster): $recipe_name"

    if [[ ! -f "$PROJECT_DIR/recipes/${recipe_name}.yaml" ]]; then
        log_skip "${recipe_name}.yaml not found"
        return
    fi

    # Use fake nodes for cluster mode
    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run -n "10.0.0.1,10.0.0.2" 2>&1)
    vllm_cmd=$(extract_vllm_command "$output")

    local all_passed=true
    local missing_args=()

    # Check tensor parallel.  Bug fix: anchor the digits so e.g. an expected
    # tp of 1 does not falsely match "-tp 16" — the old pattern was an
    # unanchored substring match.
    if ! echo "$vllm_cmd" | grep -qE -- "(--tensor-parallel-size|-tp) ${expected_tp}([^0-9]|\$)"; then
        missing_args+=("tensor_parallel: $expected_tp")
        all_passed=false
    fi

    # Check cluster-specific arguments (fixed-string match; values may
    # contain regex metacharacters, and `--` stops '-'-prefixed args from
    # being parsed as grep options).
    local arg
    for arg in "${expected_args[@]}"; do
        if ! echo "$vllm_cmd" | grep -qF -- "$arg"; then
            missing_args+=("$arg")
            all_passed=false
        fi
    done

    if [[ "$all_passed" == "true" ]]; then
        log_pass "README match (cluster): $recipe_name - cluster args correct"
    else
        log_fail "README match (cluster): $recipe_name - missing cluster arguments"
        local missing
        for missing in "${missing_args[@]}"; do
            log_verbose "  Missing: $missing"
        done
        log_verbose "  vLLM command: $vllm_cmd"
    fi
}
|
||||
|
||||
# Test: openai-gpt-oss-120b cluster mode has correct tensor_parallel and ray backend
test_readme_gpt_oss_cluster() {
    # Delegate to the generic cluster checker with this recipe's expectations.
    verify_cluster_args "openai-gpt-oss-120b" \
        "$GPT_OSS_CLUSTER_TP" "${GPT_OSS_CLUSTER_ARGS[@]}"
}
|
||||
|
||||
# Test: minimax-m2-awq cluster mode has correct tensor_parallel and ray backend
test_readme_minimax_cluster() {
    # Delegate to the generic cluster checker with this recipe's expectations.
    verify_cluster_args "minimax-m2-awq" \
        "$MINIMAX_CLUSTER_TP" "${MINIMAX_CLUSTER_ARGS[@]}"
}
|
||||
|
||||
# Test: glm-4.7-flash-awq cluster mode stays at tp=1 (single GPU model)
test_readme_glm_flash_cluster() {
    log_test "README match (cluster): glm-4.7-flash-awq stays tp=1"

    if [[ ! -f "$PROJECT_DIR/recipes/glm-4.7-flash-awq.yaml" ]]; then
        log_skip "glm-4.7-flash-awq.yaml not found"
        return
    fi

    # Even in cluster mode, this model uses tp=1
    output=$("$PROJECT_DIR/run-recipe.py" glm-4.7-flash-awq --dry-run -n "10.0.0.1,10.0.0.2" 2>&1)
    vllm_cmd=$(extract_vllm_command "$output")

    # Bug fix: anchor after the digit — the old unanchored pattern "... 1"
    # also matched tp values like 16 or 128, so the test could pass when
    # the recipe wrongly scaled tensor parallelism up in cluster mode.
    if echo "$vllm_cmd" | grep -qE -- "(--tensor-parallel-size|-tp) 1([^0-9]|\$)"; then
        log_pass "README match (cluster): glm-4.7-flash-awq correctly keeps tp=1"
    else
        log_fail "README match (cluster): glm-4.7-flash-awq should have tp=1"
        log_verbose "  vLLM command: $vllm_cmd"
    fi
}
|
||||
|
||||
# Run all tests.
# Executes every test group in a fixed order with section banners, prints a
# colored pass/fail/skip summary from the global counters, and exits
# non-zero when any test failed (so CI marks the job red).
main() {
    echo "=============================================="
    echo " run-recipe.py Integration Tests"
    echo "=============================================="
    echo ""

    # Recipe paths are resolved relative to the project root.
    cd "$PROJECT_DIR"

    check_prerequisites
    echo ""

    # File existence tests
    test_run_recipe_exists
    test_launch_cluster_exists
    echo ""

    # Basic functionality tests
    test_list_recipes
    test_recipe_version_required
    test_all_recipes_load
    echo ""

    # Dry-run tests
    test_dry_run_generates_script
    test_solo_mode_tp1
    test_solo_mode_removes_ray
    test_cluster_mode_keeps_ray
    test_cli_override_port
    echo ""

    # launch-cluster.sh command line verification tests
    echo "--- Launch Command Verification ---"
    test_launch_cmd_solo_flag
    test_launch_cmd_nodes_flag
    test_launch_cmd_container_image
    test_launch_cmd_mods
    test_launch_cmd_daemon_flag
    test_launch_cmd_nccl_debug
    test_launch_cmd_launch_script
    test_launch_cmd_container_override
    test_launch_cmd_no_solo_in_cluster
    echo ""

    # README documentation verification tests
    echo "--- README Documentation Verification (Solo Mode) ---"
    test_readme_glm_flash_awq
    test_readme_gpt_oss
    test_readme_minimax
    test_readme_glm_flash_mod
    echo ""

    # Cluster mode documentation verification tests
    echo "--- README Documentation Verification (Cluster Mode) ---"
    test_readme_gpt_oss_cluster
    test_readme_minimax_cluster
    test_readme_glm_flash_cluster
    echo ""

    # launch-cluster.sh tests
    test_launch_cluster_help
    test_launch_cluster_examples_path
    echo ""

    # Validation tests
    test_unsupported_recipe_version
    test_missing_recipe_version_fails
    test_cluster_only_fails_solo
    echo ""

    # Summary — counters are maintained by log_pass/log_fail/log_skip.
    echo "=============================================="
    echo " Test Summary"
    echo "=============================================="
    echo -e " ${GREEN}Passed:${NC} $TESTS_PASSED"
    echo -e " ${RED}Failed:${NC} $TESTS_FAILED"
    echo -e " ${YELLOW}Skipped:${NC} $TESTS_SKIPPED"
    echo "=============================================="

    # Propagate failure to the caller / CI.
    if [[ $TESTS_FAILED -gt 0 ]]; then
        exit 1
    fi
    exit 0
}
|
||||
|
||||
main "$@"
|
||||
Reference in New Issue
Block a user