Merge pull request #34
This commit is contained in:
@@ -18,6 +18,8 @@ model: cyankiwi/GLM-4.7-Flash-AWQ-4bit
|
||||
# This model can run on single node (solo) or cluster
|
||||
cluster_only: false
|
||||
|
||||
solo_only: false
|
||||
|
||||
# Container image to use
|
||||
container: vllm-node-tf5
|
||||
|
||||
|
||||
@@ -68,6 +68,7 @@ RECIPE YAML SCHEMA:
|
||||
env: dict # Optional: Environment variables
|
||||
build_args: list[str] # Optional: Args for build-and-copy.sh
|
||||
cluster_only: bool # Optional: Require cluster mode (default: false)
|
||||
solo_only: bool # Optional: Require solo mode (default: false)
|
||||
|
||||
RECIPE VERSION HISTORY:
|
||||
Version 1 (default): Initial schema with all fields above supported.
|
||||
@@ -132,6 +133,7 @@ def load_recipe(recipe_path: Path) -> dict[str, Any]:
|
||||
env (dict, optional): Environment variables to export before running
|
||||
build_args (list[str], optional): Extra args for build-and-copy.sh (e.g., ['-f', 'Dockerfile.mxfp4'])
|
||||
cluster_only (bool, optional): If True, recipe cannot run in solo mode
|
||||
solo_only (bool, optional): If True, recipe cannot run in cluster mode
|
||||
|
||||
Args:
|
||||
recipe_path: Path object pointing to YAML file or just recipe name
|
||||
@@ -175,6 +177,8 @@ def load_recipe(recipe_path: Path) -> dict[str, Any]:
|
||||
recipe.setdefault("mods", [])
|
||||
recipe.setdefault("defaults", {})
|
||||
recipe.setdefault("env", {})
|
||||
recipe.setdefault("cluster_only", False)
|
||||
recipe.setdefault("solo_only", False)
|
||||
|
||||
# Validate recipe version compatibility
|
||||
# EXTENSIBILITY: When adding new schema versions, update SUPPORTED_VERSIONS
|
||||
@@ -221,6 +225,7 @@ def list_recipes() -> None:
|
||||
model = recipe.get("model", "")
|
||||
mods = recipe.get("mods", [])
|
||||
cluster_only = recipe.get("cluster_only", False)
|
||||
solo_only = recipe.get("solo_only", False)
|
||||
|
||||
print(f" {recipe_path.name}")
|
||||
print(f" Name: {name}")
|
||||
@@ -229,7 +234,9 @@ def list_recipes() -> None:
|
||||
if model:
|
||||
print(f" Model: {model}")
|
||||
if cluster_only:
|
||||
print(f" Cluster only: Yes")
|
||||
print(" Cluster only: Yes")
|
||||
if solo_only:
|
||||
print(" Solo only: Yes")
|
||||
print(f" Container: {container}")
|
||||
if build_args:
|
||||
print(f" Build args: {' '.join(build_args)}")
|
||||
@@ -921,6 +928,7 @@ Examples:
|
||||
|
||||
# Check if recipe requires cluster mode
|
||||
cluster_only = recipe.get("cluster_only", False)
|
||||
solo_only = recipe.get("solo_only", False)
|
||||
is_solo = args.solo or not is_cluster
|
||||
|
||||
if cluster_only and is_solo:
|
||||
@@ -932,6 +940,14 @@ Examples:
|
||||
print(f" 2. Auto-discover and save: {sys.argv[0]} --discover")
|
||||
print(f" Then run: {sys.argv[0]} {args.recipe}")
|
||||
return 1
|
||||
if solo_only and not is_solo:
|
||||
print(f"Error: Recipe '{recipe['name']}' requires solo mode.")
|
||||
print("This recipe is intended to run on a single node only.")
|
||||
print()
|
||||
print("Options:")
|
||||
print(f" 1. Run solo: {sys.argv[0]} {args.recipe} --solo")
|
||||
print(f" 2. Remove nodes from .env: {sys.argv[0]} --show-env")
|
||||
return 1
|
||||
|
||||
# Determine copy targets for cluster deployments
|
||||
copy_targets = worker_nodes if is_cluster else None
|
||||
@@ -944,7 +960,9 @@ Examples:
|
||||
if model:
|
||||
print(f"Model: {model}")
|
||||
if cluster_only:
|
||||
print(f"Cluster only: Yes (model too large for single node)")
|
||||
print("Cluster only: Yes (model too large for single node)")
|
||||
if solo_only:
|
||||
print("Solo only: Yes (single node only)")
|
||||
if nodes:
|
||||
source = "(from .env)" if nodes_from_env else ""
|
||||
print(f"Nodes: {', '.join(nodes)} {source}".strip())
|
||||
|
||||
@@ -10,7 +10,8 @@
|
||||
# ./tests/test_recipes.sh -v # Verbose output
|
||||
#
|
||||
|
||||
set -e
|
||||
# Don't exit on first failure; we want a full summary.
|
||||
set +e
|
||||
|
||||
SCRIPT_DIR="$(dirname "$(realpath "$0")")"
|
||||
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
@@ -56,6 +57,83 @@ log_verbose() {
|
||||
fi
|
||||
}
|
||||
|
||||
get_recipe_flag() {
|
||||
local flag_name="$1"
|
||||
local recipe_file="$2"
|
||||
grep -E "^${flag_name}:" "$recipe_file" | awk '{print $2}'
|
||||
}
|
||||
|
||||
find_solo_recipe() {
|
||||
for recipe in "$PROJECT_DIR/recipes/"*.yaml; do
|
||||
if [[ -f "$recipe" ]]; then
|
||||
cluster_only=$(get_recipe_flag "cluster_only" "$recipe")
|
||||
if [[ "$cluster_only" == "true" ]]; then
|
||||
continue
|
||||
fi
|
||||
echo "$(basename "$recipe" .yaml)"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
find_cluster_recipe() {
|
||||
for recipe in "$PROJECT_DIR/recipes/"*.yaml; do
|
||||
if [[ -f "$recipe" ]]; then
|
||||
solo_only=$(get_recipe_flag "solo_only" "$recipe")
|
||||
if [[ "$solo_only" == "true" ]]; then
|
||||
continue
|
||||
fi
|
||||
echo "$(basename "$recipe" .yaml)"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
find_recipe_with_mods() {
|
||||
for recipe in "$PROJECT_DIR/recipes/"*.yaml; do
|
||||
if [[ -f "$recipe" ]]; then
|
||||
has_mods=$(awk '
|
||||
/^mods:/ {inmods=1; next}
|
||||
inmods && /^[[:space:]]*-[[:space:]]/ {print "yes"; exit}
|
||||
inmods && /^[^[:space:]]/ {exit}
|
||||
' "$recipe")
|
||||
if [[ "$has_mods" == "yes" ]]; then
|
||||
echo "$(basename "$recipe" .yaml)"
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
get_recipe_mode() {
|
||||
local recipe_name="$1"
|
||||
local recipe_file="$PROJECT_DIR/recipes/${recipe_name}.yaml"
|
||||
local cluster_only
|
||||
local solo_only
|
||||
cluster_only=$(get_recipe_flag "cluster_only" "$recipe_file")
|
||||
solo_only=$(get_recipe_flag "solo_only" "$recipe_file")
|
||||
if [[ "$cluster_only" == "true" ]]; then
|
||||
echo "cluster"
|
||||
elif [[ "$solo_only" == "true" ]]; then
|
||||
echo "solo"
|
||||
else
|
||||
echo "solo"
|
||||
fi
|
||||
}
|
||||
|
||||
run_recipe_dry_run() {
|
||||
local recipe_name="$1"
|
||||
local mode="$2"
|
||||
if [[ "$mode" == "cluster" ]]; then
|
||||
"$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run -n "10.0.0.1,10.0.0.2" 2>&1
|
||||
else
|
||||
"$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1
|
||||
fi
|
||||
}
|
||||
|
||||
# Check prerequisites
|
||||
check_prerequisites() {
|
||||
log_test "Checking prerequisites..."
|
||||
@@ -148,8 +226,22 @@ test_all_recipes_load() {
|
||||
for recipe in "$PROJECT_DIR/recipes/"*.yaml; do
|
||||
if [[ -f "$recipe" ]]; then
|
||||
recipe_name=$(basename "$recipe" .yaml)
|
||||
# Try to load recipe with --dry-run (will fail early if recipe is invalid)
|
||||
if ! "$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1 | grep -q "Error:"; then
|
||||
cluster_only=$(grep -E "^cluster_only:" "$recipe" | awk '{print $2}')
|
||||
solo_only=$(grep -E "^solo_only:" "$recipe" | awk '{print $2}')
|
||||
|
||||
if [[ "$cluster_only" == "true" && "$solo_only" == "true" ]]; then
|
||||
log_verbose "$recipe_name has conflicting cluster_only + solo_only"
|
||||
all_valid=false
|
||||
continue
|
||||
fi
|
||||
|
||||
if [[ "$cluster_only" == "true" ]]; then
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run -n "10.0.0.1,10.0.0.2" 2>&1 || true)
|
||||
else
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1 || true)
|
||||
fi
|
||||
|
||||
if ! echo "$output" | grep -q "Error:"; then
|
||||
log_verbose "$recipe_name loads OK"
|
||||
else
|
||||
log_verbose "$recipe_name failed to load"
|
||||
@@ -169,15 +261,13 @@ test_all_recipes_load() {
|
||||
test_dry_run_generates_script() {
|
||||
log_test "Dry-run generates valid launch script"
|
||||
|
||||
# Find first available recipe
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)
|
||||
output=$(run_recipe_dry_run "$recipe_name" "solo")
|
||||
|
||||
if echo "$output" | grep -q "#!/bin/bash" && echo "$output" | grep -q "vllm serve"; then
|
||||
log_pass "Dry-run generates bash script with vllm serve command"
|
||||
@@ -191,14 +281,13 @@ test_dry_run_generates_script() {
|
||||
test_solo_mode_tp1() {
|
||||
log_test "Solo mode sets tensor_parallel=1"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)
|
||||
output=$(run_recipe_dry_run "$recipe_name" "solo")
|
||||
|
||||
# Check that -tp 1 is in the output (solo mode should set tp=1)
|
||||
if echo "$output" | grep -q "\-tp 1"; then
|
||||
@@ -213,14 +302,13 @@ test_solo_mode_tp1() {
|
||||
test_solo_mode_removes_ray() {
|
||||
log_test "Solo mode removes --distributed-executor-backend ray"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)
|
||||
output=$(run_recipe_dry_run "$recipe_name" "solo")
|
||||
|
||||
# Check that --distributed-executor-backend is NOT in the output
|
||||
if ! echo "$output" | grep -q "\-\-distributed-executor-backend"; then
|
||||
@@ -256,13 +344,12 @@ test_cluster_mode_keeps_ray() {
|
||||
test_cli_override_port() {
|
||||
log_test "CLI override --port works"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --port 9999 2>&1)
|
||||
|
||||
if echo "$output" | grep -q "\-\-port 9999"; then
|
||||
@@ -375,6 +462,82 @@ EOF
|
||||
fi
|
||||
}
|
||||
|
||||
# Test: solo_only recipe fails in cluster mode
|
||||
test_solo_only_fails_cluster() {
|
||||
log_test "solo_only recipe fails in cluster mode"
|
||||
|
||||
# Create a temporary solo_only recipe
|
||||
temp_recipe=$(mktemp)
|
||||
cat > "$temp_recipe" << 'EOF'
|
||||
recipe_version: "1"
|
||||
name: Solo Only Test
|
||||
container: test-container
|
||||
solo_only: true
|
||||
command: echo "test"
|
||||
EOF
|
||||
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run -n "10.0.0.1,10.0.0.2" 2>&1 || true)
|
||||
rm -f "$temp_recipe"
|
||||
|
||||
if echo "$output" | grep -q "requires solo mode"; then
|
||||
log_pass "solo_only recipe correctly fails in cluster mode"
|
||||
else
|
||||
log_fail "solo_only recipe did not fail in cluster mode"
|
||||
log_verbose "$output"
|
||||
fi
|
||||
}
|
||||
|
||||
# Test: solo_only recipe succeeds in solo mode
|
||||
test_solo_only_allows_solo() {
|
||||
log_test "solo_only recipe succeeds in solo mode"
|
||||
|
||||
temp_recipe=$(mktemp)
|
||||
cat > "$temp_recipe" << 'EOF'
|
||||
recipe_version: "1"
|
||||
name: Solo Only Test
|
||||
container: test-container
|
||||
solo_only: true
|
||||
command: echo "test"
|
||||
EOF
|
||||
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run --solo 2>&1 || true)
|
||||
rm -f "$temp_recipe"
|
||||
|
||||
if ! echo "$output" | grep -q "Error: Recipe"; then
|
||||
log_pass "solo_only recipe runs in solo mode"
|
||||
else
|
||||
log_fail "solo_only recipe failed in solo mode"
|
||||
log_verbose "$output"
|
||||
fi
|
||||
}
|
||||
|
||||
# Test: cluster_only and solo_only both true fails in any mode
|
||||
test_conflicting_mode_flags_fail() {
|
||||
log_test "cluster_only and solo_only both true fails"
|
||||
|
||||
temp_recipe=$(mktemp)
|
||||
cat > "$temp_recipe" << 'EOF'
|
||||
recipe_version: "1"
|
||||
name: Conflicting Mode Test
|
||||
container: test-container
|
||||
cluster_only: true
|
||||
solo_only: true
|
||||
command: echo "test"
|
||||
EOF
|
||||
|
||||
output_solo=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run --solo 2>&1 || true)
|
||||
output_cluster=$("$PROJECT_DIR/run-recipe.py" "$temp_recipe" --dry-run -n "10.0.0.1,10.0.0.2" 2>&1 || true)
|
||||
rm -f "$temp_recipe"
|
||||
|
||||
if echo "$output_solo" | grep -q "requires cluster mode" && echo "$output_cluster" | grep -q "requires solo mode"; then
|
||||
log_pass "Conflicting flags correctly fail in both modes"
|
||||
else
|
||||
log_fail "Conflicting flags did not fail as expected"
|
||||
log_verbose "solo: $output_solo"
|
||||
log_verbose "cluster: $output_cluster"
|
||||
fi
|
||||
}
|
||||
|
||||
# ==============================================================================
|
||||
# Launch-cluster.sh Command Line Verification Tests
|
||||
# ==============================================================================
|
||||
@@ -390,14 +553,13 @@ extract_launch_cmd() {
|
||||
test_launch_cmd_solo_flag() {
|
||||
log_test "Launch command includes --solo flag in solo mode"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)
|
||||
output=$(run_recipe_dry_run "$recipe_name" "solo")
|
||||
launch_cmd=$(extract_launch_cmd "$output")
|
||||
|
||||
if echo "$launch_cmd" | grep -q "\-\-solo"; then
|
||||
@@ -449,13 +611,14 @@ test_launch_cmd_container_image() {
|
||||
test_launch_cmd_mods() {
|
||||
log_test "Launch command includes --apply-mod for recipe mods"
|
||||
|
||||
# Use glm-4.7-flash-awq which has a mod
|
||||
if [[ ! -f "$PROJECT_DIR/recipes/glm-4.7-flash-awq.yaml" ]]; then
|
||||
log_skip "glm-4.7-flash-awq.yaml not found"
|
||||
recipe_name=$(find_recipe_with_mods)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No recipes with mods found"
|
||||
return
|
||||
fi
|
||||
|
||||
output=$("$PROJECT_DIR/run-recipe.py" glm-4.7-flash-awq --dry-run --solo 2>&1)
|
||||
mode=$(get_recipe_mode "$recipe_name")
|
||||
output=$(run_recipe_dry_run "$recipe_name" "$mode")
|
||||
launch_cmd=$(extract_launch_cmd "$output")
|
||||
|
||||
if echo "$launch_cmd" | grep -q "\-\-apply-mod"; then
|
||||
@@ -470,13 +633,12 @@ test_launch_cmd_mods() {
|
||||
test_launch_cmd_daemon_flag() {
|
||||
log_test "Launch command includes -d flag in daemon mode"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -d 2>&1)
|
||||
launch_cmd=$(extract_launch_cmd "$output")
|
||||
|
||||
@@ -492,13 +654,12 @@ test_launch_cmd_daemon_flag() {
|
||||
test_launch_cmd_nccl_debug() {
|
||||
log_test "Launch command includes --nccl-debug when specified"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --nccl-debug INFO 2>&1)
|
||||
launch_cmd=$(extract_launch_cmd "$output")
|
||||
|
||||
@@ -514,14 +675,13 @@ test_launch_cmd_nccl_debug() {
|
||||
test_launch_cmd_launch_script() {
|
||||
log_test "Launch command includes --launch-script"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)
|
||||
output=$(run_recipe_dry_run "$recipe_name" "solo")
|
||||
launch_cmd=$(extract_launch_cmd "$output")
|
||||
|
||||
if echo "$launch_cmd" | grep -q "\-\-launch-script"; then
|
||||
@@ -536,13 +696,12 @@ test_launch_cmd_launch_script() {
|
||||
test_launch_cmd_container_override() {
|
||||
log_test "CLI container override (-t) takes precedence"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -t my-custom-image 2>&1)
|
||||
launch_cmd=$(extract_launch_cmd "$output")
|
||||
|
||||
@@ -596,7 +755,8 @@ verify_recipe_args() {
|
||||
return
|
||||
fi
|
||||
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)
|
||||
mode=$(get_recipe_mode "$recipe_name")
|
||||
output=$(run_recipe_dry_run "$recipe_name" "$mode")
|
||||
vllm_cmd=$(extract_vllm_command "$output")
|
||||
launch_cmd=$(extract_launch_cmd "$output")
|
||||
|
||||
@@ -679,7 +839,8 @@ test_readme_glm_flash_mod() {
|
||||
return
|
||||
fi
|
||||
|
||||
output=$("$PROJECT_DIR/run-recipe.py" glm-4.7-flash-awq --dry-run --solo 2>&1)
|
||||
mode=$(get_recipe_mode "glm-4.7-flash-awq")
|
||||
output=$(run_recipe_dry_run "glm-4.7-flash-awq" "$mode")
|
||||
launch_cmd=$(extract_launch_cmd "$output")
|
||||
|
||||
if echo "$launch_cmd" | grep -q "$GLM_FLASH_AWQ_MOD"; then
|
||||
@@ -780,13 +941,12 @@ test_readme_glm_flash_cluster() {
|
||||
test_extra_args_load_format() {
|
||||
log_test "Extra args: --load-format safetensors"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --load-format safetensors 2>&1)
|
||||
|
||||
if echo "$output" | grep -q "\-\-load-format safetensors"; then
|
||||
@@ -801,13 +961,12 @@ test_extra_args_load_format() {
|
||||
test_extra_args_served_model_name() {
|
||||
log_test "Extra args: --served-model-name custom-api-name"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --served-model-name custom-api-name 2>&1)
|
||||
|
||||
if echo "$output" | grep -q "\-\-served-model-name custom-api-name"; then
|
||||
@@ -822,13 +981,12 @@ test_extra_args_served_model_name() {
|
||||
test_extra_args_equals_syntax() {
|
||||
log_test "Extra args: -cc.cudagraph_mode=PIECEWISE (equals syntax)"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- -cc.cudagraph_mode=PIECEWISE 2>&1)
|
||||
|
||||
if echo "$output" | grep -q "\-cc.cudagraph_mode=PIECEWISE"; then
|
||||
@@ -843,13 +1001,12 @@ test_extra_args_equals_syntax() {
|
||||
test_extra_args_multiple() {
|
||||
log_test "Extra args: multiple arguments"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --load-format auto --enforce-eager --seed 42 2>&1)
|
||||
|
||||
local all_found=true
|
||||
@@ -875,13 +1032,12 @@ test_extra_args_multiple() {
|
||||
test_extra_args_empty() {
|
||||
log_test "Extra args: empty (just --)"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
# Should not error with just --
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- 2>&1)
|
||||
exit_code=$?
|
||||
@@ -898,13 +1054,12 @@ test_extra_args_empty() {
|
||||
test_extra_args_duplicate_port_warning() {
|
||||
log_test "Extra args: duplicate --port shows warning"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
# Pass --port via shorthand AND via extra args - should warn
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --port 8080 -- --port 9999 2>&1)
|
||||
|
||||
@@ -920,13 +1075,12 @@ test_extra_args_duplicate_port_warning() {
|
||||
test_extra_args_duplicate_gpu_mem_warning() {
|
||||
log_test "Extra args: duplicate --gpu-memory-utilization shows warning"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
# Pass --gpu-mem via shorthand AND via extra args - should warn
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --gpu-mem 0.8 -- --gpu-memory-utilization 0.95 2>&1)
|
||||
|
||||
@@ -942,13 +1096,12 @@ test_extra_args_duplicate_gpu_mem_warning() {
|
||||
test_extra_args_duplicate_tp_warning() {
|
||||
log_test "Extra args: duplicate --tensor-parallel-size shows warning"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
# Pass --tp via shorthand AND via extra args - should warn
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --tp 2 -- --tensor-parallel-size 4 2>&1)
|
||||
|
||||
@@ -964,13 +1117,12 @@ test_extra_args_duplicate_tp_warning() {
|
||||
test_extra_args_ordering() {
|
||||
log_test "Extra args: appear at end of vllm command"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_solo_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No solo-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --my-custom-arg value 2>&1)
|
||||
vllm_cmd=$(extract_vllm_command "$output")
|
||||
|
||||
@@ -993,13 +1145,12 @@ test_extra_args_ordering() {
|
||||
test_extra_args_cluster_mode() {
|
||||
log_test "Extra args: work in cluster mode"
|
||||
|
||||
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||
if [[ -z "$first_recipe" ]]; then
|
||||
log_skip "No recipes found"
|
||||
recipe_name=$(find_cluster_recipe)
|
||||
if [[ -z "$recipe_name" ]]; then
|
||||
log_skip "No cluster-capable recipes found"
|
||||
return
|
||||
fi
|
||||
|
||||
recipe_name=$(basename "$first_recipe" .yaml)
|
||||
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run -n "10.0.0.1,10.0.0.2" -- --load-format auto 2>&1)
|
||||
|
||||
if echo "$output" | grep -q "\-\-load-format auto"; then
|
||||
@@ -1092,6 +1243,9 @@ main() {
|
||||
test_unsupported_recipe_version
|
||||
test_missing_recipe_version_fails
|
||||
test_cluster_only_fails_solo
|
||||
test_solo_only_fails_cluster
|
||||
test_solo_only_allows_solo
|
||||
test_conflicting_mode_flags_fail
|
||||
echo ""
|
||||
|
||||
# Summary
|
||||
|
||||
Reference in New Issue
Block a user