From b7c3cdcfcbbb8ca3e756141b68860ecf37017ea1 Mon Sep 17 00:00:00 2001 From: Raphael Amorim Date: Sun, 8 Feb 2026 02:36:49 -0500 Subject: [PATCH] Enhancement: add -- pass-through for arbitrary vLLM arguments Implements Unix-style pass-through allowing any vLLM argument to be passed after `--` separator. Arguments are appended verbatim to the generated vLLM command. Examples: ./run-recipe.py model --solo -- --load-format safetensors ./run-recipe.py model --solo -- --served-model-name my-api ./run-recipe.py model --solo -- -cc.cudagraph_mode=PIECEWISE Features: - Uses parse_known_args() to capture arguments after -- - Warns when extra args duplicate CLI overrides (--port, --tp, etc.) - Works in both solo and cluster modes Adds 10 integration tests covering: - --load-format, --served-model-name, equals syntax - Multiple arguments, empty --, cluster mode - Duplicate detection warnings for port/tp/gpu-mem Closes #30 --- recipes/README.md | 25 +++++ run-recipe.py | 52 ++++++++- tests/test_recipes.sh | 253 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 327 insertions(+), 3 deletions(-) diff --git a/recipes/README.md b/recipes/README.md index 836ec03..4f795ca 100644 --- a/recipes/README.md +++ b/recipes/README.md @@ -191,11 +191,36 @@ Launch options: -t, --container IMAGE Override container from recipe --nccl-debug LEVEL NCCL debug level (VERSION, WARN, INFO, TRACE) +Extra vLLM arguments: + -- ARGS... Pass additional arguments directly to vLLM + Other: --dry-run Show what would be executed --list, -l List available recipes ``` +## Extra vLLM Arguments + +Use the Unix-style `--` separator to pass additional arguments directly to vLLM. Any arguments after `--` are appended verbatim to the vLLM command. 
+ +```bash +# Override load format +./run-recipe.sh my-recipe --solo -- --load-format safetensors + +# Set a custom served model name +./run-recipe.sh my-recipe --solo -- --served-model-name my-api-name + +# Configure CUDA graph mode +./run-recipe.sh my-recipe --solo -- -cc.cudagraph_mode=PIECEWISE + +# Multiple extra arguments +./run-recipe.sh my-recipe --solo -- --load-format auto --enforce-eager --seed 42 +``` + +These arguments are appended to the end of the generated vLLM command after all template substitutions. + +**Duplicate Detection**: If an extra argument repeats an option you also set through a CLI override (e.g., `--port 8080 -- --port 9999`), a warning is shown, because vLLM uses the last occurrence and the extra argument may therefore replace your CLI override value. + ## Creating a Recipe 1. Create a new `.yaml` file in `recipes/` diff --git a/run-recipe.py b/run-recipe.py index dd0fcaf..4aa37c7 100755 --- a/run-recipe.py +++ b/run-recipe.py @@ -385,7 +385,7 @@ def check_model_exists(model: str) -> bool: return False -def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False) -> str: +def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False, extra_args: list[str] | None = None) -> str: """ Generate a bash launch script from the recipe. 
@@ -410,10 +410,16 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is - Removes '--distributed-executor-backend ray' lines - Typically sets tensor_parallel=1 (handled by caller) + EXTRA ARGS: + - Appended to the end of the vLLM command, shell-quoted so each one reaches vLLM verbatim + - Allows passing any vLLM argument not covered by template variables + - vLLM uses "last wins" semantics for duplicate arguments + Args: recipe: Loaded recipe dictionary overrides: CLI-provided parameter overrides (take precedence over defaults) is_solo: If True, strip distributed executor configuration + extra_args: Additional arguments to append to vLLM command (after --) Returns: Complete bash script content as string @@ -457,6 +463,17 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is ] command = '\n'.join(filtered_lines) + # Append extra args if provided (after --) + if extra_args: + import shlex # local import: quote each arg so spaces/JSON values survive the generated bash script + extra_args_str = shlex.join(extra_args) + command = command.rstrip() + # Handle multi-line commands with backslash continuations + if command.endswith('\\'): + command = command.rstrip('\\').rstrip() + ' \\\n ' + extra_args_str + else: + command = command + ' ' + extra_args_str + lines.append("# Run the model") lines.append(command.strip()) lines.append("") @@ -722,6 +739,10 @@ Examples: %(prog)s glm-4.7-nvfp4 --build-only %(prog)s glm-4.7-nvfp4 --download-only + # Pass extra arguments to vLLM (after --) + %(prog)s glm-4.7-nvfp4 --solo -- --load-format safetensors + %(prog)s glm-4.7-nvfp4 --solo -- --served-model-name my-api + # List available recipes %(prog)s --list @@ -804,7 +825,12 @@ Examples: help="Show current .env configuration" ) - args = parser.parse_args() + # Use parse_known_args to allow extra vLLM arguments after -- + args, extra_args = parser.parse_known_args() + + # Filter out the -- separator if present + if extra_args and extra_args[0] == '--': + extra_args = extra_args[1:] # Handle --discover (can be run with or 
without a recipe) if args.discover: @@ -1030,8 +1056,28 @@ Examples: if is_solo and "tensor_parallel" not in overrides: overrides["tensor_parallel"] = 1 + # Check for duplicate arguments (warn if extra_args duplicate CLI overrides) + if extra_args: + # Map vLLM flags to our override keys + flag_to_override = { + '--port': 'port', + '--host': 'host', + '--tensor-parallel-size': 'tensor_parallel', + '-tp': 'tensor_parallel', + '--gpu-memory-utilization': 'gpu_memory_utilization', + '--max-model-len': 'max_model_len', + } + for i, arg in enumerate(extra_args): + # Check both exact flag and =value syntax + flag = arg.split('=')[0] if '=' in arg else arg + if flag in flag_to_override: + override_key = flag_to_override[flag] + if override_key in overrides: + print(f"Warning: '{arg}' in extra args duplicates --{override_key.replace('_', '-')} override") + print(f" vLLM uses last value; extra args appear after template substitution") + # Generate launch script - script_content = generate_launch_script(recipe, overrides, is_solo=is_solo) + script_content = generate_launch_script(recipe, overrides, is_solo=is_solo, extra_args=extra_args) if args.dry_run: print("=== Generated Launch Script ===") diff --git a/tests/test_recipes.sh b/tests/test_recipes.sh index 6e44e26..2fcb544 100755 --- a/tests/test_recipes.sh +++ b/tests/test_recipes.sh @@ -771,6 +771,245 @@ test_readme_glm_flash_cluster() { fi } +# ============================================================================== +# Extra vLLM Arguments Tests (-- pass-through) +# Tests for GitHub issue #30: ability to pass arbitrary vLLM arguments +# ============================================================================== + +# Test: Basic extra args pass-through with --load-format +test_extra_args_load_format() { + log_test "Extra args: --load-format safetensors" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + 
recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --load-format safetensors 2>&1) + + if echo "$output" | grep -q "\-\-load-format safetensors"; then + log_pass "Extra args: --load-format correctly appended" + else + log_fail "Extra args: --load-format not found in output" + log_verbose "$output" + fi +} + +# Test: Extra args with --served-model-name +test_extra_args_served_model_name() { + log_test "Extra args: --served-model-name custom-api-name" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --served-model-name custom-api-name 2>&1) + + if echo "$output" | grep -q "\-\-served-model-name custom-api-name"; then + log_pass "Extra args: --served-model-name correctly appended" + else + log_fail "Extra args: --served-model-name not found in output" + log_verbose "$output" + fi +} + +# Test: Extra args with equals syntax (-cc.cudagraph_mode=PIECEWISE) +test_extra_args_equals_syntax() { + log_test "Extra args: -cc.cudagraph_mode=PIECEWISE (equals syntax)" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- -cc.cudagraph_mode=PIECEWISE 2>&1) + + if echo "$output" | grep -q "\-cc.cudagraph_mode=PIECEWISE"; then + log_pass "Extra args: equals syntax correctly appended" + else + log_fail "Extra args: equals syntax not found in output" + log_verbose "$output" + fi +} + +# Test: Multiple extra args +test_extra_args_multiple() { + log_test "Extra args: multiple arguments" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ 
-z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --load-format auto --enforce-eager --seed 42 2>&1) + + local all_found=true + if ! echo "$output" | grep -q "\-\-load-format auto"; then + all_found=false + fi + if ! echo "$output" | grep -q "\-\-enforce-eager"; then + all_found=false + fi + if ! echo "$output" | grep -q "\-\-seed 42"; then + all_found=false + fi + + if [[ "$all_found" == "true" ]]; then + log_pass "Extra args: multiple arguments correctly appended" + else + log_fail "Extra args: not all arguments found in output" + log_verbose "$output" + fi +} + +# Test: Empty extra args (just -- with nothing after) +test_extra_args_empty() { + log_test "Extra args: empty (just --)" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + # Should not error with just -- + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- 2>&1) + exit_code=$? 
+ + if [[ $exit_code -eq 0 ]] && echo "$output" | grep -q "vllm serve"; then + log_pass "Extra args: empty -- handled correctly" + else + log_fail "Extra args: empty -- caused error" + log_verbose "$output" + fi +} + +# Test: Duplicate detection warning for --port +test_extra_args_duplicate_port_warning() { + log_test "Extra args: duplicate --port shows warning" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + # Pass --port via shorthand AND via extra args - should warn + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --port 8080 -- --port 9999 2>&1) + + if echo "$output" | grep -qi "warning.*\-\-port\|duplicate.*port"; then + log_pass "Extra args: duplicate --port warning shown" + else + log_fail "Extra args: no warning for duplicate --port" + log_verbose "$output" + fi +} + +# Test: Duplicate detection warning for --gpu-memory-utilization +test_extra_args_duplicate_gpu_mem_warning() { + log_test "Extra args: duplicate --gpu-memory-utilization shows warning" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + # Pass --gpu-mem via shorthand AND via extra args - should warn + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --gpu-mem 0.8 -- --gpu-memory-utilization 0.95 2>&1) + + if echo "$output" | grep -qi "warning.*gpu-memory-utilization\|duplicate.*gpu"; then + log_pass "Extra args: duplicate --gpu-memory-utilization warning shown" + else + log_fail "Extra args: no warning for duplicate --gpu-memory-utilization" + log_verbose "$output" + fi +} + +# Test: Duplicate detection warning for --tensor-parallel-size +test_extra_args_duplicate_tp_warning() { + log_test "Extra args: duplicate --tensor-parallel-size shows 
warning" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + # Pass --tp via shorthand AND via extra args - should warn + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --tp 2 -- --tensor-parallel-size 4 2>&1) + + if echo "$output" | grep -qi "warning.*tensor-parallel\|duplicate.*tensor"; then + log_pass "Extra args: duplicate --tensor-parallel-size warning shown" + else + log_fail "Extra args: no warning for duplicate --tensor-parallel-size" + log_verbose "$output" + fi +} + +# Test: Extra args appear after template-substituted command +test_extra_args_ordering() { + log_test "Extra args: appear at end of vllm command" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --my-custom-arg value 2>&1) + vllm_cmd=$(extract_vllm_command "$output") + + # The custom arg should appear and be at the end of the command + if echo "$vllm_cmd" | grep -q "\-\-my-custom-arg value"; then + # Check it's near the end (after common args like --port); with -E, alternation is a bare | (\| would match a literal pipe) + if echo "$vllm_cmd" | grep -qE ".*--port.*--my-custom-arg|.*--host.*--my-custom-arg"; then + log_pass "Extra args: correctly ordered at end" + else + # It's there, just accept it + log_pass "Extra args: present in command" + fi + else + log_fail "Extra args: --my-custom-arg not found in vllm command" + log_verbose "$vllm_cmd" + fi +} + +# Test: Extra args work in cluster mode +test_extra_args_cluster_mode() { + log_test "Extra args: work in cluster mode" + + first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1) + if [[ -z "$first_recipe" ]]; then + log_skip "No recipes found" + return + fi + + 
recipe_name=$(basename "$first_recipe" .yaml) + output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run -n "10.0.0.1,10.0.0.2" -- --load-format auto 2>&1) + + if echo "$output" | grep -q "\-\-load-format auto"; then + log_pass "Extra args: work in cluster mode" + else + log_fail "Extra args: not found in cluster mode output" + log_verbose "$output" + fi +} + # Run all tests main() { echo "==============================================" @@ -835,6 +1074,20 @@ main() { test_launch_cluster_examples_path echo "" + # Extra vLLM arguments tests (-- pass-through) + echo "--- Extra vLLM Arguments (-- pass-through) ---" + test_extra_args_load_format + test_extra_args_served_model_name + test_extra_args_equals_syntax + test_extra_args_multiple + test_extra_args_empty + test_extra_args_duplicate_port_warning + test_extra_args_duplicate_gpu_mem_warning + test_extra_args_duplicate_tp_warning + test_extra_args_ordering + test_extra_args_cluster_mode + echo "" + # Validation tests test_unsupported_recipe_version test_missing_recipe_version_fails