From b7c3cdcfcbbb8ca3e756141b68860ecf37017ea1 Mon Sep 17 00:00:00 2001
From: Raphael Amorim <raphael.amorim@gmail.com>
Date: Sun, 8 Feb 2026 02:36:49 -0500
Subject: [PATCH] Enhancement: add -- pass-through for arbitrary vLLM arguments

Implements a Unix-style pass-through: any vLLM argument given after the
`--` separator is appended verbatim to the end of the generated vLLM
command.

Examples:
  ./run-recipe.py model --solo -- --load-format safetensors
  ./run-recipe.py model --solo -- --served-model-name my-api
  ./run-recipe.py model --solo -- -cc.cudagraph_mode=PIECEWISE

Features:
- Uses parse_known_args() to capture arguments after --
- Warns when extra args duplicate CLI overrides (--port, --tp, etc.)
- Works in both solo and cluster modes

Adds 10 integration tests covering:
- --load-format, --served-model-name, equals syntax
- Multiple arguments, empty --, cluster mode
- Duplicate detection warnings for port/tp/gpu-mem

Closes #30
---
 recipes/README.md     |  25 +++++
 run-recipe.py         |  52 ++++++++-
 tests/test_recipes.sh | 253 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 327 insertions(+), 3 deletions(-)

diff --git a/recipes/README.md b/recipes/README.md
index 836ec03..4f795ca 100644
--- a/recipes/README.md
+++ b/recipes/README.md
@@ -191,11 +191,36 @@ Launch options:
   -t, --container IMAGE       Override container from recipe
   --nccl-debug LEVEL          NCCL debug level (VERSION, WARN, INFO, TRACE)
 
+Extra vLLM arguments:
+  -- ARGS...                  Pass additional arguments directly to vLLM
+
 Other:
   --dry-run                   Show what would be executed
   --list, -l                  List available recipes
 ```
 
+## Extra vLLM Arguments
+
+Use the Unix-style `--` separator to pass additional arguments directly to vLLM. Any arguments after `--` are appended verbatim to the vLLM command.
+
+```bash
+# Override load format
+./run-recipe.sh my-recipe --solo -- --load-format safetensors
+
+# Set a custom served model name
+./run-recipe.sh my-recipe --solo -- --served-model-name my-api-name
+
+# Configure CUDA graph mode
+./run-recipe.sh my-recipe --solo -- -cc.cudagraph_mode=PIECEWISE
+
+# Multiple extra arguments
+./run-recipe.sh my-recipe --solo -- --load-format auto --enforce-eager --seed 42
+```
+
+These arguments are appended to the end of the generated vLLM command after all template substitutions.
+
+**Duplicate Detection**: If an extra argument sets the same option as one of the script's own CLI overrides (e.g., `--port 8080 -- --port 9999`), a warning is shown, because vLLM uses the last occurrence and the value given after `--` may therefore replace your CLI override.
+
 ## Creating a Recipe
 
 1. Create a new `.yaml` file in `recipes/`
diff --git a/run-recipe.py b/run-recipe.py
index dd0fcaf..4aa37c7 100755
--- a/run-recipe.py
+++ b/run-recipe.py
@@ -385,7 +385,7 @@ def check_model_exists(model: str) -> bool:
     return False
 
 
-def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False) -> str:
+def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False, extra_args: list[str] | None = None) -> str:
     """
     Generate a bash launch script from the recipe.
     
@@ -410,10 +410,16 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
         - Removes '--distributed-executor-backend ray' lines
         - Typically sets tensor_parallel=1 (handled by caller)
     
+    EXTRA ARGS:
+        - Appended verbatim to the end of the vLLM command
+        - Allows passing any vLLM argument not covered by template variables
+        - vLLM uses "last wins" semantics for duplicate arguments
+    
     Args:
         recipe: Loaded recipe dictionary
         overrides: CLI-provided parameter overrides (take precedence over defaults)
         is_solo: If True, strip distributed executor configuration
+        extra_args: Additional arguments to append to vLLM command (after --)
         
     Returns:
         Complete bash script content as string
@@ -457,6 +463,17 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
         ]
         command = '\n'.join(filtered_lines)
     
+    # Append pass-through vLLM arguments (everything the user supplied after --)
+    if extra_args:
+        # Shell-quote each argument: the command lands in a bash script, so raw spaces/quotes/$/; would be re-parsed
+        from shlex import join as shell_join  # function-scope import; binds only this alias (Python 3.8+)
+        extra_args_str = shell_join(extra_args)
+        command = command.rstrip()
+        if command.endswith('\\'):  # dangling line continuation: re-emit it and put the extras on the next line
+            command = command.rstrip('\\').rstrip() + ' \\\n    ' + extra_args_str
+        else:
+            command = command + ' ' + extra_args_str
+    
     lines.append("# Run the model")
     lines.append(command.strip())
     lines.append("")
@@ -722,6 +739,10 @@ Examples:
   %(prog)s glm-4.7-nvfp4 --build-only
   %(prog)s glm-4.7-nvfp4 --download-only
 
+  # Pass extra arguments to vLLM (after --)
+  %(prog)s glm-4.7-nvfp4 --solo -- --load-format safetensors
+  %(prog)s glm-4.7-nvfp4 --solo -- --served-model-name my-api
+
   # List available recipes
   %(prog)s --list
 
@@ -804,7 +825,12 @@ Examples:
         help="Show current .env configuration"
     )
     
-    args = parser.parse_args()
+    # Only what follows `--` goes to vLLM; unknown flags before it (e.g. a mistyped --dry-run) are errors
+    args, leftovers = parser.parse_known_args()
+    sep = leftovers.index('--') if '--' in leftovers else len(leftovers)
+    extra_args = leftovers[sep + 1:]
+    if leftovers[:sep]:
+        parser.error(f"unrecognized arguments: {' '.join(leftovers[:sep])}")
     
     # Handle --discover (can be run with or without a recipe)
     if args.discover:
@@ -1030,8 +1056,28 @@ Examples:
     if is_solo and "tensor_parallel" not in overrides:
         overrides["tensor_parallel"] = 1
     
+    # Warn when a pass-through flag repeats an option that already has a value in `overrides`
+    if extra_args:
+        # vLLM flag spellings -> the override key that controls the same setting
+        flag_to_override = {
+            '--port': 'port',
+            '--host': 'host',
+            '--tensor-parallel-size': 'tensor_parallel',
+            '-tp': 'tensor_parallel',
+            '--gpu-memory-utilization': 'gpu_memory_utilization',
+            '--max-model-len': 'max_model_len',
+        }
+        for arg in extra_args:
+            # Handles both "--flag value" and "--flag=value": only the text before '=' is compared
+            flag = arg.partition('=')[0]
+            override_key = flag_to_override.get(flag)
+            # NOTE: `overrides` can also contain values set by this script (e.g. tensor_parallel in solo mode)
+            if override_key in overrides:
+                print(f"Warning: '{arg}' in extra args duplicates --{override_key.replace('_', '-')} override")
+                print("         vLLM uses last value; extra args appear after template substitution")
+    
     # Generate launch script
-    script_content = generate_launch_script(recipe, overrides, is_solo=is_solo)
+    script_content = generate_launch_script(recipe, overrides, is_solo=is_solo, extra_args=extra_args)
     
     if args.dry_run:
         print("=== Generated Launch Script ===")
diff --git a/tests/test_recipes.sh b/tests/test_recipes.sh
index 6e44e26..2fcb544 100755
--- a/tests/test_recipes.sh
+++ b/tests/test_recipes.sh
@@ -771,6 +771,245 @@ test_readme_glm_flash_cluster() {
     fi
 }
 
+# ==============================================================================
+# Extra vLLM Arguments Tests (-- pass-through)
+# Tests for GitHub issue #30: ability to pass arbitrary vLLM arguments
+# ==============================================================================
+
+# Test: a flag/value pair given after -- (--load-format safetensors) shows up in the dry-run output
+test_extra_args_load_format() {
+    log_test "Extra args: --load-format safetensors"
+
+    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
+    if [[ -z "$first_recipe" ]]; then
+        log_skip "No recipes found"
+        return
+    fi
+
+    recipe_name=$(basename "$first_recipe" .yaml)
+    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --load-format safetensors 2>&1)
+    # -F matches the text literally; the bare -- stops grep from parsing the pattern as an option
+    if echo "$output" | grep -qF -- "--load-format safetensors"; then
+        log_pass "Extra args: --load-format correctly appended"
+    else
+        log_fail "Extra args: --load-format not found in output"
+        log_verbose "$output"
+    fi
+}
+
+# Test: --served-model-name with a custom value is forwarded after --
+test_extra_args_served_model_name() {
+    log_test "Extra args: --served-model-name custom-api-name"
+
+    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
+    if [[ -z "$first_recipe" ]]; then
+        log_skip "No recipes found"
+        return
+    fi
+
+    recipe_name=$(basename "$first_recipe" .yaml)
+    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --served-model-name custom-api-name 2>&1)
+    # -F matches the text literally; the bare -- stops grep from parsing the pattern as an option
+    if echo "$output" | grep -qF -- "--served-model-name custom-api-name"; then
+        log_pass "Extra args: --served-model-name correctly appended"
+    else
+        log_fail "Extra args: --served-model-name not found in output"
+        log_verbose "$output"
+    fi
+}
+
+# Test: a single-token flag=value argument (-cc.cudagraph_mode=PIECEWISE) is forwarded unchanged
+test_extra_args_equals_syntax() {
+    log_test "Extra args: -cc.cudagraph_mode=PIECEWISE (equals syntax)"
+
+    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
+    if [[ -z "$first_recipe" ]]; then
+        log_skip "No recipes found"
+        return
+    fi
+
+    recipe_name=$(basename "$first_recipe" .yaml)
+    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- -cc.cudagraph_mode=PIECEWISE 2>&1)
+    # -F keeps the '.' literal (no wildcard); the bare -- lets the pattern start with a dash
+    if echo "$output" | grep -qF -- "-cc.cudagraph_mode=PIECEWISE"; then
+        log_pass "Extra args: equals syntax correctly appended"
+    else
+        log_fail "Extra args: equals syntax not found in output"
+        log_verbose "$output"
+    fi
+}
+
+# Test: several extra arguments passed together are all forwarded
+test_extra_args_multiple() {
+    log_test "Extra args: multiple arguments"
+
+    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
+    if [[ -z "$first_recipe" ]]; then
+        log_skip "No recipes found"
+        return
+    fi
+
+    recipe_name=$(basename "$first_recipe" .yaml)
+    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --load-format auto --enforce-eager --seed 42 2>&1)
+
+    # Every expected fragment is matched literally (-F); the bare -- stops grep from
+    # treating a pattern that starts with dashes as one of its own options.
+    local all_found=true
+    local expected
+    for expected in "--load-format auto" "--enforce-eager" "--seed 42"; do
+        if ! echo "$output" | grep -qF -- "$expected"; then
+            all_found=false
+            log_verbose "Missing extra arg in output: $expected"
+        fi
+    done
+
+    if [[ "$all_found" == "true" ]]; then
+        log_pass "Extra args: multiple arguments correctly appended"
+    else
+        log_fail "Extra args: not all arguments found in output"
+        log_verbose "$output"
+    fi
+}
+
+# Test: a bare `--` followed by nothing must not break script generation
+test_extra_args_empty() {
+    log_test "Extra args: empty (just --)"
+
+    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
+    if [[ -z "$first_recipe" ]]; then
+        log_skip "No recipes found"
+        return
+    fi
+
+    recipe_name=$(basename "$first_recipe" .yaml)
+    # Record the exit status immediately; any later command would overwrite $?
+    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- 2>&1)
+    exit_code=$?
+
+    if [[ $exit_code -ne 0 ]] || ! echo "$output" | grep -q "vllm serve"; then
+        log_fail "Extra args: empty -- caused error"
+        log_verbose "$output"
+    else
+        log_pass "Extra args: empty -- handled correctly"
+    fi
+}
+
+# Test: --port given both as a CLI override and after -- produces a duplicate warning
+test_extra_args_duplicate_port_warning() {
+    log_test "Extra args: duplicate --port shows warning"
+
+    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
+    if [[ -z "$first_recipe" ]]; then
+        log_skip "No recipes found"
+        return
+    fi
+
+    recipe_name=$(basename "$first_recipe" .yaml)
+    # Pass --port as a CLI override AND via extra args - should warn
+    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --port 8080 -- --port 9999 2>&1)
+    # -E gives portable '|' alternation; dashes need no escaping inside the pattern
+    if echo "$output" | grep -qiE "warning.*--port|duplicate.*port"; then
+        log_pass "Extra args: duplicate --port warning shown"
+    else
+        log_fail "Extra args: no warning for duplicate --port"
+        log_verbose "$output"
+    fi
+}
+
+# Test: --gpu-mem as a CLI override plus --gpu-memory-utilization after -- produces a warning
+test_extra_args_duplicate_gpu_mem_warning() {
+    log_test "Extra args: duplicate --gpu-memory-utilization shows warning"
+
+    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
+    if [[ -z "$first_recipe" ]]; then
+        log_skip "No recipes found"
+        return
+    fi
+
+    recipe_name=$(basename "$first_recipe" .yaml)
+    # Pass --gpu-mem as a CLI override AND the vLLM flag via extra args - should warn
+    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --gpu-mem 0.8 -- --gpu-memory-utilization 0.95 2>&1)
+    # -E gives portable '|' alternation (backslash-pipe in a basic regex is a GNU extension)
+    if echo "$output" | grep -qiE "warning.*gpu-memory-utilization|duplicate.*gpu"; then
+        log_pass "Extra args: duplicate --gpu-memory-utilization warning shown"
+    else
+        log_fail "Extra args: no warning for duplicate --gpu-memory-utilization"
+        log_verbose "$output"
+    fi
+}
+
+# Test: --tp as a CLI override plus --tensor-parallel-size after -- produces a warning
+test_extra_args_duplicate_tp_warning() {
+    log_test "Extra args: duplicate --tensor-parallel-size shows warning"
+
+    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
+    if [[ -z "$first_recipe" ]]; then
+        log_skip "No recipes found"
+        return
+    fi
+
+    recipe_name=$(basename "$first_recipe" .yaml)
+    # Pass --tp as a CLI override AND the vLLM flag via extra args - should warn
+    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --tp 2 -- --tensor-parallel-size 4 2>&1)
+    # -E gives portable '|' alternation (backslash-pipe in a basic regex is a GNU extension)
+    if echo "$output" | grep -qiE "warning.*tensor-parallel|duplicate.*tensor"; then
+        log_pass "Extra args: duplicate --tensor-parallel-size warning shown"
+    else
+        log_fail "Extra args: no warning for duplicate --tensor-parallel-size"
+        log_verbose "$output"
+    fi
+}
+
+# Test: extra args are present in the vLLM command and, where checkable, come after --port/--host
+test_extra_args_ordering() {
+    log_test "Extra args: appear at end of vllm command"
+
+    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
+    if [[ -z "$first_recipe" ]]; then
+        log_skip "No recipes found"
+        return
+    fi
+
+    recipe_name=$(basename "$first_recipe" .yaml)
+    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --my-custom-arg value 2>&1)
+    vllm_cmd=$(extract_vllm_command "$output")
+
+    # Presence is mandatory; position is only verified when --port/--host precede it
+    if echo "$vllm_cmd" | grep -qF -- "--my-custom-arg value"; then
+        # Flatten first: grep matches per line and the extras can sit on their own continuation line
+        if echo "$vllm_cmd" | tr '\n' ' ' | grep -qE -- "--(port|host).*--my-custom-arg"; then
+            log_pass "Extra args: correctly ordered at end"
+        else
+            # No --port/--host found ahead of it; presence alone is still accepted
+            log_pass "Extra args: present in command"
+        fi
+    else
+        log_fail "Extra args: --my-custom-arg not found in vllm command"
+        log_verbose "$vllm_cmd"
+    fi
+}
+
+# Test: extra args are also forwarded when a node list (-n) is given instead of --solo
+test_extra_args_cluster_mode() {
+    log_test "Extra args: work in cluster mode"
+
+    first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
+    if [[ -z "$first_recipe" ]]; then
+        log_skip "No recipes found"
+        return
+    fi
+
+    recipe_name=$(basename "$first_recipe" .yaml)
+    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run -n "10.0.0.1,10.0.0.2" -- --load-format auto 2>&1)
+    # -F matches the text literally; the bare -- stops grep from parsing the pattern as an option
+    if echo "$output" | grep -qF -- "--load-format auto"; then
+        log_pass "Extra args: work in cluster mode"
+    else
+        log_fail "Extra args: not found in cluster mode output"
+        log_verbose "$output"
+    fi
+}
+
 # Run all tests
 main() {
     echo "=============================================="
@@ -835,6 +1074,20 @@ main() {
     test_launch_cluster_examples_path
     echo ""
     
+    # Extra vLLM arguments tests (-- pass-through)
+    echo "--- Extra vLLM Arguments (-- pass-through) ---"
+    test_extra_args_load_format
+    test_extra_args_served_model_name
+    test_extra_args_equals_syntax
+    test_extra_args_multiple
+    test_extra_args_empty
+    test_extra_args_duplicate_port_warning
+    test_extra_args_duplicate_gpu_mem_warning
+    test_extra_args_duplicate_tp_warning
+    test_extra_args_ordering
+    test_extra_args_cluster_mode
+    echo ""
+    
     # Validation tests
     test_unsupported_recipe_version
     test_missing_recipe_version_fails