Enhancement: add -- pass-through for arbitrary vLLM arguments

Implements a Unix-style pass-through that allows any vLLM argument to be passed after the `--` separator. Arguments are appended verbatim to the generated vLLM command.

Examples:
  ./run-recipe.py model --solo -- --load-format safetensors
  ./run-recipe.py model --solo -- --served-model-name my-api
  ./run-recipe.py model --solo -- -cc.cudagraph_mode=PIECEWISE

Features:
- Uses parse_known_args() to capture arguments after --
- Warns when extra args duplicate CLI overrides (--port, --tp, etc.)
- Works in both solo and cluster modes

Adds 10 integration tests covering:
- --load-format, --served-model-name, equals syntax
- Multiple arguments, empty --, cluster mode
- Duplicate detection warnings for port/tp/gpu-mem

Closes #30
2026-02-08 02:36:49 -05:00
parent 8cb956b972
commit b7c3cdcfcb
3 changed files with 327 additions and 3 deletions
--- a/run-recipe.py
+++ b/run-recipe.py
@@ -385,7 +385,7 @@ def check_model_exists(model: str) -> bool:
    return False


-def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False) -> str:
+def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False, extra_args: list[str] | None = None) -> str:
    """
    Generate a bash launch script from the recipe.
    
@@ -410,10 +410,16 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
        - Removes '--distributed-executor-backend ray' lines
        - Typically sets tensor_parallel=1 (handled by caller)
    
+    EXTRA ARGS:
+        - Appended verbatim to the end of the vLLM command
+        - Allows passing any vLLM argument not covered by template variables
+        - vLLM uses "last wins" semantics for duplicate arguments
+    
    Args:
        recipe: Loaded recipe dictionary
        overrides: CLI-provided parameter overrides (take precedence over defaults)
        is_solo: If True, strip distributed executor configuration
+        extra_args: Additional arguments to append to vLLM command (after --)
        
    Returns:
        Complete bash script content as string
@@ -457,6 +463,17 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
        ]
        command = '\n'.join(filtered_lines)
    
+    # Append extra args if provided (after --)
+    if extra_args:
+        # Join extra args and append to command
+        extra_args_str = ' '.join(extra_args)
+        command = command.rstrip()
+        # Handle multi-line commands with backslash continuations
+        if command.endswith('\\'):
+            command = command.rstrip('\\').rstrip() + ' \\\n    ' + extra_args_str
+        else:
+            command = command + ' ' + extra_args_str
+    
    lines.append("# Run the model")
    lines.append(command.strip())
    lines.append("")
@@ -722,6 +739,10 @@ Examples:
  %(prog)s glm-4.7-nvfp4 --build-only
  %(prog)s glm-4.7-nvfp4 --download-only

+  # Pass extra arguments to vLLM (after --)
+  %(prog)s glm-4.7-nvfp4 --solo -- --load-format safetensors
+  %(prog)s glm-4.7-nvfp4 --solo -- --served-model-name my-api
+
  # List available recipes
  %(prog)s --list

@@ -804,7 +825,12 @@ Examples:
        help="Show current .env configuration"
    )
    
-    args = parser.parse_args()
+    # Use parse_known_args to allow extra vLLM arguments after --
+    args, extra_args = parser.parse_known_args()
+    
+    # Filter out the -- separator if present
+    if extra_args and extra_args[0] == '--':
+        extra_args = extra_args[1:]
    
    # Handle --discover (can be run with or without a recipe)
    if args.discover:
@@ -1030,8 +1056,28 @@ Examples:
    if is_solo and "tensor_parallel" not in overrides:
        overrides["tensor_parallel"] = 1
    
+    # Check for duplicate arguments (warn if extra_args duplicate CLI overrides)
+    if extra_args:
+        # Map vLLM flags to our override keys
+        flag_to_override = {
+            '--port': 'port',
+            '--host': 'host',
+            '--tensor-parallel-size': 'tensor_parallel',
+            '-tp': 'tensor_parallel',
+            '--gpu-memory-utilization': 'gpu_memory_utilization',
+            '--max-model-len': 'max_model_len',
+        }
+        for i, arg in enumerate(extra_args):
+            # Check both exact flag and =value syntax
+            flag = arg.split('=')[0] if '=' in arg else arg
+            if flag in flag_to_override:
+                override_key = flag_to_override[flag]
+                if override_key in overrides:
+                    print(f"Warning: '{arg}' in extra args duplicates --{override_key.replace('_', '-')} override")
+                    print(f"         vLLM uses last value; extra args appear after template substitution")
+    
    # Generate launch script
-    script_content = generate_launch_script(recipe, overrides, is_solo=is_solo)
+    script_content = generate_launch_script(recipe, overrides, is_solo=is_solo, extra_args=extra_args)
    
    if args.dry_run:
        print("=== Generated Launch Script ===")