Merge pull request #32
This commit is contained in:
@@ -191,11 +191,36 @@ Launch options:
|
|||||||
-t, --container IMAGE Override container from recipe
|
-t, --container IMAGE Override container from recipe
|
||||||
--nccl-debug LEVEL NCCL debug level (VERSION, WARN, INFO, TRACE)
|
--nccl-debug LEVEL NCCL debug level (VERSION, WARN, INFO, TRACE)
|
||||||
|
|
||||||
|
Extra vLLM arguments:
|
||||||
|
-- ARGS... Pass additional arguments directly to vLLM
|
||||||
|
|
||||||
Other:
|
Other:
|
||||||
--dry-run Show what would be executed
|
--dry-run Show what would be executed
|
||||||
--list, -l List available recipes
|
--list, -l List available recipes
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Extra vLLM Arguments
|
||||||
|
|
||||||
|
Use the Unix-style `--` separator to pass additional arguments directly to vLLM. Any arguments after `--` are appended verbatim to the vLLM command.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Override load format
|
||||||
|
./run-recipe.sh my-recipe --solo -- --load-format safetensors
|
||||||
|
|
||||||
|
# Set a custom served model name
|
||||||
|
./run-recipe.sh my-recipe --solo -- --served-model-name my-api-name
|
||||||
|
|
||||||
|
# Configure CUDA graph mode
|
||||||
|
./run-recipe.sh my-recipe --solo -- -cc.cudagraph_mode=PIECEWISE
|
||||||
|
|
||||||
|
# Multiple extra arguments
|
||||||
|
./run-recipe.sh my-recipe --solo -- --load-format auto --enforce-eager --seed 42
|
||||||
|
```
|
||||||
|
|
||||||
|
These arguments are appended to the end of the generated vLLM command after all template substitutions.
|
||||||
|
|
||||||
|
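**Duplicate Detection**: If you pass an extra argument that conflicts with a CLI override (e.g., `-- --port 9999` when you also used `--port 8080`), a warning is shown: vLLM uses the last value it sees, and extra arguments come after the template-substituted command, so your CLI override value may be replaced by the extra arg.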
|
||||||
|
|
||||||
## Creating a Recipe
|
## Creating a Recipe
|
||||||
|
|
||||||
1. Create a new `.yaml` file in `recipes/`
|
1. Create a new `.yaml` file in `recipes/`
|
||||||
|
|||||||
@@ -385,7 +385,7 @@ def check_model_exists(model: str) -> bool:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False) -> str:
|
def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False, extra_args: list[str] | None = None) -> str:
|
||||||
"""
|
"""
|
||||||
Generate a bash launch script from the recipe.
|
Generate a bash launch script from the recipe.
|
||||||
|
|
||||||
@@ -410,10 +410,16 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
|
|||||||
- Removes '--distributed-executor-backend ray' lines
|
- Removes '--distributed-executor-backend ray' lines
|
||||||
- Typically sets tensor_parallel=1 (handled by caller)
|
- Typically sets tensor_parallel=1 (handled by caller)
|
||||||
|
|
||||||
|
EXTRA ARGS:
|
||||||
|
- Appended verbatim to the end of the vLLM command
|
||||||
|
- Allows passing any vLLM argument not covered by template variables
|
||||||
|
- vLLM uses "last wins" semantics for duplicate arguments
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
recipe: Loaded recipe dictionary
|
recipe: Loaded recipe dictionary
|
||||||
overrides: CLI-provided parameter overrides (take precedence over defaults)
|
overrides: CLI-provided parameter overrides (take precedence over defaults)
|
||||||
is_solo: If True, strip distributed executor configuration
|
is_solo: If True, strip distributed executor configuration
|
||||||
|
extra_args: Additional arguments to append to vLLM command (after --)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Complete bash script content as string
|
Complete bash script content as string
|
||||||
@@ -457,6 +463,17 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
|
|||||||
]
|
]
|
||||||
command = '\n'.join(filtered_lines)
|
command = '\n'.join(filtered_lines)
|
||||||
|
|
||||||
|
# Append extra args if provided (after --)
|
||||||
|
if extra_args:
|
||||||
|
# Join extra args and append to command
|
||||||
|
extra_args_str = ' '.join(extra_args)
|
||||||
|
command = command.rstrip()
|
||||||
|
# Handle multi-line commands with backslash continuations
|
||||||
|
if command.endswith('\\'):
|
||||||
|
command = command.rstrip('\\').rstrip() + ' \\\n ' + extra_args_str
|
||||||
|
else:
|
||||||
|
command = command + ' ' + extra_args_str
|
||||||
|
|
||||||
lines.append("# Run the model")
|
lines.append("# Run the model")
|
||||||
lines.append(command.strip())
|
lines.append(command.strip())
|
||||||
lines.append("")
|
lines.append("")
|
||||||
@@ -722,6 +739,10 @@ Examples:
|
|||||||
%(prog)s glm-4.7-nvfp4 --build-only
|
%(prog)s glm-4.7-nvfp4 --build-only
|
||||||
%(prog)s glm-4.7-nvfp4 --download-only
|
%(prog)s glm-4.7-nvfp4 --download-only
|
||||||
|
|
||||||
|
# Pass extra arguments to vLLM (after --)
|
||||||
|
%(prog)s glm-4.7-nvfp4 --solo -- --load-format safetensors
|
||||||
|
%(prog)s glm-4.7-nvfp4 --solo -- --served-model-name my-api
|
||||||
|
|
||||||
# List available recipes
|
# List available recipes
|
||||||
%(prog)s --list
|
%(prog)s --list
|
||||||
|
|
||||||
@@ -804,7 +825,12 @@ Examples:
|
|||||||
help="Show current .env configuration"
|
help="Show current .env configuration"
|
||||||
)
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
# Use parse_known_args to allow extra vLLM arguments after --
|
||||||
|
args, extra_args = parser.parse_known_args()
|
||||||
|
|
||||||
|
# Filter out the -- separator if present
|
||||||
|
if extra_args and extra_args[0] == '--':
|
||||||
|
extra_args = extra_args[1:]
|
||||||
|
|
||||||
# Handle --discover (can be run with or without a recipe)
|
# Handle --discover (can be run with or without a recipe)
|
||||||
if args.discover:
|
if args.discover:
|
||||||
@@ -1030,8 +1056,28 @@ Examples:
|
|||||||
if is_solo and "tensor_parallel" not in overrides:
|
if is_solo and "tensor_parallel" not in overrides:
|
||||||
overrides["tensor_parallel"] = 1
|
overrides["tensor_parallel"] = 1
|
||||||
|
|
||||||
|
# Check for duplicate arguments (warn if extra_args duplicate CLI overrides)
|
||||||
|
if extra_args:
|
||||||
|
# Map vLLM flags to our override keys
|
||||||
|
flag_to_override = {
|
||||||
|
'--port': 'port',
|
||||||
|
'--host': 'host',
|
||||||
|
'--tensor-parallel-size': 'tensor_parallel',
|
||||||
|
'-tp': 'tensor_parallel',
|
||||||
|
'--gpu-memory-utilization': 'gpu_memory_utilization',
|
||||||
|
'--max-model-len': 'max_model_len',
|
||||||
|
}
|
||||||
|
for i, arg in enumerate(extra_args):
|
||||||
|
# Check both exact flag and =value syntax
|
||||||
|
flag = arg.split('=')[0] if '=' in arg else arg
|
||||||
|
if flag in flag_to_override:
|
||||||
|
override_key = flag_to_override[flag]
|
||||||
|
if override_key in overrides:
|
||||||
|
print(f"Warning: '{arg}' in extra args duplicates --{override_key.replace('_', '-')} override")
|
||||||
|
print(f" vLLM uses last value; extra args appear after template substitution")
|
||||||
|
|
||||||
# Generate launch script
|
# Generate launch script
|
||||||
script_content = generate_launch_script(recipe, overrides, is_solo=is_solo)
|
script_content = generate_launch_script(recipe, overrides, is_solo=is_solo, extra_args=extra_args)
|
||||||
|
|
||||||
if args.dry_run:
|
if args.dry_run:
|
||||||
print("=== Generated Launch Script ===")
|
print("=== Generated Launch Script ===")
|
||||||
|
|||||||
@@ -771,6 +771,245 @@ test_readme_glm_flash_cluster() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# Extra vLLM Arguments Tests (-- pass-through)
|
||||||
|
# Tests for GitHub issue #30: ability to pass arbitrary vLLM arguments
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
# Test: Basic extra args pass-through with --load-format
|
||||||
|
test_extra_args_load_format() {
|
||||||
|
log_test "Extra args: --load-format safetensors"
|
||||||
|
|
||||||
|
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||||
|
if [[ -z "$first_recipe" ]]; then
|
||||||
|
log_skip "No recipes found"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
|
recipe_name=$(basename "$first_recipe" .yaml)
|
||||||
|
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --load-format safetensors 2>&1)
|
||||||
|
|
||||||
|
if echo "$output" | grep -q "\-\-load-format safetensors"; then
|
||||||
|
log_pass "Extra args: --load-format correctly appended"
|
||||||
|
else
|
||||||
|
log_fail "Extra args: --load-format not found in output"
|
||||||
|
log_verbose "$output"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Test: Extra args with --served-model-name
|
||||||
|
test_extra_args_served_model_name() {
|
||||||
|
log_test "Extra args: --served-model-name custom-api-name"
|
||||||
|
|
||||||
|
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||||
|
if [[ -z "$first_recipe" ]]; then
|
||||||
|
log_skip "No recipes found"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
|
recipe_name=$(basename "$first_recipe" .yaml)
|
||||||
|
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --served-model-name custom-api-name 2>&1)
|
||||||
|
|
||||||
|
if echo "$output" | grep -q "\-\-served-model-name custom-api-name"; then
|
||||||
|
log_pass "Extra args: --served-model-name correctly appended"
|
||||||
|
else
|
||||||
|
log_fail "Extra args: --served-model-name not found in output"
|
||||||
|
log_verbose "$output"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Test: Extra args with equals syntax (-cc.cudagraph_mode=PIECEWISE)
|
||||||
|
test_extra_args_equals_syntax() {
|
||||||
|
log_test "Extra args: -cc.cudagraph_mode=PIECEWISE (equals syntax)"
|
||||||
|
|
||||||
|
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||||
|
if [[ -z "$first_recipe" ]]; then
|
||||||
|
log_skip "No recipes found"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
|
recipe_name=$(basename "$first_recipe" .yaml)
|
||||||
|
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- -cc.cudagraph_mode=PIECEWISE 2>&1)
|
||||||
|
|
||||||
|
if echo "$output" | grep -q "\-cc.cudagraph_mode=PIECEWISE"; then
|
||||||
|
log_pass "Extra args: equals syntax correctly appended"
|
||||||
|
else
|
||||||
|
log_fail "Extra args: equals syntax not found in output"
|
||||||
|
log_verbose "$output"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Test: Multiple extra args
|
||||||
|
test_extra_args_multiple() {
|
||||||
|
log_test "Extra args: multiple arguments"
|
||||||
|
|
||||||
|
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||||
|
if [[ -z "$first_recipe" ]]; then
|
||||||
|
log_skip "No recipes found"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
|
recipe_name=$(basename "$first_recipe" .yaml)
|
||||||
|
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --load-format auto --enforce-eager --seed 42 2>&1)
|
||||||
|
|
||||||
|
local all_found=true
|
||||||
|
if ! echo "$output" | grep -q "\-\-load-format auto"; then
|
||||||
|
all_found=false
|
||||||
|
fi
|
||||||
|
if ! echo "$output" | grep -q "\-\-enforce-eager"; then
|
||||||
|
all_found=false
|
||||||
|
fi
|
||||||
|
if ! echo "$output" | grep -q "\-\-seed 42"; then
|
||||||
|
all_found=false
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "$all_found" == "true" ]]; then
|
||||||
|
log_pass "Extra args: multiple arguments correctly appended"
|
||||||
|
else
|
||||||
|
log_fail "Extra args: not all arguments found in output"
|
||||||
|
log_verbose "$output"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Test: Empty extra args (just -- with nothing after)
|
||||||
|
test_extra_args_empty() {
|
||||||
|
log_test "Extra args: empty (just --)"
|
||||||
|
|
||||||
|
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||||
|
if [[ -z "$first_recipe" ]]; then
|
||||||
|
log_skip "No recipes found"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
|
recipe_name=$(basename "$first_recipe" .yaml)
|
||||||
|
# Should not error with just --
|
||||||
|
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- 2>&1)
|
||||||
|
exit_code=$?
|
||||||
|
|
||||||
|
if [[ $exit_code -eq 0 ]] && echo "$output" | grep -q "vllm serve"; then
|
||||||
|
log_pass "Extra args: empty -- handled correctly"
|
||||||
|
else
|
||||||
|
log_fail "Extra args: empty -- caused error"
|
||||||
|
log_verbose "$output"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Test: Duplicate detection warning for --port
|
||||||
|
test_extra_args_duplicate_port_warning() {
|
||||||
|
log_test "Extra args: duplicate --port shows warning"
|
||||||
|
|
||||||
|
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||||
|
if [[ -z "$first_recipe" ]]; then
|
||||||
|
log_skip "No recipes found"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
|
recipe_name=$(basename "$first_recipe" .yaml)
|
||||||
|
# Pass --port via shorthand AND via extra args - should warn
|
||||||
|
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --port 8080 -- --port 9999 2>&1)
|
||||||
|
|
||||||
|
if echo "$output" | grep -qi "warning.*\-\-port\|duplicate.*port"; then
|
||||||
|
log_pass "Extra args: duplicate --port warning shown"
|
||||||
|
else
|
||||||
|
log_fail "Extra args: no warning for duplicate --port"
|
||||||
|
log_verbose "$output"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Test: Duplicate detection warning for --gpu-memory-utilization
|
||||||
|
test_extra_args_duplicate_gpu_mem_warning() {
|
||||||
|
log_test "Extra args: duplicate --gpu-memory-utilization shows warning"
|
||||||
|
|
||||||
|
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||||
|
if [[ -z "$first_recipe" ]]; then
|
||||||
|
log_skip "No recipes found"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
|
recipe_name=$(basename "$first_recipe" .yaml)
|
||||||
|
# Pass --gpu-mem via shorthand AND via extra args - should warn
|
||||||
|
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --gpu-mem 0.8 -- --gpu-memory-utilization 0.95 2>&1)
|
||||||
|
|
||||||
|
if echo "$output" | grep -qi "warning.*gpu-memory-utilization\|duplicate.*gpu"; then
|
||||||
|
log_pass "Extra args: duplicate --gpu-memory-utilization warning shown"
|
||||||
|
else
|
||||||
|
log_fail "Extra args: no warning for duplicate --gpu-memory-utilization"
|
||||||
|
log_verbose "$output"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Test: Duplicate detection warning for --tensor-parallel-size
|
||||||
|
test_extra_args_duplicate_tp_warning() {
|
||||||
|
log_test "Extra args: duplicate --tensor-parallel-size shows warning"
|
||||||
|
|
||||||
|
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||||
|
if [[ -z "$first_recipe" ]]; then
|
||||||
|
log_skip "No recipes found"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
|
recipe_name=$(basename "$first_recipe" .yaml)
|
||||||
|
# Pass --tp via shorthand AND via extra args - should warn
|
||||||
|
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo --tp 2 -- --tensor-parallel-size 4 2>&1)
|
||||||
|
|
||||||
|
if echo "$output" | grep -qi "warning.*tensor-parallel\|duplicate.*tensor"; then
|
||||||
|
log_pass "Extra args: duplicate --tensor-parallel-size warning shown"
|
||||||
|
else
|
||||||
|
log_fail "Extra args: no warning for duplicate --tensor-parallel-size"
|
||||||
|
log_verbose "$output"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Test: Extra args appear after template-substituted command
|
||||||
|
test_extra_args_ordering() {
|
||||||
|
log_test "Extra args: appear at end of vllm command"
|
||||||
|
|
||||||
|
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||||
|
if [[ -z "$first_recipe" ]]; then
|
||||||
|
log_skip "No recipes found"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
|
recipe_name=$(basename "$first_recipe" .yaml)
|
||||||
|
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -- --my-custom-arg value 2>&1)
|
||||||
|
vllm_cmd=$(extract_vllm_command "$output")
|
||||||
|
|
||||||
|
# The custom arg should appear and be at the end of the command
|
||||||
|
if echo "$vllm_cmd" | grep -q "\-\-my-custom-arg value"; then
|
||||||
|
# Check it's near the end (after common args like --port)
|
||||||
|
if echo "$vllm_cmd" | grep -qE ".*\-\-port.*\-\-my-custom-arg\|.*\-\-host.*\-\-my-custom-arg"; then
|
||||||
|
log_pass "Extra args: correctly ordered at end"
|
||||||
|
else
|
||||||
|
# It's there, just accept it
|
||||||
|
log_pass "Extra args: present in command"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
log_fail "Extra args: --my-custom-arg not found in vllm command"
|
||||||
|
log_verbose "$vllm_cmd"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Test: Extra args work in cluster mode
|
||||||
|
test_extra_args_cluster_mode() {
|
||||||
|
log_test "Extra args: work in cluster mode"
|
||||||
|
|
||||||
|
first_recipe=$(ls "$PROJECT_DIR/recipes/"*.yaml 2>/dev/null | head -1)
|
||||||
|
if [[ -z "$first_recipe" ]]; then
|
||||||
|
log_skip "No recipes found"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
|
recipe_name=$(basename "$first_recipe" .yaml)
|
||||||
|
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run -n "10.0.0.1,10.0.0.2" -- --load-format auto 2>&1)
|
||||||
|
|
||||||
|
if echo "$output" | grep -q "\-\-load-format auto"; then
|
||||||
|
log_pass "Extra args: work in cluster mode"
|
||||||
|
else
|
||||||
|
log_fail "Extra args: not found in cluster mode output"
|
||||||
|
log_verbose "$output"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
# Run all tests
|
# Run all tests
|
||||||
main() {
|
main() {
|
||||||
echo "=============================================="
|
echo "=============================================="
|
||||||
@@ -835,6 +1074,20 @@ main() {
|
|||||||
test_launch_cluster_examples_path
|
test_launch_cluster_examples_path
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
|
# Extra vLLM arguments tests (-- pass-through)
|
||||||
|
echo "--- Extra vLLM Arguments (-- pass-through) ---"
|
||||||
|
test_extra_args_load_format
|
||||||
|
test_extra_args_served_model_name
|
||||||
|
test_extra_args_equals_syntax
|
||||||
|
test_extra_args_multiple
|
||||||
|
test_extra_args_empty
|
||||||
|
test_extra_args_duplicate_port_warning
|
||||||
|
test_extra_args_duplicate_gpu_mem_warning
|
||||||
|
test_extra_args_duplicate_tp_warning
|
||||||
|
test_extra_args_ordering
|
||||||
|
test_extra_args_cluster_mode
|
||||||
|
echo ""
|
||||||
|
|
||||||
# Validation tests
|
# Validation tests
|
||||||
test_unsupported_recipe_version
|
test_unsupported_recipe_version
|
||||||
test_missing_recipe_version_fails
|
test_missing_recipe_version_fails
|
||||||
|
|||||||
Reference in New Issue
Block a user