Changed Nemotron-3-Nano-NVFP4 to Marlin backend
This commit is contained in:
@@ -25,12 +25,13 @@ defaults:
|
||||
host: 0.0.0.0
|
||||
tensor_parallel: 1
|
||||
gpu_memory_utilization: 0.7
|
||||
max_model_len: 131072
|
||||
max_model_len: 262144
|
||||
|
||||
# Environment variables
|
||||
env:
|
||||
VLLM_USE_FLASHINFER_MOE_FP4: 1
|
||||
VLLM_FLASHINFER_MOE_BACKEND: "throughput"
|
||||
VLLM_NVFP4_GEMM_BACKEND: "marlin"
|
||||
VLLM_TEST_FORCE_FP8_MARLIN: "1"
|
||||
VLLM_MARLIN_USE_ATOMIC_ADD: "1"
|
||||
|
||||
# The vLLM serve command template
|
||||
command: |
|
||||
@@ -44,6 +45,5 @@ command: |
|
||||
--reasoning-parser nano_v3 \
|
||||
--kv-cache-dtype fp8 \
|
||||
--enable-prefix-caching \
|
||||
--attention-backend flashinfer \
|
||||
--load-format fastsafetensors \
|
||||
--gpu-memory-utilization {gpu_memory_utilization}
|
||||
|
||||
@@ -470,17 +470,16 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
|
||||
if '--distributed-executor-backend' not in line
|
||||
]
|
||||
command = '\n'.join(filtered_lines)
|
||||
|
||||
# Remove trailing backslash if present
|
||||
if command.endswith('\\\n'):
|
||||
command = command.rstrip('\\\n').rstrip()
|
||||
|
||||
# Append extra args if provided (after --)
|
||||
if extra_args:
|
||||
# Join extra args and append to command
|
||||
extra_args_str = ' '.join(shlex.quote(a) for a in extra_args)
|
||||
command = command.rstrip()
|
||||
# Handle multi-line commands with backslash continuations
|
||||
if command.endswith('\\'):
|
||||
command = command.rstrip('\\').rstrip() + ' \\\n ' + extra_args_str
|
||||
else:
|
||||
command = command + ' ' + extra_args_str
|
||||
command = command + ' ' + extra_args_str
|
||||
|
||||
lines.append("# Run the model")
|
||||
lines.append(command.strip())
|
||||
|
||||
Reference in New Issue
Block a user