Changed Nemotron-3-Nano-NVFP4 to Marlin backend
This commit is contained in:
@@ -25,12 +25,13 @@ defaults:
|
|||||||
host: 0.0.0.0
|
host: 0.0.0.0
|
||||||
tensor_parallel: 1
|
tensor_parallel: 1
|
||||||
gpu_memory_utilization: 0.7
|
gpu_memory_utilization: 0.7
|
||||||
max_model_len: 131072
|
max_model_len: 262144
|
||||||
|
|
||||||
# Environment variables
|
# Environment variables
|
||||||
env:
|
env:
|
||||||
VLLM_USE_FLASHINFER_MOE_FP4: 1
|
VLLM_NVFP4_GEMM_BACKEND: "marlin"
|
||||||
VLLM_FLASHINFER_MOE_BACKEND: "throughput"
|
VLLM_TEST_FORCE_FP8_MARLIN: "1"
|
||||||
|
VLLM_MARLIN_USE_ATOMIC_ADD: "1"
|
||||||
|
|
||||||
# The vLLM serve command template
|
# The vLLM serve command template
|
||||||
command: |
|
command: |
|
||||||
@@ -44,6 +45,5 @@ command: |
|
|||||||
--reasoning-parser nano_v3 \
|
--reasoning-parser nano_v3 \
|
||||||
--kv-cache-dtype fp8 \
|
--kv-cache-dtype fp8 \
|
||||||
--enable-prefix-caching \
|
--enable-prefix-caching \
|
||||||
--attention-backend flashinfer \
|
|
||||||
--load-format fastsafetensors \
|
--load-format fastsafetensors \
|
||||||
--gpu-memory-utilization {gpu_memory_utilization}
|
--gpu-memory-utilization {gpu_memory_utilization}
|
||||||
|
|||||||
@@ -470,17 +470,16 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
|
|||||||
if '--distributed-executor-backend' not in line
|
if '--distributed-executor-backend' not in line
|
||||||
]
|
]
|
||||||
command = '\n'.join(filtered_lines)
|
command = '\n'.join(filtered_lines)
|
||||||
|
|
||||||
|
# Remove trailing backslash if present
|
||||||
|
if command.endswith('\\\n'):
|
||||||
|
command = command.rstrip('\\\n').rstrip()
|
||||||
|
|
||||||
# Append extra args if provided (after --)
|
# Append extra args if provided (after --)
|
||||||
if extra_args:
|
if extra_args:
|
||||||
# Join extra args and append to command
|
# Join extra args and append to command
|
||||||
extra_args_str = ' '.join(shlex.quote(a) for a in extra_args)
|
extra_args_str = ' '.join(shlex.quote(a) for a in extra_args)
|
||||||
command = command.rstrip()
|
command = command + ' ' + extra_args_str
|
||||||
# Handle multi-line commands with backslash continuations
|
|
||||||
if command.endswith('\\'):
|
|
||||||
command = command.rstrip('\\').rstrip() + ' \\\n ' + extra_args_str
|
|
||||||
else:
|
|
||||||
command = command + ' ' + extra_args_str
|
|
||||||
|
|
||||||
lines.append("# Run the model")
|
lines.append("# Run the model")
|
||||||
lines.append(command.strip())
|
lines.append(command.strip())
|
||||||
|
|||||||
Reference in New Issue
Block a user