Changed Nemotron-3-Nano-NVFP4 to Marlin backend

This commit is contained in:
Eugene Rakhmatulin
2026-03-17 13:10:48 -07:00
parent fa645f3e4b
commit b1eeefc0eb
2 changed files with 9 additions and 10 deletions

View File

@@ -25,12 +25,13 @@ defaults:
host: 0.0.0.0
tensor_parallel: 1
gpu_memory_utilization: 0.7
max_model_len: 131072
max_model_len: 262144
# Environment variables
env:
VLLM_USE_FLASHINFER_MOE_FP4: 1
VLLM_FLASHINFER_MOE_BACKEND: "throughput"
VLLM_NVFP4_GEMM_BACKEND: "marlin"
VLLM_TEST_FORCE_FP8_MARLIN: "1"
VLLM_MARLIN_USE_ATOMIC_ADD: "1"
# The vLLM serve command template
command: |
@@ -44,6 +45,5 @@ command: |
--reasoning-parser nano_v3 \
--kv-cache-dtype fp8 \
--enable-prefix-caching \
--attention-backend flashinfer \
--load-format fastsafetensors \
--gpu-memory-utilization {gpu_memory_utilization}