Switch Nemotron-3-Nano-NVFP4 recipe to the Marlin backend, raise max_model_len to 262144, and drop the FlashInfer attention backend flag

2026-03-17 13:10:48 -07:00
parent fa645f3e4b
commit b1eeefc0eb
2 changed files with 9 additions and 10 deletions
--- a/recipes/nemotron-3-nano-nvfp4.yaml
+++ b/recipes/nemotron-3-nano-nvfp4.yaml
@@ -25,12 +25,13 @@ defaults:
  host: 0.0.0.0
  tensor_parallel: 1
  gpu_memory_utilization: 0.7
-  max_model_len: 131072
+  max_model_len: 262144

 # Environment variables
 env:
-  VLLM_USE_FLASHINFER_MOE_FP4: 1
-  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+  VLLM_NVFP4_GEMM_BACKEND: "marlin"
+  VLLM_TEST_FORCE_FP8_MARLIN: "1"
+  VLLM_MARLIN_USE_ATOMIC_ADD: "1"

 # The vLLM serve command template
 command: |
@@ -44,6 +45,5 @@ command: |
     --reasoning-parser nano_v3  \
     --kv-cache-dtype fp8  \
     --enable-prefix-caching  \
-     --attention-backend flashinfer  \
     --load-format fastsafetensors  \
     --gpu-memory-utilization {gpu_memory_utilization}