Switch the Nemotron-3-Nano-NVFP4 recipe to the Marlin backend

2026-03-17 13:10:48 -07:00
parent fa645f3e4b
commit b1eeefc0eb
2 changed files with 9 additions and 10 deletions
--- a/recipes/nemotron-3-nano-nvfp4.yaml
+++ b/recipes/nemotron-3-nano-nvfp4.yaml
@@ -25,12 +25,13 @@ defaults:
  host: 0.0.0.0
  tensor_parallel: 1
  gpu_memory_utilization: 0.7
-  max_model_len: 131072
+  max_model_len: 262144

 # Environment variables
 env:
-  VLLM_USE_FLASHINFER_MOE_FP4: 1
-  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+  VLLM_NVFP4_GEMM_BACKEND: "marlin"
+  VLLM_TEST_FORCE_FP8_MARLIN: "1"
+  VLLM_MARLIN_USE_ATOMIC_ADD: "1"

 # The vLLM serve command template
 command: |
@@ -44,6 +45,5 @@ command: |
     --reasoning-parser nano_v3  \
     --kv-cache-dtype fp8  \
     --enable-prefix-caching  \
-     --attention-backend flashinfer  \
     --load-format fastsafetensors  \
     --gpu-memory-utilization {gpu_memory_utilization}
--- a/run-recipe.py
+++ b/run-recipe.py
@@ -470,17 +470,16 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
            if '--distributed-executor-backend' not in line
        ]
        command = '\n'.join(filtered_lines)
+
+    # Remove a trailing line continuation (backslash + newline) if present
+    if command.endswith('\\\n'):
+        command = command.rstrip('\\\n').rstrip()
    
    # Append extra args if provided (after --)
    if extra_args:
        # Join extra args and append to command
        extra_args_str = ' '.join(shlex.quote(a) for a in extra_args)
-        command = command.rstrip()
-        # Handle multi-line commands with backslash continuations
-        if command.endswith('\\'):
-            command = command.rstrip('\\').rstrip() + ' \\\n    ' + extra_args_str
-        else:
-            command = command + ' ' + extra_args_str
+        command = command + ' ' + extra_args_str
    
    lines.append("# Run the model")
    lines.append(command.strip())