diff --git a/recipes/nemotron-3-nano-nvfp4.yaml b/recipes/nemotron-3-nano-nvfp4.yaml index eaeb161..c835e45 100644 --- a/recipes/nemotron-3-nano-nvfp4.yaml +++ b/recipes/nemotron-3-nano-nvfp4.yaml @@ -25,12 +25,13 @@ defaults: host: 0.0.0.0 tensor_parallel: 1 gpu_memory_utilization: 0.7 - max_model_len: 131072 + max_model_len: 262144 # Environment variables env: - VLLM_USE_FLASHINFER_MOE_FP4: 1 - VLLM_FLASHINFER_MOE_BACKEND: "throughput" + VLLM_NVFP4_GEMM_BACKEND: "marlin" + VLLM_TEST_FORCE_FP8_MARLIN: "1" + VLLM_MARLIN_USE_ATOMIC_ADD: "1" # The vLLM serve command template command: | @@ -44,6 +45,5 @@ command: | --reasoning-parser nano_v3 \ --kv-cache-dtype fp8 \ --enable-prefix-caching \ - --attention-backend flashinfer \ --load-format fastsafetensors \ --gpu-memory-utilization {gpu_memory_utilization} diff --git a/run-recipe.py b/run-recipe.py index 219c2d4..2cb41b7 100755 --- a/run-recipe.py +++ b/run-recipe.py @@ -470,17 +470,16 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is if '--distributed-executor-backend' not in line ] command = '\n'.join(filtered_lines) + + # Remove trailing backslash if present + if command.endswith('\\\n'): + command = command.rstrip('\\\n').rstrip() # Append extra args if provided (after --) if extra_args: # Join extra args and append to command extra_args_str = ' '.join(shlex.quote(a) for a in extra_args) - command = command.rstrip() - # Handle multi-line commands with backslash continuations - if command.endswith('\\'): - command = command.rstrip('\\').rstrip() + ' \\\n ' + extra_args_str - else: - command = command + ' ' + extra_args_str + command = command + ' ' + extra_args_str lines.append("# Run the model") lines.append(command.strip())