From 122edc8229ebc94054c5a28452900092a3fd7451 Mon Sep 17 00:00:00 2001 From: remi Date: Wed, 11 Mar 2026 20:53:44 +0100 Subject: [PATCH 1/2] super nemotron mod & recipe for nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 --- mods/nemotron-super/run.sh | 4 +++ recipes/nemotron-3-super-nvfp4.yaml | 44 +++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 mods/nemotron-super/run.sh create mode 100644 recipes/nemotron-3-super-nvfp4.yaml diff --git a/mods/nemotron-super/run.sh b/mods/nemotron-super/run.sh new file mode 100644 index 0000000..ad0c5b2 --- /dev/null +++ b/mods/nemotron-super/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +cd "$WORKSPACE_DIR" +wget https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/resolve/main/super_v3_reasoning_parser.py diff --git a/recipes/nemotron-3-super-nvfp4.yaml b/recipes/nemotron-3-super-nvfp4.yaml new file mode 100644 index 0000000..79c6780 --- /dev/null +++ b/recipes/nemotron-3-super-nvfp4.yaml @@ -0,0 +1,44 @@ +# Recipe: Nemotron-3-Super-NVFP4 +# Optimized for Marlin backend throughput +recipe_version: "2" +name: Nemotron-3-Super-NVFP4-Marlin-Optimized +description: vLLM serving Nemotron-3-Super-120B using Marlin kernels + +model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 +container: vllm-node +solo_only: true + +mods: + - mods/nemotron-super + +defaults: + port: 8888 + host: 0.0.0.0 + tensor_parallel: 1 + gpu_memory_utilization: 0.7 + max_model_len: 262144 + max_num_seqs: 8 + +env: + # Marlin performance overrides + VLLM_NVFP4_GEMM_BACKEND: "marlin" + VLLM_TEST_FORCE_FP8_MARLIN: "1" + VLLM_MARLIN_USE_ATOMIC_ADD: "1" + # Disable conflicting backends + VLLM_FP8_BACKEND: "marlin" + VLLM_SCALED_MM_BACKEND: "marlin" + +command: | + vllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \ + --max-model-len {max_model_len} \ + --max-num-seqs {max_num_seqs} \ + --port {port} --host {host} \ + --trust-remote-code \ + --tensor-parallel-size {tensor_parallel} \ + --kv-cache-dtype fp8 \ + 
--load-format fastsafetensors \ + --gpu-memory-utilization {gpu_memory_utilization} \ + --enable-auto-tool-choice \ + --tool-call-parser qwen3_coder \ + --reasoning-parser-plugin super_v3_reasoning_parser.py \ + --reasoning-parser super_v3 From 6f9a2f981cfe222fc63bd0a799c5587f29e70059 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 12 Mar 2026 12:59:05 -0700 Subject: [PATCH 2/2] Adjusted model parameters --- recipes/nemotron-3-super-nvfp4.yaml | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/recipes/nemotron-3-super-nvfp4.yaml b/recipes/nemotron-3-super-nvfp4.yaml index 79c6780..48120fe 100644 --- a/recipes/nemotron-3-super-nvfp4.yaml +++ b/recipes/nemotron-3-super-nvfp4.yaml @@ -1,44 +1,43 @@ # Recipe: Nemotron-3-Super-NVFP4 # Optimized for Marlin backend throughput -recipe_version: "2" +recipe_version: "1" name: Nemotron-3-Super-NVFP4-Marlin-Optimized description: vLLM serving Nemotron-3-Super-120B using Marlin kernels model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 container: vllm-node +cluster_only: false +# This model can only run on single node (solo) solo_only: true mods: - mods/nemotron-super defaults: - port: 8888 + port: 8000 host: 0.0.0.0 tensor_parallel: 1 gpu_memory_utilization: 0.7 max_model_len: 262144 - max_num_seqs: 8 - + max_num_seqs: 10 env: - # Marlin performance overrides VLLM_NVFP4_GEMM_BACKEND: "marlin" VLLM_TEST_FORCE_FP8_MARLIN: "1" VLLM_MARLIN_USE_ATOMIC_ADD: "1" - # Disable conflicting backends - VLLM_FP8_BACKEND: "marlin" - VLLM_SCALED_MM_BACKEND: "marlin" command: | - vllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \ - --max-model-len {max_model_len} \ - --max-num-seqs {max_num_seqs} \ - --port {port} --host {host} \ - --trust-remote-code \ - --tensor-parallel-size {tensor_parallel} \ - --kv-cache-dtype fp8 \ - --load-format fastsafetensors \ - --gpu-memory-utilization {gpu_memory_utilization} \ - --enable-auto-tool-choice \ - 
--tool-call-parser qwen3_coder \ - --reasoning-parser-plugin super_v3_reasoning_parser.py \ - --reasoning-parser super_v3 + vllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \ + --kv-cache-dtype fp8 \ + -tp {tensor_parallel} \ + --trust-remote-code \ + --gpu-memory-utilization {gpu_memory_utilization} \ + --max-model-len {max_model_len} \ + --max-num-seqs {max_num_seqs} \ + --enable-prefix-caching \ + --host {host} \ + --port {port} \ + --enable-auto-tool-choice \ + --load-format fastsafetensors \ + --tool-call-parser qwen3_coder \ + --reasoning-parser-plugin super_v3_reasoning_parser.py \ + --reasoning-parser super_v3