From 122edc8229ebc94054c5a28452900092a3fd7451 Mon Sep 17 00:00:00 2001 From: remi Date: Wed, 11 Mar 2026 20:53:44 +0100 Subject: [PATCH 1/2] super nemotron mod & recipe for nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 --- mods/nemotron-super/run.sh | 4 +++ recipes/nemotron-3-super-nvfp4.yaml | 44 +++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 mods/nemotron-super/run.sh create mode 100644 recipes/nemotron-3-super-nvfp4.yaml diff --git a/mods/nemotron-super/run.sh b/mods/nemotron-super/run.sh new file mode 100644 index 0000000..ad0c5b2 --- /dev/null +++ b/mods/nemotron-super/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +cd "$WORKSPACE_DIR" +wget https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/resolve/main/super_v3_reasoning_parser.py diff --git a/recipes/nemotron-3-super-nvfp4.yaml b/recipes/nemotron-3-super-nvfp4.yaml new file mode 100644 index 0000000..79c6780 --- /dev/null +++ b/recipes/nemotron-3-super-nvfp4.yaml @@ -0,0 +1,44 @@ +# Recipe: Nemotron-3-Super-NVFP4 +# Optimized for Marlin backend throughput +recipe_version: "2" +name: Nemotron-3-Super-NVFP4-Marlin-Optimized +description: vLLM serving Nemotron-3-Super-120B using Marlin kernels + +model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 +container: vllm-node +solo_only: true + +mods: + - mods/nemotron-super + +defaults: + port: 8888 + host: 0.0.0.0 + tensor_parallel: 1 + gpu_memory_utilization: 0.7 + max_model_len: 262144 + max_num_seqs: 8 + +env: + # Marlin performance overrides + VLLM_NVFP4_GEMM_BACKEND: "marlin" + VLLM_TEST_FORCE_FP8_MARLIN: "1" + VLLM_MARLIN_USE_ATOMIC_ADD: "1" + # Disable conflicting backends + VLLM_FP8_BACKEND: "marlin" + VLLM_SCALED_MM_BACKEND: "marlin" + +command: | + vllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \ + --max-model-len {max_model_len} \ + --max-num-seqs {max_num_seqs} \ + --port {port} --host {host} \ + --trust-remote-code \ + --tensor-parallel-size {tensor_parallel} \ + --kv-cache-dtype fp8 \ + 
--load-format fastsafetensors \ + --gpu-memory-utilization {gpu_memory_utilization} \ + --enable-auto-tool-choice \ + --tool-call-parser qwen3_coder \ + --reasoning-parser-plugin super_v3_reasoning_parser.py \ + --reasoning-parser super_v3 From 6f9a2f981cfe222fc63bd0a799c5587f29e70059 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 12 Mar 2026 12:59:05 -0700 Subject: [PATCH 2/2] Adjusted model parameters --- recipes/nemotron-3-super-nvfp4.yaml | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/recipes/nemotron-3-super-nvfp4.yaml b/recipes/nemotron-3-super-nvfp4.yaml index 79c6780..48120fe 100644 --- a/recipes/nemotron-3-super-nvfp4.yaml +++ b/recipes/nemotron-3-super-nvfp4.yaml @@ -1,44 +1,43 @@ # Recipe: Nemotron-3-Super-NVFP4 # Optimized for Marlin backend throughput -recipe_version: "2" +recipe_version: "1" name: Nemotron-3-Super-NVFP4-Marlin-Optimized description: vLLM serving Nemotron-3-Super-120B using Marlin kernels model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 container: vllm-node +cluster_only: false +# This model can only run on single node (solo) solo_only: true mods: - mods/nemotron-super defaults: - port: 8888 + port: 8000 host: 0.0.0.0 tensor_parallel: 1 gpu_memory_utilization: 0.7 max_model_len: 262144 - max_num_seqs: 8 - + max_num_seqs: 10 env: - # Marlin performance overrides VLLM_NVFP4_GEMM_BACKEND: "marlin" VLLM_TEST_FORCE_FP8_MARLIN: "1" VLLM_MARLIN_USE_ATOMIC_ADD: "1" - # Disable conflicting backends - VLLM_FP8_BACKEND: "marlin" - VLLM_SCALED_MM_BACKEND: "marlin" command: | - vllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \ - --max-model-len {max_model_len} \ - --max-num-seqs {max_num_seqs} \ - --port {port} --host {host} \ - --trust-remote-code \ - --tensor-parallel-size {tensor_parallel} \ - --kv-cache-dtype fp8 \ - --load-format fastsafetensors \ - --gpu-memory-utilization {gpu_memory_utilization} \ - --enable-auto-tool-choice \ - 
--tool-call-parser qwen3_coder \ - --reasoning-parser-plugin super_v3_reasoning_parser.py \ - --reasoning-parser super_v3 + vllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \ + --kv-cache-dtype fp8 \ + -tp {tensor_parallel} \ + --trust-remote-code \ + --gpu-memory-utilization {gpu_memory_utilization} \ + --max-model-len {max_model_len} \ + --max-num-seqs {max_num_seqs} \ + --enable-prefix-caching \ + --host {host} \ + --port {port} \ + --enable-auto-tool-choice \ + --load-format fastsafetensors \ + --tool-call-parser qwen3_coder \ + --reasoning-parser-plugin super_v3_reasoning_parser.py \ + --reasoning-parser super_v3