# Recipe: Nemotron-3-Super-NVFP4
# Optimized for Marlin backend throughput
recipe_version: "2"
name: Nemotron-3-Super-NVFP4-Marlin-Optimized
description: vLLM serving Nemotron-3-Super-120B using Marlin GEMM kernels
model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
container: vllm-node
solo_only: true

mods:
  - mods/nemotron-super

defaults:
  port: 8888
  host: 0.0.0.0
  tensor_parallel: 1
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_seqs: 8

env:
  # Marlin performance overrides
  VLLM_NVFP4_GEMM_BACKEND: "marlin"
  VLLM_TEST_FORCE_FP8_MARLIN: "1"
  VLLM_MARLIN_USE_ATOMIC_ADD: "1"
  # Route the FP8 paths through Marlin as well, so no conflicting backend is selected
  VLLM_FP8_BACKEND: "marlin"
  VLLM_SCALED_MM_BACKEND: "marlin"

command: |
  vllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \
    --max-model-len {max_model_len} \
    --max-num-seqs {max_num_seqs} \
    --port {port} --host {host} \
    --trust-remote-code \
    --tensor-parallel-size {tensor_parallel} \
    --kv-cache-dtype fp8 \
    --load-format fastsafetensors \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --reasoning-parser-plugin super_v3_reasoning_parser.py \
    --reasoning-parser super_v3
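# Smoke test (a minimal sketch, kept as comments so this file stays valid YAML):
# vLLM serves an OpenAI-compatible API, and the host/port here come from
# `defaults` above — adjust both if your runner overrides them.
#
#   curl http://0.0.0.0:8888/v1/models
#   curl http://0.0.0.0:8888/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
#          "messages": [{"role": "user", "content": "Hello"}]}'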