# Recipe: Nemotron-3-Super-NVFP4
# Serves the NVFP4 checkpoint of Nemotron-3-Super-120B with vLLM, selecting the
# CUTLASS MoE backend (see `--moe-backend cutlass` in `command` below).

recipe_version: "1"
name: Nemotron-3-Super-NVFP4-CUTLASS-Optimized
description: vLLM serving Nemotron-3-Super-120B using CUTLASS kernels

# Model identifier and the container this recipe runs in.
model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
container: vllm-node

# Neither flag is set, so the recipe is not restricted to cluster-only or
# solo-only use.
cluster_only: false
solo_only: false

# Optional mods, currently disabled. Uncomment both lines to re-enable.
# mods:
#   - mods/nemotron-super

# Environment variables for the serving process. Values are quoted so they
# load as strings rather than YAML integers.
env:
  VLLM_FLASHINFER_ALLREDUCE_BACKEND: trtllm
  VLLM_ALLOW_LONG_MAX_MODEL_LEN: "1"

# Default values for the `{placeholder}` fields referenced in `command`.
# NOTE(review): the substitution itself is done by the recipe runner, which is
# not part of this file - confirm placeholder names against it.
defaults:
  port: 8000
  host: "0.0.0.0"
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_seqs: 10

# Launch command. Literal block scalar: line breaks are preserved and each
# trailing backslash is a shell line continuation.
command: |
  vllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \
    --kv-cache-dtype fp8 \
    --moe-backend cutlass \
    --trust-remote-code \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --max-model-len {max_model_len} \
    --max-num-seqs {max_num_seqs} \
    --enable-prefix-caching \
    --host {host} \
    --port {port} \
    --enable-auto-tool-choice \
    --load-format fastsafetensors \
    --tool-call-parser qwen3_coder \
    --reasoning-parser nemotron_v3 \
    --mamba_ssm_cache_dtype float32 \
    --tensor-parallel-size {tensor_parallel} \
    --attention-backend TRITON_ATTN \
    --distributed-executor-backend ray