# Recipe: Nemotron-3-Nano-NVFP4
# Nemotron-3-Nano model with NVFP4 quantization support
# Currently runs only in solo mode; cluster mode fails with an error

recipe_version: "1"
name: Nemotron-3-Nano-NVFP4
description: vLLM serving Nemotron-3-Nano-NVFP4 on a SINGLE NODE ONLY!

# HuggingFace model to download (optional, for --download-model)
model: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4

# Container image to use
container: vllm-node

# This model can only run on a single node (solo)
solo_only: true

# Mods to apply
mods:
  - mods/nemotron-nano

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 1
  gpu_memory_utilization: 0.7
  max_model_len: 131072

# Environment variables
env:
  VLLM_USE_FLASHINFER_MOE_FP4: 1
  VLLM_FLASHINFER_MOE_BACKEND: "throughput"

# The vLLM serve command template
command: |
  vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
    --max-model-len {max_model_len} \
    --port {port} --host {host} \
    --trust-remote-code \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --reasoning-parser-plugin nano_v3_reasoning_parser.py \
    --reasoning-parser nano_v3 \
    --kv-cache-dtype fp8 \
    --enable-prefix-caching \
    --attention-backend flashinfer \
    --load-format fastsafetensors \
    --gpu-memory-utilization {gpu_memory_utilization}
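
# For illustration only: with the defaults above substituted into the template,
# the rendered command should look roughly like the following. The exact
# rendering depends on the recipe runner, and exporting the env block as shell
# variables up front is an assumption here, not part of this recipe.
#
#   VLLM_USE_FLASHINFER_MOE_FP4=1 VLLM_FLASHINFER_MOE_BACKEND=throughput \
#   vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
#     --max-model-len 131072 \
#     --port 8000 --host 0.0.0.0 \
#     --trust-remote-code \
#     --enable-auto-tool-choice \
#     --tool-call-parser qwen3_coder \
#     --reasoning-parser-plugin nano_v3_reasoning_parser.py \
#     --reasoning-parser nano_v3 \
#     --kv-cache-dtype fp8 \
#     --enable-prefix-caching \
#     --attention-backend flashinfer \
#     --load-format fastsafetensors \
#     --gpu-memory-utilization 0.7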