From 74876dd4424adf7a0978efd3fb9da222b84fde23 Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Mon, 9 Feb 2026 14:33:35 -0800
Subject: [PATCH] Add recipes for nemotron-3-nano and qwen3-coder-next

---
 recipes/minimax-m2-awq.yaml        |  3 ++
 recipes/nemotron-3-nano-nvfp4.yaml | 47 ++++++++++++++++++++++++++++++
 recipes/qwen3-coder-next-fp8.yaml  | 43 +++++++++++++++++++++++++++
 3 files changed, 93 insertions(+)
 create mode 100644 recipes/nemotron-3-nano-nvfp4.yaml
 create mode 100644 recipes/qwen3-coder-next-fp8.yaml

diff --git a/recipes/minimax-m2-awq.yaml b/recipes/minimax-m2-awq.yaml
index 8d584c7..52079d4 100644
--- a/recipes/minimax-m2-awq.yaml
+++ b/recipes/minimax-m2-awq.yaml
@@ -11,6 +11,9 @@ model: QuantTrio/MiniMax-M2-AWQ
 
 # Container image to use
 container: vllm-node
 
+# Can only be run in cluster mode
+cluster_only: true
+
 # No mods required
 mods: []
diff --git a/recipes/nemotron-3-nano-nvfp4.yaml b/recipes/nemotron-3-nano-nvfp4.yaml
new file mode 100644
index 0000000..15cf910
--- /dev/null
+++ b/recipes/nemotron-3-nano-nvfp4.yaml
@@ -0,0 +1,47 @@
+# Recipe: Nemotron-3-Nano-NVFP4
+# Nemotron-3-Nano model with NVFP4 quantization support
+# Currently can only be run in solo mode; cluster mode fails with an error
+
+recipe_version: "1"
+name: Nemotron-3-Nano-NVFP4
+description: vLLM serving Nemotron-3-Nano-NVFP4 on a SINGLE NODE ONLY!
+
+# HuggingFace model to download (optional, for --download-model)
+model: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4
+
+# Container image to use
+container: vllm-node
+
+# This model can only run on a single node (solo)
+solo_only: true
+
+# Requires the nemotron-nano mod
+mods:
+  - mods/nemotron-nano
+
+# Default settings (can be overridden via CLI)
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  tensor_parallel: 1
+  gpu_memory_utilization: 0.7
+  max_model_len: 131072
+
+# Environment variables
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: 1
+  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
+
+# The vLLM serve command template
+command: |
+  vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
+    --max-model-len {max_model_len} \
+    --port {port} --host {host} \
+    --trust-remote-code \
+    --enable-auto-tool-choice \
+    --tool-call-parser qwen3_coder \
+    --reasoning-parser-plugin nano_v3_reasoning_parser.py \
+    --reasoning-parser nano_v3 \
+    --kv-cache-dtype fp8 \
+    --load-format fastsafetensors \
+    --gpu-memory-utilization {gpu_memory_utilization}
diff --git a/recipes/qwen3-coder-next-fp8.yaml b/recipes/qwen3-coder-next-fp8.yaml
new file mode 100644
index 0000000..0200c0c
--- /dev/null
+++ b/recipes/qwen3-coder-next-fp8.yaml
@@ -0,0 +1,43 @@
+# Recipe: Qwen3-Coder-Next-FP8
+# Qwen3-Coder-Next model in native FP8 format
+# Currently can only be run in solo mode; cluster mode fails with an error (tracking https://github.com/vllm-project/vllm/issues/33857)
+
+recipe_version: "1"
+name: Qwen3-Coder-Next-FP8
+description: vLLM serving Qwen3-Coder-Next-FP8 on a SINGLE NODE ONLY!
+
+# HuggingFace model to download (optional, for --download-model)
+model: Qwen/Qwen3-Coder-Next-FP8
+
+# This model can only run on a single node (solo)
+solo_only: true
+
+# Container image to use
+container: vllm-node
+
+# No mods required
+mods: []
+
+# Default settings (can be overridden via CLI)
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  tensor_parallel: 1
+  gpu_memory_utilization: 0.7
+  max_model_len: 131072
+
+# Environment variables
+env: {}
+
+# The vLLM serve command template
+command: |
+  vllm serve Qwen/Qwen3-Coder-Next-FP8 \
+    --enable-auto-tool-choice \
+    --tool-call-parser qwen3_coder \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    --host {host} \
+    --port {port} \
+    --load-format fastsafetensors \
+    --attention-backend flashinfer \
+    --enable-prefix-caching \
+    --max-model-len {max_model_len}
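
Note: with the defaults above and no CLI overrides, the nemotron-3-nano
recipe should render to roughly the launch below. This is a sketch, not
captured runner output: it assumes the recipe runner exports the env map
and does plain {placeholder} substitution from the defaults section.

    # Sketch: assumed rendering of recipes/nemotron-3-nano-nvfp4.yaml
    # with its default settings substituted in.
    export VLLM_USE_FLASHINFER_MOE_FP4=1
    export VLLM_FLASHINFER_MOE_BACKEND=throughput
    vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
      --max-model-len 131072 \
      --port 8000 --host 0.0.0.0 \
      --trust-remote-code \
      --enable-auto-tool-choice \
      --tool-call-parser qwen3_coder \
      --reasoning-parser-plugin nano_v3_reasoning_parser.py \
      --reasoning-parser nano_v3 \
      --kv-cache-dtype fp8 \
      --load-format fastsafetensors \
      --gpu-memory-utilization 0.7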
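Both recipes serve vLLM's OpenAI-compatible HTTP API on {host}:{port}, so
a quick smoke test against the defaults could look like the following. It
assumes the server is reachable on localhost:8000 and that the model is
served under its HuggingFace name (vLLM's default served-model name).

    # List served models, then issue a one-shot chat completion.
    curl -s http://localhost:8000/v1/models
    curl -s http://localhost:8000/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model": "Qwen/Qwen3-Coder-Next-FP8",
           "messages": [{"role": "user", "content": "Hello"}],
           "max_tokens": 32}'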