From 133ed9cfb9e1dbf97fc843ec2884a2a62574cee6 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Sat, 31 Jan 2026 16:12:33 -0800 Subject: [PATCH 1/2] bumped up MXFP4 base image version --- Dockerfile.mxfp4 | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Dockerfile.mxfp4 b/Dockerfile.mxfp4 index b0b569d..c218952 100644 --- a/Dockerfile.mxfp4 +++ b/Dockerfile.mxfp4 @@ -6,7 +6,7 @@ ARG BUILD_JOBS=16 # ========================================================= # STAGE 1: Base Image (Installs Dependencies) # ========================================================= -FROM nvcr.io/nvidia/pytorch:25.12-py3 AS base +FROM nvcr.io/nvidia/pytorch:26.01-py3 AS base # Build parallemism ARG BUILD_JOBS @@ -225,7 +225,7 @@ RUN --mount=type=cache,id=ccache,target=/root/.ccache \ # ========================================================= # STAGE 4: Runner (Transfers only necessary artifacts) # ========================================================= -FROM nvcr.io/nvidia/pytorch:25.12-py3 AS runner +FROM nvcr.io/nvidia/pytorch:26.01-py3 AS runner ENV DEBIAN_FRONTEND=noninteractive ENV PIP_BREAK_SYSTEM_PACKAGES=1 @@ -274,3 +274,7 @@ RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh # Final extra deps RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ uv pip install ray[default] fastsafetensors + +# If not compiling Triton +# remove triton-kernels as they are not compatible with this vLLM version yet +RUN uv pip uninstall triton-kernels From 4634ee92a26c797f9cd970b08c9d4e15f51969c9 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Mon, 2 Feb 2026 11:58:07 -0800 Subject: [PATCH 2/2] Added a mod for Nemotron Nano --- README.md | 28 ++++++++++++++++++++++++++++ mods/nemotron-nano/run.sh | 4 ++++ 2 files changed, 32 insertions(+) create mode 100644 mods/nemotron-nano/run.sh diff --git a/README.md b/README.md index 97a400f..9cbc8ef 100644 --- a/README.md +++ b/README.md @@ -144,6 +144,34 @@ Don't do it every time you rebuild, because it will slow 
down compilation times. For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h` +### 2026-02-02 + +Added a mod for nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B support. It supports all Nemotron Nano models/quants using the same reasoning parser. +To use, add `--apply-mod mods/nemotron-nano` to `./launch-cluster.sh` arguments. + +For example, to run nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 on a single node: + +```bash +./launch-cluster.sh --solo --apply-mod mods/nemotron-nano \ + -e VLLM_USE_FLASHINFER_MOE_FP4=1 \ + -e VLLM_FLASHINFER_MOE_BACKEND=throughput \ + exec vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \ + --max-num-seqs 8 \ + --tensor-parallel-size 1 \ + --max-model-len 262144 \ + --port 8888 --host 0.0.0.0 \ + --trust-remote-code \ + --enable-auto-tool-choice \ + --tool-call-parser qwen3_coder \ + --reasoning-parser-plugin nano_v3_reasoning_parser.py \ + --reasoning-parser nano_v3 \ + --kv-cache-dtype fp8 \ + --gpu-memory-utilization 0.7 \ + --load-format fastsafetensors +``` + +Please note that NVFP4 models on Spark are not fully supported on vLLM (any build) yet, so the performance will not be optimal. You will likely see Flashinfer errors during load. This model is also known to crash sometimes. + ### 2026-01-29 #### New Parameters for launch-cluster.sh diff --git a/mods/nemotron-nano/run.sh b/mods/nemotron-nano/run.sh new file mode 100644 index 0000000..8f2d581 --- /dev/null +++ b/mods/nemotron-nano/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +cd $VLLM_BASE_DIR +wget https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4/resolve/main/nano_v3_reasoning_parser.py