From 8caebe31557d47a80a0f623f01dfe63c12fbd09a Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Fri, 20 Mar 2026 17:03:18 -0700
Subject: [PATCH 1/6] Revert to CUDA image + PyTorch from wheels

---
 Dockerfile | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index cce0e4a..f664c95 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,9 +4,9 @@
 ARG BUILD_JOBS=16
 
 # =========================================================
-# STAGE 1: Base Image (Installs Dependencies)
+# STAGE 1: Base Build Image
 # =========================================================
-FROM nvcr.io/nvidia/pytorch:26.01-py3 AS base
+FROM nvidia/cuda:13.2.0-devel-ubuntu24.04 AS base
 
 # Build parallelism
 ARG BUILD_JOBS
@@ -35,10 +35,13 @@ ENV VLLM_BASE_DIR=/workspace/vllm
 # Added ccache to enable incremental compilation caching
 RUN apt update && \
     apt install -y --no-install-recommends \
-    curl vim ninja-build git \
+    curl vim cmake build-essential ninja-build \
+    libcudnn9-cuda-13 libcudnn9-dev-cuda-13 \
+    python3-dev python3-pip git wget \
+    libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
     ccache \
     && rm -rf /var/lib/apt/lists/* \
-    && pip install uv && pip uninstall -y flash-attn
+    && pip install uv
 
 # Configure Ccache for CUDA/C++
 ENV PATH=/usr/lib/ccache:$PATH
@@ -133,7 +136,8 @@ ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 WORKDIR $VLLM_BASE_DIR
 
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
+    uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 && \
+    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" triton
 
 # --- VLLM SOURCE CACHE BUSTER ---
 ARG CACHEBUST_VLLM=1
@@ -211,7 +215,7 @@ COPY --from=vllm-builder /workspace/wheels /
 # =========================================================
 # STAGE 6: Runner (Installs wheels from host ./wheels/)
 # =========================================================
-FROM nvcr.io/nvidia/pytorch:26.01-py3 AS runner
+FROM nvidia/cuda:13.2.0-devel-ubuntu24.04 AS runner
 
 # Transferring build settings from the build image because of ptxas/JIT compilation during vLLM startup
 # Build parallelism
@@ -235,10 +239,12 @@ ENV UV_LINK_MODE=copy
 # Install runtime dependencies
 RUN apt update && \
     apt install -y --no-install-recommends \
-    curl vim git \
+    python3 python3-pip python3-dev vim curl git wget \
+    libcudnn9-cuda-13 \
+    libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
     libxcb1 \
     && rm -rf /var/lib/apt/lists/* \
-    && pip install uv && pip uninstall -y flash-attn # triton-kernels pytorch-triton
+    && pip install uv
 
 # Set final working directory
 WORKDIR $VLLM_BASE_DIR
@@ -250,6 +256,11 @@ RUN mkdir -p tiktoken_encodings && \
 
 ARG PRE_TRANSFORMERS=0
 
+# Install dependencies
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 && \
+    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" triton
+
 # Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
 # With --tf5: override vLLM's transformers<5 constraint to get transformers>=5
 RUN --mount=type=bind,source=wheels,target=/workspace/wheels \
@@ -273,7 +284,7 @@ ENV PATH=$VLLM_BASE_DIR:$PATH
 
 # Final extra deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install ray[default] fastsafetensors nvidia-nvshmem-cu13
+    uv pip install ray[default] fastsafetensors
 
 # Cleanup
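Note on the ccache wiring above: it only pays off if a second build actually hits the cache. A minimal check, run inside the base stage around a rebuild of the CUDA extensions (standard ccache commands; the sequence is illustrative, not part of this patch):

    ccache --zero-stats    # reset counters before the rebuild being measured
    ccache --show-stats    # afterwards: non-zero hit counters mean the wiring works

Since CCACHE_DIR points at /root/.ccache inside the image, persisting it across docker builds would additionally need a BuildKit cache mount on the compile step (e.g. --mount=type=cache,target=/root/.ccache); whether the later compile stages do that is outside this hunk.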
From 8385506c5e1fe9003108d770bd20a84de58bfcef Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Fri, 20 Mar 2026 23:51:21 -0700
Subject: [PATCH 2/6] Fixes

---
 Dockerfile | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index f664c95..2a56775 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -43,6 +43,11 @@ RUN apt update && \
     && rm -rf /var/lib/apt/lists/* \
     && pip install uv
 
+# Additional deps
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu130 && \
+    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" filelock pynvml requests tqdm
+
 # Configure Ccache for CUDA/C++
 ENV PATH=/usr/lib/ccache:$PATH
 ENV CCACHE_DIR=/root/.ccache
@@ -76,9 +81,6 @@ ARG FLASHINFER_REF=main
 # Change this argument to force a re-download of FlashInfer
 ARG CACHEBUST_FLASHINFER=1
 
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
-
 # Smart Git Clone (Fetch changes instead of full re-clone)
 RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
     cd /repo-cache && \
@@ -135,10 +137,6 @@ ARG TORCH_CUDA_ARCH_LIST="12.1a"
 ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 WORKDIR $VLLM_BASE_DIR
 
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 && \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" triton
-
 # --- VLLM SOURCE CACHE BUSTER ---
 ARG CACHEBUST_VLLM=1
 
@@ -256,10 +254,10 @@ RUN mkdir -p tiktoken_encodings && \
 
 ARG PRE_TRANSFORMERS=0
 
-# Install dependencies
+# Install deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 && \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" triton
+    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu130 && \
+    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
 
 # Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
 # With --tf5: override vLLM's transformers<5 constraint to get transformers>=5
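Note: with the nightly cu130 torch install now living in the base stage, a one-liner confirms the installed wheel matches the CUDA 13 toolkit in the image (plain PyTorch API; nothing here is specific to this patch):

    python3 -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"

A healthy image should print a +cu130 nightly version string, a 13.x CUDA build, and True on a GPU host.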
From 6e8d85c9142bba60830f1983731ba92ff1ffcc7d Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Sat, 21 Mar 2026 15:12:12 -0700
Subject: [PATCH 3/6] cleanup

---
 Dockerfile | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 2a56775..ef0d766 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -282,21 +282,4 @@ ENV PATH=$VLLM_BASE_DIR:$PATH
 
 # Final extra deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install ray[default] fastsafetensors
-
-# Cleanup
-
-# Keeping it here for reference - this won't work as is without squashing layers
-# RUN uv pip uninstall absl-py apex argon2-cffi \
-#     argon2-cffi-bindings arrow asttokens astunparse async-lru audioread babel beautifulsoup4 \
-#     black bleach comm contourpy cycler datasets debugpy decorator defusedxml dllist dm-tree \
-#     execnet executing expecttest fastjsonschema fonttools fqdn gast hypothesis \
-#     ipykernel ipython ipython_pygments_lexers isoduration isort jedi joblib jupyter-events \
-#     jupyter-lsp jupyter_client jupyter_core jupyter_server jupyter_server_terminals jupyterlab \
-#     jupyterlab_code_formatter jupyterlab_code_formatter jupyterlab_pygments jupyterlab_server \
-#     jupyterlab_tensorboard_pro jupytext kiwisolver matplotlib matplotlib-inline matplotlib-inline \
-#     mistune ml_dtypes mock nbclient nbconvert nbformat nest-asyncio notebook notebook_shim \
-#     opt_einsum optree outlines_core overrides pandas pandocfilters parso pexpect polygraphy pooch \
-#     pyarrow pycocotools pytest-flakefinder pytest-rerunfailures pytest-shard pytest-xdist \
-#     scikit-learn scipy Send2Trash soundfile soupsieve soxr spin stack-data \
-#     wcwidth webcolors xdoctest Werkzeug
+    uv pip install ray[default] fastsafetensors
\ No newline at end of file

From 926dd57a871ebc415d43dfca5637b52c4ed98183 Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Sat, 21 Mar 2026 15:15:01 -0700
Subject: [PATCH 4/6] cuda 13.2 torch

---
 Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ef0d766..c517952 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -45,7 +45,7 @@ RUN apt update && \
 
 # Additional deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu130 && \
+    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu132 && \
     uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" filelock pynvml requests tqdm
 
 # Configure Ccache for CUDA/C++
 ENV PATH=/usr/lib/ccache:$PATH
 ENV CCACHE_DIR=/root/.ccache
@@ -256,7 +256,7 @@ ARG PRE_TRANSFORMERS=0
 
 # Install deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu130 && \
+    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu132 && \
     uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
 
 # Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
 # With --tf5: override vLLM's transformers<5 constraint to get transformers>=5
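Note: the cu132 bump above is reverted in the next patch. For future index bumps, a dry-run resolve against the target index is a cheap pre-check that matching wheels exist before touching the Dockerfile (standard uv flags; the command is illustrative, not part of the series):

    uv pip install --dry-run torch torchvision torchaudio triton \
        --index-url https://download.pytorch.org/whl/nightly/cu132

If the resolver fails or picks unexpected versions, the index is not ready for the bump.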
From 7a54657abff030c912d03bd613aaa8243eb981cf Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Sat, 21 Mar 2026 15:36:17 -0700
Subject: [PATCH 5/6] Revert "cuda 13.2 torch"

This reverts commit 926dd57a871ebc415d43dfca5637b52c4ed98183.
---
 Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index c517952..ef0d766 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -45,7 +45,7 @@ RUN apt update && \
 
 # Additional deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu132 && \
+    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu130 && \
     uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" filelock pynvml requests tqdm
 
 # Configure Ccache for CUDA/C++
 ENV PATH=/usr/lib/ccache:$PATH
 ENV CCACHE_DIR=/root/.ccache
@@ -256,7 +256,7 @@ ARG PRE_TRANSFORMERS=0
 
 # Install deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu132 && \
+    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu130 && \
     uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
 
 # Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
 # With --tf5: override vLLM's transformers<5 constraint to get transformers>=5

From 9e089acf2bf1c8e78dc1e59fd9be03c3079ea2f8 Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Sun, 22 Mar 2026 23:03:24 -0700
Subject: [PATCH 6/6] Update Nemotron recipes to use vLLM CUTLASS

---
 recipes/nemotron-3-nano-nvfp4.yaml  |  7 +------
 recipes/nemotron-3-super-nvfp4.yaml | 12 ++++--------
 2 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/recipes/nemotron-3-nano-nvfp4.yaml b/recipes/nemotron-3-nano-nvfp4.yaml
index c835e45..43f1383 100644
--- a/recipes/nemotron-3-nano-nvfp4.yaml
+++ b/recipes/nemotron-3-nano-nvfp4.yaml
@@ -27,15 +27,10 @@ defaults:
   gpu_memory_utilization: 0.7
   max_model_len: 262144
 
-# Environment variables
-env:
-  VLLM_NVFP4_GEMM_BACKEND: "marlin"
-  VLLM_TEST_FORCE_FP8_MARLIN: "1"
-  VLLM_MARLIN_USE_ATOMIC_ADD: "1"
-
 # The vLLM serve command template
 command: |
   vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
+    --moe-backend cutlass \
     --max-model-len {max_model_len} \
     --port {port} --host {host} \
     --trust-remote-code \
diff --git a/recipes/nemotron-3-super-nvfp4.yaml b/recipes/nemotron-3-super-nvfp4.yaml
index a4de32d..ec790c2 100644
--- a/recipes/nemotron-3-super-nvfp4.yaml
+++ b/recipes/nemotron-3-super-nvfp4.yaml
@@ -1,8 +1,8 @@
 # Recipe: Nemotron-3-Super-NVFP4
-# Optimized for Marlin backend throughput
+# Uses the vLLM CUTLASS backend for NVFP4
 recipe_version: "1"
-name: Nemotron-3-Super-NVFP4-Marlin-Optimized
-description: vLLM serving Nemotron-3-Super-120B using Marlin kernels
+name: Nemotron-3-Super-NVFP4-CUTLASS-Optimized
+description: vLLM serving Nemotron-3-Super-120B using CUTLASS kernels
 model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
 container: vllm-node
 
@@ -20,15 +20,11 @@ defaults:
   gpu_memory_utilization: 0.7
   max_model_len: 262144
   max_num_seqs: 10
-env:
-  VLLM_NVFP4_GEMM_BACKEND: "marlin"
-  VLLM_TEST_FORCE_FP8_MARLIN: "1"
-  VLLM_MARLIN_USE_ATOMIC_ADD: "1"
 
 command: |
   vllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \
     --kv-cache-dtype fp8 \
-    -tp {tensor_parallel} \
+    --moe-backend cutlass \
     --trust-remote-code \
     --gpu-memory-utilization {gpu_memory_utilization} \
     --max-model-len {max_model_len} \
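Note: a possible end-to-end smoke test for the series, tying the Dockerfile stages to the updated nano recipe. The image tag, port, and $(date +%s) cache-buster are illustrative assumptions; the build args, stage name, and serve flags all come from the files above:

    # Build the runner stage; ./wheels must exist in the build context (Stage 6 bind-mounts it)
    docker build --target runner -t vllm-node:dev \
        --build-arg BUILD_JOBS=16 \
        --build-arg TORCH_CUDA_ARCH_LIST="12.1a" \
        --build-arg CACHEBUST_VLLM=$(date +%s) .

    # Serve with the CUTLASS MoE backend, as the updated nano recipe does
    docker run --rm --gpus all -p 8000:8000 vllm-node:dev \
        vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
            --moe-backend cutlass \
            --max-model-len 262144 \
            --port 8000 --host 0.0.0.0 \
            --trust-remote-code \
            --gpu-memory-utilization 0.7

With the env: blocks gone from both recipes, backend selection now travels entirely through the --moe-backend flag rather than the VLLM_NVFP4_GEMM_BACKEND / Marlin environment variables.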