From 8caebe31557d47a80a0f623f01dfe63c12fbd09a Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Fri, 20 Mar 2026 17:03:18 -0700
Subject: [PATCH 1/6] Revert to CUDA image + PyTorch from wheels

---
 Dockerfile | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index cce0e4a..f664c95 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,9 +4,9 @@
 ARG BUILD_JOBS=16
 
 # =========================================================
-# STAGE 1: Base Image (Installs Dependencies)
+# STAGE 1: Base Build Image
 # =========================================================
-FROM nvcr.io/nvidia/pytorch:26.01-py3 AS base
+FROM nvidia/cuda:13.2.0-devel-ubuntu24.04 AS base
 
 # Build parallelism
 ARG BUILD_JOBS
@@ -35,10 +35,13 @@ ENV VLLM_BASE_DIR=/workspace/vllm
 # Added ccache to enable incremental compilation caching
 RUN apt update && \
     apt install -y --no-install-recommends \
-    curl vim ninja-build git \
+    curl vim cmake build-essential ninja-build \
+    libcudnn9-cuda-13 libcudnn9-dev-cuda-13 \
+    python3-dev python3-pip git wget \
+    libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
     ccache \
     && rm -rf /var/lib/apt/lists/* \
-    && pip install uv && pip uninstall -y flash-attn
+    && pip install uv
 
 # Configure Ccache for CUDA/C++
 ENV PATH=/usr/lib/ccache:$PATH
@@ -133,7 +136,8 @@ ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 WORKDIR $VLLM_BASE_DIR
 
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
+    uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 && \
+    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" triton
 
 # --- VLLM SOURCE CACHE BUSTER ---
 ARG CACHEBUST_VLLM=1
@@ -211,7 +215,7 @@ COPY --from=vllm-builder /workspace/wheels /
 # =========================================================
 # STAGE 6: Runner (Installs wheels from host ./wheels/)
 # =========================================================
-FROM nvcr.io/nvidia/pytorch:26.01-py3 AS runner
+FROM nvidia/cuda:13.2.0-devel-ubuntu24.04 AS runner
 
 # Transferring build settings from the build image because of ptxas/JIT compilation during vLLM startup
 # Build parallelism
@@ -235,10 +239,12 @@ ENV UV_LINK_MODE=copy
 # Install runtime dependencies
 RUN apt update && \
     apt install -y --no-install-recommends \
-    curl vim git \
+    python3 python3-pip python3-dev vim curl git wget \
+    libcudnn9-cuda-13 \
+    libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
     libxcb1 \
     && rm -rf /var/lib/apt/lists/* \
-    && pip install uv && pip uninstall -y flash-attn # triton-kernels pytorch-triton
+    && pip install uv
 
 # Set final working directory
 WORKDIR $VLLM_BASE_DIR
@@ -250,6 +256,11 @@ RUN mkdir -p tiktoken_encodings && \
 
 ARG PRE_TRANSFORMERS=0
 
+# Install dependencies
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 && \
+    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" triton
+
 # Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
 # With --tf5: override vLLM's transformers<5 constraint to get transformers>=5
 RUN --mount=type=bind,source=wheels,target=/workspace/wheels \
@@ -273,7 +284,7 @@ ENV PATH=$VLLM_BASE_DIR:$PATH
 
 # Final extra deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install ray[default] fastsafetensors nvidia-nvshmem-cu13
+    uv pip install ray[default] fastsafetensors
 
 # Cleanup
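Note on the ccache wiring above: it only pays off if a second build actually hits the cache. A minimal check, run inside the base stage around a rebuild of the CUDA extensions (standard ccache commands; the sequence is illustrative, not part of this patch):

    ccache --zero-stats    # reset counters before the rebuild being measured
    ccache --show-stats    # afterwards: non-zero hit counters mean the wiring works

Since CCACHE_DIR points at /root/.ccache inside the image, persisting it across docker builds would additionally need a BuildKit cache mount on the compile step (e.g. --mount=type=cache,target=/root/.ccache); whether the later compile stages do that is outside this hunk.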
From 8385506c5e1fe9003108d770bd20a84de58bfcef Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Fri, 20 Mar 2026 23:51:21 -0700
Subject: [PATCH 2/6] Fixes

---
 Dockerfile | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index f664c95..2a56775 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -43,6 +43,11 @@ RUN apt update && \
     && rm -rf /var/lib/apt/lists/* \
     && pip install uv
 
+# Additional deps
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu130 && \
+    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" filelock pynvml requests tqdm
+
 # Configure Ccache for CUDA/C++
 ENV PATH=/usr/lib/ccache:$PATH
 ENV CCACHE_DIR=/root/.ccache
@@ -76,9 +81,6 @@ ARG FLASHINFER_REF=main
 # Change this argument to force a re-download of FlashInfer
 ARG CACHEBUST_FLASHINFER=1
 
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
-
 # Smart Git Clone (Fetch changes instead of full re-clone)
 RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
     cd /repo-cache && \
@@ -135,10 +137,6 @@ ARG TORCH_CUDA_ARCH_LIST="12.1a"
 ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 WORKDIR $VLLM_BASE_DIR
 
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 && \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" triton
-
 # --- VLLM SOURCE CACHE BUSTER ---
 ARG CACHEBUST_VLLM=1
 
@@ -256,10 +254,10 @@ RUN mkdir -p tiktoken_encodings && \
 
 ARG PRE_TRANSFORMERS=0
 
-# Install dependencies
+# Install deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130 && \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" triton
+    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu130 && \
+    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
 
 # Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
 # With --tf5: override vLLM's transformers<5 constraint to get transformers>=5
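Note: with the nightly cu130 torch install now living in the base stage, a one-liner confirms the installed wheel matches the CUDA 13 toolkit in the image (plain PyTorch API; nothing here is specific to this patch):

    python3 -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"

A healthy image should print a +cu130 nightly version string, a 13.x CUDA build, and True on a GPU host.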
From 6e8d85c9142bba60830f1983731ba92ff1ffcc7d Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Sat, 21 Mar 2026 15:12:12 -0700
Subject: [PATCH 3/6] cleanup

---
 Dockerfile | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 2a56775..ef0d766 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -282,21 +282,4 @@ ENV PATH=$VLLM_BASE_DIR:$PATH
 
 # Final extra deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install ray[default] fastsafetensors
-
-# Cleanup
-
-# Keeping it here for reference - this won't work as is without squashing layers
-# RUN uv pip uninstall absl-py apex argon2-cffi \
-#     argon2-cffi-bindings arrow asttokens astunparse async-lru audioread babel beautifulsoup4 \
-#     black bleach comm contourpy cycler datasets debugpy decorator defusedxml dllist dm-tree \
-#     execnet executing expecttest fastjsonschema fonttools fqdn gast hypothesis \
-#     ipykernel ipython ipython_pygments_lexers isoduration isort jedi joblib jupyter-events \
-#     jupyter-lsp jupyter_client jupyter_core jupyter_server jupyter_server_terminals jupyterlab \
-#     jupyterlab_code_formatter jupyterlab_code_formatter jupyterlab_pygments jupyterlab_server \
-#     jupyterlab_tensorboard_pro jupytext kiwisolver matplotlib matplotlib-inline matplotlib-inline \
-#     mistune ml_dtypes mock nbclient nbconvert nbformat nest-asyncio notebook notebook_shim \
-#     opt_einsum optree outlines_core overrides pandas pandocfilters parso pexpect polygraphy pooch \
-#     pyarrow pycocotools pytest-flakefinder pytest-rerunfailures pytest-shard pytest-xdist \
-#     scikit-learn scipy Send2Trash soundfile soupsieve soxr spin stack-data \
-#     wcwidth webcolors xdoctest Werkzeug
+    uv pip install ray[default] fastsafetensors
\ No newline at end of file

From 926dd57a871ebc415d43dfca5637b52c4ed98183 Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Sat, 21 Mar 2026 15:15:01 -0700
Subject: [PATCH 4/6] cuda 13.2 torch

---
 Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ef0d766..c517952 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -45,7 +45,7 @@ RUN apt update && \
 
 # Additional deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu130 && \
+    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu132 && \
     uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" filelock pynvml requests tqdm
 
 # Configure Ccache for CUDA/C++
 ENV PATH=/usr/lib/ccache:$PATH
 ENV CCACHE_DIR=/root/.ccache
@@ -256,7 +256,7 @@ ARG PRE_TRANSFORMERS=0
 
 # Install deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu130 && \
+    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu132 && \
     uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
 
 # Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
 # With --tf5: override vLLM's transformers<5 constraint to get transformers>=5
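Note: the cu132 bump above is reverted in the next patch. For future index bumps, a dry-run resolve against the target index is a cheap pre-check that matching wheels exist before touching the Dockerfile (standard uv flags; the command is illustrative, not part of the series):

    uv pip install --dry-run torch torchvision torchaudio triton \
        --index-url https://download.pytorch.org/whl/nightly/cu132

If the resolver fails or picks unexpected versions, the index is not ready for the bump.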
From 7a54657abff030c912d03bd613aaa8243eb981cf Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Sat, 21 Mar 2026 15:36:17 -0700
Subject: [PATCH 5/6] Revert "cuda 13.2 torch"

This reverts commit 926dd57a871ebc415d43dfca5637b52c4ed98183.
---
 Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index c517952..ef0d766 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -45,7 +45,7 @@ RUN apt update && \
 
 # Additional deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu132 && \
+    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu130 && \
     uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" filelock pynvml requests tqdm
 
 # Configure Ccache for CUDA/C++
 ENV PATH=/usr/lib/ccache:$PATH
 ENV CCACHE_DIR=/root/.ccache
@@ -256,7 +256,7 @@ ARG PRE_TRANSFORMERS=0
 
 # Install deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu132 && \
+    uv pip install torch torchvision torchaudio triton --index-url https://download.pytorch.org/whl/nightly/cu130 && \
     uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
 
 # Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
 # With --tf5: override vLLM's transformers<5 constraint to get transformers>=5

From 9e089acf2bf1c8e78dc1e59fd9be03c3079ea2f8 Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Sun, 22 Mar 2026 23:03:24 -0700
Subject: [PATCH 6/6] Update Nemotron recipes to use vLLM CUTLASS

---
 recipes/nemotron-3-nano-nvfp4.yaml  |  7 +------
 recipes/nemotron-3-super-nvfp4.yaml | 12 ++++--------
 2 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/recipes/nemotron-3-nano-nvfp4.yaml b/recipes/nemotron-3-nano-nvfp4.yaml
index c835e45..43f1383 100644
--- a/recipes/nemotron-3-nano-nvfp4.yaml
+++ b/recipes/nemotron-3-nano-nvfp4.yaml
@@ -27,15 +27,10 @@ defaults:
   gpu_memory_utilization: 0.7
   max_model_len: 262144
 
-# Environment variables
-env:
-  VLLM_NVFP4_GEMM_BACKEND: "marlin"
-  VLLM_TEST_FORCE_FP8_MARLIN: "1"
-  VLLM_MARLIN_USE_ATOMIC_ADD: "1"
-
 # The vLLM serve command template
 command: |
   vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
+    --moe-backend cutlass \
     --max-model-len {max_model_len} \
     --port {port} --host {host} \
     --trust-remote-code \
diff --git a/recipes/nemotron-3-super-nvfp4.yaml b/recipes/nemotron-3-super-nvfp4.yaml
index a4de32d..ec790c2 100644
--- a/recipes/nemotron-3-super-nvfp4.yaml
+++ b/recipes/nemotron-3-super-nvfp4.yaml
@@ -1,8 +1,8 @@
 # Recipe: Nemotron-3-Super-NVFP4
-# Optimized for Marlin backend throughput
+# Uses the vLLM CUTLASS backend for NVFP4
 recipe_version: "1"
-name: Nemotron-3-Super-NVFP4-Marlin-Optimized
-description: vLLM serving Nemotron-3-Super-120B using Marlin kernels
+name: Nemotron-3-Super-NVFP4-CUTLASS-Optimized
+description: vLLM serving Nemotron-3-Super-120B using CUTLASS kernels
 model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
 container: vllm-node
 
@@ -20,15 +20,11 @@ defaults:
   gpu_memory_utilization: 0.7
   max_model_len: 262144
   max_num_seqs: 10
-env:
-  VLLM_NVFP4_GEMM_BACKEND: "marlin"
-  VLLM_TEST_FORCE_FP8_MARLIN: "1"
-  VLLM_MARLIN_USE_ATOMIC_ADD: "1"
 
 command: |
   vllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \
     --kv-cache-dtype fp8 \
-    -tp {tensor_parallel} \
+    --moe-backend cutlass \
     --trust-remote-code \
     --gpu-memory-utilization {gpu_memory_utilization} \
     --max-model-len {max_model_len} \
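Note: a possible end-to-end smoke test for the series, tying the Dockerfile stages to the updated nano recipe. The image tag, port, and $(date +%s) cache-buster are illustrative assumptions; the build args, stage name, and serve flags all come from the files above:

    # Build the runner stage; ./wheels must exist in the build context (Stage 6 bind-mounts it)
    docker build --target runner -t vllm-node:dev \
        --build-arg BUILD_JOBS=16 \
        --build-arg TORCH_CUDA_ARCH_LIST="12.1a" \
        --build-arg CACHEBUST_VLLM=$(date +%s) .

    # Serve with the CUTLASS MoE backend, as the updated nano recipe does
    docker run --rm --gpus all -p 8000:8000 vllm-node:dev \
        vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
            --moe-backend cutlass \
            --max-model-len 262144 \
            --port 8000 --host 0.0.0.0 \
            --trust-remote-code \
            --gpu-memory-utilization 0.7

With the env: blocks gone from both recipes, backend selection now travels entirely through the --moe-backend flag rather than the VLLM_NVFP4_GEMM_BACKEND / Marlin environment variables.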