From dc614dc6ae543f921f688a5c438b2741038eeb97 Mon Sep 17 00:00:00 2001
From: eugr <eugr@spark2.home.eugr.net>
Date: Sun, 14 Dec 2025 10:32:28 -0800
Subject: [PATCH] Separated Triton build into a dedicated phase for better
 caching

---
 Dockerfile | 61 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ce376be..9b811ee 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,7 @@
 # syntax=docker/dockerfile:1.6
 
 # =========================================================
-# STAGE 1: Builder (Builds vLLM from Source)
+# STAGE 1: Base Image (Installs Dependencies)
 # =========================================================
 FROM nvidia/cuda:13.0.2-devel-ubuntu24.04 AS base
 
@@ -38,9 +38,6 @@ WORKDIR $VLLM_BASE_DIR
 ENV TORCH_CUDA_ARCH_LIST=12.1a
 ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
 
-# Initial Triton repo clone (cached forever) - before all cache busters
-RUN git clone https://github.com/triton-lang/triton.git
-
 # --- CACHE BUSTER ---
 # Change this argument to force a re-download of PyTorch/FlashInfer
 ARG CACHEBUST_DEPS=1
@@ -66,6 +63,39 @@ RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
     pip install flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu130 && \
     pip install apache-tvm-ffi nvidia-cudnn-frontend nvidia-cutlass-dsl nvidia-ml-py tabulate
 
+# =========================================================
+# STAGE 2: Triton Builder (Compiles Triton independently)
+# =========================================================
+FROM base AS triton-builder
+
+WORKDIR $VLLM_BASE_DIR
+
+# Initial Triton repo clone (cached forever)
+RUN git clone https://github.com/triton-lang/triton.git
+
+# We expect TRITON_SHA to be passed from the command line to break the cache
+# Set to v3.5.1 commit by default
+ARG TRITON_SHA=0add68262ab0a2e33b84524346cb27cbb2787356
+
+WORKDIR $VLLM_BASE_DIR/triton
+
+# This only runs if TRITON_SHA differs from the last build
+RUN --mount=type=cache,id=ccache,target=/root/.ccache \
+    --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
+    git fetch origin && \
+    git checkout ${TRITON_SHA} && \
+    git submodule sync && \
+    git submodule update --init --recursive && \
+    pip install -r python/requirements.txt && \
+    mkdir -p /workspace/wheels && \
+    pip wheel --no-build-isolation . --wheel-dir=/workspace/wheels -v && \
+    pip wheel python/triton_kernels --no-deps --wheel-dir=/workspace/wheels
+
+# =========================================================
+# STAGE 3: vLLM Builder (Builds vLLM from Source)
+# =========================================================
+FROM base AS builder
+
 # --- VLLM SOURCE CACHE BUSTER ---
 # Change THIS argument to force a fresh git clone and rebuild of vLLM
 # without re-installing the dependencies above.
@@ -113,26 +143,13 @@ RUN --mount=type=cache,id=ccache,target=/root/.ccache \
     --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
     pip install --no-build-isolation . -v
 
-# Install latest Triton from main - override version pulled from dependencies
-
-# We expect TRITON_SHA to be passed from the command line to break the cache
-# Set to v3.5.1 commit by default
-ARG TRITON_SHA=0add68262ab0a2e33b84524346cb27cbb2787356
-
-# This only runs if TRITON_SHA differs from the last build
-RUN --mount=type=cache,id=ccache,target=/root/.ccache \
-    --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
-    cd triton && \
-    git fetch origin && \
-    git checkout ${TRITON_SHA} && \
-    git submodule sync && \
-    git submodule update --init --recursive && \
-    pip install -r python/requirements.txt && \
-    pip install --no-build-isolation . -v && \
-    pip install python/triton_kernels --no-deps
+# Install custom Triton from triton-builder
+COPY --from=triton-builder /workspace/wheels /workspace/wheels
+RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
+    pip install /workspace/wheels/*.whl
 
 # =========================================================
-# STAGE 2: Runner (Transfers only necessary artifacts)
+# STAGE 4: Runner (Transfers only necessary artifacts)
 # =========================================================
 FROM nvidia/cuda:13.0.2-devel-ubuntu24.04 AS runner