Separated Triton build into a dedicated phase for better caching

2025-12-14 10:32:28 -08:00
parent 25f759fec8
commit dc614dc6ae
1 changed files with 39 additions and 22 deletions
--- a/61
+++ b/61
@@ -1,7 +1,7 @@
 # syntax=docker/dockerfile:1.6
 # =========================================================
-# STAGE 1: Builder (Builds vLLM from Source)
+# STAGE 1: Base Image (Installs Dependencies)
 # =========================================================
 FROM nvidia/cuda:13.0.2-devel-ubuntu24.04 AS base
@@ -38,9 +38,6 @@ WORKDIR $VLLM_BASE_DIR
 ENV TORCH_CUDA_ARCH_LIST=12.1a
 ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
 # Initial Triton repo clone (cached forever) - before all cache busters
 RUN git clone https://github.com/triton-lang/triton.git
 # --- CACHE BUSTER ---
 # Change this argument to force a re-download of PyTorch/FlashInfer
 ARG CACHEBUST_DEPS=1
@@ -66,6 +63,39 @@ RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
    pip install flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu130 && \
    pip install apache-tvm-ffi nvidia-cudnn-frontend nvidia-cutlass-dsl nvidia-ml-py tabulate
 # =========================================================
 # STAGE 2: Triton Builder (Compiles Triton independently)
 # =========================================================
 # Dedicated stage that builds the Triton wheels on top of the shared `base`
 # stage. The wheels are written to /workspace/wheels so that a later stage can
 # pick them up with `COPY --from=triton-builder`.
 FROM base AS triton-builder
 WORKDIR $VLLM_BASE_DIR
 # Initial Triton repo clone. It is placed before the TRITON_SHA argument so
 # that changing the SHA does not invalidate this layer; the layer is reused
 # for as long as the `base` stage itself stays cached.
 RUN git clone https://github.com/triton-lang/triton.git
 # TRITON_SHA is expected to be passed on the command line (--build-arg); a new
 # value invalidates the cache for the build step below.
 # Defaults to the v3.5.1 commit.
 ARG TRITON_SHA=0add68262ab0a2e33b84524346cb27cbb2787356
 WORKDIR $VLLM_BASE_DIR/triton
 # This step is re-executed only if TRITON_SHA differs from the last build
 # (or if an earlier layer was invalidated).
 # Steps: fetch and check out the requested commit, sync and update submodules,
 # install python/requirements.txt, then build wheels for Triton and for
 # python/triton_kernels into /workspace/wheels.
 # --no-build-isolation makes pip build against the packages already installed
 # in this environment instead of a temporary isolated one.
 # The two cache mounts keep /root/.ccache and /root/.cache/pip on the build
 # host between builds; their contents are not stored in the image layers.
 # NOTE(review): presumably the Triton build is configured to use ccache --
 # confirm, otherwise the ccache mount has no effect.
 # NOTE(review): the first `pip wheel` has no --no-deps, so it may also write
 # wheels for Triton's dependencies into /workspace/wheels -- confirm intended.
 RUN --mount=type=cache,id=ccache,target=/root/.ccache \
    --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
    git fetch origin && \
    git checkout ${TRITON_SHA} && \
    git submodule sync && \
    git submodule update --init --recursive && \
    pip install -r python/requirements.txt && \
    mkdir -p /workspace/wheels && \
    pip wheel --no-build-isolation . --wheel-dir=/workspace/wheels -v && \
    pip wheel python/triton_kernels --no-deps --wheel-dir=/workspace/wheels
 # =========================================================
 # STAGE 3: vLLM Builder (Builds vLLM from Source)
 # =========================================================
 FROM base AS builder
 # --- VLLM SOURCE CACHE BUSTER ---
 # Change THIS argument to force a fresh git clone and rebuild of vLLM
 # without re-installing the dependencies above.
@@ -113,26 +143,13 @@ RUN --mount=type=cache,id=ccache,target=/root/.ccache \
    --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
    pip install --no-build-isolation . -v
-# Install latest Triton from main - override version pulled from dependencies
+# Install the Triton wheels built in the triton-builder stage
-
+COPY --from=triton-builder /workspace/wheels /workspace/wheels
-# We expect TRITON_SHA to be passed from the command line to break the cache
+RUN --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
-# Set to v3.5.1 commit by default
+    pip install /workspace/wheels/*.whl
 ARG TRITON_SHA=0add68262ab0a2e33b84524346cb27cbb2787356
 # This only runs if TRITON_SHA differs from the last build
 RUN --mount=type=cache,id=ccache,target=/root/.ccache \
    --mount=type=cache,id=pip-cache,target=/root/.cache/pip \
    cd triton && \
    git fetch origin && \
    git checkout ${TRITON_SHA} && \
    git submodule sync && \
    git submodule update --init --recursive && \
    pip install -r python/requirements.txt && \
    pip install --no-build-isolation . -v && \
    pip install python/triton_kernels --no-deps
 # =========================================================
-# STAGE 2: Runner (Transfers only necessary artifacts)
+# STAGE 4: Runner (Transfers only necessary artifacts)
 # =========================================================
 FROM nvidia/cuda:13.0.2-devel-ubuntu24.04 AS runner