Initial refactoring to enable separate wheel builds
Dockerfile (107 changed lines)

--- a/Dockerfile
+++ b/Dockerfile
@@ -60,36 +60,9 @@ ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
 
 # =========================================================
-# STAGE 2: Builder (Builds Triton, Flashinfer and vLLM from Source)
+# STAGE 2: FlashInfer Builder
 # =========================================================
-FROM base AS builder
-
-# # ======= Triton Build ==========
-
-# # Initial Triton repo clone (cached forever)
-# RUN git clone https://github.com/triton-lang/triton.git
-
-# # We expect TRITON_REF to be passed from the command line to break the cache
-# # Set to v3.6.0 by default
-# ARG TRITON_REF=v3.6.0
-
-# WORKDIR $VLLM_BASE_DIR/triton
-
-# # This only runs if TRITON_REF differs from the last build
-# RUN --mount=type=cache,id=ccache,target=/root/.ccache \
-#     --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-#     git fetch origin && \
-#     git checkout ${TRITON_REF} && \
-#     git submodule sync && \
-#     git submodule update --init --recursive && \
-#     uv pip install -r python/requirements.txt && \
-#     mkdir -p /workspace/wheels && \
-#     rm -rf .git && \
-#     uv build --no-build-isolation --wheel --out-dir=/workspace/wheels -v . && \
-#     uv build --no-build-isolation --wheel --no-index --out-dir=/workspace/wheels python/triton_kernels
-
-# ======= FlashInfer Build ==========
+FROM base AS flashinfer-builder
 
 ARG FLASHINFER_CUDA_ARCH_LIST="12.1a"
 ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST}
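Note: with the monolithic `builder` stage split into per-component stages, each stage can now be built in isolation via BuildKit's `--target` flag. A minimal sketch (the image tag is illustrative):

    # Build only the FlashInfer builder stage
    docker build --target flashinfer-builder -t flashinfer-builder:dev .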
@@ -98,17 +71,14 @@ ARG FLASHINFER_REF=main
 
 # --- CACHE BUSTER ---
 # Change this argument to force a re-download of FlashInfer
-ARG CACHEBUST_DEPS=1
+ARG CACHEBUST_FLASHINFER=1
 
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
 
-# 4. Smart Git Clone (Fetch changes instead of full re-clone)
-# We mount a cache at /repo-cache. This directory persists on your host machine.
+# Smart Git Clone (Fetch changes instead of full re-clone)
 RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
-    # 1. Go into the persistent cache directory
     cd /repo-cache && \
-    # 2. Logic: Clone if missing, otherwise Fetch & Reset
     if [ ! -d "flashinfer" ]; then \
         echo "Cache miss: Cloning FlashInfer from scratch..." && \
         git clone --recursive https://github.com/flashinfer-ai/flashinfer.git; \
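Note: a changed ARG value invalidates the cache for every instruction after its declaration, while the layers above it (here, the dependency install) stay cached. One way the build might be invoked to force a fresh FlashInfer checkout (values illustrative):

    docker build \
      --build-arg CACHEBUST_FLASHINFER=$(date +%s) \
      --build-arg FLASHINFER_REF=main .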
@@ -124,55 +94,54 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
     (git checkout --detach origin/${FLASHINFER_REF} 2>/dev/null || git checkout ${FLASHINFER_REF}) && \
     git submodule update --init --recursive && \
     git clean -fdx && \
-    # Optimize git repo size
     git gc --auto; \
     fi && \
-    # 3. Copy the updated code from the cache to the actual container workspace
-    # We use 'cp -a' to preserve permissions
     cp -a /repo-cache/flashinfer /workspace/flashinfer
 
-# Build FlashInfer wheels
-
 WORKDIR /workspace/flashinfer
 
 # Apply patch to avoid re-downloading existing cubins
 COPY flashinfer_cache.patch .
-RUN patch -p1 < flashinfer_cache.patch
-
-# flashinfer-python
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     --mount=type=cache,id=ccache,target=/root/.ccache \
     --mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \
+    patch -p1 < flashinfer_cache.patch && \
+    # flashinfer-python
+    sed -i -e 's/license = "Apache-2.0"/license = { text = "Apache-2.0" }/' -e '/license-files/d' pyproject.toml && \
+    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
+    # flashinfer-cubin
+    cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
+    # flashinfer-jit-cache
+    cd ../flashinfer-jit-cache && \
     uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
 
-# flashinfer-cubin
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    --mount=type=cache,id=ccache,target=/root/.ccache \
-    --mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \
-    cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
+# =========================================================
+# STAGE 3: FlashInfer Wheel Export
+# =========================================================
+FROM scratch AS flashinfer-export
+COPY --from=flashinfer-builder /workspace/wheels /
+
+# =========================================================
+# STAGE 4: vLLM Builder
+# =========================================================
+FROM base AS vllm-builder
+
+ARG TORCH_CUDA_ARCH_LIST="12.1a"
+ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 WORKDIR $VLLM_BASE_DIR
 
-# flashinfer-jit-cache
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    --mount=type=cache,id=ccache,target=/root/.ccache \
-    --mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \
-    cd flashinfer-jit-cache && \
-    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
+    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
 
 # --- VLLM SOURCE CACHE BUSTER ---
 # Change THIS argument to force a fresh git clone and rebuild of vLLM
 # without re-installing the dependencies above.
 ARG CACHEBUST_VLLM=1
 
 # Git reference (branch, tag, or SHA) to checkout
 ARG VLLM_REF=main
 
-# 4. Smart Git Clone (Fetch changes instead of full re-clone)
-# We mount a cache at /repo-cache. This directory persists on your host machine.
+# Smart Git Clone (Fetch changes instead of full re-clone)
 RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
-    # 1. Go into the persistent cache directory
     cd /repo-cache && \
-    # 2. Logic: Clone if missing, otherwise Fetch & Reset
     if [ ! -d "vllm" ]; then \
         echo "Cache miss: Cloning vLLM from scratch..." && \
         git clone --recursive https://github.com/vllm-project/vllm.git; \
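Note: the new FROM scratch export stage exists so the wheels can be pulled out of the build without running a container. With BuildKit, something like the following drops them onto the host (destination path illustrative):

    docker build --target flashinfer-export --output type=local,dest=./wheels .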
@@ -188,11 +157,8 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
     (git checkout --detach origin/${VLLM_REF} 2>/dev/null || git checkout ${VLLM_REF}) && \
     git submodule update --init --recursive && \
     git clean -fdx && \
-    # Optimize git repo size
     git gc --auto; \
     fi && \
-    # 3. Copy the updated code from the cache to the actual container workspace
-    # We use 'cp -a' to preserve permissions
     cp -a /repo-cache/vllm $VLLM_BASE_DIR/
 
 WORKDIR $VLLM_BASE_DIR/vllm
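Note: the fetch branch of this clone-or-fetch logic sits in unchanged context and is not shown in the hunk above. As a generic sketch of the pattern (not the commit's exact code):

    # Clone-or-fetch against a persistent cache directory
    cd /repo-cache
    if [ ! -d vllm ]; then
        git clone --recursive https://github.com/vllm-project/vllm.git
    else
        cd vllm && git fetch origin   # then checkout ${VLLM_REF}, as in the hunk above
    fi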
@@ -231,19 +197,18 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
 #     fi
 
 # Final Compilation
 # We mount the ccache directory here. Ideally, map this to a host volume for persistence
 # across totally separate `docker build` invocations.
 RUN --mount=type=cache,id=ccache,target=/root/.ccache \
     --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
 
-# # Install custom Triton from triton-builder
-# COPY --from=triton-builder /workspace/wheels /workspace/wheels
-# RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-#     uv pip install /workspace/wheels/*.whl
+# =========================================================
+# STAGE 5: vLLM Wheel Export
+# =========================================================
+FROM scratch AS vllm-export
+COPY --from=vllm-builder /workspace/wheels /
 
 # =========================================================
-# STAGE 4: Runner (Transfers only necessary artifacts)
+# STAGE 6: Runner (Installs wheels from host ./wheels/)
 # =========================================================
 FROM nvcr.io/nvidia/pytorch:26.01-py3 AS runner
 
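Note: the vllm-export stage mirrors flashinfer-export, so the vLLM wheel can be exported the same way (destination illustrative):

    docker build --target vllm-export --output type=local,dest=./wheels .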
@@ -282,10 +247,10 @@ RUN mkdir -p tiktoken_encodings && \
     wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
     wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
 
-# Copy artifacts from Builder Stage
-RUN --mount=type=bind,from=builder,source=/workspace/wheels,target=/mount/wheels \
+# Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
+RUN --mount=type=bind,source=wheels,target=/workspace/wheels \
     --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install /mount/wheels/*.whl
+    uv pip install /workspace/wheels/*.whl
 
 ARG PRE_TRANSFORMERS=0
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
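Taken together, the refactor implies a two-step workflow: export the wheels to the host, then build the runner, which bind-mounts ./wheels from the build context instead of COPYing from a builder stage. A sketch (tags illustrative):

    # 1. Export wheels from the builder stages to ./wheels/ on the host
    docker build --target flashinfer-export --output type=local,dest=./wheels .
    docker build --target vllm-export --output type=local,dest=./wheels .

    # 2. Build the runner; RUN --mount=type=bind,source=wheels,... picks the wheels up
    docker build --target runner -t vllm-runner:dev .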