Initial refactoring to enable separate wheel builds

2026-02-17 19:15:32 -08:00
parent 5b2313dddb
commit ec0f189256
3 changed files with 204 additions and 200 deletions
--- a/107
+++ b/107
@@ -60,36 +60,9 @@ ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas

 # =========================================================
-# STAGE 2: Builder (Builds Triton, Flashinfer and vLLM from Source)
+# STAGE 2: FlashInfer Builder
 # =========================================================
-FROM base AS builder
-
-
-# # ======= Triton Build ========== 
-
-# # Initial Triton repo clone (cached forever)
-# RUN git clone https://github.com/triton-lang/triton.git
-
-# # We expect TRITON_REF to be passed from the command line to break the cache
-# # Set to v3.6.0 by default
-# ARG TRITON_REF=v3.6.0
-
-# WORKDIR $VLLM_BASE_DIR/triton
-
-# # This only runs if TRITON_REF differs from the last build
-# RUN --mount=type=cache,id=ccache,target=/root/.ccache \
-#     --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-#     git fetch origin && \
-#     git checkout ${TRITON_REF} && \
-#     git submodule sync && \
-#     git submodule update --init --recursive && \
-#     uv pip install -r python/requirements.txt && \
-#     mkdir -p /workspace/wheels && \
-#     rm -rf .git && \
-#     uv build --no-build-isolation --wheel --out-dir=/workspace/wheels -v .  && \
-#     uv build --no-build-isolation --wheel --no-index --out-dir=/workspace/wheels python/triton_kernels 
-
-# ======= FlashInfer Build ==========
+FROM base AS flashinfer-builder

 ARG FLASHINFER_CUDA_ARCH_LIST="12.1a"
 ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST}
@@ -98,17 +71,14 @@ ARG FLASHINFER_REF=main

 # --- CACHE BUSTER ---
 # Change this argument to force a re-download of FlashInfer
-ARG CACHEBUST_DEPS=1
+ARG CACHEBUST_FLASHINFER=1

 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"

-# 4. Smart Git Clone (Fetch changes instead of full re-clone)
-# We mount a cache at /repo-cache. This directory persists on your host machine.
+# Smart Git Clone (Fetch changes instead of full re-clone)
 RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
-    # 1. Go into the persistent cache directory
    cd /repo-cache && \
-    # 2. Logic: Clone if missing, otherwise Fetch & Reset
    if [ ! -d "flashinfer" ]; then \
        echo "Cache miss: Cloning FlashInfer from scratch..." && \
        git clone --recursive https://github.com/flashinfer-ai/flashinfer.git; \
@@ -124,55 +94,54 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
        (git checkout --detach origin/${FLASHINFER_REF} 2>/dev/null || git checkout ${FLASHINFER_REF}) && \
        git submodule update --init --recursive && \
        git clean -fdx && \
-        # Optimize git repo size
        git gc --auto; \
    fi && \
-    # 3. Copy the updated code from the cache to the actual container workspace
-    # We use 'cp -a' to preserve permissions
    cp -a /repo-cache/flashinfer /workspace/flashinfer

-# Build FlashInfer wheels
-
 WORKDIR /workspace/flashinfer

 # Apply patch to avoid re-downloading existing cubins
 COPY flashinfer_cache.patch .
-RUN patch -p1 < flashinfer_cache.patch
-
-# flashinfer-python
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    --mount=type=cache,id=ccache,target=/root/.ccache \
    --mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \
+    patch -p1 < flashinfer_cache.patch && \
+    # flashinfer-python
    sed -i -e 's/license = "Apache-2.0"/license = { text = "Apache-2.0" }/' -e '/license-files/d' pyproject.toml && \
+    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
+    # flashinfer-cubin
+    cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
+    # flashinfer-jit-cache
+    cd ../flashinfer-jit-cache && \
    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v

-# flashinfer-cubin
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    --mount=type=cache,id=ccache,target=/root/.ccache \
-    --mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \
-    cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
+# =========================================================
+# STAGE 3: FlashInfer Wheel Export
+# =========================================================
+FROM scratch AS flashinfer-export
+COPY --from=flashinfer-builder /workspace/wheels /
+
+# =========================================================
+# STAGE 4: vLLM Builder
+# =========================================================
+FROM base AS vllm-builder
+
+ARG TORCH_CUDA_ARCH_LIST="12.1a"
+ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
+WORKDIR $VLLM_BASE_DIR

-# flashinfer-jit-cache
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    --mount=type=cache,id=ccache,target=/root/.ccache \
-    --mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \
-    cd flashinfer-jit-cache && \
-    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
+     uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"

 # --- VLLM SOURCE CACHE BUSTER ---
-# Change THIS argument to force a fresh git clone and rebuild of vLLM
-# without re-installing the dependencies above.
 ARG CACHEBUST_VLLM=1

 # Git reference (branch, tag, or SHA) to checkout
 ARG VLLM_REF=main

-# 4. Smart Git Clone (Fetch changes instead of full re-clone)
-# We mount a cache at /repo-cache. This directory persists on your host machine.
+# Smart Git Clone (Fetch changes instead of full re-clone)
 RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
-    # 1. Go into the persistent cache directory
    cd /repo-cache && \
-    # 2. Logic: Clone if missing, otherwise Fetch & Reset
    if [ ! -d "vllm" ]; then \
        echo "Cache miss: Cloning vLLM from scratch..." && \
        git clone --recursive https://github.com/vllm-project/vllm.git; \
@@ -188,11 +157,8 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
        (git checkout --detach origin/${VLLM_REF} 2>/dev/null || git checkout ${VLLM_REF}) && \
        git submodule update --init --recursive && \
        git clean -fdx && \
-        # Optimize git repo size
        git gc --auto; \
    fi && \
-    # 3. Copy the updated code from the cache to the actual container workspace
-    # We use 'cp -a' to preserve permissions
    cp -a /repo-cache/vllm $VLLM_BASE_DIR/

 WORKDIR $VLLM_BASE_DIR/vllm
@@ -231,19 +197,18 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
 #     fi

 # Final Compilation
-# We mount the ccache directory here. Ideally, map this to a host volume for persistence 
-# across totally separate `docker build` invocations.
 RUN --mount=type=cache,id=ccache,target=/root/.ccache \
    --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v

-# # Install custom Triton from triton-builder
-# COPY --from=triton-builder /workspace/wheels /workspace/wheels
-# RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-#     uv pip install /workspace/wheels/*.whl
+# =========================================================
+# STAGE 5: vLLM Wheel Export
+# =========================================================
+FROM scratch AS vllm-export
+COPY --from=vllm-builder /workspace/wheels /

 # =========================================================
-# STAGE 4: Runner (Transfers only necessary artifacts)
+# STAGE 6: Runner (Installs wheels from host ./wheels/)
 # =========================================================
 FROM nvcr.io/nvidia/pytorch:26.01-py3 AS runner

@@ -282,10 +247,10 @@ RUN mkdir -p tiktoken_encodings && \
    wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
    wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"

-# Copy artifacts from Builder Stage
-RUN --mount=type=bind,from=builder,source=/workspace/wheels,target=/mount/wheels \
+# Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
+RUN --mount=type=bind,source=wheels,target=/workspace/wheels \
    --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install /mount/wheels/*.whl
+    uv pip install /workspace/wheels/*.whl

 ARG PRE_TRANSFORMERS=0
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
--- a/build-and-copy.sh
+++ b/build-and-copy.sh
@@ -6,23 +6,21 @@ START_TIME=$(date +%s)

 # Default values
 IMAGE_TAG="vllm-node"
-REBUILD_DEPS=false
+REBUILD_FLASHINFER=false
 REBUILD_VLLM=false
 COPY_HOSTS=()
 SSH_USER="$USER"
 NO_BUILD=false
-TRITON_REF="v3.6.0"
 VLLM_REF="main"
 TMP_IMAGE=""
 PARALLEL_COPY=false
-USE_WHEELS_MODE=""
-PRE_FLASHINFER=false
-PRE_TRANSFORMERS=false
 EXP_MXFP4=false
-TRITON_REF_SET=false
 VLLM_REF_SET=false
 VLLM_PRS=""
+PRE_TRANSFORMERS=false
 FULL_LOG=false
+BUILD_JOBS="16"
+GPU_ARCH_LIST="12.1a"

 cleanup() {
    if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
@@ -60,28 +58,23 @@ copy_to_host() {
        return 1
    fi
 }
-BUILD_JOBS="16"
-GPU_ARCH_LIST="12.1a"

 # Help function
 usage() {
    echo "Usage: $0 [OPTIONS]"
    echo "  -t, --tag <tag>               : Image tag (default: 'vllm-node')"
    echo "  --gpu-arch <arch>             : GPU architecture (default: '12.1a')"
-    echo "  --rebuild-deps            : Set cache bust for dependencies"
-    echo "  --rebuild-vllm            : Set cache bust for vllm"
-    echo "  --triton-ref <ref>        : Triton commit SHA, branch or tag (default: 'v3.5.1')"
+    echo "  --rebuild-flashinfer          : Force rebuild of FlashInfer wheels (ignore cached wheels)"
+    echo "  --rebuild-vllm                : Force rebuild of vLLM wheels (ignore cached wheels)"
    echo "  --vllm-ref <ref>              : vLLM commit SHA, branch or tag (default: 'main')"
-    echo "  -c, --copy-to <hosts>     : Host(s) to copy the image to. Accepts comma or space-delimited lists after the flag."
+    echo "  -c, --copy-to <hosts>         : Host(s) to copy the image to. Accepts comma or space-delimited lists."
    echo "      --copy-to-host            : Alias for --copy-to (backwards compatibility)."
    echo "      --copy-parallel           : Copy to all hosts in parallel instead of serially."
-    echo "  -j, --build-jobs <jobs>   : Number of concurrent build jobs (default: \${BUILD_JOBS})"
+    echo "  -j, --build-jobs <jobs>       : Number of concurrent build jobs (default: ${BUILD_JOBS})"
    echo "  -u, --user <user>             : Username for ssh command (default: \$USER)"
-    echo "  --use-wheels [mode]       : Use prebuilt vLLM wheels. Mode can be 'nightly' (default) or 'release'."
-    echo "  --pre-flashinfer          : Use pre-release versions of FlashInfer"
    echo "  --pre-tf, --pre-transformers  : Install transformers 5.0.0rc0 or higher"
    echo "  --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support"
-    echo "  --apply-vllm-pr <pr-num>  : Apply a specific PR patch to vLLM source code. Can be specified multiple times."
+    echo "  --apply-vllm-pr <pr-num>      : Apply a specific PR patch to vLLM source. Can be specified multiple times."
    echo "  --full-log                    : Enable full build logging (--progress=plain)"
    echo "  --no-build                    : Skip building, only copy image (requires --copy-to)"
    echo "  -h, --help                    : Show this help message"
@@ -93,19 +86,16 @@ while [[ "$#" -gt 0 ]]; do
    case $1 in
        -t|--tag) IMAGE_TAG="$2"; shift ;;
        --gpu-arch) GPU_ARCH_LIST="$2"; shift ;;
-        --rebuild-deps) REBUILD_DEPS=true ;;
+        --rebuild-flashinfer) REBUILD_FLASHINFER=true ;;
        --rebuild-vllm) REBUILD_VLLM=true ;;
-        --triton-ref) TRITON_REF="$2"; TRITON_REF_SET=true; shift ;;
        --vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;;
        -c|--copy-to|--copy-to-host|--copy-to-hosts)
            shift
-            # Consume arguments until the next flag or end of args
            while [[ "$#" -gt 0 && "$1" != -* ]]; do
                add_copy_hosts "$1"
                shift
            done

-            # If no hosts specified, use autodiscovery
            if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
                echo "No hosts specified. Using autodiscovery..."
                source "$(dirname "$0")/autodiscover.sh"
@@ -116,7 +106,6 @@ while [[ "$#" -gt 0 ]]; do
                    exit 1
                fi

-                # Use PEER_NODES directly
                if [ ${#PEER_NODES[@]} -gt 0 ]; then
                    COPY_HOSTS=("${PEER_NODES[@]}")
                fi
@@ -132,19 +121,6 @@ while [[ "$#" -gt 0 ]]; do
        -j|--build-jobs) BUILD_JOBS="$2"; shift ;;
        -u|--user) SSH_USER="$2"; shift ;;
        --copy-parallel) PARALLEL_COPY=true ;;
-        --use-wheels)
-            if [[ "$2" != -* && -n "$2" ]]; then
-                if [[ "$2" != "nightly" && "$2" != "release" ]]; then
-                    echo "Error: --use-wheels argument must be 'nightly' or 'release'."
-                    exit 1
-                fi
-                USE_WHEELS_MODE="$2"
-                shift
-            else
-                USE_WHEELS_MODE="nightly"
-            fi
-            ;;
-        --pre-flashinfer) PRE_FLASHINFER=true ;;
        --pre-tf|--pre-transformers) PRE_TRANSFORMERS=true ;;
        --exp-mxfp4|--experimental-mxfp4) EXP_MXFP4=true ;;
        --apply-vllm-pr)
@@ -168,25 +144,16 @@ while [[ "$#" -gt 0 ]]; do
    shift
 done

+# Validate flag combinations
 if [ -n "$VLLM_PRS" ]; then
    if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi
-    if [ -n "$USE_WHEELS_MODE" ]; then echo "Error: --apply-vllm-pr is incompatible with --use-wheels"; exit 1; fi
 fi

 if [ "$EXP_MXFP4" = true ]; then
-    if [ "$TRITON_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --triton-ref"; exit 1; fi
    if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref"; exit 1; fi
-    if [ -n "$USE_WHEELS_MODE" ]; then echo "Error: --exp-mxfp4 is incompatible with --use-wheels"; exit 1; fi
-    if [ "$PRE_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-flashinfer"; exit 1; fi
    if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-transformers"; exit 1; fi
-fi
-
-if [ -n "$USE_WHEELS_MODE" ]; then
-    read -p "!!! Wheels build is known not to work properly with all models after migration to Torch 2.10! Full build is recommended. Do you want to continue (y/N)? " choice
-    case "$choice" in 
-        y|Y ) echo "Continuing...";;
-        * ) echo "Aborting."; exit 1;;
-    esac
+    if [ "$REBUILD_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-flashinfer"; exit 1; fi
+    if [ "$REBUILD_VLLM" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-vllm"; exit 1; fi
 fi

 # Validate --no-build usage
@@ -195,81 +162,147 @@ if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
    exit 1
 fi

-# Build image (unless --no-build is set)
-BUILD_TIME=0
+# Ensure wheels directory exists
+mkdir -p ./wheels
+
+# Common build flags shared by every docker build below (including the --exp-mxfp4 build)
+COMMON_BUILD_FLAGS=()
+if [ "$FULL_LOG" = true ]; then
+    COMMON_BUILD_FLAGS+=("--progress=plain")
+fi
+COMMON_BUILD_FLAGS+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS")
+COMMON_BUILD_FLAGS+=("--build-arg" "TORCH_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
+COMMON_BUILD_FLAGS+=("--build-arg" "FLASHINFER_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
+
+# =====================================================
+# Build image (unless --no-build); --exp-mxfp4 uses a single-step build instead of the phased one
+# =====================================================
+FLASHINFER_BUILD_TIME=0
+VLLM_BUILD_TIME=0
+RUNNER_BUILD_TIME=0
+
 if [ "$NO_BUILD" = false ]; then
-    # Construct build command
-    CMD=("docker" "build" "-t" "$IMAGE_TAG")
-
-    if [ "$FULL_LOG" = true ]; then
-        CMD+=("--progress=plain")
-    fi
-
    if [ "$EXP_MXFP4" = true ]; then
        echo "Building with experimental MXFP4 support..."
-        CMD+=("-f" "Dockerfile.mxfp4")
-    elif [ -n "$USE_WHEELS_MODE" ]; then
-        echo "Using pre-built vLLM wheels (mode: $USE_WHEELS_MODE)"
-        CMD+=("-f" "Dockerfile.wheels")
-        if [ "$USE_WHEELS_MODE" = "release" ]; then
-             CMD+=("--build-arg" "WHEELS_FROM_GITHUB_RELEASE=1")
-        fi
-    else
-        echo "Building vLLM from source"
-    fi
-
-    if [ "$REBUILD_DEPS" = true ]; then
-        echo "Setting CACHEBUST_DEPS..."
-        CMD+=("--build-arg" "CACHEBUST_DEPS=$(date +%s)")
-    fi
-
-    if [ "$REBUILD_VLLM" = true ]; then
-        echo "Setting CACHEBUST_VLLM..."
-        CMD+=("--build-arg" "CACHEBUST_VLLM=$(date +%s)")
-    fi
-
-    # Add TRITON_REF to build arguments
-    CMD+=("--build-arg" "TRITON_REF=$TRITON_REF")
-
-    # Add VLLM_REF to build arguments
-    CMD+=("--build-arg" "VLLM_REF=$VLLM_REF")
-
-    # Add BUILD_JOBS to build arguments
-    CMD+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS")
-
-    # Add GPU architecture to build arguments
-    CMD+=("--build-arg" "TORCH_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
-    CMD+=("--build-arg" "FLASHINFER_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
-
-    if [ "$PRE_FLASHINFER" = true ]; then
-        echo "Using pre-release FlashInfer..."
-        CMD+=("--build-arg" "FLASHINFER_PRE=--pre")
-    fi
-
-    if [ -n "$VLLM_PRS" ]; then
-        echo "Applying vLLM PRs: $VLLM_PRS"
-        CMD+=("--build-arg" "VLLM_PRS=$VLLM_PRS")
-    fi
-
-    if [ "$PRE_TRANSFORMERS" = true ]; then
-        echo "Using transformers>=5.0.0..."
-        CMD+=("--build-arg" "PRE_TRANSFORMERS=1")
-    fi
-
-    # Add build context
-    CMD+=(".")
-
-    # Execute build
+        CMD=("docker" "build" "-t" "$IMAGE_TAG" "${COMMON_BUILD_FLAGS[@]}" "-f" "Dockerfile.mxfp4" ".")
        echo "Building image with command: ${CMD[*]}"
        BUILD_START=$(date +%s)
        "${CMD[@]}"
        BUILD_END=$(date +%s)
-    BUILD_TIME=$((BUILD_END - BUILD_START))
+        RUNNER_BUILD_TIME=$((BUILD_END - BUILD_START))
+    else
+        # ----------------------------------------------------------
+        # Phase 1: FlashInfer wheels
+        # Builds the flashinfer-export stage only when ./wheels/ holds no
+        # flashinfer*.whl or when --rebuild-flashinfer was given.
+        # ----------------------------------------------------------
+        FLASHINFER_WHEELS_EXIST=false
+        if compgen -G "./wheels/flashinfer*.whl" > /dev/null 2>&1; then
+            FLASHINFER_WHEELS_EXIST=true
+        fi
+
+        if [ "$REBUILD_FLASHINFER" = true ] || [ "$FLASHINFER_WHEELS_EXIST" = false ]; then
+            if [ "$REBUILD_FLASHINFER" = true ]; then
+                echo "Rebuilding FlashInfer wheels (--rebuild-flashinfer specified)..."
+            else
+                echo "No FlashInfer wheels found in ./wheels/ — building..."
+            fi
+
+            # Export into a staging directory instead of ./wheels/ directly. The local
+            # exporter does not clear its destination, so a rebuild that produces a
+            # differently named wheel would leave the previous one behind and the runner
+            # stage's `uv pip install /workspace/wheels/*.whl` would be handed two copies
+            # of the same package. Staging also keeps the cached wheels intact when the
+            # build fails.
+            FI_STAGING=$(mktemp -d) || { echo "Error: could not create a staging directory for FlashInfer wheels."; exit 1; }
+
+            FI_CMD=("docker" "build"
+                "--target" "flashinfer-export"
+                "--output" "type=local,dest=$FI_STAGING"
+                "${COMMON_BUILD_FLAGS[@]}")
+
+            # A fresh timestamp changes the build-arg value and so invalidates the
+            # cached layers that follow ARG CACHEBUST_FLASHINFER in the Dockerfile.
+            if [ "$REBUILD_FLASHINFER" = true ]; then
+                FI_CMD+=("--build-arg" "CACHEBUST_FLASHINFER=$(date +%s)")
+            fi
+
+            FI_CMD+=(".")
+
+            echo "FlashInfer build command: ${FI_CMD[*]}"
+            FI_START=$(date +%s)
+            FI_STATUS=0
+            "${FI_CMD[@]}" || FI_STATUS=$?
+            FI_END=$(date +%s)
+            FLASHINFER_BUILD_TIME=$((FI_END - FI_START))
+
+            # Only touch ./wheels/ once the build succeeded and actually produced wheels.
+            if [ "$FI_STATUS" -ne 0 ] || ! compgen -G "$FI_STAGING/*.whl" > /dev/null 2>&1; then
+                rm -rf "$FI_STAGING"
+                echo "Error: FlashInfer wheel build failed — existing wheels in ./wheels/ were left untouched."
+                exit 1
+            fi
+
+            rm -f ./wheels/flashinfer*.whl
+            mv "$FI_STAGING"/*.whl ./wheels/
+            rm -rf "$FI_STAGING"
+        else
+            echo "FlashInfer wheels already present in ./wheels/ — skipping build."
+        fi
+
+        # ----------------------------------------------------------
+        # Phase 2: vLLM wheels
+        # Builds the vllm-export stage only when ./wheels/ holds no vllm*.whl
+        # or when --rebuild-vllm was given.
+        # ----------------------------------------------------------
+        VLLM_WHEELS_EXIST=false
+        if compgen -G "./wheels/vllm*.whl" > /dev/null 2>&1; then
+            VLLM_WHEELS_EXIST=true
+        fi
+
+        if [ "$REBUILD_VLLM" = true ] || [ "$VLLM_WHEELS_EXIST" = false ]; then
+            if [ "$REBUILD_VLLM" = true ]; then
+                echo "Rebuilding vLLM wheels (--rebuild-vllm specified)..."
+            else
+                echo "No vLLM wheels found in ./wheels/ — building..."
+            fi
+
+            # Export into a staging directory instead of ./wheels/ directly. The local
+            # exporter does not clear its destination, so a rebuild at a different ref
+            # would leave the previous vllm wheel behind and the runner stage's
+            # `uv pip install /workspace/wheels/*.whl` would be handed two copies of the
+            # same package. Staging also keeps the cached wheels intact when the build
+            # fails.
+            VLLM_STAGING=$(mktemp -d) || { echo "Error: could not create a staging directory for vLLM wheels."; exit 1; }
+
+            VLLM_CMD=("docker" "build"
+                "--target" "vllm-export"
+                "--output" "type=local,dest=$VLLM_STAGING"
+                "${COMMON_BUILD_FLAGS[@]}"
+                "--build-arg" "VLLM_REF=$VLLM_REF")
+
+            # A fresh timestamp changes the build-arg value and so invalidates the
+            # cached layers that follow ARG CACHEBUST_VLLM in the Dockerfile.
+            if [ "$REBUILD_VLLM" = true ]; then
+                VLLM_CMD+=("--build-arg" "CACHEBUST_VLLM=$(date +%s)")
+            fi
+
+            if [ -n "$VLLM_PRS" ]; then
+                echo "Applying vLLM PRs: $VLLM_PRS"
+                VLLM_CMD+=("--build-arg" "VLLM_PRS=$VLLM_PRS")
+            fi
+
+            if [ "$PRE_TRANSFORMERS" = true ]; then
+                echo "Using transformers>=5.0.0..."
+                VLLM_CMD+=("--build-arg" "PRE_TRANSFORMERS=1")
+            fi
+
+            VLLM_CMD+=(".")
+
+            echo "vLLM build command: ${VLLM_CMD[*]}"
+            VLLM_START=$(date +%s)
+            VLLM_STATUS=0
+            "${VLLM_CMD[@]}" || VLLM_STATUS=$?
+            VLLM_END=$(date +%s)
+            VLLM_BUILD_TIME=$((VLLM_END - VLLM_START))
+
+            # Only touch ./wheels/ once the build succeeded and actually produced wheels.
+            if [ "$VLLM_STATUS" -ne 0 ] || ! compgen -G "$VLLM_STAGING/*.whl" > /dev/null 2>&1; then
+                rm -rf "$VLLM_STAGING"
+                echo "Error: vLLM wheel build failed — existing wheels in ./wheels/ were left untouched."
+                exit 1
+            fi
+
+            rm -f ./wheels/vllm*.whl
+            mv "$VLLM_STAGING"/*.whl ./wheels/
+            rm -rf "$VLLM_STAGING"
+        else
+            echo "vLLM wheels already present in ./wheels/ — skipping build."
+            # The flags below only take effect when the wheel is actually built; say so
+            # instead of silently reusing a wheel built from a different ref or PR set.
+            if [ "$VLLM_REF_SET" = true ] || [ -n "$VLLM_PRS" ] || [ "$PRE_TRANSFORMERS" = true ]; then
+                echo "Warning: --vllm-ref / --apply-vllm-pr / --pre-transformers do not affect the cached vLLM wheel; pass --rebuild-vllm to rebuild it with these settings."
+            fi
+        fi
+
+        # ----------------------------------------------------------
+        # Phase 3: Runner image
+        # Builds and tags the final image once wheels are available in ./wheels/.
+        # ----------------------------------------------------------
+        # Guard: abort before invoking docker when ./wheels/ contains no .whl file at all
+        # (compgen -G succeeds only if the glob matches at least one path).
+        if ! compgen -G "./wheels/*.whl" > /dev/null 2>&1; then
+            echo "Error: No wheel files found in ./wheels/ — cannot build runner image."
+            exit 1
+        fi
+
+        # No --target is passed, so docker builds the Dockerfile's default stage.
+        # NOTE(review): presumably the runner stage is the last stage in the file — confirm.
+        RUNNER_CMD=("docker" "build"
+            "-t" "$IMAGE_TAG"
+            "${COMMON_BUILD_FLAGS[@]}")
+
+        # Forwarded as a build-arg only when --pre-tf/--pre-transformers was given.
+        if [ "$PRE_TRANSFORMERS" = true ]; then
+            RUNNER_CMD+=("--build-arg" "PRE_TRANSFORMERS=1")
+        fi
+
+        # Build context is the current directory.
+        RUNNER_CMD+=(".")
+
+        echo "Building runner image with command: ${RUNNER_CMD[*]}"
+        # Wall-clock duration of the runner build, reported in the timing statistics.
+        RUNNER_START=$(date +%s)
+        "${RUNNER_CMD[@]}"
+        RUNNER_END=$(date +%s)
+        RUNNER_BUILD_TIME=$((RUNNER_END - RUNNER_START))
+    fi
 else
    echo "Skipping build (--no-build specified)"
 fi

-# Copy to host if requested
+# =====================================================
+# Copy to host(s) if requested
+# =====================================================
 COPY_TIME=0
 if [ "${#COPY_HOSTS[@]}" -gt 0 ]; then
    echo "Copying image '$IMAGE_TAG' to ${#COPY_HOSTS[@]} host(s): ${COPY_HOSTS[*]}"
@@ -320,8 +353,14 @@ echo ""
 echo "========================================="
 echo "         TIMING STATISTICS"
 echo "========================================="
-if [ "$BUILD_TIME" -gt 0 ]; then
-    echo "Docker Build:  $(printf '%02d:%02d:%02d' $((BUILD_TIME/3600)) $((BUILD_TIME%3600/60)) $((BUILD_TIME%60)))"
+if [ "$FLASHINFER_BUILD_TIME" -gt 0 ]; then
+    echo "FlashInfer Build: $(printf '%02d:%02d:%02d' $((FLASHINFER_BUILD_TIME/3600)) $((FLASHINFER_BUILD_TIME%3600/60)) $((FLASHINFER_BUILD_TIME%60)))"
+fi
+if [ "$VLLM_BUILD_TIME" -gt 0 ]; then
+    echo "vLLM Build:       $(printf '%02d:%02d:%02d' $((VLLM_BUILD_TIME/3600)) $((VLLM_BUILD_TIME%3600/60)) $((VLLM_BUILD_TIME%60)))"
+fi
+if [ "$RUNNER_BUILD_TIME" -gt 0 ]; then
+    echo "Runner Build:     $(printf '%02d:%02d:%02d' $((RUNNER_BUILD_TIME/3600)) $((RUNNER_BUILD_TIME%3600/60)) $((RUNNER_BUILD_TIME%60)))"
 fi
 if [ "$COPY_TIME" -gt 0 ]; then
    echo "Image Copy:       $(printf '%02d:%02d:%02d' $((COPY_TIME/3600)) $((COPY_TIME%3600/60)) $((COPY_TIME%60)))"
--- a/wheels/.gitkeep
+++ b/wheels/.gitkeep