Initial refactoring to enable separate wheel builds

This commit is contained in:
Eugene Rakhmatulin
2026-02-17 19:15:32 -08:00
parent 5b2313dddb
commit ec0f189256
3 changed files with 204 additions and 200 deletions

View File

@@ -60,36 +60,9 @@ ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
# ========================================================= # =========================================================
# STAGE 2: Builder (Builds Triton, Flashinfer and vLLM from Source) # STAGE 2: FlashInfer Builder
# ========================================================= # =========================================================
FROM base AS builder FROM base AS flashinfer-builder
# # ======= Triton Build ==========
# # Initial Triton repo clone (cached forever)
# RUN git clone https://github.com/triton-lang/triton.git
# # We expect TRITON_REF to be passed from the command line to break the cache
# # Set to v3.6.0 by default
# ARG TRITON_REF=v3.6.0
# WORKDIR $VLLM_BASE_DIR/triton
# # This only runs if TRITON_REF differs from the last build
# RUN --mount=type=cache,id=ccache,target=/root/.ccache \
# --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
# git fetch origin && \
# git checkout ${TRITON_REF} && \
# git submodule sync && \
# git submodule update --init --recursive && \
# uv pip install -r python/requirements.txt && \
# mkdir -p /workspace/wheels && \
# rm -rf .git && \
# uv build --no-build-isolation --wheel --out-dir=/workspace/wheels -v . && \
# uv build --no-build-isolation --wheel --no-index --out-dir=/workspace/wheels python/triton_kernels
# ======= FlashInfer Build ==========
ARG FLASHINFER_CUDA_ARCH_LIST="12.1a" ARG FLASHINFER_CUDA_ARCH_LIST="12.1a"
ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST} ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST}
@@ -98,17 +71,14 @@ ARG FLASHINFER_REF=main
# --- CACHE BUSTER --- # --- CACHE BUSTER ---
# Change this argument to force a re-download of FlashInfer # Change this argument to force a re-download of FlashInfer
ARG CACHEBUST_DEPS=1 ARG CACHEBUST_FLASHINFER=1
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
# 4. Smart Git Clone (Fetch changes instead of full re-clone) # Smart Git Clone (Fetch changes instead of full re-clone)
# We mount a cache at /repo-cache. This directory persists on your host machine.
RUN --mount=type=cache,id=repo-cache,target=/repo-cache \ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
# 1. Go into the persistent cache directory
cd /repo-cache && \ cd /repo-cache && \
# 2. Logic: Clone if missing, otherwise Fetch & Reset
if [ ! -d "flashinfer" ]; then \ if [ ! -d "flashinfer" ]; then \
echo "Cache miss: Cloning FlashInfer from scratch..." && \ echo "Cache miss: Cloning FlashInfer from scratch..." && \
git clone --recursive https://github.com/flashinfer-ai/flashinfer.git; \ git clone --recursive https://github.com/flashinfer-ai/flashinfer.git; \
@@ -124,55 +94,54 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
(git checkout --detach origin/${FLASHINFER_REF} 2>/dev/null || git checkout ${FLASHINFER_REF}) && \ (git checkout --detach origin/${FLASHINFER_REF} 2>/dev/null || git checkout ${FLASHINFER_REF}) && \
git submodule update --init --recursive && \ git submodule update --init --recursive && \
git clean -fdx && \ git clean -fdx && \
# Optimize git repo size
git gc --auto; \ git gc --auto; \
fi && \ fi && \
# 3. Copy the updated code from the cache to the actual container workspace
# We use 'cp -a' to preserve permissions
cp -a /repo-cache/flashinfer /workspace/flashinfer cp -a /repo-cache/flashinfer /workspace/flashinfer
# Build FlashInfer wheels
WORKDIR /workspace/flashinfer WORKDIR /workspace/flashinfer
# Apply patch to avoid re-downloading existing cubins # Apply patch to avoid re-downloading existing cubins
COPY flashinfer_cache.patch . COPY flashinfer_cache.patch .
RUN patch -p1 < flashinfer_cache.patch
# flashinfer-python
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
--mount=type=cache,id=ccache,target=/root/.ccache \ --mount=type=cache,id=ccache,target=/root/.ccache \
--mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \ --mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \
patch -p1 < flashinfer_cache.patch && \
# flashinfer-python
sed -i -e 's/license = "Apache-2.0"/license = { text = "Apache-2.0" }/' -e '/license-files/d' pyproject.toml && \ sed -i -e 's/license = "Apache-2.0"/license = { text = "Apache-2.0" }/' -e '/license-files/d' pyproject.toml && \
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
# flashinfer-cubin
cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
# flashinfer-jit-cache
cd ../flashinfer-jit-cache && \
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
# flashinfer-cubin # =========================================================
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ # STAGE 3: FlashInfer Wheel Export
--mount=type=cache,id=ccache,target=/root/.ccache \ # =========================================================
--mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \ FROM scratch AS flashinfer-export
cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v COPY --from=flashinfer-builder /workspace/wheels /
# =========================================================
# STAGE 4: vLLM Builder
# =========================================================
FROM base AS vllm-builder
ARG TORCH_CUDA_ARCH_LIST="12.1a"
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
WORKDIR $VLLM_BASE_DIR
# flashinfer-jit-cache
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
--mount=type=cache,id=ccache,target=/root/.ccache \ uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
--mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \
cd flashinfer-jit-cache && \
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
# --- VLLM SOURCE CACHE BUSTER --- # --- VLLM SOURCE CACHE BUSTER ---
# Change THIS argument to force a fresh git clone and rebuild of vLLM
# without re-installing the dependencies above.
ARG CACHEBUST_VLLM=1 ARG CACHEBUST_VLLM=1
# Git reference (branch, tag, or SHA) to checkout # Git reference (branch, tag, or SHA) to checkout
ARG VLLM_REF=main ARG VLLM_REF=main
# 4. Smart Git Clone (Fetch changes instead of full re-clone) # Smart Git Clone (Fetch changes instead of full re-clone)
# We mount a cache at /repo-cache. This directory persists on your host machine.
RUN --mount=type=cache,id=repo-cache,target=/repo-cache \ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
# 1. Go into the persistent cache directory
cd /repo-cache && \ cd /repo-cache && \
# 2. Logic: Clone if missing, otherwise Fetch & Reset
if [ ! -d "vllm" ]; then \ if [ ! -d "vllm" ]; then \
echo "Cache miss: Cloning vLLM from scratch..." && \ echo "Cache miss: Cloning vLLM from scratch..." && \
git clone --recursive https://github.com/vllm-project/vllm.git; \ git clone --recursive https://github.com/vllm-project/vllm.git; \
@@ -188,11 +157,8 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
(git checkout --detach origin/${VLLM_REF} 2>/dev/null || git checkout ${VLLM_REF}) && \ (git checkout --detach origin/${VLLM_REF} 2>/dev/null || git checkout ${VLLM_REF}) && \
git submodule update --init --recursive && \ git submodule update --init --recursive && \
git clean -fdx && \ git clean -fdx && \
# Optimize git repo size
git gc --auto; \ git gc --auto; \
fi && \ fi && \
# 3. Copy the updated code from the cache to the actual container workspace
# We use 'cp -a' to preserve permissions
cp -a /repo-cache/vllm $VLLM_BASE_DIR/ cp -a /repo-cache/vllm $VLLM_BASE_DIR/
WORKDIR $VLLM_BASE_DIR/vllm WORKDIR $VLLM_BASE_DIR/vllm
@@ -231,19 +197,18 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
# fi # fi
# Final Compilation # Final Compilation
# We mount the ccache directory here. Ideally, map this to a host volume for persistence
# across totally separate `docker build` invocations.
RUN --mount=type=cache,id=ccache,target=/root/.ccache \ RUN --mount=type=cache,id=ccache,target=/root/.ccache \
--mount=type=cache,id=uv-cache,target=/root/.cache/uv \ --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
# # Install custom Triton from triton-builder # =========================================================
# COPY --from=triton-builder /workspace/wheels /workspace/wheels # STAGE 5: vLLM Wheel Export
# RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ # =========================================================
# uv pip install /workspace/wheels/*.whl FROM scratch AS vllm-export
COPY --from=vllm-builder /workspace/wheels /
# ========================================================= # =========================================================
# STAGE 4: Runner (Transfers only necessary artifacts) # STAGE 6: Runner (Installs wheels from host ./wheels/)
# ========================================================= # =========================================================
FROM nvcr.io/nvidia/pytorch:26.01-py3 AS runner FROM nvcr.io/nvidia/pytorch:26.01-py3 AS runner
@@ -282,10 +247,10 @@ RUN mkdir -p tiktoken_encodings && \
wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \ wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
# Copy artifacts from Builder Stage # Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
RUN --mount=type=bind,from=builder,source=/workspace/wheels,target=/mount/wheels \ RUN --mount=type=bind,source=wheels,target=/workspace/wheels \
--mount=type=cache,id=uv-cache,target=/root/.cache/uv \ --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv pip install /mount/wheels/*.whl uv pip install /workspace/wheels/*.whl
ARG PRE_TRANSFORMERS=0 ARG PRE_TRANSFORMERS=0
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \

View File

@@ -6,23 +6,21 @@ START_TIME=$(date +%s)
# Default values # Default values
IMAGE_TAG="vllm-node" IMAGE_TAG="vllm-node"
REBUILD_DEPS=false REBUILD_FLASHINFER=false
REBUILD_VLLM=false REBUILD_VLLM=false
COPY_HOSTS=() COPY_HOSTS=()
SSH_USER="$USER" SSH_USER="$USER"
NO_BUILD=false NO_BUILD=false
TRITON_REF="v3.6.0"
VLLM_REF="main" VLLM_REF="main"
TMP_IMAGE="" TMP_IMAGE=""
PARALLEL_COPY=false PARALLEL_COPY=false
USE_WHEELS_MODE=""
PRE_FLASHINFER=false
PRE_TRANSFORMERS=false
EXP_MXFP4=false EXP_MXFP4=false
TRITON_REF_SET=false
VLLM_REF_SET=false VLLM_REF_SET=false
VLLM_PRS="" VLLM_PRS=""
PRE_TRANSFORMERS=false
FULL_LOG=false FULL_LOG=false
BUILD_JOBS="16"
GPU_ARCH_LIST="12.1a"
cleanup() { cleanup() {
if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
@@ -60,31 +58,26 @@ copy_to_host() {
return 1 return 1
fi fi
} }
BUILD_JOBS="16"
GPU_ARCH_LIST="12.1a"
# Help function # Help function
usage() { usage() {
echo "Usage: $0 [OPTIONS]" echo "Usage: $0 [OPTIONS]"
echo " -t, --tag <tag> : Image tag (default: 'vllm-node')" echo " -t, --tag <tag> : Image tag (default: 'vllm-node')"
echo " --gpu-arch <arch> : GPU architecture (default: '12.1a')" echo " --gpu-arch <arch> : GPU architecture (default: '12.1a')"
echo " --rebuild-deps : Set cache bust for dependencies" echo " --rebuild-flashinfer : Force rebuild of FlashInfer wheels (ignore cached wheels)"
echo " --rebuild-vllm : Set cache bust for vllm" echo " --rebuild-vllm : Force rebuild of vLLM wheels (ignore cached wheels)"
echo " --triton-ref <ref> : Triton commit SHA, branch or tag (default: 'v3.5.1')" echo " --vllm-ref <ref> : vLLM commit SHA, branch or tag (default: 'main')"
echo " --vllm-ref <ref> : vLLM commit SHA, branch or tag (default: 'main')" echo " -c, --copy-to <hosts> : Host(s) to copy the image to. Accepts comma or space-delimited lists."
echo " -c, --copy-to <hosts> : Host(s) to copy the image to. Accepts comma or space-delimited lists after the flag." echo " --copy-to-host : Alias for --copy-to (backwards compatibility)."
echo " --copy-to-host : Alias for --copy-to (backwards compatibility)." echo " --copy-parallel : Copy to all hosts in parallel instead of serially."
echo " --copy-parallel : Copy to all hosts in parallel instead of serially." echo " -j, --build-jobs <jobs> : Number of concurrent build jobs (default: ${BUILD_JOBS})"
echo " -j, --build-jobs <jobs> : Number of concurrent build jobs (default: \${BUILD_JOBS})" echo " -u, --user <user> : Username for ssh command (default: \$USER)"
echo " -u, --user <user> : Username for ssh command (default: \$USER)" echo " --pre-tf, --pre-transformers : Install transformers 5.0.0rc0 or higher"
echo " --use-wheels [mode] : Use prebuilt vLLM wheels. Mode can be 'nightly' (default) or 'release'."
echo " --pre-flashinfer : Use pre-release versions of FlashInfer"
echo " --pre-tf, --pre-transformers : Install transformers 5.0.0rc0 or higher"
echo " --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support" echo " --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support"
echo " --apply-vllm-pr <pr-num> : Apply a specific PR patch to vLLM source code. Can be specified multiple times." echo " --apply-vllm-pr <pr-num> : Apply a specific PR patch to vLLM source. Can be specified multiple times."
echo " --full-log : Enable full build logging (--progress=plain)" echo " --full-log : Enable full build logging (--progress=plain)"
echo " --no-build : Skip building, only copy image (requires --copy-to)" echo " --no-build : Skip building, only copy image (requires --copy-to)"
echo " -h, --help : Show this help message" echo " -h, --help : Show this help message"
exit 1 exit 1
} }
@@ -93,19 +86,16 @@ while [[ "$#" -gt 0 ]]; do
case $1 in case $1 in
-t|--tag) IMAGE_TAG="$2"; shift ;; -t|--tag) IMAGE_TAG="$2"; shift ;;
--gpu-arch) GPU_ARCH_LIST="$2"; shift ;; --gpu-arch) GPU_ARCH_LIST="$2"; shift ;;
--rebuild-deps) REBUILD_DEPS=true ;; --rebuild-flashinfer) REBUILD_FLASHINFER=true ;;
--rebuild-vllm) REBUILD_VLLM=true ;; --rebuild-vllm) REBUILD_VLLM=true ;;
--triton-ref) TRITON_REF="$2"; TRITON_REF_SET=true; shift ;;
--vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;; --vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;;
-c|--copy-to|--copy-to-host|--copy-to-hosts) -c|--copy-to|--copy-to-host|--copy-to-hosts)
shift shift
# Consume arguments until the next flag or end of args
while [[ "$#" -gt 0 && "$1" != -* ]]; do while [[ "$#" -gt 0 && "$1" != -* ]]; do
add_copy_hosts "$1" add_copy_hosts "$1"
shift shift
done done
# If no hosts specified, use autodiscovery
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "No hosts specified. Using autodiscovery..." echo "No hosts specified. Using autodiscovery..."
source "$(dirname "$0")/autodiscover.sh" source "$(dirname "$0")/autodiscover.sh"
@@ -116,7 +106,6 @@ while [[ "$#" -gt 0 ]]; do
exit 1 exit 1
fi fi
# Use PEER_NODES directly
if [ ${#PEER_NODES[@]} -gt 0 ]; then if [ ${#PEER_NODES[@]} -gt 0 ]; then
COPY_HOSTS=("${PEER_NODES[@]}") COPY_HOSTS=("${PEER_NODES[@]}")
fi fi
@@ -132,19 +121,6 @@ while [[ "$#" -gt 0 ]]; do
-j|--build-jobs) BUILD_JOBS="$2"; shift ;; -j|--build-jobs) BUILD_JOBS="$2"; shift ;;
-u|--user) SSH_USER="$2"; shift ;; -u|--user) SSH_USER="$2"; shift ;;
--copy-parallel) PARALLEL_COPY=true ;; --copy-parallel) PARALLEL_COPY=true ;;
--use-wheels)
if [[ "$2" != -* && -n "$2" ]]; then
if [[ "$2" != "nightly" && "$2" != "release" ]]; then
echo "Error: --use-wheels argument must be 'nightly' or 'release'."
exit 1
fi
USE_WHEELS_MODE="$2"
shift
else
USE_WHEELS_MODE="nightly"
fi
;;
--pre-flashinfer) PRE_FLASHINFER=true ;;
--pre-tf|--pre-transformers) PRE_TRANSFORMERS=true ;; --pre-tf|--pre-transformers) PRE_TRANSFORMERS=true ;;
--exp-mxfp4|--experimental-mxfp4) EXP_MXFP4=true ;; --exp-mxfp4|--experimental-mxfp4) EXP_MXFP4=true ;;
--apply-vllm-pr) --apply-vllm-pr)
@@ -168,25 +144,16 @@ while [[ "$#" -gt 0 ]]; do
shift shift
done done
# Validate flag combinations
if [ -n "$VLLM_PRS" ]; then if [ -n "$VLLM_PRS" ]; then
if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi
if [ -n "$USE_WHEELS_MODE" ]; then echo "Error: --apply-vllm-pr is incompatible with --use-wheels"; exit 1; fi
fi fi
if [ "$EXP_MXFP4" = true ]; then if [ "$EXP_MXFP4" = true ]; then
if [ "$TRITON_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --triton-ref"; exit 1; fi
if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref"; exit 1; fi if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref"; exit 1; fi
if [ -n "$USE_WHEELS_MODE" ]; then echo "Error: --exp-mxfp4 is incompatible with --use-wheels"; exit 1; fi
if [ "$PRE_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-flashinfer"; exit 1; fi
if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-transformers"; exit 1; fi if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-transformers"; exit 1; fi
fi if [ "$REBUILD_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-flashinfer"; exit 1; fi
if [ "$REBUILD_VLLM" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-vllm"; exit 1; fi
if [ -n "$USE_WHEELS_MODE" ]; then
read -p "!!! Wheels build is known not to work properly with all models after migration to Torch 2.10! Full build is recommended. Do you want to continue (y/N)? " choice
case "$choice" in
y|Y ) echo "Continuing...";;
* ) echo "Aborting."; exit 1;;
esac
fi fi
# Validate --no-build usage # Validate --no-build usage
@@ -195,81 +162,147 @@ if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
exit 1 exit 1
fi fi
# Build image (unless --no-build is set) # Ensure wheels directory exists
BUILD_TIME=0 mkdir -p ./wheels
# Common build flags used across all non-mxfp4 sub-builds
COMMON_BUILD_FLAGS=()
if [ "$FULL_LOG" = true ]; then
COMMON_BUILD_FLAGS+=("--progress=plain")
fi
COMMON_BUILD_FLAGS+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS")
COMMON_BUILD_FLAGS+=("--build-arg" "TORCH_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
COMMON_BUILD_FLAGS+=("--build-arg" "FLASHINFER_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
# =====================================================
# Build image (unless --no-build or --exp-mxfp4)
# =====================================================
FLASHINFER_BUILD_TIME=0
VLLM_BUILD_TIME=0
RUNNER_BUILD_TIME=0
if [ "$NO_BUILD" = false ]; then if [ "$NO_BUILD" = false ]; then
# Construct build command
CMD=("docker" "build" "-t" "$IMAGE_TAG")
if [ "$FULL_LOG" = true ]; then
CMD+=("--progress=plain")
fi
if [ "$EXP_MXFP4" = true ]; then if [ "$EXP_MXFP4" = true ]; then
echo "Building with experimental MXFP4 support..." echo "Building with experimental MXFP4 support..."
CMD+=("-f" "Dockerfile.mxfp4") CMD=("docker" "build" "-t" "$IMAGE_TAG" "${COMMON_BUILD_FLAGS[@]}" "-f" "Dockerfile.mxfp4" ".")
elif [ -n "$USE_WHEELS_MODE" ]; then echo "Building image with command: ${CMD[*]}"
echo "Using pre-built vLLM wheels (mode: $USE_WHEELS_MODE)" BUILD_START=$(date +%s)
CMD+=("-f" "Dockerfile.wheels") "${CMD[@]}"
if [ "$USE_WHEELS_MODE" = "release" ]; then BUILD_END=$(date +%s)
CMD+=("--build-arg" "WHEELS_FROM_GITHUB_RELEASE=1") RUNNER_BUILD_TIME=$((BUILD_END - BUILD_START))
fi
else else
echo "Building vLLM from source" # ----------------------------------------------------------
# Phase 1: FlashInfer wheels
# ----------------------------------------------------------
FLASHINFER_WHEELS_EXIST=false
if compgen -G "./wheels/flashinfer*.whl" > /dev/null 2>&1; then
FLASHINFER_WHEELS_EXIST=true
fi
if [ "$REBUILD_FLASHINFER" = true ] || [ "$FLASHINFER_WHEELS_EXIST" = false ]; then
if [ "$REBUILD_FLASHINFER" = true ]; then
echo "Rebuilding FlashInfer wheels (--rebuild-flashinfer specified)..."
else
echo "No FlashInfer wheels found in ./wheels/ — building..."
fi
FI_CMD=("docker" "build"
"--target" "flashinfer-export"
"--output" "type=local,dest=./wheels"
"${COMMON_BUILD_FLAGS[@]}")
if [ "$REBUILD_FLASHINFER" = true ]; then
FI_CMD+=("--build-arg" "CACHEBUST_FLASHINFER=$(date +%s)")
fi
FI_CMD+=(".")
echo "FlashInfer build command: ${FI_CMD[*]}"
FI_START=$(date +%s)
"${FI_CMD[@]}"
FI_END=$(date +%s)
FLASHINFER_BUILD_TIME=$((FI_END - FI_START))
else
echo "FlashInfer wheels already present in ./wheels/ — skipping build."
fi
# ----------------------------------------------------------
# Phase 2: vLLM wheels
# ----------------------------------------------------------
VLLM_WHEELS_EXIST=false
if compgen -G "./wheels/vllm*.whl" > /dev/null 2>&1; then
VLLM_WHEELS_EXIST=true
fi
if [ "$REBUILD_VLLM" = true ] || [ "$VLLM_WHEELS_EXIST" = false ]; then
if [ "$REBUILD_VLLM" = true ]; then
echo "Rebuilding vLLM wheels (--rebuild-vllm specified)..."
else
echo "No vLLM wheels found in ./wheels/ — building..."
fi
VLLM_CMD=("docker" "build"
"--target" "vllm-export"
"--output" "type=local,dest=./wheels"
"${COMMON_BUILD_FLAGS[@]}"
"--build-arg" "VLLM_REF=$VLLM_REF")
if [ "$REBUILD_VLLM" = true ]; then
VLLM_CMD+=("--build-arg" "CACHEBUST_VLLM=$(date +%s)")
fi
if [ -n "$VLLM_PRS" ]; then
echo "Applying vLLM PRs: $VLLM_PRS"
VLLM_CMD+=("--build-arg" "VLLM_PRS=$VLLM_PRS")
fi
if [ "$PRE_TRANSFORMERS" = true ]; then
echo "Using transformers>=5.0.0..."
VLLM_CMD+=("--build-arg" "PRE_TRANSFORMERS=1")
fi
VLLM_CMD+=(".")
echo "vLLM build command: ${VLLM_CMD[*]}"
VLLM_START=$(date +%s)
"${VLLM_CMD[@]}"
VLLM_END=$(date +%s)
VLLM_BUILD_TIME=$((VLLM_END - VLLM_START))
else
echo "vLLM wheels already present in ./wheels/ — skipping build."
fi
# ----------------------------------------------------------
# Phase 3: Runner image
# ----------------------------------------------------------
if ! compgen -G "./wheels/*.whl" > /dev/null 2>&1; then
echo "Error: No wheel files found in ./wheels/ — cannot build runner image."
exit 1
fi
RUNNER_CMD=("docker" "build"
"-t" "$IMAGE_TAG"
"${COMMON_BUILD_FLAGS[@]}")
if [ "$PRE_TRANSFORMERS" = true ]; then
RUNNER_CMD+=("--build-arg" "PRE_TRANSFORMERS=1")
fi
RUNNER_CMD+=(".")
echo "Building runner image with command: ${RUNNER_CMD[*]}"
RUNNER_START=$(date +%s)
"${RUNNER_CMD[@]}"
RUNNER_END=$(date +%s)
RUNNER_BUILD_TIME=$((RUNNER_END - RUNNER_START))
fi fi
if [ "$REBUILD_DEPS" = true ]; then
echo "Setting CACHEBUST_DEPS..."
CMD+=("--build-arg" "CACHEBUST_DEPS=$(date +%s)")
fi
if [ "$REBUILD_VLLM" = true ]; then
echo "Setting CACHEBUST_VLLM..."
CMD+=("--build-arg" "CACHEBUST_VLLM=$(date +%s)")
fi
# Add TRITON_REF to build arguments
CMD+=("--build-arg" "TRITON_REF=$TRITON_REF")
# Add VLLM_REF to build arguments
CMD+=("--build-arg" "VLLM_REF=$VLLM_REF")
# Add BUILD_JOBS to build arguments
CMD+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS")
# Add GPU architecture to build arguments
CMD+=("--build-arg" "TORCH_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
CMD+=("--build-arg" "FLASHINFER_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
if [ "$PRE_FLASHINFER" = true ]; then
echo "Using pre-release FlashInfer..."
CMD+=("--build-arg" "FLASHINFER_PRE=--pre")
fi
if [ -n "$VLLM_PRS" ]; then
echo "Applying vLLM PRs: $VLLM_PRS"
CMD+=("--build-arg" "VLLM_PRS=$VLLM_PRS")
fi
if [ "$PRE_TRANSFORMERS" = true ]; then
echo "Using transformers>=5.0.0..."
CMD+=("--build-arg" "PRE_TRANSFORMERS=1")
fi
# Add build context
CMD+=(".")
# Execute build
echo "Building image with command: ${CMD[*]}"
BUILD_START=$(date +%s)
"${CMD[@]}"
BUILD_END=$(date +%s)
BUILD_TIME=$((BUILD_END - BUILD_START))
else else
echo "Skipping build (--no-build specified)" echo "Skipping build (--no-build specified)"
fi fi
# Copy to host if requested # =====================================================
# Copy to host(s) if requested
# =====================================================
COPY_TIME=0 COPY_TIME=0
if [ "${#COPY_HOSTS[@]}" -gt 0 ]; then if [ "${#COPY_HOSTS[@]}" -gt 0 ]; then
echo "Copying image '$IMAGE_TAG' to ${#COPY_HOSTS[@]} host(s): ${COPY_HOSTS[*]}" echo "Copying image '$IMAGE_TAG' to ${#COPY_HOSTS[@]} host(s): ${COPY_HOSTS[*]}"
@@ -320,12 +353,18 @@ echo ""
echo "=========================================" echo "========================================="
echo " TIMING STATISTICS" echo " TIMING STATISTICS"
echo "=========================================" echo "========================================="
if [ "$BUILD_TIME" -gt 0 ]; then if [ "$FLASHINFER_BUILD_TIME" -gt 0 ]; then
echo "Docker Build: $(printf '%02d:%02d:%02d' $((BUILD_TIME/3600)) $((BUILD_TIME%3600/60)) $((BUILD_TIME%60)))" echo "FlashInfer Build: $(printf '%02d:%02d:%02d' $((FLASHINFER_BUILD_TIME/3600)) $((FLASHINFER_BUILD_TIME%3600/60)) $((FLASHINFER_BUILD_TIME%60)))"
fi
if [ "$VLLM_BUILD_TIME" -gt 0 ]; then
echo "vLLM Build: $(printf '%02d:%02d:%02d' $((VLLM_BUILD_TIME/3600)) $((VLLM_BUILD_TIME%3600/60)) $((VLLM_BUILD_TIME%60)))"
fi
if [ "$RUNNER_BUILD_TIME" -gt 0 ]; then
echo "Runner Build: $(printf '%02d:%02d:%02d' $((RUNNER_BUILD_TIME/3600)) $((RUNNER_BUILD_TIME%3600/60)) $((RUNNER_BUILD_TIME%60)))"
fi fi
if [ "$COPY_TIME" -gt 0 ]; then if [ "$COPY_TIME" -gt 0 ]; then
echo "Image Copy: $(printf '%02d:%02d:%02d' $((COPY_TIME/3600)) $((COPY_TIME%3600/60)) $((COPY_TIME%60)))" echo "Image Copy: $(printf '%02d:%02d:%02d' $((COPY_TIME/3600)) $((COPY_TIME%3600/60)) $((COPY_TIME%60)))"
fi fi
echo "Total Time: $(printf '%02d:%02d:%02d' $((TOTAL_TIME/3600)) $((TOTAL_TIME%3600/60)) $((TOTAL_TIME%60)))" echo "Total Time: $(printf '%02d:%02d:%02d' $((TOTAL_TIME/3600)) $((TOTAL_TIME%3600/60)) $((TOTAL_TIME%60)))"
echo "=========================================" echo "========================================="
echo "Done building $IMAGE_TAG." echo "Done building $IMAGE_TAG."

wheels/.gitkeep — new empty file (0 additions)
View File