Initial refactoring to enable separate wheel builds

This commit is contained in:
Eugene Rakhmatulin
2026-02-17 19:15:32 -08:00
parent 5b2313dddb
commit ec0f189256
3 changed files with 204 additions and 200 deletions

View File

@@ -60,36 +60,9 @@ ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
# ========================================================= # =========================================================
# STAGE 2: Builder (Builds Triton, Flashinfer and vLLM from Source) # STAGE 2: FlashInfer Builder
# ========================================================= # =========================================================
FROM base AS builder FROM base AS flashinfer-builder
# # ======= Triton Build ==========
# # Initial Triton repo clone (cached forever)
# RUN git clone https://github.com/triton-lang/triton.git
# # We expect TRITON_REF to be passed from the command line to break the cache
# # Set to v3.6.0 by default
# ARG TRITON_REF=v3.6.0
# WORKDIR $VLLM_BASE_DIR/triton
# # This only runs if TRITON_REF differs from the last build
# RUN --mount=type=cache,id=ccache,target=/root/.ccache \
# --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
# git fetch origin && \
# git checkout ${TRITON_REF} && \
# git submodule sync && \
# git submodule update --init --recursive && \
# uv pip install -r python/requirements.txt && \
# mkdir -p /workspace/wheels && \
# rm -rf .git && \
# uv build --no-build-isolation --wheel --out-dir=/workspace/wheels -v . && \
# uv build --no-build-isolation --wheel --no-index --out-dir=/workspace/wheels python/triton_kernels
# ======= FlashInfer Build ==========
ARG FLASHINFER_CUDA_ARCH_LIST="12.1a" ARG FLASHINFER_CUDA_ARCH_LIST="12.1a"
ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST} ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST}
@@ -98,17 +71,14 @@ ARG FLASHINFER_REF=main
# --- CACHE BUSTER --- # --- CACHE BUSTER ---
# Change this argument to force a re-download of FlashInfer # Change this argument to force a re-download of FlashInfer
ARG CACHEBUST_DEPS=1 ARG CACHEBUST_FLASHINFER=1
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
# 4. Smart Git Clone (Fetch changes instead of full re-clone) # Smart Git Clone (Fetch changes instead of full re-clone)
# We mount a cache at /repo-cache. This directory persists on your host machine.
RUN --mount=type=cache,id=repo-cache,target=/repo-cache \ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
# 1. Go into the persistent cache directory
cd /repo-cache && \ cd /repo-cache && \
# 2. Logic: Clone if missing, otherwise Fetch & Reset
if [ ! -d "flashinfer" ]; then \ if [ ! -d "flashinfer" ]; then \
echo "Cache miss: Cloning FlashInfer from scratch..." && \ echo "Cache miss: Cloning FlashInfer from scratch..." && \
git clone --recursive https://github.com/flashinfer-ai/flashinfer.git; \ git clone --recursive https://github.com/flashinfer-ai/flashinfer.git; \
@@ -124,55 +94,54 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
(git checkout --detach origin/${FLASHINFER_REF} 2>/dev/null || git checkout ${FLASHINFER_REF}) && \ (git checkout --detach origin/${FLASHINFER_REF} 2>/dev/null || git checkout ${FLASHINFER_REF}) && \
git submodule update --init --recursive && \ git submodule update --init --recursive && \
git clean -fdx && \ git clean -fdx && \
# Optimize git repo size
git gc --auto; \ git gc --auto; \
fi && \ fi && \
# 3. Copy the updated code from the cache to the actual container workspace
# We use 'cp -a' to preserve permissions
cp -a /repo-cache/flashinfer /workspace/flashinfer cp -a /repo-cache/flashinfer /workspace/flashinfer
# Build FlashInfer wheels
WORKDIR /workspace/flashinfer WORKDIR /workspace/flashinfer
# Apply patch to avoid re-downloading existing cubins # Apply patch to avoid re-downloading existing cubins
COPY flashinfer_cache.patch . COPY flashinfer_cache.patch .
RUN patch -p1 < flashinfer_cache.patch
# flashinfer-python
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
--mount=type=cache,id=ccache,target=/root/.ccache \ --mount=type=cache,id=ccache,target=/root/.ccache \
--mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \ --mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \
patch -p1 < flashinfer_cache.patch && \
# flashinfer-python
sed -i -e 's/license = "Apache-2.0"/license = { text = "Apache-2.0" }/' -e '/license-files/d' pyproject.toml && \ sed -i -e 's/license = "Apache-2.0"/license = { text = "Apache-2.0" }/' -e '/license-files/d' pyproject.toml && \
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
# flashinfer-cubin
cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
# flashinfer-jit-cache
cd ../flashinfer-jit-cache && \
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
# flashinfer-cubin # =========================================================
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ # STAGE 3: FlashInfer Wheel Export
--mount=type=cache,id=ccache,target=/root/.ccache \ # =========================================================
--mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \ FROM scratch AS flashinfer-export
cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v COPY --from=flashinfer-builder /workspace/wheels /
# =========================================================
# STAGE 4: vLLM Builder
# =========================================================
FROM base AS vllm-builder
ARG TORCH_CUDA_ARCH_LIST="12.1a"
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
WORKDIR $VLLM_BASE_DIR
# flashinfer-jit-cache
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
--mount=type=cache,id=ccache,target=/root/.ccache \ uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
--mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \
cd flashinfer-jit-cache && \
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
# --- VLLM SOURCE CACHE BUSTER --- # --- VLLM SOURCE CACHE BUSTER ---
# Change THIS argument to force a fresh git clone and rebuild of vLLM
# without re-installing the dependencies above.
ARG CACHEBUST_VLLM=1 ARG CACHEBUST_VLLM=1
# Git reference (branch, tag, or SHA) to checkout # Git reference (branch, tag, or SHA) to checkout
ARG VLLM_REF=main ARG VLLM_REF=main
# 4. Smart Git Clone (Fetch changes instead of full re-clone) # Smart Git Clone (Fetch changes instead of full re-clone)
# We mount a cache at /repo-cache. This directory persists on your host machine.
RUN --mount=type=cache,id=repo-cache,target=/repo-cache \ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
# 1. Go into the persistent cache directory
cd /repo-cache && \ cd /repo-cache && \
# 2. Logic: Clone if missing, otherwise Fetch & Reset
if [ ! -d "vllm" ]; then \ if [ ! -d "vllm" ]; then \
echo "Cache miss: Cloning vLLM from scratch..." && \ echo "Cache miss: Cloning vLLM from scratch..." && \
git clone --recursive https://github.com/vllm-project/vllm.git; \ git clone --recursive https://github.com/vllm-project/vllm.git; \
@@ -188,11 +157,8 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
(git checkout --detach origin/${VLLM_REF} 2>/dev/null || git checkout ${VLLM_REF}) && \ (git checkout --detach origin/${VLLM_REF} 2>/dev/null || git checkout ${VLLM_REF}) && \
git submodule update --init --recursive && \ git submodule update --init --recursive && \
git clean -fdx && \ git clean -fdx && \
# Optimize git repo size
git gc --auto; \ git gc --auto; \
fi && \ fi && \
# 3. Copy the updated code from the cache to the actual container workspace
# We use 'cp -a' to preserve permissions
cp -a /repo-cache/vllm $VLLM_BASE_DIR/ cp -a /repo-cache/vllm $VLLM_BASE_DIR/
WORKDIR $VLLM_BASE_DIR/vllm WORKDIR $VLLM_BASE_DIR/vllm
@@ -231,19 +197,18 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
# fi # fi
# Final Compilation # Final Compilation
# We mount the ccache directory here. Ideally, map this to a host volume for persistence
# across totally separate `docker build` invocations.
RUN --mount=type=cache,id=ccache,target=/root/.ccache \ RUN --mount=type=cache,id=ccache,target=/root/.ccache \
--mount=type=cache,id=uv-cache,target=/root/.cache/uv \ --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
# # Install custom Triton from triton-builder # =========================================================
# COPY --from=triton-builder /workspace/wheels /workspace/wheels # STAGE 5: vLLM Wheel Export
# RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ # =========================================================
# uv pip install /workspace/wheels/*.whl FROM scratch AS vllm-export
COPY --from=vllm-builder /workspace/wheels /
# ========================================================= # =========================================================
# STAGE 4: Runner (Transfers only necessary artifacts) # STAGE 6: Runner (Installs wheels from host ./wheels/)
# ========================================================= # =========================================================
FROM nvcr.io/nvidia/pytorch:26.01-py3 AS runner FROM nvcr.io/nvidia/pytorch:26.01-py3 AS runner
@@ -282,10 +247,10 @@ RUN mkdir -p tiktoken_encodings && \
wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \ wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
# Copy artifacts from Builder Stage # Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
RUN --mount=type=bind,from=builder,source=/workspace/wheels,target=/mount/wheels \ RUN --mount=type=bind,source=wheels,target=/workspace/wheels \
--mount=type=cache,id=uv-cache,target=/root/.cache/uv \ --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv pip install /mount/wheels/*.whl uv pip install /workspace/wheels/*.whl
ARG PRE_TRANSFORMERS=0 ARG PRE_TRANSFORMERS=0
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \

View File

@@ -6,23 +6,21 @@ START_TIME=$(date +%s)
# Default values # Default values
IMAGE_TAG="vllm-node" IMAGE_TAG="vllm-node"
REBUILD_DEPS=false REBUILD_FLASHINFER=false
REBUILD_VLLM=false REBUILD_VLLM=false
COPY_HOSTS=() COPY_HOSTS=()
SSH_USER="$USER" SSH_USER="$USER"
NO_BUILD=false NO_BUILD=false
TRITON_REF="v3.6.0"
VLLM_REF="main" VLLM_REF="main"
TMP_IMAGE="" TMP_IMAGE=""
PARALLEL_COPY=false PARALLEL_COPY=false
USE_WHEELS_MODE=""
PRE_FLASHINFER=false
PRE_TRANSFORMERS=false
EXP_MXFP4=false EXP_MXFP4=false
TRITON_REF_SET=false
VLLM_REF_SET=false VLLM_REF_SET=false
VLLM_PRS="" VLLM_PRS=""
PRE_TRANSFORMERS=false
FULL_LOG=false FULL_LOG=false
BUILD_JOBS="16"
GPU_ARCH_LIST="12.1a"
cleanup() { cleanup() {
if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
@@ -60,31 +58,26 @@ copy_to_host() {
return 1 return 1
fi fi
} }
BUILD_JOBS="16"
GPU_ARCH_LIST="12.1a"
# Help function # Help function
usage() { usage() {
echo "Usage: $0 [OPTIONS]" echo "Usage: $0 [OPTIONS]"
echo " -t, --tag <tag> : Image tag (default: 'vllm-node')" echo " -t, --tag <tag> : Image tag (default: 'vllm-node')"
echo " --gpu-arch <arch> : GPU architecture (default: '12.1a')" echo " --gpu-arch <arch> : GPU architecture (default: '12.1a')"
echo " --rebuild-deps : Set cache bust for dependencies" echo " --rebuild-flashinfer : Force rebuild of FlashInfer wheels (ignore cached wheels)"
echo " --rebuild-vllm : Set cache bust for vllm" echo " --rebuild-vllm : Force rebuild of vLLM wheels (ignore cached wheels)"
echo " --triton-ref <ref> : Triton commit SHA, branch or tag (default: 'v3.5.1')" echo " --vllm-ref <ref> : vLLM commit SHA, branch or tag (default: 'main')"
echo " --vllm-ref <ref> : vLLM commit SHA, branch or tag (default: 'main')" echo " -c, --copy-to <hosts> : Host(s) to copy the image to. Accepts comma or space-delimited lists."
echo " -c, --copy-to <hosts> : Host(s) to copy the image to. Accepts comma or space-delimited lists after the flag." echo " --copy-to-host : Alias for --copy-to (backwards compatibility)."
echo " --copy-to-host : Alias for --copy-to (backwards compatibility)." echo " --copy-parallel : Copy to all hosts in parallel instead of serially."
echo " --copy-parallel : Copy to all hosts in parallel instead of serially." echo " -j, --build-jobs <jobs> : Number of concurrent build jobs (default: ${BUILD_JOBS})"
echo " -j, --build-jobs <jobs> : Number of concurrent build jobs (default: \${BUILD_JOBS})" echo " -u, --user <user> : Username for ssh command (default: \$USER)"
echo " -u, --user <user> : Username for ssh command (default: \$USER)" echo " --pre-tf, --pre-transformers : Install transformers 5.0.0rc0 or higher"
echo " --use-wheels [mode] : Use prebuilt vLLM wheels. Mode can be 'nightly' (default) or 'release'."
echo " --pre-flashinfer : Use pre-release versions of FlashInfer"
echo " --pre-tf, --pre-transformers : Install transformers 5.0.0rc0 or higher"
echo " --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support" echo " --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support"
echo " --apply-vllm-pr <pr-num> : Apply a specific PR patch to vLLM source code. Can be specified multiple times." echo " --apply-vllm-pr <pr-num> : Apply a specific PR patch to vLLM source. Can be specified multiple times."
echo " --full-log : Enable full build logging (--progress=plain)" echo " --full-log : Enable full build logging (--progress=plain)"
echo " --no-build : Skip building, only copy image (requires --copy-to)" echo " --no-build : Skip building, only copy image (requires --copy-to)"
echo " -h, --help : Show this help message" echo " -h, --help : Show this help message"
exit 1 exit 1
} }
@@ -93,19 +86,16 @@ while [[ "$#" -gt 0 ]]; do
case $1 in case $1 in
-t|--tag) IMAGE_TAG="$2"; shift ;; -t|--tag) IMAGE_TAG="$2"; shift ;;
--gpu-arch) GPU_ARCH_LIST="$2"; shift ;; --gpu-arch) GPU_ARCH_LIST="$2"; shift ;;
--rebuild-deps) REBUILD_DEPS=true ;; --rebuild-flashinfer) REBUILD_FLASHINFER=true ;;
--rebuild-vllm) REBUILD_VLLM=true ;; --rebuild-vllm) REBUILD_VLLM=true ;;
--triton-ref) TRITON_REF="$2"; TRITON_REF_SET=true; shift ;;
--vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;; --vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;;
-c|--copy-to|--copy-to-host|--copy-to-hosts) -c|--copy-to|--copy-to-host|--copy-to-hosts)
shift shift
# Consume arguments until the next flag or end of args
while [[ "$#" -gt 0 && "$1" != -* ]]; do while [[ "$#" -gt 0 && "$1" != -* ]]; do
add_copy_hosts "$1" add_copy_hosts "$1"
shift shift
done done
# If no hosts specified, use autodiscovery
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "No hosts specified. Using autodiscovery..." echo "No hosts specified. Using autodiscovery..."
source "$(dirname "$0")/autodiscover.sh" source "$(dirname "$0")/autodiscover.sh"
@@ -116,7 +106,6 @@ while [[ "$#" -gt 0 ]]; do
exit 1 exit 1
fi fi
# Use PEER_NODES directly
if [ ${#PEER_NODES[@]} -gt 0 ]; then if [ ${#PEER_NODES[@]} -gt 0 ]; then
COPY_HOSTS=("${PEER_NODES[@]}") COPY_HOSTS=("${PEER_NODES[@]}")
fi fi
@@ -132,19 +121,6 @@ while [[ "$#" -gt 0 ]]; do
-j|--build-jobs) BUILD_JOBS="$2"; shift ;; -j|--build-jobs) BUILD_JOBS="$2"; shift ;;
-u|--user) SSH_USER="$2"; shift ;; -u|--user) SSH_USER="$2"; shift ;;
--copy-parallel) PARALLEL_COPY=true ;; --copy-parallel) PARALLEL_COPY=true ;;
--use-wheels)
if [[ "$2" != -* && -n "$2" ]]; then
if [[ "$2" != "nightly" && "$2" != "release" ]]; then
echo "Error: --use-wheels argument must be 'nightly' or 'release'."
exit 1
fi
USE_WHEELS_MODE="$2"
shift
else
USE_WHEELS_MODE="nightly"
fi
;;
--pre-flashinfer) PRE_FLASHINFER=true ;;
--pre-tf|--pre-transformers) PRE_TRANSFORMERS=true ;; --pre-tf|--pre-transformers) PRE_TRANSFORMERS=true ;;
--exp-mxfp4|--experimental-mxfp4) EXP_MXFP4=true ;; --exp-mxfp4|--experimental-mxfp4) EXP_MXFP4=true ;;
--apply-vllm-pr) --apply-vllm-pr)
@@ -168,25 +144,16 @@ while [[ "$#" -gt 0 ]]; do
shift shift
done done
# Validate flag combinations
if [ -n "$VLLM_PRS" ]; then if [ -n "$VLLM_PRS" ]; then
if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi
if [ -n "$USE_WHEELS_MODE" ]; then echo "Error: --apply-vllm-pr is incompatible with --use-wheels"; exit 1; fi
fi fi
if [ "$EXP_MXFP4" = true ]; then if [ "$EXP_MXFP4" = true ]; then
if [ "$TRITON_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --triton-ref"; exit 1; fi
if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref"; exit 1; fi if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref"; exit 1; fi
if [ -n "$USE_WHEELS_MODE" ]; then echo "Error: --exp-mxfp4 is incompatible with --use-wheels"; exit 1; fi
if [ "$PRE_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-flashinfer"; exit 1; fi
if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-transformers"; exit 1; fi if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-transformers"; exit 1; fi
fi if [ "$REBUILD_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-flashinfer"; exit 1; fi
if [ "$REBUILD_VLLM" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-vllm"; exit 1; fi
if [ -n "$USE_WHEELS_MODE" ]; then
read -p "!!! Wheels build is known not to work properly with all models after migration to Torch 2.10! Full build is recommended. Do you want to continue (y/N)? " choice
case "$choice" in
y|Y ) echo "Continuing...";;
* ) echo "Aborting."; exit 1;;
esac
fi fi
# Validate --no-build usage # Validate --no-build usage
@@ -195,81 +162,147 @@ if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
exit 1 exit 1
fi fi
# Build image (unless --no-build is set) # Ensure wheels directory exists
BUILD_TIME=0 mkdir -p ./wheels
# Common build flags used across all non-mxfp4 sub-builds
COMMON_BUILD_FLAGS=()
if [ "$FULL_LOG" = true ]; then
COMMON_BUILD_FLAGS+=("--progress=plain")
fi
COMMON_BUILD_FLAGS+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS")
COMMON_BUILD_FLAGS+=("--build-arg" "TORCH_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
COMMON_BUILD_FLAGS+=("--build-arg" "FLASHINFER_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
# =====================================================
# Build image (unless --no-build or --exp-mxfp4)
# =====================================================
FLASHINFER_BUILD_TIME=0
VLLM_BUILD_TIME=0
RUNNER_BUILD_TIME=0
if [ "$NO_BUILD" = false ]; then if [ "$NO_BUILD" = false ]; then
# Construct build command
CMD=("docker" "build" "-t" "$IMAGE_TAG")
if [ "$FULL_LOG" = true ]; then
CMD+=("--progress=plain")
fi
if [ "$EXP_MXFP4" = true ]; then if [ "$EXP_MXFP4" = true ]; then
echo "Building with experimental MXFP4 support..." echo "Building with experimental MXFP4 support..."
CMD+=("-f" "Dockerfile.mxfp4") CMD=("docker" "build" "-t" "$IMAGE_TAG" "${COMMON_BUILD_FLAGS[@]}" "-f" "Dockerfile.mxfp4" ".")
elif [ -n "$USE_WHEELS_MODE" ]; then echo "Building image with command: ${CMD[*]}"
echo "Using pre-built vLLM wheels (mode: $USE_WHEELS_MODE)" BUILD_START=$(date +%s)
CMD+=("-f" "Dockerfile.wheels") "${CMD[@]}"
if [ "$USE_WHEELS_MODE" = "release" ]; then BUILD_END=$(date +%s)
CMD+=("--build-arg" "WHEELS_FROM_GITHUB_RELEASE=1") RUNNER_BUILD_TIME=$((BUILD_END - BUILD_START))
fi
else else
echo "Building vLLM from source" # ----------------------------------------------------------
# Phase 1: FlashInfer wheels
# ----------------------------------------------------------
FLASHINFER_WHEELS_EXIST=false
if compgen -G "./wheels/flashinfer*.whl" > /dev/null 2>&1; then
FLASHINFER_WHEELS_EXIST=true
fi
if [ "$REBUILD_FLASHINFER" = true ] || [ "$FLASHINFER_WHEELS_EXIST" = false ]; then
if [ "$REBUILD_FLASHINFER" = true ]; then
echo "Rebuilding FlashInfer wheels (--rebuild-flashinfer specified)..."
else
echo "No FlashInfer wheels found in ./wheels/ — building..."
fi
FI_CMD=("docker" "build"
"--target" "flashinfer-export"
"--output" "type=local,dest=./wheels"
"${COMMON_BUILD_FLAGS[@]}")
if [ "$REBUILD_FLASHINFER" = true ]; then
FI_CMD+=("--build-arg" "CACHEBUST_FLASHINFER=$(date +%s)")
fi
FI_CMD+=(".")
echo "FlashInfer build command: ${FI_CMD[*]}"
FI_START=$(date +%s)
"${FI_CMD[@]}"
FI_END=$(date +%s)
FLASHINFER_BUILD_TIME=$((FI_END - FI_START))
else
echo "FlashInfer wheels already present in ./wheels/ — skipping build."
fi
# ----------------------------------------------------------
# Phase 2: vLLM wheels
# ----------------------------------------------------------
VLLM_WHEELS_EXIST=false
if compgen -G "./wheels/vllm*.whl" > /dev/null 2>&1; then
VLLM_WHEELS_EXIST=true
fi
if [ "$REBUILD_VLLM" = true ] || [ "$VLLM_WHEELS_EXIST" = false ]; then
if [ "$REBUILD_VLLM" = true ]; then
echo "Rebuilding vLLM wheels (--rebuild-vllm specified)..."
else
echo "No vLLM wheels found in ./wheels/ — building..."
fi
VLLM_CMD=("docker" "build"
"--target" "vllm-export"
"--output" "type=local,dest=./wheels"
"${COMMON_BUILD_FLAGS[@]}"
"--build-arg" "VLLM_REF=$VLLM_REF")
if [ "$REBUILD_VLLM" = true ]; then
VLLM_CMD+=("--build-arg" "CACHEBUST_VLLM=$(date +%s)")
fi
if [ -n "$VLLM_PRS" ]; then
echo "Applying vLLM PRs: $VLLM_PRS"
VLLM_CMD+=("--build-arg" "VLLM_PRS=$VLLM_PRS")
fi
if [ "$PRE_TRANSFORMERS" = true ]; then
echo "Using transformers>=5.0.0..."
VLLM_CMD+=("--build-arg" "PRE_TRANSFORMERS=1")
fi
VLLM_CMD+=(".")
echo "vLLM build command: ${VLLM_CMD[*]}"
VLLM_START=$(date +%s)
"${VLLM_CMD[@]}"
VLLM_END=$(date +%s)
VLLM_BUILD_TIME=$((VLLM_END - VLLM_START))
else
echo "vLLM wheels already present in ./wheels/ — skipping build."
fi
# ----------------------------------------------------------
# Phase 3: Runner image
# ----------------------------------------------------------
if ! compgen -G "./wheels/*.whl" > /dev/null 2>&1; then
echo "Error: No wheel files found in ./wheels/ — cannot build runner image."
exit 1
fi
RUNNER_CMD=("docker" "build"
"-t" "$IMAGE_TAG"
"${COMMON_BUILD_FLAGS[@]}")
if [ "$PRE_TRANSFORMERS" = true ]; then
RUNNER_CMD+=("--build-arg" "PRE_TRANSFORMERS=1")
fi
RUNNER_CMD+=(".")
echo "Building runner image with command: ${RUNNER_CMD[*]}"
RUNNER_START=$(date +%s)
"${RUNNER_CMD[@]}"
RUNNER_END=$(date +%s)
RUNNER_BUILD_TIME=$((RUNNER_END - RUNNER_START))
fi fi
if [ "$REBUILD_DEPS" = true ]; then
echo "Setting CACHEBUST_DEPS..."
CMD+=("--build-arg" "CACHEBUST_DEPS=$(date +%s)")
fi
if [ "$REBUILD_VLLM" = true ]; then
echo "Setting CACHEBUST_VLLM..."
CMD+=("--build-arg" "CACHEBUST_VLLM=$(date +%s)")
fi
# Add TRITON_REF to build arguments
CMD+=("--build-arg" "TRITON_REF=$TRITON_REF")
# Add VLLM_REF to build arguments
CMD+=("--build-arg" "VLLM_REF=$VLLM_REF")
# Add BUILD_JOBS to build arguments
CMD+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS")
# Add GPU architecture to build arguments
CMD+=("--build-arg" "TORCH_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
CMD+=("--build-arg" "FLASHINFER_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
if [ "$PRE_FLASHINFER" = true ]; then
echo "Using pre-release FlashInfer..."
CMD+=("--build-arg" "FLASHINFER_PRE=--pre")
fi
if [ -n "$VLLM_PRS" ]; then
echo "Applying vLLM PRs: $VLLM_PRS"
CMD+=("--build-arg" "VLLM_PRS=$VLLM_PRS")
fi
if [ "$PRE_TRANSFORMERS" = true ]; then
echo "Using transformers>=5.0.0..."
CMD+=("--build-arg" "PRE_TRANSFORMERS=1")
fi
# Add build context
CMD+=(".")
# Execute build
echo "Building image with command: ${CMD[*]}"
BUILD_START=$(date +%s)
"${CMD[@]}"
BUILD_END=$(date +%s)
BUILD_TIME=$((BUILD_END - BUILD_START))
else else
echo "Skipping build (--no-build specified)" echo "Skipping build (--no-build specified)"
fi fi
# Copy to host if requested # =====================================================
# Copy to host(s) if requested
# =====================================================
COPY_TIME=0 COPY_TIME=0
if [ "${#COPY_HOSTS[@]}" -gt 0 ]; then if [ "${#COPY_HOSTS[@]}" -gt 0 ]; then
echo "Copying image '$IMAGE_TAG' to ${#COPY_HOSTS[@]} host(s): ${COPY_HOSTS[*]}" echo "Copying image '$IMAGE_TAG' to ${#COPY_HOSTS[@]} host(s): ${COPY_HOSTS[*]}"
@@ -320,12 +353,18 @@ echo ""
echo "=========================================" echo "========================================="
echo " TIMING STATISTICS" echo " TIMING STATISTICS"
echo "=========================================" echo "========================================="
if [ "$BUILD_TIME" -gt 0 ]; then if [ "$FLASHINFER_BUILD_TIME" -gt 0 ]; then
echo "Docker Build: $(printf '%02d:%02d:%02d' $((BUILD_TIME/3600)) $((BUILD_TIME%3600/60)) $((BUILD_TIME%60)))" echo "FlashInfer Build: $(printf '%02d:%02d:%02d' $((FLASHINFER_BUILD_TIME/3600)) $((FLASHINFER_BUILD_TIME%3600/60)) $((FLASHINFER_BUILD_TIME%60)))"
fi
if [ "$VLLM_BUILD_TIME" -gt 0 ]; then
echo "vLLM Build: $(printf '%02d:%02d:%02d' $((VLLM_BUILD_TIME/3600)) $((VLLM_BUILD_TIME%3600/60)) $((VLLM_BUILD_TIME%60)))"
fi
if [ "$RUNNER_BUILD_TIME" -gt 0 ]; then
echo "Runner Build: $(printf '%02d:%02d:%02d' $((RUNNER_BUILD_TIME/3600)) $((RUNNER_BUILD_TIME%3600/60)) $((RUNNER_BUILD_TIME%60)))"
fi fi
if [ "$COPY_TIME" -gt 0 ]; then if [ "$COPY_TIME" -gt 0 ]; then
echo "Image Copy: $(printf '%02d:%02d:%02d' $((COPY_TIME/3600)) $((COPY_TIME%3600/60)) $((COPY_TIME%60)))" echo "Image Copy: $(printf '%02d:%02d:%02d' $((COPY_TIME/3600)) $((COPY_TIME%3600/60)) $((COPY_TIME%60)))"
fi fi
echo "Total Time: $(printf '%02d:%02d:%02d' $((TOTAL_TIME/3600)) $((TOTAL_TIME%3600/60)) $((TOTAL_TIME%60)))" echo "Total Time: $(printf '%02d:%02d:%02d' $((TOTAL_TIME/3600)) $((TOTAL_TIME%3600/60)) $((TOTAL_TIME%60)))"
echo "=========================================" echo "========================================="
echo "Done building $IMAGE_TAG." echo "Done building $IMAGE_TAG."

wheels/.gitkeep — new empty file (0 additions)
View File