Initial refactoring to enable separate wheel builds
This commit is contained in:
107
Dockerfile
107
Dockerfile
@@ -60,36 +60,9 @@ ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
|
||||
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
||||
|
||||
# =========================================================
|
||||
# STAGE 2: Builder (Builds Triton, Flashinfer and vLLM from Source)
|
||||
# STAGE 2: FlashInfer Builder
|
||||
# =========================================================
|
||||
FROM base AS builder
|
||||
|
||||
|
||||
# # ======= Triton Build ==========
|
||||
|
||||
# # Initial Triton repo clone (cached forever)
|
||||
# RUN git clone https://github.com/triton-lang/triton.git
|
||||
|
||||
# # We expect TRITON_REF to be passed from the command line to break the cache
|
||||
# # Set to v3.6.0 by default
|
||||
# ARG TRITON_REF=v3.6.0
|
||||
|
||||
# WORKDIR $VLLM_BASE_DIR/triton
|
||||
|
||||
# # This only runs if TRITON_REF differs from the last build
|
||||
# RUN --mount=type=cache,id=ccache,target=/root/.ccache \
|
||||
# --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||
# git fetch origin && \
|
||||
# git checkout ${TRITON_REF} && \
|
||||
# git submodule sync && \
|
||||
# git submodule update --init --recursive && \
|
||||
# uv pip install -r python/requirements.txt && \
|
||||
# mkdir -p /workspace/wheels && \
|
||||
# rm -rf .git && \
|
||||
# uv build --no-build-isolation --wheel --out-dir=/workspace/wheels -v . && \
|
||||
# uv build --no-build-isolation --wheel --no-index --out-dir=/workspace/wheels python/triton_kernels
|
||||
|
||||
# ======= FlashInfer Build ==========
|
||||
FROM base AS flashinfer-builder
|
||||
|
||||
ARG FLASHINFER_CUDA_ARCH_LIST="12.1a"
|
||||
ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST}
|
||||
@@ -98,17 +71,14 @@ ARG FLASHINFER_REF=main
|
||||
|
||||
# --- CACHE BUSTER ---
|
||||
# Change this argument to force a re-download of FlashInfer
|
||||
ARG CACHEBUST_DEPS=1
|
||||
ARG CACHEBUST_FLASHINFER=1
|
||||
|
||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||
uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
|
||||
|
||||
# 4. Smart Git Clone (Fetch changes instead of full re-clone)
|
||||
# We mount a cache at /repo-cache. This directory persists on your host machine.
|
||||
# Smart Git Clone (Fetch changes instead of full re-clone)
|
||||
RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
|
||||
# 1. Go into the persistent cache directory
|
||||
cd /repo-cache && \
|
||||
# 2. Logic: Clone if missing, otherwise Fetch & Reset
|
||||
if [ ! -d "flashinfer" ]; then \
|
||||
echo "Cache miss: Cloning FlashInfer from scratch..." && \
|
||||
git clone --recursive https://github.com/flashinfer-ai/flashinfer.git; \
|
||||
@@ -124,55 +94,54 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
|
||||
(git checkout --detach origin/${FLASHINFER_REF} 2>/dev/null || git checkout ${FLASHINFER_REF}) && \
|
||||
git submodule update --init --recursive && \
|
||||
git clean -fdx && \
|
||||
# Optimize git repo size
|
||||
git gc --auto; \
|
||||
fi && \
|
||||
# 3. Copy the updated code from the cache to the actual container workspace
|
||||
# We use 'cp -a' to preserve permissions
|
||||
cp -a /repo-cache/flashinfer /workspace/flashinfer
|
||||
|
||||
# Build FlashInfer wheels
|
||||
|
||||
WORKDIR /workspace/flashinfer
|
||||
|
||||
# Apply patch to avoid re-downloading existing cubins
|
||||
COPY flashinfer_cache.patch .
|
||||
RUN patch -p1 < flashinfer_cache.patch
|
||||
|
||||
# flashinfer-python
|
||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||
--mount=type=cache,id=ccache,target=/root/.ccache \
|
||||
--mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \
|
||||
patch -p1 < flashinfer_cache.patch && \
|
||||
# flashinfer-python
|
||||
sed -i -e 's/license = "Apache-2.0"/license = { text = "Apache-2.0" }/' -e '/license-files/d' pyproject.toml && \
|
||||
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
|
||||
# flashinfer-cubin
|
||||
cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
|
||||
# flashinfer-jit-cache
|
||||
cd ../flashinfer-jit-cache && \
|
||||
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
|
||||
|
||||
# flashinfer-cubin
|
||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||
--mount=type=cache,id=ccache,target=/root/.ccache \
|
||||
--mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \
|
||||
cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
|
||||
# =========================================================
|
||||
# STAGE 3: FlashInfer Wheel Export
|
||||
# =========================================================
|
||||
FROM scratch AS flashinfer-export
|
||||
COPY --from=flashinfer-builder /workspace/wheels /
|
||||
|
||||
# =========================================================
|
||||
# STAGE 4: vLLM Builder
|
||||
# =========================================================
|
||||
FROM base AS vllm-builder
|
||||
|
||||
ARG TORCH_CUDA_ARCH_LIST="12.1a"
|
||||
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
|
||||
WORKDIR $VLLM_BASE_DIR
|
||||
|
||||
# flashinfer-jit-cache
|
||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||
--mount=type=cache,id=ccache,target=/root/.ccache \
|
||||
--mount=type=cache,id=cubins-cache,target=/workspace/flashinfer/flashinfer-cubin/flashinfer_cubin/cubins \
|
||||
cd flashinfer-jit-cache && \
|
||||
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
|
||||
uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
|
||||
|
||||
# --- VLLM SOURCE CACHE BUSTER ---
|
||||
# Change THIS argument to force a fresh git clone and rebuild of vLLM
|
||||
# without re-installing the dependencies above.
|
||||
ARG CACHEBUST_VLLM=1
|
||||
|
||||
# Git reference (branch, tag, or SHA) to checkout
|
||||
ARG VLLM_REF=main
|
||||
|
||||
# 4. Smart Git Clone (Fetch changes instead of full re-clone)
|
||||
# We mount a cache at /repo-cache. This directory persists on your host machine.
|
||||
# Smart Git Clone (Fetch changes instead of full re-clone)
|
||||
RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
|
||||
# 1. Go into the persistent cache directory
|
||||
cd /repo-cache && \
|
||||
# 2. Logic: Clone if missing, otherwise Fetch & Reset
|
||||
if [ ! -d "vllm" ]; then \
|
||||
echo "Cache miss: Cloning vLLM from scratch..." && \
|
||||
git clone --recursive https://github.com/vllm-project/vllm.git; \
|
||||
@@ -188,11 +157,8 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
|
||||
(git checkout --detach origin/${VLLM_REF} 2>/dev/null || git checkout ${VLLM_REF}) && \
|
||||
git submodule update --init --recursive && \
|
||||
git clean -fdx && \
|
||||
# Optimize git repo size
|
||||
git gc --auto; \
|
||||
fi && \
|
||||
# 3. Copy the updated code from the cache to the actual container workspace
|
||||
# We use 'cp -a' to preserve permissions
|
||||
cp -a /repo-cache/vllm $VLLM_BASE_DIR/
|
||||
|
||||
WORKDIR $VLLM_BASE_DIR/vllm
|
||||
@@ -231,19 +197,18 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||
# fi
|
||||
|
||||
# Final Compilation
|
||||
# We mount the ccache directory here. Ideally, map this to a host volume for persistence
|
||||
# across totally separate `docker build` invocations.
|
||||
RUN --mount=type=cache,id=ccache,target=/root/.ccache \
|
||||
--mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
|
||||
|
||||
# # Install custom Triton from triton-builder
|
||||
# COPY --from=triton-builder /workspace/wheels /workspace/wheels
|
||||
# RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||
# uv pip install /workspace/wheels/*.whl
|
||||
# =========================================================
|
||||
# STAGE 5: vLLM Wheel Export
|
||||
# =========================================================
|
||||
FROM scratch AS vllm-export
|
||||
COPY --from=vllm-builder /workspace/wheels /
|
||||
|
||||
# =========================================================
|
||||
# STAGE 4: Runner (Transfers only necessary artifacts)
|
||||
# STAGE 6: Runner (Installs wheels from host ./wheels/)
|
||||
# =========================================================
|
||||
FROM nvcr.io/nvidia/pytorch:26.01-py3 AS runner
|
||||
|
||||
@@ -282,10 +247,10 @@ RUN mkdir -p tiktoken_encodings && \
|
||||
wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
|
||||
wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
|
||||
|
||||
# Copy artifacts from Builder Stage
|
||||
RUN --mount=type=bind,from=builder,source=/workspace/wheels,target=/mount/wheels \
|
||||
# Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
|
||||
RUN --mount=type=bind,source=wheels,target=/workspace/wheels \
|
||||
--mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||
uv pip install /mount/wheels/*.whl
|
||||
uv pip install /workspace/wheels/*.whl
|
||||
|
||||
ARG PRE_TRANSFORMERS=0
|
||||
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||
|
||||
@@ -6,23 +6,21 @@ START_TIME=$(date +%s)
|
||||
|
||||
# Default values
|
||||
IMAGE_TAG="vllm-node"
|
||||
REBUILD_DEPS=false
|
||||
REBUILD_FLASHINFER=false
|
||||
REBUILD_VLLM=false
|
||||
COPY_HOSTS=()
|
||||
SSH_USER="$USER"
|
||||
NO_BUILD=false
|
||||
TRITON_REF="v3.6.0"
|
||||
VLLM_REF="main"
|
||||
TMP_IMAGE=""
|
||||
PARALLEL_COPY=false
|
||||
USE_WHEELS_MODE=""
|
||||
PRE_FLASHINFER=false
|
||||
PRE_TRANSFORMERS=false
|
||||
EXP_MXFP4=false
|
||||
TRITON_REF_SET=false
|
||||
VLLM_REF_SET=false
|
||||
VLLM_PRS=""
|
||||
PRE_TRANSFORMERS=false
|
||||
FULL_LOG=false
|
||||
BUILD_JOBS="16"
|
||||
GPU_ARCH_LIST="12.1a"
|
||||
|
||||
cleanup() {
|
||||
if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
|
||||
@@ -60,28 +58,23 @@ copy_to_host() {
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
BUILD_JOBS="16"
|
||||
GPU_ARCH_LIST="12.1a"
|
||||
|
||||
# Help function
|
||||
usage() {
|
||||
echo "Usage: $0 [OPTIONS]"
|
||||
echo " -t, --tag <tag> : Image tag (default: 'vllm-node')"
|
||||
echo " --gpu-arch <arch> : GPU architecture (default: '12.1a')"
|
||||
echo " --rebuild-deps : Set cache bust for dependencies"
|
||||
echo " --rebuild-vllm : Set cache bust for vllm"
|
||||
echo " --triton-ref <ref> : Triton commit SHA, branch or tag (default: 'v3.5.1')"
|
||||
echo " --rebuild-flashinfer : Force rebuild of FlashInfer wheels (ignore cached wheels)"
|
||||
echo " --rebuild-vllm : Force rebuild of vLLM wheels (ignore cached wheels)"
|
||||
echo " --vllm-ref <ref> : vLLM commit SHA, branch or tag (default: 'main')"
|
||||
echo " -c, --copy-to <hosts> : Host(s) to copy the image to. Accepts comma or space-delimited lists after the flag."
|
||||
echo " -c, --copy-to <hosts> : Host(s) to copy the image to. Accepts comma or space-delimited lists."
|
||||
echo " --copy-to-host : Alias for --copy-to (backwards compatibility)."
|
||||
echo " --copy-parallel : Copy to all hosts in parallel instead of serially."
|
||||
echo " -j, --build-jobs <jobs> : Number of concurrent build jobs (default: \${BUILD_JOBS})"
|
||||
echo " -j, --build-jobs <jobs> : Number of concurrent build jobs (default: ${BUILD_JOBS})"
|
||||
echo " -u, --user <user> : Username for ssh command (default: \$USER)"
|
||||
echo " --use-wheels [mode] : Use prebuilt vLLM wheels. Mode can be 'nightly' (default) or 'release'."
|
||||
echo " --pre-flashinfer : Use pre-release versions of FlashInfer"
|
||||
echo " --pre-tf, --pre-transformers : Install transformers 5.0.0rc0 or higher"
|
||||
echo " --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support"
|
||||
echo " --apply-vllm-pr <pr-num> : Apply a specific PR patch to vLLM source code. Can be specified multiple times."
|
||||
echo " --apply-vllm-pr <pr-num> : Apply a specific PR patch to vLLM source. Can be specified multiple times."
|
||||
echo " --full-log : Enable full build logging (--progress=plain)"
|
||||
echo " --no-build : Skip building, only copy image (requires --copy-to)"
|
||||
echo " -h, --help : Show this help message"
|
||||
@@ -93,19 +86,16 @@ while [[ "$#" -gt 0 ]]; do
|
||||
case $1 in
|
||||
-t|--tag) IMAGE_TAG="$2"; shift ;;
|
||||
--gpu-arch) GPU_ARCH_LIST="$2"; shift ;;
|
||||
--rebuild-deps) REBUILD_DEPS=true ;;
|
||||
--rebuild-flashinfer) REBUILD_FLASHINFER=true ;;
|
||||
--rebuild-vllm) REBUILD_VLLM=true ;;
|
||||
--triton-ref) TRITON_REF="$2"; TRITON_REF_SET=true; shift ;;
|
||||
--vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;;
|
||||
-c|--copy-to|--copy-to-host|--copy-to-hosts)
|
||||
shift
|
||||
# Consume arguments until the next flag or end of args
|
||||
while [[ "$#" -gt 0 && "$1" != -* ]]; do
|
||||
add_copy_hosts "$1"
|
||||
shift
|
||||
done
|
||||
|
||||
# If no hosts specified, use autodiscovery
|
||||
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
|
||||
echo "No hosts specified. Using autodiscovery..."
|
||||
source "$(dirname "$0")/autodiscover.sh"
|
||||
@@ -116,7 +106,6 @@ while [[ "$#" -gt 0 ]]; do
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Use PEER_NODES directly
|
||||
if [ ${#PEER_NODES[@]} -gt 0 ]; then
|
||||
COPY_HOSTS=("${PEER_NODES[@]}")
|
||||
fi
|
||||
@@ -132,19 +121,6 @@ while [[ "$#" -gt 0 ]]; do
|
||||
-j|--build-jobs) BUILD_JOBS="$2"; shift ;;
|
||||
-u|--user) SSH_USER="$2"; shift ;;
|
||||
--copy-parallel) PARALLEL_COPY=true ;;
|
||||
--use-wheels)
|
||||
if [[ "$2" != -* && -n "$2" ]]; then
|
||||
if [[ "$2" != "nightly" && "$2" != "release" ]]; then
|
||||
echo "Error: --use-wheels argument must be 'nightly' or 'release'."
|
||||
exit 1
|
||||
fi
|
||||
USE_WHEELS_MODE="$2"
|
||||
shift
|
||||
else
|
||||
USE_WHEELS_MODE="nightly"
|
||||
fi
|
||||
;;
|
||||
--pre-flashinfer) PRE_FLASHINFER=true ;;
|
||||
--pre-tf|--pre-transformers) PRE_TRANSFORMERS=true ;;
|
||||
--exp-mxfp4|--experimental-mxfp4) EXP_MXFP4=true ;;
|
||||
--apply-vllm-pr)
|
||||
@@ -168,25 +144,16 @@ while [[ "$#" -gt 0 ]]; do
|
||||
shift
|
||||
done
|
||||
|
||||
# Validate flag combinations
|
||||
if [ -n "$VLLM_PRS" ]; then
|
||||
if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi
|
||||
if [ -n "$USE_WHEELS_MODE" ]; then echo "Error: --apply-vllm-pr is incompatible with --use-wheels"; exit 1; fi
|
||||
fi
|
||||
|
||||
if [ "$EXP_MXFP4" = true ]; then
|
||||
if [ "$TRITON_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --triton-ref"; exit 1; fi
|
||||
if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref"; exit 1; fi
|
||||
if [ -n "$USE_WHEELS_MODE" ]; then echo "Error: --exp-mxfp4 is incompatible with --use-wheels"; exit 1; fi
|
||||
if [ "$PRE_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-flashinfer"; exit 1; fi
|
||||
if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --pre-transformers"; exit 1; fi
|
||||
fi
|
||||
|
||||
if [ -n "$USE_WHEELS_MODE" ]; then
|
||||
read -p "!!! Wheels build is known not to work properly with all models after migration to Torch 2.10! Full build is recommended. Do you want to continue (y/N)? " choice
|
||||
case "$choice" in
|
||||
y|Y ) echo "Continuing...";;
|
||||
* ) echo "Aborting."; exit 1;;
|
||||
esac
|
||||
if [ "$REBUILD_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-flashinfer"; exit 1; fi
|
||||
if [ "$REBUILD_VLLM" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-vllm"; exit 1; fi
|
||||
fi
|
||||
|
||||
# Validate --no-build usage
|
||||
@@ -195,81 +162,147 @@ if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Build image (unless --no-build is set)
|
||||
BUILD_TIME=0
|
||||
# Ensure wheels directory exists
|
||||
mkdir -p ./wheels
|
||||
|
||||
# Common build flags shared by every docker build invocation below (including the mxfp4 build)
|
||||
COMMON_BUILD_FLAGS=()
|
||||
if [ "$FULL_LOG" = true ]; then
|
||||
COMMON_BUILD_FLAGS+=("--progress=plain")
|
||||
fi
|
||||
COMMON_BUILD_FLAGS+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS")
|
||||
COMMON_BUILD_FLAGS+=("--build-arg" "TORCH_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
|
||||
COMMON_BUILD_FLAGS+=("--build-arg" "FLASHINFER_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
|
||||
|
||||
# =====================================================
|
||||
# Build image (skipped with --no-build; --exp-mxfp4 builds directly from Dockerfile.mxfp4)
|
||||
# =====================================================
|
||||
FLASHINFER_BUILD_TIME=0
|
||||
VLLM_BUILD_TIME=0
|
||||
RUNNER_BUILD_TIME=0
|
||||
|
||||
if [ "$NO_BUILD" = false ]; then
|
||||
# Construct build command
|
||||
CMD=("docker" "build" "-t" "$IMAGE_TAG")
|
||||
|
||||
if [ "$FULL_LOG" = true ]; then
|
||||
CMD+=("--progress=plain")
|
||||
fi
|
||||
|
||||
if [ "$EXP_MXFP4" = true ]; then
|
||||
echo "Building with experimental MXFP4 support..."
|
||||
CMD+=("-f" "Dockerfile.mxfp4")
|
||||
elif [ -n "$USE_WHEELS_MODE" ]; then
|
||||
echo "Using pre-built vLLM wheels (mode: $USE_WHEELS_MODE)"
|
||||
CMD+=("-f" "Dockerfile.wheels")
|
||||
if [ "$USE_WHEELS_MODE" = "release" ]; then
|
||||
CMD+=("--build-arg" "WHEELS_FROM_GITHUB_RELEASE=1")
|
||||
fi
|
||||
else
|
||||
echo "Building vLLM from source"
|
||||
fi
|
||||
|
||||
if [ "$REBUILD_DEPS" = true ]; then
|
||||
echo "Setting CACHEBUST_DEPS..."
|
||||
CMD+=("--build-arg" "CACHEBUST_DEPS=$(date +%s)")
|
||||
fi
|
||||
|
||||
if [ "$REBUILD_VLLM" = true ]; then
|
||||
echo "Setting CACHEBUST_VLLM..."
|
||||
CMD+=("--build-arg" "CACHEBUST_VLLM=$(date +%s)")
|
||||
fi
|
||||
|
||||
# Add TRITON_REF to build arguments
|
||||
CMD+=("--build-arg" "TRITON_REF=$TRITON_REF")
|
||||
|
||||
# Add VLLM_REF to build arguments
|
||||
CMD+=("--build-arg" "VLLM_REF=$VLLM_REF")
|
||||
|
||||
# Add BUILD_JOBS to build arguments
|
||||
CMD+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS")
|
||||
|
||||
# Add GPU architecture to build arguments
|
||||
CMD+=("--build-arg" "TORCH_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
|
||||
CMD+=("--build-arg" "FLASHINFER_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
|
||||
|
||||
if [ "$PRE_FLASHINFER" = true ]; then
|
||||
echo "Using pre-release FlashInfer..."
|
||||
CMD+=("--build-arg" "FLASHINFER_PRE=--pre")
|
||||
fi
|
||||
|
||||
if [ -n "$VLLM_PRS" ]; then
|
||||
echo "Applying vLLM PRs: $VLLM_PRS"
|
||||
CMD+=("--build-arg" "VLLM_PRS=$VLLM_PRS")
|
||||
fi
|
||||
|
||||
if [ "$PRE_TRANSFORMERS" = true ]; then
|
||||
echo "Using transformers>=5.0.0..."
|
||||
CMD+=("--build-arg" "PRE_TRANSFORMERS=1")
|
||||
fi
|
||||
|
||||
# Add build context
|
||||
CMD+=(".")
|
||||
|
||||
# Execute build
|
||||
CMD=("docker" "build" "-t" "$IMAGE_TAG" "${COMMON_BUILD_FLAGS[@]}" "-f" "Dockerfile.mxfp4" ".")
|
||||
echo "Building image with command: ${CMD[*]}"
|
||||
BUILD_START=$(date +%s)
|
||||
"${CMD[@]}"
|
||||
BUILD_END=$(date +%s)
|
||||
BUILD_TIME=$((BUILD_END - BUILD_START))
|
||||
RUNNER_BUILD_TIME=$((BUILD_END - BUILD_START))
|
||||
else
|
||||
# ----------------------------------------------------------
|
||||
# Phase 1: FlashInfer wheels
|
||||
# ----------------------------------------------------------
|
||||
FLASHINFER_WHEELS_EXIST=false
|
||||
if compgen -G "./wheels/flashinfer*.whl" > /dev/null 2>&1; then
|
||||
FLASHINFER_WHEELS_EXIST=true
|
||||
fi
|
||||
|
||||
if [ "$REBUILD_FLASHINFER" = true ] || [ "$FLASHINFER_WHEELS_EXIST" = false ]; then
|
||||
if [ "$REBUILD_FLASHINFER" = true ]; then
|
||||
echo "Rebuilding FlashInfer wheels (--rebuild-flashinfer specified)..."
|
||||
else
|
||||
echo "No FlashInfer wheels found in ./wheels/ — building..."
|
||||
fi
|
||||
|
||||
FI_CMD=("docker" "build"
|
||||
"--target" "flashinfer-export"
|
||||
"--output" "type=local,dest=./wheels"
|
||||
"${COMMON_BUILD_FLAGS[@]}")
|
||||
|
||||
if [ "$REBUILD_FLASHINFER" = true ]; then
|
||||
FI_CMD+=("--build-arg" "CACHEBUST_FLASHINFER=$(date +%s)")
|
||||
fi
|
||||
|
||||
FI_CMD+=(".")
|
||||
|
||||
echo "FlashInfer build command: ${FI_CMD[*]}"
|
||||
FI_START=$(date +%s)
|
||||
"${FI_CMD[@]}"
|
||||
FI_END=$(date +%s)
|
||||
FLASHINFER_BUILD_TIME=$((FI_END - FI_START))
|
||||
else
|
||||
echo "FlashInfer wheels already present in ./wheels/ — skipping build."
|
||||
fi
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# Phase 2: vLLM wheels
|
||||
# ----------------------------------------------------------
|
||||
VLLM_WHEELS_EXIST=false
|
||||
if compgen -G "./wheels/vllm*.whl" > /dev/null 2>&1; then
|
||||
VLLM_WHEELS_EXIST=true
|
||||
fi
|
||||
|
||||
if [ "$REBUILD_VLLM" = true ] || [ "$VLLM_WHEELS_EXIST" = false ]; then
|
||||
if [ "$REBUILD_VLLM" = true ]; then
|
||||
echo "Rebuilding vLLM wheels (--rebuild-vllm specified)..."
|
||||
else
|
||||
echo "No vLLM wheels found in ./wheels/ — building..."
|
||||
fi
|
||||
|
||||
VLLM_CMD=("docker" "build"
|
||||
"--target" "vllm-export"
|
||||
"--output" "type=local,dest=./wheels"
|
||||
"${COMMON_BUILD_FLAGS[@]}"
|
||||
"--build-arg" "VLLM_REF=$VLLM_REF")
|
||||
|
||||
if [ "$REBUILD_VLLM" = true ]; then
|
||||
VLLM_CMD+=("--build-arg" "CACHEBUST_VLLM=$(date +%s)")
|
||||
fi
|
||||
|
||||
if [ -n "$VLLM_PRS" ]; then
|
||||
echo "Applying vLLM PRs: $VLLM_PRS"
|
||||
VLLM_CMD+=("--build-arg" "VLLM_PRS=$VLLM_PRS")
|
||||
fi
|
||||
|
||||
if [ "$PRE_TRANSFORMERS" = true ]; then
|
||||
echo "Using transformers>=5.0.0..."
|
||||
VLLM_CMD+=("--build-arg" "PRE_TRANSFORMERS=1")
|
||||
fi
|
||||
|
||||
VLLM_CMD+=(".")
|
||||
|
||||
echo "vLLM build command: ${VLLM_CMD[*]}"
|
||||
VLLM_START=$(date +%s)
|
||||
"${VLLM_CMD[@]}"
|
||||
VLLM_END=$(date +%s)
|
||||
VLLM_BUILD_TIME=$((VLLM_END - VLLM_START))
|
||||
else
|
||||
echo "vLLM wheels already present in ./wheels/ — skipping build."
|
||||
fi
|
||||
|
||||
# ----------------------------------------------------------
|
||||
# Phase 3: Runner image
|
||||
# ----------------------------------------------------------
|
||||
if ! compgen -G "./wheels/*.whl" > /dev/null 2>&1; then
|
||||
echo "Error: No wheel files found in ./wheels/ — cannot build runner image."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
RUNNER_CMD=("docker" "build"
|
||||
"-t" "$IMAGE_TAG"
|
||||
"${COMMON_BUILD_FLAGS[@]}")
|
||||
|
||||
if [ "$PRE_TRANSFORMERS" = true ]; then
|
||||
RUNNER_CMD+=("--build-arg" "PRE_TRANSFORMERS=1")
|
||||
fi
|
||||
|
||||
RUNNER_CMD+=(".")
|
||||
|
||||
echo "Building runner image with command: ${RUNNER_CMD[*]}"
|
||||
RUNNER_START=$(date +%s)
|
||||
"${RUNNER_CMD[@]}"
|
||||
RUNNER_END=$(date +%s)
|
||||
RUNNER_BUILD_TIME=$((RUNNER_END - RUNNER_START))
|
||||
fi
|
||||
else
|
||||
echo "Skipping build (--no-build specified)"
|
||||
fi
|
||||
|
||||
# Copy to host if requested
|
||||
# =====================================================
|
||||
# Copy to host(s) if requested
|
||||
# =====================================================
|
||||
COPY_TIME=0
|
||||
if [ "${#COPY_HOSTS[@]}" -gt 0 ]; then
|
||||
echo "Copying image '$IMAGE_TAG' to ${#COPY_HOSTS[@]} host(s): ${COPY_HOSTS[*]}"
|
||||
@@ -320,8 +353,14 @@ echo ""
|
||||
echo "========================================="
|
||||
echo " TIMING STATISTICS"
|
||||
echo "========================================="
|
||||
if [ "$BUILD_TIME" -gt 0 ]; then
|
||||
echo "Docker Build: $(printf '%02d:%02d:%02d' $((BUILD_TIME/3600)) $((BUILD_TIME%3600/60)) $((BUILD_TIME%60)))"
|
||||
if [ "$FLASHINFER_BUILD_TIME" -gt 0 ]; then
|
||||
echo "FlashInfer Build: $(printf '%02d:%02d:%02d' $((FLASHINFER_BUILD_TIME/3600)) $((FLASHINFER_BUILD_TIME%3600/60)) $((FLASHINFER_BUILD_TIME%60)))"
|
||||
fi
|
||||
if [ "$VLLM_BUILD_TIME" -gt 0 ]; then
|
||||
echo "vLLM Build: $(printf '%02d:%02d:%02d' $((VLLM_BUILD_TIME/3600)) $((VLLM_BUILD_TIME%3600/60)) $((VLLM_BUILD_TIME%60)))"
|
||||
fi
|
||||
if [ "$RUNNER_BUILD_TIME" -gt 0 ]; then
|
||||
echo "Runner Build: $(printf '%02d:%02d:%02d' $((RUNNER_BUILD_TIME/3600)) $((RUNNER_BUILD_TIME%3600/60)) $((RUNNER_BUILD_TIME%60)))"
|
||||
fi
|
||||
if [ "$COPY_TIME" -gt 0 ]; then
|
||||
echo "Image Copy: $(printf '%02d:%02d:%02d' $((COPY_TIME/3600)) $((COPY_TIME%3600/60)) $((COPY_TIME%60)))"
|
||||
|
||||
0
wheels/.gitkeep
Normal file
0
wheels/.gitkeep
Normal file
Reference in New Issue
Block a user