From 76a8e92c86c9c65b352864a981b7535fb5682773 Mon Sep 17 00:00:00 2001
From: eugr
Date: Sat, 13 Dec 2025 21:18:26 -0800
Subject: [PATCH] Multistage build with caching

---
 Dockerfile | 142 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 97 insertions(+), 45 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 3300cfd..9b66709 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,7 @@
-FROM nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04
+# =========================================================
+# STAGE 1: Builder (Heavy image with compiler toolchain)
+# =========================================================
+FROM nvidia/cuda:13.0.2-devel-ubuntu24.04 AS builder
 
 # Set non-interactive frontend to prevent apt prompts
 ENV DEBIAN_FRONTEND=noninteractive
@@ -9,88 +12,137 @@ ENV PIP_BREAK_SYSTEM_PACKAGES=1
 # Set the base directory environment variable
 ENV VLLM_BASE_DIR=/workspace/vllm
 
-# 1. Install System Dependencies
-RUN apt update && apt upgrade -y && apt install -y --allow-change-held-packages \
-    curl \
-    vim \
-    cmake \
-    build-essential \
-    ninja-build \
-    python3-dev \
-    python3-pip \
-    git \
-    wget \
-    gnuplot \
-    libnccl-dev \
-    libnccl2 \
-    libibverbs1 \
-    libibverbs-dev \
-    rdma-core \
+# 1. Install Build Dependencies & Ccache
+# Added ccache to enable incremental compilation caching
+RUN apt update && apt upgrade -y \
+    && apt install -y --allow-change-held-packages --no-install-recommends \
+    curl vim cmake build-essential ninja-build \
+    python3-dev python3-pip git wget \
+    libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
+    ccache \
     && rm -rf /var/lib/apt/lists/*
 
+# Configure Ccache for CUDA/C++
+ENV PATH=/usr/lib/ccache:$PATH
+ENV CCACHE_DIR=/root/.ccache
+# Tell CMake to route compiler invocations through ccache
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache
+
 # Setup Workspace
 WORKDIR $VLLM_BASE_DIR
 
-# 2. Download Tiktoken files
-RUN mkdir -p tiktoken_encodings && \
-    wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
-    wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
-
-# 3. Set Environment Variables
+# 2. Set Environment Variables
 ENV TORCH_CUDA_ARCH_LIST=12.1a
 ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
-ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
 
 # --- CACHE BUSTER ---
 # Change this argument to force a re-download of PyTorch/FlashInfer
 ARG CACHEBUST_DEPS=1
 
-# 4. Install Python Dependencies
+# 3. Install Python Dependencies with Cache Mounts
+# A cache mount lets pip reuse previously downloaded wheels even when
+# this layer is invalidated.
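+#
+# NOTE: the cache mounts below require BuildKit, the default builder in current
+# Docker releases; on older setups enable it with DOCKER_BUILDKIT=1. Example
+# invocations (the image tag "vllm-custom" is illustrative):
+#   DOCKER_BUILDKIT=1 docker build -t vllm-custom .
+#   docker build --build-arg CACHEBUST_DEPS=2 -t vllm-custom .  # force re-download of deps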
-# Install PyTorch for CUDA 13.0
-RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
 
-# Install Helper libraries
-RUN pip install xgrammar triton termplotlib
+# Helper libraries. Add termplotlib below if you want text-based plots from `vllm bench serve`
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install xgrammar triton fastsafetensors
 
-# Install FlashInfer
-RUN pip install flashinfer-python --no-deps --index-url https://flashinfer.ai/whl && \
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install flashinfer-python --no-deps --index-url https://flashinfer.ai/whl && \
     pip install flashinfer-cubin --index-url https://flashinfer.ai/whl && \
     pip install flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu130 && \
    pip install apache-tvm-ffi nvidia-cudnn-frontend nvidia-cutlass-dsl nvidia-ml-py tabulate
 
-# Install fast safetensors to improve loading speeds
-RUN pip install fastsafetensors
-
 # --- VLLM SOURCE CACHE BUSTER ---
 # Change THIS argument to force a fresh git clone and rebuild of vLLM
 # without re-installing the dependencies above.
 ARG CACHEBUST_VLLM=1
 
-# 5. Clone and Build vLLM
-RUN git clone --recursive https://github.com/vllm-project/vllm.git
+# 4. Smart Git Clone (fetch changes instead of a full re-clone)
+# The cache mount at /repo-cache persists between builds in BuildKit's local cache
+RUN --mount=type=cache,target=/repo-cache \
+    # 1. Go into the persistent cache directory
+    cd /repo-cache && \
+    # 2. Clone if missing; otherwise fetch & reset
+    if [ ! -d "vllm" ]; then \
+        echo "Cache miss: Cloning vLLM from scratch..." && \
+        git clone --recursive https://github.com/vllm-project/vllm.git; \
+    else \
+        echo "Cache hit: Fetching updates..." && \
+        cd vllm && \
+        git fetch --all && \
+        git reset --hard origin/main && \
+        git submodule update --init --recursive; \
+    fi && \
+    # 3. Copy the updated code from the cache into the container workspace
+    # ('cp -a' preserves permissions and timestamps)
+    cp -a /repo-cache/vllm $VLLM_BASE_DIR/
+
 WORKDIR $VLLM_BASE_DIR/vllm
 
 # Prepare build requirements
-RUN python3 use_existing_torch.py && \
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 use_existing_torch.py && \
     sed -i "/flashinfer/d" requirements/cuda.txt && \
     pip install -r requirements/build.txt
 
+# Apply Patches
 # TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
 COPY fastsafetensors.patch .
 RUN patch -p1 < fastsafetensors.patch
 
-# Final Build
-# Uses --no-build-isolation to respect the pre-installed Torch/FlashInfer
-RUN pip install --no-build-isolation . -v
+# Final Compilation (--no-build-isolation respects the pre-installed Torch/FlashInfer)
+# The ccache mount persists across separate `docker build` invocations on this
+# machine (until the builder cache is pruned), so rebuilds recompile only changed files
+RUN --mount=type=cache,target=/root/.ccache \
+    --mount=type=cache,target=/root/.cache/pip \
+    pip install --no-build-isolation . -v
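+
+# Tip (optional): to confirm ccache is actually being hit on a rebuild, you can
+# temporarily add a step such as
+#   RUN --mount=type=cache,target=/root/.ccache ccache -s
+# after the build above and read the hit/miss statistics in the build log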
 
-# Set the final workdir
+
+# =========================================================
+# STAGE 2: Runner (Lightweight Runtime Image)
+# =========================================================
+FROM nvidia/cuda:13.0.2-devel-ubuntu24.04 AS runner
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PIP_BREAK_SYSTEM_PACKAGES=1
+ENV VLLM_BASE_DIR=/workspace/vllm
+
+# Install minimal runtime dependencies (NCCL, Python)
+# Note: build tools like cmake/gcc are NOT installed here to save space; the
+# CUDA "devel" base is kept so Triton can find ptxas (see TRITON_PTXAS_PATH below)
+RUN apt update && apt upgrade -y \
+    && apt install -y --allow-change-held-packages --no-install-recommends \
+    python3 python3-pip python3-dev vim curl git wget \
+    libnccl-dev libnccl2 libibverbs1 rdma-core \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set final working directory
 WORKDIR $VLLM_BASE_DIR
 
-# Copy clustering script
+# Copy artifacts from the builder stage: the installed Python packages and
+# executables. The vLLM source tree is not needed, because the compiled
+# package already lives in site-packages
+COPY --from=builder /usr/local/lib/python3.12/dist-packages /usr/local/lib/python3.12/dist-packages
+COPY --from=builder /usr/local/bin /usr/local/bin
+
+# Download Tiktoken files
+RUN mkdir -p tiktoken_encodings && \
+    wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
+    wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
+
+# Setup Env for Runtime
+ENV TORCH_CUDA_ARCH_LIST=12.1a
+ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
+ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
+ENV PATH=$VLLM_BASE_DIR:$PATH
+
+# Copy scripts
 COPY run-cluster-node.sh $VLLM_BASE_DIR/
 RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh
 
-# Install additional modules for Ray dashboard support
+# Install Ray (with dashboard support) for multi-node clustering
 RUN pip install ray[default]
-
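
A typical rebuild cycle with this Dockerfile (the tag "vllm-custom" and the
build-arg values are illustrative): the pip, ccache, and git cache mounts
persist in BuildKit's local builder cache between invocations, so only the
first build pays the full compilation cost.

    docker build -t vllm-custom .                                # cold build; warms all caches
    docker build --build-arg CACHEBUST_VLLM=2 -t vllm-custom .   # refresh vLLM source; deps stay cached
    docker builder prune                                         # reclaims disk space (also clears the cache mounts)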