diff --git a/Dockerfile.wheels b/Dockerfile.wheels
new file mode 100644
index 0000000..669637f
--- /dev/null
+++ b/Dockerfile.wheels
@@ -0,0 +1,75 @@
+# syntax=docker/dockerfile:1.6
+
+FROM nvidia/cuda:13.1.0-devel-ubuntu24.04
+
+ARG DEBIAN_FRONTEND=noninteractive
+ENV PIP_BREAK_SYSTEM_PACKAGES=1
+ENV VLLM_BASE_DIR=/workspace/vllm
+
+# Just in case if some JIT compilation happens during runtime
+# Limit build parallelism to reduce OOM situations
+ARG BUILD_JOBS=16
+ENV MAX_JOBS=${BUILD_JOBS}
+ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
+ENV NINJAFLAGS="-j${BUILD_JOBS}"
+ENV MAKEFLAGS="-j${BUILD_JOBS}"
+
+# Set pip cache directory
+ENV PIP_CACHE_DIR=/root/.cache/pip
+ENV UV_CACHE_DIR=/root/.cache/uv
+ENV UV_SYSTEM_PYTHON=1
+
+# Install minimal runtime dependencies (NCCL, Python)
+# Note: "devel" tools like cmake/gcc are NOT installed here to save space
+RUN apt-get update && apt-get upgrade -y \
+    && apt-get install -y --allow-change-held-packages --no-install-recommends \
+    python3 python3-pip python3-dev vim curl git wget patch \
+    libcudnn9-cuda-13 \
+    libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
+    && rm -rf /var/lib/apt/lists/* \
+    && pip install uv
+
+# Set final working directory
+WORKDIR $VLLM_BASE_DIR
+
+# Download Tiktoken files
+RUN mkdir -p tiktoken_encodings && \
+    wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
+    wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
+
+# Cache TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+COPY fastsafetensors.patch .
+
+# Install fastsafetensors
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install --system --break-system-packages -U fastsafetensors
+
+# --- VLLM SOURCE CACHE BUSTER ---
+# Change THIS argument to force a fresh git clone and rebuild of vLLM
+# without re-installing the dependencies above.
+ARG CACHEBUST_VLLM=1
+ARG VLLM_WHEELS_URL=https://wheels.vllm.ai/nightly/cu130
+
+# Install nightly vLLM build from prebuilt wheels
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install --system --break-system-packages -U vllm \
+    --torch-backend=auto \
+    --extra-index-url $VLLM_WHEELS_URL
+
+# Apply TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+# Apply in site-packages
+RUN patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch
+
+# Setup Env for Runtime
+ENV TORCH_CUDA_ARCH_LIST=12.1a
+ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
+ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
+
+# Copy scripts
+COPY run-cluster-node.sh $VLLM_BASE_DIR/
+RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh
+
+# Final extra deps
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install --system --break-system-packages "ray[default]"
+
diff --git a/README.md b/README.md
index fc2e79f..dbac2b1 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,13 @@ The Dockerfile builds from the main branch of VLLM, so depending on when you run
 
 ## CHANGELOG
 
+### 2025-12-20
+
+- Added `--use-wheels [mode]` flag to `build-and-copy.sh`.
+  - Allows building the container using pre-built vLLM wheels instead of compiling from source.
+  - The resulting Docker container size is reduced considerably (14GB vs 24GB).
+  - `mode` is optional and defaults to `nightly`.
+  - Supported modes: `nightly` (release wheels are broken with CUDA 13 currently).
 ### 2025-12-19
 
 Updated `build-and-copy.sh` to support copying to multiple hosts (thanks @ericlewis for the contribution).
@@ -179,6 +186,7 @@ Using a different username:
 | `--rebuild-vllm` | Force rebuild vLLM source only (sets CACHEBUST_VLLM) |
 | `--triton-ref ` | Triton commit SHA, branch or tag (default: 'v3.5.1') |
 | `--vllm-ref ` | vLLM commit SHA, branch or tag (default: 'main') |
+| `--use-wheels [mode]` | Use pre-built vLLM wheels. Mode: `nightly` (default) or `release`. |
 | `-c, --copy-to ` | Host(s) to copy the image to after building (space- or comma-separated list after the flag). |
 | `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). |
 | `--copy-parallel` | Copy to all specified hosts concurrently. |
diff --git a/build-and-copy.sh b/build-and-copy.sh
index c970b65..eb26cc5 100755
--- a/build-and-copy.sh
+++ b/build-and-copy.sh
@@ -15,6 +15,7 @@ TRITON_REF="v3.5.1"
 VLLM_REF="main"
 TMP_IMAGE=""
 PARALLEL_COPY=false
+USE_WHEELS_MODE=""
 
 cleanup() {
     if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
@@ -67,6 +68,7 @@ usage() {
     echo " --copy-parallel : Copy to all hosts in parallel instead of serially."
     echo " -j, --build-jobs : Number of concurrent build jobs (default: \${BUILD_JOBS})"
     echo " -u, --user : Username for ssh command (default: \$USER)"
+    echo " --use-wheels [mode] : Use prebuilt vLLM wheels. Mode can be 'nightly' (default) or 'release'."
     echo " --no-build : Skip building, only copy image (requires --copy-to)"
     echo " -h, --help : Show this help message"
     exit 1
@@ -115,6 +117,18 @@ while [[ "$#" -gt 0 ]]; do
         -j|--build-jobs) BUILD_JOBS="$2"; shift ;;
         -u|--user) SSH_USER="$2"; shift ;;
         --copy-parallel) PARALLEL_COPY=true ;;
+        --use-wheels)
+            if [[ "$2" != -* && -n "$2" ]]; then
+                if [[ "$2" != "nightly" && "$2" != "release" ]]; then
+                    echo "Error: --use-wheels argument must be 'nightly' or 'release'."
+                    exit 1
+                fi
+                USE_WHEELS_MODE="$2"
+                shift
+            else
+                USE_WHEELS_MODE="nightly"
+            fi
+            ;;
         --no-build) NO_BUILD=true ;;
         -h|--help) usage ;;
         *) echo "Unknown parameter passed: $1"; usage ;;
@@ -134,6 +148,20 @@ if [ "$NO_BUILD" = false ]; then
 
     # Construct build command
     CMD=("docker" "build" "-t" "$IMAGE_TAG")
+    if [ -n "$USE_WHEELS_MODE" ]; then
+        echo "Using pre-built vLLM wheels (mode: $USE_WHEELS_MODE)"
+        CMD+=("-f" "Dockerfile.wheels")
+        if [ "$USE_WHEELS_MODE" = "release" ]; then
+            # TODO: when CUDA 13 release wheels work, set VLLM_WHEELS_URL=https://wheels.vllm.ai/cu130 here instead of exiting.
+            echo "Release wheels are currently broken with CUDA 13, use nightly instead."
+            exit 1
+        else
+            CMD+=("--build-arg" "VLLM_WHEELS_URL=https://wheels.vllm.ai/nightly/cu130")
+        fi
+    else
+        echo "Building vLLM from source"
+    fi
+
     if [ "$REBUILD_DEPS" = true ]; then
         echo "Setting CACHEBUST_DEPS..."
         CMD+=("--build-arg" "CACHEBUST_DEPS=$(date +%s)")