Added --use-wheels to use precompiled vLLM wheels instead of compiling from source

This commit is contained in:
Eugene Rakhmatulin
2025-12-20 20:25:07 -08:00
parent f075801c59
commit 76988e0c75
3 changed files with 111 additions and 0 deletions

75
Dockerfile.wheels Normal file
View File

@@ -0,0 +1,75 @@
# syntax=docker/dockerfile:1.6
# Base image: NVIDIA CUDA 13.1.0 "devel" flavour on Ubuntu 24.04
FROM nvidia/cuda:13.1.0-devel-ubuntu24.04
# Keep apt from prompting for input during the build
ENV DEBIAN_FRONTEND=noninteractive
# Allow pip to install into the distribution-managed system Python
ENV PIP_BREAK_SYSTEM_PACKAGES=1
# Directory that holds the scripts, patch and tiktoken files inside the image
ENV VLLM_BASE_DIR=/workspace/vllm
# vLLM is installed from prebuilt wheels in this image, so these limits are
# kept only in case some JIT compilation happens at runtime: capping the
# number of parallel jobs reduces the risk of running out of memory.
ARG BUILD_JOBS=16
ENV MAX_JOBS=${BUILD_JOBS}
ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
ENV NINJAFLAGS="-j${BUILD_JOBS}"
ENV MAKEFLAGS="-j${BUILD_JOBS}"
# Set pip and uv cache directories
ENV PIP_CACHE_DIR=/root/.cache/pip
ENV UV_CACHE_DIR=/root/.cache/uv
# Make uv target the system Python instead of a virtual environment
ENV UV_SYSTEM_PYTHON=1
# Install runtime dependencies: Python, cuDNN, NCCL and RDMA/InfiniBand verbs
# libraries, plus a few utilities (vim, curl, git, wget), then install uv via pip.
# Note: build tools like cmake/gcc are NOT requested here to save space.
# NOTE(review): the "-devel" base image may already ship a compiler toolchain - confirm.
# The apt package lists are removed in the same layer so they do not end up in the image.
RUN apt update && apt upgrade -y \
&& apt install -y --allow-change-held-packages --no-install-recommends \
python3 python3-pip python3-dev vim curl git wget \
libcudnn9-cuda-13 \
libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
&& rm -rf /var/lib/apt/lists/* \
&& pip install uv
# Set final working directory
WORKDIR $VLLM_BASE_DIR
# Download the Tiktoken encoding files (o200k_base and cl100k_base) into
# ./tiktoken_encodings so they are part of the image; TIKTOKEN_ENCODINGS_BASE
# is pointed at this directory further down.
RUN mkdir -p tiktoken_encodings && \
wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
# Stage the TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
# The file is only copied into the working directory here; it is applied to
# site-packages by a later `patch` step.
COPY fastsafetensors.patch .
# Install (or upgrade to) the latest fastsafetensors, using the uv cache mount
# shared by the other uv steps in this file.
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv pip install --system --break-system-packages -U fastsafetensors
# --- VLLM WHEEL CACHE BUSTER ---
# Change THIS argument to force a fresh install of the vLLM wheel without
# re-running the dependency layers above. (Nothing is cloned or compiled in
# this Dockerfile; a changed ARG value invalidates the cache of the RUN
# instructions that follow it.)
ARG CACHEBUST_VLLM=1
# Package index that serves the prebuilt vLLM wheels; override with
# --build-arg VLLM_WHEELS_URL=... to install from a different index.
ARG VLLM_WHEELS_URL=https://wheels.vllm.ai/nightly/cu130
# Install vLLM from the prebuilt wheels published at VLLM_WHEELS_URL.
# The URL is quoted so the shell passes it to uv as a single, literal word.
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    uv pip install --system --break-system-packages -U vllm \
        --torch-backend=auto \
        --extra-index-url "${VLLM_WHEELS_URL}"
# Apply TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
# Apply in site-packages
# NOTE(review): the target path hard-codes Python 3.12's dist-packages; it must
# be updated if the base image's Python version changes.
RUN patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch
# Setup Env for Runtime
# NOTE(review): 12.1a presumably selects the target GPU compute capability - confirm.
ENV TORCH_CUDA_ARCH_LIST=12.1a
# Point Triton at the ptxas binary under the CUDA installation directory
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
# Directory holding the tiktoken encoding files downloaded earlier in this file
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
# Copy the cluster node launcher script into the image and make it executable
COPY run-cluster-node.sh $VLLM_BASE_DIR/
RUN chmod +x "$VLLM_BASE_DIR/run-cluster-node.sh"
# Final extra deps
# The requirement is quoted: unquoted, the shell treats "[default]" as a glob
# bracket expression and could expand it to a matching file name in the
# working directory instead of passing the extra to uv.
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    uv pip install --system --break-system-packages "ray[default]"

View File

@@ -24,6 +24,13 @@ The Dockerfile builds from the main branch of VLLM, so depending on when you run
## CHANGELOG
### 2025-12-20
- Added `--use-wheels [mode]` flag to `build-and-copy.sh`.
- Allows building the container using pre-built vLLM wheels instead of compiling from source.
- The resulting Docker image is considerably smaller (14GB vs 24GB).
- `mode` is optional and defaults to `nightly`.
- Supported modes: `nightly`. The `release` mode is recognized but currently rejected, because release wheels are broken with CUDA 13.
### 2025-12-19
Updated `build-and-copy.sh` to support copying to multiple hosts (thanks @ericlewis for the contribution).
@@ -179,6 +186,7 @@ Using a different username:
| `--rebuild-vllm` | Force rebuild vLLM source only (sets CACHEBUST_VLLM) |
| `--triton-ref <ref>` | Triton commit SHA, branch or tag (default: 'v3.5.1') |
| `--vllm-ref <ref>` | vLLM commit SHA, branch or tag (default: 'main') |
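| `--use-wheels [mode]` | Use pre-built vLLM wheels instead of compiling from source. Mode: `nightly` (default). `release` is accepted by the parser but currently rejected at build time because release wheels are broken with CUDA 13. |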
| `-c, --copy-to <host[,host...] or host host...>` | Host(s) to copy the image to after building (space- or comma-separated list after the flag). |
| `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). |
| `--copy-parallel` | Copy to all specified hosts concurrently. |

View File

@@ -15,6 +15,7 @@ TRITON_REF="v3.5.1"
VLLM_REF="main"
TMP_IMAGE=""
PARALLEL_COPY=false
# Wheel mode selected with --use-wheels: empty means "build vLLM from source",
# otherwise it holds "nightly" or "release".
USE_WHEELS_MODE=""
cleanup() {
if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
@@ -67,6 +68,7 @@ usage() {
echo " --copy-parallel : Copy to all hosts in parallel instead of serially."
echo " -j, --build-jobs <jobs> : Number of concurrent build jobs (default: \${BUILD_JOBS})"
echo " -u, --user <user> : Username for ssh command (default: \$USER)"
echo " --use-wheels [mode] : Use prebuilt vLLM wheels. Mode can be 'nightly' (default) or 'release'."
echo " --no-build : Skip building, only copy image (requires --copy-to)"
echo " -h, --help : Show this help message"
exit 1
@@ -115,6 +117,18 @@ while [[ "$#" -gt 0 ]]; do
-j|--build-jobs) BUILD_JOBS="$2"; shift ;;
-u|--user) SSH_USER="$2"; shift ;;
--copy-parallel) PARALLEL_COPY=true ;;
--use-wheels)
# The mode argument is optional: $2 is taken as the mode only when it is
# non-empty and does not look like another option (leading "-").
if [[ "$2" != -* && -n "$2" ]]; then
# Reject anything other than the two known mode names.
if [[ "$2" != "nightly" && "$2" != "release" ]]; then
echo "Error: --use-wheels argument must be 'nightly' or 'release'."
exit 1
fi
USE_WHEELS_MODE="$2"
# Consume the mode value. NOTE(review): the flag itself is presumably
# shifted by the enclosing loop, which is not visible in this hunk.
shift
else
# No mode given: default to nightly wheels.
USE_WHEELS_MODE="nightly"
fi
;;
--no-build) NO_BUILD=true ;;
-h|--help) usage ;;
*) echo "Unknown parameter passed: $1"; usage ;;
@@ -134,6 +148,20 @@ if [ "$NO_BUILD" = false ]; then
# Construct build command
CMD=("docker" "build" "-t" "$IMAGE_TAG")

# Pick the Dockerfile and wheel index when --use-wheels was given.
# USE_WHEELS_MODE is empty for a from-source build.
if [ -n "$USE_WHEELS_MODE" ]; then
    if [ "$USE_WHEELS_MODE" = "release" ]; then
        # Fail fast, before touching CMD or announcing a wheel build.
        # When release wheels work with CUDA 13 again, replace this branch
        # with: VLLM_WHEELS_URL=https://wheels.vllm.ai/cu130
        echo "Release wheels are currently broken with CUDA 13, use nightly instead." >&2
        exit 1
    fi
    echo "Using pre-built vLLM wheels (mode: $USE_WHEELS_MODE)"
    CMD+=("-f" "Dockerfile.wheels")
    CMD+=("--build-arg" "VLLM_WHEELS_URL=https://wheels.vllm.ai/nightly/cu130")
else
    echo "Building vLLM from source"
fi
if [ "$REBUILD_DEPS" = true ]; then
echo "Setting CACHEBUST_DEPS..."
CMD+=("--build-arg" "CACHEBUST_DEPS=$(date +%s)")