Added --use-wheels to use precompiled vLLM wheels instead of compiling from the source
This commit is contained in:
75
Dockerfile.wheels
Normal file
75
Dockerfile.wheels
Normal file
@@ -0,0 +1,75 @@
|
||||
# syntax=docker/dockerfile:1.6
|
||||
|
||||
FROM nvidia/cuda:13.1.0-devel-ubuntu24.04
|
||||
|
||||
# Build-time only: keeps apt non-interactive in the RUN steps below without
# persisting the setting into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive
|
||||
ENV PIP_BREAK_SYSTEM_PACKAGES=1
|
||||
ENV VLLM_BASE_DIR=/workspace/vllm
|
||||
|
||||
# In case JIT compilation happens at runtime:
|
||||
# Limit build parallelism to reduce OOM situations
|
||||
ARG BUILD_JOBS=16
|
||||
ENV MAX_JOBS=${BUILD_JOBS}
|
||||
ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
|
||||
ENV NINJAFLAGS="-j${BUILD_JOBS}"
|
||||
ENV MAKEFLAGS="-j${BUILD_JOBS}"
|
||||
|
||||
# Set pip cache directory
|
||||
ENV PIP_CACHE_DIR=/root/.cache/pip
|
||||
ENV UV_CACHE_DIR=/root/.cache/uv
|
||||
ENV UV_SYSTEM_PYTHON=1
|
||||
|
||||
# Install minimal runtime dependencies (NCCL, Python)
|
||||
# Note: "devel" tools like cmake/gcc are NOT installed here to save space
|
||||
# Refresh the package index, upgrade the base image, install Python plus the
# cuDNN/NCCL/RDMA libraries, then install uv with pip.
# apt-get is used instead of apt because apt's command-line interface is not
# intended for scripts; --with-new-pkgs keeps the behavior of `apt upgrade`.
# The apt lists are removed in the same layer so they never reach the image.
# This step has no cache mount, so pip is told not to keep its download cache,
# which would otherwise be stored in the layer.
RUN apt-get update && apt-get upgrade -y --with-new-pkgs \
    && apt-get install -y --allow-change-held-packages --no-install-recommends \
        python3 python3-pip python3-dev vim curl git wget \
        libcudnn9-cuda-13 \
        libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
    && rm -rf /var/lib/apt/lists/* \
    && pip install --no-cache-dir uv
|
||||
|
||||
# Set final working directory
|
||||
WORKDIR $VLLM_BASE_DIR
|
||||
|
||||
# Download Tiktoken files
|
||||
# Fetch both tiktoken encoding files into ./tiktoken_encodings (relative to WORKDIR).
# `set -e` aborts the step on the first failing command, as the && chain did.
RUN set -e; \
    mkdir -p tiktoken_encodings; \
    wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken"; \
    wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
|
||||
|
||||
# Cache TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
|
||||
COPY fastsafetensors.patch .
|
||||
|
||||
# Install fastsafetensors
|
||||
# Install (or upgrade to) the latest fastsafetensors into the system interpreter.
# uv's cache directory is a BuildKit cache mount shared under the id "uv-cache".
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    uv pip install --upgrade --system --break-system-packages fastsafetensors
|
||||
|
||||
# --- VLLM WHEEL CACHE BUSTER ---
|
||||
# Change THIS argument to force a fresh download and install of the vLLM wheels
|
||||
# without re-installing the dependencies above.
|
||||
ARG CACHEBUST_VLLM=1
|
||||
ARG VLLM_WHEELS_URL=https://wheels.vllm.ai/nightly/cu130
|
||||
|
||||
# Install nightly vLLM build from prebuilt wheels
|
||||
# Install (or upgrade to) vLLM from prebuilt wheels, using the index given by
# the VLLM_WHEELS_URL build argument as an extra package index.
# The URL is quoted so a caller-supplied value is passed to uv as one word,
# without shell word splitting or glob expansion.
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    uv pip install --system --break-system-packages -U vllm \
        --torch-backend=auto \
        --extra-index-url "$VLLM_WHEELS_URL"
|
||||
|
||||
# Apply TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
|
||||
# Apply in site-packages
|
||||
# Applies fastsafetensors.patch (copied into ${VLLM_BASE_DIR} earlier) to the installed packages.
# -p1 strips the leading path component from file names in the patch; -d makes patch
# work from inside the dist-packages directory.
# NOTE(review): the path hard-codes Python 3.12; keep it in sync with the python3 version of the base image.
# NOTE(review): `patch` is not in this file's apt install list; presumably the base image provides it — verify.
RUN patch -p1 -d /usr/local/lib/python3.12/dist-packages < ${VLLM_BASE_DIR}/fastsafetensors.patch
|
||||
|
||||
# Setup Env for Runtime
|
||||
ENV TORCH_CUDA_ARCH_LIST=12.1a
|
||||
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
||||
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
|
||||
|
||||
# Copy scripts
|
||||
# Copy the cluster node launcher and make it executable in the same layer.
# COPY --chmod avoids the second layer that a separate `RUN chmod` creates.
# It needs BuildKit, which this file already requires for RUN --mount.
COPY --chmod=755 run-cluster-node.sh $VLLM_BASE_DIR/
|
||||
|
||||
# Final extra deps
|
||||
# Install Ray with its "default" extra.
# The requirement is quoted because [default] is a shell glob pattern: unquoted,
# it would be rewritten to a file name if one in the working directory matched it.
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
    uv pip install --system --break-system-packages "ray[default]"
|
||||
|
||||
@@ -24,6 +24,13 @@ The Dockerfile builds from the main branch of VLLM, so depending on when you run
|
||||
|
||||
## CHANGELOG
|
||||
|
||||
### 2025-12-20
|
||||
|
||||
- Added `--use-wheels [mode]` flag to `build-and-copy.sh`.
|
||||
- Allows building the container using pre-built vLLM wheels instead of compiling from source.
|
||||
- The resulting Docker container size is reduced considerably (14GB vs 24GB)
|
||||
- `mode` is optional and defaults to `nightly`.
|
||||
- Supported modes: `nightly` (release wheels are broken with CUDA 13 currently).
|
||||
### 2025-12-19
|
||||
|
||||
Updated `build-and-copy.sh` to support copying to multiple hosts (thanks @ericlewis for the contribution).
|
||||
@@ -179,6 +186,7 @@ Using a different username:
|
||||
| `--rebuild-vllm` | Force rebuild vLLM source only (sets CACHEBUST_VLLM) |
|
||||
| `--triton-ref <ref>` | Triton commit SHA, branch or tag (default: 'v3.5.1') |
|
||||
| `--vllm-ref <ref>` | vLLM commit SHA, branch or tag (default: 'main') |
|
||||
| `--use-wheels [mode]` | Use pre-built vLLM wheels. Mode: `nightly` (default). `release` is recognized but currently rejected because release wheels are broken with CUDA 13. |
|
||||
| `-c, --copy-to <host[,host...] or host host...>` | Host(s) to copy the image to after building (space- or comma-separated list after the flag). |
|
||||
| `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). |
|
||||
| `--copy-parallel` | Copy to all specified hosts concurrently. |
|
||||
|
||||
@@ -15,6 +15,7 @@ TRITON_REF="v3.5.1"
|
||||
VLLM_REF="main"
|
||||
TMP_IMAGE=""
|
||||
PARALLEL_COPY=false
|
||||
USE_WHEELS_MODE=""
|
||||
|
||||
cleanup() {
|
||||
if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
|
||||
@@ -67,6 +68,7 @@ usage() {
|
||||
echo " --copy-parallel : Copy to all hosts in parallel instead of serially."
|
||||
echo " -j, --build-jobs <jobs> : Number of concurrent build jobs (default: \${BUILD_JOBS})"
|
||||
echo " -u, --user <user> : Username for ssh command (default: \$USER)"
|
||||
echo " --use-wheels [mode] : Use prebuilt vLLM wheels. Mode can be 'nightly' (default) or 'release'."
|
||||
echo " --no-build : Skip building, only copy image (requires --copy-to)"
|
||||
echo " -h, --help : Show this help message"
|
||||
exit 1
|
||||
@@ -115,6 +117,18 @@ while [[ "$#" -gt 0 ]]; do
|
||||
-j|--build-jobs) BUILD_JOBS="$2"; shift ;;
|
||||
-u|--user) SSH_USER="$2"; shift ;;
|
||||
--copy-parallel) PARALLEL_COPY=true ;;
|
||||
--use-wheels)
    # The mode is optional: the next word is consumed only when it is
    # non-empty and does not look like another option.
    if [[ -z "$2" || "$2" == -* ]]; then
        USE_WHEELS_MODE="nightly"
    else
        case "$2" in
            nightly|release)
                USE_WHEELS_MODE="$2"
                shift
                ;;
            *)
                echo "Error: --use-wheels argument must be 'nightly' or 'release'."
                exit 1
                ;;
        esac
    fi
    ;;
|
||||
--no-build) NO_BUILD=true ;;
|
||||
-h|--help) usage ;;
|
||||
*) echo "Unknown parameter passed: $1"; usage ;;
|
||||
@@ -134,6 +148,20 @@ if [ "$NO_BUILD" = false ]; then
|
||||
# Construct build command
|
||||
CMD=("docker" "build" "-t" "$IMAGE_TAG")
|
||||
|
||||
# Choose between the wheels-based Dockerfile and the default source build.
if [ -n "$USE_WHEELS_MODE" ]; then
    echo "Using pre-built vLLM wheels (mode: $USE_WHEELS_MODE)"
    CMD+=("-f" "Dockerfile.wheels")
    if [ "$USE_WHEELS_MODE" = "release" ]; then
        # Release wheels cannot be used yet, so abort before building.
        # The unreachable statement that used to follow `exit 1` is kept here
        # as a note for re-enabling this mode:
        #   CMD+=("--build-arg" "VLLM_WHEELS_URL=https://wheels.vllm.ai/cu130")
        echo "Release wheels are currently broken with CUDA 13, use nightly instead."
        exit 1
    else
        CMD+=("--build-arg" "VLLM_WHEELS_URL=https://wheels.vllm.ai/nightly/cu130")
    fi
else
    echo "Building vLLM from source"
fi
|
||||
|
||||
if [ "$REBUILD_DEPS" = true ]; then
|
||||
echo "Setting CACHEBUST_DEPS..."
|
||||
CMD+=("--build-arg" "CACHEBUST_DEPS=$(date +%s)")
|
||||
|
||||
Reference in New Issue
Block a user