Added --use-wheels to use precompiled vLLM wheels instead of compiling from source

2025-12-20 20:25:07 -08:00
parent f075801c59
commit 76988e0c75
3 changed files with 111 additions and 0 deletions
--- a/Dockerfile.wheels
+++ b/Dockerfile.wheels
@@ -0,0 +1,75 @@
+# syntax=docker/dockerfile:1.6
+
+FROM nvidia/cuda:13.1.0-devel-ubuntu24.04
+
+# Global settings: non-interactive apt, pip may modify the system Python, vLLM base directory
+ENV DEBIAN_FRONTEND=noninteractive PIP_BREAK_SYSTEM_PACKAGES=1 \
+    VLLM_BASE_DIR=/workspace/vllm
+
+# Cap build parallelism to reduce OOM situations; the limits are kept in the
+# image environment in case some JIT compilation happens at runtime.
+ARG BUILD_JOBS=16
+ENV MAX_JOBS=${BUILD_JOBS} \
+    CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS} \
+    NINJAFLAGS="-j${BUILD_JOBS}" \
+    MAKEFLAGS="-j${BUILD_JOBS}"
+
+# Cache directories for pip and uv; uv is told to use the system Python
+ENV PIP_CACHE_DIR=/root/.cache/pip \
+    UV_CACHE_DIR=/root/.cache/uv \
+    UV_SYSTEM_PYTHON=1
+
+# Install minimal runtime dependencies (NCCL, Python) and uv; "devel" tools like cmake/gcc are NOT installed here to save space
+# apt-get replaces apt (apt has no stable CLI for scripts); --no-cache-dir keeps pip's download cache out of the layer
+RUN apt-get update && apt-get upgrade -y \
+    && apt-get install -y --allow-change-held-packages --no-install-recommends \
+    python3 python3-pip python3-dev vim curl git wget \
+    libcudnn9-cuda-13 \
+    libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
+    && rm -rf /var/lib/apt/lists/* \
+    && pip install --no-cache-dir uv
+
+# Use the vLLM base directory as the working directory from here on
+WORKDIR ${VLLM_BASE_DIR}
+
+# Fetch the o200k_base and cl100k_base tiktoken encodings into ./tiktoken_encodings
+RUN mkdir -p tiktoken_encodings \
+    && wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" \
+    && wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
+
+# Stage the TEMPORARY PATCH for fastsafetensors loading in cluster setup (copied early so this layer stays cached) - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+COPY fastsafetensors.patch ./
+
+# Install or upgrade fastsafetensors via uv, reusing the shared uv cache mount
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install --upgrade --system --break-system-packages fastsafetensors
+
+# --- VLLM WHEEL CACHE BUSTER ---
+# Change THIS argument to force a fresh install of the vLLM wheels
+# without re-installing the dependencies above.
+ARG CACHEBUST_VLLM=1
+ARG VLLM_WHEELS_URL=https://wheels.vllm.ai/nightly/cu130
+
+# Install nightly vLLM build from prebuilt wheels (URL is quoted so the shell cannot split or glob it)
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install --system --break-system-packages -U vllm \
+    --torch-backend=auto \
+    --extra-index-url "$VLLM_WHEELS_URL"
+
+# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
+# Applied inside the Python 3.12 dist-packages tree; the patch file is passed with -i
+RUN patch -p1 -d /usr/local/lib/python3.12/dist-packages -i ${VLLM_BASE_DIR}/fastsafetensors.patch
+
+# Runtime environment: CUDA arch list, ptxas path for Triton, tiktoken encodings directory
+ENV TORCH_CUDA_ARCH_LIST=12.1a \
+    TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas \
+    TIKTOKEN_ENCODINGS_BASE=${VLLM_BASE_DIR}/tiktoken_encodings
+
+# Copy the cluster node script and mark it executable in the same step
+# (COPY --chmod avoids the extra layer a follow-up RUN chmod would create)
+COPY --chmod=755 run-cluster-node.sh $VLLM_BASE_DIR/
+
+# Final extra deps (the extras spec is quoted so the shell does not treat [default] as a glob)
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install --system --break-system-packages "ray[default]"
+