diff --git a/Dockerfile.wheels b/Dockerfile.wheels
index 4dbc20d..26b7e44 100644
--- a/Dockerfile.wheels
+++ b/Dockerfile.wheels
@@ -24,7 +24,7 @@ ENV UV_LINK_MODE=copy
 # Note: "devel" tools like cmake/gcc are NOT installed here to save space
 RUN apt update && apt upgrade -y \
     && apt install -y --allow-change-held-packages --no-install-recommends \
-    python3 python3-pip python3-dev vim curl git wget \
+    python3 python3-pip python3-dev vim curl git wget jq \
     libcudnn9-cuda-13 \
     libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \
     && rm -rf /var/lib/apt/lists/* \
@@ -49,13 +49,20 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
 # Change THIS argument to force a fresh git clone and rebuild of vLLM
 # without re-installing the dependencies above.
 ARG CACHEBUST_VLLM=1
-ARG VLLM_WHEELS_URL=https://wheels.vllm.ai/nightly/cu130
+ARG WHEELS_FROM_GITHUB_RELEASE=0
 
-# Install nightly vLLM build from prebuilt wheels
+# Install vLLM
+# If WHEELS_FROM_GITHUB_RELEASE is 1, install from GitHub releases (aarch64/cu130 wheel only, as requested)
+# Otherwise, install from nightly wheels
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install --system --break-system-packages -U vllm \
-    --torch-backend=auto \
-    --extra-index-url $VLLM_WHEELS_URL
+    if [ "$WHEELS_FROM_GITHUB_RELEASE" = "1" ]; then \
+        export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') && \
+        uv pip install --system --break-system-packages -U https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl --torch-backend=auto; \
+    else \
+        uv pip install --system --break-system-packages -U vllm \
+            --torch-backend=auto \
+            --extra-index-url https://wheels.vllm.ai/nightly/cu130; \
+    fi
 
 # Apply TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/foundation-model-stack/fastsafetensors/issues/36
 # Apply in site-packages
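Note (illustrative, not part of the patch): the release branch above resolves the wheel URL at image build time. To sanity-check which wheel the build will fetch before running it, something like the following can be run on the host — a minimal sketch assuming `curl` and `jq` are available locally:

    #!/usr/bin/env bash
    set -euo pipefail
    # Resolve the newest vLLM release tag and strip the leading "v",
    # mirroring the logic in Dockerfile.wheels
    VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
    URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl"
    echo "latest release: ${VLLM_VERSION}"
    echo "wheel URL:      ${URL}"
    # HEAD request following redirects; 200 means the wheel exists for this platform
    curl -sIL -o /dev/null -w '%{http_code}\n' "${URL}"
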
diff --git a/README.md b/README.md
index ec80752..34b4788 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,10 @@
-# vLLM Ray Cluster Node Docker for DGX Spark
+# vLLM Docker Optimized for DGX Spark (single or multi-node)
 
 This repository contains the Docker configuration and startup scripts to run a multi-node vLLM inference cluster using Ray. It supports InfiniBand/RDMA (NCCL) and custom environment configuration for high-performance setups.
 
+While it was primarily developed to support multi-node inference, it works just as well on single-node setups.
+
 ## Table of Contents
 
 - [DISCLAIMER](#disclaimer)
@@ -20,7 +22,7 @@ This repository contains the Docker configuration and startup scripts to run a m
 This repository is not affiliated with NVIDIA or their subsidiaries. This is a community effort aimed to help DGX Spark users to set up and run the most recent versions of vLLM on Spark cluster or single nodes.
 
-The Dockerfile builds from the main branch of VLLM, so depending on when you run the build process, it may not be in fully functioning state. You can target a specific vLLM release by setting `--vllm-ref` parameter.
+The Dockerfile builds from the main branch of vLLM, so depending on when you run the build process, it may not be in a fully functioning state. You can target a specific vLLM release by setting the `--vllm-ref` parameter, or use `--use-wheels release` to install pre-built release wheels.
 
 ## CHANGELOG
@@ -44,6 +46,11 @@ Don't do it every time you rebuild, because it will slow down compilation times.
 For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
 
+### 2025-12-21
+
+Pre-built wheels now support release versions. Use with `--use-wheels release`.
+Using nightly wheels or building from source is still recommended for better performance.
+
 ### 2025-12-20
 
 - Limited ccache to 50G when building from source to reduce build cache size.
@@ -52,7 +59,7 @@ For periodic maintenance, I recommend using a filter: `docker builder prune --fi
 - Allows building the container using pre-built vLLM wheels instead of compiling from source.
 - Reduced build time and container size.
 - `mode` is optional and defaults to `nightly`.
-  - Supported modes: `nightly` (release wheels are broken with CUDA 13 currently).
+  - Supported modes: `nightly` (release wheels are broken with CUDA 13 currently). UPDATE: `release` also works now.
 
 ### 2025-12-19
 Updated `build-and-copy.sh` to support copying to multiple hosts (thanks @ericlewis for the contribution).
diff --git a/build-and-copy.sh b/build-and-copy.sh
index 01b6d16..fbcbd12 100755
--- a/build-and-copy.sh
+++ b/build-and-copy.sh
@@ -155,11 +155,7 @@ if [ "$NO_BUILD" = false ]; then
         echo "Using pre-built vLLM wheels (mode: $USE_WHEELS_MODE)"
         CMD+=("-f" "Dockerfile.wheels")
         if [ "$USE_WHEELS_MODE" = "release" ]; then
-            echo "Release wheels are currently broken with CUDA 13, use nightly instead."
-            exit 1
-            CMD+=("--build-arg" "VLLM_WHEELS_URL=https://wheels.vllm.ai/cu130")
-        else
-            CMD+=("--build-arg" "VLLM_WHEELS_URL=https://wheels.vllm.ai/nightly/cu130")
+            CMD+=("--build-arg" "WHEELS_FROM_GITHUB_RELEASE=1")
         fi
     else
         echo "Building vLLM from source"
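With this change, `release` mode flows through the helper script to the new build argument. Typical usage — a sketch assuming the script's existing `--use-wheels` flag; the `-t` image tag in the direct invocation is illustrative:

    # Via the helper script: "release" now maps to WHEELS_FROM_GITHUB_RELEASE=1
    ./build-and-copy.sh --use-wheels release

    # Equivalent direct build (tag name is illustrative)
    docker build -f Dockerfile.wheels \
        --build-arg WHEELS_FROM_GITHUB_RELEASE=1 \
        -t vllm-dgx-spark:release .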