From 3b1e49dcb039f2c28357f29e88a93f4b3c4385cf Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Wed, 11 Feb 2026 13:10:41 -0800 Subject: [PATCH] Supporting other CUDA archs via `--gpu-arch` flag --- Dockerfile | 14 +++++++++----- Dockerfile.mxfp4 | 14 +++++++++----- Dockerfile.wheels | 5 ++++- README.md | 20 ++++++++++++++++++-- build-and-copy.sh | 7 +++++++ 5 files changed, 47 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 27e5bdc..397565d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -55,7 +55,8 @@ ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache WORKDIR $VLLM_BASE_DIR # 2. Set Environment Variables -ENV TORCH_CUDA_ARCH_LIST=12.1a +ARG TORCH_CUDA_ARCH_LIST="12.1a" +ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas # ========================================================= @@ -90,7 +91,8 @@ FROM base AS builder # ======= FlashInfer Build ========== -ENV FLASHINFER_CUDA_ARCH_LIST="12.1a" +ARG FLASHINFER_CUDA_ARCH_LIST="12.1a" +ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST} WORKDIR $VLLM_BASE_DIR ARG FLASHINFER_REF=main @@ -286,9 +288,11 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ uv pip install -U transformers --pre; \ fi -# Setup Env for Runtime -ENV TORCH_CUDA_ARCH_LIST=12.1a -ENV FLASHINFER_CUDA_ARCH_LIST="12.1a" +# Setup environment for runtime +ARG TORCH_CUDA_ARCH_LIST="12.1a" +ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} +ARG FLASHINFER_CUDA_ARCH_LIST="12.1a" +ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST} ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings ENV PATH=$VLLM_BASE_DIR:$PATH diff --git a/Dockerfile.mxfp4 b/Dockerfile.mxfp4 index ed7d6d9..a83dc86 100644 --- a/Dockerfile.mxfp4 +++ b/Dockerfile.mxfp4 @@ -67,7 +67,8 @@ ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache WORKDIR $VLLM_BASE_DIR # 2. 
Set Environment Variables -ENV TORCH_CUDA_ARCH_LIST="12.1a" +ARG TORCH_CUDA_ARCH_LIST="12.1a" +ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas # --- CACHE BUSTER --- @@ -88,7 +89,8 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ # ========================================================= FROM base AS builder -ENV FLASHINFER_CUDA_ARCH_LIST="12.1a" +ARG FLASHINFER_CUDA_ARCH_LIST="12.1a" +ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST} WORKDIR $VLLM_BASE_DIR @@ -260,9 +262,11 @@ RUN --mount=type=bind,from=builder,source=/workspace/wheels,target=/mount/wheels --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ uv pip install /mount/wheels/*.whl -# Setup Env for Runtime -ENV TORCH_CUDA_ARCH_LIST="12.1a" -ENV FLASHINFER_CUDA_ARCH_LIST="12.1a" +# Setup environment vars for runtime +ARG TORCH_CUDA_ARCH_LIST="12.1a" +ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} +ARG FLASHINFER_CUDA_ARCH_LIST="12.1a" +ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST} ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings ENV PATH=$VLLM_BASE_DIR:$PATH diff --git a/Dockerfile.wheels b/Dockerfile.wheels index ab48c1f..ed72845 100644 --- a/Dockerfile.wheels +++ b/Dockerfile.wheels @@ -90,7 +90,10 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ fi # Setup Env for Runtime -ENV TORCH_CUDA_ARCH_LIST=12.1a +ARG TORCH_CUDA_ARCH_LIST="12.1a" +ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} +ARG FLASHINFER_CUDA_ARCH_LIST="12.1a" +ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST} ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings diff --git a/README.md b/README.md index f6eb5f3..80d463b 100644 --- a/README.md +++ b/README.md @@ -164,6 +164,12 @@ Don't do it every time you rebuild, because it will slow down compilation times. 
 For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
 
+### 2026-02-11
+
+#### Configurable GPU Architecture
+
+Added `--gpu-arch <arch>` flag to `build-and-copy.sh`. This allows specifying the target GPU architecture (e.g., `12.0f`) during the build process, instead of being hardcoded to `12.1a`. This argument controls both `TORCH_CUDA_ARCH_LIST` and `FLASHINFER_CUDA_ARCH_LIST` build arguments.
+
 ### 2026-02-10
 
 #### Cache Directory Mounting
 
@@ -525,8 +531,10 @@ Using a provided build script is recommended, but if you want to build using `do
 | :--- | :--- | :--- |
 | `CACHEBUST_DEPS` | `1` | Change this to force a re-download of PyTorch, FlashInfer, and system dependencies. |
 | `CACHEBUST_VLLM` | `1` | Change this to force a fresh git clone and rebuild of vLLM source code. |
-| `TRITON_REF` | `v3.5.1` | Triton commit SHA, branch, or tag to build. |
+| `TRITON_REF` | `v3.6.0` | Triton commit SHA, branch, or tag to build - currently ignored. |
 | `VLLM_REF` | `main` | vLLM commit SHA, branch, or tag to build. |
+| `TORCH_CUDA_ARCH_LIST` | `12.1a` | Target GPU architecture list for PyTorch. |
+| `FLASHINFER_CUDA_ARCH_LIST` | `12.1a` | Target GPU architecture list for FlashInfer. |
 | `BUILD_JOBS` | `16` | Number of parallel build jobs (default: 16). |
 | `FLASHINFER_PRE` | `""` | Set to `--pre` to use pre-release versions of FlashInfer. |
 | `PRE_TRANSFORMERS` | `0` | Set to `1` to install pre-release transformers (5.0.0rc or higher). |
@@ -548,6 +556,7 @@ Supported build arguments for `Dockerfile.wheels`:
 | `WHEELS_FROM_GITHUB_RELEASE` | `0` | Set to `1` to use GitHub release wheels instead of nightly wheels. |
 | `FLASHINFER_PRE` | `""` | Set to `--pre` to use pre-release versions of FlashInfer. |
 | `PRE_TRANSFORMERS` | `0` | Set to `1` to install pre-release transformers (5.0.0rc or higher). |
+| `TORCH_CUDA_ARCH_LIST` | `12.1a` | Target GPU architecture list. |
 
 ### Using the Build Script (Recommended)
 
@@ -622,6 +631,11 @@ Using a different username:
 ```bash
 ./build-and-copy.sh --triton-ref abc123def456
 ```
+**Build for specific GPU architecture:**
+
+```bash
+./build-and-copy.sh --gpu-arch 12.0f
+```
 
 **Copy existing image without rebuilding:**
 
@@ -634,6 +648,7 @@ Using a different username:
 | Flag | Description |
 | :--- | :--- |
 | `-t, --tag <tag>` | Image tag (default: 'vllm-node') |
+| `--gpu-arch <arch>` | Target GPU architecture (default: '12.1a') |
 | `--rebuild-deps` | Force rebuild all dependencies (sets CACHEBUST_DEPS) |
 | `--rebuild-vllm` | Force rebuild vLLM source only (sets CACHEBUST_VLLM) |
 | `--triton-ref <ref>` | Triton commit SHA, branch or tag (default: 'v3.5.1') |
@@ -1041,4 +1056,4 @@ The `hf-download.sh` script provides a convenient way to download models from Hu
 
 ### Hardware Architecture
 
-**Note:** The Dockerfile defaults to `TORCH_CUDA_ARCH_LIST=12.1a` (NVIDIA GB10). If you are using different hardware, update the `ENV` variable in the Dockerfile before building.
+**Note:** This project targets the `12.1a` architecture (NVIDIA GB10 / DGX Spark). If you are using different hardware, you can use the `--gpu-arch` flag in `./build-and-copy.sh`.
diff --git a/build-and-copy.sh b/build-and-copy.sh index 00456f3..aee2b2f 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -60,11 +60,13 @@ copy_to_host() { fi } BUILD_JOBS="16" +GPU_ARCH_LIST="12.1a" # Help function usage() { echo "Usage: $0 [OPTIONS]" echo " -t, --tag : Image tag (default: 'vllm-node')" + echo " --gpu-arch : GPU architecture (default: '12.1a')" echo " --rebuild-deps : Set cache bust for dependencies" echo " --rebuild-vllm : Set cache bust for vllm" echo " --triton-ref : Triton commit SHA, branch or tag (default: 'v3.5.1')" @@ -88,6 +90,7 @@ usage() { while [[ "$#" -gt 0 ]]; do case $1 in -t|--tag) IMAGE_TAG="$2"; shift ;; + --gpu-arch) GPU_ARCH_LIST="$2"; shift ;; --rebuild-deps) REBUILD_DEPS=true ;; --rebuild-vllm) REBUILD_VLLM=true ;; --triton-ref) TRITON_REF="$2"; TRITON_REF_SET=true; shift ;; @@ -227,6 +230,10 @@ if [ "$NO_BUILD" = false ]; then # Add BUILD_JOBS to build arguments CMD+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS") + # Add GPU architecture to build arguments + CMD+=("--build-arg" "TORCH_CUDA_ARCH_LIST=$GPU_ARCH_LIST") + CMD+=("--build-arg" "FLASHINFER_CUDA_ARCH_LIST=$GPU_ARCH_LIST") + if [ "$PRE_FLASHINFER" = true ]; then echo "Using pre-release FlashInfer..." CMD+=("--build-arg" "FLASHINFER_PRE=--pre")