Supporting other CUDA archs via --gpu-arch flag
This commit is contained in:
14
Dockerfile
14
Dockerfile
@@ -55,7 +55,8 @@ ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache
|
|||||||
WORKDIR $VLLM_BASE_DIR
|
WORKDIR $VLLM_BASE_DIR
|
||||||
|
|
||||||
# 2. Set Environment Variables
|
# 2. Set Environment Variables
|
||||||
ENV TORCH_CUDA_ARCH_LIST=12.1a
|
ARG TORCH_CUDA_ARCH_LIST="12.1a"
|
||||||
|
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
|
||||||
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
||||||
|
|
||||||
# =========================================================
|
# =========================================================
|
||||||
@@ -90,7 +91,8 @@ FROM base AS builder
|
|||||||
|
|
||||||
# ======= FlashInfer Build ==========
|
# ======= FlashInfer Build ==========
|
||||||
|
|
||||||
ENV FLASHINFER_CUDA_ARCH_LIST="12.1a"
|
ARG FLASHINFER_CUDA_ARCH_LIST="12.1a"
|
||||||
|
ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST}
|
||||||
WORKDIR $VLLM_BASE_DIR
|
WORKDIR $VLLM_BASE_DIR
|
||||||
ARG FLASHINFER_REF=main
|
ARG FLASHINFER_REF=main
|
||||||
|
|
||||||
@@ -286,9 +288,11 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
|||||||
uv pip install -U transformers --pre; \
|
uv pip install -U transformers --pre; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Setup Env for Runtime
|
# Setup environment for runtime
|
||||||
ENV TORCH_CUDA_ARCH_LIST=12.1a
|
ARG TORCH_CUDA_ARCH_LIST="12.1a"
|
||||||
ENV FLASHINFER_CUDA_ARCH_LIST="12.1a"
|
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
|
||||||
|
ARG FLASHINFER_CUDA_ARCH_LIST="12.1a"
|
||||||
|
ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST}
|
||||||
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
||||||
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
|
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
|
||||||
ENV PATH=$VLLM_BASE_DIR:$PATH
|
ENV PATH=$VLLM_BASE_DIR:$PATH
|
||||||
|
|||||||
@@ -67,7 +67,8 @@ ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache
|
|||||||
WORKDIR $VLLM_BASE_DIR
|
WORKDIR $VLLM_BASE_DIR
|
||||||
|
|
||||||
# 2. Set Environment Variables
|
# 2. Set Environment Variables
|
||||||
ENV TORCH_CUDA_ARCH_LIST="12.1a"
|
ARG TORCH_CUDA_ARCH_LIST="12.1a"
|
||||||
|
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
|
||||||
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
||||||
|
|
||||||
# --- CACHE BUSTER ---
|
# --- CACHE BUSTER ---
|
||||||
@@ -88,7 +89,8 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
|||||||
# =========================================================
|
# =========================================================
|
||||||
FROM base AS builder
|
FROM base AS builder
|
||||||
|
|
||||||
ENV FLASHINFER_CUDA_ARCH_LIST="12.1a"
|
ARG FLASHINFER_CUDA_ARCH_LIST="12.1a"
|
||||||
|
ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST}
|
||||||
|
|
||||||
WORKDIR $VLLM_BASE_DIR
|
WORKDIR $VLLM_BASE_DIR
|
||||||
|
|
||||||
@@ -260,9 +262,11 @@ RUN --mount=type=bind,from=builder,source=/workspace/wheels,target=/mount/wheels
|
|||||||
--mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
--mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
||||||
uv pip install /mount/wheels/*.whl
|
uv pip install /mount/wheels/*.whl
|
||||||
|
|
||||||
# Setup Env for Runtime
|
# Setup environment vars for runtime
|
||||||
ENV TORCH_CUDA_ARCH_LIST="12.1a"
|
ARG TORCH_CUDA_ARCH_LIST="12.1a"
|
||||||
ENV FLASHINFER_CUDA_ARCH_LIST="12.1a"
|
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
|
||||||
|
ARG FLASHINFER_CUDA_ARCH_LIST="12.1a"
|
||||||
|
ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST}
|
||||||
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
||||||
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
|
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
|
||||||
ENV PATH=$VLLM_BASE_DIR:$PATH
|
ENV PATH=$VLLM_BASE_DIR:$PATH
|
||||||
|
|||||||
@@ -90,7 +90,10 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Setup Env for Runtime
|
# Setup Env for Runtime
|
||||||
ENV TORCH_CUDA_ARCH_LIST=12.1a
|
ARG TORCH_CUDA_ARCH_LIST="12.1a"
|
||||||
|
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
|
||||||
|
ARG FLASHINFER_CUDA_ARCH_LIST="12.1a"
|
||||||
|
ENV FLASHINFER_CUDA_ARCH_LIST=${FLASHINFER_CUDA_ARCH_LIST}
|
||||||
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
|
||||||
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
|
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
|
||||||
|
|
||||||
|
|||||||
20
README.md
20
README.md
@@ -164,6 +164,12 @@ Don't do it every time you rebuild, because it will slow down compilation times.
|
|||||||
|
|
||||||
For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
|
For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
|
||||||
|
|
||||||
|
### 2026-02-11
|
||||||
|
|
||||||
|
#### Configurable GPU Architecture
|
||||||
|
|
||||||
|
Added a `--gpu-arch <arch>` flag to `build-and-copy.sh`. It lets you specify the target GPU architecture (e.g., `12.0f`) at build time instead of relying on the hardcoded `12.1a`. The flag sets both the `TORCH_CUDA_ARCH_LIST` and `FLASHINFER_CUDA_ARCH_LIST` build arguments.
|
||||||
|
|
||||||
### 2026-02-10
|
### 2026-02-10
|
||||||
|
|
||||||
#### Cache Directory Mounting
|
#### Cache Directory Mounting
|
||||||
@@ -525,8 +531,10 @@ Using a provided build script is recommended, but if you want to build using `do
|
|||||||
| :--- | :--- | :--- |
|
| :--- | :--- | :--- |
|
||||||
| `CACHEBUST_DEPS` | `1` | Change this to force a re-download of PyTorch, FlashInfer, and system dependencies. |
|
| `CACHEBUST_DEPS` | `1` | Change this to force a re-download of PyTorch, FlashInfer, and system dependencies. |
|
||||||
| `CACHEBUST_VLLM` | `1` | Change this to force a fresh git clone and rebuild of vLLM source code. |
|
| `CACHEBUST_VLLM` | `1` | Change this to force a fresh git clone and rebuild of vLLM source code. |
|
||||||
| `TRITON_REF` | `v3.5.1` | Triton commit SHA, branch, or tag to build. |
|
| `TRITON_REF` | `v3.6.0` | Triton commit SHA, branch, or tag to build (currently ignored). |
|
||||||
| `VLLM_REF` | `main` | vLLM commit SHA, branch, or tag to build. |
|
| `VLLM_REF` | `main` | vLLM commit SHA, branch, or tag to build. |
|
||||||
|
| `TORCH_CUDA_ARCH_LIST` | `12.1a` | Target GPU architecture list for PyTorch. |
|
||||||
|
| `FLASHINFER_CUDA_ARCH_LIST` | `12.1a` | Target GPU architecture list for FlashInfer. |
|
||||||
| `BUILD_JOBS` | `16` | Number of parallel build jobs (default: 16). |
|
| `BUILD_JOBS` | `16` | Number of parallel build jobs (default: 16). |
|
||||||
| `FLASHINFER_PRE` | `""` | Set to `--pre` to use pre-release versions of FlashInfer. |
|
| `FLASHINFER_PRE` | `""` | Set to `--pre` to use pre-release versions of FlashInfer. |
|
||||||
| `PRE_TRANSFORMERS` | `0` | Set to `1` to install pre-release transformers (5.0.0rc or higher). |
|
| `PRE_TRANSFORMERS` | `0` | Set to `1` to install pre-release transformers (5.0.0rc or higher). |
|
||||||
@@ -548,6 +556,7 @@ Supported build arguments for `Dockerfile.wheels`:
|
|||||||
| `WHEELS_FROM_GITHUB_RELEASE` | `0` | Set to `1` to use GitHub release wheels instead of nightly wheels. |
|
| `WHEELS_FROM_GITHUB_RELEASE` | `0` | Set to `1` to use GitHub release wheels instead of nightly wheels. |
|
||||||
| `FLASHINFER_PRE` | `""` | Set to `--pre` to use pre-release versions of FlashInfer. |
|
| `FLASHINFER_PRE` | `""` | Set to `--pre` to use pre-release versions of FlashInfer. |
|
||||||
| `PRE_TRANSFORMERS` | `0` | Set to `1` to install pre-release transformers (5.0.0rc or higher). |
|
| `PRE_TRANSFORMERS` | `0` | Set to `1` to install pre-release transformers (5.0.0rc or higher). |
|
||||||
|
| `TORCH_CUDA_ARCH_LIST` | `12.1a` | Target GPU architecture list. |
|
||||||
|
|
||||||
### Using the Build Script (Recommended)
|
### Using the Build Script (Recommended)
|
||||||
|
|
||||||
@@ -622,6 +631,11 @@ Using a different username:
|
|||||||
```bash
|
```bash
|
||||||
./build-and-copy.sh --triton-ref abc123def456
|
./build-and-copy.sh --triton-ref abc123def456
|
||||||
```
|
```
|
||||||
|
**Build for specific GPU architecture:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./build-and-copy.sh --gpu-arch 12.0f
|
||||||
|
```
|
||||||
|
|
||||||
**Copy existing image without rebuilding:**
|
**Copy existing image without rebuilding:**
|
||||||
|
|
||||||
@@ -634,6 +648,8 @@ Using a different username:
|
|||||||
| Flag | Description |
|
| Flag | Description |
|
||||||
| :--- | :--- |
|
| :--- | :--- |
|
||||||
| `-t, --tag <tag>` | Image tag (default: 'vllm-node') |
|
| `-t, --tag <tag>` | Image tag (default: 'vllm-node') |
|
||||||
|
| `--gpu-arch <arch>` | Target GPU architecture (default: '12.1a') |
|
||||||
|
|
||||||
| `--rebuild-deps` | Force rebuild all dependencies (sets CACHEBUST_DEPS) |
|
| `--rebuild-deps` | Force rebuild all dependencies (sets CACHEBUST_DEPS) |
|
||||||
| `--rebuild-vllm` | Force rebuild vLLM source only (sets CACHEBUST_VLLM) |
|
| `--rebuild-vllm` | Force rebuild vLLM source only (sets CACHEBUST_VLLM) |
|
||||||
| `--triton-ref <ref>` | Triton commit SHA, branch or tag (default: 'v3.5.1') |
|
| `--triton-ref <ref>` | Triton commit SHA, branch or tag (default: 'v3.5.1') |
|
||||||
@@ -1041,4 +1057,4 @@ The `hf-download.sh` script provides a convenient way to download models from Hu
|
|||||||
|
|
||||||
### Hardware Architecture
|
### Hardware Architecture
|
||||||
|
|
||||||
**Note:** The Dockerfile defaults to `TORCH_CUDA_ARCH_LIST=12.1a` (NVIDIA GB10). If you are using different hardware, update the `ENV` variable in the Dockerfile before building.
|
**Note:** This project targets the `12.1a` architecture (NVIDIA GB10 / DGX Spark) by default. If you are using different hardware, pass the `--gpu-arch` flag to `./build-and-copy.sh` to build for your architecture.
|
||||||
|
|||||||
@@ -60,11 +60,13 @@ copy_to_host() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
BUILD_JOBS="16"
|
BUILD_JOBS="16"
|
||||||
|
GPU_ARCH_LIST="12.1a"
|
||||||
|
|
||||||
# Help function
|
# Help function
|
||||||
usage() {
|
usage() {
|
||||||
echo "Usage: $0 [OPTIONS]"
|
echo "Usage: $0 [OPTIONS]"
|
||||||
echo " -t, --tag <tag> : Image tag (default: 'vllm-node')"
|
echo " -t, --tag <tag> : Image tag (default: 'vllm-node')"
|
||||||
|
echo " --gpu-arch <arch> : GPU architecture (default: '12.1a')"
|
||||||
echo " --rebuild-deps : Set cache bust for dependencies"
|
echo " --rebuild-deps : Set cache bust for dependencies"
|
||||||
echo " --rebuild-vllm : Set cache bust for vllm"
|
echo " --rebuild-vllm : Set cache bust for vllm"
|
||||||
echo " --triton-ref <ref> : Triton commit SHA, branch or tag (default: 'v3.5.1')"
|
echo " --triton-ref <ref> : Triton commit SHA, branch or tag (default: 'v3.5.1')"
|
||||||
@@ -88,6 +90,7 @@ usage() {
|
|||||||
while [[ "$#" -gt 0 ]]; do
|
while [[ "$#" -gt 0 ]]; do
|
||||||
case $1 in
|
case $1 in
|
||||||
-t|--tag) IMAGE_TAG="$2"; shift ;;
|
-t|--tag) IMAGE_TAG="$2"; shift ;;
|
||||||
|
--gpu-arch) GPU_ARCH_LIST="$2"; shift ;;
|
||||||
--rebuild-deps) REBUILD_DEPS=true ;;
|
--rebuild-deps) REBUILD_DEPS=true ;;
|
||||||
--rebuild-vllm) REBUILD_VLLM=true ;;
|
--rebuild-vllm) REBUILD_VLLM=true ;;
|
||||||
--triton-ref) TRITON_REF="$2"; TRITON_REF_SET=true; shift ;;
|
--triton-ref) TRITON_REF="$2"; TRITON_REF_SET=true; shift ;;
|
||||||
@@ -227,6 +230,10 @@ if [ "$NO_BUILD" = false ]; then
|
|||||||
# Add BUILD_JOBS to build arguments
|
# Add BUILD_JOBS to build arguments
|
||||||
CMD+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS")
|
CMD+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS")
|
||||||
|
|
||||||
|
# Add GPU architecture to build arguments
|
||||||
|
CMD+=("--build-arg" "TORCH_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
|
||||||
|
CMD+=("--build-arg" "FLASHINFER_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
|
||||||
|
|
||||||
if [ "$PRE_FLASHINFER" = true ]; then
|
if [ "$PRE_FLASHINFER" = true ]; then
|
||||||
echo "Using pre-release FlashInfer..."
|
echo "Using pre-release FlashInfer..."
|
||||||
CMD+=("--build-arg" "FLASHINFER_PRE=--pre")
|
CMD+=("--build-arg" "FLASHINFER_PRE=--pre")
|
||||||
|
|||||||
Reference in New Issue
Block a user