Compare commits: `staging-cu...prebuilt-f` (179 commits)
| SHA1 |
| :--- |
| ba9dde963f |
| ae8ac815ac |
| 83a680c87b |
| 69ea62294f |
| 8e548ce664 |
| bca64f9a53 |
| 29d5904b80 |
| b87854fd4c |
| c67c5b5c1e |
| 9fbed882bc |
| 97e51d5d23 |
| 87cb9f6e1e |
| e3243bf555 |
| 43a00ed90f |
| ef9b0e50f4 |
| c1e952de2e |
| b13a3600d3 |
| 7dea11bbf0 |
| c187912e23 |
| caa28c8e12 |
| 5415c1fe9e |
| d49fac1b8b |
| 6b7f8dace6 |
| 76fbf0d0be |
| b7830469be |
| b50fa426c8 |
| 2c13e1ce25 |
| c026c92bd0 |
| cf4cb35356 |
| 1ad85442ac |
| 30919581ee |
| b7c8616743 |
| 8e8e850ef1 |
| fc08740fba |
| 288da8e911 |
| 7bc4e4ce5e |
| 49d6d9fefd |
| 4afca860a5 |
| ed32612cdd |
| 44808f7018 |
| 12caec228e |
| 27eb35f08d |
| 3335540972 |
| ae25d64ac0 |
| a770865834 |
| 7b47235463 |
| 3a3ab98b3e |
| 23fb7dcc20 |
| c4860b86a2 |
| 044557943c |
| ead749239d |
| a889fed254 |
| e89104d91b |
| 15a04ada32 |
| a467a7a0bd |
| 48318380f9 |
| 287d3c72e5 |
| 9370b2bb34 |
| bb177383ff |
| 7f0be29fcc |
| 41c0ce2c9a |
| 45494688d1 |
| a3201f8873 |
| e471ca2436 |
| 32674c2619 |
| 47f5f931b5 |
| d37217bad0 |
| e70c87b4f6 |
| c1a6cec074 |
| 51d69c5c17 |
| e7f2ee692f |
| 101ae6fd56 |
| f4ca15ce18 |
| 3d918e0b82 |
| 47a896d722 |
| 0fa585f909 |
| cecec74828 |
| c8ee2a2511 |
| ce293b5f05 |
| f872cc17a8 |
| 00c16746e5 |
| f163ca69de |
| a78e221de3 |
| e6ee108cdf |
| 174de6f0a8 |
| 83a74bccec |
| ff18a9ad5b |
| c08b34a218 |
| 23cca2a11a |
| c2fe579ccc |
| 8b7c02aa25 |
| 73fec1bdf8 |
| 2f5ff0211e |
| 63ee72e729 |
| 4a0feea6c3 |
| 429042b7dc |
| ef95336937 |
| b8930b05a1 |
| 49d505ad14 |
| 1755dfd114 |
| 3d4dc4c82e |
| 07fac71dac |
| 1702f47df6 |
| ad2cd3373f |
| 1fd8c7afc3 |
| 3dcd2a90c1 |
| efacbd69f2 |
| c4b078b868 |
| 3be2fb24a8 |
| 7fa69187df |
| 8298c3d7f8 |
| f8c2653fd3 |
| 990a7b3837 |
| 9e089acf2b |
| 2d749742e4 |
| 7a54657abf |
| 926dd57a87 |
| 6e8d85c914 |
| d6e76f8e2f |
| 8385506c5e |
| 8caebe3155 |
| 919a881cb1 |
| 8ddc259619 |
| 22f3fa6c21 |
| 15d295887c |
| 7e4150feed |
| 7b752c31c5 |
| bdd2b10f54 |
| 2755b62d12 |
| f327b92abe |
| 57b458570e |
| 57ed099465 |
| fb0687cd1b |
| ccea2ba861 |
| 957605498c |
| b1eeefc0eb |
| b879b7748f |
| fa645f3e4b |
| dedbd0a01d |
| caa83d9e5b |
| 4bcbbaa25a |
| d08266a123 |
| 03b055d7f0 |
| d609fecef3 |
| 7c198b1ceb |
| 8ae51192e5 |
| 8fec9bed06 |
| 6a323cc6f5 |
| 6f9a2f981c |
| 122edc8229 |
| 7ceea85647 |
| 45066e2b16 |
| f2cf11b047 |
| 3baca14eb1 |
| 66b5c85907 |
| 0019bdf5ed |
| 006734910c |
| e225c709fb |
| 63b2a8dbed |
| 9724619dbd |
| d42c4199fa |
| b9fc32ec34 |
| 9dc09bd04b |
| e88426646b |
| f95beba566 |
| eb8abcca7f |
| d148d95a19 |
| 5346372f14 |
| 5f8f988d91 |
| 3fabd3fb1c |
| 2d03bc138d |
| a749fcce87 |
| 505a060a7d |
| ca34ebcffc |
| 4303f8b6d0 |
| 2152ef127d |
| 50b3ca60f3 |
| 163f23d85b |
| e8f94d6b8b |
**.env.example** (new file, 38 additions)

```diff
@@ -0,0 +1,38 @@
+# Example .env configuration file for spark-vllm-docker
+# Copy this file to .env and customize for your environment
+
+# Cluster configuration
+# CLUSTER_NODES: Comma-separated list of node IPs (first node is the head node)
+CLUSTER_NODES="192.168.177.11,192.168.177.12"
+
+# ETH_IF: Ethernet interface name (optional, auto-detected if not specified)
+ETH_IF="enp1s0f1np1"
+
+# IB_IF: InfiniBand interface name (optional, auto-detected if not specified)
+IB_IF="rocep1s0f1,roceP2p1s0f1"
+
+# LOCAL_IP: Local IP address (optional, auto-detected if not specified)
+# Useful for solo mode or overriding auto-detection
+LOCAL_IP="192.168.177.11"
+
+# MASTER_PORT: Port for cluster coordination (default: 29501)
+MASTER_PORT="29501"
+
+# CONTAINER_NAME: Container name (default: vllm_node)
+# Note: This is a configuration variable, NOT passed as env var to container
+CONTAINER_NAME="vllm_node"
+
+# Container environment variables
+# Any variable starting with CONTAINER_ (except CONTAINER_NAME) will be converted to -e flags
+# Example: CONTAINER_NCCL_DEBUG=INFO becomes -e NCCL_DEBUG=INFO
+CONTAINER_NCCL_DEBUG="INFO"
+CONTAINER_HF_TOKEN="your_huggingface_token_here"
+CONTAINER_NCCL_IGNORE_CPU_AFFINITY="1"
+
+# COPY_HOSTS: Comma-separated list of hosts for build-and-copy.sh (optional)
+# Used by build-and-copy.sh to distribute images across cluster
+COPY_HOSTS="192.168.177.12"
+
+# Additional container environment variables
+# CONTAINER_MAX_JOBS="16"
+# CONTAINER_CUDA_VISIBLE_DEVICES="0,1"
```
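The `CONTAINER_` convention in the comments above is easy to replicate outside the launch scripts. Here is a minimal bash sketch of the described conversion; the loop is an illustration, not the scripts' actual code, and it assumes the image is tagged `vllm-node`:

```bash
#!/bin/bash
# Illustrative only: turn CONTAINER_* settings from .env into docker -e flags,
# skipping CONTAINER_NAME, which configures the script itself rather than the container.
set -a; source .env; set +a          # export everything defined in .env
env_flags=()
while IFS='=' read -r name value; do
  [[ "$name" == "CONTAINER_NAME" ]] && continue
  env_flags+=(-e "${name#CONTAINER_}=${value}")   # CONTAINER_NCCL_DEBUG -> NCCL_DEBUG
done < <(env | grep '^CONTAINER_')
docker run --rm --name "$CONTAINER_NAME" "${env_flags[@]}" vllm-node \
  env | grep NCCL_DEBUG               # should print NCCL_DEBUG=INFO
```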
**.gitignore** (vendored, 3 changes)

```diff
@@ -1 +1,2 @@
-.env
+.env
+build-metadata.yaml
```
**Dockerfile** (170 changes)

```diff
@@ -4,9 +4,9 @@
 ARG BUILD_JOBS=16
 
 # =========================================================
-# STAGE 1: Base Image (Installs Dependencies)
+# STAGE 1: Base Build Image
 # =========================================================
-FROM nvcr.io/nvidia/pytorch:26.01-py3 AS base
+FROM nvidia/cuda:13.2.0-devel-ubuntu24.04 AS base
 
 # Build parallelism
 ARG BUILD_JOBS
@@ -14,6 +14,8 @@ ENV MAX_JOBS=${BUILD_JOBS}
 ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
 ENV NINJAFLAGS="-j${BUILD_JOBS}"
 ENV MAKEFLAGS="-j${BUILD_JOBS}"
+ENV DG_JIT_USE_NVRTC=1
+ENV USE_CUDNN=1
 
 # Set non-interactive frontend to prevent apt prompts
 ENV DEBIAN_FRONTEND=noninteractive
@@ -27,6 +29,9 @@ ENV UV_CACHE_DIR=/root/.cache/uv
 ENV UV_SYSTEM_PYTHON=1
 ENV UV_BREAK_SYSTEM_PACKAGES=1
 ENV UV_LINK_MODE=copy
+# Set timeouts
+ENV UV_HTTP_TIMEOUT=600
+ENV UV_HTTP_RETRIES=10
 
 # Set the base directory environment variable
 ENV VLLM_BASE_DIR=/workspace/vllm
@@ -35,10 +40,18 @@ ENV VLLM_BASE_DIR=/workspace/vllm
 
 # Added ccache to enable incremental compilation caching
 RUN apt update && \
     apt install -y --no-install-recommends \
-    curl vim ninja-build git \
-    ccache \
+    curl vim cmake build-essential ninja-build \
+    libcudnn9-cuda-13 libcudnn9-dev-cuda-13 \
+    python3-dev python3-pip git wget \
+    libibverbs1 libibverbs-dev rdma-core \
+    ccache devscripts debhelper fakeroot \
     && rm -rf /var/lib/apt/lists/* \
-    && pip install uv && pip uninstall -y flash-attn
+    && pip install uv
+
+# Additional deps
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install torch==2.11.0 torchvision torchaudio triton --index-url https://download.pytorch.org/whl/cu130 && \
+    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" filelock pynvml requests tqdm
 
 # Configure Ccache for CUDA/C++
 ENV PATH=/usr/lib/ccache:$PATH
@@ -51,14 +64,19 @@ ENV CCACHE_COMPRESS=1
 ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
 ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache
 
-# Setup Workspace
-WORKDIR $VLLM_BASE_DIR
-
 # 2. Set Environment Variables
 ARG TORCH_CUDA_ARCH_LIST="12.1a"
 ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
+ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
 
+# Setup Workspace
+WORKDIR $VLLM_BASE_DIR
+
+# Build NCCL with mesh support (TODO: only do it if arch is 12.1) - artifacts will be in /workspace/nccl/build/pkg/deb
+RUN git clone -b dgxspark-3node-ring https://github.com/zyang-dev/nccl.git && \
+    cd nccl && make -j ${BUILD_JOBS} src.build NVCC_GENCODE="-gencode=arch=compute_121,code=sm_121" && \
+    make pkg.debian.build && apt install -y --no-install-recommends --allow-downgrades ./build/pkg/deb/*.deb
 
 # =========================================================
 # STAGE 2: FlashInfer Builder
 # =========================================================
@@ -73,8 +91,9 @@ ARG FLASHINFER_REF=main
 # Change this argument to force a re-download of FlashInfer
 ARG CACHEBUST_FLASHINFER=1
 
 # Additional deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
+    uv pip install packaging
 
 # Smart Git Clone (Fetch changes instead of full re-clone)
 RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
@@ -100,6 +119,31 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
 
 WORKDIR /workspace/flashinfer
 
+ARG FLASHINFER_PRS=""
+
+RUN if [ -n "$FLASHINFER_PRS" ]; then \
+    # Git requires a user identity to create merge commits
+    git config --global user.email "builder@example.com"; \
+    git config --global user.name "Docker Builder"; \
+    \
+    echo "Applying PRs: $FLASHINFER_PRS"; \
+    for pr in $FLASHINFER_PRS; do \
+        echo "Fetching and merging PR #$pr..."; \
+        git fetch origin pull/${pr}/head:pr-${pr}; \
+        git merge pr-${pr} --no-edit; \
+    done; \
+    fi
+
+# TEMPORARY patch for flashinfer autotune and other improvements (PR 2927) - MERGED 4/3
+# RUN curl -fsL https://github.com/flashinfer-ai/flashinfer/pull/2927.diff -o pr2927.diff \
+#     && if git apply --reverse --check pr2927.diff 2>/dev/null; then \
+#         echo "PR #2927 already applied, skipping."; \
+#     else \
+#         echo "Applying FI PR #2927..."; \
+#         git apply -v pr2927.diff; \
+#     fi \
+#     && rm pr2927.diff
+
 # Apply patch to avoid re-downloading existing cubins
 COPY flashinfer_cache.patch .
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
@@ -113,7 +157,9 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
     # flashinfer-jit-cache
     cd ../flashinfer-jit-cache && \
-    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
+    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
+    # dump git ref in the wheels dir
+    cd .. && git rev-parse HEAD > /workspace/wheels/.flashinfer-commit
 
 # =========================================================
 # STAGE 3: FlashInfer Wheel Export
@@ -130,9 +176,6 @@ ARG TORCH_CUDA_ARCH_LIST="12.1a"
 ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 WORKDIR $VLLM_BASE_DIR
 
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
-
 # --- VLLM SOURCE CACHE BUSTER ---
 ARG CACHEBUST_VLLM=1
 
@@ -166,20 +209,56 @@ WORKDIR $VLLM_BASE_DIR/vllm
 ARG VLLM_PRS=""
 
 RUN if [ -n "$VLLM_PRS" ]; then \
+    # Git requires a user identity to create merge commits
     git config --global user.email "builder@example.com"; \
     git config --global user.name "Docker Builder"; \
     \
     echo "Applying PRs: $VLLM_PRS"; \
     for pr in $VLLM_PRS; do \
-        echo "Fetching and applying PR #$pr..."; \
-        curl -fL "https://github.com/vllm-project/vllm/pull/${pr}.diff" | git apply -v; \
+        echo "Fetching and merging PR #$pr..."; \
+        git fetch origin pull/${pr}/head:pr-${pr}; \
+        git merge pr-${pr} --no-edit; \
     done; \
     fi
 
+# TEMPORARY PATCH for broken FP8 kernels - https://github.com/vllm-project/vllm/pull/35568
+RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/35568.diff -o pr35568.diff \
+    && if git apply --reverse --check pr35568.diff 2>/dev/null; then \
+        echo "PR 35568 already applied, skipping."; \
+    else \
+        echo "Applying PR 35568..."; \
+        git apply -v --exclude="tests/*" pr35568.diff; \
+    fi \
+    && rm pr35568.diff
+
+# TEMPORARY PATCH: revert vLLM PR #41524 / commit c51df430,
+# which disables FlashInfer autotune and regresses DGX Spark throughput.
+RUN set -eux; \
+    patch_commit="c51df43005726a09c6eb7348e8c1b00501c70a8e"; \
+    target="vllm/config/vllm.py"; \
+    marker="https://github.com/flashinfer-ai/flashinfer/issues/3197"; \
+    if grep -q "$marker" "$target"; then \
+        echo "PR #41524 regression found; reverting ${patch_commit}"; \
+        if ! git revert --no-commit "$patch_commit"; then \
+            git revert --abort 2>/dev/null || true; \
+            echo "ERROR: PR #41524 appears present but could not be reverted"; \
+            exit 1; \
+        fi; \
+        if grep -q "$marker" "$target"; then \
+            echo "ERROR: revert completed but PR #41524 marker is still present"; \
+            exit 1; \
+        fi; \
+    else \
+        echo "PR #41524 regression marker not present; skipping revert"; \
+    fi
+
 # Prepare build requirements
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     python3 use_existing_torch.py && \
     sed -i "/flashinfer/d" requirements/cuda.txt && \
-    sed -i '/^triton\b/d' requirements/test.txt && \
-    sed -i '/^fastsafetensors\b/d' requirements/test.txt && \
-    uv pip install -r requirements/build.txt
+    sed -i '/^triton\b/d' requirements/test/cuda.txt && \
+    sed -i '/^fastsafetensors\b/d' requirements/test/cuda.txt && \
+    uv pip install -r requirements/build/cuda.txt
 
 # Apply Patches
 # TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
@@ -190,13 +269,15 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
 #     patch -p1 < fastsafetensors.patch; \
 # fi
 # TEMPORARY PATCH for broken vLLM build (unguarded Hopper code) - reverting PR #34758 and #34302
-RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34758.diff | patch -p1 -R || echo "Cannot revert PR #34758, skipping"
-RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34302.diff | patch -p1 -R || echo "Cannot revert PR #34302, skipping"
+# RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34758.diff | patch -p1 -R || echo "Cannot revert PR #34758, skipping"
+# RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34302.diff | patch -p1 -R || echo "Cannot revert PR #34302, skipping"
 
 # Final Compilation
 RUN --mount=type=cache,id=ccache,target=/root/.ccache \
     --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
+    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
+    # dump git ref in the wheels dir
+    git rev-parse HEAD > /workspace/wheels/.vllm-commit
 
 # =========================================================
 # STAGE 5: vLLM Wheel Export
@@ -207,7 +288,7 @@ COPY --from=vllm-builder /workspace/wheels /
 # =========================================================
 # STAGE 6: Runner (Installs wheels from host ./wheels/)
 # =========================================================
-FROM nvcr.io/nvidia/pytorch:26.01-py3 AS runner
+FROM nvidia/cuda:13.2.0-devel-ubuntu24.04 AS runner
 
 # Transferring build settings from build image because of ptxas/jit compilation during vLLM startup
 # Build parallelism
@@ -216,6 +297,8 @@ ENV MAX_JOBS=${BUILD_JOBS}
 ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
 ENV NINJAFLAGS="-j${BUILD_JOBS}"
 ENV MAKEFLAGS="-j${BUILD_JOBS}"
+ENV DG_JIT_USE_NVRTC=1
+ENV USE_CUDNN=1
 
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PIP_BREAK_SYSTEM_PACKAGES=1
@@ -228,13 +311,18 @@ ENV UV_SYSTEM_PYTHON=1
 ENV UV_BREAK_SYSTEM_PACKAGES=1
 ENV UV_LINK_MODE=copy
 
+# Mount additional packages from base builder image
 # Install runtime dependencies
-RUN apt update && \
+RUN --mount=type=bind,from=base,source=/workspace/vllm/nccl/build/pkg/deb,target=/workspace/nccl-pkg \
+    apt update && \
     apt install -y --no-install-recommends \
-    curl vim git \
+    python3 python3-pip python3-dev vim curl git wget \
+    libcudnn9-cuda-13 \
+    libibverbs1 libibverbs-dev rdma-core \
     libxcb1 \
+    && cd /workspace/nccl-pkg && apt install -y --no-install-recommends --allow-downgrades ./*.deb \
     && rm -rf /var/lib/apt/lists/* \
-    && pip install uv && pip uninstall -y flash-attn # triton-kernels pytorch-triton
+    && pip install uv
 
 # Set final working directory
 WORKDIR $VLLM_BASE_DIR
@@ -246,6 +334,11 @@ RUN mkdir -p tiktoken_encodings && \
 
 ARG PRE_TRANSFORMERS=0
 
+# Install deps
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install torch==2.11.0 torchvision torchaudio triton --index-url https://download.pytorch.org/whl/cu130 && \
+    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
+
 # Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
 # With --tf5: override vLLM's transformers<5 constraint to get transformers>=5
 RUN --mount=type=bind,source=wheels,target=/workspace/wheels \
@@ -266,27 +359,14 @@ ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
 ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
 ENV PATH=$VLLM_BASE_DIR:$PATH
 
-# Copy scripts
-COPY run-cluster-node.sh $VLLM_BASE_DIR/
-RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh
-
 # Final extra deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install ray[default] fastsafetensors nvidia-nvshmem-cu13
+    uv pip install ray[default] fastsafetensors instanttensor
 
-# Cleanup
-
-# Keeping it here for reference - this won't work as is without squashing layers
-# RUN uv pip uninstall absl-py apex argon2-cffi \
-#     argon2-cffi-bindings arrow asttokens astunparse async-lru audioread babel beautifulsoup4 \
-#     black bleach comm contourpy cycler datasets debugpy decorator defusedxml dllist dm-tree \
-#     execnet executing expecttest fastjsonschema fonttools fqdn gast hypothesis \
-#     ipykernel ipython ipython_pygments_lexers isoduration isort jedi joblib jupyter-events \
-#     jupyter-lsp jupyter_client jupyter_core jupyter_server jupyter_server_terminals jupyterlab \
-#     jupyterlab_code_formatter jupyterlab_code_formatter jupyterlab_pygments jupyterlab_server \
-#     jupyterlab_tensorboard_pro jupytext kiwisolver matplotlib matplotlib-inline matplotlib-inline \
-#     mistune ml_dtypes mock nbclient nbconvert nbformat nest-asyncio notebook notebook_shim \
-#     opt_einsum optree outlines_core overrides pandas pandocfilters parso pexpect polygraphy pooch \
-#     pyarrow pycocotools pytest-flakefinder pytest-rerunfailures pytest-shard pytest-xdist \
-#     scikit-learn scipy Send2Trash soundfile soupsieve soxr spin stack-data \
-#     wcwidth webcolors xdoctest Werkzeug
+# Fix NCCL
+RUN rm /usr/local/lib/python3.12/dist-packages/nvidia/nccl/lib/libnccl.so.2 && \
+    ln -s /usr/lib/aarch64-linux-gnu/libnccl.so.2 /usr/local/lib/python3.12/dist-packages/nvidia/nccl/lib/libnccl.so.2
 
+# Build metadata (generated by build-and-copy.sh)
+COPY build-metadata.yaml /workspace/build-metadata.yaml
@@ -98,10 +98,10 @@ ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git
 ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git
 
 ARG FLASHINFER_SHA=f349e52496a72a00d8c4ac02c7a1e38523ff7194
-ARG CUTLASS_SHA=c7516ad20f3d022fdbc93e9468643bf3b577e02c
+ARG CUTLASS_SHA=fede53000a962b46e05bafe0c86311778caeb380
 
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
+    uv pip install "nvidia-nvshmem-cu13<3.6" "apache-tvm-ffi<0.2"
 
 # Clone FlashInfer (cached for faster rebuilds)
 RUN --mount=type=cache,id=git-flashinfer,target=/git-cache/flashinfer \
@@ -270,13 +270,12 @@ ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
 ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
 ENV PATH=$VLLM_BASE_DIR:$PATH
 
-# Copy scripts
-COPY run-cluster-node.sh $VLLM_BASE_DIR/
-RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh
-
 # Final extra deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install ray[default] fastsafetensors nvidia-nvshmem-cu13
+    uv pip install ray[default] fastsafetensors "nvidia-nvshmem-cu13<3.6"
 
 # Build metadata (generated by build-and-copy.sh)
 COPY build-metadata.yaml /workspace/build-metadata.yaml
 
 # If not compiling Triton
 # remove triton-kernels as they are not compatible with this vLLM version yet
```
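For reference, the build arguments surfaced in this Dockerfile can also be supplied by hand. A hedged sketch follows; normally `build-and-copy.sh` assembles this command, the flag values here are illustrative, and it assumes the build context already contains the `wheels/` directory and `build-metadata.yaml` that the script prepares:

```bash
# Illustrative direct build; prefer ./build-and-copy.sh in practice.
docker build \
  --build-arg TORCH_CUDA_ARCH_LIST="12.1a" \
  --build-arg BUILD_JOBS=16 \
  --build-arg VLLM_PRS="35568" \
  --build-arg CACHEBUST_VLLM="$(date +%s)" \
  -t vllm-node .
```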
**README.md** (471 changes)

````diff
@@ -2,6 +2,7 @@
 # vLLM Docker Optimized for DGX Spark (single or multi-node)
 
 This repository contains the Docker configuration and startup scripts to run a multi-node vLLM inference cluster using Ray. It supports InfiniBand/RDMA (NCCL) and custom environment configuration for high-performance setups.
+Cluster setup supports direct connection between dual Sparks, connection via a QSFP/RoCE switch, and a 3-node mesh configuration.
 
 While it was primarily developed to support multi-node inference, it works just as well on single-node setups.
@@ -26,7 +27,12 @@
 
 This repository is not affiliated with NVIDIA or their subsidiaries. This is a community effort aimed at helping DGX Spark users set up and run the most recent versions of vLLM on Spark clusters or single nodes.
 
-The Dockerfile builds from the main branch of vLLM, so depending on when you run the build process, it may not be in a fully functioning state. You can target a specific vLLM release by setting the `--vllm-ref` parameter.
+Unless `--rebuild-vllm`, `--vllm-ref`, or `--apply-vllm-pr` is specified, the builder will fetch the latest precompiled vLLM wheels from the repository. They are built nightly and tested on multiple models in both cluster and solo configurations before publishing.
+We will expand the selection of models we test in the pipeline, but since vLLM is a rapidly developing platform, some things may break.
+
+If you want to build the latest from the main branch, you can specify the `--rebuild-vllm` flag. Or you can target a specific vLLM release by setting the `--vllm-ref` parameter.
+
+Similarly, `--rebuild-flashinfer`, `--flashinfer-ref`, and `--apply-flashinfer-pr` control the FlashInfer build in the same way.
 
 ## QUICK START
@@ -49,8 +55,8 @@
 
 **On DGX Spark cluster:**
 
-Make sure you connect your Sparks together and enable passwordless SSH as described in NVIDIA's [Connect Two Sparks Playbook](https://build.nvidia.com/spark/connect-two-sparks/stacked-sparks).
-You can also check out our new [Networking Guide](docs/NETWORKING.md).
+Make sure you connect your Sparks together and enable passwordless SSH as described in our [Networking Guide](docs/NETWORKING.md). You can also check out NVIDIA's [Connect Two Sparks Playbook](https://build.nvidia.com/spark/connect-two-sparks/stacked-sparks), but using our guide is the best way to get started.
+**NEW**: the guide now includes instructions on setting up a 3-node Spark mesh!
 
 Then run the following command that will build and distribute the image across the cluster.
@@ -58,13 +64,13 @@
 ./build-and-copy.sh -c
 ```
 
-An initial build will take around 20-30 minutes, but subsequent builds will be faster. Precompiled vLLM wheels for DGX Spark will also be available soon.
+Initial build speed depends on your Internet connection and whether the base image is already present on your machine. After the base image pull, the build should take only 2-3 minutes. If `--rebuild-vllm` and/or `--rebuild-flashinfer` is used to trigger a build from source, it will take 20-40 minutes, but subsequent builds will be faster. Prebuilt FlashInfer and vLLM wheels are downloaded automatically from GitHub releases, so compilation from source is usually not required.
 
 ### Run
 
 **On a single node**:
 
-**NEW** - `launch-cluster.sh` now supports solo mode, which is now a recommended way to run the container on a single Spark:
+`launch-cluster.sh` supports solo mode, which is now the recommended way to run the container on a single Spark:
 
 ```bash
 ./launch-cluster.sh --solo exec \
@@ -75,23 +81,6 @@
   --load-format fastsafetensors
 ```
 
-**To launch using regular `docker run`**
-
-```bash
-docker run \
-  --privileged \
-  --gpus all \
-  -it --rm \
-  --network host --ipc=host \
-  -v ~/.cache/huggingface:/root/.cache/huggingface \
-  vllm-node \
-  bash -c -i "vllm serve \
-    QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ \
-    --port 8000 --host 0.0.0.0 \
-    --gpu-memory-utilization 0.7 \
-    --load-format fastsafetensors"
-```
-
 **On a cluster**
 
 It's recommended to download the model on one node and distribute it across the cluster over the ConnectX interconnect before launching, to avoid re-downloading the model from the Internet on every node in the cluster.
@@ -120,15 +109,13 @@ To launch the model:
 
 This will run the model on all available cluster nodes.
 
-**NOTE:** do not use `--load-format fastsafetensors` if you are loading models that would take >0.8 of available RAM (without KV cache) as it may result in an out-of-memory situation.
+**NOTE:** do not use `--load-format fastsafetensors` if you are loading models that would take >0.85 of available RAM (without KV cache) as it may result in an out-of-memory situation.
 
 **Also:** You can use any vLLM container that has "bash" as its default entrypoint with the launch script. It was tested with NGC vLLM, but can work with others too. To use such a container in the cluster, you need to specify the `--apply-mod use-ngc-vllm` argument to `./launch-cluster.sh`. However, it's recommended to build the container using this repository for best compatibility and the most up-to-date features.
 
-## CHANGELOG
-
 **IMPORTANT**
 
 You may want to prune your build cache every once in a while, especially if you've been using these container builds since the beginning.
 
 You can check the build cache size by running:
@@ -146,6 +133,325 @@ Don't do it every time you rebuild, because it will slow down compilation times.
 
 For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
 
+## CHANGELOG
+
+### 2026-04-14
+
+Added `--load-format instanttensor` support to vLLM - thanks @SeraphimSerapis.
+An experimental option for now, but it allows for faster loading than the current fastsafetensors default. You need to rebuild the container to start using the option, but you don't have to trigger the source build.
+
+### 2026-04-12
+
+#### Drop-caches mod for Qwen3.5-397B
+
+Updated the Qwen3.5-397B recipe (for the dual-node configuration) to use the new mod `mods/drop-caches`, which clears filesystem caches every minute while the container is running, resolving fastsafetensors getting stuck during loading and a few other bugs when operating close to the max memory limit.
````
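The mod's behaviour can be approximated with a small watchdog loop. A sketch, assuming the container is named `vllm_node` (the default `CONTAINER_NAME`); the mod's actual implementation may differ:

```bash
# Assumed approximation of mods/drop-caches: flush the page cache once a minute
# for as long as the vLLM container is running.
while docker ps --format '{{.Names}}' | grep -qx 'vllm_node'; do
  sudo sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches'
  sleep 60
done
```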
````diff
+### 2026-04-11
+
+#### Pinned PyTorch Version
+
+Pinned PyTorch to version 2.11.0 (previously using nightly builds) to fix incompatibility with transformers 5.x and avoid a torch version mismatch in builds.
+
+### 2026-04-02
+
+A new recipe for Gemma4-26B-A4B with "on-the-fly" FP8 quantization:
+
+Single Spark:
+
+```bash
+./run-recipe.sh gemma4-26b-a4b --solo
+```
+
+Dual Sparks:
+
+```bash
+./run-recipe.sh gemma4-26b-a4b --no-ray
+```
+
+### 2026-03-31
+
+#### Flags to specify FlashInfer ref and apply PRs
+
+`build-and-copy.sh` gains two new flags that mirror the existing vLLM equivalents:
+
+- `--flashinfer-ref <ref>` — build FlashInfer from a specific commit SHA, branch, or tag instead of `main`. Forces a local FlashInfer build (skips prebuilt wheel download).
+- `--apply-flashinfer-pr <pr-num>` — fetch and apply a FlashInfer GitHub PR patch before building. Can be specified multiple times. Forces a local FlashInfer build.
+
+Both flags are incompatible with `--exp-mxfp4`.
+
+#### Default image tag in `build-and-copy.sh`
+
+`build-and-copy.sh` now automatically sets a sensible default image tag when `-t` is not specified:
+
+- `--tf5` / `--pre-tf` - tag defaults to `vllm-node-tf5`
+- `--exp-mxfp4` - tag defaults to `vllm-node-mxfp4`
+- in all other cases - tag defaults to `vllm-node` (no change)
+
+An explicit `-t <tag>` always takes precedence.
````
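In pseudocode, this tag resolution reduces to a simple precedence chain. A sketch only; variable names are illustrative, not the script's internals:

```bash
# Illustrative tag resolution: an explicit -t wins, then feature-specific defaults.
if [[ -z "$tag" ]]; then            # no -t/--tag given
  if [[ "$tf5" == "1" ]]; then
    tag="vllm-node-tf5"
  elif [[ "$mxfp4" == "1" ]]; then
    tag="vllm-node-mxfp4"
  else
    tag="vllm-node"
  fi
fi
```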
````diff
+#### Support for 3-node mesh setups
+
+Added initial support for setups where 3 Sparks are connected in a ring-like mesh without an additional switch.
+See the [Networking Guide](docs/NETWORKING.md) for instructions on how to connect and set up networking in such a cluster.
+
+The autodiscover function in both `launch-cluster.sh` and `run-recipe.sh` can now detect mesh setups and configure parameters accordingly.
+
+You can try running a model on all 3 nodes in a pipeline-parallel configuration using the following recipe:
+
+```bash
+./run-recipe.sh --discover # force mesh discovery
+./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml --setup --no-ray --force-build # you can drop --setup and --force-build on subsequent calls
+```
+
+Please note that `--tensor-parallel-size 3` or `-tp 3` is not supported by any commonly used model, so the only two viable options to utilize all three nodes for a single model are:
+
+- `--pipeline-parallel 3` will let you run a model that can't fit on dual Sparks, but without additional speed improvements (total throughput may improve though).
+- `--data-parallel 3` (possibly with `--enable-expert-parallel`) will let you run a model that can fit on a single Spark, but allows for better concurrency.
+
+You can also run models with `--tensor-parallel 2` in a 3-node configuration - in this case only the first two nodes (from autodiscovery/.env or from the CLI parameters) will be utilized.
+
+#### GB10 Verification During Node Discovery
+
+Node discovery now confirms each SSH-reachable peer is a GB10 system before adding it to the cluster:
+Only hosts reporting `NVIDIA GB10` are included. This prevents accidentally adding non-Spark machines that happen to be on the same subnet.
````
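A single peer check of this kind can be reproduced by hand with the same `nvidia-smi` query; the user and host below are placeholders:

```bash
# Succeeds only if the remote host reports a GB10 GPU.
ssh user@192.168.177.12 \
  "nvidia-smi --query-gpu=name --format=csv,noheader" | grep -q "NVIDIA GB10"
```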
````diff
+#### Separate COPY_HOSTS Discovery
+
+Autodiscover now determines the host list used for image and model distribution separately from `CLUSTER_NODES`:
+
+- **Non-mesh**: `COPY_HOSTS` mirrors `CLUSTER_NODES` (no change in behaviour).
+- **Mesh**: scans the direct IB-attached `enp1s0f0np0` and `enp1s0f1np1` interfaces (not the OOB ETH interface), so large file transfers use the faster direct InfiniBand path.
+
+`COPY_HOSTS` is saved to `.env` and respected by `build-and-copy.sh`, `hf-download.sh`, and `run-recipe.py`.
+
+#### Interactive Configuration Save in `autodiscover.sh`
+
+`autodiscover.sh` now handles `.env` creation with a guided interactive flow, replacing the previous logic in `run-recipe.py`:
+
+- Runs automatically when `.env` is absent.
+- Asks per-node confirmation for both `CLUSTER_NODES` and `COPY_HOSTS`.
+- Skips if `.env` already exists (use `--setup` to force).
+
+`run-recipe.py` no longer contains its own `.env`-save prompt — it delegates entirely to `autodiscover.sh`.
+
+#### `--setup` Flag in `launch-cluster.sh` and `build-and-copy.sh`
+
+Both scripts now accept `--setup` to force a full autodiscovery run and overwrite the existing `.env`:
+
+```bash
+./launch-cluster.sh --setup exec vllm serve ...
+./build-and-copy.sh --setup -c
+```
+
+This is equivalent to the existing `--setup` in `run-recipe.sh`.
+
+#### `--config` Flag
+
+`hf-download.sh`, `build-and-copy.sh` and `launch-cluster.sh` now accept `--config <file>` to load a custom `.env` configuration file. `COPY_HOSTS` from the config is used for model distribution:
+
+```bash
+./hf-download.sh QuantTrio/MiniMax-M2-AWQ --config /path/to/cluster.env -c --copy-parallel
+```
+
+#### Parallelism-Aware Node Trimming
+
+`launch-cluster.sh` now parses `-tp` / `--tensor-parallel-size`, `-pp` / `--pipeline-parallel-size`, and `-dp` / `--data-parallel-size` from the exec command or launch script and adjusts the active node count accordingly — for both Ray and no-Ray modes.
+
+- If **fewer nodes are needed** than configured, only the required nodes get containers started (excess nodes are left idle).
+- If **more nodes are needed** than available, an error is raised before anything starts.
+
+```
+Note: Command requires 2 node(s) (tp=2 * pp=1 * dp=1); using 2 of 3 configured node(s).
+Error: Command requires 4 nodes (tp=4 * pp=1 * dp=1) but only 3 node(s) are configured.
+```
+
+No flags required — the check is automatic whenever parallelism arguments are present in the command.
````
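The required node count follows directly from the parallelism product shown in those messages. A minimal sketch of the same arithmetic, with illustrative variable names:

```bash
tp=2; pp=1; dp=1                      # parsed from the vllm serve command
configured=3                          # nodes listed in CLUSTER_NODES
required=$(( tp * pp * dp ))
if (( required > configured )); then
  echo "Error: Command requires $required nodes but only $configured node(s) are configured."
elif (( required < configured )); then
  echo "Note: using $required of $configured configured node(s)."
fi
```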
````diff
+### 2026-03-18
+
+#### `--master-port` / `--head-port` Parameter
+
+Added `--master-port` (synonym: `--head-port`) to both `launch-cluster.sh` and `run-recipe.sh` to configure the port used for cluster coordination:
+
+- In **Ray mode**: sets the Ray head node port (previously hardcoded to 6379)
+- In **No-Ray mode**: sets the PyTorch distributed `--master-port` passed to vLLM
+
+Default is `29501`.
+
+```bash
+./launch-cluster.sh --master-port 29501 --no-ray exec vllm serve ...
+./run-recipe.sh qwen3.5-122b-fp8 --no-ray --master-port 29501
+```
+
+#### `--network` Parameter in Build Arguments
+
+Added `--network <name>` to `build-and-copy.sh` to allow using host networking during builds.
+Thanks @apairmont for the PR.
+
+### 2026-03-17
+
+#### EXPERIMENTAL Intel/Qwen3.5-397B-A17B-int4-AutoRound Recipe
+
+You can run the full 397B Qwen3.5 model on just two Sparks with vision and full context; however, you need to make sure your Sparks don't run anything extra that can take a lot of RAM. That means you don't want to log into the graphical interface or use remote desktop. Connect to the head node via SSH.
+
+Alternatively, you can run in non-graphical mode (runlevel 3) by using `sudo systemctl isolate multi-user.target` to switch (and `sudo systemctl set-default graphical.target` to switch back to graphical mode); however, this is known to reduce performance a bit.
+
+You can run the model with the following command on the head node:
+
+```bash
+./run-recipe.sh qwen3.5-397b-int4-autoround.yaml --no-ray
+```
+
+Please note that `--no-ray` is necessary to fit the full context. It also improves inference speed by ~1 t/s.
+By default it will try to allocate 112 GB for vLLM on each node. You can change this via `--gpu-memory-utilization` (e.g. `--gpu-memory-utilization 113`), but please be aware that it uses GB instead of a percentage **for this recipe**.
+
+**KNOWN ISSUES**:
+
+1. The current firmware may cause a sudden shutdown event on one or both Sparks during heavy inference. If you have this issue, you will need to lower the GPU clock frequency on the affected unit(s), e.g. `sudo nvidia-smi -lgc 200,2150`. This command will reduce the max GPU frequency to 2150 MHz. You can play with higher values to see what works for you (the default is 2411 MHz, but it can boost to 3000 MHz). Please note that this setting only survives until the next reboot, but can be applied at any time.
+2. You will need to use the new `--no-ray` argument to fit the full context.
+3. If the model gets stuck loading weights, clearing the cache on both nodes can "unstuck" it. Use `sudo sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches'` to clear the cache.
+
+
+#### Major Cluster Orchestration Refactoring
+
+Significantly refactored the internal cluster startup logic in `launch-cluster.sh`:
+- Removed the standalone `run-cluster-node.sh` script; its logic is now fully integrated into `launch-cluster.sh`.
+- Ray head/worker startup, environment variable injection, and launch script distribution are now handled by `launch-cluster.sh` directly.
+- Worker containers are started with proper per-node environment variables (`VLLM_HOST_IP`, `NCCL_SOCKET_IFNAME`, etc.) injected via `docker run`/`docker exec` instead of relying on `.bashrc`.
+- You can now run other vLLM containers without applying the `use-ngc-vllm` mod (the current version is just an empty stub).
+
+#### No-Ray Multi-Node Mode
+
+Added a `--no-ray` flag to `launch-cluster.sh` to run multi-node vLLM clusters without Ray, using PyTorch's native distributed backend instead. It slightly improves inference performance for most models and reduces memory requirements.
+
+```bash
+./launch-cluster.sh --no-ray exec vllm serve ...
+```
+
+`--no-ray` is incompatible with `--solo` (which already runs without Ray).
+
+#### `run-recipe.sh` No-Ray Mode and Extended Flag Passthrough
+
+`run-recipe.sh` now supports the `--no-ray` flag for running multi-node inference without Ray (uses the PyTorch distributed backend instead):
+
+```bash
+./run-recipe.sh qwen3.5-122b-fp8 --no-ray
+```
+
+The following `launch-cluster.sh` flags are now also passed through from `run-recipe.sh`:
+`--master-port`, `--name`, `--eth-if`, `--ib-if`, `-j`, `--no-cache-dirs`, `--non-privileged`, `--mem-limit-gb`, `--mem-swap-limit-gb`, `--pids-limit`, `--shm-size-gb`.
+
+#### Nemotron-3-Nano-NVFP4 Switched to Marlin Backend
+
+The `nemotron-3-nano-nvfp4` recipe has been updated to use the Marlin backend for better performance and reliability (until FlashInfer fully supports NVFP4 on sm121).
+
+### 2026-03-12
+
+#### Experimental `--gpu-memory-utilization-gb` Mod
+
+Added a new mod `mods/gpu-mem-util-gb` that adds a `--gpu-memory-utilization-gb` flag to vLLM, allowing you to specify GPU memory reservation in GiB instead of as a fraction. This is particularly useful on DGX Spark's unified memory architecture, where available memory changes dynamically.
+
+```bash
+./launch-cluster.sh --apply-mod mods/gpu-mem-util-gb exec vllm serve ... \
+  --gpu-memory-utilization-gb 110
+```
+
+Cannot be used simultaneously with `--kv-cache-memory-bytes`.
````
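To relate the GiB flag to the stock fractional flag, the fraction is just the reservation divided by the memory visible to vLLM. For example, assuming roughly 119 GiB visible on a Spark (an illustrative figure, not a measured one):

```bash
# 110 GiB on an assumed 119 GiB visible ≈ --gpu-memory-utilization 0.92
awk 'BEGIN { printf "%.2f\n", 110 / 119 }'
```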
````diff
+#### Qwen3.5-397B INT4-AutoRound TP=4 Recipe (4× Spark Cluster)
+
+Added `recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml` for running Intel/Qwen3.5-397B-A17B-int4-AutoRound across 4 DGX Spark nodes with tensor parallelism (TP=4).
+
+Benchmarked at ~37 tok/s single-user, ~103 tok/s aggregate (4 concurrent users).
+
+Includes a new mod `mods/fix-qwen35-tp4-marlin` that resolves a Marlin kernel constraint (`MIN_THREAD_N=64`) that breaks certain projection layers at TP=4.
+
+**Note:** Requires NVIDIA driver 580.x. Driver 590.x has a CUDAGraph capture deadlock on GB10 unified memory.
+
+```bash
+./run-recipe.sh 4x-spark-cluster/qwen3.5-397b-int4-autoround
+```
+Thanks @sonusflow for the contribution.
+
+#### Nemotron-3-Super-120B NVFP4 Recipe
+
+Added a new recipe `nemotron-3-super-nvfp4` for running `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4` with Marlin kernels, in both single (solo) and dual Spark (cluster) configurations. Includes a custom reasoning parser (`super_v3_reasoning_parser.py`) fetched from the model repository.
+
+```bash
+./run-recipe.sh nemotron-3-super-nvfp4
+```
+
+### 2026-03-11
+
+#### Qwen3-Coder-Next INT4-AutoRound Recipe
+
+Added a new recipe `qwen3-coder-next-int4-autoround` for running Intel/Qwen3-Coder-Next-int4-AutoRound. Supports a single Spark only (use with the `--solo` switch), since the split weights are too small for the Marlin kernel.
+
+```bash
+./run-recipe.sh qwen3-coder-next-int4-autoround --solo
+```
+
+### 2026-03-06
+
+#### `-e/--env` Passthrough in `run-recipe.py`
+
+`run-recipe.sh` now accepts one or more `-e VAR=VALUE` flags to pass environment variables directly to the container, mirroring the existing behaviour of `launch-cluster.sh`.
+
+```bash
+./run-recipe.sh qwen3.5-122b-int4-autoround --solo -e HF_TOKEN=$HF_TOKEN
+```
+
+#### Unsloth Chat Template for Qwen3.5
+
+Added a new mod `mods/fix-qwen3.5-chat-template` that applies the Unsloth chat template to Qwen3.5 models for better compatibility with modern clients. The template is now included in the `qwen3.5-122b-fp8`, `qwen3.5-122b-int4-autoround`, and `qwen3.5-35b-a3b-fp8` recipes.
+
+#### Fix Shell Quoting for Exec Command Arguments
+
+Fixed shell quoting for exec command arguments in `launch-cluster.sh` and `run-recipe.py` to correctly handle arguments containing spaces or special characters.
+
+### 2026-03-05
+
+#### Qwen3.5-35B-A3B-FP8 Recipe
+
+Added a new recipe `qwen3.5-35b-a3b-fp8` for running Qwen3.5-35B-A3B in FP8 format.
+
+```bash
+./run-recipe.sh qwen3.5-35b-a3b-fp8
+```
+
+#### 4× Spark Cluster Recipes
+
+Added a `recipes/4x-spark-cluster/` subdirectory with recipes optimised for a 4-node Spark cluster:
+- `minimax-m2.5` — MiniMax M2.5 on 4× Spark
+- `qwen3.5-397b-a17B-fp8` — Qwen3.5-397B-A17B in FP8 on 4× Spark
+
+#### More Robust Wheels Check Before Download
+
+Improved the wheels availability check in `build-and-copy.sh` to be more reliable when deciding whether to download remote wheels.
+
+### 2026-03-04
+
+#### Prebuilt vLLM Wheels via GitHub Releases
+
+`build-and-copy.sh` now automatically downloads prebuilt vLLM wheels from the [GitHub releases](https://github.com/eugr/spark-vllm-docker/releases/tag/prebuilt-vllm-current) before falling back to a local build — identical to the existing FlashInfer download mechanism. This eliminates the need to compile vLLM from source on first use.
+
+The download logic mirrors the FlashInfer behaviour:
+- If prebuilt wheels are available and newer than any locally cached version, they are downloaded automatically.
+- If the download fails (e.g. no network, release not found, GPU arch not supported), the script falls back to building locally, or reuses existing local wheels if present.
+- `--rebuild-vllm`, `--vllm-ref`, or `--apply-vllm-pr` skip the download entirely and force a local build.
+
+No new flags are required — the download happens transparently.
+
+All prebuilt wheels are now tested with multiple models in both solo and cluster configurations as part of an automated deployment pipeline, which now runs nightly. The wheels are released only if they pass all the tests and no significant performance regressions are detected.
````
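You can inspect the same release the script pulls from using the standard GitHub API; this is a manual check, not something the scripts require:

```bash
# List downloadable wheel assets on the prebuilt-vllm-current release.
curl -fsSL \
  https://api.github.com/repos/eugr/spark-vllm-docker/releases/tags/prebuilt-vllm-current \
  | grep browser_download_url
```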
````diff
+#### Qwen3.5-122B-FP8 Recipe
+
+Added a new recipe `qwen3.5-122b-fp8` for running Qwen3.5-122B in FP8 format.
+
+```bash
+./run-recipe.sh qwen3.5-122b-fp8
+```
+
+### 2026-03-02
+
+#### Qwen3.5-122B-INT4-Autoround Support
@@ -178,7 +484,6 @@ Added a new mod for Intel/Qwen3-Coder-Next-INT4-Autoround model support: mods/f
 
 Changed the reasoning parser in MiniMax for better compatibility with modern clients (like coding tools).
 
-
 ### 2026-02-18
 
 #### Completely Redesigned Build Process
@@ -404,7 +709,8 @@ See [this post on NVIDIA forums](https://forums.developer.nvidia.com/t/make-glm-
 To use the mod, first build the container with the Transformers 5 support flag (`--pre-tf`), e.g.:
 
 ```bash
-./build-and-copy.sh -t vllm-node-tf5 --pre-tf -c
+# Image tag defaults to vllm-node-tf5 when --tf5/--pre-tf is used
+./build-and-copy.sh --pre-tf -c
 ```
 
 Then, to run on a single node:
@@ -454,7 +760,8 @@ It is currently the fastest way to run GPT-OSS on DGX Spark, achieving 60 t/s on
 To use this build, first build the container with the `--exp-mxfp4` flag. I recommend using a separate label as it is currently not recommended to use this build for models other than gpt-oss:
 
 ```bash
-./build-and-copy.sh -t vllm-node-mxfp4 --exp-mxfp4 -c
+# Image tag defaults to vllm-node-mxfp4 when --exp-mxfp4 is used
+./build-and-copy.sh --exp-mxfp4 -c
 ```
 
 Then, to run on a single Spark:
@@ -698,12 +1005,14 @@ Using a different username:
 
 | Flag | Description |
 | :--- | :--- |
-| `-t, --tag <tag>` | Image tag (default: `vllm-node`) |
+| `-t, --tag <tag>` | Image tag (default: `vllm-node`; auto-set to `vllm-node-tf5` with `--tf5`, `vllm-node-mxfp4` with `--exp-mxfp4`) |
 | `--gpu-arch <arch>` | Target GPU architecture (default: `12.1a`) |
 | `--rebuild-flashinfer` | Skip prebuilt wheel download; force a fresh local FlashInfer build |
 | `--rebuild-vllm` | Force rebuild vLLM from source |
 | `--vllm-ref <ref>` | vLLM commit SHA, branch or tag (default: `main`) |
+| `--flashinfer-ref <ref>` | FlashInfer commit SHA, branch or tag (default: `main`) |
 | `--apply-vllm-pr <pr-num>` | Apply a vLLM PR patch during build. Can be specified multiple times. |
+| `--apply-flashinfer-pr <pr-num>` | Apply a FlashInfer PR patch during build. Can be specified multiple times. |
 | `--tf5` | Install transformers v5 (5.0.0 or higher). Aliases: `--pre-tf, --pre-transformers`. |
 | `--exp-mxfp4` | Build with experimental native MXFP4 support. Alias: `--experimental-mxfp4`. |
 | `-c, --copy-to <hosts>` | Host(s) to copy the image to after building (space- or comma-separated). |
@@ -713,9 +1022,13 @@ Using a different username:
 | `-u, --user <user>` | Username for SSH connection (default: current user) |
 | `--full-log` | Enable full Docker build output (`--progress=plain`) |
 | `--no-build` | Skip building, only copy existing image (requires `--copy-to`) |
+| `--network <name>` | Docker network to use during build (e.g. `host`). |
+| `--cleanup` | Remove all cached `.whl` and `*-commit` files from the `wheels/` directory. |
+| `--config <file>` | Path to `.env` configuration file (default: `.env` in script directory) |
+| `--setup` | Force autodiscovery and save configuration to `.env` (even if `.env` already exists) |
 | `-h, --help` | Show help message |
 
-**IMPORTANT**: When copying to another node, make sure you use the Spark IP assigned to its ConnectX 7 interface (enp1s0f1np1), and not the 10G interface (enP7s7)! If you omit the IP address and use `-c` without addresses, it will use autodiscovery to detect a proper IP address.
+**IMPORTANT**: When copying to another node manually, use the IP assigned to a ConnectX 7 interface (`enp1s0f*`), not the 10G/wireless interfaces. When using `-c` without addresses, autodiscovery selects the correct interface automatically — in mesh mode it uses the direct IB-attached interfaces (`enp1s0f0np0`, `enp1s0f1np1`) for maximum transfer speed.
 
 ### Copying the container to another Spark node (Manual Method)
@@ -784,9 +1097,12 @@ Assumptions and limitations:
 ### Auto-Detection
 
 The script attempts to automatically detect:
-* **Ethernet Interface:** The interface associated with the active InfiniBand device that has an IP address.
-* **InfiniBand Interface:** The active InfiniBand devices. By default both active RoCE interfaces that correspond to active IB port(s) will be utilized.
-* **Node Role:** Based on the detected IP address and the list of nodes (defaults to `192.168.177.11` as head and `192.168.177.12` as worker).
+* **Ethernet Interface (`ETH_IF`):** Determined by the number of active CX7 interfaces:
+  - **2 active** (standard): the `enp*` interface (no capital P) that has an IP address.
+  - **4 active** (mesh topology): `enP7s7` (preferred) or `wlP9s9` (wireless, shown with a warning) — the cluster coordination interface is separate from the CX7 ports in this configuration.
+* **InfiniBand Interface (`IB_IF`):** All active RoCE devices. In mesh mode this is always `rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1`.
+* **Cluster peers:** Discovered by scanning the `ETH_IF` subnet for hosts with SSH access **and** a GB10 GPU (`nvidia-smi --query-gpu=name` must return `NVIDIA GB10`).
+* **Copy hosts (`COPY_HOSTS`):** In standard mode, same as cluster peers. In mesh mode, scanned separately on `enp1s0f0np0` and `enp1s0f1np1` subnets so that image/model transfers use the direct InfiniBand path.
 
 ### Manual Overrides
@@ -809,6 +1125,8 @@ You can override the auto-detected values if needed:
 | `--nccl-debug` | NCCL debug level (e.g., INFO, WARN). Defaults to INFO if flag is present but value is omitted. |
 | `--check-config` | Check configuration and auto-detection without launching. |
 | `--solo` | Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster |
+| `--no-ray` | No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend). |
+| `--master-port` / `--head-port` | Port for cluster coordination: Ray head port or PyTorch distributed master port (default: 29501). |
 | `--no-cache-dirs` | Do not mount default cache directories (~/.cache/vllm, ~/.cache/flashinfer, ~/.triton). |
 | `--launch-script` | Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted. |
 | `-d` | Run in daemon mode (detached). |
@@ -817,6 +1135,10 @@ You can override the auto-detected values if needed:
 | `--mem-swap-limit-gb` | Memory+swap limit in GB (default: mem-limit + 10, only with `--non-privileged`). |
 | `--pids-limit` | Process limit (default: 4096, only with `--non-privileged`). |
 | `--shm-size-gb` | Shared memory size in GB (default: 64, only with `--non-privileged`). |
+| `--config <file>` | Path to `.env` configuration file (default: `.env` in script directory). |
+| `--setup` | Force autodiscovery and save configuration to `.env` (even if `.env` already exists). |
 | `start \| stop \| status \| exec` | Action to perform (default: `start`). Not compatible with `--launch-script`. |
 | `command` | Command to execute inside the container (only for `exec` action). |
 
 ### Non-Privileged Mode
@@ -960,6 +1282,61 @@ You need to make sure you allocate IP addresses to them (no need to allocate IP
 
 ## 5\. Configuration Details
 
+### Cluster Configuration (`.env` file)
+
+The scripts share a `.env` file (default: `.env` in the repo directory) for persistent cluster configuration. It is created automatically by autodiscovery — run `--discover` (via `run-recipe.sh`) or `--setup` (via `launch-cluster.sh` / `build-and-copy.sh`) on first use.
+
+**Supported variables:**
+
+| Variable | Description |
+| :--- | :--- |
+| `CLUSTER_NODES` | Comma-separated node IPs used for the Ray/vLLM cluster (head node first). |
+| `COPY_HOSTS` | Comma-separated node IPs used for image and model distribution. In mesh mode these are the IPs on the direct IB-attached interfaces, which may differ from `CLUSTER_NODES`. |
+| `LOCAL_IP` | IP address of the local node. |
+| `ETH_IF` | Ethernet interface for cluster coordination (e.g. `enp1s0f1np1` or `enP7s7`). |
+| `IB_IF` | Comma-separated RoCE/IB device names (e.g. `rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1`). |
+| `CONTAINER_*` | Any variable prefixed with `CONTAINER_` (except `CONTAINER_NAME`) is passed as `-e VAR=VALUE` to the container. Example: `CONTAINER_NCCL_DEBUG=INFO` → `-e NCCL_DEBUG=INFO`. |
+
+**Mesh-mode NCCL variables** (written automatically when mesh topology is detected):
+
+```
+CONTAINER_NCCL_NET_PLUGIN=none
+CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1
+CONTAINER_NCCL_IB_MERGE_NICS=0
+```
````
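Because of the `CONTAINER_` convention described in the table above, those three entries reach the container as plain NCCL settings; equivalently (an illustration of the resulting flags, not a command the scripts print):

```bash
docker run ... \
  -e NCCL_NET_PLUGIN=none \
  -e NCCL_IB_SUBNET_AWARE_ROUTING=1 \
  -e NCCL_IB_MERGE_NICS=0 \
  ...
```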
**Example `.env` for a standard 2-node cluster:**
|
||||
|
||||
```
|
||||
CLUSTER_NODES=192.168.177.11,192.168.177.12
|
||||
COPY_HOSTS=192.168.177.12
|
||||
LOCAL_IP=192.168.177.11
|
||||
ETH_IF=enp1s0f1np1
|
||||
IB_IF=rocep1s0f1,roceP2p1s0f1
|
||||
```
|
||||
|
||||
To use a custom config file path, pass `--config /path/to/file.env` to any script.
|
||||
|
||||
### Autodiscovery Workflow
|
||||
|
||||
On first run, if no `.env` is present, the scripts will automatically trigger autodiscovery. You can also run it explicitly:
|
||||
|
||||
```bash
|
||||
# Via run-recipe.sh
|
||||
./run-recipe.sh --discover
|
||||
|
||||
# Via launch-cluster.sh or build-and-copy.sh (force re-run even if .env exists)
|
||||
./launch-cluster.sh --setup exec vllm serve ...
|
||||
./build-and-copy.sh --setup -c
|
||||
```
|
||||
|
||||
Autodiscovery:
|
||||
1. Detects active CX7 interfaces and determines mesh vs. standard topology.
|
||||
2. Scans the network for SSH-reachable GB10 peers.
|
||||
3. In mesh mode, separately discovers `COPY_HOSTS` on direct IB-attached interfaces.
|
||||
4. Prompts for per-node confirmation for both `CLUSTER_NODES` and `COPY_HOSTS`.
|
||||
5. Saves the result to `.env`.
|
||||
|
||||
### Environment Persistence

The script automatically appends exported variables to `~/.bashrc`. If you need to open a second terminal into the running container for debugging, simply run:

@@ -1133,6 +1510,32 @@ The `hf-download.sh` script provides a convenient way to download models from Hu

./hf-download.sh -c --copy-parallel QuantTrio/MiniMax-M2-AWQ
```
**Use nodes from `.env` (respects `COPY_HOSTS`):**

```bash
./hf-download.sh -c QuantTrio/MiniMax-M2-AWQ
```

When `-c` is given without explicit hosts, the script checks `COPY_HOSTS` in `.env` first, then falls back to autodiscovery. In mesh mode this means transfers go over the direct IB-attached interfaces automatically.
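A condensed sketch of that resolution order (the full logic lives in `hf-download.sh` and `autodiscover.sh`, reproduced later on this page):

```bash
# Sketch: how -c without hosts resolves COPY_HOSTS
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
    if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
        # 1. COPY_HOSTS from .env wins
        IFS=',' read -ra COPY_HOSTS <<< "$DOTENV_COPY_HOSTS"
    else
        # 2. Otherwise fall back to autodiscovery
        detect_copy_hosts && COPY_HOSTS=("${COPY_PEER_NODES[@]}")
    fi
fi
```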
**Use a custom config file:**

```bash
./hf-download.sh --config /path/to/cluster.env -c QuantTrio/MiniMax-M2-AWQ
```
**Available options:**

| Flag | Description |
| :--- | :--- |
| `<model-name>` | HuggingFace model ID (e.g. `QuantTrio/MiniMax-M2-AWQ`). Required. |
| `-c, --copy-to <hosts>` | Host(s) to copy the model to after download (space- or comma-separated). Omit hosts to use `COPY_HOSTS` from `.env` or autodiscovery. |
| `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). |
| `--copy-parallel` | Copy to all hosts concurrently instead of serially. |
| `-u, --user <user>` | SSH username for remote copies (default: current user). |
| `--config <file>` | Path to `.env` configuration file (default: `.env` in script directory). |
| `-h, --help` | Show help message. |
### Hardware Architecture

**Note:** This project targets the `12.1a` GPU architecture (NVIDIA GB10 / DGX Spark). If you are using different hardware, pass the `--gpu-arch` flag to `./build-and-copy.sh`.
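For example, to build for a different GPU, pass the architecture explicitly (the value below is illustrative; use your GPU's compute capability):

```bash
./build-and-copy.sh --gpu-arch 10.0a
```

Note that prebuilt wheels are only published for the architectures listed in `PREBUILT_WHEELS_SUPPORTED_ARCHS` (currently `12.1a`), so other architectures will build from source.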
autodiscover.sh

@@ -1,5 +1,57 @@
#!/bin/bash

SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"

# Load .env file if exists (for shared configuration)
# This is called early so that DOTENV_* variables are available to all functions
load_env_if_exists() {
    local env_file="${CONFIG_FILE:-}"
    local config_explicit="${CONFIG_FILE_SET:-false}"

    # If CONFIG_FILE is not set, check default location
    if [[ -z "$env_file" ]]; then
        env_file="$SCRIPT_DIR/.env"
        config_explicit="false"
    fi

    # Validate config file exists if explicitly specified
    # Exception: if --setup is also specified, the file will be created by the setup procedure
    if [[ "$config_explicit" == "true" ]] && [[ ! -f "$env_file" ]] && [[ "${FORCE_DISCOVER:-false}" != "true" ]]; then
        echo "Error: Config file not found: $env_file"
        exit 1
    fi

    if [[ -f "$env_file" ]]; then
        # Load .env variables with DOTENV_ prefix
        while IFS='=' read -r key value || [[ -n "$key" ]]; do
            # Skip comments and empty lines
            [[ "$key" =~ ^[[:space:]]*# ]] && continue
            [[ -z "$key" ]] && continue

            # Remove leading/trailing whitespace from key
            key=$(echo "$key" | xargs)

            # Skip if key is empty after trimming
            [[ -z "$key" ]] && continue

            # Remove quotes from value
            value="${value%\"}"
            value="${value#\"}"
            value="${value%\'}"
            value="${value#\'}"

            # Export with DOTENV_ prefix
            export "DOTENV_$key=$value"
        done < "$env_file"
    fi
}

# Load .env file
load_env_if_exists

# Mesh mode flag (set by detect_interfaces)
MESH_MODE="false"

# Function to detect IB and Ethernet interfaces
detect_interfaces() {
    # If both interfaces are already set, nothing to do

@@ -14,60 +66,132 @@ detect_interfaces() {
    fi

    echo "Auto-detecting interfaces..."

    # Get all Up interfaces: "rocep1s0f1 port 1 ==> enp1s0f1np1 (Up)"
    # We capture: IB_DEV, NET_DEV
    mapfile -t IB_NET_PAIRS < <(ibdev2netdev | awk '/Up\)/ {print $1 " " $5}')

    if [ ${#IB_NET_PAIRS[@]} -eq 0 ]; then
        echo "Error: No active IB interfaces found."
        return 1
    fi

    DETECTED_IB_IFS=()
    ALL_NET_IFS=()

    for pair in "${IB_NET_PAIRS[@]}"; do
        ib_dev=$(echo "$pair" | awk '{print $1}')
        net_dev=$(echo "$pair" | awk '{print $2}')

        DETECTED_IB_IFS+=("$ib_dev")
        ALL_NET_IFS+=("$net_dev")
    done

    local num_up="${#IB_NET_PAIRS[@]}"

    # --- Sanity checks ---

    # 1. enp* (no capital P) interfaces MUST have an IP
    for net_dev in "${ALL_NET_IFS[@]}"; do
        if [[ "$net_dev" =~ ^enp[^P] ]] || [[ "$net_dev" == enp* && "$net_dev" != *P* ]]; then
            if ! ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
                echo "Error: Interface $net_dev (enp*, no capital P) is Up but has no IP address assigned."
                return 1
            fi
        fi
    done

    # 2. No two interfaces with IPs should share the same subnet
    declare -A SEEN_SUBNETS
    for net_dev in "${ALL_NET_IFS[@]}"; do
        local cidr
        cidr=$(ip -o -f inet addr show "$net_dev" 2>/dev/null | awk '{print $4}' | head -n1)
        [[ -z "$cidr" ]] && continue
        # Compute network address using python3
        local net_addr
        net_addr=$(python3 -c "import ipaddress; print(str(ipaddress.ip_network('$cidr', strict=False)))" 2>/dev/null)
        if [[ -n "${SEEN_SUBNETS[$net_addr]}" ]]; then
            echo "Error: Interfaces $net_dev and ${SEEN_SUBNETS[$net_addr]} share the same subnet ($net_addr). Check network configuration."
            return 1
        fi
        SEEN_SUBNETS["$net_addr"]="$net_dev"
    done

    # --- Mode selection ---

    if [[ "$num_up" -eq 2 ]]; then
        # Non-mesh configuration
        MESH_MODE="false"
        echo " Non-mesh mode: 2 CX7 interfaces active."

        # Set IB_IF if not provided
        if [[ -z "$IB_IF" ]]; then
            IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}")
            echo " Detected IB_IF: $IB_IF"
        fi

        # Set ETH_IF if not provided: prefer interface without capital 'P'
        if [[ -z "$ETH_IF" ]]; then
            local selected_eth=""
            for net_dev in "${ALL_NET_IFS[@]}"; do
                if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
                    if [[ "$net_dev" != *P* ]]; then
                        selected_eth="$net_dev"
                        break
                    fi
                fi
            done
            # Fallback: first interface with an IP
            if [[ -z "$selected_eth" ]]; then
                for net_dev in "${ALL_NET_IFS[@]}"; do
                    if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
                        selected_eth="$net_dev"
                        break
                    fi
                done
            fi
            if [[ -z "$selected_eth" ]]; then
                echo "Error: No active IB-associated interfaces have IP addresses."
                return 1
            fi
            ETH_IF="$selected_eth"
            echo " Detected ETH_IF: $ETH_IF"
        fi

    elif [[ "$num_up" -eq 4 ]]; then
        # Mesh configuration
        MESH_MODE="true"
        echo " Mesh mode: all 4 CX7 interfaces active."

        # Set IB_IF to all four RoCE interfaces (hardcoded for mesh)
        if [[ -z "$IB_IF" ]]; then
            IB_IF="rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1"
            echo " Detected IB_IF: $IB_IF"
        fi

        # Set ETH_IF: check enP7s7 first, then wlP9s9
        if [[ -z "$ETH_IF" ]]; then
            if ip addr show enP7s7 2>/dev/null | grep -q "inet "; then
                ETH_IF="enP7s7"
                echo " Detected ETH_IF: $ETH_IF"
            elif ip addr show wlP9s9 2>/dev/null | grep -q "inet "; then
                ETH_IF="wlP9s9"
                echo " Detected ETH_IF: $ETH_IF"
                echo " Warning: using wireless interface (wlP9s9) for cluster coordination. Performance may be limited."
            else
                echo "Error: Mesh mode requires enP7s7 or wlP9s9 to be up with an IP address for cluster coordination."
                return 1
            fi
        fi

        # Export mesh NCCL settings directly so launch-cluster.sh picks them up
        # even if the user declines to save config to .env
        export DOTENV_CONTAINER_NCCL_NET_PLUGIN=none
        export DOTENV_CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1
        export DOTENV_CONTAINER_NCCL_IB_MERGE_NICS=0

    else
        echo "Error: Unexpected number of active CX7 interfaces ($num_up). Expected 2 (non-mesh) or 4 (mesh)."
        return 1
    fi
}
@@ -84,16 +208,51 @@ detect_local_ip() {

    # Get CIDR of the selected ETH_IF
    CIDR=$(ip -o -f inet addr show "$ETH_IF" | awk '{print $4}' | head -n 1)

    if [[ -z "$CIDR" ]]; then
        echo "Error: Could not determine IP/CIDR for interface $ETH_IF"
        return 1
    fi

    LOCAL_IP=${CIDR%/*}
    echo " Detected Local IP: $LOCAL_IP ($CIDR)"
}

# Scan a subnet for GB10-capable peers via SSH
# Usage: _scan_subnet_for_gb10 <cidr> <local_ip_to_exclude> <output_file>
_scan_subnet_for_gb10() {
    local cidr="$1"
    local exclude_ip="$2"
    local out_file="$3"

    if ! command -v python3 &> /dev/null; then
        echo "Error: python3 not found."
        return 1
    fi
    if ! command -v nc &> /dev/null; then
        echo "Error: nc (netcat) not found."
        return 1
    fi

    local all_ips
    all_ips=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$cidr")

    for ip in $all_ips; do
        [[ "$ip" == "$exclude_ip" ]] && continue
        (
            if nc -z -w 1 "$ip" 22 &>/dev/null; then
                # Check if remote is a GB10 system
                if ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o BatchMode=yes "$ip" \
                    "nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null" \
                    2>/dev/null | grep -q "NVIDIA GB10"; then
                    echo "$ip" >> "$out_file"
                fi
            fi
        ) &
    done
    wait
}

# Function to detect cluster nodes
detect_nodes() {
    detect_local_ip || return 1

@@ -111,58 +270,182 @@ detect_nodes() {
        return 0
    fi

    # Try to use CLUSTER_NODES from .env
    if [[ -n "$DOTENV_CLUSTER_NODES" ]]; then
        echo " Using CLUSTER_NODES from .env: $DOTENV_CLUSTER_NODES"
        PEER_NODES=()
        IFS=',' read -ra ALL_NODES <<< "$DOTENV_CLUSTER_NODES"
        for node in "${ALL_NODES[@]}"; do
            node=$(echo "$node" | xargs)
            [[ "$node" != "$LOCAL_IP" ]] && PEER_NODES+=("$node")
        done
        NODES_ARG="$DOTENV_CLUSTER_NODES"
        return 0
    fi

    echo "Auto-detecting nodes on $CIDR (checking for NVIDIA GB10)..."

    local temp_file
    temp_file=$(mktemp)

    _scan_subnet_for_gb10 "$CIDR" "$LOCAL_IP" "$temp_file"

    PEER_NODES=()
    local detected_ips=("$LOCAL_IP")
    if [[ -f "$temp_file" ]]; then
        while read -r ip; do
            PEER_NODES+=("$ip")
            detected_ips+=("$ip")
            echo " Found GB10 peer: $ip"
        done < <(sort "$temp_file")
        rm -f "$temp_file"
    fi

    # Sort and set NODES_ARG
    IFS=$'\n' SORTED_IPS=($(sort <<<"${detected_ips[*]}"))
    unset IFS

    NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}")
    echo " Cluster Nodes: $NODES_ARG"
}
# Function to detect COPY_HOSTS for build/model distribution
# In non-mesh mode: COPY_PEER_NODES = PEER_NODES (same network)
# In mesh mode: scan enp* interfaces (direct IB-attached) for GB10 peers
detect_copy_hosts() {
    if [[ "$MESH_MODE" == "false" ]]; then
        COPY_PEER_NODES=("${PEER_NODES[@]}")
        return 0
    fi

    # Mesh mode: scan enp1s0f0np0 and enp1s0f1np1 subnets
    echo "Auto-detecting COPY_HOSTS on direct IB interfaces (mesh mode)..."

    local temp_file
    temp_file=$(mktemp)

    for iface in enp1s0f0np0 enp1s0f1np1; do
        local cidr
        cidr=$(ip -o -f inet addr show "$iface" 2>/dev/null | awk '{print $4}' | head -n1)
        [[ -z "$cidr" ]] && continue
        local local_iface_ip="${cidr%/*}"
        echo " Scanning $iface ($cidr)..."
        _scan_subnet_for_gb10 "$cidr" "$local_iface_ip" "$temp_file"
    done

    # Deduplicate and collect results.
    # On two-cable setups two IB IPs may belong to the same host; deduplicate by
    # querying each host's ETH_IF IP as a canonical identity.
    COPY_PEER_NODES=()
    declare -A _SEEN_COPY   # keyed by IB IP
    declare -A _SEEN_HOST   # keyed by ETH_IF IP → first IB IP seen for that host
    if [[ -f "$temp_file" ]]; then
        while read -r ip; do
            [[ -n "${_SEEN_COPY[$ip]}" ]] && continue
            _SEEN_COPY["$ip"]=1
            # Resolve canonical host identity via ETH_IF IP
            local host_ip
            host_ip=$(ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o BatchMode=yes "$ip" \
                "ip -o -f inet addr show $ETH_IF 2>/dev/null | awk '{print \$4}' | head -n1 | cut -d/ -f1" \
                </dev/null 2>/dev/null)
            if [[ -n "$host_ip" && -n "${_SEEN_HOST[$host_ip]}" ]]; then
                echo " Skipping $ip (same host as ${_SEEN_HOST[$host_ip]}, ETH_IF: $host_ip)"
                continue
            fi
            [[ -n "$host_ip" ]] && _SEEN_HOST["$host_ip"]="$ip"
            COPY_PEER_NODES+=("$ip")
            echo " Found GB10 copy host: $ip"
        done < <(sort "$temp_file")
        rm -f "$temp_file"
    fi
}

# Save discovered configuration to .env
# Skips if .env already exists unless FORCE_DISCOVER=true
save_config() {
    local env_file="${CONFIG_FILE:-$SCRIPT_DIR/.env}"

    # Skip if .env exists and not forced
    if [[ -f "$env_file" && "${FORCE_DISCOVER:-false}" != "true" ]]; then
        return 0
    fi

    echo ""
    local save_prompt="Save discovered configuration to $env_file?"
    if [[ -f "$env_file" ]]; then
        save_prompt="Overwrite existing configuration in $env_file?"
    fi
    read -r -p "$save_prompt [Y/n]: " response
    response="${response,,}"
    if [[ "$response" =~ ^(n|no)$ ]]; then
        return 0
    fi

    # Build list of all cluster nodes (local + peers)
    local all_cluster_nodes=()
    if [[ -n "$LOCAL_IP" ]]; then
        all_cluster_nodes+=("$LOCAL_IP")
    fi
    for node in "${PEER_NODES[@]}"; do
        all_cluster_nodes+=("$node")
    done

    # Per-node confirmation for CLUSTER_NODES
    echo ""
    echo "Select nodes for CLUSTER_NODES:"
    local selected_cluster=()
    for node in "${all_cluster_nodes[@]}"; do
        local label="$node"
        [[ "$node" == "$LOCAL_IP" ]] && label="$node (this machine)"
        read -r -p " Include $label? [Y/n]: " r
        r="${r,,}"
        if [[ ! "$r" =~ ^(n|no)$ ]]; then
            selected_cluster+=("$node")
        fi
    done

    if [[ "${#selected_cluster[@]}" -eq 0 ]]; then
        echo "No nodes selected. Aborting save."
        return 1
    fi

    # Per-node confirmation for COPY_HOSTS
    echo ""
    echo "Select nodes for COPY_HOSTS (build/model distribution):"
    local selected_copy=()
    for node in "${COPY_PEER_NODES[@]}"; do
        read -r -p " Include $node in COPY_HOSTS? [Y/n]: " r
        r="${r,,}"
        if [[ ! "$r" =~ ^(n|no)$ ]]; then
            selected_copy+=("$node")
        fi
    done

    # Write .env
    {
        echo "# Auto-generated by autodiscover.sh"
        echo "CLUSTER_NODES=$(IFS=,; echo "${selected_cluster[*]}")"
        if [[ "${#selected_copy[@]}" -gt 0 ]]; then
            echo "COPY_HOSTS=$(IFS=,; echo "${selected_copy[*]}")"
        fi
        echo "LOCAL_IP=$LOCAL_IP"
        echo "ETH_IF=$ETH_IF"
        echo "IB_IF=$IB_IF"
        if [[ "$MESH_MODE" == "true" ]]; then
            echo "# Mesh mode NCCL settings"
            echo "CONTAINER_NCCL_NET_PLUGIN=none"
            echo "CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1"
            echo "CONTAINER_NCCL_IB_MERGE_NICS=0"
        fi
    } > "$env_file"
    echo ""
    echo "Saved to $env_file"
}

# Convenience function: run full autodiscovery pipeline
run_autodiscover() {
    detect_interfaces || return 1
    detect_local_ip || return 1
    detect_nodes || return 1
    detect_copy_hosts || return 1
    save_config
}
build-and-copy.sh

@@ -6,35 +6,76 @@ START_TIME=$(date +%s)

# Default values
IMAGE_TAG="vllm-node"
IMAGE_TAG_SET=false
REBUILD_FLASHINFER=false
REBUILD_VLLM=false
COPY_HOSTS=()
COPY_TO_FLAG=false
SSH_USER="$USER"
NO_BUILD=false
VLLM_REF="main"
VLLM_REF_SET=false
FLASHINFER_REF="main"
FLASHINFER_REF_SET=false
TMP_IMAGE=""
PARALLEL_COPY=false
EXP_MXFP4=false
VLLM_PRS=""
FLASHINFER_PRS=""
PRE_TRANSFORMERS=false
FULL_LOG=false
BUILD_JOBS="16"
GPU_ARCH_LIST="12.1a"
NETWORK_ARG=""
WHEELS_REPO="eugr/spark-vllm-docker"
FLASHINFER_RELEASE_TAG="prebuilt-flashinfer-current"
VLLM_RELEASE_TAG="prebuilt-vllm-current"
# Space-separated list of GPU architectures for which prebuilt wheels are available
PREBUILT_WHEELS_SUPPORTED_ARCHS="12.1a"
CLEANUP_MODE="false"
CONFIG_FILE=""

cleanup() {
    if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
        echo "Cleaning up temporary image $TMP_IMAGE"
        rm -f "$TMP_IMAGE"
    fi
    rm -f ./build-metadata.yaml
}

trap cleanup EXIT

generate_build_metadata() {
    local dockerfile="$1"
    local vllm_version="$2"
    local vllm_commit="$3"
    local flashinfer_commit="$4"
    local vllm_ref="$5"
    local pre_transformers="$6"
    local exp_mxfp4="$7"
    local vllm_prs="$8"

    local base_image
    base_image=$(grep -m1 '^FROM .* AS runner' "$dockerfile" | awk '{print $2}')

    cat > ./build-metadata.yaml <<EOF
build_date: $(date -u +"%Y-%m-%dT%H:%M:%SZ")
build_script_commit: $(git rev-parse HEAD 2>/dev/null || echo "unknown")
vllm_version: ${vllm_version:-unknown}
vllm_commit: ${vllm_commit:-unknown}
flashinfer_commit: ${flashinfer_commit:-unknown}
gpu_arch: ${GPU_ARCH_LIST}
base_image: ${base_image:-unknown}
build_args:
  vllm_ref: ${vllm_ref}
  transformers_5: ${pre_transformers}
  exp_mxfp4: ${exp_mxfp4}
  vllm_prs: "${vllm_prs}"
build_jobs: ${BUILD_JOBS}
EOF
    echo "Generated build-metadata.yaml"
}

add_copy_hosts() {
    local token part
    for token in "$@"; do

@@ -65,7 +106,12 @@ copy_to_host() {
# try_download_wheels TAG PREFIX
# Downloads wheels matching PREFIX*.whl from a GitHub release.
# Skip conditions (either is sufficient):
#   1. Commit hash in release name matches wheels/.{PREFIX}-commit (primary check).
#   2. All local wheels are newer than the latest GitHub asset (freshly built).
# Only downloads a file when the remote asset is newer than the local copy AND
# the above skip conditions are not met.
# On success, persists the release commit hash to wheels/.{PREFIX}-commit.
# Returns 0 if all matching wheels are now available, 1 on any error.
try_download_wheels() {
    local TAG="$1"
@@ -91,7 +137,7 @@ try_download_wheels() {

    local DOWNLOAD_LIST
    DOWNLOAD_LIST=$(echo "$RELEASE_JSON" | python3 -c '
import json, sys, os, re
from datetime import datetime, timezone

wheels_dir, prefix = sys.argv[1], sys.argv[2]

@@ -103,6 +149,31 @@ if not assets:

    print("No assets found matching prefix: " + prefix, file=sys.stderr)
    sys.exit(1)

# Extract commit hash from the release name:
#   FlashInfer: "Prebuilt FlashInfer Wheels (0.6.5-124a2d32-d20260305) - DGX Spark Only"
#   vLLM: "Prebuilt vLLM Wheels (0.16.1rc1.dev296+ga73af584f.d20260305.cu131) - DGX Spark only"
release_name = data.get("name", "")
commit_hash = None
if prefix.startswith("flashinfer"):
    m = re.search(r"\([\d.]+\w*-([0-9a-f]{6,})-d\d{8}\)", release_name, re.IGNORECASE)
    if m:
        commit_hash = m.group(1)
else:
    m = re.search(r"\+g([0-9a-f]{6,})\.", release_name, re.IGNORECASE)
    if m:
        commit_hash = m.group(1)

# Compare against the locally stored commit hash
commit_file = os.path.join(wheels_dir, "." + prefix + "-commit")
local_commit = None
if os.path.exists(commit_file):
    with open(commit_file) as f:
        local_commit = f.read().strip()

if commit_hash and local_commit and local_commit[:len(commit_hash)] == commit_hash:
    print("Commit hash matches (" + commit_hash + ") — wheels are up to date.", file=sys.stderr)
    sys.exit(0)

newest_remote_ts = max(
    datetime.strptime(a["updated_at"], "%Y-%m-%dT%H:%M:%SZ")
    .replace(tzinfo=timezone.utc).timestamp()

@@ -118,12 +189,19 @@ local_wheels = [

if local_wheels and all(os.path.getmtime(p) >= newest_remote_ts for p in local_wheels):
    sys.exit(0)

downloads = []
for a in assets:
    local_path = os.path.join(wheels_dir, a["name"])
    remote_ts = datetime.strptime(a["updated_at"], "%Y-%m-%dT%H:%M:%SZ") \
        .replace(tzinfo=timezone.utc).timestamp()
    if not os.path.exists(local_path) or remote_ts > os.path.getmtime(local_path):
        downloads.append(a["browser_download_url"] + " " + a["name"])

if downloads:
    if commit_hash:
        print("#commit:" + commit_hash)
    for d in downloads:
        print(d)
' "$WHEELS_DIR" "$PREFIX") || return 1
if [ -z "$DOWNLOAD_LIST" ]; then
|
||||
@@ -131,16 +209,36 @@ for a in assets:
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Parse the optional '#commit:HASH' sentinel emitted by the Python script
|
||||
local REMOTE_COMMIT=""
|
||||
local DOWNLOAD_ENTRIES=""
|
||||
while IFS= read -r LINE; do
|
||||
if [[ "$LINE" == "#commit:"* ]]; then
|
||||
REMOTE_COMMIT="${LINE#"#commit:"}"
|
||||
elif [[ -n "$LINE" ]]; then
|
||||
DOWNLOAD_ENTRIES+="$LINE"$'\n'
|
||||
fi
|
||||
done <<< "$DOWNLOAD_LIST"
|
||||
|
||||
if [ -z "$DOWNLOAD_ENTRIES" ]; then
|
||||
echo "All $PREFIX wheels are up to date — skipping download."
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Back up existing wheels so we never leave a mix of old and new on failure
|
||||
local DL_BACKUP="$WHEELS_DIR/.backup-download-${PREFIX}"
|
||||
rm -rf "$DL_BACKUP" && mkdir -p "$DL_BACKUP"
|
||||
for f in "$WHEELS_DIR/${PREFIX}"*.whl; do
|
||||
[ -f "$f" ] && mv "$f" "$DL_BACKUP/"
|
||||
done
|
||||
for f in "$WHEELS_DIR/.${PREFIX}"*; do
|
||||
[ -f "$f" ] && mv "$f" "$DL_BACKUP/"
|
||||
done
|
||||
|
||||
local URL NAME TMP_WHL
|
||||
local DOWNLOADED=()
|
||||
while IFS=' ' read -r URL NAME; do
|
||||
[ -z "$URL" ] && continue
|
||||
echo "Downloading $NAME..."
|
||||
TMP_WHL=$(mktemp "$WHEELS_DIR/${NAME}.XXXXXX")
|
||||
if curl -L --progress-bar --connect-timeout 30 "$URL" -o "$TMP_WHL"; then
|
||||
@@ -153,24 +251,30 @@ for a in assets:
|
||||
if compgen -G "$DL_BACKUP/${PREFIX}*.whl" > /dev/null 2>&1; then
|
||||
echo "Restoring previous $PREFIX wheels..."
|
||||
mv "$DL_BACKUP/${PREFIX}"*.whl "$WHEELS_DIR/"
|
||||
mv "$DL_BACKUP/.${PREFIX}"* "$WHEELS_DIR/"
|
||||
fi
|
||||
rm -rf "$DL_BACKUP"
|
||||
return 1
|
||||
fi
|
||||
done <<< "$DOWNLOAD_LIST"
|
||||
done <<< "$DOWNLOAD_ENTRIES"
|
||||
|
||||
rm -rf "$DL_BACKUP"
|
||||
if [ -n "$REMOTE_COMMIT" ]; then
|
||||
echo "$REMOTE_COMMIT" > "$WHEELS_DIR/.${PREFIX}-commit"
|
||||
echo "Recorded $PREFIX commit hash: $REMOTE_COMMIT"
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
# Help function
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo " -t, --tag <tag>               : Image tag (default: 'vllm-node', 'vllm-node-tf5' with --tf5, 'vllm-node-mxfp4' with --exp-mxfp4)"
    echo " --gpu-arch <arch>             : GPU architecture (default: '12.1a')"
    echo " --rebuild-flashinfer          : Force rebuild of FlashInfer wheels (ignore cached wheels)"
    echo " --rebuild-vllm                : Force rebuild of vLLM wheels (ignore cached wheels)"
    echo " --vllm-ref <ref>              : vLLM commit SHA, branch or tag (default: 'main')"
    echo " --flashinfer-ref <ref>        : FlashInfer commit SHA, branch or tag (default: 'main')"
    echo " -c, --copy-to <hosts>         : Host(s) to copy the image to. Accepts comma or space-delimited lists."
    echo " --copy-to-host                : Alias for --copy-to (backwards compatibility)."
    echo " --copy-parallel               : Copy to all hosts in parallel instead of serially."

@@ -179,47 +283,34 @@ usage() {

    echo " --tf5                         : Install transformers>=5 (aliases: --pre-tf, --pre-transformers)"
    echo " --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support"
    echo " --apply-vllm-pr <pr-num>      : Apply a specific PR patch to vLLM source. Can be specified multiple times."
    echo " --apply-flashinfer-pr <pr-num>: Apply a specific PR patch to FlashInfer source. Can be specified multiple times."
    echo " --full-log                    : Enable full build logging (--progress=plain)"
    echo " --no-build                    : Skip building, only copy image (requires --copy-to)"
    echo " --network <network>           : Docker network to use during build"
    echo " --cleanup                     : Remove all *.whl and *.-commit files in wheels directory"
    echo " --config                      : Path to .env configuration file (default: .env in script directory)"
    echo " --setup                       : Force autodiscovery and save configuration (even if .env exists)"
    echo " -h, --help                    : Show this help message"
    exit 1
}

# Parse all arguments
CONFIG_FILE_SET=false
while [[ "$#" -gt 0 ]]; do
    case $1 in
        -t|--tag) IMAGE_TAG="$2"; IMAGE_TAG_SET=true; shift ;;
        --gpu-arch) GPU_ARCH_LIST="$2"; shift ;;
        --rebuild-flashinfer) REBUILD_FLASHINFER=true ;;
        --rebuild-vllm) REBUILD_VLLM=true ;;
        --vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;;
        --flashinfer-ref) FLASHINFER_REF="$2"; FLASHINFER_REF_SET=true; shift ;;
        -c|--copy-to|--copy-to-host|--copy-to-hosts)
            COPY_TO_FLAG=true
            shift
            while [[ "$#" -gt 0 && "$1" != -* ]]; do
                add_copy_hosts "$1"
                shift
            done
            continue
            ;;
        -j|--build-jobs) BUILD_JOBS="$2"; shift ;;

@@ -240,21 +331,100 @@ while [[ "$#" -gt 0 ]]; do
                exit 1
            fi
            ;;
        --apply-flashinfer-pr)
            if [ -n "$2" ] && [[ "$2" != -* ]]; then
                if [ -n "$FLASHINFER_PRS" ]; then
                    FLASHINFER_PRS="$FLASHINFER_PRS $2"
                else
                    FLASHINFER_PRS="$2"
                fi
                shift
            else
                echo "Error: --apply-flashinfer-pr requires a PR number."
                exit 1
            fi
            ;;
        --full-log) FULL_LOG=true ;;
        --no-build) NO_BUILD=true ;;
        --cleanup) CLEANUP_MODE=true ;;
        --network)
            if [ -n "$2" ] && [[ "$2" != -* ]]; then
                NETWORK_ARG="$2"
                shift
            else
                echo "Error: --network requires a network name."
                exit 1
            fi
            ;;
        --config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
        --setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
        -h|--help) usage ;;
        *) echo "Unknown parameter passed: $1"; usage ;;
    esac
    shift
done

# Apply default IMAGE_TAG based on flags if -t was not specified
if [ "$IMAGE_TAG_SET" = false ]; then
    if [ "$PRE_TRANSFORMERS" = true ]; then
        IMAGE_TAG="vllm-node-tf5"
    elif [ "$EXP_MXFP4" = true ]; then
        IMAGE_TAG="vllm-node-mxfp4"
    fi
fi

# Source autodiscover.sh to load .env file
source "$(dirname "$0")/autodiscover.sh"

# If --setup: force full autodiscovery and save configuration
if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
    echo "Running full autodiscovery (--setup)..."
    detect_interfaces || exit 1
    detect_local_ip || exit 1
    detect_nodes || exit 1
    detect_copy_hosts || exit 1
    save_config || exit 1
    # Reload .env so DOTENV_* variables reflect saved config
    load_env_if_exists
fi

# Handle COPY_HOSTS from .env or autodiscovery only if -c was explicitly specified
if [ "$COPY_TO_FLAG" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
    if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
        echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
        IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
        COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
    else
        echo "No hosts specified. Using autodiscovery..."
        detect_interfaces || { echo "Error: Interface detection failed."; exit 1; }
        detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; }
        detect_nodes || { echo "Error: Node detection failed."; exit 1; }
        detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; }

        if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then
            COPY_HOSTS=("${COPY_PEER_NODES[@]}")
        fi

        if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
            echo "Error: Autodiscovery found no other nodes."
            exit 1
        fi
        echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
    fi
fi
# Validate flag combinations
if [ -n "$VLLM_PRS" ]; then
    if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi
fi

if [ -n "$FLASHINFER_PRS" ]; then
    if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-flashinfer-pr is incompatible with --exp-mxfp4"; exit 1; fi
fi

if [ "$EXP_MXFP4" = true ]; then
    if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref"; exit 1; fi
    if [ "$FLASHINFER_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --flashinfer-ref"; exit 1; fi
    if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --tf5"; exit 1; fi
    if [ "$REBUILD_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-flashinfer"; exit 1; fi
    if [ "$REBUILD_VLLM" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-vllm"; exit 1; fi

@@ -266,6 +436,30 @@ if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then

    exit 1
fi

# Handle cleanup mode
if [[ "$CLEANUP_MODE" == "true" ]]; then
    WHEELS_DIR="./wheels"
    echo "Cleaning up wheels directory..."

    # Remove all .whl files
    if compgen -G "$WHEELS_DIR/*.whl" > /dev/null 2>&1; then
        rm -f "$WHEELS_DIR"/*.whl
        echo "Removed *.whl files from $WHEELS_DIR"
    else
        echo "No *.whl files found in $WHEELS_DIR"
    fi

    # Remove all .*-commit files
    if compgen -G "$WHEELS_DIR/.*-commit" > /dev/null 2>&1; then
        rm -f "$WHEELS_DIR"/.*-commit
        echo "Removed .*-commit files from $WHEELS_DIR"
    else
        echo "No .*-commit files found in $WHEELS_DIR"
    fi

    echo "Cleanup complete."
fi

# Ensure wheels directory exists
mkdir -p ./wheels

@@ -277,6 +471,9 @@ fi

COMMON_BUILD_FLAGS+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS")
COMMON_BUILD_FLAGS+=("--build-arg" "TORCH_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
COMMON_BUILD_FLAGS+=("--build-arg" "FLASHINFER_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
if [ -n "$NETWORK_ARG" ]; then
    COMMON_BUILD_FLAGS+=("--network" "$NETWORK_ARG")
fi

# =====================================================
# Build image (unless --no-build or --exp-mxfp4)

@@ -288,6 +485,13 @@ RUNNER_BUILD_TIME=0
if [ "$NO_BUILD" = false ]; then
|
||||
if [ "$EXP_MXFP4" = true ]; then
|
||||
echo "Building with experimental MXFP4 support..."
|
||||
|
||||
# Generate build metadata YAML for mxfp4 build
|
||||
MXFP4_VLLM_SHA=$(grep -m1 '^ARG VLLM_SHA=' Dockerfile.mxfp4 | cut -d= -f2)
|
||||
MXFP4_FLASHINFER_SHA=$(grep -m1 '^ARG FLASHINFER_SHA=' Dockerfile.mxfp4 | cut -d= -f2)
|
||||
generate_build_metadata Dockerfile.mxfp4 "unknown" "$MXFP4_VLLM_SHA" "$MXFP4_FLASHINFER_SHA" \
|
||||
"mxfp4-pinned" "false" "true" ""
|
||||
|
||||
CMD=("docker" "build" "-t" "$IMAGE_TAG" "${COMMON_BUILD_FLAGS[@]}" "-f" "Dockerfile.mxfp4" ".")
|
||||
echo "Building image with command: ${CMD[*]}"
|
||||
BUILD_START=$(date +%s)
|
||||
@@ -298,9 +502,21 @@ if [ "$NO_BUILD" = false ]; then
|
||||
# ----------------------------------------------------------
|
||||
# Phase 1: FlashInfer wheels
|
||||
# ----------------------------------------------------------
|
||||
if [ "$FLASHINFER_REF_SET" = true ] || [ -n "$FLASHINFER_PRS" ]; then
|
||||
REBUILD_FLASHINFER=true
|
||||
fi
|
||||
|
||||
BUILD_FLASHINFER=false
|
||||
if [ "$REBUILD_FLASHINFER" = true ]; then
|
||||
echo "Rebuilding FlashInfer wheels (--rebuild-flashinfer specified)..."
|
||||
if [ "$FLASHINFER_REF_SET" = true ] && [ -n "$FLASHINFER_PRS" ]; then
|
||||
echo "Rebuilding FlashInfer wheels (--flashinfer-ref and --apply-flashinfer-pr specified)..."
|
||||
elif [ "$FLASHINFER_REF_SET" = true ]; then
|
||||
echo "Rebuilding FlashInfer wheels (--flashinfer-ref specified)..."
|
||||
elif [ -n "$FLASHINFER_PRS" ]; then
|
||||
echo "Rebuilding FlashInfer wheels (--apply-flashinfer-pr specified)..."
|
||||
else
|
||||
echo "Rebuilding FlashInfer wheels (--rebuild-flashinfer specified)..."
|
||||
fi
|
||||
BUILD_FLASHINFER=true
|
||||
elif try_download_wheels "$FLASHINFER_RELEASE_TAG" "flashinfer"; then
|
||||
echo "FlashInfer wheels ready."
|
||||
@@ -322,12 +538,18 @@ if [ "$NO_BUILD" = false ]; then

        FI_CMD=("docker" "build"
            "--target" "flashinfer-export"
            "--output" "type=local,dest=./wheels"
            "${COMMON_BUILD_FLAGS[@]}"
            "--build-arg" "FLASHINFER_REF=$FLASHINFER_REF")

        if [ "$REBUILD_FLASHINFER" = true ]; then
            FI_CMD+=("--build-arg" "CACHEBUST_FLASHINFER=$(date +%s)")
        fi

        if [ -n "$FLASHINFER_PRS" ]; then
            echo "Applying FlashInfer PRs: $FLASHINFER_PRS"
            FI_CMD+=("--build-arg" "FLASHINFER_PRS=$FLASHINFER_PRS")
        fi

        FI_CMD+=(".")

        echo "FlashInfer build command: ${FI_CMD[*]}"

@@ -347,30 +569,32 @@ if [ "$NO_BUILD" = false ]; then

    # ----------------------------------------------------------
    # Phase 2: vLLM wheels
    # ----------------------------------------------------------
    VLLM_WHEELS_EXIST=false
    if compgen -G "./wheels/vllm*.whl" > /dev/null 2>&1; then
        VLLM_WHEELS_EXIST=true
    fi

    if [ "$VLLM_REF_SET" = true ] || [ -n "$VLLM_PRS" ]; then
        REBUILD_VLLM=true
    fi

    BUILD_VLLM=false
    if [ "$REBUILD_VLLM" = true ]; then
        if [ "$VLLM_REF_SET" = true ] && [ -n "$VLLM_PRS" ]; then
            echo "Rebuilding vLLM wheels (--vllm-ref and --apply-vllm-pr specified)..."
        elif [ "$VLLM_REF_SET" = true ]; then
            echo "Rebuilding vLLM wheels (--vllm-ref specified)..."
        elif [ -n "$VLLM_PRS" ]; then
            echo "Rebuilding vLLM wheels (--apply-vllm-pr specified)..."
        else
            echo "Rebuilding vLLM wheels (--rebuild-vllm specified)..."
        fi
        BUILD_VLLM=true
    elif try_download_wheels "$VLLM_RELEASE_TAG" "vllm"; then
        echo "vLLM wheels ready."
    elif compgen -G "./wheels/vllm*.whl" > /dev/null 2>&1; then
        echo "Download failed — using existing local vLLM wheels."
    else
        echo "No vLLM wheels available (download failed) — building..."
        BUILD_VLLM=true
    fi

    if [ "$BUILD_VLLM" = true ]; then
        # Back up existing vllm wheels; restore them if the build fails
        VLLM_BACKUP="./wheels/.backup-vllm"
        rm -rf "$VLLM_BACKUP" && mkdir -p "$VLLM_BACKUP"
@@ -393,7 +617,6 @@ if [ "$NO_BUILD" = false ]; then

            VLLM_CMD+=("--build-arg" "VLLM_PRS=$VLLM_PRS")
        fi

        VLLM_CMD+=(".")

        echo "vLLM build command: ${VLLM_CMD[*]}"

@@ -408,8 +631,6 @@ if [ "$NO_BUILD" = false ]; then

            rm -rf "$VLLM_BACKUP"
            exit 1
        fi
    fi

    # ----------------------------------------------------------

@@ -420,6 +641,15 @@ if [ "$NO_BUILD" = false ]; then

        exit 1
    fi

    # Generate build metadata YAML
    VLLM_VERSION=$(ls ./wheels/vllm-*.whl 2>/dev/null | head -1 | sed 's|.*/vllm-||;s|-.*||')
    VLLM_COMMIT=""
    [ -f "./wheels/.vllm-commit" ] && VLLM_COMMIT=$(cat ./wheels/.vllm-commit)
    FLASHINFER_COMMIT=""
    [ -f "./wheels/.flashinfer-commit" ] && FLASHINFER_COMMIT=$(cat ./wheels/.flashinfer-commit)
    generate_build_metadata Dockerfile "$VLLM_VERSION" "$VLLM_COMMIT" "$FLASHINFER_COMMIT" \
        "$VLLM_REF" "$PRE_TRANSFORMERS" "false" "$VLLM_PRS"

    RUNNER_CMD=("docker" "build"
        "-t" "$IMAGE_TAG"
        "${COMMON_BUILD_FLAGS[@]}")
@@ -42,13 +42,54 @@ However, in order to get full bandwidth in NCCL RDMA mode, we need to utilize **

Also, note that connecting two Sparks using **both** ports won't give you any noticeable advantage in bandwidth, so a single connection is sufficient.
If you connect 3 Sparks by daisy-chaining them, you will only be able to sustain 100G between each pair of Sparks.

## Connecting 3 Sparks in a mesh cluster without a switch

Three Sparks can be connected together in a cluster without using a separate RoCE switch.
However, all three Sparks need to be on the same wired network using their 10G Ethernet port (RJ-45, not QSFP). Being on the same wireless network should work too, but it is not recommended and was not tested.

You need to make sure they are connected the following way: port 0 on one Spark should connect to port 1 on another Spark (unlike the non-mesh configuration).
Example diagram:
```mermaid
block-beta
  columns 1

  block:Spark3
    columns 2
    Title3["Spark 3"]:2
    s3p0["Port 0<br>192.168.187.13<br>192.168.188.13"] s3p1["Port 1<br>192.168.197.13<br>192.168.198.13"]
  end

  space

  block:Spark2
    columns 2
    Title2["Spark 2"]:2
    s2p0["Port 0<br>192.168.197.12<br>192.168.198.12"] s2p1["Port 1<br>192.168.177.12<br>192.168.178.12"]
  end

  space

  block:Spark1
    columns 2
    Title1["Spark 1"]:2
    s1p0["Port 0<br>192.168.177.11<br>192.168.178.11"] s1p1["Port 1<br>192.168.187.11<br>192.168.188.11"]
  end

  s1p0 <--> s2p1
  s2p0 <--> s3p1
  s3p0 <--> s1p1
```
## Connecting more than 2 Sparks in the cluster using a switch

To connect more than 2 Sparks, you will need a proper switch, for example the [MikroTik CRS812-DDQ](https://mikrotik.com/product/crs812_ddq).
Please refer to [this post](https://forums.developer.nvidia.com/t/6x-spark-setup/354399/56) for an example of setting up a 6-8 node Spark cluster.

## Network setup

### For dual Sparks or multiple Sparks using a QSFP switch

Assuming both are connected using the rightmost QSFP port (when looking from the back).

Create `/etc/netplan/40-cx7.yaml` on `spark`:
@@ -58,15 +99,16 @@ network:

  ethernets:
    enp1s0f1np1:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.177.11/24]
    enP2p1s0f1np1:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.178.11/24]
```
Create `/etc/netplan/40-cx7.yaml` on `spark2`:

@@ -76,23 +118,19 @@ network:

  ethernets:
    enp1s0f1np1:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.177.12/24]
    enP2p1s0f1np1:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.178.12/24]
```
Please note that only one interface of the "twin" pair strictly needs an IP address, but MTU needs to be set on both.
You can also assign a separate address to the other "twin" if you want to utilize the second interface independently, but make sure you assign it an IP address from a different subnet, as in the examples above.

**DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing.
This will not affect vLLM performance, as vLLM will use RDMA over RoCE across both "twins" even if the IP is only set on one.
Then run on each node:

@@ -115,6 +153,122 @@ MTU setting (testing):

sudo ip link set dev enp1s0f1np1 mtu 9000
```
### For 3-node mesh

A 3-node mesh is configured differently than dual clusters or clusters using a QSFP switch.

Assuming your Sparks are connected according to the diagram above:

Create `/etc/netplan/40-cx7.yaml` on `spark1`:
```yaml
network:
  version: 2
  ethernets:
    enp1s0f0np0:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.177.11/24]
    enP2p1s0f0np0:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.178.11/24]
    enp1s0f1np1:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.187.11/24]
    enP2p1s0f1np1:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.188.11/24]
```

Create `/etc/netplan/40-cx7.yaml` on `spark2`:
```yaml
network:
  version: 2
  ethernets:
    enp1s0f0np0:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.197.12/24]
    enP2p1s0f0np0:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.198.12/24]
    enp1s0f1np1:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.177.12/24]
    enP2p1s0f1np1:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.178.12/24]
```

Create `/etc/netplan/40-cx7.yaml` on `spark3`:
```yaml
network:
  version: 2
  ethernets:
    enp1s0f0np0:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.187.13/24]
    enP2p1s0f0np0:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.188.13/24]
    enp1s0f1np1:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.197.13/24]
    enP2p1s0f1np1:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.198.13/24]
```
Then run (on each Spark):

```bash
sudo chmod 600 /etc/netplan/40-cx7.yaml
sudo netplan apply
```

### Passwordless SSH and benchmarks

Set up passwordless ssh. On the first spark:

```bash
wget https://raw.githubusercontent.com/NVIDIA/dgx-spark-playbooks/refs/heads/main/nvidia/connect-two-sparks/assets/discover-sparks
chmod +x discover-sparks
./discover-sparks
```

**Benchmark connection (use perftest package):**

Run the receiver on `spark2` node:

@@ -196,7 +350,9 @@ ib_write_lat 192.168.177.12 -d rocep1s0f1 --report_gbits -R --force-link IB

---------------------------------------------------------------------------------------
```
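The receiver-side command and its output were truncated in this diff view. Based on the client invocation preserved in the hunk header above, a plausible receiver invocation is the same command without the peer IP (the usual perftest server convention):

```bash
# On spark2 (server side); assumption: perftest servers take the same flags minus the peer IP
ib_write_lat -d rocep1s0f1 --report_gbits -R --force-link IB
```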
## NCCL Tests

### Dual Sparks or Sparks via QSFP switch

From https://build.nvidia.com/spark/nccl/stacked-sparks
@@ -239,4 +395,52 @@ mpirun -np 2 -H 192.168.177.11:1,192.168.177.12:1 \

    -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH \
    $HOME/nccl-tests/build/all_gather_perf -b 16G -e 16G -f 2

```
### 3-node mesh

```bash
# Install dependencies and build NCCL
sudo apt-get update && sudo apt-get install -y libopenmpi-dev
git clone -b dgxspark-3node-ring https://github.com/zyang-dev/nccl.git ~/nccl/
cd ~/nccl/
make -j src.build NVCC_GENCODE="-gencode=arch=compute_121,code=sm_121"

# Set environment variables
export CUDA_HOME="/usr/local/cuda"
export MPI_HOME="/usr/lib/aarch64-linux-gnu/openmpi"
export NCCL_HOME="$HOME/nccl/build/"
export LD_LIBRARY_PATH="$NCCL_HOME/lib:$CUDA_HOME/lib64/:$MPI_HOME/lib:$LD_LIBRARY_PATH"
```

Build NCCL Test Suite:

```bash
# Clone and build NCCL tests
git clone https://github.com/NVIDIA/nccl-tests.git ~/nccl-tests/
cd ~/nccl-tests/
make MPI=1
```

Test on all three nodes (replace spark1, spark2, spark3 with the actual hostnames or IP addresses on the non-QSFP interface):

```bash
# Set environment variables (NCCL_HOME points at the fork built above)
export CUDA_HOME="/usr/local/cuda"
export MPI_HOME="/usr/lib/aarch64-linux-gnu/openmpi"
export NCCL_HOME="$HOME/nccl/build/"
export LD_LIBRARY_PATH="$NCCL_HOME/lib:$CUDA_HOME/lib64/:$MPI_HOME/lib:$LD_LIBRARY_PATH"

# For 3-node mesh we have to use the 10G interface for OOB communication!
export UCX_NET_DEVICES=enP7s7
export NCCL_SOCKET_IFNAME=enP7s7
export OMPI_MCA_btl_tcp_if_include=enP7s7
export NCCL_IB_HCA=rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1
export NCCL_IB_DISABLE=0

# Run the all_gather performance test across all three nodes
mpirun -np 3 -H spark1:1,spark2:1,spark3:1 \
    --mca plm_rsh_agent "ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" \
    -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH -x NCCL_IB_MERGE_NICS=0 -x NCCL_NET_PLUGIN=none -x NCCL_IB_SUBNET_AWARE_ROUTING=1 \
    $HOME/nccl-tests/build/all_gather_perf -b 16G -e 16G -f 3
```
hf-download.sh

@@ -7,6 +7,8 @@ HUB_PATH="${HF_HOME:-$HOME/.cache/huggingface}/hub"

COPY_HOSTS=()
SSH_USER="$USER"
PARALLEL_COPY=false
CONFIG_FILE=""
CONFIG_FILE_SET=false

# Help function
usage() {
@@ -16,6 +18,7 @@ usage() {

    echo " --copy-to-host       : Alias for --copy-to (backwards compatibility)."
    echo " --copy-parallel      : Copy to all hosts in parallel instead of serially."
    echo " -u, --user <user>    : Username for ssh commands (default: \$USER)"
    echo " --config <file>      : Path to .env configuration file (default: .env in script directory)"
    echo " -h, --help           : Show this help message"
    exit 1
}

@@ -37,11 +40,11 @@ copy_model_to_host() {

    local host="$1"
    local model_name="$2"
    local model_dir="$3"

    echo "Copying model '$model_name' to ${SSH_USER}@${host}..."
    local host_copy_start host_copy_end host_copy_time
    host_copy_start=$(date +%s)

    if rsync -av --mkpath --progress "$model_dir" "${SSH_USER}@${host}:$HUB_PATH/"; then
        host_copy_end=$(date +%s)
        host_copy_time=$((host_copy_end - host_copy_start))

@@ -53,44 +56,24 @@ copy_model_to_host() {

}
# Argument parsing
COPY_TO_FLAG=false
while [[ "$#" -gt 0 ]]; do
    case $1 in
        -c|--copy-to|--copy-to-host|--copy-to-hosts)
            COPY_TO_FLAG=true
            shift
            # Consume arguments until the next flag or end of args
            while [[ "$#" -gt 0 && "$1" != -* ]]; do
                add_copy_hosts "$1"
                shift
            done
            continue
            ;;
        --copy-parallel) PARALLEL_COPY=true ;;
        -u|--user) SSH_USER="$2"; shift ;;
        --config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
        -h|--help) usage ;;
        *)
            # If positional argument is provided
            if [ -z "${MODEL_NAME:-}" ]; then
                MODEL_NAME="$1"

@@ -103,12 +86,47 @@ while [[ "$#" -gt 0 ]]; do

    shift
done
# Export config so autodiscover.sh picks it up
export CONFIG_FILE CONFIG_FILE_SET

# Source autodiscover.sh to load .env (for DOTENV_COPY_HOSTS) and make detection functions available
source "$(dirname "$0")/autodiscover.sh"

# Validate model name is provided
if [ -z "${MODEL_NAME:-}" ]; then
    echo "Error: Model name is required."
    usage
fi

# Resolve COPY_HOSTS if --copy-to was given without hosts, or use .env
if [ "$COPY_TO_FLAG" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
    # --copy-to was specified but no hosts given: use .env or autodiscover
    if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
        echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
        IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
        COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
    else
        echo "No hosts specified. Using autodiscovery..."
        detect_interfaces || { echo "Error: Interface detection failed."; exit 1; }
        detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; }
        detect_nodes || { echo "Error: Node detection failed."; exit 1; }
        detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; }

        if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then
            COPY_HOSTS=("${COPY_PEER_NODES[@]}")
        fi

        if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
            echo "Error: Autodiscovery found no other nodes."
            exit 1
        fi
        echo "Autodiscovered copy hosts: ${COPY_HOSTS[*]}"
    fi
elif [ "$COPY_TO_FLAG" = false ] && [ "${#COPY_HOSTS[@]}" -eq 0 ] && [[ -n "$DOTENV_COPY_HOSTS" ]]; then
    # No --copy-to flag but .env has COPY_HOSTS — don't auto-copy; user must request it explicitly
    : # intentional no-op; user didn't ask for copy
fi

# Check if uvx is installed
if ! command -v uvx &> /dev/null; then
    echo "Error: 'uvx' command not found."

@@ -231,4 +249,4 @@ if [ "$COPY_TIME" -gt 0 ]; then

fi
echo "Total: $(printf '%02d:%02d:%02d' $((TOTAL_TIME/3600)) $((TOTAL_TIME%3600/60)) $((TOTAL_TIME%60)))"
echo "========================================="
echo "Done downloading $MODEL_NAME."
launch-cluster.sh

@@ -16,6 +16,7 @@ fi

ETH_IF=""
IB_IF=""
NCCL_DEBUG_VAL=""
MASTER_PORT="29501"

# Initialize variables
NODES_ARG=""

@@ -23,15 +24,18 @@ CONTAINER_NAME="$DEFAULT_CONTAINER_NAME"

COMMAND_TO_RUN=""
DAEMON_MODE="false"
CHECK_CONFIG="false"
ACTION=""
CLUSTER_WAS_RUNNING="false"
MOD_PATHS=()
MOD_TYPES=()
LAUNCH_SCRIPT_PATH=""
SCRIPT_DIR="$(dirname "$(realpath "$0")")"
CONFIG_FILE=""  # Will be set to default after argument parsing

ACTIONS_ARG=""
SOLO_MODE="false"
NO_RAY_MODE="false"
LAUNCH_SCRIPT_MODE="false"
MOUNT_CACHE_DIRS="true"
BUILD_JOBS=""
NON_PRIVILEGED_MODE="false"

@@ -55,6 +59,8 @@ usage() {

    echo " --launch-script      Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted."
    echo " --check-config       Check configuration and auto-detection without launching"
    echo " --solo               Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster"
    echo " --master-port        Port for cluster coordination: Ray head port or PyTorch distributed master port (default: 29501)"
    echo " --no-ray             No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)"
    echo " --no-cache-dirs      Do not mount default cache directories (~/.cache/vllm, ~/.cache/flashinfer, ~/.triton)"
    echo " -d                   Daemon mode (only for 'start' action)"
    echo " --non-privileged     Run in non-privileged mode (removes --privileged and --ipc=host)"

@@ -62,9 +68,31 @@ usage() {

    echo " --mem-swap-limit-gb  Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)"
    echo " --pids-limit         Process limit (default: 4096, only with --non-privileged)"
    echo " --shm-size-gb        Shared memory size in GB (default: 64, only with --non-privileged)"
    echo " --config             Path to .env configuration file (default: .env in script directory)"
    echo " --setup/--discover   Force autodiscovery and save configuration (even if .env exists)"
    echo " action               start | stop | status | exec (Default: start). Not compatible with --launch-script."
    echo " command              Command to run (only for 'exec' action). Not compatible with --launch-script."
    echo ""
    echo "Supported .env file variables:"
    echo " CLUSTER_NODES        Comma-separated list of node IPs"
    echo " ETH_IF               Ethernet interface name"
    echo " IB_IF                InfiniBand interface name"
    echo " MASTER_PORT          Port for cluster coordination (default: 29501)"
    echo " CONTAINER_NAME       Container name (default: vllm_node)"
    echo " LOCAL_IP             Local IP address (for solo mode or override auto-detection)"
    echo " CONTAINER_*          Any variable starting with CONTAINER_ (except CONTAINER_NAME)"
    echo "                      becomes -e flag. Example: CONTAINER_NCCL_DEBUG=INFO -> -e NCCL_DEBUG=INFO"
    echo ""
    echo "Example .env file:"
    echo " CLUSTER_NODES=192.168.1.1,192.168.1.2"
    echo " ETH_IF=eth0"
    echo " IB_IF=ib0"
    echo " MASTER_PORT=29501"
    echo " CONTAINER_NAME=vllm_node"
    echo " LOCAL_IP=192.168.1.1"
    echo " CONTAINER_NCCL_DEBUG=INFO"
    echo " CONTAINER_HF_TOKEN=abc123"
    echo ""
    echo "Launch Script Usage:"
    echo " $0 --launch-script examples/my-script.sh  # Script copied to container and executed"
    echo " $0 --launch-script /path/to/script.sh     # Uses absolute path to script"

@@ -91,8 +119,10 @@ while [[ "$#" -gt 0 ]]; do
|
||||
NCCL_DEBUG_VAL="INFO"
|
||||
fi
|
||||
;;
|
||||
--master-port|--head-port) MASTER_PORT="$2"; shift ;;
|
||||
--check-config) CHECK_CONFIG="true" ;;
|
||||
--solo) SOLO_MODE="true" ;;
|
||||
--no-ray) NO_RAY_MODE="true" ;;
|
||||
--no-cache-dirs) MOUNT_CACHE_DIRS="false" ;;
|
||||
--non-privileged) NON_PRIVILEGED_MODE="true" ;;
|
||||
--mem-limit-gb) MEM_LIMIT_GB="$2"; shift ;;
|
||||
@@ -101,6 +131,8 @@ while [[ "$#" -gt 0 ]]; do
|
||||
--shm-size-gb) SHM_SIZE_GB="$2"; shift ;;
|
||||
-d) DAEMON_MODE="true" ;;
|
||||
-h|--help) usage ;;
|
||||
--config) CONFIG_FILE="$2"; shift ;;
|
||||
--setup|--discover) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
|
||||
start|stop|status)
|
||||
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
||||
echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
|
||||
@@ -115,7 +147,7 @@ while [[ "$#" -gt 0 ]]; do
|
||||
fi
|
||||
ACTION="exec"
|
||||
shift
|
||||
COMMAND_TO_RUN="$@"
|
||||
COMMAND_TO_RUN=$(printf "%q " "$@")
|
||||
break
|
||||
;;
|
||||
*)
|
||||
@@ -126,6 +158,115 @@ while [[ "$#" -gt 0 ]]; do
|
||||
shift
|
||||
done
|
||||
|
||||
# Set .env file path (use default if not specified)
|
||||
if [[ -z "$CONFIG_FILE" ]]; then
|
||||
CONFIG_FILE="$SCRIPT_DIR/.env"
|
||||
CONFIG_FILE_SET=false
|
||||
else
|
||||
CONFIG_FILE_SET=true
|
||||
fi
|
||||
|
||||
# Load .env file
|
||||
if [[ -f "$CONFIG_FILE" ]]; then
|
||||
echo "Loading configuration from .env file..."
|
||||
|
||||
# Validate .env file syntax
|
||||
if ! python3 -c "
|
||||
import sys
|
||||
import re
|
||||
|
||||
env_file = '$CONFIG_FILE'
|
||||
seen_keys = set()
|
||||
|
||||
with open(env_file, 'r') as f:
|
||||
for line_num, line in enumerate(f, 1):
|
||||
line = line.strip()
|
||||
# Skip empty lines and comments
|
||||
if not line or line.startswith('#'):
|
||||
continue
|
||||
|
||||
# Check for key=value format
|
||||
if '=' not in line:
|
||||
print(f'Error: Invalid syntax at line {line_num}: missing \"=\"')
|
||||
sys.exit(1)
|
||||
|
||||
key = line.split('=', 1)[0].strip()
|
||||
|
||||
# Validate key format (alphanumeric + underscore)
|
||||
if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', key):
|
||||
print(f'Error: Invalid key format at line {line_num}: {key}')
|
||||
sys.exit(1)
|
||||
|
||||
# Check for duplicates
|
||||
if key in seen_keys:
|
||||
print(f'Error: Duplicate key at line {line_num}: {key}')
|
||||
sys.exit(1)
|
||||
|
||||
seen_keys.add(key)
|
||||
|
||||
sys.exit(0)
|
||||
" 2>/dev/null; then
|
||||
echo "Error: Invalid .env file syntax. Aborting."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Load .env variables with DOTENV_ prefix
|
||||
while IFS='=' read -r key value || [[ -n "$key" ]]; do
|
||||
# Skip comments and empty lines
|
||||
[[ "$key" =~ ^[[:space:]]*# ]] && continue
|
||||
[[ -z "$key" ]] && continue
|
||||
|
||||
# Remove leading/trailing whitespace from key
|
||||
key=$(echo "$key" | xargs)
|
||||
|
||||
# Skip if key is empty after trimming
|
||||
[[ -z "$key" ]] && continue
|
||||
|
||||
# Remove quotes and whitespace from value using Python for proper shlex handling
|
||||
value=$(python3 -c "
|
||||
import shlex
|
||||
import sys
|
||||
value = '''$value'''
|
||||
# Strip whitespace
|
||||
value = value.strip()
|
||||
# Remove surrounding quotes if present
|
||||
if (value.startswith('\"') and value.endswith('\"')) or (value.startswith(\"'\" ) and value.endswith(\"'\")):
|
||||
value = value[1:-1]
|
||||
print(value)
|
||||
")
|
||||
|
||||
# Export with DOTENV_ prefix
|
||||
export "DOTENV_$key=$value"
|
||||
done < "$CONFIG_FILE"
|
||||
|
||||
echo "Loaded .env variables: $(compgen -v DOTENV_ | tr '\n' ' ')"
|
||||
fi
|
||||
|
||||
# Apply .env configuration (CLI args take precedence)
|
||||
if [[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]]; then
|
||||
NODES_ARG="$DOTENV_CLUSTER_NODES"
|
||||
fi
|
||||
|
||||
if [[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]]; then
|
||||
ETH_IF="$DOTENV_ETH_IF"
|
||||
fi
|
||||
|
||||
if [[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]]; then
|
||||
IB_IF="$DOTENV_IB_IF"
|
||||
fi
|
||||
|
||||
if [[ -z "$MASTER_PORT" || "$MASTER_PORT" == "29501" ]] && [[ -n "$DOTENV_MASTER_PORT" ]]; then
|
||||
MASTER_PORT="$DOTENV_MASTER_PORT"
|
||||
fi
|
||||
|
||||
if [[ -z "$CONTAINER_NAME" || "$CONTAINER_NAME" == "vllm_node" ]] && [[ -n "$DOTENV_CONTAINER_NAME" ]]; then
|
||||
CONTAINER_NAME="$DOTENV_CONTAINER_NAME"
|
||||
fi
|
||||
|
||||
if [[ -n "$DOTENV_LOCAL_IP" ]]; then
|
||||
export LOCAL_IP="$DOTENV_LOCAL_IP"
|
||||
fi
|
||||
|
||||
# Validate non-privileged mode flags
|
||||
if [[ "$NON_PRIVILEGED_MODE" == "true" ]]; then
|
||||
# Set default swap limit if not specified
|
||||
@@ -156,6 +297,26 @@ if [[ -n "$NCCL_DEBUG_VAL" ]]; then
|
||||
esac
|
||||
fi
|
||||
|
||||
# Add container environment variables from .env (CONTAINER_* pattern)
|
||||
# Excludes CONTAINER_NAME which is a configuration variable, not an env var
|
||||
for env_var in $(compgen -v DOTENV_CONTAINER_); do
|
||||
# Skip CONTAINER_NAME as it's a configuration variable
|
||||
[[ "$env_var" == "DOTENV_CONTAINER_NAME" ]] && continue
|
||||
|
||||
# Get the value
|
||||
value="${!env_var}"
|
||||
|
||||
# Extract the actual env var name (remove DOTENV_CONTAINER_ prefix)
|
||||
actual_var="${env_var#DOTENV_CONTAINER_}"
|
||||
|
||||
# Properly escape the value for shell using Python
|
||||
escaped_value=$(python3 -c "import shlex; print(shlex.quote('$value'))")
|
||||
|
||||
# Add to docker args
|
||||
DOCKER_ARGS="$DOCKER_ARGS -e $actual_var=$escaped_value"
|
||||
echo "Adding container env: $actual_var"
|
||||
done
|
||||
|
||||
# Add build job parallelization environment variables if BUILD_JOBS is set
|
||||
if [[ -n "$BUILD_JOBS" ]]; then
|
||||
DOCKER_ARGS="$DOCKER_ARGS -e MAX_JOBS=$BUILD_JOBS"
|
||||
@@ -204,9 +365,10 @@ if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
||||
|
||||
# Set command to run the copied script (use absolute path since docker exec may not be in /workspace)
|
||||
COMMAND_TO_RUN="/workspace/exec-script.sh"
|
||||
|
||||
LAUNCH_SCRIPT_MODE="true"
|
||||
|
||||
# If launch script is specified, default action to exec unless explicitly set to stop/status
|
||||
if [[ "$ACTION" == "start" ]]; then
|
||||
if [[ -z "$ACTION" || "$ACTION" == "start" ]]; then
|
||||
ACTION="exec"
|
||||
fi
|
||||
fi
|
||||
@@ -251,13 +413,33 @@ done
|
||||
# Source autodiscover module
|
||||
source "$(dirname "$0")/autodiscover.sh"
|
||||
|
||||
if [[ "$SOLO_MODE" == "true" ]]; then
|
||||
if [[ -n "$NODES_ARG" ]]; then
|
||||
echo "Error: --solo is incompatible with -n/--nodes."
|
||||
exit 1
|
||||
if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
|
||||
# --setup: force full autodiscovery and save configuration
|
||||
echo "Running full autodiscovery (--setup)..."
|
||||
# Clear pre-loaded values so detect functions run fresh instead of short-circuiting
|
||||
ETH_IF="" IB_IF="" NODES_ARG="" LOCAL_IP=""
|
||||
detect_interfaces || exit 1
|
||||
detect_local_ip || exit 1
|
||||
detect_nodes || exit 1
|
||||
detect_copy_hosts || exit 1
|
||||
save_config || exit 1
|
||||
# Reload .env so DOTENV_* variables reflect saved config
|
||||
load_env_if_exists
|
||||
[[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]] && NODES_ARG="$DOTENV_CLUSTER_NODES"
|
||||
[[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]] && ETH_IF="$DOTENV_ETH_IF"
|
||||
[[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]] && IB_IF="$DOTENV_IB_IF"
|
||||
# If no action was specified, setup was the only intent — exit cleanly
|
||||
if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" ]]; then
|
||||
exit 0
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "$SOLO_MODE" == "true" ]]; then
|
||||
# Solo mode: skip node detection, just get local IP
|
||||
LOCAL_IP="127.0.0.1"
|
||||
# Use LOCAL_IP from .env if set, otherwise default to 127.0.0.1
|
||||
if [[ -z "$LOCAL_IP" ]]; then
|
||||
LOCAL_IP="127.0.0.1"
|
||||
fi
|
||||
NODES_ARG="$LOCAL_IP"
|
||||
PEER_NODES=()
|
||||
echo "Solo mode enabled. Skipping node detection."
|
||||
@@ -303,6 +485,11 @@ if [[ "$SOLO_MODE" == "false" && ${#PEER_NODES[@]} -eq 0 ]]; then
|
||||
SOLO_MODE="true"
|
||||
fi
|
||||
|
||||
if [[ "$NO_RAY_MODE" == "true" && "$SOLO_MODE" == "true" ]]; then
|
||||
echo "Warning: Only one node detected; --no-ray has no effect in solo mode. Proceeding normally."
|
||||
NO_RAY_MODE="false"
|
||||
fi
|
||||
|
||||
echo "Head Node: $HEAD_IP"
|
||||
echo "Worker Nodes: ${PEER_NODES[*]}"
|
||||
echo "Container Name: $CONTAINER_NAME"
|
||||
@@ -324,6 +511,12 @@ if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ]
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" && "$CHECK_CONFIG" != "true" ]]; then
|
||||
echo "Error: No action specified. Use: start | stop | status | exec"
|
||||
usage
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ "$CHECK_CONFIG" == "true" ]]; then
|
||||
echo "Configuration Check Complete."
|
||||
echo " Image Name: $IMAGE_NAME"
|
||||
@@ -377,9 +570,11 @@ if [[ "$ACTION" == "status" ]]; then
|
||||
# Check Head
|
||||
if docker ps | grep -q "$CONTAINER_NAME"; then
|
||||
echo "[HEAD] $HEAD_IP: Container '$CONTAINER_NAME' is RUNNING."
|
||||
echo "--- Ray Status ---"
|
||||
docker exec "$CONTAINER_NAME" ray status || echo "Failed to get ray status."
|
||||
echo "------------------"
|
||||
if [[ "$NO_RAY_MODE" == "false" ]]; then
|
||||
echo "--- Ray Status ---"
|
||||
docker exec "$CONTAINER_NAME" ray status || echo "Failed to get ray status."
|
||||
echo "------------------"
|
||||
fi
|
||||
else
|
||||
echo "[HEAD] $HEAD_IP: Container '$CONTAINER_NAME' is NOT running."
|
||||
fi
|
||||
@@ -537,23 +732,109 @@ apply_mod_to_container() {
|
||||
fi
|
||||
}
|
||||
|
||||
# Copy Launch Script to Container Function
|
||||
copy_launch_script_to_container() {
|
||||
local container="$1"
|
||||
local script_path="$2"
|
||||
# Parse -tp/-pp/-dp (and long forms) from a text string (command or script content).
|
||||
# Sets TP_SIZE, PP_SIZE, DP_SIZE, PARALLELISM_FOUND globals.
|
||||
# Only acts when at least one parallelism flag is present.
|
||||
parse_parallelism_from_text() {
|
||||
local text="$1"
|
||||
TP_SIZE=1; PP_SIZE=1; DP_SIZE=1
|
||||
PARALLELISM_FOUND=false
|
||||
|
||||
echo "Copying launch script to head node..."
|
||||
# Normalize --flag=value to --flag value for uniform word-by-word parsing
|
||||
local normalized
|
||||
normalized=$(echo "$text" | sed 's/\(--[a-z-]*\)=/\1 /g')
|
||||
|
||||
local target_script_path="$script_path"
|
||||
local prev=""
|
||||
for word in $normalized; do
|
||||
case "$prev" in
|
||||
-tp|--tensor-parallel-size)
|
||||
[[ "$word" =~ ^[0-9]+$ ]] && TP_SIZE="$word" && PARALLELISM_FOUND=true ;;
|
||||
-pp|--pipeline-parallel-size)
|
||||
[[ "$word" =~ ^[0-9]+$ ]] && PP_SIZE="$word" && PARALLELISM_FOUND=true ;;
|
||||
-dp|--data-parallel-size)
|
||||
[[ "$word" =~ ^[0-9]+$ ]] && DP_SIZE="$word" && PARALLELISM_FOUND=true ;;
|
||||
esac
|
||||
prev="$word"
|
||||
done
|
||||
}
|
||||
|
||||
# Copy script into container as /workspace/exec-script.sh
|
||||
echo " Copying script into container..."
|
||||
docker cp "$target_script_path" "$container:/workspace/exec-script.sh"
|
||||
# Build a patched copy of the launch script on the host for a specific node.
|
||||
# Strips --distributed-executor-backend and appends multi-node args.
|
||||
# Prints the path of the temp file (caller must delete it).
|
||||
make_node_script() {
|
||||
local script_path="$1"; local nnodes="$2"; local node_rank="$3"; local master_addr="$4"
|
||||
local extra="--nnodes $nnodes --node-rank $node_rank --master-addr $master_addr --master-port $MASTER_PORT"
|
||||
[[ "$node_rank" -gt 0 ]] && extra="$extra --headless"
|
||||
|
||||
# Make executable
|
||||
local tmp; tmp=$(mktemp /tmp/vllm_node_script_XXXXXX.sh)
|
||||
# Remove just the flag and its value (not the whole line), then filter empty/backslash-only lines
|
||||
sed 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//' "$script_path" | \
|
||||
grep -Ev '^[[:space:]\\]*$' > "$tmp"
|
||||
# Strip trailing backslash from last line before appending multi-node args
|
||||
sed -i "$ s/[[:space:]]*\\\\[[:space:]]*$//" "$tmp"
|
||||
sed -i "$ s/$/ $extra/" "$tmp"
|
||||
chmod +x "$tmp"
|
||||
echo "$tmp"
|
||||
}
|
||||
|
||||
# Copy a script file into a local container as /workspace/exec-script.sh
|
||||
copy_script_to_container() {
|
||||
local container="$1"; local script_path="$2"; local label="${3:-node}"
|
||||
echo "Copying launch script to $label..."
|
||||
docker cp "$script_path" "$container:/workspace/exec-script.sh" || { echo "Error: docker cp to $label failed"; exit 1; }
|
||||
docker exec "$container" chmod +x /workspace/exec-script.sh
|
||||
}
|
||||
|
||||
echo " Launch script copied to head node"
|
||||
# Copy a script file to a remote container via scp + docker cp
|
||||
copy_script_to_worker() {
|
||||
local worker_ip="$1"; local container="$2"; local script_path="$3"
|
||||
echo "Copying launch script to worker $worker_ip..."
|
||||
local remote_tmp="/tmp/vllm_script_$(date +%s)_$RANDOM.sh"
|
||||
scp -o BatchMode=yes -o StrictHostKeyChecking=no "$script_path" "$worker_ip:$remote_tmp" || { echo "Error: scp to $worker_ip failed"; exit 1; }
|
||||
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker_ip" \
|
||||
"docker cp $remote_tmp $container:/workspace/exec-script.sh && \
|
||||
docker exec $container chmod +x /workspace/exec-script.sh && \
|
||||
rm -f $remote_tmp" || { echo "Error: docker cp to worker $worker_ip failed"; exit 1; }
|
||||
}
|
||||
|
||||
# Build -e KEY=VALUE flags for a given node IP (used in docker run and docker exec)
|
||||
get_env_flags() {
|
||||
local node_ip="$1"
|
||||
printf -- '-e %s ' \
|
||||
"VLLM_HOST_IP=$node_ip" \
|
||||
"RAY_NODE_IP_ADDRESS=$node_ip" \
|
||||
"RAY_OVERRIDE_NODE_IP_ADDRESS=$node_ip" \
|
||||
"MN_IF_NAME=$ETH_IF" \
|
||||
"UCX_NET_DEVICES=$ETH_IF" \
|
||||
"NCCL_SOCKET_IFNAME=$ETH_IF" \
|
||||
"NCCL_IB_HCA=$IB_IF" \
|
||||
"NCCL_IB_DISABLE=0" \
|
||||
"OMPI_MCA_btl_tcp_if_include=$ETH_IF" \
|
||||
"GLOO_SOCKET_IFNAME=$ETH_IF" \
|
||||
"TP_SOCKET_IFNAME=$ETH_IF" \
|
||||
"RAY_memory_monitor_refresh_ms=0" \
|
||||
"RAY_num_prestart_python_workers=0" \
|
||||
"RAY_object_store_memory=1073741824"
|
||||
}
|
||||
|
||||
# Start Ray head node inside the container
|
||||
start_ray_head() {
|
||||
local container="$1"
|
||||
echo "Starting Ray HEAD node on $HEAD_IP..."
|
||||
docker exec -d "$container" bash -c \
|
||||
"ray start --block --head --port $MASTER_PORT --object-store-memory 1073741824 --num-cpus 2 \
|
||||
--node-ip-address $HEAD_IP --include-dashboard=false --disable-usage-stats \
|
||||
>> /proc/1/fd/1 2>&1"
|
||||
}
|
||||
|
||||
# Start Ray worker node inside the container on a remote host
|
||||
start_ray_worker() {
|
||||
local worker_ip="$1"; local container="$2"
|
||||
echo "Starting Ray WORKER node on $worker_ip..."
|
||||
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker_ip" \
|
||||
"docker exec -d $container bash -c \
|
||||
'ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats \
|
||||
--address=$HEAD_IP:$MASTER_PORT --node-ip-address $worker_ip >> /proc/1/fd/1 2>&1'"
|
||||
}
|
||||
|
||||
# Start Cluster Function
|
||||
@@ -564,31 +845,6 @@ start_cluster() {
|
||||
return
|
||||
fi
|
||||
|
||||
# Start Head Node
|
||||
echo "Starting Head Node on $HEAD_IP..."
|
||||
|
||||
# Ensure cache dirs exist on head
|
||||
if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then
|
||||
for dir in "${CACHE_DIRS_TO_CREATE[@]}"; do
|
||||
mkdir -p "$dir"
|
||||
done
|
||||
fi
|
||||
|
||||
local head_cmd_args=()
|
||||
if [[ "$SOLO_MODE" == "true" ]]; then
|
||||
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
|
||||
head_cmd_args=(bash -c "echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting container...; exec sleep infinity")
|
||||
else
|
||||
head_cmd_args=(sleep infinity)
|
||||
fi
|
||||
else
|
||||
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
|
||||
head_cmd_args=(bash -c "echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting node...; exec ./run-cluster-node.sh --role head --host-ip $HEAD_IP --eth-if $ETH_IF --ib-if $IB_IF")
|
||||
else
|
||||
head_cmd_args=(./run-cluster-node.sh --role head --host-ip "$HEAD_IP" --eth-if "$ETH_IF" --ib-if "$IB_IF")
|
||||
fi
|
||||
fi
|
||||
|
||||
# Build docker run arguments based on mode
|
||||
local docker_args_common="--gpus all -d --rm --network host --name $CONTAINER_NAME $DOCKER_ARGS $IMAGE_NAME"
|
||||
local docker_caps_args=""
|
||||
@@ -603,62 +859,68 @@ start_cluster() {
|
||||
docker_resource_args="--ipc=host"
|
||||
fi
|
||||
|
||||
# Start Head Node
|
||||
echo "Starting Head Node on $HEAD_IP..."
|
||||
if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then
|
||||
for dir in "${CACHE_DIRS_TO_CREATE[@]}"; do
|
||||
mkdir -p "$dir"
|
||||
done
|
||||
fi
|
||||
docker run $docker_caps_args $docker_resource_args \
|
||||
$docker_args_common \
|
||||
"${head_cmd_args[@]}"
|
||||
$(get_env_flags "$HEAD_IP") $docker_args_common sleep infinity
|
||||
|
||||
# Start Worker Nodes
|
||||
for worker in "${PEER_NODES[@]}"; do
|
||||
echo "Starting Worker Node on $worker..."
|
||||
|
||||
# Ensure cache dirs exist on worker
|
||||
if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then
|
||||
# Create string of dirs to create
|
||||
dirs_str="${CACHE_DIRS_TO_CREATE[*]}"
|
||||
ssh "$worker" "mkdir -p $dirs_str"
|
||||
fi
|
||||
|
||||
local docker_run_cmd="docker run $docker_caps_args $docker_resource_args $docker_args_common"
|
||||
|
||||
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
|
||||
local inner_script="echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting node...; exec ./run-cluster-node.sh --role node --host-ip $worker --eth-if $ETH_IF --ib-if $IB_IF --head-ip $HEAD_IP"
|
||||
ssh "$worker" "$docker_run_cmd bash -c \"$inner_script\""
|
||||
else
|
||||
ssh "$worker" "$docker_run_cmd ./run-cluster-node.sh --role node --host-ip $worker --eth-if $ETH_IF --ib-if $IB_IF --head-ip $HEAD_IP"
|
||||
ssh "$worker" "mkdir -p ${CACHE_DIRS_TO_CREATE[*]}"
|
||||
fi
|
||||
local docker_run_cmd="docker run $docker_caps_args $docker_resource_args $(get_env_flags "$worker") $docker_args_common"
|
||||
ssh "$worker" "$docker_run_cmd sleep infinity"
|
||||
done
|
||||
|
||||
# Apply mods if requested
|
||||
# Apply mods (containers are idle — no mod_done sync needed)
|
||||
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
|
||||
echo "Applying modifications to cluster nodes..."
|
||||
|
||||
# Apply to Head
|
||||
for i in "${!MOD_PATHS[@]}"; do
|
||||
apply_mod_to_container "$HEAD_IP" "$CONTAINER_NAME" "true" "${MOD_PATHS[$i]}" "${MOD_TYPES[$i]}"
|
||||
done
|
||||
# Signal completion on Head
|
||||
docker exec "$CONTAINER_NAME" touch /tmp/mod_done
|
||||
|
||||
# Apply to Workers
|
||||
for worker in "${PEER_NODES[@]}"; do
|
||||
for i in "${!MOD_PATHS[@]}"; do
|
||||
apply_mod_to_container "$worker" "$CONTAINER_NAME" "false" "${MOD_PATHS[$i]}" "${MOD_TYPES[$i]}"
|
||||
done
|
||||
# Signal completion on Worker
|
||||
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker" "docker exec $CONTAINER_NAME touch /tmp/mod_done"
|
||||
done
|
||||
fi
|
||||
|
||||
# Copy launch script to head node only (workers don't need it - they just run Ray)
|
||||
# Copy (and patch for no-ray) launch script
|
||||
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
||||
copy_launch_script_to_container "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH"
|
||||
local total_nodes=$(( 1 + ${#PEER_NODES[@]} ))
|
||||
if [[ "$NO_RAY_MODE" == "true" ]]; then
|
||||
# Build per-node patched scripts on the host, then copy
|
||||
local head_script; head_script=$(make_node_script "$LAUNCH_SCRIPT_PATH" "$total_nodes" "0" "$HEAD_IP")
|
||||
copy_script_to_container "$CONTAINER_NAME" "$head_script" "head node ($HEAD_IP)"
|
||||
rm -f "$head_script"
|
||||
|
||||
local rank=1
|
||||
for worker in "${PEER_NODES[@]}"; do
|
||||
local worker_script; worker_script=$(make_node_script "$LAUNCH_SCRIPT_PATH" "$total_nodes" "$rank" "$HEAD_IP")
|
||||
copy_script_to_worker "$worker" "$CONTAINER_NAME" "$worker_script"
|
||||
rm -f "$worker_script"
|
||||
(( rank++ ))
|
||||
done
|
||||
else
|
||||
copy_script_to_container "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH" "head node"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "$SOLO_MODE" == "false" ]]; then
|
||||
# Start Ray cluster (unless solo or no-ray)
|
||||
if [[ "$SOLO_MODE" == "false" && "$NO_RAY_MODE" == "false" ]]; then
|
||||
start_ray_head "$CONTAINER_NAME"
|
||||
for worker in "${PEER_NODES[@]}"; do
|
||||
start_ray_worker "$worker" "$CONTAINER_NAME"
|
||||
done
|
||||
wait_for_cluster
|
||||
else
|
||||
echo "Solo mode active: Skipping Ray cluster readiness check."
|
||||
# Give container a moment to start up
|
||||
sleep 2
|
||||
fi
|
||||
}
|
||||
@@ -686,25 +948,97 @@ wait_for_cluster() {
|
||||
exit 1
|
||||
}
|
||||
|
||||
if [[ "$ACTION" == "exec" ]]; then
|
||||
start_cluster
|
||||
echo "Executing command on head node: $COMMAND_TO_RUN"
|
||||
|
||||
# Execute command on head node (daemon or interactive)
|
||||
_exec_on_head() {
|
||||
local cmd="$1"
|
||||
if [[ "$DAEMON_MODE" == "true" ]]; then
|
||||
# Daemon mode: run command detached inside the container and exit immediately
|
||||
# Extract env vars starting from VLLM_HOST_IP to avoid interactive check in .bashrc
|
||||
# Redirect output to PID 1 stdout/stderr so it shows up in docker logs
|
||||
docker exec -d "$CONTAINER_NAME" bash -c "eval \"\$(sed -n '/export VLLM_HOST_IP/,\$p' /root/.bashrc)\" && { $COMMAND_TO_RUN; } >> /proc/1/fd/1 2>> /proc/1/fd/2"
|
||||
docker exec -d "$CONTAINER_NAME" bash -c "$cmd >> /proc/1/fd/1 2>&1"
|
||||
echo "Command dispatched in background (Daemon mode). Container: $CONTAINER_NAME"
|
||||
else
|
||||
# Check if running in a TTY to avoid "input device is not a TTY" error
|
||||
if [ -t 0 ]; then
|
||||
DOCKER_EXEC_FLAGS="-it"
|
||||
else
|
||||
DOCKER_EXEC_FLAGS="-i"
|
||||
fi
|
||||
if [ -t 0 ]; then DOCKER_EXEC_FLAGS="-it"; else DOCKER_EXEC_FLAGS="-i"; fi
|
||||
docker exec $DOCKER_EXEC_FLAGS "$CONTAINER_NAME" bash -c "$cmd"
|
||||
fi
|
||||
}
|
||||
|
||||
docker exec $DOCKER_EXEC_FLAGS "$CONTAINER_NAME" bash -i -c "$COMMAND_TO_RUN"
|
||||
# Execute a no-ray multi-node command: workers (background) then head
|
||||
exec_no_ray_cluster() {
|
||||
local base_cmd="$1"
|
||||
local total_nodes=$(( 1 + ${#PEER_NODES[@]} ))
|
||||
|
||||
# Launch workers first (always background)
|
||||
local rank=1
|
||||
for worker in "${PEER_NODES[@]}"; do
|
||||
local worker_cmd
|
||||
if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then
|
||||
worker_cmd="$base_cmd" # script already patched per-node in start_cluster()
|
||||
else
|
||||
local clean
|
||||
clean=$(echo "$base_cmd" | sed 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//')
|
||||
worker_cmd="$clean --nnodes $total_nodes --node-rank $rank --master-addr $HEAD_IP --master-port $MASTER_PORT --headless"
|
||||
fi
|
||||
echo "Launching worker (rank $rank) on $worker..."
|
||||
local remote_payload remote_cmd
|
||||
remote_payload="$worker_cmd >> /proc/1/fd/1 2>&1"
|
||||
printf -v remote_cmd 'docker exec -d %q bash -c %q' "$CONTAINER_NAME" "$remote_payload"
|
||||
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker" "$remote_cmd"
|
||||
(( rank++ ))
|
||||
done
|
||||
|
||||
# Launch head (rank 0) last
|
||||
local head_cmd
|
||||
if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then
|
||||
head_cmd="$base_cmd"
|
||||
else
|
||||
local clean
|
||||
clean=$(echo "$base_cmd" | sed 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//')
|
||||
head_cmd="$clean --nnodes $total_nodes --node-rank 0 --master-addr $HEAD_IP --master-port $MASTER_PORT"
|
||||
fi
|
||||
|
||||
echo "Executing command on head node (rank 0): $head_cmd"
|
||||
if [[ "$DAEMON_MODE" == "true" ]]; then
|
||||
docker exec -d "$CONTAINER_NAME" bash -c "$head_cmd >> /proc/1/fd/1 2>&1"
|
||||
echo "Command dispatched in background (Daemon mode). Container: $CONTAINER_NAME"
|
||||
else
|
||||
if [ -t 0 ]; then DOCKER_EXEC_FLAGS="-it"; else DOCKER_EXEC_FLAGS="-i"; fi
|
||||
docker exec $DOCKER_EXEC_FLAGS "$CONTAINER_NAME" bash -c "$head_cmd"
|
||||
fi
|
||||
}
|
||||
|
||||
if [[ "$ACTION" == "exec" ]]; then
|
||||
# Trim (or error on) PEER_NODES based on declared parallelism, for any multi-node exec
|
||||
if [[ "$SOLO_MODE" != "true" && ${#PEER_NODES[@]} -gt 0 ]]; then
|
||||
if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then
|
||||
cmd_text=$(cat "$LAUNCH_SCRIPT_PATH" 2>/dev/null || true)
|
||||
else
|
||||
cmd_text="$COMMAND_TO_RUN"
|
||||
fi
|
||||
parse_parallelism_from_text "$cmd_text"
|
||||
|
||||
if [[ "$PARALLELISM_FOUND" == "true" ]]; then
|
||||
required_nodes=$(( TP_SIZE * PP_SIZE * DP_SIZE ))
|
||||
total_nodes=$(( 1 + ${#PEER_NODES[@]} ))
|
||||
|
||||
if [[ "$required_nodes" -gt "$total_nodes" ]]; then
|
||||
echo "Error: Command requires $required_nodes nodes (tp=$TP_SIZE * pp=$PP_SIZE * dp=$DP_SIZE) but only $total_nodes node(s) are configured."
|
||||
exit 1
|
||||
elif [[ "$required_nodes" -lt "$total_nodes" ]]; then
|
||||
echo "Note: Command requires $required_nodes node(s) (tp=$TP_SIZE * pp=$PP_SIZE * dp=$DP_SIZE); using $required_nodes of $total_nodes configured node(s)."
|
||||
PEER_NODES=("${PEER_NODES[@]:0:$(( required_nodes - 1 ))}")
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
start_cluster
|
||||
echo "Executing command: $COMMAND_TO_RUN"
|
||||
|
||||
if [[ "$NO_RAY_MODE" == "true" && ${#PEER_NODES[@]} -gt 0 ]]; then
|
||||
if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]] || echo "$COMMAND_TO_RUN" | grep -q "vllm serve"; then
|
||||
exec_no_ray_cluster "$COMMAND_TO_RUN"
|
||||
else
|
||||
_exec_on_head "$COMMAND_TO_RUN"
|
||||
fi
|
||||
else
|
||||
_exec_on_head "$COMMAND_TO_RUN"
|
||||
fi
|
||||
elif [[ "$ACTION" == "start" ]]; then
|
||||
start_cluster
|
||||
|
||||
17
mods/drop-caches/run.sh
Normal file
17
mods/drop-caches/run.sh
Normal file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This mod will drop the FS caches every minute - useful to unstuck Qwen3.5-397B or other similar models during loading
|
||||
|
||||
CMD='sync; echo 3 > /proc/sys/vm/drop_caches'
|
||||
LOG="/tmp/drop_caches.log"
|
||||
PIDFILE="/tmp/drop_caches.pid"
|
||||
|
||||
nohup bash -c '
|
||||
while true; do
|
||||
'"$CMD"' >> "'"$LOG"'" 2>&1
|
||||
sleep 60
|
||||
done
|
||||
' >/dev/null 2>&1 &
|
||||
|
||||
echo $! > "$PIDFILE"
|
||||
echo "Started drop_caches loop with PID $(cat "$PIDFILE"); log is available in $LOG"
|
||||
116
mods/exp-b12x/run.sh
Normal file
116
mods/exp-b12x/run.sh
Normal file
@@ -0,0 +1,116 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
SITE_PACKAGES="/usr/local/lib/python3.12/dist-packages"
|
||||
|
||||
echo "=== EXPERIMENTAL b12x-patches mod ==="
|
||||
|
||||
# 0a. Check if b12x support is present in vLLM
|
||||
if [ ! -f "$SITE_PACKAGES/vllm/model_executor/layers/fused_moe/experts/flashinfer_b12x_moe.py" ]; then
|
||||
echo "[b12x ERROR] No b12x support detected; please rebuild with --apply-vllm-pr 40082, e.g.:"
|
||||
echo "./build-and-copy.sh -t vllm-node-40082 --apply-vllm-pr 40082"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 0b. Check if environment variables are set
|
||||
|
||||
if [[ "$VLLM_NVFP4_GEMM_BACKEND" != "flashinfer-b12x" ]]; then
|
||||
echo "[b12x ERROR] Please set required environment variables to use b12x backend"
|
||||
echo "*** Add the following arguments to launch-cluster.sh:"
|
||||
echo " -e FLASHINFER_DISABLE_VERSION_CHECK=1 -e VLLM_USE_FLASHINFER_MOE_FP16=1 -e VLLM_NVFP4_GEMM_BACKEND=flashinfer-b12x -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 -e VLLM_FLASHINFER_ALLREDUCE_BACKEND=trtllm -e VLLM_USE_FLASHINFER_MOE_FP4=1"
|
||||
echo "*** also set the following vLLM parameters:"
|
||||
echo " --moe-backend flashinfer_b12x --attention-backend flashinfer"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# 1. Pin nvidia-cutlass-dsl + companion libs to 4.4.2
|
||||
# (4.5.x generates bad PTX on SM121 — `_mma` rejected by ptxas).
|
||||
# All THREE packages must match: the python frontend, the base libs,
|
||||
# and the CUDA 13 libs (which contain the MLIR compiler).
|
||||
# ---------------------------------------------------------------
|
||||
DSL_VER=$(pip show nvidia-cutlass-dsl 2>/dev/null | grep '^Version:' | awk '{print $2}' || true)
|
||||
LIBS_BASE_VER=$(pip show nvidia-cutlass-dsl-libs-base 2>/dev/null | grep '^Version:' | awk '{print $2}' || true)
|
||||
# LIBS_CU13_VER=$(pip show nvidia-cutlass-dsl-libs-cu13 2>/dev/null | grep '^Version:' | awk '{print $2}' || true)
|
||||
if [ "$DSL_VER" != "4.4.2" ] || [ "$LIBS_BASE_VER" != "4.4.2" ] || [ "$LIBS_CU13_VER" != "4.4.2" ]; then
|
||||
echo "[b12x] Pinning nvidia-cutlass-dsl{,-libs-base,-libs-cu13} to 4.4.2"
|
||||
echo "[b12x] current: dsl=${DSL_VER:-none} libs-base=${LIBS_BASE_VER:-none} libs-cu13=${LIBS_CU13_VER:-none}"
|
||||
uv pip install \
|
||||
nvidia-cutlass-dsl==4.4.2 \
|
||||
nvidia-cutlass-dsl-libs-base==4.4.2 \
|
||||
nvidia-cutlass-dsl-libs-cu13==4.4.2 \
|
||||
-q 2>/dev/null || echo "[b12x] WARNING: cutlass-dsl pin install returned non-zero"
|
||||
else
|
||||
echo "[b12x] nvidia-cutlass-dsl + libs already at 4.4.2"
|
||||
fi
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# 2. Apply cutlass-dsl SM121 patches
|
||||
# FlashInfer/vLLM install wipes vendored cutlass, so re-apply every time
|
||||
# ---------------------------------------------------------------
|
||||
echo "[b12x] Applying cutlass-dsl SM121 patches..."
|
||||
|
||||
# 2a. warp/mma.py: allow sm_121a alongside sm_120a in both the runtime
|
||||
# arch check and the `admissible_archs` string list (used in error msgs)
|
||||
for f in $(find "$SITE_PACKAGES" -name "mma.py" -path "*/warp/*" 2>/dev/null); do
|
||||
if grep -q "if not arch == Arch.sm_120a:" "$f" 2>/dev/null; then
|
||||
sed -i "s/if not arch == Arch.sm_120a:/if arch not in (Arch.sm_120a, Arch.sm_121a):/" "$f"
|
||||
echo " patched $f (warp sm_121a runtime check)"
|
||||
fi
|
||||
# Add sm_121a to the admissible_archs list if missing
|
||||
if grep -q '"sm_120a",' "$f" 2>/dev/null && ! grep -q '"sm_121a"' "$f" 2>/dev/null; then
|
||||
sed -i 's/^\(\s*\)"sm_120a",$/\1"sm_120a",\n\1"sm_121a",/' "$f"
|
||||
echo " patched $f (warp sm_121a admissible_archs)"
|
||||
fi
|
||||
done
|
||||
|
||||
# 2b. tcgen05/mma.py: add sm_120a and sm_121a to supported arch list
|
||||
for f in $(find "$SITE_PACKAGES" -name "mma.py" -path "*/tcgen05/*" 2>/dev/null); do
|
||||
if ! grep -q "Arch.sm_121a" "$f" 2>/dev/null; then
|
||||
sed -i "/Arch.sm_103a,/a\\ Arch.sm_120a,\n Arch.sm_121a," "$f"
|
||||
echo " patched $f (tcgen05 mma sm_121a)"
|
||||
fi
|
||||
done
|
||||
|
||||
# 2c. tcgen05/copy.py: allow sm_120f family
|
||||
for f in $(find "$SITE_PACKAGES" -name "copy.py" -path "*/tcgen05/*" 2>/dev/null); do
|
||||
if ! grep -q "sm_120f" "$f" 2>/dev/null; then
|
||||
sed -i "s/arch.is_family_of(Arch.sm_110f)/arch.is_family_of(Arch.sm_110f) or arch.is_family_of(Arch.sm_120f)/" "$f"
|
||||
echo " patched $f (tcgen05 copy sm_120f)"
|
||||
fi
|
||||
done
|
||||
|
||||
# Clear pycache so patched code takes effect
|
||||
find "$SITE_PACKAGES" -name "__pycache__" -path "*/cutlass*" -exec rm -rf {} + 2>/dev/null || true
|
||||
find "$SITE_PACKAGES" -name "__pycache__" -path "*/flashinfer*" -exec rm -rf {} + 2>/dev/null || true
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# 3 Patch FlashInfer's blackwell_sm12x __init__.py to drop the
|
||||
# broken `sm120_moe_dispatch_context` import (FlashInfer main
|
||||
# has a stale __init__ that references a function that no
|
||||
# longer exists in moe_dispatch.py — but the symbol isn't
|
||||
# actually used by anything, so we just remove it from the
|
||||
# import + __all__ list).
|
||||
# ---------------------------------------------------------------
|
||||
SM12X_INIT="$SITE_PACKAGES/flashinfer/fused_moe/cute_dsl/blackwell_sm12x/__init__.py"
|
||||
if [ -f "$SM12X_INIT" ]; then
|
||||
if grep -q "sm120_moe_dispatch_context" "$SM12X_INIT"; then
|
||||
# Drop the line that imports/exports the missing symbol
|
||||
sed -i '/sm120_moe_dispatch_context/d' "$SM12X_INIT"
|
||||
find "$SITE_PACKAGES/flashinfer" -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
|
||||
echo "[b12x] patched $SM12X_INIT (dropped stale sm120_moe_dispatch_context references)"
|
||||
else
|
||||
echo "[b12x] $SM12X_INIT already cleaned"
|
||||
fi
|
||||
else
|
||||
echo "[b12x] $SM12X_INIT not found (older FlashInfer?), skipping"
|
||||
fi
|
||||
|
||||
if grep -q "if current_platform.has_device_capability(120) and has_flashinfer_b12x_gemm():" $SITE_PACKAGES/vllm/model_executor/kernels/linear/nvfp4/flashinfer.py; then
|
||||
echo "[b12x] Patching vLLM PR 40080 to enable sm121 cap"
|
||||
sed -i "s/if current_platform.has_device_capability(120) and has_flashinfer_b12x_gemm():/if True:/" $SITE_PACKAGES/vllm/model_executor/kernels/linear/nvfp4/flashinfer.py
|
||||
fi
|
||||
|
||||
|
||||
10
mods/fix-gemma4-tool-parser/run.sh
Normal file
10
mods/fix-gemma4-tool-parser/run.sh
Normal file
@@ -0,0 +1,10 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
cd /usr/local/lib/python3.12/dist-packages
|
||||
echo "Applying PR #38909"
|
||||
if curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38909.diff | git apply --exclude="tests/*"; then
|
||||
echo "- PR #38909 applied successfully"
|
||||
else
|
||||
echo "- PR #38909 can't be applied, skipping"
|
||||
fi
|
||||
@@ -1,3 +1,3 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
patch -p1 -d /usr/local/lib/python3.12/dist-packages < transformers.patch
|
||||
patch -p1 -d /usr/local/lib/python3.12/dist-packages < transformers.patch || echo "Patch is not applicable, skipping..."
|
||||
155
mods/fix-qwen3.5-chat-template/chat_template.jinja
Normal file
155
mods/fix-qwen3.5-chat-template/chat_template.jinja
Normal file
@@ -0,0 +1,155 @@
|
||||
{%- set image_count = namespace(value=0) %}
|
||||
{%- set video_count = namespace(value=0) %}
|
||||
{%- macro render_content(content, do_vision_count, is_system_content=false) %}
|
||||
{%- if content is string %}
|
||||
{{- content }}
|
||||
{%- elif content is iterable and content is not mapping %}
|
||||
{%- for item in content %}
|
||||
{%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
|
||||
{%- if is_system_content %}
|
||||
{{- raise_exception('System message cannot contain images.') }}
|
||||
{%- endif %}
|
||||
{%- if do_vision_count %}
|
||||
{%- set image_count.value = image_count.value + 1 %}
|
||||
{%- endif %}
|
||||
{%- if add_vision_id %}
|
||||
{{- 'Picture ' ~ image_count.value ~ ': ' }}
|
||||
{%- endif %}
|
||||
{{- '<|vision_start|><|image_pad|><|vision_end|>' }}
|
||||
{%- elif 'video' in item or item.type == 'video' %}
|
||||
{%- if is_system_content %}
|
||||
{{- raise_exception('System message cannot contain videos.') }}
|
||||
{%- endif %}
|
||||
{%- if do_vision_count %}
|
||||
{%- set video_count.value = video_count.value + 1 %}
|
||||
{%- endif %}
|
||||
{%- if add_vision_id %}
|
||||
{{- 'Video ' ~ video_count.value ~ ': ' }}
|
||||
{%- endif %}
|
||||
{{- '<|vision_start|><|video_pad|><|vision_end|>' }}
|
||||
{%- elif 'text' in item %}
|
||||
{{- item.text }}
|
||||
{%- else %}
|
||||
{{- raise_exception('Unexpected item type in content.') }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- elif content is none or content is undefined %}
|
||||
{{- '' }}
|
||||
{%- else %}
|
||||
{{- raise_exception('Unexpected content type.') }}
|
||||
{%- endif %}
|
||||
{%- endmacro %}
|
||||
{%- if not messages %}
|
||||
{{- raise_exception('No messages provided.') }}
|
||||
{%- endif %}
|
||||
{%- if tools and tools is iterable and tools is not mapping %}
|
||||
{{- '<|im_start|>system\n' }}
|
||||
{{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
|
||||
{%- for tool in tools %}
|
||||
{{- "\n" }}
|
||||
{{- tool | tojson }}
|
||||
{%- endfor %}
|
||||
{{- "\n</tools>" }}
|
||||
{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
|
||||
{%- if messages[0].role == 'system' %}
|
||||
{%- set content = render_content(messages[0].content, false, true)|trim %}
|
||||
{%- if content %}
|
||||
{{- '\n\n' + content }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- else %}
|
||||
{%- if messages[0].role == 'system' %}
|
||||
{%- set content = render_content(messages[0].content, false, true)|trim %}
|
||||
{{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
||||
{%- for message in messages[::-1] %}
|
||||
{%- set index = (messages|length - 1) - loop.index0 %}
|
||||
{%- if ns.multi_step_tool and message.role == "user" %}
|
||||
{%- set content = render_content(message.content, false)|trim %}
|
||||
{%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
|
||||
{%- set ns.multi_step_tool = false %}
|
||||
{%- set ns.last_query_index = index %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if ns.multi_step_tool %}
|
||||
{{- raise_exception('No user query found in messages.') }}
|
||||
{%- endif %}
|
||||
{%- for message in messages %}
|
||||
{%- set content = render_content(message.content, true)|trim %}
|
||||
{%- if message.role == "system" %}
|
||||
{%- if not loop.first %}
|
||||
{{- raise_exception('System message must be at the beginning.') }}
|
||||
{%- endif %}
|
||||
{%- elif message.role == "user" %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
|
||||
{%- elif message.role == "assistant" %}
|
||||
{%- set reasoning_content = '' %}
|
||||
{%- if message.reasoning_content is string %}
|
||||
{%- set reasoning_content = message.reasoning_content %}
|
||||
{%- else %}
|
||||
{%- if '</think>' in content %}
|
||||
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
||||
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- set reasoning_content = reasoning_content|trim %}
|
||||
{%- if loop.index0 > ns.last_query_index %}
|
||||
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
|
||||
{%- else %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||
{%- endif %}
|
||||
{%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
|
||||
{%- for tool_call in message.tool_calls %}
|
||||
{%- if tool_call.function is defined %}
|
||||
{%- set tool_call = tool_call.function %}
|
||||
{%- endif %}
|
||||
{%- if loop.first %}
|
||||
{%- if content|trim %}
|
||||
{{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- else %}
|
||||
{{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- endif %}
|
||||
{%- else %}
|
||||
{{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- endif %}
|
||||
{%- if tool_call.arguments is mapping %}
|
||||
{%- for args_name in tool_call.arguments %}
|
||||
{%- set args_value = tool_call.arguments[args_name] %}
|
||||
{{- '<parameter=' + args_name + '>\n' }}
|
||||
{%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
|
||||
{{- args_value }}
|
||||
{{- '\n</parameter>\n' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- '</function>\n</tool_call>' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- elif message.role == "tool" %}
|
||||
{%- if loop.previtem and loop.previtem.role != "tool" %}
|
||||
{{- '<|im_start|>user' }}
|
||||
{%- endif %}
|
||||
{{- '\n<tool_response>\n' }}
|
||||
{{- content }}
|
||||
{{- '\n</tool_response>' }}
|
||||
{%- if not loop.last and loop.nextitem.role != "tool" %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- elif loop.last %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- else %}
|
||||
{{- raise_exception('Unexpected message role.') }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|im_start|>assistant\n' }}
|
||||
{%- if enable_thinking is defined and enable_thinking is false %}
|
||||
{{- '<think>\n\n</think>\n\n' }}
|
||||
{%- else %}
|
||||
{{- '<think>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
4
mods/fix-qwen3.5-chat-template/run.sh
Normal file
4
mods/fix-qwen3.5-chat-template/run.sh
Normal file
@@ -0,0 +1,4 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
cp chat_template.jinja $WORKSPACE_DIR/unsloth.jinja
|
||||
echo "=======> to apply chat template, use --chat-template unsloth.jinja"
|
||||
223
mods/fix-qwen3.6-chat-template/chat_template.jinja
Normal file
223
mods/fix-qwen3.6-chat-template/chat_template.jinja
Normal file
@@ -0,0 +1,223 @@
|
||||
{%- set image_count = namespace(value=0) %}
|
||||
{%- set video_count = namespace(value=0) %}
|
||||
{%- macro render_content(content, do_vision_count, is_system_content=false) %}
|
||||
{%- if content is string %}
|
||||
{{- content }}
|
||||
{%- elif content is iterable and content is not mapping %}
|
||||
{%- for item in content %}
|
||||
{%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
|
||||
{%- if is_system_content %}
|
||||
{{- raise_exception('System message cannot contain images.') }}
|
||||
{%- endif %}
|
||||
{%- if do_vision_count %}
|
||||
{%- set image_count.value = image_count.value + 1 %}
|
||||
{%- endif %}
|
||||
{%- if add_vision_id is defined and add_vision_id %}
|
||||
{{- 'Picture ' ~ image_count.value ~ ': ' }}
|
||||
{%- endif %}
|
||||
{{- '<|vision_start|><|image_pad|><|vision_end|>' }}
|
||||
{%- elif 'video' in item or item.type == 'video' %}
|
||||
{%- if is_system_content %}
|
||||
{{- raise_exception('System message cannot contain videos.') }}
|
||||
{%- endif %}
|
||||
{%- if do_vision_count %}
|
||||
{%- set video_count.value = video_count.value + 1 %}
|
||||
{%- endif %}
|
||||
{%- if add_vision_id is defined and add_vision_id %}
|
||||
{{- 'Video ' ~ video_count.value ~ ': ' }}
|
||||
{%- endif %}
|
||||
{{- '<|vision_start|><|video_pad|><|vision_end|>' }}
|
||||
{%- elif 'text' in item %}
|
||||
{{- item.text }}
|
||||
{%- else %}
|
||||
{{- raise_exception('Unexpected item type in content.') }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- elif content is none or content is undefined %}
|
||||
{{- '' }}
|
||||
{%- else %}
|
||||
{{- raise_exception('Unexpected content type.') }}
|
||||
{%- endif %}
|
||||
{%- endmacro %}
|
||||
{%- set ns_flags = namespace(enable_thinking=true) %}
|
||||
{%- if enable_thinking is defined %}
|
||||
{%- set ns_flags.enable_thinking = enable_thinking %}
|
||||
{%- endif %}
|
||||
{%- if not messages %}
|
||||
{{- raise_exception('No messages provided.') }}
|
||||
{%- endif %}
|
||||
{%- if tools and tools is iterable and tools is not mapping %}
|
||||
{{- '<|im_start|>system\n' }}
|
||||
{{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
|
||||
{%- for tool in tools %}
|
||||
{{- "\n" }}
|
||||
{{- tool | tojson }}
|
||||
{%- endfor %}
|
||||
{{- "\n</tools>" }}
|
||||
{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
|
||||
{%- if messages[0].role == 'system' or messages[0].role == 'developer' %}
|
||||
{%- set content = render_content(messages[0].content, false, true)|trim %}
|
||||
{%- if '<|think_off|>' in content %}
|
||||
{%- set ns_flags.enable_thinking = false %}
|
||||
{%- set content = content.replace('<|think_off|>', '') %}
|
||||
{%- endif %}
|
||||
{%- if '<|think_on|>' in content %}
|
||||
{%- set ns_flags.enable_thinking = true %}
|
||||
{%- set content = content.replace('<|think_on|>', '') %}
|
||||
{%- endif %}
|
||||
{%- set content = content.strip() %}
|
||||
{%- if content %}
|
||||
{{- '\n\n' + content }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- else %}
|
||||
{%- if messages[0].role == 'system' or messages[0].role == 'developer' %}
|
||||
{%- set content = render_content(messages[0].content, false, true)|trim %}
|
||||
{%- if '<|think_off|>' in content %}
|
||||
{%- set ns_flags.enable_thinking = false %}
|
||||
{%- set content = content.replace('<|think_off|>', '') %}
|
||||
{%- endif %}
|
||||
{%- if '<|think_on|>' in content %}
|
||||
{%- set ns_flags.enable_thinking = true %}
|
||||
{%- set content = content.replace('<|think_on|>', '') %}
|
||||
{%- endif %}
|
||||
{%- set content = content.strip() %}
|
||||
{{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
||||
{%- for message in messages[::-1] %}
|
||||
{%- set index = (messages|length - 1) - loop.index0 %}
|
||||
{%- if ns.multi_step_tool and message.role == "user" %}
|
||||
{%- set content = render_content(message.content, false)|trim %}
|
||||
{%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
|
||||
{%- set ns.multi_step_tool = false %}
|
||||
{%- set ns.last_query_index = index %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if ns.multi_step_tool %}
|
||||
{%- set ns.last_query_index = messages|length - 1 %}
|
||||
{%- endif %}
|
||||
{%- for message in messages %}
|
||||
{%- set content = render_content(message.content, true)|trim %}
|
||||
{%- if '<|think_off|>' in content %}
|
||||
{%- set ns_flags.enable_thinking = false %}
|
||||
{%- set content = content.replace('<|think_off|>', '') %}
|
||||
{%- endif %}
|
||||
{%- if '<|think_on|>' in content %}
|
||||
{%- set ns_flags.enable_thinking = true %}
|
||||
{%- set content = content.replace('<|think_on|>', '') %}
|
||||
{%- endif %}
|
||||
{%- set content = content.strip() %}
|
||||
{%- if message.role == "system" or message.role == "developer" %}
|
||||
{%- if not loop.first %}
|
||||
{{- raise_exception('System message must be at the beginning.') }}
|
||||
{%- endif %}
|
||||
{%- elif message.role == "user" %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
|
||||
{%- elif message.role == "assistant" %}
|
||||
{%- set reasoning_content = '' %}
|
||||
{#- Auto-close unclosed think before tool_call -#}
|
||||
{%- if '<think>' in content and '<tool_call>' in content %}
|
||||
{%- set last_think = content.rfind('<think>') %}
|
||||
{%- set last_close = content.rfind('</think>') %}
|
||||
{%- set tool_pos = content.find('<tool_call>') %}
|
||||
{%- if last_close < last_think or last_close == -1 %}
|
||||
{%- if tool_pos > last_think %}
|
||||
{%- set content = content[:tool_pos] + '</think>' + content[tool_pos:] %}
|
||||
{%- else %}
|
||||
{%- set content = content + '</think>' %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- if message.reasoning_content is string %}
|
||||
{%- set reasoning_content = message.reasoning_content %}
|
||||
{%- else %}
|
||||
{%- set has_think_tag = false %}
|
||||
{%- set think_end_token = '</think>' %}
|
||||
{%- if '</think>' in content %}
|
||||
{%- set has_think_tag = true %}
|
||||
{%- elif '</thinking>' in content %}
|
||||
{%- set has_think_tag = true %}
|
||||
{%- set think_end_token = '</thinking>' %}
|
||||
{%- elif '<think>' in content %}
|
||||
{%- set reasoning_content = content.split('<think>')[-1].lstrip('\n') %}
|
||||
{%- set content = '' %}
|
||||
{%- endif %}
|
||||
{%- if has_think_tag %}
|
||||
{%- set reasoning_content = content.split(think_end_token)[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
||||
{%- set content = content.split(think_end_token)[-1].lstrip('\n') %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- set reasoning_content = reasoning_content|trim %}
|
||||
{%- set show_think = false %}
|
||||
{%- if loop.index0 > ns.last_query_index %}
|
||||
{%- set show_think = true %}
|
||||
{%- elif ns_flags.enable_thinking and (preserve_thinking is undefined or preserve_thinking is true) and reasoning_content|length > 0 %}
|
||||
{%- set show_think = true %}
|
||||
{%- endif %}
|
||||
{%- if show_think %}
|
||||
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
|
||||
{%- else %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||
{%- endif %}
|
||||
{%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
|
||||
{%- for tool_call in message.tool_calls %}
|
||||
{%- if tool_call.function is defined %}
|
||||
{%- set tool_call = tool_call.function %}
|
||||
{%- endif %}
|
||||
{%- if loop.first %}
|
||||
{%- if content|trim %}
|
||||
{{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- else %}
|
||||
{{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- endif %}
|
||||
{%- else %}
|
||||
{{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- endif %}
|
||||
{%- if tool_call.arguments is defined and tool_call.arguments is mapping %}
|
||||
{%- if tool_call.arguments|length > 0 %}
|
||||
{%- for args_name in tool_call.arguments %}
|
||||
{%- set args_value = tool_call.arguments[args_name] %}
|
||||
{{- '<parameter=' + args_name + '>\n' }}
|
||||
{%- set args_value = args_value | string if args_value is string else args_value | tojson %}
|
||||
{{- args_value }}
|
||||
{{- '\n</parameter>\n' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{%- elif tool_call.arguments is defined and tool_call.arguments is string %}
|
||||
{%- if tool_call.arguments|trim|length > 0 %}
|
||||
{{- tool_call.arguments }}
|
||||
{{- '\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{{- '</function>\n</tool_call>' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- elif message.role == "tool" %}
|
||||
{%- if loop.previtem and loop.previtem.role != "tool" %}
|
||||
{{- '<|im_start|>user' }}
|
||||
{%- endif %}
|
||||
{{- '\n<tool_response>\n' }}
|
||||
{{- content }}
|
||||
{{- '\n</tool_response>' }}
|
||||
{%- if not loop.last and loop.nextitem.role != "tool" %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- elif loop.last %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- else %}
|
||||
{{- raise_exception('Unexpected message role.') }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|im_start|>assistant\n' }}
|
||||
{%- if ns_flags.enable_thinking is false %}
|
||||
{{- '<think>\n\n</think>\n\n' }}
|
||||
{%- else %}
|
||||
{{- '<think>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
4
mods/fix-qwen3.6-chat-template/run.sh
Normal file
4
mods/fix-qwen3.6-chat-template/run.sh
Normal file
@@ -0,0 +1,4 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
cp chat_template.jinja $WORKSPACE_DIR/fixed_chat_template.jinja
|
||||
echo "=======> to apply chat template, use --chat-template fixed_chat_template.jinja"
|
||||
23
mods/fix-qwen35-tp4-marlin/fix_rope.py
Normal file
23
mods/fix-qwen35-tp4-marlin/fix_rope.py
Normal file
@@ -0,0 +1,23 @@
# Fix: ignore_keys_at_rope_validation is a list but transformers uses | (set union)
import re

path = "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/configs/qwen3_5_moe.py"
with open(path) as f:
    content = f.read()

old = """kwargs["ignore_keys_at_rope_validation"] = [
"mrope_section",
"mrope_interleaved",
]"""

new = """kwargs["ignore_keys_at_rope_validation"] = {
"mrope_section",
"mrope_interleaved",
}"""

content = content.replace(old, new)

with open(path, "w") as f:
    f.write(content)

print("Fixed ignore_keys_at_rope_validation: list -> set")
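The incompatibility this script works around is plain Python semantics: the `|` union operator is defined between two sets, not between a set and a list, which is why the swap from a `[...]` literal to a `{...}` literal matters. A quick standalone check (the base key name here is an illustrative stand-in, not taken from transformers):

```
# "|" works set-to-set but raises TypeError set-to-list, which is why
# the mod rewrites the list literal into a set literal.
base = {"rope_theta"}  # illustrative stand-in for the library's own keys
try:
    base | ["mrope_section", "mrope_interleaved"]
except TypeError as exc:
    print("list operand fails:", exc)
print("set operand works:", base | {"mrope_section", "mrope_interleaved"})
```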
46
mods/fix-qwen35-tp4-marlin/qwen3_5.patch
Normal file
@@ -0,0 +1,46 @@
--- qwen3_5.py.orig	2026-03-03 00:00:00.000000000 +0000
+++ qwen3_5.py	2026-03-03 00:00:00.000000000 +0000
@@ -166,11 +166,13 @@
z_size = self.value_dim // self.tp_size
mixed_qkv, z = mixed_qkvz.split([qkv_size, z_size], dim=-1)
z = z.reshape(z.size(0), -1, self.head_v_dim)
- ba, _ = self.in_proj_ba(hidden_states)
- b, a = ba.chunk(2, dim=-1)
-
- b = b.contiguous()
- a = a.contiguous()
+ # Replicated B/A projections — full output, sliced to local TP partition
+ b_full, _ = self.in_proj_b(hidden_states)
+ a_full, _ = self.in_proj_a(hidden_states)
+ _ba_chunk = self.num_v_heads // self.tp_size
+ _ba_start = self.tp_rank * _ba_chunk
+ b = b_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
+ a = a_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()

# ============================================================
# Part 2: Core Attention (Custom Op)
@@ -374,8 +376,6 @@
# GDN
("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)),
("in_proj_qkvz", "in_proj_z", 3),
- ("in_proj_ba", "in_proj_b", 0),
- ("in_proj_ba", "in_proj_a", 1),
]

params_dict = dict(self.named_parameters())
@@ -530,7 +530,6 @@
"gate_up_proj": ["gate_proj", "up_proj"],
# GDN fused projections.
"in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
- "in_proj_ba": ["in_proj_b", "in_proj_a"],
}

def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -630,7 +629,6 @@
class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid):
packed_modules_mapping = Qwen3VLForConditionalGeneration.packed_modules_mapping | {
"in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
- "in_proj_ba": ["in_proj_b", "in_proj_a"],
}

def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
56
mods/fix-qwen35-tp4-marlin/qwen3_next.patch
Normal file
@@ -0,0 +1,56 @@
--- qwen3_next.py.orig	2026-03-03 00:00:00.000000000 +0000
+++ qwen3_next.py	2026-03-03 00:00:00.000000000 +0000
@@ -411,15 +411,22 @@
quant_config=quant_config,
prefix=f"{prefix}.in_proj_qkvz",
)
- # ba_proj doesn't support blockwise fp8 quantization.
- # # in_proj_ba is defined as MergedColumnParallelLinear for
- # compatibility with Qwen3_5.
- self.in_proj_ba = MergedColumnParallelLinear(
+ # ba_proj: Use ReplicatedLinear to avoid Marlin TP split constraint
+ # (num_v_heads=64 is too small for TP=4 Marlin min_thread_n=64).
+ # Each rank loads full weights and slices in forward.
+ self.in_proj_b = ReplicatedLinear(
input_size=self.hidden_size,
- output_sizes=[self.num_v_heads] * 2,
+ output_size=self.num_v_heads,
bias=False,
quant_config=quant_config,
- prefix=f"{prefix}.in_proj_ba",
+ prefix=f"{prefix}.in_proj_b",
+ )
+ self.in_proj_a = ReplicatedLinear(
+ input_size=self.hidden_size,
+ output_size=self.num_v_heads,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.in_proj_a",
)

query_key_settings = (self.key_dim, 0, False)
@@ -584,7 +591,15 @@
# Part 1: Input Projection
# ============================================================
projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
- projected_states_ba, _ = self.in_proj_ba(hidden_states)
+ # Replicated B/A projections — full output, sliced to local TP partition
+ b_full, _ = self.in_proj_b(hidden_states)
+ a_full, _ = self.in_proj_a(hidden_states)
+ _ba_chunk = self.num_v_heads // self.tp_size
+ _ba_start = self.tp_rank * _ba_chunk
+ projected_states_ba = torch.cat([
+ b_full[:, _ba_start:_ba_start+_ba_chunk],
+ a_full[:, _ba_start:_ba_start+_ba_chunk],
+ ], dim=-1)
query, key, value, z, b, a = self.fix_query_key_value_ordering(
projected_states_qkvz, projected_states_ba
)
@@ -1326,7 +1341,6 @@
],
"gate_up_proj": ["gate_proj", "up_proj"],
"in_proj_qkvz": ["in_proj_qkvz"],
- "in_proj_ba": ["in_proj_ba"],
}

def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
23
mods/fix-qwen35-tp4-marlin/run.sh
Executable file
@@ -0,0 +1,23 @@
#!/bin/bash
# Fix Marlin TP=4 constraint for Qwen3.5-397B: in_proj_ba output_size=128 / TP=4 = 32 < min_thread_n=64
# Solution: Replace MergedColumnParallelLinear with two ReplicatedLinear for B/A projections
# Delivery: unified diff patches (portable across vLLM versions)

set -e
MOD_DIR="$(dirname "$0")"
MODELS_DIR="/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models"

echo "[fix-qwen35-tp4-marlin] Applying patches..."

# Apply patches with --forward (skip if already applied)
patch --forward --batch -p0 -d "$MODELS_DIR" < "$MOD_DIR/qwen3_next.patch" || {
    echo "[fix-qwen35-tp4-marlin] qwen3_next.patch already applied or failed"
}
patch --forward --batch -p0 -d "$MODELS_DIR" < "$MOD_DIR/qwen3_5.patch" || {
    echo "[fix-qwen35-tp4-marlin] qwen3_5.patch already applied or failed"
}

# Fix rope validation (idempotent)
python3 "$MOD_DIR/fix_rope.py"

echo "[fix-qwen35-tp4-marlin] Done."
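Both patches rely on the same replicate-then-slice pattern: every rank computes the full B/A projections and keeps only its TP partition. A standalone sketch of just that slicing, with made-up sizes and plain torch Linear layers standing in for vLLM's ReplicatedLinear:

```
import torch

# Illustrative sizes only; the real model uses num_v_heads=64 and tp_size=4.
tokens, hidden_size, num_v_heads, tp_size, tp_rank = 3, 16, 8, 4, 1

# ReplicatedLinear stand-ins: each rank holds the full weight matrix.
in_proj_b = torch.nn.Linear(hidden_size, num_v_heads, bias=False)
in_proj_a = torch.nn.Linear(hidden_size, num_v_heads, bias=False)

hidden_states = torch.randn(tokens, hidden_size)
b_full = in_proj_b(hidden_states)  # full output on every rank
a_full = in_proj_a(hidden_states)

# Slice out this rank's partition, exactly as the patched forward does.
chunk = num_v_heads // tp_size
start = tp_rank * chunk
b = b_full[:, start:start + chunk].contiguous()
a = a_full[:, start:start + chunk].contiguous()
print(b.shape, a.shape)  # torch.Size([3, 2]) torch.Size([3, 2])
```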
255
mods/gpu-mem-util-gb/gpu_mem.patch
Normal file
@@ -0,0 +1,255 @@
diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 3796265ff..b6dcfb54c 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -45,6 +45,11 @@ class CacheConfig:
not matter if you have another vLLM instance running on the same GPU. For
example, if you have two vLLM instances running on the same GPU, you can
set the GPU memory utilization to 0.5 for each instance."""
+ gpu_memory_utilization_gb: float | None = Field(default=None, gt=0)
+ """Amount of GPU memory to be used in GiB. This provides fine-grained control
+ over GPU memory usage and is particularly useful on unified memory systems
+ where available memory changes dynamically. If specified, it overrides
+ gpu_memory_utilization. Cannot be used simultaneously with kv_cache_memory_bytes."""
cache_dtype: CacheDType = "auto"
"""Data type for kv cache storage. If "auto", will use model data type.
CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
@@ -204,6 +209,18 @@ class CacheConfig:
object.__setattr__(self, "user_specified_block_size", True)
return self

+ @model_validator(mode="after")
+ def _validate_memory_params(self) -> "CacheConfig":
+     if (
+         self.gpu_memory_utilization_gb is not None
+         and self.kv_cache_memory_bytes is not None
+     ):
+         raise ValueError(
+             "Cannot specify both gpu_memory_utilization_gb and "
+             "kv_cache_memory_bytes. Please use only one of them."
+         )
+     return self
+
@field_validator("cache_dtype", mode="after")
@classmethod
def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 56bbb7bf5..db5012608 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -454,6 +454,7 @@ class EngineArgs:
offload_prefetch_step: int = PrefetchOffloadConfig.offload_prefetch_step
offload_params: set[str] = get_field(PrefetchOffloadConfig, "offload_params")
gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
+ gpu_memory_utilization_gb: float | None = CacheConfig.gpu_memory_utilization_gb
kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes
max_num_batched_tokens: int | None = None
max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
@@ -954,6 +955,9 @@ class EngineArgs:
cache_group.add_argument(
"--gpu-memory-utilization", **cache_kwargs["gpu_memory_utilization"]
)
+ cache_group.add_argument(
+     "--gpu-memory-utilization-gb", **cache_kwargs["gpu_memory_utilization_gb"]
+ )
cache_group.add_argument(
"--kv-cache-memory-bytes", **cache_kwargs["kv_cache_memory_bytes"]
)
@@ -1512,6 +1516,7 @@ class EngineArgs:
cache_config = CacheConfig(
block_size=self.block_size, # type: ignore[arg-type]
gpu_memory_utilization=self.gpu_memory_utilization,
+ gpu_memory_utilization_gb=self.gpu_memory_utilization_gb,
kv_cache_memory_bytes=self.kv_cache_memory_bytes,
cache_dtype=resolved_cache_dtype, # type: ignore[arg-type]
is_attention_free=model_config.is_attention_free,
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 5909b3043..c2607df6a 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -156,6 +156,11 @@ class LLM:
values will increase the KV cache size and thus improve the model's
throughput. However, if the value is too high, it may cause out-of-
memory (OOM) errors.
+ gpu_memory_utilization_gb: Amount of GPU memory to reserve in GiB.
+     This provides fine-grained control over GPU memory usage and is
+     particularly useful on unified memory systems where available memory
+     changes dynamically. If specified, it overrides gpu_memory_utilization.
+     Cannot be used simultaneously with kv_cache_memory_bytes.
kv_cache_memory_bytes: Size of KV Cache per GPU in bytes. By default,
this is set to None and vllm can automatically infer the kv cache
size based on gpu_memory_utilization. However, users may want to
@@ -234,6 +239,7 @@ class LLM:
chat_template: Path | str | None = None,
seed: int = 0,
gpu_memory_utilization: float = 0.92,
+ gpu_memory_utilization_gb: float | None = None,
cpu_offload_gb: float = 0,
offload_group_size: int = 0,
offload_num_in_group: int = 1,
@@ -356,6 +362,7 @@ class LLM:
tokenizer_revision=tokenizer_revision,
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
+ gpu_memory_utilization_gb=gpu_memory_utilization_gb,
kv_cache_memory_bytes=kv_cache_memory_bytes,
cpu_offload_gb=cpu_offload_gb,
offload_group_size=offload_group_size,
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 2ed7ef7e0..806830b17 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -622,7 +622,8 @@ def _check_enough_kv_cache_memory(
if available_memory <= 0:
raise ValueError(
"No available memory for the cache blocks. "
- "Try increasing `gpu_memory_utilization` when initializing the engine. "
+ "Try increasing `gpu_memory_utilization` or `gpu_memory_utilization_gb` "
+ "when initializing the engine. "
"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
"for more details."
)
@@ -643,8 +644,8 @@ def _check_enough_kv_cache_memory(
f"({max_model_len}), ({format_gib(needed_memory)} GiB KV "
f"cache is needed, which is larger than the available KV cache "
f"memory ({format_gib(available_memory)} GiB). {estimated_msg}"
- f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
- f"when initializing the engine. "
+ f"Try increasing `gpu_memory_utilization` or `gpu_memory_utilization_gb`, "
+ f"or decreasing `max_model_len` when initializing the engine. "
f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
f"for more details."
)
@@ -1438,7 +1439,8 @@ def _auto_fit_max_model_len(
if auto_fit_max <= 0:
raise ValueError(
"Cannot auto-fit max_model_len: not enough GPU memory available "
- "to serve even a single token. Try increasing `gpu_memory_utilization`."
+ "to serve even a single token. Try increasing `gpu_memory_utilization` "
+ "or `gpu_memory_utilization_gb`."
)

if auto_fit_max >= original_max:
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 3d065927e..e8cef2ceb 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -358,6 +358,7 @@ def report_usage_stats(
"dtype": str(vllm_config.model_config.dtype),
"block_size": vllm_config.cache_config.block_size,
"gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization,
+ "gpu_memory_utilization_gb": vllm_config.cache_config.gpu_memory_utilization_gb,
"kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes,
# Quantization
"quantization": vllm_config.model_config.quantization,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b53bd71a1..d28821328 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -5355,8 +5355,8 @@ class GPUModelRunner(
raise RuntimeError(
"CUDA out of memory occurred when warming up sampler with "
f"{num_reqs} dummy requests. Please try lowering "
- "`max_num_seqs` or `gpu_memory_utilization` when "
- "initializing the engine."
+ "`max_num_seqs`, `gpu_memory_utilization`, or "
+ "`gpu_memory_utilization_gb` when initializing the engine."
) from e
else:
raise e
@@ -5434,8 +5434,8 @@ class GPUModelRunner(
raise RuntimeError(
"CUDA out of memory occurred when warming up pooler "
f"({task=}) with {num_reqs} dummy requests. Please try "
- "lowering `max_num_seqs` or `gpu_memory_utilization` when "
- "initializing the engine."
+ "lowering `max_num_seqs`, `gpu_memory_utilization`, or "
+ "`gpu_memory_utilization_gb` when initializing the engine."
) from e
else:
raise e
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 842e76549..bf3bb359b 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -357,7 +357,8 @@ class Worker(WorkerBase):

Tip:
You may limit the usage of GPU memory
- by adjusting the `gpu_memory_utilization` parameter.
+ by adjusting the `gpu_memory_utilization` or
+ `gpu_memory_utilization_gb` parameter.
"""
if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes:
# still need a profile run which compiles the model for
@@ -369,7 +370,8 @@ class Worker(WorkerBase):
f"GiB, reserved {format_gib(kv_cache_memory_bytes)} GiB memory for "
"KV Cache as specified by kv_cache_memory_bytes config and "
"skipped memory profiling. This does not respect the "
- "gpu_memory_utilization config. Only use kv_cache_memory_bytes "
+ "gpu_memory_utilization or gpu_memory_utilization_gb config. "
+ "Only use kv_cache_memory_bytes "
"config when you want manual control of KV cache memory "
"size. If OOM'ed, check the difference of initial free "
"memory between the current run and the previous run "
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index d06c40ed6..89c94e641 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -405,21 +405,43 @@ def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) ->
Calculate the amount of memory required by vLLM, then validate
that the current amount of free memory is sufficient for that.
"""
- requested_memory = math.ceil(
-     init_snapshot.total_memory * cache_config.gpu_memory_utilization
- )
-
- if init_snapshot.free_memory < requested_memory:
-     raise ValueError(
-         f"Free memory on device {init_snapshot.device_} "
-         f"({format_gib(init_snapshot.free_memory)}/"
-         f"{format_gib(init_snapshot.total_memory)} GiB) on startup "
-         f"is less than desired GPU memory utilization "
-         f"({cache_config.gpu_memory_utilization}, "
-         f"{format_gib(requested_memory)} GiB). Decrease GPU memory "
-         f"utilization or reduce GPU memory used by other processes."
+ if cache_config.gpu_memory_utilization_gb is not None:
+     requested_memory = math.ceil(cache_config.gpu_memory_utilization_gb * 1024**3)
+     if requested_memory <= 0:
+         raise ValueError(
+             f"gpu_memory_utilization_gb must be positive, got "
+             f"{cache_config.gpu_memory_utilization_gb} GiB."
+         )
+     if requested_memory > init_snapshot.total_memory:
+         raise ValueError(
+             f"Requested memory ({format_gib(requested_memory)} GiB) exceeds "
+             f"total GPU memory ({format_gib(init_snapshot.total_memory)} GiB). "
+             f"Reduce gpu_memory_utilization_gb or use a smaller value."
+         )
+     safety_margin = 0.5 * 1024**3
+     if requested_memory > init_snapshot.free_memory + safety_margin:
+         raise ValueError(
+             f"Requested memory ({format_gib(requested_memory)} GiB) exceeds "
+             f"available memory ({format_gib(init_snapshot.free_memory)} GiB) "
+             f"with safety margin ({format_gib(safety_margin)} GiB). "
+             f"Reduce gpu_memory_utilization_gb or free up GPU memory."
+         )
+ else:
+     requested_memory = math.ceil(
+         init_snapshot.total_memory * cache_config.gpu_memory_utilization
)

+     if init_snapshot.free_memory < requested_memory:
+         raise ValueError(
+             f"Free memory on device {init_snapshot.device_} "
+             f"({format_gib(init_snapshot.free_memory)}/"
+             f"{format_gib(init_snapshot.total_memory)} GiB) on startup "
+             f"is less than desired GPU memory utilization "
+             f"({cache_config.gpu_memory_utilization}, "
+             f"{format_gib(requested_memory)} GiB). Decrease GPU memory "
+             f"utilization or reduce GPU memory used by other processes."
+         )
+
return requested_memory
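Stripped of vLLM plumbing, the new code path in `request_memory` reduces to a few lines of arithmetic. This simplified restatement (the function and argument names are condensed for illustration and are not the patched API) mirrors the checks in the hunk above:

```
import math

GIB = 1024**3

def requested_bytes(total, free, gb=None, utilization=0.9):
    # GiB path: exact byte count, capped by total and by free + 0.5 GiB margin.
    if gb is not None:
        requested = math.ceil(gb * GIB)
        if requested > total:
            raise ValueError("exceeds total GPU memory")
        if requested > free + 0.5 * GIB:
            raise ValueError("exceeds free memory plus safety margin")
        return requested
    # Fractional path: unchanged legacy behaviour.
    requested = math.ceil(total * utilization)
    if free < requested:
        raise ValueError("free memory below desired utilization")
    return requested

# Reserve exactly 96 GiB on a 128 GiB unified-memory device:
print(requested_bytes(total=128 * GIB, free=100 * GIB, gb=96) // GIB)  # 96
```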
6
mods/gpu-mem-util-gb/run.sh
Normal file
@@ -0,0 +1,6 @@
#!/bin/bash

set -e

patch -p1 -d /usr/local/lib/python3.12/dist-packages < gpu_mem.patch \
  && echo "=====> You can now use --gpu-memory-utilization-gb parameter to specify reserved memory in GiB"
4
mods/nemotron-super/run.sh
Normal file
@@ -0,0 +1,4 @@
#!/bin/bash
set -e
cd $WORKSPACE_DIR
wget https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/resolve/main/super_v3_reasoning_parser.py
@@ -1,117 +0,0 @@
#!/bin/bash
set -e

# Define a function to export immediately AND save to .bashrc for future sessions
export_persist() {
    local var_name="$1"
    local var_value="$2"

    # 1. Export for the current running process
    export "$var_name"="$var_value"

    # 2. Append to .bashrc (idempotent check to avoid duplicate lines)
    if ! grep -q "export $var_name=" ~/.bashrc; then
        echo "export $var_name=\"$var_value\"" >> ~/.bashrc
    else
        # Optional: Update the existing line if it exists
        sed -i "s|export $var_name=.*|export $var_name=\"$var_value\"|" ~/.bashrc
    fi
}

# --- Help Function ---
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Required Arguments:"
    echo "  -r, --role <head|node> : Set the node type"
    echo "  -h, --host-ip <ip>     : IP address of this interface (Host IP)"
    echo "  -e, --eth-if <name>    : Ethernet interface name (e.g., eth0)"
    echo "  -i, --ib-if <name>     : InfiniBand/RDMA interface name"
    echo ""
    echo "Conditional Arguments:"
    echo "  -m, --head-ip <ip>     : IP of the head node (REQUIRED if role is 'node')"
    echo ""
    echo "Example:"
    echo "  $0 --role head --host-ip 192.168.1.10 --eth-if eth0 --ib-if ib0"
    echo "  $0 --role node --host-ip 192.168.1.20 --eth-if eth0 --ib-if ib0 --head-ip 192.168.1.10"
    exit 1
}

# --- Argument Parsing ---

# Initialize variables to empty
NODE_TYPE=""
HOST_IP=""
ETH_IF_NAME=""
IB_IF_NAME=""
HEAD_IP=""

while [[ "$#" -gt 0 ]]; do
    case $1 in
        -r|--role) NODE_TYPE="$2"; shift ;;
        -h|--host-ip) HOST_IP="$2"; shift ;;
        -e|--eth-if) ETH_IF_NAME="$2"; shift ;;
        -i|--ib-if) IB_IF_NAME="$2"; shift ;;
        -m|--head-ip) HEAD_IP="$2"; shift ;;
        *) echo "Unknown parameter passed: $1"; usage ;;
    esac
    shift
done

# --- Validation ---

# 1. Check if all common required arguments are present
if [[ -z "$NODE_TYPE" || -z "$HOST_IP" || -z "$ETH_IF_NAME" || -z "$IB_IF_NAME" ]]; then
    echo "Error: Missing required arguments."
    usage
fi

# 2. Validate Role
if [[ "$NODE_TYPE" != "head" && "$NODE_TYPE" != "node" ]]; then
    echo "Error: --role must be 'head' or 'node'."
    exit 1
fi

# 3. Conditional Check for Head IP
if [[ "$NODE_TYPE" == "node" && -z "$HEAD_IP" ]]; then
    echo "Error: When --role is 'node', you must provide --head-ip."
    exit 1
fi

# --- Environment Configuration ---

echo "Configuring environment for [$NODE_TYPE] at $HOST_IP..."

export_persist VLLM_HOST_IP "$HOST_IP"
export_persist RAY_NODE_IP_ADDRESS "$HOST_IP"
export_persist RAY_OVERRIDE_NODE_IP_ADDRESS "$HOST_IP"

# Network Interface
export_persist MN_IF_NAME "$ETH_IF_NAME"
export_persist UCX_NET_DEVICES "$ETH_IF_NAME"
export_persist NCCL_SOCKET_IFNAME "$ETH_IF_NAME"

# InfiniBand
export_persist NCCL_IB_HCA "$IB_IF_NAME"
export_persist NCCL_IB_DISABLE "0"

# Sockets/Transport
export_persist OMPI_MCA_btl_tcp_if_include "$ETH_IF_NAME"
export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist RAY_memory_monitor_refresh_ms "0"

# --- Execution ---

if [ "${NODE_TYPE}" == "head" ]; then
    echo "Starting Ray HEAD node..."
    exec ray start --block --head --port 6379 \
        --node-ip-address "$VLLM_HOST_IP" \
        --disable-usage-stats
else
    echo "Starting Ray WORKER node connecting to $HEAD_IP..."
    exec ray start --block \
        --address="$HEAD_IP:6379" \
        --node-ip-address "$VLLM_HOST_IP"
fi
@@ -1,6 +1,5 @@
#!/bin/bash
set -e

echo "Setting up cluster initialization script..."
cp run-cluster-node.sh $WORKSPACE_DIR/run-cluster-node.sh
chmod +x $WORKSPACE_DIR/run-cluster-node.sh
# NGC vLLM mod: container initialization is now handled by launch-cluster.sh
echo "NGC vLLM mod applied."
58
recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
Normal file
@@ -0,0 +1,58 @@
# Recipe: Qwen3.5-397B-A17B-INT4-Autoround
# Qwen3.5-397B model in Intel INT4-Autoround quantization
# Important: set memory utilization in GB, not percentage! Requires --no-ray to fit full context on three sparks.
# If you experience node shutdown, please limit GPU clocks on the affected node (or all of them): `sudo nvidia-smi -lgc 200,2150`

recipe_version: "1"
name: Qwen3.5-397B-INT4-Autoround (PP=3)
description: Recipe for Qwen3.5-397B-INT4-Autoround to run on 3-node mesh in pipeline-parallel mode

# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3.5-397B-A17B-int4-AutoRound

cluster_only: true

# Container image to use
container: vllm-node-tf5

build_args:
  - --tf5

mods:
  - mods/fix-qwen3.5-chat-template

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  pipeline_parallel: 3
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_batched_tokens: 16384

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1

# The vLLM serve command template
command: |
  vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
    --max-model-len {max_model_len} \
    --max-num-seqs 10 \
    --kv-cache-dtype fp8 \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --port {port} \
    --host {host} \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --trust-remote-code \
    --chat-template unsloth.jinja \
    -tp 1 \
    -pp {pipeline_parallel} \
    --load-format instanttensor \
    --distributed-executor-backend ray
45
recipes/4x-spark-cluster/minimax-m2.5.yaml
Normal file
@@ -0,0 +1,45 @@
# Recipe: MiniMax-M2.5
# MiniMaxAI/MiniMax-M2.5

recipe_version: "1"
name: MiniMax-M2.5
description: vLLM serving MiniMax-M2.5 with Ray distributed backend

# HuggingFace model to download (optional, for --download-model)
model: MiniMaxAI/MiniMax-M2.5

# Container image to use
container: vllm-node

# Can only be run in a cluster
cluster_only: true

# No mods required
mods: []

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 4
  gpu_memory_utilization: 0.90
  max_model_len: 128000

# Environment variables
env:
  VLLM_DISTRIBUTED_EXECUTOR_CONFIG: '{"placement_group_options":{"strategy":"SPREAD"}}'

# The vLLM serve command template
command: |
  vllm serve MiniMaxAI/MiniMax-M2.5 \
    --trust-remote-code \
    --port {port} \
    --host {host} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray \
    --max-model-len {max_model_len} \
    --load-format fastsafetensors \
    --enable-auto-tool-choice \
    --tool-call-parser minimax_m2 \
    --reasoning-parser minimax_m2_append_think
61
recipes/4x-spark-cluster/qwen3.5-397b-a17B-fp8.yaml
Normal file
@@ -0,0 +1,61 @@
# Recipe: Qwen3.5-397B-A17B-FP8
# Qwen3.5-397B-A17B model in FP8 precision
# Multi-modal input

recipe_version: "1"
name: Qwen3.5-397B-A17B-FP8
description: vLLM serving Qwen3.5-397B-A17B-FP8

# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3.5-397B-A17B-FP8

#solo_only: true

# Container image to use
container: vllm-node-tf5

build_args:
  - --tf5

# Mod required to fix ROPE syntax error
mods:
  - mods/fix-qwen3.5-autoround

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 4
  gpu_memory_utilization: 0.85
  max_model_len: 262144
  max_num_batched_tokens: 8192

# Environment variables
env:
  VLLM_USE_DEEP_GEMM: 0
  VLLM_USE_FLASHINFER_MOE_FP16: 1
  VLLM_USE_FLASHINFER_SAMPLER: 0
  OMP_NUM_THREADS: 4

# The vLLM serve command template
command: |
  vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \
    --max-model-len {max_model_len} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --port {port} \
    --host {host} \
    --load-format fastsafetensors \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --trust-remote-code \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray \
    --mm-encoder-tp-mode data \
    --kv-cache-dtype fp8 \
    --compilation-config.cudagraph_mode none \
    --max-num-seqs 32 \
    --attention-backend flashinfer
53
recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
Normal file
@@ -0,0 +1,53 @@
# Recipe: Qwen3.5-397B-A17B-INT4-Autoround
# Qwen3.5-397B model in Intel INT4-Autoround quantization, TP=4 across 4 DGX Spark nodes
# Benchmarked at 37 tok/s single-user, 103 tok/s aggregate (4 concurrent) on 4× DGX Spark
# Requires NVIDIA driver 580.x (590.x has CUDAGraph deadlock bug on GB10)

recipe_version: "1"
name: Qwen3.5-397B-INT4-Autoround
description: Qwen3.5-397B with TP=4 across 4 DGX Spark nodes (Marlin fix applied)

# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3.5-397B-A17B-int4-AutoRound

# Container image to use
container: vllm-node-tf5

build_args:
  - --tf5

# Mods required: coder-next tool/reasoning parser + Marlin TP fix
mods:
  - mods/fix-qwen3-coder-next
  - mods/fix-qwen35-tp4-marlin

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1
  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True

# Default settings (can be overridden via CLI, e.g. --tensor_parallel 2)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 4
  gpu_memory_utilization: 0.78
  max_model_len: 32768
  max_num_batched_tokens: 8192

# The vLLM serve command template
command: |
  vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    --enable-auto-tool-choice \
    --tensor-parallel-size {tensor_parallel} \
    --distributed-executor-backend ray \
    --kv-cache-dtype fp8 \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --max-model-len {max_model_len} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --enable-prefix-caching \
    --trust-remote-code \
    --host {host} \
    --port {port}
@@ -44,12 +44,16 @@ The recipe runner can automatically discover cluster nodes:
```

When you run `--discover`, it:
1. Scans the network for nodes with SSH access
2. Prompts you to select which nodes to include
3. Saves the configuration to `.env`
1. Detects active CX7 interfaces and determines mesh vs. standard topology.
2. Scans the network for peers that are both SSH-reachable **and** have an NVIDIA GB10 GPU.
3. In mesh mode, separately discovers `COPY_HOSTS` on the direct IB-attached interfaces.
4. Prompts for per-node confirmation for `CLUSTER_NODES` and `COPY_HOSTS`.
5. Saves the full configuration (including mesh NCCL settings if applicable) to `.env`.

Future recipe runs will automatically use nodes from `.env` unless you specify `-n` or `--solo`.

When distributing the container image or model files, the runner uses `COPY_HOSTS` from `.env` (which may differ from `CLUSTER_NODES` in mesh mode) to ensure transfers go over the fastest available path.

## Workflow Modes

### Solo Mode (Single Node)
@@ -169,6 +173,7 @@ Usage: ./run-recipe.sh [OPTIONS] [RECIPE]
Cluster discovery:
  --discover              Auto-detect cluster nodes and save to .env
  --show-env              Show current .env configuration
  --config FILE           Path to .env configuration file (default: .env in repo directory)

Recipe overrides:
  --port PORT             Override port
@@ -186,10 +191,25 @@ Setup options:

Launch options:
  --solo                  Run in solo mode (single node, no Ray)
  --no-ray                Multi-node without Ray (PyTorch distributed backend)
  -n, --nodes IPS         Comma-separated node IPs (first = head)
  -d, --daemon            Run in daemon mode
  -t, --container IMAGE   Override container from recipe
  --name NAME             Override container name
  --nccl-debug LEVEL      NCCL debug level (VERSION, WARN, INFO, TRACE)
  --master-port PORT      Cluster coordination port: Ray head port or PyTorch
                          distributed master port (default: 29501).
                          Alias: --head-port
  --eth-if IFACE          Override Ethernet interface
  --ib-if IFACE           Override InfiniBand interface
  -e VAR=VALUE            Pass environment variable to container (repeatable)
  -j N                    Number of parallel build jobs
  --no-cache-dirs         Do not mount ~/.cache/vllm, ~/.cache/flashinfer, ~/.triton
  --non-privileged        Run container without --privileged
  --mem-limit-gb N        Memory limit in GB (only with --non-privileged)
  --mem-swap-limit-gb N   Memory+swap limit in GB (only with --non-privileged)
  --pids-limit N          Process limit (only with --non-privileged)
  --shm-size-gb N         Shared memory size in GB (only with --non-privileged)

Extra vLLM arguments:
  -- ARGS...              Pass additional arguments directly to vLLM
@@ -261,10 +281,18 @@ command: |

```
┌─────────────────────────────────────────────────────────┐
│ autodiscover.sh                                         │
│ - Interface detection (standard / mesh topology)        │
│ - GB10 peer verification via SSH                        │
│ - CLUSTER_NODES and COPY_HOSTS discovery                │
│ - Interactive .env save with per-node confirmation      │
└──────────────────────────┬──────────────────────────────┘
                           │ sourced by
                           ▼
┌─────────────────────────────────────────────────────────┐
│ run-recipe.sh / run-recipe.py                           │
│ - Parses YAML recipe                                    │
│ - Auto-discovers cluster nodes (--discover)             │
│ - Loads nodes from .env                                 │
│ - Loads / triggers cluster discovery (--discover)       │
│ - Handles --setup (build + download + run)              │
│ - Generates launch script from template                 │
│ - Applies CLI overrides                                 │
@@ -274,15 +302,15 @@ command: |
┌──────────────────────┐   ┌───────────────────────────────┐
│ build-and-copy.sh    │   │ hf-download.sh                │
│ - Docker build       │   │ - HuggingFace model download  │
│ - Copy to workers    │   │ - Rsync to workers            │
│ - Copy to COPY_HOSTS │   │ - Rsync to COPY_HOSTS         │
└──────────────────────┘   └───────────────────────────────┘
           │
           │
           │ then calls (for run)
           ▼
┌─────────────────────────────────────────────────────────┐
│ launch-cluster.sh                                       │
│ - Cluster orchestration                                 │
│ - Container lifecycle                                   │
│ - Container lifecycle (trimmed to required node count)  │
│ - Mod application                                       │
│ - Launch script execution                               │
└─────────────────────────────────────────────────────────┘
53
recipes/gemma4-26b-a4b.yaml
Normal file
@@ -0,0 +1,53 @@
# Recipe: Gemma4-26B-A4B
# Gemma4-26B-A4B model in online FP8 quantization

recipe_version: "1"
name: Gemma4-26B-A4B
description: vLLM serving Gemma4-26B-A4B

# HuggingFace model to download (optional, for --download-model)
model: google/gemma-4-26B-A4B-it

# Solo and cluster are both supported
cluster_only: false
solo_only: false

# Container image to use
container: vllm-node-tf5

build_args:
  - --tf5

# Mods
# mods:
#   - mods/fix-gemma4-tool-parser

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_batched_tokens: 8192

# Environment variables
env: {}

# The vLLM serve command template
command: |
  vllm serve google/gemma-4-26B-A4B-it \
    --max-model-len {max_model_len} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --port {port} \
    --host {host} \
    --load-format safetensors \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser gemma4 \
    --reasoning-parser gemma4 \
    --quantization fp8 \
    --kv-cache-dtype fp8 \
    --max-num-batched-tokens {max_num_batched_tokens} \
    -tp {tensor_parallel} --distributed-executor-backend ray
@@ -30,8 +30,8 @@ build_args:

# Mods to apply before running (paths relative to repo root)
# This mod prevents severe inference speed degradation
mods:
  - mods/fix-glm-4.7-flash-AWQ
# mods:
#   - mods/fix-glm-4.7-flash-AWQ

# Default settings (can be overridden via CLI)
defaults:
44
recipes/minimax-m2.7-awq.yaml
Normal file
@@ -0,0 +1,44 @@
# Recipe: MiniMax-M2.7-AWQ
# MiniMax M2.7 model with AWQ quantization

recipe_version: "1"
name: MiniMax-M2.7-AWQ
description: vLLM serving MiniMax-M2.7-AWQ with Ray distributed backend

# HuggingFace model to download (optional, for --download-model)
model: cyankiwi/MiniMax-M2.7-AWQ-4bit

# Container image to use
container: vllm-node

# Can only be run in a cluster
cluster_only: true

# No mods required
mods: []

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.8
  max_model_len: 196608

# Environment variables
env: {}

# The vLLM serve command template
command: |
  vllm serve cyankiwi/MiniMax-M2.7-AWQ-4bit \
    --trust-remote-code \
    --port {port} \
    --host {host} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray \
    --max-model-len {max_model_len} \
    --load-format fastsafetensors \
    --enable-auto-tool-choice \
    --tool-call-parser minimax_m2 \
    --reasoning-parser minimax_m2
@@ -25,16 +25,12 @@ defaults:
  host: 0.0.0.0
  tensor_parallel: 1
  gpu_memory_utilization: 0.7
  max_model_len: 131072

# Environment variables
env:
  VLLM_USE_FLASHINFER_MOE_FP4: 1
  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
  max_model_len: 262144

# The vLLM serve command template
command: |
  vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
    --moe-backend cutlass \
    --max-model-len {max_model_len} \
    --port {port} --host {host} \
    --trust-remote-code \
@@ -44,6 +40,5 @@ command: |
    --reasoning-parser nano_v3 \
    --kv-cache-dtype fp8 \
    --enable-prefix-caching \
    --attention-backend flashinfer \
    --load-format fastsafetensors \
    --gpu-memory-utilization {gpu_memory_utilization}
46
recipes/nemotron-3-super-nvfp4.yaml
Normal file
@@ -0,0 +1,46 @@
# Recipe: Nemotron-3-Super-NVFP4
# Uses VLLM_CUTLASS for NVFP4
recipe_version: "1"
name: Nemotron-3-Super-NVFP4-CUTLASS-Optimized
description: vLLM serving Nemotron-3-Super-120B using CUTLASS kernels

model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
container: vllm-node
cluster_only: false
solo_only: false

# mods:
#   - mods/nemotron-super

env:
  VLLM_FLASHINFER_ALLREDUCE_BACKEND: trtllm
  VLLM_ALLOW_LONG_MAX_MODEL_LEN: 1

defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_seqs: 10

command: |
  vllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \
    --kv-cache-dtype fp8 \
    --moe-backend cutlass \
    --trust-remote-code \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --max-model-len {max_model_len} \
    --max-num-seqs {max_num_seqs} \
    --enable-prefix-caching \
    --host {host} \
    --port {port} \
    --enable-auto-tool-choice \
    --load-format fastsafetensors \
    --tool-call-parser qwen3_coder \
    --reasoning-parser nemotron_v3 \
    --mamba_ssm_cache_dtype float32 \
    --tensor-parallel-size {tensor_parallel} \
    --attention-backend TRITON_ATTN \
    --distributed-executor-backend ray
@@ -11,6 +11,9 @@ model: openai/gpt-oss-120b
# Container image to use
container: vllm-node-mxfp4

# Only solo now
solo_only: true

# Build arguments for build-and-copy.sh
build_args:
  - --exp-mxfp4
@@ -22,7 +25,7 @@ mods: []
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  tensor_parallel: 1
  gpu_memory_utilization: 0.70
  max_num_batched_tokens: 8192

@@ -37,8 +40,6 @@ command: |
    --tool-call-parser openai \
    --reasoning-parser openai_gptoss \
    --enable-auto-tool-choice \
    --tensor-parallel-size {tensor_parallel} \
    --distributed-executor-backend ray \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --enable-prefix-caching \
    --load-format fastsafetensors \
@@ -15,8 +15,8 @@ model: Qwen/Qwen3-Coder-Next-FP8
container: vllm-node

# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857)
mods:
  - mods/fix-qwen3-coder-next
# mods:
#   - mods/fix-qwen3-coder-next

# Default settings (can be overridden via CLI)
defaults:
43
recipes/qwen3-coder-next-int4-autoround.yaml
Normal file
@@ -0,0 +1,43 @@
# Recipe: Qwen3-Coder-Next-int4-Autoround
# Qwen3-Coder-Next model in Intel int4-Autoround format

recipe_version: "1"
name: Qwen3-Coder-Next-int4-Autoround
description: Qwen3-Coder-Next-int4-Autoround

# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3-Coder-Next-int4-AutoRound

solo_only: true

# Container image to use
container: vllm-node

# Mod required to fix autoround weight loading issues
mods:
  - mods/fix-qwen3-next-autoround

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  gpu_memory_utilization: 0.7
  max_model_len: 262144

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1

# The vLLM serve command template
command: |
  vllm serve Intel/Qwen3-Coder-Next-int4-AutoRound \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --host {host} \
    --port {port} \
    --load-format fastsafetensors \
    --enable-prefix-caching \
    --max-model-len {max_model_len}
47
recipes/qwen3.5-122b-fp8.yaml
Normal file
@@ -0,0 +1,47 @@
# Recipe: Qwen3.5-122B-A10B-FP8
# Qwen3.5-122B model in native FP8 quantization

recipe_version: "1"
name: Qwen3.5-122B-FP8
description: vLLM serving Qwen3.5-122B-FP8

# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3.5-122B-A10B-FP8

# Only cluster is supported
cluster_only: true

# Container image to use
container: vllm-node

# Mod required to fix the chat template
mods:
  - mods/fix-qwen3.5-chat-template

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_batched_tokens: 8192

# Environment variables
env: {}

# The vLLM serve command template
command: |
  vllm serve Qwen/Qwen3.5-122B-A10B-FP8 \
    --max-model-len {max_model_len} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --port {port} \
    --host {host} \
    --load-format fastsafetensors \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    --chat-template unsloth.jinja \
    -tp {tensor_parallel} --distributed-executor-backend ray \
    --max-num-batched-tokens {max_num_batched_tokens}
@@ -18,7 +18,8 @@ build_args:

# Mod required to fix ROPE syntax error
mods:
  - mods/fix-qwen3.5-autoround
#  - mods/fix-qwen3.5-autoround
  - mods/fix-qwen3.5-chat-template

# Default settings (can be overridden via CLI)
defaults:
@@ -43,10 +44,11 @@ command: |
    --load-format fastsafetensors \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --tool-call-parser qwen3_xml \
    --reasoning-parser qwen3 \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --trust-remote-code \
    --chat-template unsloth.jinja \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray
51
recipes/qwen3.5-35b-a3b-fp8.yaml
Normal file
@@ -0,0 +1,51 @@
# Recipe: Qwen/Qwen3.5-35B-A3B-FP8
# Qwen/Qwen3.5-35B-A3B model in native FP8 format

recipe_version: "1"
name: Qwen35-35B-A3B
description: vLLM serving Qwen3.5-35B-A3B-FP8

# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3.5-35B-A3B-FP8

#solo_only: true

# Container image to use
container: vllm-node

# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857)
mods:
  - mods/fix-qwen3-coder-next
  - mods/fix-qwen3.5-chat-template

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_batched_tokens: 16384

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1

# The vLLM serve command template
command: |
  vllm serve Qwen/Qwen3.5-35B-A3B-FP8 \
    --host {host} \
    --port {port} \
    --max-model-len {max_model_len} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --kv-cache-dtype fp8 \
    --load-format fastsafetensors \
    --attention-backend flashinfer \
    --enable-prefix-caching \
    --chat-template unsloth.jinja \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray
59
recipes/qwen3.5-397b-int4-autoround.yaml
Normal file
@@ -0,0 +1,59 @@
# Recipe: Qwen3.5-397B-A17B-INT4-Autoround
# Qwen3.5-397B model in Intel INT4-Autoround quantization
# Important: set memory utilization in GB, not percentage! Requires --no-ray to fit full context on two sparks.
# If you experience node shutdown, please limit GPU clocks on the affected node (or both): `sudo nvidia-smi -lgc 200,2150`

recipe_version: "1"
name: Qwen3.5-397B-INT4-Autoround
description: EXPERIMENTAL recipe for Qwen3.5-397B-INT4-Autoround (please refer to README for details! Use with `--no-ray` parameter!)

# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3.5-397B-A17B-int4-AutoRound

cluster_only: true

# Container image to use
container: vllm-node-tf5

build_args:
  - --tf5

mods:
  - mods/fix-qwen3.5-chat-template
  #- mods/gpu-mem-util-gb
  # - mods/drop-caches

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.9
  max_model_len: 262144
  max_num_batched_tokens: 4176

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1

# The vLLM serve command template
command: |
  vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
    --max-model-len {max_model_len} \
    --max-num-seqs 2 \
    --kv-cache-dtype fp8 \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --port {port} \
    --host {host} \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_xml \
    --reasoning-parser qwen3 \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --trust-remote-code \
    --chat-template unsloth.jinja \
    --load-format instanttensor \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray
51
recipes/qwen3.6-35b-a3b-fp8-dflash.yaml
Normal file
@@ -0,0 +1,51 @@
# Recipe: Qwen/Qwen3.6-35B-A3B-FP8
# Qwen/Qwen3.6-35B-A3B model in native FP8 format

recipe_version: "1"
name: Qwen36-35B-A3B
description: vLLM serving Qwen3.6-35B-A3B-FP8

# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3.6-35B-A3B-FP8

#solo_only: true

# Container image to use
container: vllm-node

# Mod required to fix the bundled chat template (the recipe passes --chat-template fixed_chat_template.jinja)
mods:
  - mods/fix-qwen3.6-chat-template

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_batched_tokens: 16384

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1

# The vLLM serve command template
command: |
  vllm serve Qwen/Qwen3.6-35B-A3B-FP8 \
    --host {host} \
    --port {port} \
    --max-model-len {max_model_len} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_xml \
    --reasoning-parser qwen3 \
    --load-format fastsafetensors \
    --attention-backend flash_attn \
    --enable-prefix-caching \
    --chat-template fixed_chat_template.jinja \
    --speculative-config '{{"method": "dflash", "model": "z-lab/Qwen3.6-35B-A3B-DFlash", "num_speculative_tokens": 15}}' \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray
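One detail worth noting in this recipe: the doubled braces in the `--speculative-config` JSON exist because the command block appears to pass through Python `str.format` to fill the `{port}`-style placeholders, so literal braces must be escaped. A quick sketch of that assumption:

```
# {port} is substituted; {{...}} collapses to literal braces for the JSON.
template = "--port {port} --speculative-config '{{\"method\": \"dflash\"}}'"
print(template.format(port=8000))
# --port 8000 --speculative-config '{"method": "dflash"}'
```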
49
recipes/qwen3.6-35b-a3b-fp8.yaml
Normal file
@@ -0,0 +1,49 @@
# Recipe: Qwen/Qwen3.6-35B-A3B-FP8
# Qwen/Qwen3.6-35B-A3B model in native FP8 format

recipe_version: "1"
name: Qwen36-35B-A3B
description: vLLM serving Qwen3.6-35B-A3B-FP8

# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3.6-35B-A3B-FP8

#solo_only: true

# Container image to use
container: vllm-node

mods:
  - mods/fix-qwen3.6-chat-template

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_batched_tokens: 16384

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1

# The vLLM serve command template
command: |
  vllm serve Qwen/Qwen3.6-35B-A3B-FP8 \
    --host {host} \
    --port {port} \
    --max-model-len {max_model_len} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_xml \
    --kv-cache-dtype fp8 \
    --load-format fastsafetensors \
    --attention-backend flashinfer \
    --enable-prefix-caching \
    --chat-template fixed_chat_template.jinja \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray
@@ -1,120 +0,0 @@
#!/bin/bash
set -e

# Define a function to export immediately AND save to .bashrc for future sessions
export_persist() {
    local var_name="$1"
    local var_value="$2"

    # 1. Export for the current running process
    export "$var_name"="$var_value"

    # 2. Append to .bashrc (idempotent check to avoid duplicate lines)
    if ! grep -q "export $var_name=" ~/.bashrc; then
        echo "export $var_name=\"$var_value\"" >> ~/.bashrc
    else
        # Optional: Update the existing line if it exists
        sed -i "s|export $var_name=.*|export $var_name=\"$var_value\"|" ~/.bashrc
    fi
}

# --- Help Function ---
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Required Arguments:"
    echo "  -r, --role <head|node>  : Set the node type"
    echo "  -h, --host-ip <ip>      : IP address of this interface (Host IP)"
    echo "  -e, --eth-if <name>     : Ethernet interface name (e.g., eth0)"
    echo "  -i, --ib-if <name>      : InfiniBand/RDMA interface name"
    echo ""
    echo "Conditional Arguments:"
    echo "  -m, --head-ip <ip>      : IP of the head node (REQUIRED if role is 'node')"
    echo ""
    echo "Example:"
    echo "  $0 --role head --host-ip 192.168.1.10 --eth-if eth0 --ib-if ib0"
    echo "  $0 --role node --host-ip 192.168.1.20 --eth-if eth0 --ib-if ib0 --head-ip 192.168.1.10"
    exit 1
}

# --- Argument Parsing ---

# Initialize variables to empty
NODE_TYPE=""
HOST_IP=""
ETH_IF_NAME=""
IB_IF_NAME=""
HEAD_IP=""

while [[ "$#" -gt 0 ]]; do
    case $1 in
        -r|--role) NODE_TYPE="$2"; shift ;;
        -h|--host-ip) HOST_IP="$2"; shift ;;
        -e|--eth-if) ETH_IF_NAME="$2"; shift ;;
        -i|--ib-if) IB_IF_NAME="$2"; shift ;;
        -m|--head-ip) HEAD_IP="$2"; shift ;;
        *) echo "Unknown parameter passed: $1"; usage ;;
    esac
    shift
done

# --- Validation ---

# 1. Check if all common required arguments are present
if [[ -z "$NODE_TYPE" || -z "$HOST_IP" || -z "$ETH_IF_NAME" || -z "$IB_IF_NAME" ]]; then
    echo "Error: Missing required arguments."
    usage
fi

# 2. Validate Role
if [[ "$NODE_TYPE" != "head" && "$NODE_TYPE" != "node" ]]; then
    echo "Error: --role must be 'head' or 'node'."
    exit 1
fi

# 3. Conditional Check for Head IP
if [[ "$NODE_TYPE" == "node" && -z "$HEAD_IP" ]]; then
    echo "Error: When --role is 'node', you must provide --head-ip."
    exit 1
fi

# --- Environment Configuration ---

echo "Configuring environment for [$NODE_TYPE] at $HOST_IP..."

export_persist VLLM_HOST_IP "$HOST_IP"
export_persist RAY_NODE_IP_ADDRESS "$HOST_IP"
export_persist RAY_OVERRIDE_NODE_IP_ADDRESS "$HOST_IP"

# Network Interface
export_persist MN_IF_NAME "$ETH_IF_NAME"
export_persist UCX_NET_DEVICES "$ETH_IF_NAME"
export_persist NCCL_SOCKET_IFNAME "$ETH_IF_NAME"

# InfiniBand
export_persist NCCL_IB_HCA "$IB_IF_NAME"
export_persist NCCL_IB_DISABLE "0"

# Sockets/Transport
export_persist OMPI_MCA_btl_tcp_if_include "$ETH_IF_NAME"
export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist RAY_memory_monitor_refresh_ms "0"

# --- Execution ---

if [ "${NODE_TYPE}" == "head" ]; then
    echo "Starting Ray HEAD node..."
    exec ray start --block --head --port 6379 \
        --node-ip-address "$VLLM_HOST_IP" \
        --include-dashboard=True \
        --dashboard-host "0.0.0.0" \
        --dashboard-port 8265 \
        --disable-usage-stats
else
    echo "Starting Ray WORKER node connecting to $HEAD_IP..."
    exec ray start --block \
        --address="$HEAD_IP:6379" \
        --node-ip-address "$VLLM_HOST_IP"
fi
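For context on what this removed script did: every variable went through export_persist, so the networking configuration survived new shells via ~/.bashrc before the node joined Ray. A minimal sanity check on a configured node (a sketch, assuming bash is the login shell; ray status is the standard Ray CLI) would be:

    # Confirm the persisted networking variables and that the node joined the cluster
    grep -E 'VLLM_HOST_IP|NCCL_SOCKET_IFNAME|NCCL_IB_HCA' ~/.bashrc
    ray status    # lists head and worker nodes once `ray start` has run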
run-recipe.py (837 lines changed)
File diff suppressed because it is too large
@@ -728,6 +728,48 @@ test_launch_cmd_no_solo_in_cluster() {
    fi
}

# Test: -e / --env passthrough to launch-cluster.sh
test_launch_cmd_env_passthrough() {
    log_test "Launch command includes -e env vars"

    recipe_name=$(find_solo_recipe)
    if [[ -z "$recipe_name" ]]; then
        log_skip "No solo-capable recipes found"
        return
    fi

    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -e HF_TOKEN=test123 -e MY_VAR=hello 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q "\-e HF_TOKEN=test123" && echo "$launch_cmd" | grep -q "\-e MY_VAR=hello"; then
        log_pass "Launch command includes -e env vars"
    else
        log_fail "-e env vars not found in launch command"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: no -e flags when none specified
test_launch_cmd_no_env_by_default() {
    log_test "Launch command omits -e when no env vars specified"

    recipe_name=$(find_solo_recipe)
    if [[ -z "$recipe_name" ]]; then
        log_skip "No solo-capable recipes found"
        return
    fi

    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q " -e "; then
        log_fail "Unexpected -e flag in launch command"
        log_verbose "Launch cmd: $launch_cmd"
    else
        log_pass "Launch command correctly omits -e when none specified"
    fi
}

# ==============================================================================
# README Documentation Verification Tests
# ==============================================================================
@@ -1203,6 +1245,8 @@ main() {
    test_launch_cmd_launch_script
    test_launch_cmd_container_override
    test_launch_cmd_no_solo_in_cluster
    test_launch_cmd_env_passthrough
    test_launch_cmd_no_env_by_default
    echo ""

    # README documentation verification tests
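The two new checks are wired into main() alongside the other launch-command tests, so they run with the rest of the suite. The diff header does not show the test file's name, so the path below is hypothetical:

    # Hypothetical path: this diff does not name the test script
    ./tests/test-run-recipe.sh    # main() now also runs the two -e passthrough tests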
wheels/.gitignore (new vendored file, 2 lines)
@@ -0,0 +1,2 @@
*
!.gitignore
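This two-line pattern is the standard keep-the-directory idiom: "*" ignores everything inside wheels/, and "!.gitignore" re-includes the ignore file itself, so the directory stays in the tree while locally built wheels are never committed. It can be verified with plain git (the .whl name below is just an example):

    # -v shows which rule matched; check-ignore exits non-zero when a path is not ignored
    git check-ignore -v wheels/example-0.1-py3-none-any.whl
    git check-ignore -v wheels/.gitignore || echo "wheels/.gitignore is tracked"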