Merge branch '3-node-autodiscover'

This commit is contained in:
Eugene Rakhmatulin
2026-03-31 18:22:23 -07:00
11 changed files with 1785 additions and 544 deletions

38
.env.example Normal file
View File

@@ -0,0 +1,38 @@
# Example .env configuration file for spark-vllm-docker
# Copy this file to .env and customize for your environment
# Cluster configuration
# CLUSTER_NODES: Comma-separated list of node IPs (first node is the head node)
CLUSTER_NODES="192.168.177.11,192.168.177.12"
# ETH_IF: Ethernet interface name (optional, auto-detected if not specified)
ETH_IF="enp1s0f1np1"
# IB_IF: InfiniBand interface name (optional, auto-detected if not specified)
IB_IF="rocep1s0f1,roceP2p1s0f1"
# LOCAL_IP: Local IP address (optional, auto-detected if not specified)
# Useful for solo mode or overriding auto-detection
LOCAL_IP="192.168.177.11"
# MASTER_PORT: Port for cluster coordination (default: 29501)
MASTER_PORT="29501"
# CONTAINER_NAME: Container name (default: vllm_node)
# Note: This is a configuration variable, NOT passed as env var to container
CONTAINER_NAME="vllm_node"
# Container environment variables
# Any variable starting with CONTAINER_ (except CONTAINER_NAME) will be converted to -e flags
# Example: CONTAINER_NCCL_DEBUG=INFO becomes -e NCCL_DEBUG=INFO
CONTAINER_NCCL_DEBUG="INFO"
CONTAINER_HF_TOKEN="your_huggingface_token_here"
CONTAINER_NCCL_IGNORE_CPU_AFFINITY="1"
# COPY_HOSTS: Comma-separated list of hosts for build-and-copy.sh (optional)
# Used by build-and-copy.sh to distribute images across cluster
COPY_HOSTS="192.168.177.12"
# Additional container environment variables
# CONTAINER_MAX_JOBS="16"
# CONTAINER_CUDA_VISIBLE_DEVICES="0,1"

View File

@@ -14,6 +14,8 @@ ENV MAX_JOBS=${BUILD_JOBS}
ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS} ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
ENV NINJAFLAGS="-j${BUILD_JOBS}" ENV NINJAFLAGS="-j${BUILD_JOBS}"
ENV MAKEFLAGS="-j${BUILD_JOBS}" ENV MAKEFLAGS="-j${BUILD_JOBS}"
ENV DG_JIT_USE_NVRTC=1
ENV USE_CUDNN=1
# Set non-interactive frontend to prevent apt prompts # Set non-interactive frontend to prevent apt prompts
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
@@ -38,8 +40,8 @@ RUN apt update && \
curl vim cmake build-essential ninja-build \ curl vim cmake build-essential ninja-build \
libcudnn9-cuda-13 libcudnn9-dev-cuda-13 \ libcudnn9-cuda-13 libcudnn9-dev-cuda-13 \
python3-dev python3-pip git wget \ python3-dev python3-pip git wget \
libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \ libibverbs1 libibverbs-dev rdma-core \
ccache \ ccache devscripts debhelper fakeroot \
&& rm -rf /var/lib/apt/lists/* \ && rm -rf /var/lib/apt/lists/* \
&& pip install uv && pip install uv
@@ -59,14 +61,19 @@ ENV CCACHE_COMPRESS=1
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache
# Setup Workspace
WORKDIR $VLLM_BASE_DIR
# 2. Set Environment Variables # 2. Set Environment Variables
ARG TORCH_CUDA_ARCH_LIST="12.1a" ARG TORCH_CUDA_ARCH_LIST="12.1a"
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
# Setup Workspace
WORKDIR $VLLM_BASE_DIR
# Build NCCL with mesh support (TODO: only do it if arch is 12.1) - artifacts will be in /workspace/nccl/build/pkg/deb
# Builds a custom NCCL from the zyang-dev "dgxspark-3node-ring" branch for
# sm_121 (GB10) only, packages it as Debian .debs, and installs them into this
# build stage. The .debs under build/pkg/deb are later bind-mounted into the
# runtime stage and installed there as well.
# NOTE(review): the clone tracks a branch tip, not a pinned commit — the build
# is not reproducible across time; consider pinning a SHA. TODO confirm.
RUN git clone -b dgxspark-3node-ring https://github.com/zyang-dev/nccl.git && \
cd nccl && make -j ${BUILD_JOBS} src.build NVCC_GENCODE="-gencode=arch=compute_121,code=sm_121" && \
make pkg.debian.build && apt install -y --no-install-recommends --allow-downgrades ./build/pkg/deb/*.deb
# ========================================================= # =========================================================
# STAGE 2: FlashInfer Builder # STAGE 2: FlashInfer Builder
# ========================================================= # =========================================================
@@ -105,6 +112,26 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
WORKDIR /workspace/flashinfer WORKDIR /workspace/flashinfer
# Optional space-separated list of FlashInfer PR numbers to apply before build
# (set via --apply-flashinfer-pr in build-and-copy.sh).
ARG FLASHINFER_PRS=""
# Fetch each PR diff to a file first, then apply. The previous
# `curl | git apply` pipe ran under /bin/sh -c with no pipefail: a failed
# download fed git-apply empty input, which succeeds — silently skipping the
# PR. Downloading to a file and chaining with `&&` (plus `|| exit 1` inside
# the loop, whose intermediate failures would otherwise be swallowed) makes
# any fetch/apply failure abort the build.
RUN if [ -n "$FLASHINFER_PRS" ]; then \
        echo "Applying PRs: $FLASHINFER_PRS"; \
        for pr in $FLASHINFER_PRS; do \
            echo "Fetching and applying PR #$pr..."; \
            curl -fsL "https://github.com/flashinfer-ai/flashinfer/pull/${pr}.diff" -o "pr${pr}.diff" && \
            git apply -v "pr${pr}.diff" && \
            rm "pr${pr}.diff" || exit 1; \
        done; \
    fi
# TEMPORARY patch for NVFP4 crash (PR 2913)
# Download the PR diff, apply it only if not already present in the tree
# (`git apply --reverse --check` succeeding means the patch is applied),
# and always remove the diff file afterwards.
RUN curl -fsL https://github.com/flashinfer-ai/flashinfer/pull/2913.diff -o pr2913.diff \
    && if ! git apply --reverse --check pr2913.diff 2>/dev/null; then \
           echo "Applying FI PR #2913..."; \
           git apply -v pr2913.diff; \
       else \
           echo "PR #2913 already applied, skipping."; \
       fi \
    && rm pr2913.diff
# Apply patch to avoid re-downloading existing cubins # Apply patch to avoid re-downloading existing cubins
COPY flashinfer_cache.patch . COPY flashinfer_cache.patch .
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
@@ -177,6 +204,16 @@ RUN if [ -n "$VLLM_PRS" ]; then \
done; \ done; \
fi fi
# TEMPORARY PATCH for broken compilation
# RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38423.diff -o pr38423.diff \
# && if git apply --reverse --check pr38423.diff 2>/dev/null; then \
# echo "Patch already applied, skipping."; \
# else \
# echo "Applying patch..."; \
# git apply -v pr38423.diff; \
# fi \
# && rm pr38423.diff
# Prepare build requirements # Prepare build requirements
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
python3 use_existing_torch.py && \ python3 use_existing_torch.py && \
@@ -194,17 +231,8 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
# patch -p1 < fastsafetensors.patch; \ # patch -p1 < fastsafetensors.patch; \
# fi # fi
# TEMPORARY PATCH for broken vLLM build (unguarded Hopper code) - reverting PR #34758 and #34302 # TEMPORARY PATCH for broken vLLM build (unguarded Hopper code) - reverting PR #34758 and #34302
RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34758.diff | patch -p1 -R || echo "Cannot revert PR #34758, skipping" # RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34758.diff | patch -p1 -R || echo "Cannot revert PR #34758, skipping"
RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34302.diff | patch -p1 -R || echo "Cannot revert PR #34302, skipping" # RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34302.diff | patch -p1 -R || echo "Cannot revert PR #34302, skipping"
# TEMPORARY PATCH for broken NVFP4 quants
RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38126.diff -o pr38126.diff \
&& if git apply --reverse --check pr38126.diff 2>/dev/null; then \
echo "Patch already applied, skipping."; \
else \
echo "Applying patch..."; \
git apply -v pr38126.diff; \
fi \
&& rm pr38126.diff
# Final Compilation # Final Compilation
RUN --mount=type=cache,id=ccache,target=/root/.ccache \ RUN --mount=type=cache,id=ccache,target=/root/.ccache \
@@ -231,6 +259,8 @@ ENV MAX_JOBS=${BUILD_JOBS}
ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS} ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
ENV NINJAFLAGS="-j${BUILD_JOBS}" ENV NINJAFLAGS="-j${BUILD_JOBS}"
ENV MAKEFLAGS="-j${BUILD_JOBS}" ENV MAKEFLAGS="-j${BUILD_JOBS}"
ENV DG_JIT_USE_NVRTC=1
ENV USE_CUDNN=1
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ENV PIP_BREAK_SYSTEM_PACKAGES=1 ENV PIP_BREAK_SYSTEM_PACKAGES=1
@@ -243,13 +273,16 @@ ENV UV_SYSTEM_PYTHON=1
ENV UV_BREAK_SYSTEM_PACKAGES=1 ENV UV_BREAK_SYSTEM_PACKAGES=1
ENV UV_LINK_MODE=copy ENV UV_LINK_MODE=copy
# Mount additional packages from base builder image
# Install runtime dependencies # Install runtime dependencies
RUN apt update && \ RUN --mount=type=bind,from=base,source=/workspace/vllm/nccl/build/pkg/deb,target=/workspace/nccl-pkg \
apt update && \
apt install -y --no-install-recommends \ apt install -y --no-install-recommends \
python3 python3-pip python3-dev vim curl git wget \ python3 python3-pip python3-dev vim curl git wget \
libcudnn9-cuda-13 \ libcudnn9-cuda-13 \
libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \ libibverbs1 libibverbs-dev rdma-core \
libxcb1 \ libxcb1 \
&& cd /workspace/nccl-pkg && apt install -y --no-install-recommends --allow-downgrades ./*.deb \
&& rm -rf /var/lib/apt/lists/* \ && rm -rf /var/lib/apt/lists/* \
&& pip install uv && pip install uv
@@ -293,5 +326,9 @@ ENV PATH=$VLLM_BASE_DIR:$PATH
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv pip install ray[default] fastsafetensors uv pip install ray[default] fastsafetensors
# Fix NCCL: make the pip-installed nvidia-nccl wheel resolve to the system
# libnccl (the custom build installed from the base stage's .debs above).
# `ln -sfn` replaces the wheel's library (file or stale symlink) in one
# idempotent step — unlike the previous `rm` (no -f) + `ln -s`, which failed
# the whole build if the wheel path was ever absent or already a link.
RUN ln -sfn /usr/lib/aarch64-linux-gnu/libnccl.so.2 \
    /usr/local/lib/python3.12/dist-packages/nvidia/nccl/lib/libnccl.so.2
# Build metadata (generated by build-and-copy.sh) # Build metadata (generated by build-and-copy.sh)
COPY build-metadata.yaml /workspace/build-metadata.yaml COPY build-metadata.yaml /workspace/build-metadata.yaml

239
README.md
View File

@@ -2,6 +2,7 @@
# vLLM Docker Optimized for DGX Spark (single or multi-node) # vLLM Docker Optimized for DGX Spark (single or multi-node)
This repository contains the Docker configuration and startup scripts to run a multi-node vLLM inference cluster using Ray. It supports InfiniBand/RDMA (NCCL) and custom environment configuration for high-performance setups. This repository contains the Docker configuration and startup scripts to run a multi-node vLLM inference cluster using Ray. It supports InfiniBand/RDMA (NCCL) and custom environment configuration for high-performance setups.
Cluster setup supports direct connect between dual Sparks, connecting via QSFP/RoCE switch and 3-node mesh configuration.
While it was primarily developed to support multi-node inference, it works just as well on single-node setups. While it was primarily developed to support multi-node inference, it works just as well on single-node setups.
@@ -31,6 +32,8 @@ We will expand the selection of models we test in the pipeline, but since vLLM i
If you want to build the latest from main branch, you can specify `--rebuild-vllm` flag. Or you can target a specific vLLM release by setting `--vllm-ref` parameter. If you want to build the latest from main branch, you can specify `--rebuild-vllm` flag. Or you can target a specific vLLM release by setting `--vllm-ref` parameter.
Similarly, `--rebuild-flashinfer`, `--flashinfer-ref`, and `--apply-flashinfer-pr` control the FlashInfer build in the same way.
## QUICK START ## QUICK START
### Build ### Build
@@ -52,8 +55,8 @@ Build the container.
**On DGX Spark cluster:** **On DGX Spark cluster:**
Make sure you connect your Sparks together and enable passwordless SSH as described in NVidia's [Connect Two Sparks Playbook](https://build.nvidia.com/spark/connect-two-sparks/stacked-sparks). Make sure you connect your Sparks together and enable passwordless SSH as described in our [Networking Guide](docs/NETWORKING.md). You can also check out NVidia's [Connect Two Sparks Playbook](https://build.nvidia.com/spark/connect-two-sparks/stacked-sparks), but using our guide is the best way to get started.
You can also check out our new [Networking Guide](docs/NETWORKING.md). **NEW**: the guide now includes instructions on setting up 3-node Spark mesh!
Then run the following command that will build and distribute image across the cluster. Then run the following command that will build and distribute image across the cluster.
@@ -67,7 +70,7 @@ An initial build speed depends on your Internet connection speed and whether the
**On a single node**: **On a single node**:
**NEW** - `launch-cluster.sh` now supports solo mode, which is now a recommended way to run the container on a single Spark: `launch-cluster.sh` supports solo mode, which is now a recommended way to run the container on a single Spark:
```bash ```bash
./launch-cluster.sh --solo exec \ ./launch-cluster.sh --solo exec \
@@ -78,23 +81,6 @@ An initial build speed depends on your Internet connection speed and whether the
--load-format fastsafetensors --load-format fastsafetensors
``` ```
**To launch using regular `docker run`**
```bash
docker run \
--privileged \
--gpus all \
-it --rm \
--network host --ipc=host \
-v ~/.cache/huggingface:/root/.cache/huggingface \
vllm-node \
bash -c -i "vllm serve \
QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ \
--port 8000 --host 0.0.0.0 \
--gpu-memory-utilization 0.7 \
--load-format fastsafetensors"
```
**On a cluster** **On a cluster**
It's recommended to download the model on one node and distribute across the cluster using ConnectX interconnect prior to launching. This is to avoid re-downloading the model from the Internet on every node in the cluster. It's recommended to download the model on one node and distribute across the cluster using ConnectX interconnect prior to launching. This is to avoid re-downloading the model from the Internet on every node in the cluster.
@@ -127,8 +113,6 @@ This will run the model on all available cluster nodes.
**Also:** You can use any vLLM container that has "bash" as its default entrypoint with the launch script. It was tested with NGC vLLM, but can work with others too. To use such container in the cluster, you need to specify `--apply-mod use-ngc-vllm` argument to `./launch-cluster.sh`. However, it's recommended to build the container using this repository for best compatibility and most up-to-date features. **Also:** You can use any vLLM container that has "bash" as its default entrypoint with the launch script. It was tested with NGC vLLM, but can work with others too. To use such container in the cluster, you need to specify `--apply-mod use-ngc-vllm` argument to `./launch-cluster.sh`. However, it's recommended to build the container using this repository for best compatibility and most up-to-date features.
## CHANGELOG
**IMPORTANT** **IMPORTANT**
You may want to prune your build cache every once in a while, especially if you've been using these container builds since the beginning. You may want to prune your build cache every once in a while, especially if you've been using these container builds since the beginning.
@@ -149,6 +133,107 @@ Don't do it every time you rebuild, because it will slow down compilation times.
For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h` For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
## CHANGELOG
### 2026-03-31
#### Flags to specify Flashinfer ref and apply PRs
`build-and-copy.sh` gains two new flags that mirror the existing vLLM equivalents:
- `--flashinfer-ref <ref>` — build FlashInfer from a specific commit SHA, branch, or tag instead of `main`. Forces a local FlashInfer build (skips prebuilt wheel download).
- `--apply-flashinfer-pr <pr-num>` — fetch and apply a FlashInfer GitHub PR patch before building. Can be specified multiple times. Forces a local FlashInfer build.
Both flags are incompatible with `--exp-mxfp4`.
#### Default image tag in `build-and-copy.sh`
`build-and-copy.sh` now automatically sets a sensible default image tag when `-t` is not specified:
- `--tf5` / `--pre-tf` - tag defaults to `vllm-node-tf5`
- `--exp-mxfp4` - tag defaults to `vllm-node-mxfp4`
- in all other cases - tag defaults to `vllm-node` (no change)
An explicit `-t <tag>` always takes precedence.
#### Support for 3-node mesh setups
Added initial support for setups where 3 Sparks are connected in a ring-like mesh without an additional switch.
See [Networking Guide](docs/NETWORKING.md) for instructions on how to connect and set up networking in such cluster.
Autodiscover function in both `launch-cluster.sh` and `run-recipe.sh` now can detect mesh setups and configure parameters accordingly.
You can try running a model on all 3 nodes in pipeline-parallel configuration using the following recipe:
```bash
./run-recipe.sh --discover # force mesh discovery
./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml --setup --no-ray --force-build # you can drop --setup and --force-build on subsequent calls
```
Please note that `--tensor-parallel-size 3` or `-tp 3` is not supported by any commonly used model, so the only two viable options to utilize all three nodes for a single model are:
- `--pipeline-parallel 3` will let you run a model that can't fit on dual Sparks, but without additional speed improvements (total throughput may improve though).
- `--data-parallel 3` (possibly with `--enable-expert-parallel`) will let you run a model that can fit on a single Spark, but allow for better concurrency.
You can also run models with `--tensor-parallel 2` in a 3-node configuration - in this case only first two nodes (from autodiscovery/.env or from the CLI parameters) will be utilized.
#### GB10 Verification During Node Discovery
Node discovery now confirms each SSH-reachable peer is a GB10 system before adding it to the cluster:
Only hosts reporting `NVIDIA GB10` are included. This prevents accidentally adding non-Spark machines that happen to be on the same subnet.
#### Separate COPY_HOSTS Discovery
Autodiscover now determines the host list used for image and model distribution separately from `CLUSTER_NODES`:
- **Non-mesh**: `COPY_HOSTS` mirrors `CLUSTER_NODES` (no change in behaviour).
- **Mesh**: scans the direct IB-attached `enp1s0f0np0` and `enp1s0f1np1` interfaces (not the OOB ETH interface), so large file transfers use the faster direct InfiniBand path.
`COPY_HOSTS` is saved to `.env` and respected by `build-and-copy.sh`, `hf-download.sh`, and `run-recipe.py`.
#### Interactive Configuration Save in `autodiscover.sh`
`autodiscover.sh` now handles `.env` creation with a guided interactive flow, replacing the previous logic in `run-recipe.py`:
- Runs automatically when `.env` is absent.
- Asks per-node confirmation for both `CLUSTER_NODES` and `COPY_HOSTS`.
- Skips if `.env` already exists (use `--setup` to force).
`run-recipe.py` no longer contains its own `.env`-save prompt — it delegates entirely to `autodiscover.sh`.
#### `--setup` Flag in `launch-cluster.sh` and `build-and-copy.sh`
Both scripts now accept `--setup` to force a full autodiscovery run and overwrite the existing `.env`:
```bash
./launch-cluster.sh --setup exec vllm serve ...
./build-and-copy.sh --setup -c
```
This is equivalent to the existing `--setup` in `run-recipe.sh`.
#### `--config` Flag
`hf-download.sh`, `build-and-copy.sh` and `launch-cluster.sh` now accept `--config <file>` to load a custom `.env` configuration file. `COPY_HOSTS` from the config is used for model distribution:
```bash
./hf-download.sh QuantTrio/MiniMax-M2-AWQ --config /path/to/cluster.env -c --copy-parallel
```
#### Parallelism-Aware Node Trimming
`launch-cluster.sh` now parses `-tp` / `--tensor-parallel-size`, `-pp` / `--pipeline-parallel-size`, and `-dp` / `--data-parallel-size` from the exec command or launch script and adjusts the active node count accordingly — for both Ray and no-Ray modes.
- If **fewer nodes are needed** than configured, only the required nodes get containers started (excess nodes are left idle).
- If **more nodes are needed** than available, an error is raised before anything starts.
```
Note: Command requires 2 node(s) (tp=2 * pp=1 * dp=1); using 2 of 3 configured node(s).
Error: Command requires 4 nodes (tp=4 * pp=1 * dp=1) but only 3 node(s) are configured.
```
No flags required — the check is automatic whenever parallelism arguments are present in the command.
### 2026-03-18 ### 2026-03-18
#### `--master-port` / `--head-port` Parameter #### `--master-port` / `--head-port` Parameter
@@ -591,7 +676,8 @@ See (this post on NVIDIA forums)[https://forums.developer.nvidia.com/t/make-glm-
To use the mod, first build the container with Transformers 5 support (`--pre-tf`) flag, e.g.: To use the mod, first build the container with Transformers 5 support (`--pre-tf`) flag, e.g.:
```bash ```bash
./build-and-copy.sh -t vllm-node-tf5 --pre-tf -c # Image tag defaults to vllm-node-tf5 when --tf5/--pre-tf is used
./build-and-copy.sh --pre-tf -c
``` ```
Then, to run on a single node: Then, to run on a single node:
@@ -641,7 +727,8 @@ It is currently the fastest way to run GPT-OSS on DGX Spark, achieving 60 t/s on
To use this build, first build the container with `--exp-mxfp4` flag. I recommend using a separate label as it is currently not recommended to use this build for models other than gpt-oss: To use this build, first build the container with `--exp-mxfp4` flag. I recommend using a separate label as it is currently not recommended to use this build for models other than gpt-oss:
```bash ```bash
./build-and-copy.sh -t vllm-node-mxfp4 --exp-mxfp4 -c # Image tag defaults to vllm-node-mxfp4 when --exp-mxfp4 is used
./build-and-copy.sh --exp-mxfp4 -c
``` ```
Then, to run on a single Spark: Then, to run on a single Spark:
@@ -885,12 +972,14 @@ Using a different username:
| Flag | Description | | Flag | Description |
| :--- | :--- | | :--- | :--- |
| `-t, --tag <tag>` | Image tag (default: `vllm-node`) | | `-t, --tag <tag>` | Image tag (default: `vllm-node`; auto-set to `vllm-node-tf5` with `--tf5`, `vllm-node-mxfp4` with `--exp-mxfp4`) |
| `--gpu-arch <arch>` | Target GPU architecture (default: `12.1a`) | | `--gpu-arch <arch>` | Target GPU architecture (default: `12.1a`) |
| `--rebuild-flashinfer` | Skip prebuilt wheel download; force a fresh local FlashInfer build | | `--rebuild-flashinfer` | Skip prebuilt wheel download; force a fresh local FlashInfer build |
| `--rebuild-vllm` | Force rebuild vLLM from source | | `--rebuild-vllm` | Force rebuild vLLM from source |
| `--vllm-ref <ref>` | vLLM commit SHA, branch or tag (default: `main`) | | `--vllm-ref <ref>` | vLLM commit SHA, branch or tag (default: `main`) |
| `--flashinfer-ref <ref>` | FlashInfer commit SHA, branch or tag (default: `main`) |
| `--apply-vllm-pr <pr-num>` | Apply a vLLM PR patch during build. Can be specified multiple times. | | `--apply-vllm-pr <pr-num>` | Apply a vLLM PR patch during build. Can be specified multiple times. |
| `--apply-flashinfer-pr <pr-num>` | Apply a FlashInfer PR patch during build. Can be specified multiple times. |
| `--tf5` | Install transformers v5 (5.0.0 or higher). Aliases: `--pre-tf, --pre-transformers`. | | `--tf5` | Install transformers v5 (5.0.0 or higher). Aliases: `--pre-tf, --pre-transformers`. |
| `--exp-mxfp4` | Build with experimental native MXFP4 support. Alias: `--experimental-mxfp4`. | | `--exp-mxfp4` | Build with experimental native MXFP4 support. Alias: `--experimental-mxfp4`. |
| `-c, --copy-to <hosts>` | Host(s) to copy the image to after building (space- or comma-separated). | | `-c, --copy-to <hosts>` | Host(s) to copy the image to after building (space- or comma-separated). |
@@ -900,9 +989,13 @@ Using a different username:
| `-u, --user <user>` | Username for SSH connection (default: current user) | | `-u, --user <user>` | Username for SSH connection (default: current user) |
| `--full-log` | Enable full Docker build output (`--progress=plain`) | | `--full-log` | Enable full Docker build output (`--progress=plain`) |
| `--no-build` | Skip building, only copy existing image (requires `--copy-to`) | | `--no-build` | Skip building, only copy existing image (requires `--copy-to`) |
| `--network <name>` | Docker network to use during build (e.g. `host`). |
| `--cleanup` | Remove all cached `.whl` and `*-commit` files from the `wheels/` directory. |
| `--config <file>` | Path to `.env` configuration file (default: `.env` in script directory) |
| `--setup` | Force autodiscovery and save configuration to `.env` (even if `.env` already exists) |
| `-h, --help` | Show help message | | `-h, --help` | Show help message |
**IMPORTANT**: When copying to another node, make sure you use the Spark IP assigned to its ConnectX 7 interface (enp1s0f1np1), and not the 10G interface (enP7s7)! If you omit the IP address and use `-c` without addresses, it will use autodiscovery to detect a proper IP address. **IMPORTANT**: When copying to another node manually, use the IP assigned to a ConnectX 7 interface (`enp1s0f*`), not the 10G/wireless interfaces. When using `-c` without addresses, autodiscovery selects the correct interface automatically — in mesh mode it uses the direct IB-attached interfaces (`enp1s0f0np0`, `enp1s0f1np1`) for maximum transfer speed.
### Copying the container to another Spark node (Manual Method) ### Copying the container to another Spark node (Manual Method)
@@ -971,9 +1064,12 @@ Assumptions and limitations:
### Auto-Detection ### Auto-Detection
The script attempts to automatically detect: The script attempts to automatically detect:
* **Ethernet Interface:** The interface associated with the active InfiniBand device that has an IP address. * **Ethernet Interface (`ETH_IF`):** Determined by the number of active CX7 interfaces:
* **InfiniBand Interface:** The active InfiniBand devices. By default both active RoCE interfaces that correspond to active IB port(s) will be utilized. - **2 active** (standard): the `enp*` interface (no capital P) that has an IP address.
* **Node Role:** Based on the detected IP address and the list of nodes (defaults to `192.168.177.11` as head and `192.168.177.12` as worker). - **4 active** (mesh topology): `enP7s7` (preferred) or `wlP9s9` (wireless, shown with a warning) — the cluster coordination interface is separate from the CX7 ports in this configuration.
* **InfiniBand Interface (`IB_IF`):** All active RoCE devices. In mesh mode this is always `rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1`.
* **Cluster peers:** Discovered by scanning the `ETH_IF` subnet for hosts with SSH access **and** a GB10 GPU (`nvidia-smi --query-gpu=name` must return `NVIDIA GB10`).
* **Copy hosts (`COPY_HOSTS`):** In standard mode, same as cluster peers. In mesh mode, scanned separately on `enp1s0f0np0` and `enp1s0f1np1` subnets so that image/model transfers use the direct InfiniBand path.
### Manual Overrides ### Manual Overrides
@@ -1006,6 +1102,10 @@ You can override the auto-detected values if needed:
| `--mem-swap-limit-gb` | Memory+swap limit in GB (default: mem-limit + 10, only with `--non-privileged`). | | `--mem-swap-limit-gb` | Memory+swap limit in GB (default: mem-limit + 10, only with `--non-privileged`). |
| `--pids-limit` | Process limit (default: 4096, only with `--non-privileged`). | | `--pids-limit` | Process limit (default: 4096, only with `--non-privileged`). |
| `--shm-size-gb` | Shared memory size in GB (default: 64, only with `--non-privileged`). | | `--shm-size-gb` | Shared memory size in GB (default: 64, only with `--non-privileged`). |
| `--config <file>` | Path to `.env` configuration file (default: `.env` in script directory). |
| `--setup` | Force autodiscovery and save configuration to `.env` (even if `.env` already exists). |
| `start \| stop \| status \| exec` | Action to perform (default: `start`). Not compatible with `--launch-script`. |
| `command` | Command to execute inside the container (only for `exec` action). |
### Non-Privileged Mode ### Non-Privileged Mode
@@ -1149,6 +1249,61 @@ You need to make sure you allocate IP addresses to them (no need to allocate IP
## 5\. Configuration Details ## 5\. Configuration Details
### Cluster Configuration (`.env` file)
The scripts share a `.env` file (default: `.env` in the repo directory) for persistent cluster configuration. It is created automatically by autodiscovery — run `--discover` (via `run-recipe.sh`) or `--setup` (via `launch-cluster.sh` / `build-and-copy.sh`) on first use.
**Supported variables:**
| Variable | Description |
| :--- | :--- |
| `CLUSTER_NODES` | Comma-separated node IPs used for Ray/vLLM cluster (head node first). |
| `COPY_HOSTS` | Comma-separated node IPs used for image and model distribution. In mesh mode these are the IPs on the direct IB-attached interfaces, which may differ from `CLUSTER_NODES`. |
| `LOCAL_IP` | IP address of the local node. |
| `ETH_IF` | Ethernet interface for cluster coordination (e.g. `enp1s0f1np1` or `enP7s7`). |
| `IB_IF` | Comma-separated RoCE/IB device names (e.g. `rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1`). |
| `CONTAINER_*` | Any variable prefixed with `CONTAINER_` (except `CONTAINER_NAME`) is passed as `-e VAR=VALUE` to the container. Example: `CONTAINER_NCCL_DEBUG=INFO` → `-e NCCL_DEBUG=INFO`. |
**Mesh-mode NCCL variables** (written automatically when mesh topology is detected):
```
CONTAINER_NCCL_NET_PLUGIN=none
CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1
CONTAINER_NCCL_IB_MERGE_NICS=0
```
**Example `.env` for a standard 2-node cluster:**
```
CLUSTER_NODES=192.168.177.11,192.168.177.12
COPY_HOSTS=192.168.177.12
LOCAL_IP=192.168.177.11
ETH_IF=enp1s0f1np1
IB_IF=rocep1s0f1,roceP2p1s0f1
```
To use a custom config file path, pass `--config /path/to/file.env` to any script.
### Autodiscovery Workflow
On first run, if no `.env` is present, the scripts will automatically trigger autodiscovery. You can also run it explicitly:
```bash
# Via run-recipe.sh
./run-recipe.sh --discover
# Via launch-cluster.sh or build-and-copy.sh (force re-run even if .env exists)
./launch-cluster.sh --setup exec vllm serve ...
./build-and-copy.sh --setup -c
```
Autodiscovery:
1. Detects active CX7 interfaces and determines mesh vs. standard topology.
2. Scans the network for SSH-reachable GB10 peers.
3. In mesh mode, separately discovers `COPY_HOSTS` on direct IB-attached interfaces.
4. Prompts for per-node confirmation for both `CLUSTER_NODES` and `COPY_HOSTS`.
5. Saves the result to `.env`.
### Environment Persistence ### Environment Persistence
The script automatically appends exported variables to `~/.bashrc`. If you need to open a second terminal into the running container for debugging, simply run: The script automatically appends exported variables to `~/.bashrc`. If you need to open a second terminal into the running container for debugging, simply run:
@@ -1322,6 +1477,32 @@ The `hf-download.sh` script provides a convenient way to download models from Hu
./hf-download.sh -c --copy-parallel QuantTrio/MiniMax-M2-AWQ ./hf-download.sh -c --copy-parallel QuantTrio/MiniMax-M2-AWQ
``` ```
**Use nodes from `.env` (respects `COPY_HOSTS`):**
```bash
./hf-download.sh -c QuantTrio/MiniMax-M2-AWQ
```
When `-c` is given without explicit hosts, the script checks `COPY_HOSTS` in `.env` first, then falls back to autodiscovery. In mesh mode this means transfers go over the direct IB-attached interfaces automatically.
**Use a custom config file:**
```bash
./hf-download.sh --config /path/to/cluster.env -c QuantTrio/MiniMax-M2-AWQ
```
**Available options:**
| Flag | Description |
| :--- | :--- |
| `<model-name>` | HuggingFace model ID (e.g. `QuantTrio/MiniMax-M2-AWQ`). Required. |
| `-c, --copy-to <hosts>` | Host(s) to copy the model to after download (space- or comma-separated). Omit hosts to use `COPY_HOSTS` from `.env` or autodiscovery. |
| `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). |
| `--copy-parallel` | Copy to all hosts concurrently instead of serially. |
| `-u, --user <user>` | SSH username for remote copies (default: current user). |
| `--config <file>` | Path to `.env` configuration file (default: `.env` in script directory). |
| `-h, --help` | Show help message. |
### Hardware Architecture ### Hardware Architecture
**Note:** This project targets `12.1a` architecture (NVIDIA GB10 / DGX Spark). If you are using different hardware, you can use `--gpu-arch` flag in `./build-and-copy.sh`. **Note:** This project targets `12.1a` architecture (NVIDIA GB10 / DGX Spark). If you are using different hardware, you can use `--gpu-arch` flag in `./build-and-copy.sh`.

View File

@@ -1,5 +1,57 @@
#!/bin/bash #!/bin/bash
SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
# Load .env file if exists (for shared configuration)
# This is called early so that DOTENV_* variables are available to all functions.
#
# Reads $CONFIG_FILE (or $SCRIPT_DIR/.env when unset) and exports every
# KEY=VALUE pair as DOTENV_KEY=VALUE.  Exits with an error if the user
# explicitly requested a config file that does not exist (unless --setup /
# FORCE_DISCOVER will create it).
load_env_if_exists() {
    local env_file="${CONFIG_FILE:-}"
    local config_explicit="${CONFIG_FILE_SET:-false}"

    # If CONFIG_FILE is not set, check default location
    if [[ -z "$env_file" ]]; then
        env_file="$SCRIPT_DIR/.env"
        config_explicit="false"
    fi

    # Validate config file exists if explicitly specified
    # Exception: if --setup is also specified, the file will be created by the setup procedure
    if [[ "$config_explicit" == "true" ]] && [[ ! -f "$env_file" ]] && [[ "${FORCE_DISCOVER:-false}" != "true" ]]; then
        echo "Error: Config file not found: $env_file"
        exit 1
    fi

    if [[ -f "$env_file" ]]; then
        # Load .env variables with DOTENV_ prefix.
        # The '|| [[ -n "$key" ]]' clause processes a final line that lacks
        # a trailing newline.
        while IFS='=' read -r key value || [[ -n "$key" ]]; do
            # Skip comments and empty lines
            [[ "$key" =~ ^[[:space:]]*# ]] && continue
            [[ -z "$key" ]] && continue

            # Remove leading/trailing whitespace from key
            key=$(echo "$key" | xargs)
            [[ -z "$key" ]] && continue

            # Skip keys that are not valid shell identifiers; 'export' would
            # otherwise fail with an error on malformed lines.
            [[ "$key" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || continue

            # Trim surrounding whitespace from the value
            value="${value#"${value%%[![:space:]]*}"}"
            value="${value%"${value##*[![:space:]]}"}"

            # Strip one pair of *matching* surrounding quotes only, so an
            # unbalanced quote (e.g. FOO="bar) is preserved verbatim instead
            # of being silently mangled.
            if [[ "$value" == \"*\" && "${#value}" -ge 2 ]]; then
                value="${value#\"}"
                value="${value%\"}"
            elif [[ "$value" == \'*\' && "${#value}" -ge 2 ]]; then
                value="${value#\'}"
                value="${value%\'}"
            fi

            # Export with DOTENV_ prefix
            export "DOTENV_$key=$value"
        done < "$env_file"
    fi
}
# Load .env file
load_env_if_exists
# Mesh mode flag (set by detect_interfaces)
MESH_MODE="false"
# Function to detect IB and Ethernet interfaces # Function to detect IB and Ethernet interfaces
detect_interfaces() { detect_interfaces() {
# If both interfaces are already set, nothing to do # If both interfaces are already set, nothing to do
@@ -25,49 +77,121 @@ detect_interfaces() {
fi fi
DETECTED_IB_IFS=() DETECTED_IB_IFS=()
CANDIDATE_ETH_IFS=() ALL_NET_IFS=()
for pair in "${IB_NET_PAIRS[@]}"; do for pair in "${IB_NET_PAIRS[@]}"; do
ib_dev=$(echo "$pair" | awk '{print $1}') ib_dev=$(echo "$pair" | awk '{print $1}')
net_dev=$(echo "$pair" | awk '{print $2}') net_dev=$(echo "$pair" | awk '{print $2}')
DETECTED_IB_IFS+=("$ib_dev") DETECTED_IB_IFS+=("$ib_dev")
ALL_NET_IFS+=("$net_dev")
done
# Check if interface has an IP address local num_up="${#IB_NET_PAIRS[@]}"
if ip addr show "$net_dev" | grep -q "inet "; then
CANDIDATE_ETH_IFS+=("$net_dev") # --- Sanity checks ---
# 1. enp* (no capital P) interfaces MUST have an IP
for net_dev in "${ALL_NET_IFS[@]}"; do
if [[ "$net_dev" =~ ^enp[^P] ]] || [[ "$net_dev" == enp* && "$net_dev" != *P* ]]; then
if ! ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
echo "Error: Interface $net_dev (enp*, no capital P) is Up but has no IP address assigned."
return 1
fi
fi fi
done done
# 2. No two interfaces with IPs should share the same subnet
declare -A SEEN_SUBNETS
for net_dev in "${ALL_NET_IFS[@]}"; do
local cidr
cidr=$(ip -o -f inet addr show "$net_dev" 2>/dev/null | awk '{print $4}' | head -n1)
[[ -z "$cidr" ]] && continue
# Compute network address using python3
local net_addr
net_addr=$(python3 -c "import ipaddress; print(str(ipaddress.ip_network('$cidr', strict=False)))" 2>/dev/null)
if [[ -n "${SEEN_SUBNETS[$net_addr]}" ]]; then
echo "Error: Interfaces $net_dev and ${SEEN_SUBNETS[$net_addr]} share the same subnet ($net_addr). Check network configuration."
return 1
fi
SEEN_SUBNETS["$net_addr"]="$net_dev"
done
# --- Mode selection ---
if [[ "$num_up" -eq 2 ]]; then
# Non-mesh configuration
MESH_MODE="false"
echo " Non-mesh mode: 2 CX7 interfaces active."
# Set IB_IF if not provided # Set IB_IF if not provided
if [[ -z "$IB_IF" ]]; then if [[ -z "$IB_IF" ]]; then
IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}") IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}")
echo " Detected IB_IF: $IB_IF" echo " Detected IB_IF: $IB_IF"
fi fi
# Set ETH_IF if not provided # Set ETH_IF if not provided: prefer interface without capital 'P'
if [[ -z "$ETH_IF" ]]; then if [[ -z "$ETH_IF" ]]; then
if [ ${#CANDIDATE_ETH_IFS[@]} -eq 0 ]; then local selected_eth=""
echo "Error: No active IB-associated interfaces have IP addresses." for net_dev in "${ALL_NET_IFS[@]}"; do
return 1 if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
if [[ "$net_dev" != *P* ]]; then
selected_eth="$net_dev"
break
fi fi
fi
# Selection logic: Prefer interface without capital 'P' done
SELECTED_ETH="" # Fallback: first interface with an IP
for iface in "${CANDIDATE_ETH_IFS[@]}"; do if [[ -z "$selected_eth" ]]; then
if [[ "$iface" != *"P"* ]]; then for net_dev in "${ALL_NET_IFS[@]}"; do
SELECTED_ETH="$iface" if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
selected_eth="$net_dev"
break break
fi fi
done done
fi
# Fallback: Use the first one if all have 'P' or none found yet if [[ -z "$selected_eth" ]]; then
if [[ -z "$SELECTED_ETH" ]]; then echo "Error: No active IB-associated interfaces have IP addresses."
SELECTED_ETH="${CANDIDATE_ETH_IFS[0]}" return 1
fi
ETH_IF="$selected_eth"
echo " Detected ETH_IF: $ETH_IF"
fi fi
ETH_IF="$SELECTED_ETH" elif [[ "$num_up" -eq 4 ]]; then
# Mesh configuration
MESH_MODE="true"
echo " Mesh mode: all 4 CX7 interfaces active."
# Set IB_IF to all four RoCE interfaces (hardcoded for mesh)
if [[ -z "$IB_IF" ]]; then
IB_IF="rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1"
echo " Detected IB_IF: $IB_IF"
fi
# Set ETH_IF: check enP7s7 first, then wlP9s9
if [[ -z "$ETH_IF" ]]; then
if ip addr show enP7s7 2>/dev/null | grep -q "inet "; then
ETH_IF="enP7s7"
echo " Detected ETH_IF: $ETH_IF" echo " Detected ETH_IF: $ETH_IF"
elif ip addr show wlP9s9 2>/dev/null | grep -q "inet "; then
ETH_IF="wlP9s9"
echo " Detected ETH_IF: $ETH_IF"
echo " Warning: using wireless interface (wlP9s9) for cluster coordination. Performance may be limited."
else
echo "Error: Mesh mode requires enP7s7 or wlP9s9 to be up with an IP address for cluster coordination."
return 1
fi
fi
# Export mesh NCCL settings directly so launch-cluster.sh picks them up
# even if the user declines to save config to .env
export DOTENV_CONTAINER_NCCL_NET_PLUGIN=none
export DOTENV_CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1
export DOTENV_CONTAINER_NCCL_IB_MERGE_NICS=0
else
echo "Error: Unexpected number of active CX7 interfaces ($num_up). Expected 2 (non-mesh) or 4 (mesh)."
return 1
fi fi
} }
@@ -94,6 +218,41 @@ detect_local_ip() {
echo " Detected Local IP: $LOCAL_IP ($CIDR)" echo " Detected Local IP: $LOCAL_IP ($CIDR)"
} }
# Scan a subnet for GB10-capable peers via SSH.
# Usage: _scan_subnet_for_gb10 <cidr> <local_ip_to_exclude> <output_file>
# Appends one IP per line to <output_file> for every host that answers on
# port 22 and reports an "NVIDIA GB10" GPU via nvidia-smi over SSH.
_scan_subnet_for_gb10() {
    local subnet="$1"
    local skip_ip="$2"
    local result_file="$3"

    if ! command -v python3 &> /dev/null; then
        echo "Error: python3 not found."
        return 1
    fi
    if ! command -v nc &> /dev/null; then
        echo "Error: nc (netcat) not found."
        return 1
    fi

    # Enumerate every host address in the subnet
    local candidates
    candidates=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$subnet")

    # Probe all candidates concurrently; each subshell performs a fast TCP
    # port check before attempting the (slower) SSH GPU query.
    local addr
    for addr in $candidates; do
        [[ "$addr" == "$skip_ip" ]] && continue
        (
            nc -z -w 1 "$addr" 22 &>/dev/null || exit 0
            # Only record hosts whose GPU identifies as a GB10 system
            if ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o BatchMode=yes "$addr" \
                "nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null" \
                2>/dev/null | grep -q "NVIDIA GB10"; then
                echo "$addr" >> "$result_file"
            fi
        ) &
    done
    wait
}
# Function to detect cluster nodes # Function to detect cluster nodes
detect_nodes() { detect_nodes() {
detect_local_ip || return 1 detect_local_ip || return 1
@@ -111,58 +270,182 @@ detect_nodes() {
return 0 return 0
fi fi
echo "Auto-detecting nodes..." # Try to use CLUSTER_NODES from .env
if [[ -n "$DOTENV_CLUSTER_NODES" ]]; then
if ! command -v nc &> /dev/null; then echo " Using CLUSTER_NODES from .env: $DOTENV_CLUSTER_NODES"
echo "Error: nc (netcat) not found. Please install netcat."
return 1
fi
if ! command -v python3 &> /dev/null; then
echo "Error: python3 not found. Please install python3."
return 1
fi
DETECTED_IPS=("$LOCAL_IP")
PEER_NODES=() PEER_NODES=()
IFS=',' read -ra ALL_NODES <<< "$DOTENV_CLUSTER_NODES"
echo " Scanning for SSH peers on $CIDR..." for node in "${ALL_NODES[@]}"; do
node=$(echo "$node" | xargs)
# Generate list of IPs using python [[ "$node" != "$LOCAL_IP" ]] && PEER_NODES+=("$node")
ALL_IPS=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$CIDR")
TEMP_IPS_FILE=$(mktemp)
# Scan in parallel
for ip in $ALL_IPS; do
# Skip own IP
if [[ "$ip" == "$LOCAL_IP" ]]; then continue; fi
(
# Check port 22 with 1 second timeout
if nc -z -w 1 "$ip" 22 &>/dev/null; then
echo "$ip" >> "$TEMP_IPS_FILE"
fi
) &
done done
NODES_ARG="$DOTENV_CLUSTER_NODES"
# Wait for all background scans to complete return 0
wait
# Read found IPs
if [[ -f "$TEMP_IPS_FILE" ]]; then
while read -r ip; do
DETECTED_IPS+=("$ip")
PEER_NODES+=("$ip")
echo " Found peer: $ip"
done < "$TEMP_IPS_FILE"
rm -f "$TEMP_IPS_FILE"
fi fi
# Sort IPs echo "Auto-detecting nodes on $CIDR (checking for NVIDIA GB10)..."
IFS=$'\n' SORTED_IPS=($(sort <<<"${DETECTED_IPS[*]}"))
unset IFS
local temp_file
temp_file=$(mktemp)
_scan_subnet_for_gb10 "$CIDR" "$LOCAL_IP" "$temp_file"
PEER_NODES=()
local detected_ips=("$LOCAL_IP")
if [[ -f "$temp_file" ]]; then
while read -r ip; do
PEER_NODES+=("$ip")
detected_ips+=("$ip")
echo " Found GB10 peer: $ip"
done < <(sort "$temp_file")
rm -f "$temp_file"
fi
# Sort and set NODES_ARG
IFS=$'\n' SORTED_IPS=($(sort <<<"${detected_ips[*]}"))
unset IFS
NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}") NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}")
echo " Cluster Nodes: $NODES_ARG" echo " Cluster Nodes: $NODES_ARG"
} }
# Function to detect COPY_HOSTS for build/model distribution.
# In non-mesh mode: COPY_PEER_NODES = PEER_NODES (same network).
# In mesh mode: scan enp* interfaces (direct IB-attached) for GB10 peers.
# Results are returned via the global COPY_PEER_NODES array.
detect_copy_hosts() {
    if [[ "$MESH_MODE" == "false" ]]; then
        COPY_PEER_NODES=("${PEER_NODES[@]}")
        return 0
    fi
    # Mesh mode: scan enp1s0f0np0 and enp1s0f1np1 subnets
    # NOTE(review): interface names are hardcoded for the DGX Spark mesh
    # topology -- confirm they match on all deployed nodes.
    echo "Auto-detecting COPY_HOSTS on direct IB interfaces (mesh mode)..."
    local temp_file
    temp_file=$(mktemp)
    for iface in enp1s0f0np0 enp1s0f1np1; do
        local cidr
        # First IPv4 CIDR on the interface; empty if the interface is down/unconfigured
        cidr=$(ip -o -f inet addr show "$iface" 2>/dev/null | awk '{print $4}' | head -n1)
        [[ -z "$cidr" ]] && continue
        local local_iface_ip="${cidr%/*}"
        echo " Scanning $iface ($cidr)..."
        _scan_subnet_for_gb10 "$cidr" "$local_iface_ip" "$temp_file"
    done
    # Deduplicate and collect results.
    # On two-cable setups two IB IPs may belong to the same host; deduplicate by
    # querying each host's ETH_IF IP as a canonical identity.
    COPY_PEER_NODES=()
    declare -A _SEEN_COPY # keyed by IB IP
    declare -A _SEEN_HOST # keyed by ETH_IF IP → first IB IP seen for that host
    if [[ -f "$temp_file" ]]; then
        while read -r ip; do
            [[ -n "${_SEEN_COPY[$ip]}" ]] && continue
            _SEEN_COPY["$ip"]=1
            # Resolve canonical host identity via ETH_IF IP.
            # '\$4' is escaped so awk's field reference expands remotely, not
            # locally; '</dev/null' keeps ssh from consuming the while-loop's
            # stdin (the sorted temp file).
            local host_ip
            host_ip=$(ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o BatchMode=yes "$ip" \
                "ip -o -f inet addr show $ETH_IF 2>/dev/null | awk '{print \$4}' | head -n1 | cut -d/ -f1" \
                </dev/null 2>/dev/null)
            if [[ -n "$host_ip" && -n "${_SEEN_HOST[$host_ip]}" ]]; then
                echo " Skipping $ip (same host as ${_SEEN_HOST[$host_ip]}, ETH_IF: $host_ip)"
                continue
            fi
            [[ -n "$host_ip" ]] && _SEEN_HOST["$host_ip"]="$ip"
            COPY_PEER_NODES+=("$ip")
            echo " Found GB10 copy host: $ip"
        done < <(sort "$temp_file")
        rm -f "$temp_file"
    fi
}
# Save discovered configuration to .env.
# Skips if .env already exists unless FORCE_DISCOVER=true.
# Interactive: prompts the user to confirm the save and to include/exclude
# each discovered node in CLUSTER_NODES and COPY_HOSTS.  All prompts default
# to "yes" (any answer other than n/no is treated as yes).
save_config() {
    local env_file="${CONFIG_FILE:-$SCRIPT_DIR/.env}"
    # Skip if .env exists and not forced
    if [[ -f "$env_file" && "${FORCE_DISCOVER:-false}" != "true" ]]; then
        return 0
    fi
    echo ""
    local save_prompt="Save discovered configuration to $env_file?"
    if [[ -f "$env_file" ]]; then
        save_prompt="Overwrite existing configuration in $env_file?"
    fi
    read -r -p "$save_prompt [Y/n]: " response
    # Lowercase the answer so N/No/NO all match
    response="${response,,}"
    if [[ "$response" =~ ^(n|no)$ ]]; then
        return 0
    fi
    # Build list of all cluster nodes (local + peers)
    local all_cluster_nodes=()
    if [[ -n "$LOCAL_IP" ]]; then
        all_cluster_nodes+=("$LOCAL_IP")
    fi
    for node in "${PEER_NODES[@]}"; do
        all_cluster_nodes+=("$node")
    done
    # Per-node confirmation for CLUSTER_NODES
    echo ""
    echo "Select nodes for CLUSTER_NODES:"
    local selected_cluster=()
    for node in "${all_cluster_nodes[@]}"; do
        local label="$node"
        [[ "$node" == "$LOCAL_IP" ]] && label="$node (this machine)"
        read -r -p " Include $label? [Y/n]: " r
        r="${r,,}"
        if [[ ! "$r" =~ ^(n|no)$ ]]; then
            selected_cluster+=("$node")
        fi
    done
    # A cluster needs at least one node; abort rather than write an empty list
    if [[ "${#selected_cluster[@]}" -eq 0 ]]; then
        echo "No nodes selected. Aborting save."
        return 1
    fi
    # Per-node confirmation for COPY_HOSTS (an empty selection is allowed;
    # COPY_HOSTS is simply omitted from the file in that case)
    echo ""
    echo "Select nodes for COPY_HOSTS (build/model distribution):"
    local selected_copy=()
    for node in "${COPY_PEER_NODES[@]}"; do
        read -r -p " Include $node in COPY_HOSTS? [Y/n]: " r
        r="${r,,}"
        if [[ ! "$r" =~ ^(n|no)$ ]]; then
            selected_copy+=("$node")
        fi
    done
    # Write .env (whole file is replaced; comma-join arrays via a subshell IFS)
    {
        echo "# Auto-generated by autodiscover.sh"
        echo "CLUSTER_NODES=$(IFS=,; echo "${selected_cluster[*]}")"
        if [[ "${#selected_copy[@]}" -gt 0 ]]; then
            echo "COPY_HOSTS=$(IFS=,; echo "${selected_copy[*]}")"
        fi
        echo "LOCAL_IP=$LOCAL_IP"
        echo "ETH_IF=$ETH_IF"
        echo "IB_IF=$IB_IF"
        if [[ "$MESH_MODE" == "true" ]]; then
            echo "# Mesh mode NCCL settings"
            echo "CONTAINER_NCCL_NET_PLUGIN=none"
            echo "CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1"
            echo "CONTAINER_NCCL_IB_MERGE_NICS=0"
        fi
    } > "$env_file"
    echo ""
    echo "Saved to $env_file"
}
# Convenience function: run full autodiscovery pipeline.
# Stops at the first stage that fails; the exit status of save_config
# becomes the function's return value on success of all detect stages.
run_autodiscover() {
    local stage
    for stage in detect_interfaces detect_local_ip detect_nodes detect_copy_hosts; do
        "$stage" || return 1
    done
    save_config
}

View File

@@ -6,17 +6,22 @@ START_TIME=$(date +%s)
# Default values # Default values
IMAGE_TAG="vllm-node" IMAGE_TAG="vllm-node"
IMAGE_TAG_SET=false
REBUILD_FLASHINFER=false REBUILD_FLASHINFER=false
REBUILD_VLLM=false REBUILD_VLLM=false
COPY_HOSTS=() COPY_HOSTS=()
COPY_TO_FLAG=false
SSH_USER="$USER" SSH_USER="$USER"
NO_BUILD=false NO_BUILD=false
VLLM_REF="main" VLLM_REF="main"
VLLM_REF_SET=false
FLASHINFER_REF="main"
FLASHINFER_REF_SET=false
TMP_IMAGE="" TMP_IMAGE=""
PARALLEL_COPY=false PARALLEL_COPY=false
EXP_MXFP4=false EXP_MXFP4=false
VLLM_REF_SET=false
VLLM_PRS="" VLLM_PRS=""
FLASHINFER_PRS=""
PRE_TRANSFORMERS=false PRE_TRANSFORMERS=false
FULL_LOG=false FULL_LOG=false
BUILD_JOBS="16" BUILD_JOBS="16"
@@ -27,6 +32,8 @@ FLASHINFER_RELEASE_TAG="prebuilt-flashinfer-current"
VLLM_RELEASE_TAG="prebuilt-vllm-current" VLLM_RELEASE_TAG="prebuilt-vllm-current"
# Space-separated list of GPU architectures for which prebuilt wheels are available # Space-separated list of GPU architectures for which prebuilt wheels are available
PREBUILT_WHEELS_SUPPORTED_ARCHS="12.1a" PREBUILT_WHEELS_SUPPORTED_ARCHS="12.1a"
CLEANUP_MODE="false"
CONFIG_FILE=""
cleanup() { cleanup() {
if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
@@ -262,11 +269,12 @@ if downloads:
# Help function # Help function
usage() { usage() {
echo "Usage: $0 [OPTIONS]" echo "Usage: $0 [OPTIONS]"
echo " -t, --tag <tag> : Image tag (default: 'vllm-node')" echo " -t, --tag <tag> : Image tag (default: 'vllm-node', 'vllm-node-tf5' with --tf5, 'vllm-node-mxfp4' with --exp-mxfp4)"
echo " --gpu-arch <arch> : GPU architecture (default: '12.1a')" echo " --gpu-arch <arch> : GPU architecture (default: '12.1a')"
echo " --rebuild-flashinfer : Force rebuild of FlashInfer wheels (ignore cached wheels)" echo " --rebuild-flashinfer : Force rebuild of FlashInfer wheels (ignore cached wheels)"
echo " --rebuild-vllm : Force rebuild of vLLM wheels (ignore cached wheels)" echo " --rebuild-vllm : Force rebuild of vLLM wheels (ignore cached wheels)"
echo " --vllm-ref <ref> : vLLM commit SHA, branch or tag (default: 'main')" echo " --vllm-ref <ref> : vLLM commit SHA, branch or tag (default: 'main')"
echo " --flashinfer-ref <ref> : FlashInfer commit SHA, branch or tag (default: 'main')"
echo " -c, --copy-to <hosts> : Host(s) to copy the image to. Accepts comma or space-delimited lists." echo " -c, --copy-to <hosts> : Host(s) to copy the image to. Accepts comma or space-delimited lists."
echo " --copy-to-host : Alias for --copy-to (backwards compatibility)." echo " --copy-to-host : Alias for --copy-to (backwards compatibility)."
echo " --copy-parallel : Copy to all hosts in parallel instead of serially." echo " --copy-parallel : Copy to all hosts in parallel instead of serially."
@@ -275,48 +283,34 @@ usage() {
echo " --tf5 : Install transformers>=5 (aliases: --pre-tf, --pre-transformers)" echo " --tf5 : Install transformers>=5 (aliases: --pre-tf, --pre-transformers)"
echo " --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support" echo " --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support"
echo " --apply-vllm-pr <pr-num> : Apply a specific PR patch to vLLM source. Can be specified multiple times." echo " --apply-vllm-pr <pr-num> : Apply a specific PR patch to vLLM source. Can be specified multiple times."
echo " --apply-flashinfer-pr <pr-num>: Apply a specific PR patch to FlashInfer source. Can be specified multiple times."
echo " --full-log : Enable full build logging (--progress=plain)" echo " --full-log : Enable full build logging (--progress=plain)"
echo " --no-build : Skip building, only copy image (requires --copy-to)" echo " --no-build : Skip building, only copy image (requires --copy-to)"
echo " --network <network> : Docker network to use during build" echo " --network <network> : Docker network to use during build"
echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory"
echo " --config : Path to .env configuration file (default: .env in script directory)"
echo " --setup : Force autodiscovery and save configuration (even if .env exists)"
echo " -h, --help : Show this help message" echo " -h, --help : Show this help message"
exit 1 exit 1
} }
# Argument parsing # Parse all arguments
CONFIG_FILE_SET=false
while [[ "$#" -gt 0 ]]; do while [[ "$#" -gt 0 ]]; do
case $1 in case $1 in
-t|--tag) IMAGE_TAG="$2"; shift ;; -t|--tag) IMAGE_TAG="$2"; IMAGE_TAG_SET=true; shift ;;
--gpu-arch) GPU_ARCH_LIST="$2"; shift ;; --gpu-arch) GPU_ARCH_LIST="$2"; shift ;;
--rebuild-flashinfer) REBUILD_FLASHINFER=true ;; --rebuild-flashinfer) REBUILD_FLASHINFER=true ;;
--rebuild-vllm) REBUILD_VLLM=true ;; --rebuild-vllm) REBUILD_VLLM=true ;;
--vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;; --vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;;
--flashinfer-ref) FLASHINFER_REF="$2"; FLASHINFER_REF_SET=true; shift ;;
-c|--copy-to|--copy-to-host|--copy-to-hosts) -c|--copy-to|--copy-to-host|--copy-to-hosts)
COPY_TO_FLAG=true
shift shift
while [[ "$#" -gt 0 && "$1" != -* ]]; do while [[ "$#" -gt 0 && "$1" != -* ]]; do
add_copy_hosts "$1" add_copy_hosts "$1"
shift shift
done done
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "No hosts specified. Using autodiscovery..."
source "$(dirname "$0")/autodiscover.sh"
detect_nodes
if [ $? -ne 0 ]; then
echo "Error: Autodiscovery failed."
exit 1
fi
if [ ${#PEER_NODES[@]} -gt 0 ]; then
COPY_HOSTS=("${PEER_NODES[@]}")
fi
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "Error: Autodiscovery found no other nodes."
exit 1
fi
echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
fi
continue continue
;; ;;
-j|--build-jobs) BUILD_JOBS="$2"; shift ;; -j|--build-jobs) BUILD_JOBS="$2"; shift ;;
@@ -337,8 +331,22 @@ while [[ "$#" -gt 0 ]]; do
exit 1 exit 1
fi fi
;; ;;
--apply-flashinfer-pr)
if [ -n "$2" ] && [[ "$2" != -* ]]; then
if [ -n "$FLASHINFER_PRS" ]; then
FLASHINFER_PRS="$FLASHINFER_PRS $2"
else
FLASHINFER_PRS="$2"
fi
shift
else
echo "Error: --apply-flashinfer-pr requires a PR number."
exit 1
fi
;;
--full-log) FULL_LOG=true ;; --full-log) FULL_LOG=true ;;
--no-build) NO_BUILD=true ;; --no-build) NO_BUILD=true ;;
--cleanup) CLEANUP_MODE=true ;;
--network) --network)
if [ -n "$2" ] && [[ "$2" != -* ]]; then if [ -n "$2" ] && [[ "$2" != -* ]]; then
NETWORK_ARG="$2" NETWORK_ARG="$2"
@@ -348,19 +356,75 @@ while [[ "$#" -gt 0 ]]; do
exit 1 exit 1
fi fi
;; ;;
--config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
--setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
-h|--help) usage ;; -h|--help) usage ;;
*) echo "Unknown parameter passed: $1"; usage ;; *) echo "Unknown parameter passed: $1"; usage ;;
esac esac
shift shift
done done
# Apply default IMAGE_TAG based on flags if -t was not specified.
# --tf5 is checked first; --tf5 and --exp-mxfp4 are mutually exclusive
# (validated further below), so the ordering only affects which default
# would win before that validation runs.
if [ "$IMAGE_TAG_SET" = false ]; then
    if [ "$PRE_TRANSFORMERS" = true ]; then
        IMAGE_TAG="vllm-node-tf5"
    elif [ "$EXP_MXFP4" = true ]; then
        IMAGE_TAG="vllm-node-mxfp4"
    fi
fi
# Source autodiscover.sh to load .env file.
# Sourcing runs load_env_if_exists (exporting DOTENV_* variables) and brings
# the detect_* / save_config helper functions into this script's scope.
source "$(dirname "$0")/autodiscover.sh"

# If --setup: force full autodiscovery and save configuration
if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
    echo "Running full autodiscovery (--setup)..."
    detect_interfaces || exit 1
    detect_local_ip || exit 1
    detect_nodes || exit 1
    detect_copy_hosts || exit 1
    save_config || exit 1
    # Reload .env so DOTENV_* variables reflect saved config
    load_env_if_exists
fi
# Handle COPY_HOSTS from .env or autodiscovery only if -c was explicitly specified
# (i.e. the user asked to copy but gave no hosts on the command line).
if [ "$COPY_TO_FLAG" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
    # Prefer the static COPY_HOSTS list from .env over a network scan
    if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
        echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
        IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
        COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
    else
        # Fall back to full autodiscovery; each stage is fatal on failure
        echo "No hosts specified. Using autodiscovery..."
        detect_interfaces || { echo "Error: Interface detection failed."; exit 1; }
        detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; }
        detect_nodes || { echo "Error: Node detection failed."; exit 1; }
        detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; }
        if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then
            COPY_HOSTS=("${COPY_PEER_NODES[@]}")
        fi
        if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
            echo "Error: Autodiscovery found no other nodes."
            exit 1
        fi
        echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
    fi
fi
# Validate flag combinations # Validate flag combinations
if [ -n "$VLLM_PRS" ]; then if [ -n "$VLLM_PRS" ]; then
if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi
fi fi
if [ -n "$FLASHINFER_PRS" ]; then
if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-flashinfer-pr is incompatible with --exp-mxfp4"; exit 1; fi
fi
if [ "$EXP_MXFP4" = true ]; then if [ "$EXP_MXFP4" = true ]; then
if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref"; exit 1; fi if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref"; exit 1; fi
if [ "$FLASHINFER_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --flashinfer-ref"; exit 1; fi
if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --tf5"; exit 1; fi if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --tf5"; exit 1; fi
if [ "$REBUILD_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-flashinfer"; exit 1; fi if [ "$REBUILD_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-flashinfer"; exit 1; fi
if [ "$REBUILD_VLLM" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-vllm"; exit 1; fi if [ "$REBUILD_VLLM" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-vllm"; exit 1; fi
@@ -372,6 +436,30 @@ if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
exit 1 exit 1
fi fi
# Handle cleanup mode (--cleanup): purge cached wheel artifacts so the next
# build regenerates them from scratch.
# NOTE(review): execution continues after cleanup (no exit), so --cleanup
# combined with a build performs a clean rebuild -- confirm this is intended.
if [[ "$CLEANUP_MODE" == "true" ]]; then
    WHEELS_DIR="./wheels"
    echo "Cleaning up wheels directory..."
    # Remove all .whl files
    if compgen -G "$WHEELS_DIR/*.whl" > /dev/null 2>&1; then
        rm -f "$WHEELS_DIR"/*.whl
        echo "Removed *.whl files from $WHEELS_DIR"
    else
        echo "No *.whl files found in $WHEELS_DIR"
    fi
    # Remove all hidden .*-commit marker files (build provenance markers)
    if compgen -G "$WHEELS_DIR/.*-commit" > /dev/null 2>&1; then
        rm -f "$WHEELS_DIR"/.*-commit
        echo "Removed .*-commit files from $WHEELS_DIR"
    else
        echo "No .*-commit files found in $WHEELS_DIR"
    fi
    echo "Cleanup complete."
fi
# Ensure wheels directory exists # Ensure wheels directory exists
mkdir -p ./wheels mkdir -p ./wheels
@@ -414,9 +502,21 @@ if [ "$NO_BUILD" = false ]; then
# ---------------------------------------------------------- # ----------------------------------------------------------
# Phase 1: FlashInfer wheels # Phase 1: FlashInfer wheels
# ---------------------------------------------------------- # ----------------------------------------------------------
if [ "$FLASHINFER_REF_SET" = true ] || [ -n "$FLASHINFER_PRS" ]; then
REBUILD_FLASHINFER=true
fi
BUILD_FLASHINFER=false BUILD_FLASHINFER=false
if [ "$REBUILD_FLASHINFER" = true ]; then if [ "$REBUILD_FLASHINFER" = true ]; then
if [ "$FLASHINFER_REF_SET" = true ] && [ -n "$FLASHINFER_PRS" ]; then
echo "Rebuilding FlashInfer wheels (--flashinfer-ref and --apply-flashinfer-pr specified)..."
elif [ "$FLASHINFER_REF_SET" = true ]; then
echo "Rebuilding FlashInfer wheels (--flashinfer-ref specified)..."
elif [ -n "$FLASHINFER_PRS" ]; then
echo "Rebuilding FlashInfer wheels (--apply-flashinfer-pr specified)..."
else
echo "Rebuilding FlashInfer wheels (--rebuild-flashinfer specified)..." echo "Rebuilding FlashInfer wheels (--rebuild-flashinfer specified)..."
fi
BUILD_FLASHINFER=true BUILD_FLASHINFER=true
elif try_download_wheels "$FLASHINFER_RELEASE_TAG" "flashinfer"; then elif try_download_wheels "$FLASHINFER_RELEASE_TAG" "flashinfer"; then
echo "FlashInfer wheels ready." echo "FlashInfer wheels ready."
@@ -438,12 +538,18 @@ if [ "$NO_BUILD" = false ]; then
FI_CMD=("docker" "build" FI_CMD=("docker" "build"
"--target" "flashinfer-export" "--target" "flashinfer-export"
"--output" "type=local,dest=./wheels" "--output" "type=local,dest=./wheels"
"${COMMON_BUILD_FLAGS[@]}") "${COMMON_BUILD_FLAGS[@]}"
"--build-arg" "FLASHINFER_REF=$FLASHINFER_REF")
if [ "$REBUILD_FLASHINFER" = true ]; then if [ "$REBUILD_FLASHINFER" = true ]; then
FI_CMD+=("--build-arg" "CACHEBUST_FLASHINFER=$(date +%s)") FI_CMD+=("--build-arg" "CACHEBUST_FLASHINFER=$(date +%s)")
fi fi
if [ -n "$FLASHINFER_PRS" ]; then
echo "Applying FlashInfer PRs: $FLASHINFER_PRS"
FI_CMD+=("--build-arg" "FLASHINFER_PRS=$FLASHINFER_PRS")
fi
FI_CMD+=(".") FI_CMD+=(".")
echo "FlashInfer build command: ${FI_CMD[*]}" echo "FlashInfer build command: ${FI_CMD[*]}"

View File

@@ -42,13 +42,54 @@ However, in order to get full bandwidth in NCCL RDMA mode, we need to utilize **
Also, note that connecting two Sparks using **both** ports won't give you any noticeable advantage in bandwidth, so single connection is sufficient. Also, note that connecting two Sparks using **both** ports won't give you any noticeable advantage in bandwidth, so single connection is sufficient.
If you connect 3 Sparks by daisy-chaining them, you will only be able to sustain 100G between each pair of Sparks. If you connect 3 Sparks by daisy-chaining them, you will only be able to sustain 100G between each pair of Sparks.
## Connecting more than 2 Sparks in the cluster ## Connecting 3 Sparks in a mesh cluster without a switch
Three Sparks can be connected together in a cluster without using a separate RoCE switch.
However, all three Sparks need to be on the same wired network using its 10G Ethernet port (RJ-45, not QSFP). Being on the same wireless network should work too, but it's not recommended and was not tested.
You need to make sure they are connected the following way: port 0 on one Spark should connect to port 1 on another Spark (unlike non-mesh configuration).
Example diagram:
```mermaid
block-beta
columns 1
block:Spark3
columns 2
Title3["Spark 3"]:2
s3p0["Port 0<br>192.168.187.13<br>192.168.188.13"] s3p1["Port 1<br>192.168.197.13<br>192.168.198.13"]
end
space
block:Spark2
columns 2
Title2["Spark 2"]:2
s2p0["Port 0<br>192.168.197.12<br>192.168.198.12"] s2p1["Port 1<br>192.168.177.12<br>192.168.178.13"]
end
space
block:Spark1
columns 2
Title1["Spark 1"]:2
s1p0["Port 0<br>192.168.177.11<br>192.168.178.11"] s1p1["Port 1<br>192.168.187.11<br>192.168.188.11"]
end
s1p0 <--> s2p1
s2p0 <--> s3p1
s3p0 <--> s1p1
```
## Connecting more than 2 Sparks in the cluster using a switch
To connect more than 2 Sparks, you will need a proper switch, for example [Microtik CRS812-DDQ](https://mikrotik.com/product/crs812_ddq). To connect more than 2 Sparks, you will need a proper switch, for example [Microtik CRS812-DDQ](https://mikrotik.com/product/crs812_ddq).
Please refer to [this post](https://forums.developer.nvidia.com/t/6x-spark-setup/354399/56) for an example of setting up a 6-8 node Spark cluster. Please refer to [this post](https://forums.developer.nvidia.com/t/6x-spark-setup/354399/56) for an example of setting up a 6-8 node Spark cluster.
## Network setup ## Network setup
### For dual Sparks or multiple Sparks using a QSFP switch
Assuming both are connected using rightmost QSFP port (when looking from the back). Assuming both are connected using rightmost QSFP port (when looking from the back).
Create `/etc/netplan/40-cx7.yaml` on `spark`: Create `/etc/netplan/40-cx7.yaml` on `spark`:
@@ -65,8 +106,9 @@ network:
enP2p1s0f1np1: enP2p1s0f1np1:
dhcp4: no dhcp4: no
dhcp6: no dhcp6: no
link-local: [ ipv4 ] link-local: []
mtu: 9000 mtu: 9000
addresses: [192.168.178.11/24]
``` ```
Create `/etc/netplan/40-cx7.yaml` on `spark2`: Create `/etc/netplan/40-cx7.yaml` on `spark2`:
@@ -83,16 +125,12 @@ network:
enP2p1s0f1np1: enP2p1s0f1np1:
dhcp4: no dhcp4: no
dhcp6: no dhcp6: no
link-local: [ ipv4 ] link-local: []
mtu: 9000 mtu: 9000
addresses: [192.168.178.12/24]
``` ```
Please note, that only one interface of the "twin" pair needs an IP address, but MTU needs to be set on both. **DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing.
You can also assign a separate address to another "twin" if you want to utilize the second interface independently, but make sure you assign an IP address from a different subnet.
For instance, for the example above, if you want to assign an IP to `enP2p1s0f1np1`, you need to use `192.168.177.12` on `spark`. **DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing.
This will not affect vLLM performance as it will use RDMA over RoCE using both "twins", even if the IP is only set on one.
Then run on each node: Then run on each node:
@@ -115,6 +153,122 @@ MTU setting (testing):
sudo ip link set dev enp1s0f1np1 mtu 9000 sudo ip link set dev enp1s0f1np1 mtu 9000
``` ```
### For 3-node mesh
3-node mesh is configured differently than dual clusters or clusters using a QSFP switch.
Assuming, your Sparks are connected according to the diagram above:
Create `/etc/netplan/40-cx7.yaml` on `spark1`:
```yaml
network:
version: 2
ethernets:
enp1s0f0np0:
dhcp4: no
dhcp6: no # Explicitly disable DHCPv6
link-local: [] # Restrict link-local addresses to static IPv4 only
mtu: 9000
addresses: [192.168.177.11/24]
enP2p1s0f0np0:
dhcp4: no
dhcp6: no
link-local: []
mtu: 9000
addresses: [192.168.178.11/24]
enp1s0f1np1:
dhcp4: no
dhcp6: no # Explicitly disable DHCPv6
link-local: [] # Restrict link-local addresses to static IPv4 only
mtu: 9000
addresses: [192.168.187.11/24]
enP2p1s0f1np1:
dhcp4: no
dhcp6: no
link-local: []
mtu: 9000
addresses: [192.168.188.11/24]
```
Create `/etc/netplan/40-cx7.yaml` on `spark2`:
```yaml
network:
version: 2
ethernets:
enp1s0f0np0:
dhcp4: no
dhcp6: no # Explicitly disable DHCPv6
link-local: [] # Restrict link-local addresses to static IPv4 only
mtu: 9000
addresses: [192.168.197.12/24]
enP2p1s0f0np0:
dhcp4: no
dhcp6: no
link-local: []
mtu: 9000
addresses: [192.168.198.12/24]
enp1s0f1np1:
dhcp4: no
dhcp6: no # Explicitly disable DHCPv6
link-local: [] # Restrict link-local addresses to static IPv4 only
mtu: 9000
addresses: [192.168.177.12/24]
enP2p1s0f1np1:
dhcp4: no
dhcp6: no
link-local: []
mtu: 9000
addresses: [192.168.178.12/24]
```
Create `/etc/netplan/40-cx7.yaml` on `spark3`:
```yaml
network:
version: 2
ethernets:
enp1s0f0np0:
dhcp4: no
dhcp6: no # Explicitly disable DHCPv6
link-local: [] # Restrict link-local addresses to static IPv4 only
mtu: 9000
addresses: [192.168.187.13/24]
enP2p1s0f0np0:
dhcp4: no
dhcp6: no
link-local: []
mtu: 9000
addresses: [192.168.188.13/24]
enp1s0f1np1:
dhcp4: no
dhcp6: no # Explicitly disable DHCPv6
link-local: [] # Restrict link-local addresses to static IPv4 only
mtu: 9000
addresses: [192.168.197.13/24]
enP2p1s0f1np1:
dhcp4: no
dhcp6: no
link-local: []
mtu: 9000
addresses: [192.168.198.13/24]
```
Then run (on each Spark):
```bash
sudo chmod 600 /etc/netplan/40-cx7.yaml
sudo netplan apply
```
### Passwordless SSH and benchmarks
Set up passwordless ssh. On the first spark:
```bash
wget https://raw.githubusercontent.com/NVIDIA/dgx-spark-playbooks/refs/heads/main/nvidia/connect-two-sparks/assets/discover-sparks
chmod +x discover-sparks
./discover-sparks
```
**Benchmark connection (use perftest package):** **Benchmark connection (use perftest package):**
Run the receiver on `spark2` node: Run the receiver on `spark2` node:
@@ -196,7 +350,9 @@ ib_write_lat 192.168.177.12 -d rocep1s0f1 --report_gbits -R --force-link IB
--------------------------------------------------------------------------------------- ---------------------------------------------------------------------------------------
``` ```
## NCCL Setup ## NCCL Tests
### Dual Sparks or Sparks via QSFP switch
From https://build.nvidia.com/spark/nccl/stacked-sparks From https://build.nvidia.com/spark/nccl/stacked-sparks
@@ -240,3 +396,51 @@ mpirun -np 2 -H 192.168.177.11:1,192.168.177.12:1 \
$HOME/nccl-tests/build/all_gather_perf -b 16G -e 16G -f 2 $HOME/nccl-tests/build/all_gather_perf -b 16G -e 16G -f 2
``` ```
### 3-node mesh
```bash
# Install dependencies and build NCCL
sudo apt-get update && sudo apt-get install -y libopenmpi-dev
git clone -b dgxspark-3node-ring https://github.com/zyang-dev/nccl.git ~/nccl/
cd ~/nccl/
make -j src.build NVCC_GENCODE="-gencode=arch=compute_121,code=sm_121"
# Set environment variables
export CUDA_HOME="/usr/local/cuda"
export MPI_HOME="/usr/lib/aarch64-linux-gnu/openmpi"
export NCCL_HOME="$HOME/nccl/build/"
export LD_LIBRARY_PATH="$NCCL_HOME/lib:$CUDA_HOME/lib64/:$MPI_HOME/lib:$LD_LIBRARY_PATH"
```
Build NCCL Test Suite:
```bash
# Clone and build NCCL tests
git clone https://github.com/NVIDIA/nccl-tests.git ~/nccl-tests/
cd ~/nccl-tests/
make MPI=1
```
Test on all three nodes (replace spark1, spark2, spark3 with the actual hostnames or IP addresses on the non-QSFP interface):
```bash
# Set environment variables
export CUDA_HOME="/usr/local/cuda"
export MPI_HOME="/usr/lib/aarch64-linux-gnu/openmpi"
export NCCL_HOME="$HOME/nccl/build/"
export LD_LIBRARY_PATH="$NCCL_HOME/lib:$CUDA_HOME/lib64/:$MPI_HOME/lib:$LD_LIBRARY_PATH"
# For 3-node mesh we have to use 10G interface for OOB communication!
export UCX_NET_DEVICES=enP7s7
export NCCL_SOCKET_IFNAME=enP7s7
export OMPI_MCA_btl_tcp_if_include=enP7s7
export NCCL_IB_HCA=rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1
export NCCL_IB_DISABLE=0
# Run the all_gather performance test across all three nodes
mpirun -np 3 -H spark1:1,spark2:1,spark3:1 \
--mca plm_rsh_agent "ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" \
-x LD_LIBRARY_PATH=$LD_LIBRARY_PATH -x NCCL_IB_MERGE_NICS=0 -x NCCL_NET_PLUGIN=none -x NCCL_IB_SUBNET_AWARE_ROUTING=1 \
$HOME/nccl-tests/build/all_gather_perf -b 16G -e 16G -f 3
```

View File

@@ -7,6 +7,8 @@ HUB_PATH="${HF_HOME:-$HOME/.cache/huggingface}/hub"
COPY_HOSTS=() COPY_HOSTS=()
SSH_USER="$USER" SSH_USER="$USER"
PARALLEL_COPY=false PARALLEL_COPY=false
CONFIG_FILE=""
CONFIG_FILE_SET=false
# Help function # Help function
usage() { usage() {
@@ -16,6 +18,7 @@ usage() {
echo " --copy-to-host : Alias for --copy-to (backwards compatibility)." echo " --copy-to-host : Alias for --copy-to (backwards compatibility)."
echo " --copy-parallel : Copy to all hosts in parallel instead of serially." echo " --copy-parallel : Copy to all hosts in parallel instead of serially."
echo " -u, --user <user> : Username for ssh commands (default: \$USER)" echo " -u, --user <user> : Username for ssh commands (default: \$USER)"
echo " --config <file> : Path to .env configuration file (default: .env in script directory)"
echo " -h, --help : Show this help message" echo " -h, --help : Show this help message"
exit 1 exit 1
} }
@@ -53,42 +56,22 @@ copy_model_to_host() {
} }
# Argument parsing # Argument parsing
COPY_TO_FLAG=false
while [[ "$#" -gt 0 ]]; do while [[ "$#" -gt 0 ]]; do
case $1 in case $1 in
-c|--copy-to|--copy-to-host|--copy-to-hosts) -c|--copy-to|--copy-to-host|--copy-to-hosts)
COPY_TO_FLAG=true
shift shift
# Consume arguments until the next flag or end of args # Consume arguments until the next flag or end of args
while [[ "$#" -gt 0 && "$1" != -* ]]; do while [[ "$#" -gt 0 && "$1" != -* ]]; do
add_copy_hosts "$1" add_copy_hosts "$1"
shift shift
done done
# If no hosts specified, use autodiscovery
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "No hosts specified. Using autodiscovery..."
source "$(dirname "$0")/autodiscover.sh"
detect_nodes
if [ $? -ne 0 ]; then
echo "Error: Autodiscovery failed."
exit 1
fi
# Use PEER_NODES directly
if [ ${#PEER_NODES[@]} -gt 0 ]; then
COPY_HOSTS=("${PEER_NODES[@]}")
fi
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "Error: Autodiscovery found no other nodes."
exit 1
fi
echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
fi
continue continue
;; ;;
--copy-parallel) PARALLEL_COPY=true ;; --copy-parallel) PARALLEL_COPY=true ;;
-u|--user) SSH_USER="$2"; shift ;; -u|--user) SSH_USER="$2"; shift ;;
--config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
-h|--help) usage ;; -h|--help) usage ;;
*) *)
# If positional argument is provided # If positional argument is provided
@@ -103,12 +86,47 @@ while [[ "$#" -gt 0 ]]; do
shift shift
done done
# Export config so autodiscover.sh picks it up
export CONFIG_FILE CONFIG_FILE_SET
# Source autodiscover.sh to load .env (for DOTENV_COPY_HOSTS) and make detection functions available
source "$(dirname "$0")/autodiscover.sh"
# Validate model name is provided # Validate model name is provided
if [ -z "${MODEL_NAME:-}" ]; then if [ -z "${MODEL_NAME:-}" ]; then
echo "Error: Model name is required." echo "Error: Model name is required."
usage usage
fi fi
# Resolve COPY_HOSTS if --copy-to was given without hosts, or use .env
if [ "$COPY_TO_FLAG" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
# --copy-to was specified but no hosts given: use .env or autodiscover
if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
else
echo "No hosts specified. Using autodiscovery..."
detect_interfaces || { echo "Error: Interface detection failed."; exit 1; }
detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; }
detect_nodes || { echo "Error: Node detection failed."; exit 1; }
detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; }
if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then
COPY_HOSTS=("${COPY_PEER_NODES[@]}")
fi
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "Error: Autodiscovery found no other nodes."
exit 1
fi
echo "Autodiscovered copy hosts: ${COPY_HOSTS[*]}"
fi
elif [ "$COPY_TO_FLAG" = false ] && [ "${#COPY_HOSTS[@]}" -eq 0 ] && [[ -n "$DOTENV_COPY_HOSTS" ]]; then
# No --copy-to flag but .env has COPY_HOSTS — don't auto-copy; user must request it explicitly
: # intentional no-op; user didn't ask for copy
fi
# Check if uvx is installed # Check if uvx is installed
if ! command -v uvx &> /dev/null; then if ! command -v uvx &> /dev/null; then
echo "Error: 'uvx' command not found." echo "Error: 'uvx' command not found."

View File

@@ -24,12 +24,13 @@ CONTAINER_NAME="$DEFAULT_CONTAINER_NAME"
COMMAND_TO_RUN="" COMMAND_TO_RUN=""
DAEMON_MODE="false" DAEMON_MODE="false"
CHECK_CONFIG="false" CHECK_CONFIG="false"
ACTION="start" ACTION=""
CLUSTER_WAS_RUNNING="false" CLUSTER_WAS_RUNNING="false"
MOD_PATHS=() MOD_PATHS=()
MOD_TYPES=() MOD_TYPES=()
LAUNCH_SCRIPT_PATH="" LAUNCH_SCRIPT_PATH=""
SCRIPT_DIR="$(dirname "$(realpath "$0")")" SCRIPT_DIR="$(dirname "$(realpath "$0")")"
CONFIG_FILE="" # Will be set to default after argument parsing
ACTIONS_ARG="" ACTIONS_ARG=""
SOLO_MODE="false" SOLO_MODE="false"
@@ -67,9 +68,31 @@ usage() {
echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)" echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)"
echo " --pids-limit Process limit (default: 4096, only with --non-privileged)" echo " --pids-limit Process limit (default: 4096, only with --non-privileged)"
echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)" echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)"
echo " --config Path to .env configuration file (default: .env in script directory)
--setup/--discover Force autodiscovery and save configuration (even if .env exists)"
echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script." echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script."
echo " command Command to run (only for 'exec' action). Not compatible with --launch-script." echo " command Command to run (only for 'exec' action). Not compatible with --launch-script."
echo "" echo ""
echo "Supported .env file variables:"
echo " CLUSTER_NODES Comma-separated list of node IPs"
echo " ETH_IF Ethernet interface name"
echo " IB_IF InfiniBand interface name"
echo " MASTER_PORT Port for cluster coordination (default: 29501)"
echo " CONTAINER_NAME Container name (default: vllm_node)"
echo " LOCAL_IP Local IP address (for solo mode or override auto-detection)"
echo " CONTAINER_* Any variable starting with CONTAINER_ (except CONTAINER_NAME)"
echo " becomes -e flag. Example: CONTAINER_NCCL_DEBUG=INFO -> -e NCCL_DEBUG=INFO"
echo ""
echo "Example .env file:"
echo " CLUSTER_NODES=192.168.1.1,192.168.1.2"
echo " ETH_IF=eth0"
echo " IB_IF=ib0"
echo " MASTER_PORT=29501"
echo " CONTAINER_NAME=vllm_node"
echo " LOCAL_IP=192.168.1.1"
echo " CONTAINER_NCCL_DEBUG=INFO"
echo " CONTAINER_HF_TOKEN=abc123"
echo ""
echo "Launch Script Usage:" echo "Launch Script Usage:"
echo " $0 --launch-script examples/my-script.sh # Script copied to container and executed" echo " $0 --launch-script examples/my-script.sh # Script copied to container and executed"
echo " $0 --launch-script /path/to/script.sh # Uses absolute path to script" echo " $0 --launch-script /path/to/script.sh # Uses absolute path to script"
@@ -108,6 +131,8 @@ while [[ "$#" -gt 0 ]]; do
--shm-size-gb) SHM_SIZE_GB="$2"; shift ;; --shm-size-gb) SHM_SIZE_GB="$2"; shift ;;
-d) DAEMON_MODE="true" ;; -d) DAEMON_MODE="true" ;;
-h|--help) usage ;; -h|--help) usage ;;
--config) CONFIG_FILE="$2"; shift ;;
--setup|--discover) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
start|stop|status) start|stop|status)
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script." echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
@@ -133,6 +158,115 @@ while [[ "$#" -gt 0 ]]; do
shift shift
done done
# Set .env file path (use default if not specified)
if [[ -z "$CONFIG_FILE" ]]; then
CONFIG_FILE="$SCRIPT_DIR/.env"
CONFIG_FILE_SET=false
else
CONFIG_FILE_SET=true
fi
# Load .env file
if [[ -f "$CONFIG_FILE" ]]; then
echo "Loading configuration from .env file..."
# Validate .env file syntax
if ! python3 -c "
import sys
import re
env_file = '$CONFIG_FILE'
seen_keys = set()
with open(env_file, 'r') as f:
for line_num, line in enumerate(f, 1):
line = line.strip()
# Skip empty lines and comments
if not line or line.startswith('#'):
continue
# Check for key=value format
if '=' not in line:
print(f'Error: Invalid syntax at line {line_num}: missing \"=\"')
sys.exit(1)
key = line.split('=', 1)[0].strip()
# Validate key format (alphanumeric + underscore)
if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', key):
print(f'Error: Invalid key format at line {line_num}: {key}')
sys.exit(1)
# Check for duplicates
if key in seen_keys:
print(f'Error: Duplicate key at line {line_num}: {key}')
sys.exit(1)
seen_keys.add(key)
sys.exit(0)
" 2>/dev/null; then
echo "Error: Invalid .env file syntax. Aborting."
exit 1
fi
# Load .env variables with DOTENV_ prefix
while IFS='=' read -r key value || [[ -n "$key" ]]; do
# Skip comments and empty lines
[[ "$key" =~ ^[[:space:]]*# ]] && continue
[[ -z "$key" ]] && continue
# Remove leading/trailing whitespace from key
key=$(echo "$key" | xargs)
# Skip if key is empty after trimming
[[ -z "$key" ]] && continue
# Remove quotes and whitespace from value using Python for proper shlex handling
value=$(python3 -c "
import shlex
import sys
value = '''$value'''
# Strip whitespace
value = value.strip()
# Remove surrounding quotes if present
if (value.startswith('\"') and value.endswith('\"')) or (value.startswith(\"'\" ) and value.endswith(\"'\")):
value = value[1:-1]
print(value)
")
# Export with DOTENV_ prefix
export "DOTENV_$key=$value"
done < "$CONFIG_FILE"
echo "Loaded .env variables: $(compgen -v DOTENV_ | tr '\n' ' ')"
fi
# Apply .env configuration (CLI args take precedence)
if [[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]]; then
NODES_ARG="$DOTENV_CLUSTER_NODES"
fi
if [[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]]; then
ETH_IF="$DOTENV_ETH_IF"
fi
if [[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]]; then
IB_IF="$DOTENV_IB_IF"
fi
if [[ -z "$MASTER_PORT" || "$MASTER_PORT" == "29501" ]] && [[ -n "$DOTENV_MASTER_PORT" ]]; then
MASTER_PORT="$DOTENV_MASTER_PORT"
fi
if [[ -z "$CONTAINER_NAME" || "$CONTAINER_NAME" == "vllm_node" ]] && [[ -n "$DOTENV_CONTAINER_NAME" ]]; then
CONTAINER_NAME="$DOTENV_CONTAINER_NAME"
fi
if [[ -n "$DOTENV_LOCAL_IP" ]]; then
export LOCAL_IP="$DOTENV_LOCAL_IP"
fi
# Validate non-privileged mode flags # Validate non-privileged mode flags
if [[ "$NON_PRIVILEGED_MODE" == "true" ]]; then if [[ "$NON_PRIVILEGED_MODE" == "true" ]]; then
# Set default swap limit if not specified # Set default swap limit if not specified
@@ -163,6 +297,26 @@ if [[ -n "$NCCL_DEBUG_VAL" ]]; then
esac esac
fi fi
# Add container environment variables from .env (CONTAINER_* pattern)
# Excludes CONTAINER_NAME which is a configuration variable, not an env var
for env_var in $(compgen -v DOTENV_CONTAINER_); do
# Skip CONTAINER_NAME as it's a configuration variable
[[ "$env_var" == "DOTENV_CONTAINER_NAME" ]] && continue
# Get the value
value="${!env_var}"
# Extract the actual env var name (remove DOTENV_CONTAINER_ prefix)
actual_var="${env_var#DOTENV_CONTAINER_}"
# Properly escape the value for shell using Python
escaped_value=$(python3 -c "import shlex; print(shlex.quote('$value'))")
# Add to docker args
DOCKER_ARGS="$DOCKER_ARGS -e $actual_var=$escaped_value"
echo "Adding container env: $actual_var"
done
# Add build job parallelization environment variables if BUILD_JOBS is set # Add build job parallelization environment variables if BUILD_JOBS is set
if [[ -n "$BUILD_JOBS" ]]; then if [[ -n "$BUILD_JOBS" ]]; then
DOCKER_ARGS="$DOCKER_ARGS -e MAX_JOBS=$BUILD_JOBS" DOCKER_ARGS="$DOCKER_ARGS -e MAX_JOBS=$BUILD_JOBS"
@@ -214,7 +368,7 @@ if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
LAUNCH_SCRIPT_MODE="true" LAUNCH_SCRIPT_MODE="true"
# If launch script is specified, default action to exec unless explicitly set to stop/status # If launch script is specified, default action to exec unless explicitly set to stop/status
if [[ "$ACTION" == "start" ]]; then if [[ -z "$ACTION" || "$ACTION" == "start" ]]; then
ACTION="exec" ACTION="exec"
fi fi
fi fi
@@ -259,13 +413,33 @@ done
# Source autodiscover module # Source autodiscover module
source "$(dirname "$0")/autodiscover.sh" source "$(dirname "$0")/autodiscover.sh"
if [[ "$SOLO_MODE" == "true" ]]; then if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
if [[ -n "$NODES_ARG" ]]; then # --setup: force full autodiscovery and save configuration
echo "Error: --solo is incompatible with -n/--nodes." echo "Running full autodiscovery (--setup)..."
exit 1 # Clear pre-loaded values so detect functions run fresh instead of short-circuiting
ETH_IF="" IB_IF="" NODES_ARG="" LOCAL_IP=""
detect_interfaces || exit 1
detect_local_ip || exit 1
detect_nodes || exit 1
detect_copy_hosts || exit 1
save_config || exit 1
# Reload .env so DOTENV_* variables reflect saved config
load_env_if_exists
[[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]] && NODES_ARG="$DOTENV_CLUSTER_NODES"
[[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]] && ETH_IF="$DOTENV_ETH_IF"
[[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]] && IB_IF="$DOTENV_IB_IF"
# If no action was specified, setup was the only intent — exit cleanly
if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" ]]; then
exit 0
fi fi
fi
if [[ "$SOLO_MODE" == "true" ]]; then
# Solo mode: skip node detection, just get local IP # Solo mode: skip node detection, just get local IP
# Use LOCAL_IP from .env if set, otherwise default to 127.0.0.1
if [[ -z "$LOCAL_IP" ]]; then
LOCAL_IP="127.0.0.1" LOCAL_IP="127.0.0.1"
fi
NODES_ARG="$LOCAL_IP" NODES_ARG="$LOCAL_IP"
PEER_NODES=() PEER_NODES=()
echo "Solo mode enabled. Skipping node detection." echo "Solo mode enabled. Skipping node detection."
@@ -337,6 +511,12 @@ if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ]
fi fi
fi fi
if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" && "$CHECK_CONFIG" != "true" ]]; then
echo "Error: No action specified. Use: start | stop | status | exec"
usage
exit 1
fi
if [[ "$CHECK_CONFIG" == "true" ]]; then if [[ "$CHECK_CONFIG" == "true" ]]; then
echo "Configuration Check Complete." echo "Configuration Check Complete."
echo " Image Name: $IMAGE_NAME" echo " Image Name: $IMAGE_NAME"
@@ -552,6 +732,32 @@ apply_mod_to_container() {
fi fi
} }
# Parse -tp/-pp/-dp (and long forms) from a text string (command or script content).
# Sets TP_SIZE, PP_SIZE, DP_SIZE, PARALLELISM_FOUND globals.
# Only acts when at least one parallelism flag is present.
parse_parallelism_from_text() {
    # $1: text to scan — either a one-line command or a whole launch script's content.
    local text="$1"
    # Defaults: every parallelism axis is 1 until an explicit flag overrides it.
    TP_SIZE=1; PP_SIZE=1; DP_SIZE=1
    PARALLELISM_FOUND=false
    # Normalize "--flag=value" AND "-tp=value" to "flag value" so the word-by-word
    # scan below only has to handle the space-separated form. argparse accepts '='
    # after multi-char short options (e.g. "-tp=2"), so both dash counts must match.
    # The leading (^|space) anchor keeps us from splitting '=' inside ordinary words.
    local normalized
    normalized=$(echo "$text" | sed -E 's/(^|[[:space:]])(-{1,2}[a-z-]+)=/\1\2 /g')
    local prev=""
    for word in $normalized; do
        # Whenever the previous word was a parallelism flag, the current word is
        # its value; accept it only if it is a plain non-negative integer.
        case "$prev" in
            -tp|--tensor-parallel-size)
                [[ "$word" =~ ^[0-9]+$ ]] && TP_SIZE="$word" && PARALLELISM_FOUND=true ;;
            -pp|--pipeline-parallel-size)
                [[ "$word" =~ ^[0-9]+$ ]] && PP_SIZE="$word" && PARALLELISM_FOUND=true ;;
            -dp|--data-parallel-size)
                [[ "$word" =~ ^[0-9]+$ ]] && DP_SIZE="$word" && PARALLELISM_FOUND=true ;;
        esac
        prev="$word"
    done
}
# Build a patched copy of the launch script on the host for a specific node. # Build a patched copy of the launch script on the host for a specific node.
# Strips --distributed-executor-backend and appends multi-node args. # Strips --distributed-executor-backend and appends multi-node args.
# Prints the path of the temp file (caller must delete it). # Prints the path of the temp file (caller must delete it).
@@ -797,6 +1003,29 @@ exec_no_ray_cluster() {
} }
if [[ "$ACTION" == "exec" ]]; then if [[ "$ACTION" == "exec" ]]; then
# Trim (or error on) PEER_NODES based on declared parallelism, for any multi-node exec
if [[ "$SOLO_MODE" != "true" && ${#PEER_NODES[@]} -gt 0 ]]; then
if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then
cmd_text=$(cat "$LAUNCH_SCRIPT_PATH" 2>/dev/null || true)
else
cmd_text="$COMMAND_TO_RUN"
fi
parse_parallelism_from_text "$cmd_text"
if [[ "$PARALLELISM_FOUND" == "true" ]]; then
required_nodes=$(( TP_SIZE * PP_SIZE * DP_SIZE ))
total_nodes=$(( 1 + ${#PEER_NODES[@]} ))
if [[ "$required_nodes" -gt "$total_nodes" ]]; then
echo "Error: Command requires $required_nodes nodes (tp=$TP_SIZE * pp=$PP_SIZE * dp=$DP_SIZE) but only $total_nodes node(s) are configured."
exit 1
elif [[ "$required_nodes" -lt "$total_nodes" ]]; then
echo "Note: Command requires $required_nodes node(s) (tp=$TP_SIZE * pp=$PP_SIZE * dp=$DP_SIZE); using $required_nodes of $total_nodes configured node(s)."
PEER_NODES=("${PEER_NODES[@]:0:$(( required_nodes - 1 ))}")
fi
fi
fi
start_cluster start_cluster
echo "Executing command: $COMMAND_TO_RUN" echo "Executing command: $COMMAND_TO_RUN"

View File

@@ -0,0 +1,61 @@
# Recipe: Qwen3.5-397B-A17B-INT4-AutoRound (PP=3)
# Qwen3.5-397B model in Intel INT4-AutoRound quantization, for a 3-node mesh
# Runs in pipeline-parallel mode (pp=3) with the Ray distributed executor.
# If you experience node shutdown, please limit GPU clocks on the affected node (or all nodes): `sudo nvidia-smi -lgc 200,2150`
recipe_version: "1"
name: Qwen3.5-397B-INT4-Autoround (PP=3)
description: Recipe for Qwen3.5-397B-INT4-Autoround to run on 3-node mesh in pipeline-parallel mode
# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3.5-397B-A17B-int4-AutoRound
cluster_only: true
# Container image to use
container: vllm-node-tf5
build_args:
- --tf5
# Mod required to fix ROPE syntax error
mods:
- mods/fix-qwen3.5-autoround
- mods/fix-qwen3.5-chat-template
# Default settings (can be overridden via CLI)
defaults:
port: 8000
host: 0.0.0.0
pipeline_parallel: 3
gpu_memory_utilization: 0.7
max_model_len: 262144
max_num_batched_tokens: 16384
# Environment variables
env:
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
VLLM_MARLIN_USE_ATOMIC_ADD: 1
# The vLLM serve command template
command: |
vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
--max-model-len {max_model_len} \
--max-num-seqs 10 \
--kv-cache-dtype fp8 \
--gpu-memory-utilization {gpu_memory_utilization} \
--port {port} \
--host {host} \
--enable-prefix-caching \
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--reasoning-parser qwen3 \
--max-num-batched-tokens {max_num_batched_tokens} \
--trust-remote-code \
--chat-template unsloth.jinja \
-tp 1 \
-pp {pipeline_parallel} \
--load-format fastsafetensors \
--distributed-executor-backend ray

View File

@@ -44,12 +44,16 @@ The recipe runner can automatically discover cluster nodes:
``` ```
When you run `--discover`, it: When you run `--discover`, it:
1. Scans the network for nodes with SSH access 1. Detects active CX7 interfaces and determines mesh vs. standard topology.
2. Prompts you to select which nodes to include 2. Scans the network for peers that are both SSH-reachable **and** have an NVIDIA GB10 GPU.
3. Saves the configuration to `.env` 3. In mesh mode, separately discovers `COPY_HOSTS` on the direct IB-attached interfaces.
4. Prompts for per-node confirmation for `CLUSTER_NODES` and `COPY_HOSTS`.
5. Saves the full configuration (including mesh NCCL settings if applicable) to `.env`.
Future recipe runs will automatically use nodes from `.env` unless you specify `-n` or `--solo`. Future recipe runs will automatically use nodes from `.env` unless you specify `-n` or `--solo`.
When distributing the container image or model files, the runner uses `COPY_HOSTS` from `.env` (which may differ from `CLUSTER_NODES` in mesh mode) to ensure transfers go over the fastest available path.
## Workflow Modes ## Workflow Modes
### Solo Mode (Single Node) ### Solo Mode (Single Node)
@@ -169,6 +173,7 @@ Usage: ./run-recipe.sh [OPTIONS] [RECIPE]
Cluster discovery: Cluster discovery:
--discover Auto-detect cluster nodes and save to .env --discover Auto-detect cluster nodes and save to .env
--show-env Show current .env configuration --show-env Show current .env configuration
--config FILE Path to .env configuration file (default: .env in repo directory)
Recipe overrides: Recipe overrides:
--port PORT Override port --port PORT Override port
@@ -186,10 +191,25 @@ Setup options:
Launch options: Launch options:
--solo Run in solo mode (single node, no Ray) --solo Run in solo mode (single node, no Ray)
--no-ray Multi-node without Ray (PyTorch distributed backend)
-n, --nodes IPS Comma-separated node IPs (first = head) -n, --nodes IPS Comma-separated node IPs (first = head)
-d, --daemon Run in daemon mode -d, --daemon Run in daemon mode
-t, --container IMAGE Override container from recipe -t, --container IMAGE Override container from recipe
--name NAME Override container name
--nccl-debug LEVEL NCCL debug level (VERSION, WARN, INFO, TRACE) --nccl-debug LEVEL NCCL debug level (VERSION, WARN, INFO, TRACE)
--master-port PORT Cluster coordination port: Ray head port or PyTorch
distributed master port (default: 29501).
Alias: --head-port
--eth-if IFACE Override Ethernet interface
--ib-if IFACE Override InfiniBand interface
-e VAR=VALUE Pass environment variable to container (repeatable)
-j N Number of parallel build jobs
--no-cache-dirs Do not mount ~/.cache/vllm, ~/.cache/flashinfer, ~/.triton
--non-privileged Run container without --privileged
--mem-limit-gb N Memory limit in GB (only with --non-privileged)
--mem-swap-limit-gb N Memory+swap limit in GB (only with --non-privileged)
--pids-limit N Process limit (only with --non-privileged)
--shm-size-gb N Shared memory size in GB (only with --non-privileged)
Extra vLLM arguments: Extra vLLM arguments:
-- ARGS... Pass additional arguments directly to vLLM -- ARGS... Pass additional arguments directly to vLLM
@@ -261,10 +281,18 @@ command: |
``` ```
┌─────────────────────────────────────────────────────────┐ ┌─────────────────────────────────────────────────────────┐
│ autodiscover.sh │
│ - Interface detection (standard / mesh topology) │
│ - GB10 peer verification via SSH │
│ - CLUSTER_NODES and COPY_HOSTS discovery │
│ - Interactive .env save with per-node confirmation │
└──────────────────────────┬──────────────────────────────┘
│ sourced by
┌─────────────────────────────────────────────────────────┐
│ run-recipe.sh / run-recipe.py │ │ run-recipe.sh / run-recipe.py │
│ - Parses YAML recipe │ │ - Parses YAML recipe │
│ - Auto-discovers cluster nodes (--discover) │ - Loads / triggers cluster discovery (--discover) │
│ - Loads nodes from .env │
│ - Handles --setup (build + download + run) │ │ - Handles --setup (build + download + run) │
│ - Generates launch script from template │ │ - Generates launch script from template │
│ - Applies CLI overrides │ │ - Applies CLI overrides │
@@ -274,7 +302,7 @@ command: |
┌──────────────────────┐ ┌───────────────────────────────┐ ┌──────────────────────┐ ┌───────────────────────────────┐
│ build-and-copy.sh │ │ hf-download.sh │ │ build-and-copy.sh │ │ hf-download.sh │
│ - Docker build │ │ - HuggingFace model download │ │ - Docker build │ │ - HuggingFace model download │
│ - Copy to workers │ │ - Rsync to workers │ - Copy to COPY_HOSTS│ │ - Rsync to COPY_HOSTS
└──────────────────────┘ └───────────────────────────────┘ └──────────────────────┘ └───────────────────────────────┘
│ then calls (for run) │ then calls (for run)
@@ -282,7 +310,7 @@ command: |
┌─────────────────────────────────────────────────────────┐ ┌─────────────────────────────────────────────────────────┐
│ launch-cluster.sh │ │ launch-cluster.sh │
│ - Cluster orchestration │ │ - Cluster orchestration │
│ - Container lifecycle │ - Container lifecycle (trimmed to required node count)
│ - Mod application │ │ - Mod application │
│ - Launch script execution │ │ - Launch script execution │
└─────────────────────────────────────────────────────────┘ └─────────────────────────────────────────────────────────┘

View File

@@ -105,7 +105,7 @@ LAUNCH_SCRIPT = SCRIPT_DIR / "launch-cluster.sh"
BUILD_SCRIPT = SCRIPT_DIR / "build-and-copy.sh" BUILD_SCRIPT = SCRIPT_DIR / "build-and-copy.sh"
DOWNLOAD_SCRIPT = SCRIPT_DIR / "hf-download.sh" DOWNLOAD_SCRIPT = SCRIPT_DIR / "hf-download.sh"
AUTODISCOVER_SCRIPT = SCRIPT_DIR / "autodiscover.sh" AUTODISCOVER_SCRIPT = SCRIPT_DIR / "autodiscover.sh"
ENV_FILE = SCRIPT_DIR / ".env" ENV_FILE = None # Will be set from CLI argument or default
def load_recipe(recipe_path: Path) -> dict[str, Any]: def load_recipe(recipe_path: Path) -> dict[str, Any]:
@@ -146,8 +146,11 @@ def load_recipe(recipe_path: Path) -> dict[str, Any]:
SystemExit: If recipe not found or validation fails SystemExit: If recipe not found or validation fails
""" """
if not recipe_path.exists(): if not recipe_path.exists():
# Try recipes directory with various extensions # Try candidates in order: add extension to original path first,
# then fall back to flat recipes/ directory (for bare recipe names)
candidates = [ candidates = [
Path(str(recipe_path) + ".yaml"),
Path(str(recipe_path) + ".yml"),
RECIPES_DIR / recipe_path.name, RECIPES_DIR / recipe_path.name,
RECIPES_DIR / f"{recipe_path.name}.yaml", RECIPES_DIR / f"{recipe_path.name}.yaml",
RECIPES_DIR / f"{recipe_path.name}.yml", RECIPES_DIR / f"{recipe_path.name}.yml",
@@ -187,7 +190,9 @@ def load_recipe(recipe_path: Path) -> dict[str, Any]:
SUPPORTED_VERSIONS = ["1"] SUPPORTED_VERSIONS = ["1"]
recipe_ver = str(recipe["recipe_version"]) recipe_ver = str(recipe["recipe_version"])
if recipe_ver not in SUPPORTED_VERSIONS: if recipe_ver not in SUPPORTED_VERSIONS:
print(f"Warning: Recipe uses schema version '{recipe_ver}', but this run-recipe.py supports: {SUPPORTED_VERSIONS}") print(
f"Warning: Recipe uses schema version '{recipe_ver}', but this run-recipe.py supports: {SUPPORTED_VERSIONS}"
)
print("Some features may not work correctly. Consider updating run-recipe.py.") print("Some features may not work correctly. Consider updating run-recipe.py.")
return recipe return recipe
@@ -269,19 +274,27 @@ def check_image_exists(image: str, host: str | None = None) -> bool:
""" """
if host: if host:
result = subprocess.run( result = subprocess.run(
["ssh", "-o", "BatchMode=yes", "-o", "StrictHostKeyChecking=no", [
host, f"docker image inspect '{image}'"], "ssh",
capture_output=True "-o",
"BatchMode=yes",
"-o",
"StrictHostKeyChecking=no",
host,
f"docker image inspect '{image}'",
],
capture_output=True,
) )
else: else:
result = subprocess.run( result = subprocess.run(
["docker", "image", "inspect", image], ["docker", "image", "inspect", image], capture_output=True
capture_output=True
) )
return result.returncode == 0 return result.returncode == 0
def build_image(image: str, copy_to: list[str] | None = None, build_args: list[str] | None = None) -> bool: def build_image(
image: str, copy_to: list[str] | None = None, build_args: list[str] | None = None
) -> bool:
""" """
Build the container image using build-and-copy.sh. Build the container image using build-and-copy.sh.
@@ -315,7 +328,7 @@ def build_image(image: str, copy_to: list[str] | None = None, build_args: list[s
if build_args: if build_args:
cmd.extend(build_args) cmd.extend(build_args)
if copy_to: if copy_to:
cmd.extend(["--copy-to", ",".join(copy_to)]) cmd.extend(["--copy-to", ",".join(copy_to), "--copy-parallel"])
print(f"Building image '{image}'...") print(f"Building image '{image}'...")
if build_args: if build_args:
@@ -353,7 +366,7 @@ def download_model(model: str, copy_to: list[str] | None = None) -> bool:
cmd = [str(DOWNLOAD_SCRIPT), model] cmd = [str(DOWNLOAD_SCRIPT), model]
if copy_to: if copy_to:
cmd.extend(["--copy-to", ",".join(copy_to)]) cmd.extend(["--copy-to", ",".join(copy_to), "--copy-parallel"])
print(f"Downloading model '{model}'...") print(f"Downloading model '{model}'...")
if copy_to: if copy_to:
@@ -393,7 +406,13 @@ def check_model_exists(model: str) -> bool:
return False return False
def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False, extra_args: list[str] | None = None, no_ray: bool = False) -> str: def generate_launch_script(
recipe: dict[str, Any],
overrides: dict[str, Any],
is_solo: bool = False,
extra_args: list[str] | None = None,
no_ray: bool = False,
) -> str:
""" """
Generate a bash launch script from the recipe. Generate a bash launch script from the recipe.
@@ -446,7 +465,7 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
if env_vars: if env_vars:
lines.append("# Environment variables") lines.append("# Environment variables")
for key, value in env_vars.items(): for key, value in env_vars.items():
lines.append(f"export {key}=\"{value}\"") lines.append(f'export {key}="{value}"')
lines.append("") lines.append("")
# Format the command with parameters # Format the command with parameters
@@ -462,26 +481,24 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is
# (not needed for solo; no-ray uses PyTorch distributed instead) # (not needed for solo; no-ray uses PyTorch distributed instead)
if is_solo or no_ray: if is_solo or no_ray:
import re import re
# Remove just the flag and its value, not the whole line # Remove just the flag and its value, not the whole line
command = re.sub(r'--distributed-executor-backend\s+\S+', '', command) command = re.sub(r"--distributed-executor-backend\s+\S+", "", command)
# Remove lines that are now empty or just a backslash continuation # Remove lines that are now empty or just a backslash continuation
lines_list = command.split('\n') lines_list = command.split("\n")
filtered_lines = [ filtered_lines = [line for line in lines_list if line.strip() not in ("", "\\")]
line for line in lines_list command = "\n".join(filtered_lines)
if line.strip() not in ('', '\\')
]
command = '\n'.join(filtered_lines)
# Remove trailing backslash if present # Remove trailing backslash if present
command = command.rstrip() command = command.rstrip()
if command.endswith('\\'): if command.endswith("\\"):
command = command.rstrip('\\\n').rstrip() command = command.rstrip("\\\n").rstrip()
# Append extra args if provided (after --) # Append extra args if provided (after --)
if extra_args: if extra_args:
# Join extra args and append to command # Join extra args and append to command
extra_args_str = ' '.join(shlex.quote(a) for a in extra_args) extra_args_str = " ".join(shlex.quote(a) for a in extra_args)
command = command + ' ' + extra_args_str command = command + " " + extra_args_str
lines.append("# Run the model") lines.append("# Run the model")
lines.append(command.strip()) lines.append(command.strip())
@@ -533,7 +550,6 @@ def load_env_file() -> dict[str, str]:
Reads the .env file created by --discover for persistent cluster configuration. Reads the .env file created by --discover for persistent cluster configuration.
EXTENSIBILITY: EXTENSIBILITY:
- To add new persistent settings: Just add them to save_env_file()
- To support multiple .env files: Add a --env-file CLI argument - To support multiple .env files: Add a --env-file CLI argument
- To add validation: Check for required keys after loading - To add validation: Check for required keys after loading
@@ -559,57 +575,16 @@ def load_env_file() -> dict[str, str]:
return env return env
def save_env_file(env: dict[str, str]) -> None:
"""
Save environment variables to .env file.
Persists cluster configuration discovered by autodiscover.sh.
Values are properly quoted if they contain spaces or commas.
EXTENSIBILITY:
- To add new persistent settings: Just add them to the env dict before calling
- To add timestamps/metadata: Add comment lines to the output
- To support append mode: Read existing, merge, then write
Args:
env: Dictionary of key=value pairs to save
"""
lines = ["# Auto-generated by run-recipe.py --discover", ""]
for key, value in sorted(env.items()):
# Quote values with spaces
if " " in value or "," in value:
lines.append(f'{key}="{value}"')
else:
lines.append(f"{key}={value}")
lines.append("")
with open(ENV_FILE, "w") as f:
f.write("\n".join(lines))
print(f"Saved to {ENV_FILE}")
def run_autodiscover() -> dict[str, str] | None: def run_autodiscover() -> dict[str, str] | None:
""" """
Run autodiscover.sh and return discovered configuration. Run autodiscover.sh interactively and return discovered configuration.
Executes the autodiscover.sh script to detect cluster topology, Executes the autodiscover.sh script to detect cluster topology,
then presents an interactive node selection menu. including interactive per-node confirmation and .env saving.
After autodiscover.sh completes, reads configuration from .env file.
EXTENSIBILITY:
- To add new discovery methods: Extend autodiscover.sh or add Python detection here
- To add GPU detection: Add nvidia-smi parsing to discovered env
- To skip interactive selection: Add a --non-interactive flag
- To add node health checks: Ping/SSH test each discovered node
DISCOVERED VARIABLES:
CLUSTER_NODES: Comma-separated list of node IPs (user-selected)
LOCAL_IP: This machine's IP address
ETH_IF: Ethernet interface name (e.g., 'eth0')
IB_IF: InfiniBand interface name (e.g., 'ibp12s0') if available
Returns: Returns:
Dictionary with discovered configuration, or None if discovery failed Dictionary with discovered configuration from .env, or None if discovery failed
""" """
if not AUTODISCOVER_SCRIPT.exists(): if not AUTODISCOVER_SCRIPT.exists():
print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}") print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}")
@@ -618,85 +593,32 @@ def run_autodiscover() -> dict[str, str] | None:
print("Running autodiscover...") print("Running autodiscover...")
print() print()
# Run autodiscover in a subshell and capture the variables # Pass CONFIG_FILE so autodiscover.sh knows where to save the config.
# We source the script and print the variables we care about # Do NOT set CONFIG_FILE_SET=true — that would cause an error if the file
# doesn't exist yet (it's the file we're about to create).
env_vars = os.environ.copy()
env_vars["CONFIG_FILE"] = str(ENV_FILE)
env_vars["FORCE_DISCOVER"] = "true"
env_vars.pop("CONFIG_FILE_SET", None)
# Run autodiscover interactively so its prompts are shown to the user
script = f""" script = f"""
source '{AUTODISCOVER_SCRIPT}' source '{AUTODISCOVER_SCRIPT}'
detect_interfaces run_autodiscover
detect_local_ip
detect_nodes
echo "CLUSTER_NODES=$NODES_ARG"
echo "LOCAL_IP=$LOCAL_IP"
echo "ETH_IF=$ETH_IF"
echo "IB_IF=$IB_IF"
""" """
result = subprocess.run( result = subprocess.run(["bash", "-c", script], env=env_vars)
["bash", "-c", script],
capture_output=True,
text=True
)
if result.returncode != 0: if result.returncode != 0:
print("Autodiscover output:")
print(result.stdout)
if result.stderr:
print(result.stderr)
print("Error: Autodiscover failed") print("Error: Autodiscover failed")
return None return None
# Print the autodiscover output (excluding the final variable lines) # Read configuration from the .env file that autodiscover.sh wrote
output_lines = result.stdout.strip().split("\n") env = load_env_file()
env = {} if not env.get("CLUSTER_NODES"):
for line in output_lines: print("Autodiscover completed but no CLUSTER_NODES found in .env")
if "=" in line and any(line.startswith(k) for k in ["CLUSTER_NODES=", "LOCAL_IP=", "ETH_IF=", "IB_IF="]):
key, _, value = line.partition("=")
env[key] = value
else:
print(line)
print()
# Interactive node selection
if env.get("CLUSTER_NODES"):
all_nodes = [n.strip() for n in env["CLUSTER_NODES"].split(",") if n.strip()]
local_ip = env.get("LOCAL_IP", "")
if len(all_nodes) > 1:
print("Select which nodes to include in the cluster:")
print()
selected_nodes = []
for node in all_nodes:
is_local = node == local_ip
label = f"{node} (this machine)" if is_local else node
# Default to yes for all nodes
while True:
response = input(f" Include {label}? [Y/n]: ").strip().lower()
if response in ("", "y", "yes"):
selected_nodes.append(node)
break
elif response in ("n", "no"):
break
else:
print(" Please enter 'y' or 'n'")
print()
if not selected_nodes:
print("No nodes selected. Aborting.")
return None return None
if len(selected_nodes) == 1:
print(f"Only one node selected: {selected_nodes[0]}")
print("This will run in solo mode (single node).")
else:
print(f"Selected {len(selected_nodes)} nodes: {', '.join(selected_nodes)}")
env["CLUSTER_NODES"] = ",".join(selected_nodes)
print()
return env return env
@@ -757,18 +679,16 @@ Examples:
# Show current .env configuration # Show current .env configuration
%(prog)s --show-env %(prog)s --show-env
""" """,
) )
parser.add_argument( parser.add_argument(
"recipe", "recipe",
nargs="?", nargs="?",
help="Path to recipe YAML file (or just the name without .yaml)" help="Path to recipe YAML file (or just the name without .yaml)",
) )
parser.add_argument( parser.add_argument(
"--list", "-l", "--list", "-l", action="store_true", help="List available recipes"
action="store_true",
help="List available recipes"
) )
# Setup options # Setup options
@@ -776,87 +696,194 @@ Examples:
setup_group.add_argument( setup_group.add_argument(
"--setup", "--setup",
action="store_true", action="store_true",
help="Full setup: build container (if missing) + download model (if missing) + run" help="Full setup: build container (if missing) + download model (if missing) + run",
) )
setup_group.add_argument( setup_group.add_argument(
"--build-only", "--build-only",
action="store_true", action="store_true",
help="Only build/copy the container image, don't run" help="Only build/copy the container image, don't run",
) )
setup_group.add_argument( setup_group.add_argument(
"--download-only", "--download-only",
action="store_true", action="store_true",
help="Only download/copy the model, don't run" help="Only download/copy the model, don't run",
) )
setup_group.add_argument( setup_group.add_argument(
"--force-build", "--force-build", action="store_true", help="Force rebuild even if image exists"
action="store_true",
help="Force rebuild even if image exists"
) )
setup_group.add_argument( setup_group.add_argument(
"--force-download", "--force-download",
action="store_true", action="store_true",
help="Force re-download even if model exists" help="Force re-download even if model exists",
) )
parser.add_argument( parser.add_argument(
"--dry-run", "--dry-run",
action="store_true", action="store_true",
help="Show what would be executed without running" help="Show what would be executed without running",
) )
# Override options # Override options
override_group = parser.add_argument_group("Recipe overrides") override_group = parser.add_argument_group("Recipe overrides")
override_group.add_argument("--port", type=int, help="Override port") override_group.add_argument("--port", type=int, help="Override port")
override_group.add_argument("--host", help="Override host") override_group.add_argument("--host", help="Override host")
override_group.add_argument("--tensor-parallel", "--tp", type=int, dest="tensor_parallel", help="Override tensor parallelism") override_group.add_argument(
override_group.add_argument("--gpu-memory-utilization", "--gpu-mem", type=float, dest="gpu_memory_utilization", help="Override GPU memory utilization") "--tensor-parallel",
override_group.add_argument("--max-model-len", type=int, dest="max_model_len", help="Override max model length") "--tp",
type=int,
dest="tensor_parallel",
help="Override tensor parallelism",
)
override_group.add_argument(
"--gpu-memory-utilization",
"--gpu-mem",
type=float,
dest="gpu_memory_utilization",
help="Override GPU memory utilization",
)
override_group.add_argument(
"--max-model-len",
type=int,
dest="max_model_len",
help="Override max model length",
)
# Launch options (passed to launch-cluster.sh) # Launch options (passed to launch-cluster.sh)
launch_group = parser.add_argument_group("Launch options (passed to launch-cluster.sh)") launch_group = parser.add_argument_group(
launch_group.add_argument("--solo", action="store_true", help="Run in solo mode (single node, no Ray)") "Launch options (passed to launch-cluster.sh)"
launch_group.add_argument("-n", "--nodes", help="Comma-separated list of node IPs (first is head node)") )
launch_group.add_argument("-d", "--daemon", action="store_true", help="Run in daemon mode") launch_group.add_argument(
launch_group.add_argument("-t", "--container", dest="container_override", help="Override container image from recipe") "--solo", action="store_true", help="Run in solo mode (single node, no Ray)"
launch_group.add_argument("--nccl-debug", choices=["VERSION", "WARN", "INFO", "TRACE"], help="NCCL debug level") )
launch_group.add_argument("-e", "--env", action="append", dest="env_vars", default=[], metavar="VAR=VALUE", help="Environment variable to pass to container (e.g. -e HF_TOKEN=xxx). Can be used multiple times.") launch_group.add_argument(
"-n", "--nodes", help="Comma-separated list of node IPs (first is head node)"
)
launch_group.add_argument(
"-d", "--daemon", action="store_true", help="Run in daemon mode"
)
launch_group.add_argument(
"-t",
"--container",
dest="container_override",
help="Override container image from recipe",
)
launch_group.add_argument(
"--nccl-debug",
choices=["VERSION", "WARN", "INFO", "TRACE"],
help="NCCL debug level",
)
launch_group.add_argument(
"-e",
"--env",
action="append",
dest="env_vars",
default=[],
metavar="VAR=VALUE",
help="Environment variable to pass to container (e.g. -e HF_TOKEN=xxx). Can be used multiple times.",
)
launch_group.add_argument( launch_group.add_argument(
"--no-ray", "--no-ray",
action="store_true", action="store_true",
dest="no_ray", dest="no_ray",
help="No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)" help="No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)",
)
launch_group.add_argument(
"--master-port",
"--head-port",
type=int,
dest="master_port",
help="Port for cluster coordination (Ray head port or PyTorch distributed master port, default: 29501)",
)
launch_group.add_argument(
"--name",
dest="container_name",
help="Override container name (default: vllm_node)",
)
launch_group.add_argument(
"--eth-if",
dest="eth_if",
help="Ethernet interface (overrides .env and auto-detection)",
)
launch_group.add_argument(
"--ib-if",
dest="ib_if",
help="InfiniBand interface (overrides .env and auto-detection)",
)
launch_group.add_argument(
"-j",
dest="build_jobs",
type=int,
metavar="N",
help="Number of parallel build jobs inside container",
)
launch_group.add_argument(
"--no-cache-dirs",
action="store_true",
dest="no_cache_dirs",
help="Do not mount ~/.cache/vllm, ~/.cache/flashinfer, ~/.triton",
)
launch_group.add_argument(
"--non-privileged",
action="store_true",
dest="non_privileged",
help="Run in non-privileged mode (removes --privileged and --ipc=host)",
)
launch_group.add_argument(
"--mem-limit-gb",
type=int,
dest="mem_limit_gb",
help="Memory limit in GB (only with --non-privileged)",
)
launch_group.add_argument(
"--mem-swap-limit-gb",
type=int,
dest="mem_swap_limit_gb",
help="Memory+swap limit in GB (only with --non-privileged)",
)
launch_group.add_argument(
"--pids-limit",
type=int,
dest="pids_limit",
help="Process limit (only with --non-privileged, default: 4096)",
)
launch_group.add_argument(
"--shm-size-gb",
type=int,
dest="shm_size_gb",
help="Shared memory size in GB (only with --non-privileged, default: 64)",
)
# Config file option
parser.add_argument(
"--config",
dest="config_file",
metavar="FILE",
help="Path to .env configuration file (default: .env in script directory)",
) )
launch_group.add_argument("--master-port", "--head-port", type=int, dest="master_port", help="Port for cluster coordination (Ray head port or PyTorch distributed master port, default: 29501)")
launch_group.add_argument("--name", dest="container_name", help="Override container name (default: vllm_node)")
launch_group.add_argument("--eth-if", dest="eth_if", help="Ethernet interface (overrides .env and auto-detection)")
launch_group.add_argument("--ib-if", dest="ib_if", help="InfiniBand interface (overrides .env and auto-detection)")
launch_group.add_argument("-j", dest="build_jobs", type=int, metavar="N", help="Number of parallel build jobs inside container")
launch_group.add_argument("--no-cache-dirs", action="store_true", dest="no_cache_dirs", help="Do not mount ~/.cache/vllm, ~/.cache/flashinfer, ~/.triton")
launch_group.add_argument("--non-privileged", action="store_true", dest="non_privileged", help="Run in non-privileged mode (removes --privileged and --ipc=host)")
launch_group.add_argument("--mem-limit-gb", type=int, dest="mem_limit_gb", help="Memory limit in GB (only with --non-privileged)")
launch_group.add_argument("--mem-swap-limit-gb", type=int, dest="mem_swap_limit_gb", help="Memory+swap limit in GB (only with --non-privileged)")
launch_group.add_argument("--pids-limit", type=int, dest="pids_limit", help="Process limit (only with --non-privileged, default: 4096)")
launch_group.add_argument("--shm-size-gb", type=int, dest="shm_size_gb", help="Shared memory size in GB (only with --non-privileged, default: 64)")
# Cluster discovery options # Cluster discovery options
discover_group = parser.add_argument_group("Cluster discovery") discover_group = parser.add_argument_group("Cluster discovery")
discover_group.add_argument( discover_group.add_argument(
"--discover", "--discover",
action="store_true", action="store_true",
help="Auto-detect cluster nodes and save to .env file" help="Auto-detect cluster nodes and save to .env file",
) )
discover_group.add_argument( discover_group.add_argument(
"--show-env", "--show-env", action="store_true", help="Show current .env configuration"
action="store_true",
help="Show current .env configuration"
) )
# Use parse_known_args to allow extra vLLM arguments after -- # Use parse_known_args to allow extra vLLM arguments after --
args, extra_args = parser.parse_known_args() args, extra_args = parser.parse_known_args()
# Set .env file path (use default if not specified)
global ENV_FILE
if args.config_file:
ENV_FILE = Path(args.config_file).resolve()
else:
ENV_FILE = SCRIPT_DIR / ".env"
# Filter out the -- separator if present # Filter out the -- separator if present
if extra_args and extra_args[0] == '--': if extra_args and extra_args[0] == "--":
extra_args = extra_args[1:] extra_args = extra_args[1:]
# Handle --discover (can be run with or without a recipe) # Handle --discover (can be run with or without a recipe)
@@ -870,8 +897,6 @@ Examples:
print(f" {key}={value}") print(f" {key}={value}")
print() print()
save_env_file(env)
if not args.recipe: if not args.recipe:
return 0 return 0
@@ -935,17 +960,10 @@ Examples:
discovered_env = run_autodiscover() discovered_env = run_autodiscover()
if discovered_env and discovered_env.get("CLUSTER_NODES"): if discovered_env and discovered_env.get("CLUSTER_NODES"):
env = discovered_env # use freshly loaded env from autodiscover
nodes = parse_nodes(discovered_env["CLUSTER_NODES"]) nodes = parse_nodes(discovered_env["CLUSTER_NODES"])
nodes_from_env = True nodes_from_env = True
if nodes:
# Ask if user wants to save to .env
print()
response = input("Save this configuration to .env for future use? [Y/n]: ").strip().lower()
if response in ("", "y", "yes"):
save_env_file(discovered_env)
print()
# Resolve network interfaces: CLI > .env > auto-detect by launch-cluster.sh # Resolve network interfaces: CLI > .env > auto-detect by launch-cluster.sh
eth_if = args.eth_if or None eth_if = args.eth_if or None
ib_if = args.ib_if or None ib_if = args.ib_if or None
@@ -963,8 +981,10 @@ Examples:
solo_only = recipe.get("solo_only", False) solo_only = recipe.get("solo_only", False)
is_solo = args.solo or not is_cluster is_solo = args.solo or not is_cluster
if getattr(args, 'no_ray', False) and is_solo: if getattr(args, "no_ray", False) and is_solo:
print("Error: --no-ray is incompatible with --solo. Solo mode already runs without Ray.") print(
"Error: --no-ray is incompatible with --solo. Solo mode already runs without Ray."
)
return 1 return 1
if cluster_only and is_solo: if cluster_only and is_solo:
@@ -972,7 +992,9 @@ Examples:
print(f"This model is too large to run on a single node.") print(f"This model is too large to run on a single node.")
print() print()
print("Options:") print("Options:")
print(f" 1. Specify nodes directly: {sys.argv[0]} {args.recipe} -n node1,node2") print(
f" 1. Specify nodes directly: {sys.argv[0]} {args.recipe} -n node1,node2"
)
print(f" 2. Auto-discover and save: {sys.argv[0]} --discover") print(f" 2. Auto-discover and save: {sys.argv[0]} --discover")
print(f" Then run: {sys.argv[0]} {args.recipe}") print(f" Then run: {sys.argv[0]} {args.recipe}")
return 1 return 1
@@ -985,8 +1007,17 @@ Examples:
print(f" 2. Remove nodes from .env: {sys.argv[0]} --show-env") print(f" 2. Remove nodes from .env: {sys.argv[0]} --show-env")
return 1 return 1
# Determine copy targets for cluster deployments # Determine copy targets for build/model distribution.
copy_targets = worker_nodes if is_cluster else None # Prefer COPY_HOSTS from .env (may differ from CLUSTER_NODES in mesh mode),
# fall back to worker_nodes derived from CLUSTER_NODES.
if is_cluster:
copy_hosts_str = env.get("COPY_HOSTS")
if copy_hosts_str:
copy_targets = [h.strip() for h in copy_hosts_str.split(",") if h.strip()]
else:
copy_targets = worker_nodes
else:
copy_targets = None
if args.dry_run: if args.dry_run:
print("=== Dry Run ===") print("=== Dry Run ===")
@@ -1007,9 +1038,13 @@ Examples:
print(f" Workers: {', '.join(worker_nodes)}") print(f" Workers: {', '.join(worker_nodes)}")
print(f"Solo mode: {is_solo}") print(f"Solo mode: {is_solo}")
if eth_if: if eth_if:
print(f"Ethernet interface: {eth_if}{' (from .env)' if not args.eth_if else ''}") print(
f"Ethernet interface: {eth_if}{' (from .env)' if not args.eth_if else ''}"
)
if ib_if: if ib_if:
print(f"InfiniBand interface: {ib_if}{' (from .env)' if not args.ib_if else ''}") print(
f"InfiniBand interface: {ib_if}{' (from .env)' if not args.ib_if else ''}"
)
if args.container_name: if args.container_name:
print(f"Container name: {args.container_name}") print(f"Container name: {args.container_name}")
if args.non_privileged: if args.non_privileged:
@@ -1099,7 +1134,7 @@ Examples:
print(f" 2. Build manually: ./build-and-copy.sh -t {container}") print(f" 2. Build manually: ./build-and-copy.sh -t {container}")
print() print()
response = input("Build now? [y/N] ").strip().lower() response = input("Build now? [y/N] ").strip().lower()
if response == 'y': if response == "y":
if not build_image(container, copy_targets, build_args): if not build_image(container, copy_targets, build_args):
print("Error: Failed to build image") print("Error: Failed to build image")
return 1 return 1
@@ -1109,7 +1144,13 @@ Examples:
# Build overrides from CLI args # Build overrides from CLI args
overrides = {} overrides = {}
for key in ["port", "host", "tensor_parallel", "gpu_memory_utilization", "max_model_len"]: for key in [
"port",
"host",
"tensor_parallel",
"gpu_memory_utilization",
"max_model_len",
]:
value = getattr(args, key, None) value = getattr(args, key, None)
if value is not None: if value is not None:
overrides[key] = value overrides[key] = value
@@ -1122,24 +1163,34 @@ Examples:
if extra_args: if extra_args:
# Map vLLM flags to our override keys # Map vLLM flags to our override keys
flag_to_override = { flag_to_override = {
'--port': 'port', "--port": "port",
'--host': 'host', "--host": "host",
'--tensor-parallel-size': 'tensor_parallel', "--tensor-parallel-size": "tensor_parallel",
'-tp': 'tensor_parallel', "-tp": "tensor_parallel",
'--gpu-memory-utilization': 'gpu_memory_utilization', "--gpu-memory-utilization": "gpu_memory_utilization",
'--max-model-len': 'max_model_len', "--max-model-len": "max_model_len",
} }
for i, arg in enumerate(extra_args): for i, arg in enumerate(extra_args):
# Check both exact flag and =value syntax # Check both exact flag and =value syntax
flag = arg.split('=')[0] if '=' in arg else arg flag = arg.split("=")[0] if "=" in arg else arg
if flag in flag_to_override: if flag in flag_to_override:
override_key = flag_to_override[flag] override_key = flag_to_override[flag]
if override_key in overrides: if override_key in overrides:
print(f"Warning: '{arg}' in extra args duplicates --{override_key.replace('_', '-')} override") print(
print(f" vLLM uses last value; extra args appear after template substitution") f"Warning: '{arg}' in extra args duplicates --{override_key.replace('_', '-')} override"
)
print(
f" vLLM uses last value; extra args appear after template substitution"
)
# Generate launch script # Generate launch script
script_content = generate_launch_script(recipe, overrides, is_solo=is_solo, extra_args=extra_args, no_ray=getattr(args, 'no_ray', False)) script_content = generate_launch_script(
recipe,
overrides,
is_solo=is_solo,
extra_args=extra_args,
no_ray=getattr(args, "no_ray", False),
)
if args.dry_run: if args.dry_run:
print("=== Generated Launch Script ===") print("=== Generated Launch Script ===")
@@ -1158,7 +1209,7 @@ Examples:
cmd_parts.append("--solo") cmd_parts.append("--solo")
if args.daemon: if args.daemon:
cmd_parts.append("-d") cmd_parts.append("-d")
if getattr(args, 'no_ray', False): if getattr(args, "no_ray", False):
cmd_parts.append("--no-ray") cmd_parts.append("--no-ray")
if nodes: if nodes:
cmd_parts.extend(["-n", ",".join(nodes)]) cmd_parts.extend(["-n", ",".join(nodes)])
@@ -1188,6 +1239,8 @@ Examples:
cmd_parts.extend(["--pids-limit", str(args.pids_limit)]) cmd_parts.extend(["--pids-limit", str(args.pids_limit)])
if args.shm_size_gb: if args.shm_size_gb:
cmd_parts.extend(["--shm-size-gb", str(args.shm_size_gb)]) cmd_parts.extend(["--shm-size-gb", str(args.shm_size_gb)])
if args.config_file:
cmd_parts.extend(["--config", args.config_file])
cmd_parts.extend(["\\", "\n --launch-script", "/tmp/tmpXXXXXX.sh"]) cmd_parts.extend(["\\", "\n --launch-script", "/tmp/tmpXXXXXX.sh"])
print(" ".join(cmd_parts)) print(" ".join(cmd_parts))
print() print()
@@ -1195,7 +1248,7 @@ Examples:
return 0 return 0
# Write temporary launch script # Write temporary launch script
with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f: with tempfile.NamedTemporaryFile(mode="w", suffix=".sh", delete=False) as f:
f.write(script_content) f.write(script_content)
temp_script = f.name temp_script = f.name
@@ -1222,7 +1275,7 @@ Examples:
if args.daemon: if args.daemon:
cmd.append("-d") cmd.append("-d")
if getattr(args, 'no_ray', False): if getattr(args, "no_ray", False):
cmd.append("--no-ray") cmd.append("--no-ray")
# Pass nodes to launch-cluster.sh (from command line, .env, or autodiscover) # Pass nodes to launch-cluster.sh (from command line, .env, or autodiscover)
@@ -1258,6 +1311,9 @@ Examples:
if args.shm_size_gb: if args.shm_size_gb:
cmd.extend(["--shm-size-gb", str(args.shm_size_gb)]) cmd.extend(["--shm-size-gb", str(args.shm_size_gb)])
if args.config_file:
cmd.extend(["--config", args.config_file])
# Add launch script # Add launch script
cmd.extend(["--launch-script", temp_script]) cmd.extend(["--launch-script", temp_script])