177 Commits

Author SHA1 Message Date
f24d177802 Update README.md
2026-05-11 18:20:33 +00:00
bb0d120177 gitea workflow 2026-05-11 13:16:59 -05:00
eugr
ba9dde963f Fixed 3-node Qwen 397B recipe to prevent OOM and use instanttensor 2026-05-10 22:20:49 -07:00
eugr
ae8ac815ac Adjusted Qwen3.5-397B recipe to fix OOM issue and lower memory requirements 2026-05-09 13:45:15 -07:00
eugr
83a680c87b Fixed OOM for Qwen3.5-397B 2026-05-09 13:25:31 -07:00
Eugene Rakhmatulin
69ea62294f remove unnecessary mod from qwen3-coder-next template 2026-05-08 16:32:54 -07:00
Eugene Rakhmatulin
8e548ce664 Fixed typo 2026-05-08 14:59:13 -07:00
Eugene Rakhmatulin
bca64f9a53 Performance regression fix 2026-05-08 13:40:55 -07:00
Eugene Rakhmatulin
29d5904b80 Fix performance regression 2026-05-08 12:56:28 -07:00
Eugene Rakhmatulin
b87854fd4c Fixed qwen3.6 recipes 2026-05-06 10:56:09 -07:00
Eugene Rakhmatulin
c67c5b5c1e Add chat template and recipe for Qwen3.6-35B-A3B-FP8 model 2026-05-06 10:32:46 -07:00
Eugene Rakhmatulin
9fbed882bc Added EXPERIMENTAL mod for b12x - initial support 2026-04-29 14:38:37 -07:00
Eugene Rakhmatulin
97e51d5d23 fixed gemma4 recipe 2026-04-29 12:56:07 -07:00
Eugene Rakhmatulin
87cb9f6e1e Reverted gemma4 to safetensors. Fixes #214 and #217. 2026-04-29 10:56:40 -07:00
eugr
e3243bf555 Merge pull request #197 from mmonad/minimax-m2.7-awq-recipe
Add recipe for MiniMax-M2.7-AWQ
2026-04-25 19:26:43 -07:00
Eugene Rakhmatulin
43a00ed90f Fixed #205 2026-04-25 18:39:46 -07:00
eugr
ef9b0e50f4 Merge pull request #210 from Kaweees/main
Update gpu-mem-util-gb: patch with new vLLM default value
2026-04-25 10:00:52 -07:00
Miguel Villa Floran
c1e952de2e Update gpu-mem-util-gb: patch with new vLLM default value 2026-04-24 11:40:41 -07:00
Eugene Rakhmatulin
b13a3600d3 Remove a dependency 2026-04-23 07:47:23 -07:00
Eugene Rakhmatulin
7dea11bbf0 More robust handling of PRs 2026-04-22 13:18:12 -07:00
Eugene Rakhmatulin
c187912e23 Removed merged PRs 2026-04-21 09:47:26 -07:00
L.B.R.
caa28c8e12 Add recipe for MiniMax-M2.7-AWQ
Add a vLLM serving recipe for the MiniMax M2.7 model using
the cyankiwi/MiniMax-M2.7-AWQ-4bit quantization. Uses the
same minimax_m2 tool-call and reasoning parsers as the
existing M2 recipe, with Ray distributed backend on 2 GPUs.
2026-04-18 22:44:26 +01:00
Eugene Rakhmatulin
5415c1fe9e Include a PR to fix broken torch bindings (vllm pr 40191) 2026-04-18 09:19:50 -07:00
Eugene Rakhmatulin
d49fac1b8b Re-enable flashinfer_cutlass 2026-04-16 16:40:56 -07:00
Eugene Rakhmatulin
6b7f8dace6 Fixes #187 2026-04-15 22:32:14 -07:00
Eugene Rakhmatulin
76fbf0d0be Fix for broken MiniMax M2 parser 2026-04-15 16:31:50 -07:00
Eugene Rakhmatulin
b7830469be Updated README 2026-04-14 17:23:42 -07:00
Eugene Rakhmatulin
b50fa426c8 Merge pull request #190 2026-04-14 17:18:56 -07:00
Tim Messerschmidt
2c13e1ce25 Add InstantTensor to runtime dependencies 2026-04-14 19:38:36 +02:00
Eugene Rakhmatulin
c026c92bd0 Updated README 2026-04-13 11:27:57 -07:00
Eugene Rakhmatulin
cf4cb35356 added new flashinfer build dependency 2026-04-13 08:47:34 -07:00
Eugene Rakhmatulin
1ad85442ac Added a helper mod for Qwen3.5-397B recipe 2026-04-12 19:14:23 -07:00
Eugene Rakhmatulin
30919581ee Included .gitgnore in wheels 2026-04-11 17:02:39 -07:00
Eugene Rakhmatulin
b7c8616743 Pinned pytorch version 2026-04-11 11:54:46 -07:00
Eugene Rakhmatulin
8e8e850ef1 fix for new requirements structure 2026-04-10 20:14:47 -07:00
Eugene Rakhmatulin
fc08740fba Increased uv timeout 2026-04-10 19:38:38 -07:00
Eugene Rakhmatulin
288da8e911 Mod to fix Gemma4 tool parser 2026-04-04 16:48:07 -07:00
Eugene Rakhmatulin
7bc4e4ce5e Fixes #158 by adding build args to gemma4 recipe 2026-04-04 10:46:06 -07:00
Eugene Rakhmatulin
49d6d9fefd Removed PR2927 as it's been merged 2026-04-03 16:56:00 -07:00
Eugene Rakhmatulin
4afca860a5 Fix broken compilation (PR 38919) 2026-04-03 10:22:10 -07:00
Eugene Rakhmatulin
ed32612cdd A recipe for Gemma4-26B 2026-04-02 23:53:55 -07:00
Eugene Rakhmatulin
44808f7018 Apply vLLM PR 35568 2026-04-02 17:13:54 -07:00
Eugene Rakhmatulin
12caec228e switching gpt-oss-120b to solo only for now 2026-04-01 10:27:50 -07:00
Eugene Rakhmatulin
27eb35f08d Fixed 4x qwen recipe 2026-04-01 10:09:01 -07:00
eugr
3335540972 Merge branch 'pr-152' 2026-04-01 08:59:01 -07:00
eugr
ae25d64ac0 Changed CUTLASS ref for mxfp4 build 2026-04-01 08:58:31 -07:00
Eugene Rakhmatulin
a770865834 Updated PRs to apply 2026-04-01 08:31:34 -07:00
Artyom
7b47235463 Pin nvidia-nvshmem-cu13 to <3.6 in Dockerfile.mxfp4
nvidia-nvshmem-cu13 3.6.5 (released Mar 24) introduced a breaking
change — nvshmemi_device_state_d was removed from NVSHMEM headers,
which breaks FlashInfer AOT compilation of nvshmem_binding.cu.
2026-04-01 07:38:53 +02:00
Eugene Rakhmatulin
3a3ab98b3e Temporarily added PR2897 to Dockerfile 2026-03-31 22:06:08 -07:00
Eugene Rakhmatulin
23fb7dcc20 Merge branch '3-node-autodiscover' 2026-03-31 18:22:23 -07:00
Eugene Rakhmatulin
c4860b86a2 Updated README with 3-node support 2026-03-31 18:19:22 -07:00
Eugene Rakhmatulin
044557943c Bugfixes 2026-03-31 17:49:17 -07:00
Eugene Rakhmatulin
ead749239d Bugfix 2026-03-31 16:57:56 -07:00
Eugene Rakhmatulin
a889fed254 Updated README 2026-03-31 16:54:19 -07:00
Eugene Rakhmatulin
e89104d91b Always rerun discovery when --discover is specified 2026-03-31 16:25:05 -07:00
Eugene Rakhmatulin
15a04ada32 Bug fixes 2026-03-31 16:20:23 -07:00
Eugene Rakhmatulin
a467a7a0bd Updated README for 3-node 2026-03-31 13:47:04 -07:00
Eugene Rakhmatulin
48318380f9 Bugfix 2026-03-31 13:41:35 -07:00
Eugene Rakhmatulin
287d3c72e5 Fix for forced autodiscovery 2026-03-31 13:34:59 -07:00
Eugene Rakhmatulin
9370b2bb34 Don't start the cluster if only --setup/--discover is specified 2026-03-31 13:29:56 -07:00
Eugene Rakhmatulin
bb177383ff Bugfix in autodiscovery dedup 2026-03-31 12:46:15 -07:00
Eugene Rakhmatulin
7f0be29fcc Handle edge case when two sparks have both cables plugged and assigned IPs 2026-03-31 11:59:03 -07:00
Eugene Rakhmatulin
41c0ce2c9a Fixed FI PR 2026-03-30 14:25:42 -07:00
Eugene Rakhmatulin
45494688d1 Updated README, added NVFP4 fix 2026-03-30 11:45:40 -07:00
Eugene Rakhmatulin
a3201f8873 --flashinfer-ref / --apply-flashinfer-pr 2026-03-29 22:40:35 -07:00
Eugene Rakhmatulin
e471ca2436 Don't copy if -c is not specified 2026-03-28 18:12:32 -07:00
Eugene Rakhmatulin
32674c2619 removed temporary patch as it causes more issues. 2026-03-28 17:49:17 -07:00
Eugene Rakhmatulin
47f5f931b5 Allow to specify config file when doing setup 2026-03-28 14:55:31 -07:00
Eugene Rakhmatulin
d37217bad0 moved PR patch before the requirements patching 2026-03-28 09:22:19 -07:00
Eugene Rakhmatulin
e70c87b4f6 Added PR38423 (temp) 2026-03-28 08:50:54 -07:00
Eugene Rakhmatulin
c1a6cec074 Updated documentation; default image tags in build script 2026-03-27 16:41:09 -07:00
Eugene Rakhmatulin
51d69c5c17 commenting out non-applicable PRs 2026-03-27 16:15:54 -07:00
Eugene Rakhmatulin
e7f2ee692f Added temporary patch to apply PR38126 that fixes broken NVFP4 quants 2026-03-27 09:30:26 -07:00
Eugene Rakhmatulin
101ae6fd56 Merge branch 'main' into 3-node-autodiscover 2026-03-27 09:02:10 -07:00
Eugene Rakhmatulin
f4ca15ce18 Made autoround mod optional to support latest version of vLLM. Fixes #144. 2026-03-27 09:00:50 -07:00
Eugene Rakhmatulin
3d918e0b82 Merge branch '3-node' into 3-node-autodiscover 2026-03-27 07:51:08 -07:00
eugr
47a896d722 Removed expert-parallel from 3x-node Qwen 2026-03-26 22:44:48 -07:00
Eugene Rakhmatulin
0fa585f909 Fix typo in pipeline_parallel setting in Qwen3.5-397B-INT4-Autoround recipe 2026-03-26 18:43:17 -07:00
Eugene Rakhmatulin
cecec74828 Add recipe for Qwen3.5-397B-INT4-Autoround in pipeline-parallel mode 2026-03-26 18:41:57 -07:00
Eugene Rakhmatulin
c8ee2a2511 Perform node count check in any mode 2026-03-26 18:15:09 -07:00
Eugene Rakhmatulin
ce293b5f05 Additional checks for parallelism and cluster size 2026-03-26 17:52:47 -07:00
Eugene Rakhmatulin
f872cc17a8 Fix for --setup behavior 2026-03-26 16:49:09 -07:00
Eugene Rakhmatulin
00c16746e5 Handle new copy hosts setup in run-recipe.py 2026-03-26 16:45:35 -07:00
Eugene Rakhmatulin
f163ca69de Autodiscover tweaks 2026-03-26 16:30:05 -07:00
Eugene Rakhmatulin
a78e221de3 Autodiscovery refactoring with mesh support 2026-03-26 15:47:41 -07:00
Eugene Rakhmatulin
e6ee108cdf Temporary patch for NVFP4 2026-03-26 11:43:44 -07:00
Eugene Rakhmatulin
174de6f0a8 temporary patch for PR38126 2026-03-26 08:58:04 -07:00
Eugene Rakhmatulin
83a74bccec Removed extra solo mode check 2026-03-26 07:45:23 -07:00
Eugene Rakhmatulin
ff18a9ad5b Merge branch '3-node' of gitlab.home.eugr.net:ai/spark-vllm into 3-node 2026-03-25 23:38:44 -07:00
Eugene Rakhmatulin
c08b34a218 add --config passthrough to run-recipe 2026-03-25 23:35:52 -07:00
Eugene Rakhmatulin
23cca2a11a Merge branch '3-node' of gitlab.home.eugr.net:ai/spark-vllm into 3-node 2026-03-25 23:17:25 -07:00
Eugene Rakhmatulin
c2fe579ccc Enhance .env file handling and validation in scripts 2026-03-25 23:16:56 -07:00
Eugene Rakhmatulin
8b7c02aa25 add .env support to build-and-copy.sh 2026-03-25 22:47:02 -07:00
Eugene Rakhmatulin
73fec1bdf8 bugfix 2026-03-25 15:40:09 -07:00
Eugene Rakhmatulin
2f5ff0211e Cleanup in build script 2026-03-25 15:39:23 -07:00
Eugene Rakhmatulin
63ee72e729 Merge branch '3-node' of gitlab.home.eugr.net:ai/spark-vllm into 3-node 2026-03-25 15:36:31 -07:00
Eugene Rakhmatulin
4a0feea6c3 Added --cleanup option to build script 2026-03-25 15:35:32 -07:00
Eugene Rakhmatulin
429042b7dc Revert "Added --cleanup option"
This reverts commit b8930b05a1.
2026-03-25 15:35:15 -07:00
Eugene Rakhmatulin
ef95336937 Merge branch '3-node' of gitlab.home.eugr.net:ai/spark-vllm into 3-node 2026-03-25 15:25:19 -07:00
Eugene Rakhmatulin
b8930b05a1 Added --cleanup option 2026-03-25 15:24:59 -07:00
Eugene Rakhmatulin
49d505ad14 Merge branch '3-node' of gitlab.home.eugr.net:ai/spark-vllm into 3-node 2026-03-25 15:16:47 -07:00
Eugene Rakhmatulin
1755dfd114 Added LOCAL_IP support 2026-03-25 15:16:06 -07:00
Eugene Rakhmatulin
3d4dc4c82e Merge branch '3-node' of gitlab.home.eugr.net:ai/spark-vllm into 3-node 2026-03-25 14:42:37 -07:00
Eugene Rakhmatulin
07fac71dac Fixed bug with CONTAINER_NAME variable 2026-03-25 14:42:01 -07:00
Eugene Rakhmatulin
1702f47df6 Merge branch '3-node' of gitlab.home.eugr.net:ai/spark-vllm into 3-node 2026-03-25 14:18:32 -07:00
Eugene Rakhmatulin
ad2cd3373f .env configuration support for launch-cluster.sh 2026-03-25 14:18:00 -07:00
Eugene Rakhmatulin
1fd8c7afc3 Merge branch 'main' into 3-node 2026-03-25 12:45:40 -07:00
Eugene Rakhmatulin
3dcd2a90c1 Updated Nemotron-3-Super recipe 2026-03-25 12:44:44 -07:00
Eugene Rakhmatulin
efacbd69f2 Updated Nemotron3-Super recipe 2026-03-25 12:43:12 -07:00
Eugene Rakhmatulin
c4b078b868 Merge branch 'main' into 3-node 2026-03-24 22:21:25 -07:00
Eugene Rakhmatulin
3be2fb24a8 Merge pull request #122 2026-03-24 22:18:52 -07:00
Eugene Rakhmatulin
7fa69187df metadata changes 2026-03-24 22:18:07 -07:00
Drew Botwinick
8298c3d7f8 Merge remote-tracking branch 'upstream/main'
# Conflicts:
#	Dockerfile
2026-03-24 15:41:09 -05:00
Eugene Rakhmatulin
f8c2653fd3 Quick fix for NCCL dependency 2026-03-23 23:20:59 -07:00
Eugene Rakhmatulin
990a7b3837 Use mesh-optimized NCCL 2026-03-23 15:43:18 -07:00
Eugene Rakhmatulin
9e089acf2b Updated Nemotron recipes to use VLLM CUTLASS 2026-03-22 23:03:24 -07:00
Eugene Rakhmatulin
2d749742e4 Changed base image back to base CUDA development one 2026-03-21 18:11:20 -07:00
Eugene Rakhmatulin
7a54657abf Revert "cuda 13.2 torch"
This reverts commit 926dd57a87.
2026-03-21 15:36:17 -07:00
Eugene Rakhmatulin
926dd57a87 cuda 13.2 torch 2026-03-21 15:15:01 -07:00
Eugene Rakhmatulin
6e8d85c914 cleanup 2026-03-21 15:12:12 -07:00
Drew Botwinick
d6e76f8e2f add build metadata generation and include in Dockerfiles 2026-03-21 16:10:04 -05:00
Eugene Rakhmatulin
8385506c5e Fixes 2026-03-20 23:51:21 -07:00
Eugene Rakhmatulin
8caebe3155 Reverting back to CUDA image + pytorch from wheels 2026-03-20 17:03:18 -07:00
Eugene Rakhmatulin
919a881cb1 Merge branch 'main' of gitlab.home.eugr.net:ai/spark-vllm 2026-03-18 22:03:25 -07:00
Eugene Rakhmatulin
8ddc259619 Fixed #111 2026-03-18 22:03:04 -07:00
eugr
22f3fa6c21 Merge pull request #103 from apairmont/network_arg
Add docker --network arg to common build flags
2026-03-18 21:48:48 -07:00
Eugene Rakhmatulin
15d295887c Updated README to reflect --master-port parameter 2026-03-18 21:23:28 -07:00
Eugene Rakhmatulin
7e4150feed Added master-port argument 2026-03-18 16:57:55 -07:00
eugr
7b752c31c5 Merge pull request #110 from voloszad/patch-1
Remove run-cluster-node.sh script copy and permission commands from Dockerfile.mxfp4
2026-03-18 14:54:11 -07:00
Andrej V.
bdd2b10f54 Remove script copy and permission commands from Dockerfile
Removed script copying and permission setting for run-cluster-node.sh.
2026-03-18 21:57:56 +01:00
Eugene Rakhmatulin
2755b62d12 Fixes #108 2026-03-18 13:26:39 -07:00
Eugene Rakhmatulin
f327b92abe Fixes #106 and #108 2026-03-18 13:06:44 -07:00
Eugene Rakhmatulin
57b458570e Added experimental Qwen3.5-397B support for dual Spark configuration 2026-03-17 19:05:36 -07:00
Eugene Rakhmatulin
57ed099465 Updated README file to reflect new launch-cluster options. 2026-03-17 16:16:04 -07:00
Eugene Rakhmatulin
fb0687cd1b Updated README to describe no-ray mode 2026-03-17 15:27:22 -07:00
Eugene Rakhmatulin
ccea2ba861 Bugfixes 2026-03-17 13:54:42 -07:00
Eugene Rakhmatulin
957605498c Added extra passthrough variables to run-recipe 2026-03-17 13:41:40 -07:00
Eugene Rakhmatulin
b1eeefc0eb Changed Nemotron-3-Nano-NVFP4 to Marlin backend 2026-03-17 13:10:48 -07:00
Alan Pairmont
b879b7748f add network arg to common build flags 2026-03-16 12:09:59 -04:00
Eugene Rakhmatulin
fa645f3e4b bugfixes 2026-03-13 13:39:30 -07:00
Eugene Rakhmatulin
dedbd0a01d bugfixes 2026-03-13 12:41:48 -07:00
Eugene Rakhmatulin
caa83d9e5b Bugfixes 2026-03-13 12:32:43 -07:00
Eugene Rakhmatulin
4bcbbaa25a Bugfixes 2026-03-13 12:23:41 -07:00
Eugene Rakhmatulin
d08266a123 Bugfixes 2026-03-13 12:18:22 -07:00
Eugene Rakhmatulin
03b055d7f0 Major cluster orchestration refactoring to support running without Ray 2026-03-13 11:55:18 -07:00
Eugene Rakhmatulin
d609fecef3 Merge branch 'main' of github.com:eugr/spark-vllm-docker 2026-03-12 15:04:41 -07:00
eugr
7c198b1ceb Merge pull request #90 from sonusflow/pr/qwen35-397b-tp4
Add Qwen3.5-397B INT4-AutoRound TP=4 recipe (37 tok/s)
2026-03-12 15:04:23 -07:00
Eugene Rakhmatulin
8ae51192e5 Experimental mod to support gpu-memory-utilization-gb 2026-03-12 13:37:44 -07:00
Eugene Rakhmatulin
8fec9bed06 Updated Nemotron to support dual sparks 2026-03-12 13:30:15 -07:00
Eugene Rakhmatulin
6a323cc6f5 Merge pull request #93 2026-03-12 13:00:13 -07:00
Eugene Rakhmatulin
6f9a2f981c Adjusted model parameters 2026-03-12 12:59:05 -07:00
remi
122edc8229 super nemotron mod & recipe for nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 2026-03-11 20:53:44 +01:00
Eugene Rakhmatulin
7ceea85647 Fixed qwen3-coder-next-int4-autoround to exclude Ray 2026-03-11 11:20:56 -07:00
Eugene Rakhmatulin
45066e2b16 Updated README 2026-03-11 09:57:34 -07:00
Eugene Rakhmatulin
f2cf11b047 Added a recipe for qwen3-coder-next-int4-autoround 2026-03-11 09:23:23 -07:00
sonusflow
3baca14eb1 Move recipe to 4x-spark-cluster/ and add UMA memory optimizations
- Move qwen3.5-397b-int4-autoround.yaml to recipes/4x-spark-cluster/
  per maintainer request (multi-node recipes in separate directory)
- Add PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to recipe env
- Optimize Ray for GB10 UMA (128GB shared CPU/GPU memory):
  - Disable Ray dashboard (saves ~1.2 GiB per node)
  - Limit Ray object store to 1 GiB (default 30% of RAM = 33 GiB)
  - Disable pre-started idle workers (saves ~8 GiB on head node)
  - Set --num-cpus 2 and --disable-usage-stats on all nodes
- Net effect: ~40+ GiB freed across 4-node cluster for model/KV cache
2026-03-11 07:29:45 +00:00
Eugene Rakhmatulin
66b5c85907 Merge branch 'main' of github.com:eugr/spark-vllm-docker 2026-03-10 10:29:10 -07:00
eugr
0019bdf5ed Merge pull request #85 from saladinomario/feat/recipe-env-passthrough
Add -e/--env passthrough to run-recipe.py
2026-03-10 10:28:29 -07:00
sonusflow
006734910c Add Qwen3.5-397B INT4-AutoRound TP=4 recipe and Marlin fix
Production-tested recipe for running Qwen3.5-397B-A17B with INT4 AutoRound
quantization across 4 DGX Spark nodes using tensor parallelism.

Performance (4× DGX Spark, driver 580.126.09):
- Single user: 37 tok/s
- 4 concurrent: ~26 tok/s per user, ~103 tok/s aggregate

The Marlin TP fix resolves the MIN_THREAD_N=64 constraint that breaks
in_proj_ba layers at TP=4 (output_size=128/4=32 < 64). Solution:
ReplicatedLinear for B/A projections, applied via diff patches.

Key config:
- VLLM_MARLIN_USE_ATOMIC_ADD=1 (required for Marlin correctness)
- KV cache FP8, prefix caching enabled
- gpu_memory_utilization 0.78 (UMA safe margin)
- CUDAGraphs enabled (default, requires driver 580.x)

Note: Driver 590.x has CUDAGraph capture deadlock on GB10 unified memory.
Stay on driver 580.126.09.
2026-03-09 21:30:28 +00:00
Eugene Rakhmatulin
e225c709fb Revert "fix: add temporary patch for CUDA graphs estimation" as it has been merged to main
This reverts commit 63b2a8dbed.
2026-03-09 09:46:50 -07:00
Eugene Rakhmatulin
63b2a8dbed fix: add temporary patch for CUDA graphs estimation 2026-03-08 22:43:41 -07:00
eugr
9724619dbd Merge pull request #87 from SeraphimSerapis/fix_wheels_download
fix: skip empty lines in wheel download read loop
2026-03-07 09:34:31 -08:00
Eugene Rakhmatulin
d42c4199fa Unsloth chat template for qwen3.5 2026-03-06 23:35:18 -08:00
Tim Messerschmidt
b9fc32ec34 fix: skip empty lines in wheel download read loop
Add a guard to skip empty lines (e.g. trailing newlines) in the
while-read loop to prevent try_download_wheels from breaking on
unexpected blank input.
2026-03-07 05:06:12 +01:00
Eugene Rakhmatulin
9dc09bd04b Renamed recipe for qwen3.5-35b-a3b-fp8 to match others 2026-03-06 13:56:06 -08:00
eugr
e88426646b Merge pull request #76 from mmonad/fix-exec-arg-quoting
Fix shell quoting for exec command arguments
2026-03-06 13:45:53 -08:00
mariosaladino
f95beba566 Add -e/--env passthrough to run-recipe.py
Fixes #81. Allows passing environment variables (e.g. HF_TOKEN)
through to the container when launching via recipes, mirroring
the existing -e flag in launch-cluster.sh.

Usage: ./run-recipe.sh glm-4.7-flash-awq --solo -e HF_TOKEN=$HF_TOKEN
2026-03-06 21:50:29 +01:00
Olivier Paroz
eb8abcca7f Prevent 169.254.x.x fallback when setting fix IP address (#84)
* Prevent 169.254.x.x fallback when setting fix IP address

To force the use of the IP we've chosen to be assigned to the interface, it's safer to disable the fallback to avoid problems down the line

* Prevent 169.254.x.x fallback when setting fix IP address

To force the use of the static IP address we've chosen to be assigned to the interface, it's safer to disable the fallback to avoid problems down the line
2026-03-06 11:47:47 -08:00
eugr
d148d95a19 Merge pull request #80 from oliverjohnwilson/recipe-add_minimax-m2.5_qwen3.5-397b-a17B-fp8
added minimax-m2.5 and qwen3.5-397b-a17B-fp8 recipes to a recipes/4x-spark-cluster/ subdirectory
2026-03-06 11:46:37 -08:00
Eugene Rakhmatulin
5346372f14 More robust wheels check before download 2026-03-05 17:06:57 -08:00
Eugene Rakhmatulin
5f8f988d91 Merge branch 'main' of github.com:eugr/spark-vllm-docker 2026-03-05 16:29:00 -08:00
eugr
3fabd3fb1c Merge pull request #72 from erikvullings/main
Add Qwen35-35B-A3B recipe in FP8 format
2026-03-05 16:27:50 -08:00
Eugene Rakhmatulin
2d03bc138d saving flashinfer and vllm commits in wheels directories 2026-03-05 14:41:25 -08:00
oliverjohnwilson
4303f8b6d0 added minimax-m2.5 and qwen3.5-397b-a17B-fp8 recipes to a recipes/4x-spark-cluster/ subdirectory 2026-03-04 16:01:37 -06:00
L.B.R.
50b3ca60f3 Fix shell quoting for exec command arguments
Arguments with special characters (e.g. JSON strings) were passed
unquoted, causing breakage for commands like:
  --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":2}'

Use printf %q in launch-cluster.sh and shlex.quote() in run-recipe.py
to properly escape arguments.
2026-03-04 15:22:42 +00:00
Erik Vullings
163f23d85b Update qwen35-35b-a3b-fp8.yaml
--max_num_batched_tokens is a default variable now, which can be overriden via the CLI
2026-03-03 12:46:12 +01:00
Erik Vullings
e8f94d6b8b Add Qwen35-35B-A3B recipe in FP8 format 2026-02-27 17:46:06 +01:00
51 changed files with 4111 additions and 939 deletions

.env.example (new file)
@@ -0,0 +1,38 @@
# Example .env configuration file for spark-vllm-docker
# Copy this file to .env and customize for your environment
# Cluster configuration
# CLUSTER_NODES: Comma-separated list of node IPs (first node is the head node)
CLUSTER_NODES="192.168.177.11,192.168.177.12"
# ETH_IF: Ethernet interface name (optional, auto-detected if not specified)
ETH_IF="enp1s0f1np1"
# IB_IF: InfiniBand interface name (optional, auto-detected if not specified)
IB_IF="rocep1s0f1,roceP2p1s0f1"
# LOCAL_IP: Local IP address (optional, auto-detected if not specified)
# Useful for solo mode or overriding auto-detection
LOCAL_IP="192.168.177.11"
# MASTER_PORT: Port for cluster coordination (default: 29501)
MASTER_PORT="29501"
# CONTAINER_NAME: Container name (default: vllm_node)
# Note: This is a configuration variable, NOT passed as env var to container
CONTAINER_NAME="vllm_node"
# Container environment variables
# Any variable starting with CONTAINER_ (except CONTAINER_NAME) will be converted to -e flags
# Example: CONTAINER_NCCL_DEBUG=INFO becomes -e NCCL_DEBUG=INFO
CONTAINER_NCCL_DEBUG="INFO"
CONTAINER_HF_TOKEN="your_huggingface_token_here"
CONTAINER_NCCL_IGNORE_CPU_AFFINITY="1"
# COPY_HOSTS: Comma-separated list of hosts for build-and-copy.sh (optional)
# Used by build-and-copy.sh to distribute images across cluster
COPY_HOSTS="192.168.177.12"
# Additional container environment variables
# CONTAINER_MAX_JOBS="16"
# CONTAINER_CUDA_VISIBLE_DEVICES="0,1"

Gitea workflow (new file)
@@ -0,0 +1,53 @@
name: Build and Push spark-vllm

on:
  push:
    branches:
      - main
  workflow_dispatch:

env:
  IMAGE_NAME: spark-vllm
  IMAGE_TAG: latest

jobs:
  docker:
    runs-on: nix
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Login to Registry
        run: |
          echo "${{ secrets.REGISTRY_PASSWORD }}" | docker login \
            ${{ secrets.REGISTRY_HOST }} \
            -u "${{ secrets.REGISTRY_USERNAME }}" \
            --password-stdin

      - name: Make build script executable
        run: chmod +x build-and-copy.sh

      - name: Build image using upstream script
        run: |
          ./build-and-copy.sh -t ${IMAGE_NAME}:${IMAGE_TAG}

      - name: Tag image
        run: |
          docker tag \
            ${IMAGE_NAME}:${IMAGE_TAG} \
            ${{ secrets.REGISTRY_HOST }}/${IMAGE_NAME}:${IMAGE_TAG}
          docker tag \
            ${IMAGE_NAME}:${IMAGE_TAG} \
            ${{ secrets.REGISTRY_HOST }}/${IMAGE_NAME}:${GITEA_SHA::7}

      - name: Push latest
        run: |
          docker push \
            ${{ secrets.REGISTRY_HOST }}/${IMAGE_NAME}:${IMAGE_TAG}

      - name: Push commit SHA
        run: |
          docker push \
            ${{ secrets.REGISTRY_HOST }}/${IMAGE_NAME}:${GITEA_SHA::7}

.gitignore
@@ -1 +1,2 @@
.env
.env
build-metadata.yaml

Dockerfile
@@ -4,9 +4,9 @@
ARG BUILD_JOBS=16
# =========================================================
# STAGE 1: Base Image (Installs Dependencies)
# STAGE 1: Base Build Image
# =========================================================
FROM nvcr.io/nvidia/pytorch:26.01-py3 AS base
FROM nvidia/cuda:13.2.0-devel-ubuntu24.04 AS base
# Build parallelism
ARG BUILD_JOBS
@@ -14,6 +14,8 @@ ENV MAX_JOBS=${BUILD_JOBS}
ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
ENV NINJAFLAGS="-j${BUILD_JOBS}"
ENV MAKEFLAGS="-j${BUILD_JOBS}"
ENV DG_JIT_USE_NVRTC=1
ENV USE_CUDNN=1
# Set non-interactive frontend to prevent apt prompts
ENV DEBIAN_FRONTEND=noninteractive
@@ -27,6 +29,9 @@ ENV UV_CACHE_DIR=/root/.cache/uv
ENV UV_SYSTEM_PYTHON=1
ENV UV_BREAK_SYSTEM_PACKAGES=1
ENV UV_LINK_MODE=copy
# Set timeouts
ENV UV_HTTP_TIMEOUT=600
ENV UV_HTTP_RETRIES=10
# Set the base directory environment variable
ENV VLLM_BASE_DIR=/workspace/vllm
@@ -35,10 +40,18 @@ ENV VLLM_BASE_DIR=/workspace/vllm
# Added ccache to enable incremental compilation caching
RUN apt update && \
apt install -y --no-install-recommends \
curl vim ninja-build git \
ccache \
curl vim cmake build-essential ninja-build \
libcudnn9-cuda-13 libcudnn9-dev-cuda-13 \
python3-dev python3-pip git wget \
libibverbs1 libibverbs-dev rdma-core \
ccache devscripts debhelper fakeroot \
&& rm -rf /var/lib/apt/lists/* \
&& pip install uv && pip uninstall -y flash-attn
&& pip install uv
# Additional deps
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv pip install torch==2.11.0 torchvision torchaudio triton --index-url https://download.pytorch.org/whl/cu130 && \
uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" filelock pynvml requests tqdm
# Configure Ccache for CUDA/C++
ENV PATH=/usr/lib/ccache:$PATH
@@ -51,14 +64,19 @@ ENV CCACHE_COMPRESS=1
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache
# Setup Workspace
WORKDIR $VLLM_BASE_DIR
# 2. Set Environment Variables
ARG TORCH_CUDA_ARCH_LIST="12.1a"
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
# Setup Workspace
WORKDIR $VLLM_BASE_DIR
# Build NCCL with mesh support (TODO: only do it if arch is 12.1) - artifacts will be in /workspace/nccl/build/pkg/deb
RUN git clone -b dgxspark-3node-ring https://github.com/zyang-dev/nccl.git && \
cd nccl && make -j ${BUILD_JOBS} src.build NVCC_GENCODE="-gencode=arch=compute_121,code=sm_121" && \
make pkg.debian.build && apt install -y --no-install-recommends --allow-downgrades ./build/pkg/deb/*.deb
# =========================================================
# STAGE 2: FlashInfer Builder
# =========================================================
@@ -73,8 +91,9 @@ ARG FLASHINFER_REF=main
# Change this argument to force a re-download of FlashInfer
ARG CACHEBUST_FLASHINFER=1
# Additional deps
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
uv pip install packaging
# Smart Git Clone (Fetch changes instead of full re-clone)
RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
@@ -100,6 +119,31 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
WORKDIR /workspace/flashinfer
ARG FLASHINFER_PRS=""
RUN if [ -n "$FLASHINFER_PRS" ]; then \
# Git requires a user identity to create merge commits
git config --global user.email "builder@example.com"; \
git config --global user.name "Docker Builder"; \
\
echo "Applying PRs: $FLASHINFER_PRS"; \
for pr in $FLASHINFER_PRS; do \
echo "Fetching and merging PR #$pr..."; \
git fetch origin pull/${pr}/head:pr-${pr}; \
git merge pr-${pr} --no-edit; \
done; \
fi
# TEMPORARY patch for flashinfer autotune and other improvements (PR 2927) - MERGED 4/3
# RUN curl -fsL https://github.com/flashinfer-ai/flashinfer/pull/2927.diff -o pr2927.diff \
# && if git apply --reverse --check pr2927.diff 2>/dev/null; then \
# echo "PR #2927 already applied, skipping."; \
# else \
# echo "Applying FI PR #2927..."; \
# git apply -v pr2927.diff; \
# fi \
# && rm pr2927.diff
# Apply patch to avoid re-downloading existing cubins
COPY flashinfer_cache.patch .
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
@@ -113,7 +157,9 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
# flashinfer-jit-cache
cd ../flashinfer-jit-cache && \
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
# dump git ref in the wheels dir
cd .. && git rev-parse HEAD > /workspace/wheels/.flashinfer-commit
# =========================================================
# STAGE 3: FlashInfer Wheel Export
@@ -130,9 +176,6 @@ ARG TORCH_CUDA_ARCH_LIST="12.1a"
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
WORKDIR $VLLM_BASE_DIR
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
# --- VLLM SOURCE CACHE BUSTER ---
ARG CACHEBUST_VLLM=1
@@ -166,20 +209,56 @@ WORKDIR $VLLM_BASE_DIR/vllm
ARG VLLM_PRS=""
RUN if [ -n "$VLLM_PRS" ]; then \
# Git requires a user identity to create merge commits
git config --global user.email "builder@example.com"; \
git config --global user.name "Docker Builder"; \
\
echo "Applying PRs: $VLLM_PRS"; \
for pr in $VLLM_PRS; do \
echo "Fetching and applying PR #$pr..."; \
curl -fL "https://github.com/vllm-project/vllm/pull/${pr}.diff" | git apply -v; \
echo "Fetching and merging PR #$pr..."; \
git fetch origin pull/${pr}/head:pr-${pr}; \
git merge pr-${pr} --no-edit; \
done; \
fi
# TEMPORARY PATCH for broken FP8 kernels - https://github.com/vllm-project/vllm/pull/35568
RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/35568.diff -o pr35568.diff \
&& if git apply --reverse --check pr35568.diff 2>/dev/null; then \
echo "PR 35568 already applied, skipping."; \
else \
echo "Applying PR 35568..."; \
git apply -v --exclude="tests/*" pr35568.diff; \
fi \
&& rm pr35568.diff
# TEMPORARY PATCH: revert vLLM PR #41524 / commit c51df430,
# which disables FlashInfer autotune and regresses DGX Spark throughput.
RUN set -eux; \
patch_commit="c51df43005726a09c6eb7348e8c1b00501c70a8e"; \
target="vllm/config/vllm.py"; \
marker="https://github.com/flashinfer-ai/flashinfer/issues/3197"; \
if grep -q "$marker" "$target"; then \
echo "PR #41524 regression found; reverting ${patch_commit}"; \
if ! git revert --no-commit "$patch_commit"; then \
git revert --abort 2>/dev/null || true; \
echo "ERROR: PR #41524 appears present but could not be reverted"; \
exit 1; \
fi; \
if grep -q "$marker" "$target"; then \
echo "ERROR: revert completed but PR #41524 marker is still present"; \
exit 1; \
fi; \
else \
echo "PR #41524 regression marker not present; skipping revert"; \
fi
# Prepare build requirements
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
python3 use_existing_torch.py && \
sed -i "/flashinfer/d" requirements/cuda.txt && \
sed -i '/^triton\b/d' requirements/test.txt && \
sed -i '/^fastsafetensors\b/d' requirements/test.txt && \
uv pip install -r requirements/build.txt
sed -i '/^triton\b/d' requirements/test/cuda.txt && \
sed -i '/^fastsafetensors\b/d' requirements/test/cuda.txt && \
uv pip install -r requirements/build/cuda.txt
# Apply Patches
# TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
@@ -190,13 +269,15 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
# patch -p1 < fastsafetensors.patch; \
# fi
# TEMPORARY PATCH for broken vLLM build (unguarded Hopper code) - reverting PR #34758 and #34302
RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34758.diff | patch -p1 -R || echo "Cannot revert PR #34758, skipping"
RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34302.diff | patch -p1 -R || echo "Cannot revert PR #34302, skipping"
# RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34758.diff | patch -p1 -R || echo "Cannot revert PR #34758, skipping"
# RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34302.diff | patch -p1 -R || echo "Cannot revert PR #34302, skipping"
# Final Compilation
RUN --mount=type=cache,id=ccache,target=/root/.ccache \
--mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
# dump git ref in the wheels dir
git rev-parse HEAD > /workspace/wheels/.vllm-commit
# =========================================================
# STAGE 5: vLLM Wheel Export
@@ -207,7 +288,7 @@ COPY --from=vllm-builder /workspace/wheels /
# =========================================================
# STAGE 6: Runner (Installs wheels from host ./wheels/)
# =========================================================
FROM nvcr.io/nvidia/pytorch:26.01-py3 AS runner
FROM nvidia/cuda:13.2.0-devel-ubuntu24.04 AS runner
# Transferring build settings from build image because of ptxas/jit compilation during vLLM startup
# Build parallelism
@@ -216,6 +297,8 @@ ENV MAX_JOBS=${BUILD_JOBS}
ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
ENV NINJAFLAGS="-j${BUILD_JOBS}"
ENV MAKEFLAGS="-j${BUILD_JOBS}"
ENV DG_JIT_USE_NVRTC=1
ENV USE_CUDNN=1
ENV DEBIAN_FRONTEND=noninteractive
ENV PIP_BREAK_SYSTEM_PACKAGES=1
@@ -228,13 +311,18 @@ ENV UV_SYSTEM_PYTHON=1
ENV UV_BREAK_SYSTEM_PACKAGES=1
ENV UV_LINK_MODE=copy
# Mount additional packages from base builder image
# Install runtime dependencies
RUN apt update && \
RUN --mount=type=bind,from=base,source=/workspace/vllm/nccl/build/pkg/deb,target=/workspace/nccl-pkg \
apt update && \
apt install -y --no-install-recommends \
curl vim git \
python3 python3-pip python3-dev vim curl git wget \
libcudnn9-cuda-13 \
libibverbs1 libibverbs-dev rdma-core \
libxcb1 \
&& cd /workspace/nccl-pkg && apt install -y --no-install-recommends --allow-downgrades ./*.deb \
&& rm -rf /var/lib/apt/lists/* \
&& pip install uv && pip uninstall -y flash-attn # triton-kernels pytorch-triton
&& pip install uv
# Set final working directory
WORKDIR $VLLM_BASE_DIR
@@ -246,6 +334,11 @@ RUN mkdir -p tiktoken_encodings && \
ARG PRE_TRANSFORMERS=0
# Install deps
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv pip install torch==2.11.0 torchvision torchaudio triton --index-url https://download.pytorch.org/whl/cu130 && \
uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
# Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
# With --tf5: override vLLM's transformers<5 constraint to get transformers>=5
RUN --mount=type=bind,source=wheels,target=/workspace/wheels \
@@ -266,27 +359,14 @@ ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
ENV PATH=$VLLM_BASE_DIR:$PATH
# Copy scripts
COPY run-cluster-node.sh $VLLM_BASE_DIR/
RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh
# Final extra deps
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv pip install ray[default] fastsafetensors nvidia-nvshmem-cu13
uv pip install ray[default] fastsafetensors instanttensor
# Cleanup
# Keeping it here for reference - this won't work as is without squashing layers
# RUN uv pip uninstall absl-py apex argon2-cffi \
# argon2-cffi-bindings arrow asttokens astunparse async-lru audioread babel beautifulsoup4 \
# black bleach comm contourpy cycler datasets debugpy decorator defusedxml dllist dm-tree \
# execnet executing expecttest fastjsonschema fonttools fqdn gast hypothesis \
# ipykernel ipython ipython_pygments_lexers isoduration isort jedi joblib jupyter-events \
# jupyter-lsp jupyter_client jupyter_core jupyter_server jupyter_server_terminals jupyterlab \
# jupyterlab_code_formatter jupyterlab_code_formatter jupyterlab_pygments jupyterlab_server \
# jupyterlab_tensorboard_pro jupytext kiwisolver matplotlib matplotlib-inline matplotlib-inline \
# mistune ml_dtypes mock nbclient nbconvert nbformat nest-asyncio notebook notebook_shim \
# opt_einsum optree outlines_core overrides pandas pandocfilters parso pexpect polygraphy pooch \
# pyarrow pycocotools pytest-flakefinder pytest-rerunfailures pytest-shard pytest-xdist \
# scikit-learn scipy Send2Trash soundfile soupsieve soxr spin stack-data \
# wcwidth webcolors xdoctest Werkzeug
# Fix NCCL
RUN rm /usr/local/lib/python3.12/dist-packages/nvidia/nccl/lib/libnccl.so.2 && \
ln -s /usr/lib/aarch64-linux-gnu/libnccl.so.2 /usr/local/lib/python3.12/dist-packages/nvidia/nccl/lib/libnccl.so.2
# Build metadata (generated by build-and-copy.sh)
COPY build-metadata.yaml /workspace/build-metadata.yaml

Dockerfile.mxfp4
@@ -98,10 +98,10 @@ ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git
ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git
ARG FLASHINFER_SHA=f349e52496a72a00d8c4ac02c7a1e38523ff7194
ARG CUTLASS_SHA=c7516ad20f3d022fdbc93e9468643bf3b577e02c
ARG CUTLASS_SHA=fede53000a962b46e05bafe0c86311778caeb380
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
uv pip install "nvidia-nvshmem-cu13<3.6" "apache-tvm-ffi<0.2"
# Clone FlashInfer (cached for faster rebuilds)
RUN --mount=type=cache,id=git-flashinfer,target=/git-cache/flashinfer \
@@ -270,13 +270,12 @@ ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
ENV PATH=$VLLM_BASE_DIR:$PATH
# Copy scripts
COPY run-cluster-node.sh $VLLM_BASE_DIR/
RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh
# Final extra deps
RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
uv pip install ray[default] fastsafetensors nvidia-nvshmem-cu13
uv pip install ray[default] fastsafetensors "nvidia-nvshmem-cu13<3.6"
# Build metadata (generated by build-and-copy.sh)
COPY build-metadata.yaml /workspace/build-metadata.yaml
# If not compiling Triton
# remove triton-kernels as they are not compatible with this vLLM version yet

README.md
@@ -1,7 +1,7 @@
# vLLM Docker Optimized for DGX Spark (single or multi-node)
This repository contains the Docker configuration and startup scripts to run a multi-node vLLM inference cluster using Ray. It supports InfiniBand/RDMA (NCCL) and custom environment configuration for high-performance setups.
Cluster setup supports direct connection between dual Sparks, connection via a QSFP/RoCE switch, and 3-node mesh configurations.
While it was primarily developed to support multi-node inference, it works just as well on single-node setups.
@@ -31,6 +31,8 @@ We will expand the selection of models we test in the pipeline, but since vLLM i
If you want to build the latest from main branch, you can specify `--rebuild-vllm` flag. Or you can target a specific vLLM release by setting `--vllm-ref` parameter.
Similarly, `--rebuild-flashinfer`, `--flashinfer-ref`, and `--apply-flashinfer-pr` control the FlashInfer build in the same way.
## QUICK START
### Build
@@ -52,8 +54,8 @@ Build the container.
**On DGX Spark cluster:**
Make sure you connect your Sparks together and enable passwordless SSH as described in NVidia's [Connect Two Sparks Playbook](https://build.nvidia.com/spark/connect-two-sparks/stacked-sparks).
You can also check out our new [Networking Guide](docs/NETWORKING.md).
Make sure you connect your Sparks together and enable passwordless SSH as described in our [Networking Guide](docs/NETWORKING.md). You can also check out NVidia's [Connect Two Sparks Playbook](https://build.nvidia.com/spark/connect-two-sparks/stacked-sparks), but using our guide is the best way to get started.
**NEW**: the guide now includes instructions on setting up 3-node Spark mesh!
Then run the following command that will build and distribute image across the cluster.
@@ -67,7 +69,7 @@ An initial build speed depends on your Internet connection speed and whether the
**On a single node**:
**NEW** - `launch-cluster.sh` now supports solo mode, which is now a recommended way to run the container on a single Spark:
`launch-cluster.sh` supports solo mode, which is now the recommended way to run the container on a single Spark:
```bash
./launch-cluster.sh --solo exec \
@@ -78,23 +80,6 @@ An initial build speed depends on your Internet connection speed and whether the
--load-format fastsafetensors
```
**To launch using regular `docker run`**
```bash
docker run \
--privileged \
--gpus all \
-it --rm \
--network host --ipc=host \
-v ~/.cache/huggingface:/root/.cache/huggingface \
vllm-node \
bash -c -i "vllm serve \
QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ \
--port 8000 --host 0.0.0.0 \
--gpu-memory-utilization 0.7 \
--load-format fastsafetensors"
```
**On a cluster**
It's recommended to download the model on one node and distribute it across the cluster using the ConnectX interconnect prior to launching. This avoids re-downloading the model from the Internet on every node in the cluster.
@@ -127,11 +112,9 @@ This will run the model on all available cluster nodes.
**Also:** You can use any vLLM container that has "bash" as its default entrypoint with the launch script. It was tested with NGC vLLM, but can work with others too. To use such a container in the cluster, you need to pass the `--apply-mod use-ngc-vllm` argument to `./launch-cluster.sh`. However, it's recommended to build the container using this repository for best compatibility and the most up-to-date features.
## CHANGELOG
**IMPORTANT**
You may want to prune your build cache every once in a while, especially if you've been using these container builds since the beginning.
You may want to prune your build cache every once in a while, especially if you've been using these container builds since the beginning.
You can check the build cache size by running:
@@ -149,6 +132,302 @@ Don't do it every time you rebuild, because it will slow down compilation times.
For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
## CHANGELOG
### 2026-04-14
Added `--load-format instanttensor` support to vLLM - thanks @SeraphimSerapis.
It is an experimental option for now, but it allows faster loading than the current fastsafetensors default. You need to rebuild the container to start using the option, but you don't have to trigger a source build.
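As a rough illustration (the model and flags below mirror the solo quick-start example and are not a tuned recipe), switching an existing solo launch to the new loader only changes the `--load-format` value:

```bash
./launch-cluster.sh --solo exec \
  vllm serve QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ \
  --port 8000 --host 0.0.0.0 \
  --gpu-memory-utilization 0.7 \
  --load-format instanttensor
```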
### 2026-04-12
#### Drop-caches mod for Qwen3.5-397B
Updated the Qwen3.5-397B recipe (for the dual-node configuration) to use the new mod `mods/drop-caches`, which clears filesystem caches every minute while the container is running. This resolves fastsafetensors getting stuck during weight loading, along with a few other bugs that show up when operating close to the memory limit.
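The cache clearing itself is the same command suggested in the 2026-03-17 known issues for stuck weight loading, just run on a timer. A minimal sketch of what such a mod amounts to (the loop and interval here are illustrative; the real implementation is in `mods/drop-caches`):

```bash
# Run as root: flush the page cache every minute so fastsafetensors
# doesn't stall when the node operates close to its memory limit.
while true; do
  sync
  echo 3 > /proc/sys/vm/drop_caches
  sleep 60
done
```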
### 2026-04-11
#### Pinned PyTorch Version
Pinned PyTorch to version 2.11.0 (previously using nightly builds) to fix incompatibility with transformers 5.x and avoid torch version mismatch in builds.
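For reference, this is how the pin looks in the Dockerfile's dependency install step (the same line appears in both the base and runner stages):

```bash
uv pip install torch==2.11.0 torchvision torchaudio triton \
  --index-url https://download.pytorch.org/whl/cu130
```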
### 2026-04-02
A new recipe for Gemma4-26B-A4B in "on-the-fly" FP8 quantization:
Single Spark:
```bash
./run-recipe.sh gemma4-26b-a4b --solo
```
Dual Sparks:
```bash
./run-recipe.sh gemma4-26b-a4b --no-ray
```
### 2026-03-31
#### Flags to specify Flashinfer ref and apply PRs
`build-and-copy.sh` gains two new flags that mirror the existing vLLM equivalents:
- `--flashinfer-ref <ref>` — build FlashInfer from a specific commit SHA, branch, or tag instead of `main`. Forces a local FlashInfer build (skips prebuilt wheel download).
- `--apply-flashinfer-pr <pr-num>` — fetch and apply a FlashInfer GitHub PR patch before building. Can be specified multiple times. Forces a local FlashInfer build.
Both flags are incompatible with `--exp-mxfp4`.
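For example, a build that pins FlashInfer to a specific ref and pulls in an unmerged PR could look like this (the ref and PR number are placeholders):

```bash
./build-and-copy.sh \
  --flashinfer-ref <commit-or-tag> \
  --apply-flashinfer-pr <pr-number> \
  -c
```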
#### Default image tag in `build-and-copy.sh`
`build-and-copy.sh` now automatically sets a sensible default image tag when `-t` is not specified:
- `--tf5` / `--pre-tf` - tag defaults to `vllm-node-tf5`
- `--exp-mxfp4` - tag defaults to `vllm-node-mxfp4`
- in all other cases - tag defaults to `vllm-node` (no change)
An explicit `-t <tag>` always takes precedence.
#### Support for 3-node mesh setups
Added initial support for setups where 3 Sparks are connected in a ring-like mesh without an additional switch.
See [Networking Guide](docs/NETWORKING.md) for instructions on how to connect and set up networking in such cluster.
The autodiscover function in both `launch-cluster.sh` and `run-recipe.sh` can now detect mesh setups and configure parameters accordingly.
You can try running a model on all 3 nodes in pipeline-parallel configuration using the following recipe:
```bash
./run-recipe.sh --discover # force mesh discovery
./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml --setup --no-ray --force-build # you can drop --setup and --force-build on subsequent calls
```
Please note that `--tensor-parallel-size 3` or `-tp 3` is not supported by any commonly used model, so the only two viable options to utilize all three nodes for a single model are:
- `--pipeline-parallel 3` will let you run a model that can't fit on dual Sparks, but without additional speed improvements (total throughput may improve, though).
- `--data-parallel 3` (possibly with `--enable-expert-parallel`) will let you run a model that fits on a single Spark, but with better concurrency.
You can also run models with `--tensor-parallel 2` in a 3-node configuration - in this case only the first two nodes (from autodiscovery/.env or from the CLI parameters) will be utilized.
#### GB10 Verification During Node Discovery
Node discovery now confirms each SSH-reachable peer is a GB10 system before adding it to the cluster:
Only hosts reporting `NVIDIA GB10` are included. This prevents accidentally adding non-Spark machines that happen to be on the same subnet.
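Conceptually, the check boils down to something like the following sketch (the actual logic lives in the autodiscovery script):

```bash
# Sketch: accept a peer only if its GPU reports as an NVIDIA GB10
if ssh "$peer" nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null \
    | grep -q "NVIDIA GB10"; then
  echo "Adding $peer to the cluster"
else
  echo "Skipping $peer (not a GB10 system)"
fi
```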
#### Separate COPY_HOSTS Discovery
Autodiscover now determines the host list used for image and model distribution separately from `CLUSTER_NODES`:
- **Non-mesh**: `COPY_HOSTS` mirrors `CLUSTER_NODES` (no change in behaviour).
- **Mesh**: scans the direct IB-attached `enp1s0f0np0` and `enp1s0f1np1` interfaces (not the OOB ETH interface), so large file transfers use the faster direct InfiniBand path.
`COPY_HOSTS` is saved to `.env` and respected by `build-and-copy.sh`, `hf-download.sh`, and `run-recipe.py`.
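In a mesh setup the saved `.env` can therefore contain two different host lists, roughly like this (the IP addresses are placeholders):

```bash
# Hypothetical mesh-mode .env fragment
CLUSTER_NODES="192.168.177.11,192.168.177.12,192.168.177.13"   # cluster coordination
COPY_HOSTS="192.168.178.12,192.168.179.13"                     # direct IB-attached paths for transfers
```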
#### Interactive Configuration Save in `autodiscover.sh`
`autodiscover.sh` now handles `.env` creation with a guided interactive flow, replacing the previous logic in `run-recipe.py`:
- Runs automatically when `.env` is absent.
- Asks per-node confirmation for both `CLUSTER_NODES` and `COPY_HOSTS`.
- Skips if `.env` already exists (use `--setup` to force).
`run-recipe.py` no longer contains its own `.env`-save prompt — it delegates entirely to `autodiscover.sh`.
#### `--setup` Flag in `launch-cluster.sh` and `build-and-copy.sh`
Both scripts now accept `--setup` to force a full autodiscovery run and overwrite the existing `.env`:
```bash
./launch-cluster.sh --setup exec vllm serve ...
./build-and-copy.sh --setup -c
```
This is equivalent to the existing `--setup` in `run-recipe.sh`.
#### `--config` Flag
`hf-download.sh`, `build-and-copy.sh` and `launch-cluster.sh` now accept `--config <file>` to load a custom `.env` configuration file. `COPY_HOSTS` from the config is used for model distribution:
```bash
./hf-download.sh QuantTrio/MiniMax-M2-AWQ --config /path/to/cluster.env -c --copy-parallel
```
#### Parallelism-Aware Node Trimming
`launch-cluster.sh` now parses `-tp` / `--tensor-parallel-size`, `-pp` / `--pipeline-parallel-size`, and `-dp` / `--data-parallel-size` from the exec command or launch script and adjusts the active node count accordingly — for both Ray and no-Ray modes.
- If **fewer nodes are needed** than configured, only the required nodes get containers started (excess nodes are left idle).
- If **more nodes are needed** than available, an error is raised before anything starts.
```
Note: Command requires 2 node(s) (tp=2 * pp=1 * dp=1); using 2 of 3 configured node(s).
Error: Command requires 4 nodes (tp=4 * pp=1 * dp=1) but only 3 node(s) are configured.
```
No flags required — the check is automatic whenever parallelism arguments are present in the command.
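The node count in those messages is simply the product of the parallelism factors (one GPU per Spark). A simplified sketch of the check, with assumed variable names:

```bash
# Sketch: required nodes = tp * pp * dp (one GPU per Spark node)
required=$(( ${tp:-1} * ${pp:-1} * ${dp:-1} ))
if (( required > ${#NODES[@]} )); then
  echo "Error: Command requires $required nodes but only ${#NODES[@]} node(s) are configured." >&2
  exit 1
fi
```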
### 2026-03-18
#### `--master-port` / `--head-port` Parameter
Added `--master-port` (synonym: `--head-port`) to both `launch-cluster.sh` and `run-recipe.sh` to configure the port used for cluster coordination:
- In **Ray mode**: sets the Ray head node port (previously hardcoded to 6379)
- In **No-Ray mode**: sets the PyTorch distributed `--master-port` passed to vLLM
Default is `29501`.
```bash
./launch-cluster.sh --master-port 29501 --no-ray exec vllm serve ...
./run-recipe.sh qwen3.5-122b-fp8 --no-ray --master-port 29501
```
#### `--network` Parameter in Build Arguments
Added `--network <name>` to `build-and-copy.sh` to allow using host networking during builds.
Thanks @apairmont for the PR.
### 2026-03-17
#### EXPERIMENTAL Intel/Qwen3.5-397B-A17B-int4-AutoRound Recipe
You can run the full 397B Qwen3.5 model on just two Sparks with vision and full context; however, you need to make sure your Sparks aren't running anything extra that takes a lot of RAM. That means you don't want to log into the graphical interface or use remote desktop. Connect to the head node via SSH.
Alternatively, you can run in non-graphical mode (runlevel 3) by using `sudo systemctl isolate multi-user.target` to switch (you can use `sudo systemctl set-default graphical.target` to switch back to graphical mode), however this is known to reduce performance a bit.
You can run the model with the following command on the head node:
```bash
./run-recipe.sh qwen3.5-397b-int4-autoround.yaml --no-ray
```
Please note that `--no-ray` is necessary to fit the full context. It also improves inference speed by ~1 t/s.
By default it will try to allocate 112 GB for vLLM on each node. You can change this by changing `--gpu-memory-utilization` (e.g. `--gpu-memory-utilization 113`), but please be aware that it uses GB instead of percentage **for this recipe**.
**KNOWN ISSUES**:
1. The current firmware may cause a sudden shutdown of one or both Sparks during heavy inference. If you have this issue, you will need to lower the GPU clock frequency on the affected unit(s), e.g. `sudo nvidia-smi -lgc 200,2150`. This command reduces the max GPU frequency to 2150 MHz. You can play with higher values to see what works for you (the default is 2411 MHz, but it can boost to 3000 MHz). Please note that this setting only survives until the next reboot, but it can be applied at any time.
2. You will need to use the new `--no-ray` argument to fit full context.
3. If the model gets stuck loading weights, clearing the cache on both nodes can "unstuck" it. Use `sudo sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches'` to clear the cache.
#### Major Cluster Orchestration Refactoring
Significantly refactored the internal cluster startup logic in `launch-cluster.sh`:
- Removed the standalone `run-cluster-node.sh` script; its logic is now fully integrated into `launch-cluster.sh`.
- Ray head/worker startup, environment variable injection, and launch script distribution are now handled by `launch-cluster.sh` directly.
- Worker containers are started with proper per-node environment variables (`VLLM_HOST_IP`, `NCCL_SOCKET_IFNAME`, etc.) injected via `docker run`/`docker exec` instead of relying on `.bashrc`.
- You will now be able to run other vLLM containers without applying `use-ngc-vllm` mod (current version is just an empty stub).
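To illustrate the per-node environment injection mentioned above, a worker start stripped to its essentials might look like this (variable names and the exact flag set are assumptions, not the actual script):

```bash
# Sketch: start a worker container with node-specific settings injected
docker run -d --rm --privileged --gpus all \
  --network host --ipc=host \
  --name "$CONTAINER_NAME" \
  -e VLLM_HOST_IP="$node_ip" \
  -e NCCL_SOCKET_IFNAME="$ETH_IF" \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  vllm-node sleep infinity
```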
#### No-Ray Multi-Node Mode
Added `--no-ray` flag to `launch-cluster.sh` to run multi-node vLLM clusters without Ray, using PyTorch's native distributed backend instead. It slightly improves inference performance for most models and reduces memory requirements.
```bash
./launch-cluster.sh --no-ray exec vllm serve ...
```
`--no-ray` is incompatible with `--solo` (which already runs without Ray).
#### `run-recipe.sh` No-Ray Mode and Extended Flag Passthrough
`run-recipe.sh` now supports `--no-ray` flag for running multi-node inference without Ray (uses PyTorch distributed backend instead):
```bash
./run-recipe.sh qwen3.5-122b-fp8 --no-ray
```
The following `launch-cluster.sh` flags are now also passed through from `run-recipe.sh`:
`--master-port`, `--name`, `--eth-if`, `--ib-if`, `-j`, `--no-cache-dirs`, `--non-privileged`, `--mem-limit-gb`, `--mem-swap-limit-gb`, `--pids-limit`, `--shm-size-gb`.
#### Nemotron-3-Nano-NVFP4 Switched to Marlin Backend
The `nemotron-3-nano-nvfp4` recipe has been updated to use the Marlin backend for better performance and reliability (until Flashinfer fully supports NVFP4 on sm121).
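The recipe name is unchanged, so it is invoked the same way as any other recipe:

```bash
./run-recipe.sh nemotron-3-nano-nvfp4
```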
### 2026-03-12
#### Experimental `--gpu-memory-utilization-gb` Mod
Added a new mod `mods/gpu-mem-util-gb` that adds a `--gpu-memory-utilization-gb` flag to vLLM, allowing you to specify GPU memory reservation in GiB instead of as a fraction. This is particularly useful on DGX Spark's unified memory architecture where available memory changes dynamically.
```bash
./launch-cluster.sh --apply-mod mods/gpu-mem-util-gb exec vllm serve ... \
--gpu-memory-utilization-gb 110
```
Cannot be used simultaneously with `--kv-cache-memory-bytes`.
#### Qwen3.5-397B INT4-AutoRound TP=4 Recipe (4× Spark Cluster)
Added `recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml` for running Intel/Qwen3.5-397B-A17B-int4-AutoRound across 4 DGX Spark nodes with tensor parallelism (TP=4).
Benchmarked at ~37 tok/s single-user, ~103 tok/s aggregate (4 concurrent users).
Includes a new mod `mods/fix-qwen35-tp4-marlin` that resolves a Marlin kernel constraint (`MIN_THREAD_N=64`) that breaks certain projection layers at TP=4.
**Note:** Requires NVIDIA driver 580.x. Driver 590.x has a CUDAGraph capture deadlock on GB10 unified memory.
```bash
./run-recipe.sh 4x-spark-cluster/qwen3.5-397b-int4-autoround
```
Thanks @sonusflow for the contribution.
#### Nemotron-3-Super-120B NVFP4 Recipe
Added a new recipe `nemotron-3-super-nvfp4` for running `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4` with Marlin kernels. Includes a custom reasoning parser (`super_v3_reasoning_parser.py`) fetched from the model repository. Supports both single and dual Spark configurations.
```bash
./run-recipe.sh nemotron-3-super-nvfp4
```
### 2026-03-11
#### Qwen3-Coder-Next INT4-AutoRound Recipe
Added a new recipe `qwen3-coder-next-int4-autoround` for running Intel/Qwen3-Coder-Next-int4-AutoRound. Supports a single Spark only (use the `--solo` switch), since the split weights are too small for the Marlin kernel.
```bash
./run-recipe.sh qwen3-coder-next-int4-autoround --solo
```
### 2026-03-06
#### `-e/--env` Passthrough in `run-recipe.py`
`run-recipe.sh` now accepts one or more `-e VAR=VALUE` flags to pass environment variables directly to the container, mirroring the existing behaviour of `launch-cluster.sh`.
```bash
./run-recipe.sh qwen3.5-122b-int4-autoround --solo -e HF_TOKEN=$HF_TOKEN
```
#### Unsloth Chat Template for Qwen3.5
Added a new mod `mods/fix-qwen3.5-chat-template` that applies the Unsloth chat template to Qwen3.5 models for better compatibility with modern clients. The template is now included in the `qwen3.5-122b-fp8`, `qwen3.5-122b-int4-autoround`, and `qwen3.5-35b-a3b-fp8` recipes.
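The mod can presumably also be applied on its own through the existing `--apply-mod` mechanism; an illustrative (untested) invocation, with the model left as a placeholder:

```bash
./launch-cluster.sh --solo --apply-mod mods/fix-qwen3.5-chat-template exec \
  vllm serve <qwen3.5-model> --port 8000 --host 0.0.0.0
```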
#### Fix Shell Quoting for Exec Command Arguments
Fixed shell quoting for exec command arguments in `launch-cluster.sh` and `run-recipe.py` to correctly handle arguments containing spaces or special characters.
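The approach follows the underlying PR: each exec argument is re-escaped before being embedded in the remote command line, e.g. with `printf %q` on the shell side (a simplified sketch, not the actual code):

```bash
# Sketch: re-escape every exec argument before building the remote command
quoted_args=""
for arg in "$@"; do
  quoted_args+=" $(printf '%q' "$arg")"
done
# $quoted_args can now carry JSON safely, e.g.:
#   --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":2}'
```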
### 2026-03-05
#### Qwen3.5-35B-A3B-FP8 Recipe
Added a new recipe `qwen3.5-35b-a3b-fp8` for running Qwen3.5-35B-A3B in FP8 format.
```bash
./run-recipe.sh qwen3.5-35b-a3b-fp8
```
#### 4× Spark Cluster Recipes
Added a `recipes/4x-spark-cluster/` subdirectory with recipes optimised for a 4-node Spark cluster:
- `minimax-m2.5` — MiniMax M2.5 on 4× Spark
- `qwen3.5-397b-a17B-fp8` — Qwen3.5-397B-A17B in FP8 on 4× Spark
#### More Robust Wheels Check Before Download
Improved the wheels availability check in `build-and-copy.sh` to be more reliable when deciding whether to download remote wheels.
### 2026-03-04
#### Prebuilt vLLM Wheels via GitHub Releases
@@ -162,7 +441,15 @@ The download logic mirrors the FlashInfer behaviour:
No new flags are required — the download happens transparently.
All prebuilt wheels are now tested with multiple models in both solo and cluster configuration as a part of automated deployment pipeline which will now run nightly. The wheels are released only if they pass all the tests and no significant performance regressions are detected.
All prebuilt wheels are now tested with multiple models in both solo and cluster configuration as a part of automated deployment pipeline which will now run nightly. The wheels are released only if they pass all the tests and no significant performance regressions are detected.
#### Qwen3.5-122B-FP8 Recipe
Added a new recipe `qwen3.5-122b-fp8` for running Qwen3.5-122B in FP8 format.
```bash
./run-recipe.sh qwen3.5-122b-fp8
```
### 2026-03-02
@@ -421,7 +708,8 @@ See (this post on NVIDIA forums)[https://forums.developer.nvidia.com/t/make-glm-
To use the mod, first build the container with Transformers 5 support (`--pre-tf`) flag, e.g.:
```bash
./build-and-copy.sh -t vllm-node-tf5 --pre-tf -c
# Image tag defaults to vllm-node-tf5 when --tf5/--pre-tf is used
./build-and-copy.sh --pre-tf -c
```
Then, to run on a single node:
@@ -471,7 +759,8 @@ It is currently the fastest way to run GPT-OSS on DGX Spark, achieving 60 t/s on
To use this build, first build the container with `--exp-mxfp4` flag. I recommend using a separate label as it is currently not recommended to use this build for models other than gpt-oss:
```bash
./build-and-copy.sh -t vllm-node-mxfp4 --exp-mxfp4 -c
# Image tag defaults to vllm-node-mxfp4 when --exp-mxfp4 is used
./build-and-copy.sh --exp-mxfp4 -c
```
Then, to run on a single Spark:
@@ -715,12 +1004,14 @@ Using a different username:
| Flag | Description |
| :--- | :--- |
| `-t, --tag <tag>` | Image tag (default: `vllm-node`) |
| `-t, --tag <tag>` | Image tag (default: `vllm-node`; auto-set to `vllm-node-tf5` with `--tf5`, `vllm-node-mxfp4` with `--exp-mxfp4`) |
| `--gpu-arch <arch>` | Target GPU architecture (default: `12.1a`) |
| `--rebuild-flashinfer` | Skip prebuilt wheel download; force a fresh local FlashInfer build |
| `--rebuild-vllm` | Force rebuild vLLM from source |
| `--vllm-ref <ref>` | vLLM commit SHA, branch or tag (default: `main`) |
| `--flashinfer-ref <ref>` | FlashInfer commit SHA, branch or tag (default: `main`) |
| `--apply-vllm-pr <pr-num>` | Apply a vLLM PR patch during build. Can be specified multiple times. |
| `--apply-flashinfer-pr <pr-num>` | Apply a FlashInfer PR patch during build. Can be specified multiple times. |
| `--tf5` | Install transformers v5 (5.0.0 or higher). Aliases: `--pre-tf, --pre-transformers`. |
| `--exp-mxfp4` | Build with experimental native MXFP4 support. Alias: `--experimental-mxfp4`. |
| `-c, --copy-to <hosts>` | Host(s) to copy the image to after building (space- or comma-separated). |
@@ -730,9 +1021,13 @@ Using a different username:
| `-u, --user <user>` | Username for SSH connection (default: current user) |
| `--full-log` | Enable full Docker build output (`--progress=plain`) |
| `--no-build` | Skip building, only copy existing image (requires `--copy-to`) |
| `--network <name>` | Docker network to use during build (e.g. `host`). |
| `--cleanup` | Remove all cached `.whl` and `*-commit` files from the `wheels/` directory. |
| `--config <file>` | Path to `.env` configuration file (default: `.env` in script directory) |
| `--setup` | Force autodiscovery and save configuration to `.env` (even if `.env` already exists) |
| `-h, --help` | Show help message |
**IMPORTANT**: When copying to another node, make sure you use the Spark IP assigned to its ConnectX 7 interface (enp1s0f1np1), and not the 10G interface (enP7s7)! If you omit the IP address and use `-c` without addresses, it will use autodiscovery to detect a proper IP address.
**IMPORTANT**: When copying to another node manually, use the IP assigned to a ConnectX 7 interface (`enp1s0f*`), not the 10G/wireless interfaces. When using `-c` without addresses, autodiscovery selects the correct interface automatically — in mesh mode it uses the direct IB-attached interfaces (`enp1s0f0np0`, `enp1s0f1np1`) for maximum transfer speed.
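For example, to build with the default tag and copy the image to a second Spark over its ConnectX-7 address (the address below matches the examples later in this README; substitute your own):
```bash
# Build locally, then copy the image to the peer node's CX7 IP
./build-and-copy.sh -c 192.168.177.12
```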
### Copying the container to another Spark node (Manual Method)
@@ -801,9 +1096,13 @@ Assumptions and limitations:
### Auto-Detection
The script attempts to automatically detect:
* **Ethernet Interface:** The interface associated with the active InfiniBand device that has an IP address.
* **InfiniBand Interface:** The active InfiniBand devices. By default both active RoCE interfaces that correspond to active IB port(s) will be utilized.
* **Node Role:** Based on the detected IP address and the list of nodes (defaults to `192.168.177.11` as head and `192.168.177.12` as worker).
* **Ethernet Interface (`ETH_IF`):** Determined by the number of active CX7 interfaces:
- **2 active** (standard): the `enp*` interface (no capital P) that has an IP address.
- **4 active** (mesh topology): `enP7s7` (preferred) or `wlP9s9` (wireless, shown with a warning) — the cluster coordination interface is separate from the CX7 ports in this configuration.
* **InfiniBand Interface (`IB_IF`):** All active RoCE devices. In mesh mode this is always `rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1`.
* **Cluster peers:** Discovered by scanning the `ETH_IF` subnet for hosts with SSH access **and** a GB10 GPU (`nvidia-smi --query-gpu=name` must return `NVIDIA GB10`).
* **Copy hosts (`COPY_HOSTS`):** In standard mode, same as cluster peers. In mesh mode, scanned separately on `enp1s0f0np0` and `enp1s0f1np1` subnets so that image/model transfers use the direct InfiniBand path.
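To see what the script detects on your nodes without actually launching anything, run the configuration check:
```bash
# Prints detected interfaces, local IP and cluster nodes, then exits
./launch-cluster.sh --check-config
```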
### Manual Overrides
@@ -826,6 +1125,8 @@ You can override the auto-detected values if needed:
| `--nccl-debug` | NCCL debug level (e.g., INFO, WARN). Defaults to INFO if flag is present but value is omitted. |
| `--check-config` | Check configuration and auto-detection without launching. |
| `--solo` | Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster |
| `--no-ray` | No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend). |
| `--master-port` / `--head-port` | Port for cluster coordination: Ray head port or PyTorch distributed master port (default: 29501). |
| `--no-cache-dirs` | Do not mount default cache directories (~/.cache/vllm, ~/.cache/flashinfer, ~/.triton). |
| `--launch-script` | Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted. |
| `-d` | Run in daemon mode (detached). |
@@ -834,6 +1135,10 @@ You can override the auto-detected values if needed:
| `--mem-swap-limit-gb` | Memory+swap limit in GB (default: mem-limit + 10, only with `--non-privileged`). |
| `--pids-limit` | Process limit (default: 4096, only with `--non-privileged`). |
| `--shm-size-gb` | Shared memory size in GB (default: 64, only with `--non-privileged`). |
| `--config <file>` | Path to `.env` configuration file (default: `.env` in script directory). |
| `--setup` | Force autodiscovery and save configuration to `.env` (even if `.env` already exists). |
| `start \| stop \| status \| exec` | Action to perform (default: `start`). Not compatible with `--launch-script`. |
| `command` | Command to execute inside the container (only for `exec` action). |
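A few of these flags combined (port and commands are illustrative):
```bash
# Multi-node without Ray, custom coordination port, detached
./launch-cluster.sh --no-ray --master-port 29600 -d start

# Check status, then run a one-off command inside the running container
./launch-cluster.sh status
./launch-cluster.sh exec nvidia-smi
```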
### Non-Privileged Mode
@@ -977,6 +1282,61 @@ You need to make sure you allocate IP addresses to them (no need to allocate IP
## 5\. Configuration Details
### Cluster Configuration (`.env` file)
The scripts share a `.env` file (default: `.env` in the repo directory) for persistent cluster configuration. It is created automatically by autodiscovery — run `--discover` (via `run-recipe.sh`) or `--setup` (via `launch-cluster.sh` / `build-and-copy.sh`) on first use.
**Supported variables:**
| Variable | Description |
| :--- | :--- |
| `CLUSTER_NODES` | Comma-separated node IPs used for Ray/vLLM cluster (head node first). |
| `COPY_HOSTS` | Comma-separated node IPs used for image and model distribution. In mesh mode these are the IPs on the direct IB-attached interfaces, which may differ from `CLUSTER_NODES`. |
| `LOCAL_IP` | IP address of the local node. |
| `ETH_IF` | Ethernet interface for cluster coordination (e.g. `enp1s0f1np1` or `enP7s7`). |
| `IB_IF` | Comma-separated RoCE/IB device names (e.g. `rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1`). |
| `CONTAINER_*` | Any variable prefixed with `CONTAINER_` (except `CONTAINER_NAME`) is passed as `-e VAR=VALUE` to the container. Example: `CONTAINER_NCCL_DEBUG=INFO` -> `-e NCCL_DEBUG=INFO`. |
**Mesh-mode NCCL variables** (written automatically when mesh topology is detected):
```
CONTAINER_NCCL_NET_PLUGIN=none
CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1
CONTAINER_NCCL_IB_MERGE_NICS=0
```
**Example `.env` for a standard 2-node cluster:**
```
CLUSTER_NODES=192.168.177.11,192.168.177.12
COPY_HOSTS=192.168.177.12
LOCAL_IP=192.168.177.11
ETH_IF=enp1s0f1np1
IB_IF=rocep1s0f1,roceP2p1s0f1
```
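For reference, a mesh-mode `.env` follows the same pattern; the addresses below are purely illustrative (the 10.x addresses stand in for whatever your 10G network assigns), while `ETH_IF`, `IB_IF` and the NCCL variables are the values autodiscovery writes in mesh mode:
```
CLUSTER_NODES=10.0.0.11,10.0.0.12,10.0.0.13
COPY_HOSTS=192.168.177.12,192.168.187.13
LOCAL_IP=10.0.0.11
ETH_IF=enP7s7
IB_IF=rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1
CONTAINER_NCCL_NET_PLUGIN=none
CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1
CONTAINER_NCCL_IB_MERGE_NICS=0
```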
To use a custom config file path, pass `--config /path/to/file.env` to any script.
### Autodiscovery Workflow
On first run, if no `.env` is present, the scripts will automatically trigger autodiscovery. You can also run it explicitly:
```bash
# Via run-recipe.sh
./run-recipe.sh --discover
# Via launch-cluster.sh or build-and-copy.sh (force re-run even if .env exists)
./launch-cluster.sh --setup exec vllm serve ...
./build-and-copy.sh --setup -c
```
Autodiscovery:
1. Detects active CX7 interfaces and determines mesh vs. standard topology.
2. Scans the network for SSH-reachable GB10 peers.
3. In mesh mode, separately discovers `COPY_HOSTS` on direct IB-attached interfaces.
4. Prompts for per-node confirmation for both `CLUSTER_NODES` and `COPY_HOSTS`.
5. Saves the result to `.env`.
### Environment Persistence
The script automatically appends exported variables to `~/.bashrc`. If you need to open a second terminal into the running container for debugging, simply run:
@@ -1150,6 +1510,32 @@ The `hf-download.sh` script provides a convenient way to download models from Hu
./hf-download.sh -c --copy-parallel QuantTrio/MiniMax-M2-AWQ
```
**Use nodes from `.env` (respects `COPY_HOSTS`):**
```bash
./hf-download.sh -c QuantTrio/MiniMax-M2-AWQ
```
When `-c` is given without explicit hosts, the script checks `COPY_HOSTS` in `.env` first, then falls back to autodiscovery. In mesh mode this means transfers go over the direct IB-attached interfaces automatically.
**Use a custom config file:**
```bash
./hf-download.sh --config /path/to/cluster.env -c QuantTrio/MiniMax-M2-AWQ
```
**Available options:**
| Flag | Description |
| :--- | :--- |
| `<model-name>` | HuggingFace model ID (e.g. `QuantTrio/MiniMax-M2-AWQ`). Required. |
| `-c, --copy-to <hosts>` | Host(s) to copy the model to after download (space- or comma-separated). Omit hosts to use `COPY_HOSTS` from `.env` or autodiscovery. |
| `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). |
| `--copy-parallel` | Copy to all hosts concurrently instead of serially. |
| `-u, --user <user>` | SSH username for remote copies (default: current user). |
| `--config <file>` | Path to `.env` configuration file (default: `.env` in script directory). |
| `-h, --help` | Show help message. |
### Hardware Architecture
**Note:** This project targets the `12.1a` architecture (NVIDIA GB10 / DGX Spark). If you are using different hardware, you can use the `--gpu-arch` flag of `./build-and-copy.sh`.
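For example (the architecture value here is a placeholder; use the one that matches your GPU):
```bash
# Build for a different CUDA architecture and tag the image accordingly
./build-and-copy.sh --gpu-arch 10.0a -t vllm-node-custom
```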

View File

@@ -1,5 +1,57 @@
#!/bin/bash
SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
# Load .env file if exists (for shared configuration)
# This is called early so that DOTENV_* variables are available to all functions
load_env_if_exists() {
local env_file="${CONFIG_FILE:-}"
local config_explicit="${CONFIG_FILE_SET:-false}"
# If CONFIG_FILE is not set, check default location
if [[ -z "$env_file" ]]; then
env_file="$SCRIPT_DIR/.env"
config_explicit="false"
fi
# Validate config file exists if explicitly specified
# Exception: if --setup is also specified, the file will be created by the setup procedure
if [[ "$config_explicit" == "true" ]] && [[ ! -f "$env_file" ]] && [[ "${FORCE_DISCOVER:-false}" != "true" ]]; then
echo "Error: Config file not found: $env_file"
exit 1
fi
if [[ -f "$env_file" ]]; then
# Load .env variables with DOTENV_ prefix
while IFS='=' read -r key value || [[ -n "$key" ]]; do
# Skip comments and empty lines
[[ "$key" =~ ^[[:space:]]*# ]] && continue
[[ -z "$key" ]] && continue
# Remove leading/trailing whitespace from key
key=$(echo "$key" | xargs)
# Skip if key is empty after trimming
[[ -z "$key" ]] && continue
# Remove quotes from value
value="${value%\"}"
value="${value#\"}"
value="${value%\'}"
value="${value#\'}"
# Export with DOTENV_ prefix
export "DOTENV_$key=$value"
done < "$env_file"
fi
}
# Load .env file
load_env_if_exists
# Mesh mode flag (set by detect_interfaces)
MESH_MODE="false"
# Function to detect IB and Ethernet interfaces
detect_interfaces() {
# If both interfaces are already set, nothing to do
@@ -14,60 +66,132 @@ detect_interfaces() {
fi
echo "Auto-detecting interfaces..."
# Get all Up interfaces: "rocep1s0f1 port 1 ==> enp1s0f1np1 (Up)"
# We capture: IB_DEV, NET_DEV
mapfile -t IB_NET_PAIRS < <(ibdev2netdev | awk '/Up\)/ {print $1 " " $5}')
if [ ${#IB_NET_PAIRS[@]} -eq 0 ]; then
echo "Error: No active IB interfaces found."
return 1
fi
DETECTED_IB_IFS=()
CANDIDATE_ETH_IFS=()
ALL_NET_IFS=()
for pair in "${IB_NET_PAIRS[@]}"; do
ib_dev=$(echo "$pair" | awk '{print $1}')
net_dev=$(echo "$pair" | awk '{print $2}')
DETECTED_IB_IFS+=("$ib_dev")
# Check if interface has an IP address
if ip addr show "$net_dev" | grep -q "inet "; then
CANDIDATE_ETH_IFS+=("$net_dev")
ALL_NET_IFS+=("$net_dev")
done
local num_up="${#IB_NET_PAIRS[@]}"
# --- Sanity checks ---
# 1. enp* (no capital P) interfaces MUST have an IP
for net_dev in "${ALL_NET_IFS[@]}"; do
if [[ "$net_dev" =~ ^enp[^P] ]] || [[ "$net_dev" == enp* && "$net_dev" != *P* ]]; then
if ! ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
echo "Error: Interface $net_dev (enp*, no capital P) is Up but has no IP address assigned."
return 1
fi
fi
done
# Set IB_IF if not provided
if [[ -z "$IB_IF" ]]; then
IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}")
echo " Detected IB_IF: $IB_IF"
fi
# Set ETH_IF if not provided
if [[ -z "$ETH_IF" ]]; then
if [ ${#CANDIDATE_ETH_IFS[@]} -eq 0 ]; then
echo "Error: No active IB-associated interfaces have IP addresses."
# 2. No two interfaces with IPs should share the same subnet
declare -A SEEN_SUBNETS
for net_dev in "${ALL_NET_IFS[@]}"; do
local cidr
cidr=$(ip -o -f inet addr show "$net_dev" 2>/dev/null | awk '{print $4}' | head -n1)
[[ -z "$cidr" ]] && continue
# Compute network address using python3
local net_addr
net_addr=$(python3 -c "import ipaddress; print(str(ipaddress.ip_network('$cidr', strict=False)))" 2>/dev/null)
if [[ -n "${SEEN_SUBNETS[$net_addr]}" ]]; then
echo "Error: Interfaces $net_dev and ${SEEN_SUBNETS[$net_addr]} share the same subnet ($net_addr). Check network configuration."
return 1
fi
# Selection logic: Prefer interface without capital 'P'
SELECTED_ETH=""
for iface in "${CANDIDATE_ETH_IFS[@]}"; do
if [[ "$iface" != *"P"* ]]; then
SELECTED_ETH="$iface"
break
fi
done
# Fallback: Use the first one if all have 'P' or none found yet
if [[ -z "$SELECTED_ETH" ]]; then
SELECTED_ETH="${CANDIDATE_ETH_IFS[0]}"
SEEN_SUBNETS["$net_addr"]="$net_dev"
done
# --- Mode selection ---
if [[ "$num_up" -eq 2 ]]; then
# Non-mesh configuration
MESH_MODE="false"
echo " Non-mesh mode: 2 CX7 interfaces active."
# Set IB_IF if not provided
if [[ -z "$IB_IF" ]]; then
IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}")
echo " Detected IB_IF: $IB_IF"
fi
ETH_IF="$SELECTED_ETH"
echo " Detected ETH_IF: $ETH_IF"
# Set ETH_IF if not provided: prefer interface without capital 'P'
if [[ -z "$ETH_IF" ]]; then
local selected_eth=""
for net_dev in "${ALL_NET_IFS[@]}"; do
if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
if [[ "$net_dev" != *P* ]]; then
selected_eth="$net_dev"
break
fi
fi
done
# Fallback: first interface with an IP
if [[ -z "$selected_eth" ]]; then
for net_dev in "${ALL_NET_IFS[@]}"; do
if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
selected_eth="$net_dev"
break
fi
done
fi
if [[ -z "$selected_eth" ]]; then
echo "Error: No active IB-associated interfaces have IP addresses."
return 1
fi
ETH_IF="$selected_eth"
echo " Detected ETH_IF: $ETH_IF"
fi
elif [[ "$num_up" -eq 4 ]]; then
# Mesh configuration
MESH_MODE="true"
echo " Mesh mode: all 4 CX7 interfaces active."
# Set IB_IF to all four RoCE interfaces (hardcoded for mesh)
if [[ -z "$IB_IF" ]]; then
IB_IF="rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1"
echo " Detected IB_IF: $IB_IF"
fi
# Set ETH_IF: check enP7s7 first, then wlP9s9
if [[ -z "$ETH_IF" ]]; then
if ip addr show enP7s7 2>/dev/null | grep -q "inet "; then
ETH_IF="enP7s7"
echo " Detected ETH_IF: $ETH_IF"
elif ip addr show wlP9s9 2>/dev/null | grep -q "inet "; then
ETH_IF="wlP9s9"
echo " Detected ETH_IF: $ETH_IF"
echo " Warning: using wireless interface (wlP9s9) for cluster coordination. Performance may be limited."
else
echo "Error: Mesh mode requires enP7s7 or wlP9s9 to be up with an IP address for cluster coordination."
return 1
fi
fi
# Export mesh NCCL settings directly so launch-cluster.sh picks them up
# even if the user declines to save config to .env
export DOTENV_CONTAINER_NCCL_NET_PLUGIN=none
export DOTENV_CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1
export DOTENV_CONTAINER_NCCL_IB_MERGE_NICS=0
else
echo "Error: Unexpected number of active CX7 interfaces ($num_up). Expected 2 (non-mesh) or 4 (mesh)."
return 1
fi
}
@@ -84,16 +208,51 @@ detect_local_ip() {
# Get CIDR of the selected ETH_IF
CIDR=$(ip -o -f inet addr show "$ETH_IF" | awk '{print $4}' | head -n 1)
if [[ -z "$CIDR" ]]; then
echo "Error: Could not determine IP/CIDR for interface $ETH_IF"
return 1
fi
LOCAL_IP=${CIDR%/*}
echo " Detected Local IP: $LOCAL_IP ($CIDR)"
}
# Scan a subnet for GB10-capable peers via SSH
# Usage: _scan_subnet_for_gb10 <cidr> <local_ip_to_exclude> <output_file>
_scan_subnet_for_gb10() {
local cidr="$1"
local exclude_ip="$2"
local out_file="$3"
if ! command -v python3 &> /dev/null; then
echo "Error: python3 not found."
return 1
fi
if ! command -v nc &> /dev/null; then
echo "Error: nc (netcat) not found."
return 1
fi
local all_ips
all_ips=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$cidr")
for ip in $all_ips; do
[[ "$ip" == "$exclude_ip" ]] && continue
(
if nc -z -w 1 "$ip" 22 &>/dev/null; then
# Check if remote is a GB10 system
if ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o BatchMode=yes "$ip" \
"nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null" \
2>/dev/null | grep -q "NVIDIA GB10"; then
echo "$ip" >> "$out_file"
fi
fi
) &
done
wait
}
# Function to detect cluster nodes
detect_nodes() {
detect_local_ip || return 1
@@ -111,58 +270,182 @@ detect_nodes() {
return 0
fi
echo "Auto-detecting nodes..."
if ! command -v nc &> /dev/null; then
echo "Error: nc (netcat) not found. Please install netcat."
return 1
fi
if ! command -v python3 &> /dev/null; then
echo "Error: python3 not found. Please install python3."
return 1
# Try to use CLUSTER_NODES from .env
if [[ -n "$DOTENV_CLUSTER_NODES" ]]; then
echo " Using CLUSTER_NODES from .env: $DOTENV_CLUSTER_NODES"
PEER_NODES=()
IFS=',' read -ra ALL_NODES <<< "$DOTENV_CLUSTER_NODES"
for node in "${ALL_NODES[@]}"; do
node=$(echo "$node" | xargs)
[[ "$node" != "$LOCAL_IP" ]] && PEER_NODES+=("$node")
done
NODES_ARG="$DOTENV_CLUSTER_NODES"
return 0
fi
DETECTED_IPS=("$LOCAL_IP")
echo "Auto-detecting nodes on $CIDR (checking for NVIDIA GB10)..."
local temp_file
temp_file=$(mktemp)
_scan_subnet_for_gb10 "$CIDR" "$LOCAL_IP" "$temp_file"
PEER_NODES=()
echo " Scanning for SSH peers on $CIDR..."
# Generate list of IPs using python
ALL_IPS=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$CIDR")
TEMP_IPS_FILE=$(mktemp)
# Scan in parallel
for ip in $ALL_IPS; do
# Skip own IP
if [[ "$ip" == "$LOCAL_IP" ]]; then continue; fi
(
# Check port 22 with 1 second timeout
if nc -z -w 1 "$ip" 22 &>/dev/null; then
echo "$ip" >> "$TEMP_IPS_FILE"
fi
) &
done
# Wait for all background scans to complete
wait
# Read found IPs
if [[ -f "$TEMP_IPS_FILE" ]]; then
local detected_ips=("$LOCAL_IP")
if [[ -f "$temp_file" ]]; then
while read -r ip; do
DETECTED_IPS+=("$ip")
PEER_NODES+=("$ip")
echo " Found peer: $ip"
done < "$TEMP_IPS_FILE"
rm -f "$TEMP_IPS_FILE"
PEER_NODES+=("$ip")
detected_ips+=("$ip")
echo " Found GB10 peer: $ip"
done < <(sort "$temp_file")
rm -f "$temp_file"
fi
# Sort IPs
IFS=$'\n' SORTED_IPS=($(sort <<<"${DETECTED_IPS[*]}"))
# Sort and set NODES_ARG
IFS=$'\n' SORTED_IPS=($(sort <<<"${detected_ips[*]}"))
unset IFS
NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}")
echo " Cluster Nodes: $NODES_ARG"
}
# Function to detect COPY_HOSTS for build/model distribution
# In non-mesh mode: COPY_PEER_NODES = PEER_NODES (same network)
# In mesh mode: scan enp* interfaces (direct IB-attached) for GB10 peers
detect_copy_hosts() {
if [[ "$MESH_MODE" == "false" ]]; then
COPY_PEER_NODES=("${PEER_NODES[@]}")
return 0
fi
# Mesh mode: scan enp1s0f0np0 and enp1s0f1np1 subnets
echo "Auto-detecting COPY_HOSTS on direct IB interfaces (mesh mode)..."
local temp_file
temp_file=$(mktemp)
for iface in enp1s0f0np0 enp1s0f1np1; do
local cidr
cidr=$(ip -o -f inet addr show "$iface" 2>/dev/null | awk '{print $4}' | head -n1)
[[ -z "$cidr" ]] && continue
local local_iface_ip="${cidr%/*}"
echo " Scanning $iface ($cidr)..."
_scan_subnet_for_gb10 "$cidr" "$local_iface_ip" "$temp_file"
done
# Deduplicate and collect results.
# On two-cable setups two IB IPs may belong to the same host; deduplicate by
# querying each host's ETH_IF IP as a canonical identity.
COPY_PEER_NODES=()
declare -A _SEEN_COPY # keyed by IB IP
declare -A _SEEN_HOST # keyed by ETH_IF IP → first IB IP seen for that host
if [[ -f "$temp_file" ]]; then
while read -r ip; do
[[ -n "${_SEEN_COPY[$ip]}" ]] && continue
_SEEN_COPY["$ip"]=1
# Resolve canonical host identity via ETH_IF IP
local host_ip
host_ip=$(ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o BatchMode=yes "$ip" \
"ip -o -f inet addr show $ETH_IF 2>/dev/null | awk '{print \$4}' | head -n1 | cut -d/ -f1" \
</dev/null 2>/dev/null)
if [[ -n "$host_ip" && -n "${_SEEN_HOST[$host_ip]}" ]]; then
echo " Skipping $ip (same host as ${_SEEN_HOST[$host_ip]}, ETH_IF: $host_ip)"
continue
fi
[[ -n "$host_ip" ]] && _SEEN_HOST["$host_ip"]="$ip"
COPY_PEER_NODES+=("$ip")
echo " Found GB10 copy host: $ip"
done < <(sort "$temp_file")
rm -f "$temp_file"
fi
}
# Save discovered configuration to .env
# Skips if .env already exists unless FORCE_DISCOVER=true
save_config() {
local env_file="${CONFIG_FILE:-$SCRIPT_DIR/.env}"
# Skip if .env exists and not forced
if [[ -f "$env_file" && "${FORCE_DISCOVER:-false}" != "true" ]]; then
return 0
fi
echo ""
local save_prompt="Save discovered configuration to $env_file?"
if [[ -f "$env_file" ]]; then
save_prompt="Overwrite existing configuration in $env_file?"
fi
read -r -p "$save_prompt [Y/n]: " response
response="${response,,}"
if [[ "$response" =~ ^(n|no)$ ]]; then
return 0
fi
# Build list of all cluster nodes (local + peers)
local all_cluster_nodes=()
if [[ -n "$LOCAL_IP" ]]; then
all_cluster_nodes+=("$LOCAL_IP")
fi
for node in "${PEER_NODES[@]}"; do
all_cluster_nodes+=("$node")
done
# Per-node confirmation for CLUSTER_NODES
echo ""
echo "Select nodes for CLUSTER_NODES:"
local selected_cluster=()
for node in "${all_cluster_nodes[@]}"; do
local label="$node"
[[ "$node" == "$LOCAL_IP" ]] && label="$node (this machine)"
read -r -p " Include $label? [Y/n]: " r
r="${r,,}"
if [[ ! "$r" =~ ^(n|no)$ ]]; then
selected_cluster+=("$node")
fi
done
if [[ "${#selected_cluster[@]}" -eq 0 ]]; then
echo "No nodes selected. Aborting save."
return 1
fi
# Per-node confirmation for COPY_HOSTS
echo ""
echo "Select nodes for COPY_HOSTS (build/model distribution):"
local selected_copy=()
for node in "${COPY_PEER_NODES[@]}"; do
read -r -p " Include $node in COPY_HOSTS? [Y/n]: " r
r="${r,,}"
if [[ ! "$r" =~ ^(n|no)$ ]]; then
selected_copy+=("$node")
fi
done
# Write .env
{
echo "# Auto-generated by autodiscover.sh"
echo "CLUSTER_NODES=$(IFS=,; echo "${selected_cluster[*]}")"
if [[ "${#selected_copy[@]}" -gt 0 ]]; then
echo "COPY_HOSTS=$(IFS=,; echo "${selected_copy[*]}")"
fi
echo "LOCAL_IP=$LOCAL_IP"
echo "ETH_IF=$ETH_IF"
echo "IB_IF=$IB_IF"
if [[ "$MESH_MODE" == "true" ]]; then
echo "# Mesh mode NCCL settings"
echo "CONTAINER_NCCL_NET_PLUGIN=none"
echo "CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1"
echo "CONTAINER_NCCL_IB_MERGE_NICS=0"
fi
} > "$env_file"
echo ""
echo "Saved to $env_file"
}
# Convenience function: run full autodiscovery pipeline
run_autodiscover() {
detect_interfaces || return 1
detect_local_ip || return 1
detect_nodes || return 1
detect_copy_hosts || return 1
save_config
}

View File

@@ -6,36 +6,76 @@ START_TIME=$(date +%s)
# Default values
IMAGE_TAG="vllm-node"
IMAGE_TAG_SET=false
REBUILD_FLASHINFER=false
REBUILD_VLLM=false
COPY_HOSTS=()
COPY_TO_FLAG=false
SSH_USER="$USER"
NO_BUILD=false
VLLM_REF="main"
VLLM_REF_SET=false
FLASHINFER_REF="main"
FLASHINFER_REF_SET=false
TMP_IMAGE=""
PARALLEL_COPY=false
EXP_MXFP4=false
VLLM_REF_SET=false
VLLM_PRS=""
FLASHINFER_PRS=""
PRE_TRANSFORMERS=false
FULL_LOG=false
BUILD_JOBS="16"
GPU_ARCH_LIST="12.1a"
NETWORK_ARG=""
WHEELS_REPO="eugr/spark-vllm-docker"
FLASHINFER_RELEASE_TAG="prebuilt-flashinfer-current"
VLLM_RELEASE_TAG="prebuilt-vllm-current"
# Space-separated list of GPU architectures for which prebuilt wheels are available
PREBUILT_WHEELS_SUPPORTED_ARCHS="12.1a"
CLEANUP_MODE="false"
CONFIG_FILE=""
cleanup() {
if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
echo "Cleaning up temporary image $TMP_IMAGE"
rm -f "$TMP_IMAGE"
fi
rm -f ./build-metadata.yaml
}
trap cleanup EXIT
generate_build_metadata() {
local dockerfile="$1"
local vllm_version="$2"
local vllm_commit="$3"
local flashinfer_commit="$4"
local vllm_ref="$5"
local pre_transformers="$6"
local exp_mxfp4="$7"
local vllm_prs="$8"
local base_image
base_image=$(grep -m1 '^FROM .* AS runner' "$dockerfile" | awk '{print $2}')
cat > ./build-metadata.yaml <<EOF
build_date: $(date -u +"%Y-%m-%dT%H:%M:%SZ")
build_script_commit: $(git rev-parse HEAD 2>/dev/null || echo "unknown")
vllm_version: ${vllm_version:-unknown}
vllm_commit: ${vllm_commit:-unknown}
flashinfer_commit: ${flashinfer_commit:-unknown}
gpu_arch: ${GPU_ARCH_LIST}
base_image: ${base_image:-unknown}
build_args:
vllm_ref: ${vllm_ref}
transformers_5: ${pre_transformers}
exp_mxfp4: ${exp_mxfp4}
vllm_prs: "${vllm_prs}"
build_jobs: ${BUILD_JOBS}
EOF
echo "Generated build-metadata.yaml"
}
add_copy_hosts() {
local token part
for token in "$@"; do
@@ -66,7 +106,12 @@ copy_to_host() {
# try_download_wheels TAG PREFIX
# Downloads wheels matching PREFIX*.whl from a GitHub release.
# Skips files that are already present and up to date (by remote updated_at vs local mtime).
# Skip conditions (either is sufficient):
# 1. Commit hash in release name matches .wheels/.{PREFIX}_commit (primary check).
# 2. All local wheels are newer than the latest GitHub asset (freshly built).
# Only downloads a file when the remote asset is newer than the local copy AND
# the above skip conditions are not met.
# On success, persists the release commit hash to .wheels/.{PREFIX}_commit.
# Returns 0 if all matching wheels are now available, 1 on any error.
try_download_wheels() {
local TAG="$1"
@@ -92,7 +137,7 @@ try_download_wheels() {
local DOWNLOAD_LIST
DOWNLOAD_LIST=$(echo "$RELEASE_JSON" | python3 -c '
import json, sys, os
import json, sys, os, re
from datetime import datetime, timezone
wheels_dir, prefix = sys.argv[1], sys.argv[2]
@@ -104,6 +149,31 @@ if not assets:
print("No assets found matching prefix: " + prefix, file=sys.stderr)
sys.exit(1)
# Extract commit hash from the release name:
# FlashInfer: "Prebuilt FlashInfer Wheels (0.6.5-124a2d32-d20260305) - DGX Spark Only"
# vLLM: "Prebuilt vLLM Wheels (0.16.1rc1.dev296+ga73af584f.d20260305.cu131) - DGX Spark only"
release_name = data.get("name", "")
commit_hash = None
if prefix.startswith("flashinfer"):
m = re.search(r"\([\d.]+\w*-([0-9a-f]{6,})-d\d{8}\)", release_name, re.IGNORECASE)
if m:
commit_hash = m.group(1)
else:
m = re.search(r"\+g([0-9a-f]{6,})\.", release_name, re.IGNORECASE)
if m:
commit_hash = m.group(1)
# Compare against the locally stored commit hash
commit_file = os.path.join(wheels_dir, "." + prefix + "-commit")
local_commit = None
if os.path.exists(commit_file):
with open(commit_file) as f:
local_commit = f.read().strip()
if commit_hash and local_commit and local_commit[:len(commit_hash)] == commit_hash:
print("Commit hash matches (" + commit_hash + ") — wheels are up to date.", file=sys.stderr)
sys.exit(0)
newest_remote_ts = max(
datetime.strptime(a["updated_at"], "%Y-%m-%dT%H:%M:%SZ")
.replace(tzinfo=timezone.utc).timestamp()
@@ -119,12 +189,19 @@ local_wheels = [
if local_wheels and all(os.path.getmtime(p) >= newest_remote_ts for p in local_wheels):
sys.exit(0)
downloads = []
for a in assets:
local_path = os.path.join(wheels_dir, a["name"])
remote_ts = datetime.strptime(a["updated_at"], "%Y-%m-%dT%H:%M:%SZ") \
.replace(tzinfo=timezone.utc).timestamp()
if not os.path.exists(local_path) or remote_ts > os.path.getmtime(local_path):
print(a["browser_download_url"] + " " + a["name"])
downloads.append(a["browser_download_url"] + " " + a["name"])
if downloads:
if commit_hash:
print("#commit:" + commit_hash)
for d in downloads:
print(d)
' "$WHEELS_DIR" "$PREFIX") || return 1
if [ -z "$DOWNLOAD_LIST" ]; then
@@ -132,16 +209,36 @@ for a in assets:
return 0
fi
# Parse the optional '#commit:HASH' sentinel emitted by the Python script
local REMOTE_COMMIT=""
local DOWNLOAD_ENTRIES=""
while IFS= read -r LINE; do
if [[ "$LINE" == "#commit:"* ]]; then
REMOTE_COMMIT="${LINE#"#commit:"}"
elif [[ -n "$LINE" ]]; then
DOWNLOAD_ENTRIES+="$LINE"$'\n'
fi
done <<< "$DOWNLOAD_LIST"
if [ -z "$DOWNLOAD_ENTRIES" ]; then
echo "All $PREFIX wheels are up to date — skipping download."
return 0
fi
# Back up existing wheels so we never leave a mix of old and new on failure
local DL_BACKUP="$WHEELS_DIR/.backup-download-${PREFIX}"
rm -rf "$DL_BACKUP" && mkdir -p "$DL_BACKUP"
for f in "$WHEELS_DIR/${PREFIX}"*.whl; do
[ -f "$f" ] && mv "$f" "$DL_BACKUP/"
done
for f in "$WHEELS_DIR/.${PREFIX}"*; do
[ -f "$f" ] && mv "$f" "$DL_BACKUP/"
done
local URL NAME TMP_WHL
local DOWNLOADED=()
while IFS=' ' read -r URL NAME; do
[ -z "$URL" ] && continue
echo "Downloading $NAME..."
TMP_WHL=$(mktemp "$WHEELS_DIR/${NAME}.XXXXXX")
if curl -L --progress-bar --connect-timeout 30 "$URL" -o "$TMP_WHL"; then
@@ -154,24 +251,30 @@ for a in assets:
if compgen -G "$DL_BACKUP/${PREFIX}*.whl" > /dev/null 2>&1; then
echo "Restoring previous $PREFIX wheels..."
mv "$DL_BACKUP/${PREFIX}"*.whl "$WHEELS_DIR/"
mv "$DL_BACKUP/.${PREFIX}"* "$WHEELS_DIR/"
fi
rm -rf "$DL_BACKUP"
return 1
fi
done <<< "$DOWNLOAD_LIST"
done <<< "$DOWNLOAD_ENTRIES"
rm -rf "$DL_BACKUP"
if [ -n "$REMOTE_COMMIT" ]; then
echo "$REMOTE_COMMIT" > "$WHEELS_DIR/.${PREFIX}-commit"
echo "Recorded $PREFIX commit hash: $REMOTE_COMMIT"
fi
return 0
}
# Help function
usage() {
echo "Usage: $0 [OPTIONS]"
echo " -t, --tag <tag> : Image tag (default: 'vllm-node')"
echo " -t, --tag <tag> : Image tag (default: 'vllm-node', 'vllm-node-tf5' with --tf5, 'vllm-node-mxfp4' with --exp-mxfp4)"
echo " --gpu-arch <arch> : GPU architecture (default: '12.1a')"
echo " --rebuild-flashinfer : Force rebuild of FlashInfer wheels (ignore cached wheels)"
echo " --rebuild-vllm : Force rebuild of vLLM wheels (ignore cached wheels)"
echo " --vllm-ref <ref> : vLLM commit SHA, branch or tag (default: 'main')"
echo " --flashinfer-ref <ref> : FlashInfer commit SHA, branch or tag (default: 'main')"
echo " -c, --copy-to <hosts> : Host(s) to copy the image to. Accepts comma or space-delimited lists."
echo " --copy-to-host : Alias for --copy-to (backwards compatibility)."
echo " --copy-parallel : Copy to all hosts in parallel instead of serially."
@@ -180,47 +283,34 @@ usage() {
echo " --tf5 : Install transformers>=5 (aliases: --pre-tf, --pre-transformers)"
echo " --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support"
echo " --apply-vllm-pr <pr-num> : Apply a specific PR patch to vLLM source. Can be specified multiple times."
echo " --apply-flashinfer-pr <pr-num>: Apply a specific PR patch to FlashInfer source. Can be specified multiple times."
echo " --full-log : Enable full build logging (--progress=plain)"
echo " --no-build : Skip building, only copy image (requires --copy-to)"
echo " --network <network> : Docker network to use during build"
echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory"
echo " --config : Path to .env configuration file (default: .env in script directory)"
echo " --setup : Force autodiscovery and save configuration (even if .env exists)"
echo " -h, --help : Show this help message"
exit 1
}
# Argument parsing
# Parse all arguments
CONFIG_FILE_SET=false
while [[ "$#" -gt 0 ]]; do
case $1 in
-t|--tag) IMAGE_TAG="$2"; shift ;;
-t|--tag) IMAGE_TAG="$2"; IMAGE_TAG_SET=true; shift ;;
--gpu-arch) GPU_ARCH_LIST="$2"; shift ;;
--rebuild-flashinfer) REBUILD_FLASHINFER=true ;;
--rebuild-vllm) REBUILD_VLLM=true ;;
--vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;;
--flashinfer-ref) FLASHINFER_REF="$2"; FLASHINFER_REF_SET=true; shift ;;
-c|--copy-to|--copy-to-host|--copy-to-hosts)
COPY_TO_FLAG=true
shift
while [[ "$#" -gt 0 && "$1" != -* ]]; do
add_copy_hosts "$1"
shift
done
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "No hosts specified. Using autodiscovery..."
source "$(dirname "$0")/autodiscover.sh"
detect_nodes
if [ $? -ne 0 ]; then
echo "Error: Autodiscovery failed."
exit 1
fi
if [ ${#PEER_NODES[@]} -gt 0 ]; then
COPY_HOSTS=("${PEER_NODES[@]}")
fi
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "Error: Autodiscovery found no other nodes."
exit 1
fi
echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
fi
continue
;;
-j|--build-jobs) BUILD_JOBS="$2"; shift ;;
@@ -241,21 +331,100 @@ while [[ "$#" -gt 0 ]]; do
exit 1
fi
;;
--apply-flashinfer-pr)
if [ -n "$2" ] && [[ "$2" != -* ]]; then
if [ -n "$FLASHINFER_PRS" ]; then
FLASHINFER_PRS="$FLASHINFER_PRS $2"
else
FLASHINFER_PRS="$2"
fi
shift
else
echo "Error: --apply-flashinfer-pr requires a PR number."
exit 1
fi
;;
--full-log) FULL_LOG=true ;;
--no-build) NO_BUILD=true ;;
--cleanup) CLEANUP_MODE=true ;;
--network)
if [ -n "$2" ] && [[ "$2" != -* ]]; then
NETWORK_ARG="$2"
shift
else
echo "Error: --network requires a network name."
exit 1
fi
;;
--config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
--setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
-h|--help) usage ;;
*) echo "Unknown parameter passed: $1"; usage ;;
esac
shift
done
# Apply default IMAGE_TAG based on flags if -t was not specified
if [ "$IMAGE_TAG_SET" = false ]; then
if [ "$PRE_TRANSFORMERS" = true ]; then
IMAGE_TAG="vllm-node-tf5"
elif [ "$EXP_MXFP4" = true ]; then
IMAGE_TAG="vllm-node-mxfp4"
fi
fi
# Source autodiscover.sh to load .env file
source "$(dirname "$0")/autodiscover.sh"
# If --setup: force full autodiscovery and save configuration
if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
echo "Running full autodiscovery (--setup)..."
detect_interfaces || exit 1
detect_local_ip || exit 1
detect_nodes || exit 1
detect_copy_hosts || exit 1
save_config || exit 1
# Reload .env so DOTENV_* variables reflect saved config
load_env_if_exists
fi
# Handle COPY_HOSTS from .env or autodiscovery only if -c was explicitly specified
if [ "$COPY_TO_FLAG" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
else
echo "No hosts specified. Using autodiscovery..."
detect_interfaces || { echo "Error: Interface detection failed."; exit 1; }
detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; }
detect_nodes || { echo "Error: Node detection failed."; exit 1; }
detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; }
if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then
COPY_HOSTS=("${COPY_PEER_NODES[@]}")
fi
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "Error: Autodiscovery found no other nodes."
exit 1
fi
echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
fi
fi
# Validate flag combinations
if [ -n "$VLLM_PRS" ]; then
if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi
fi
if [ -n "$FLASHINFER_PRS" ]; then
if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-flashinfer-pr is incompatible with --exp-mxfp4"; exit 1; fi
fi
if [ "$EXP_MXFP4" = true ]; then
if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref"; exit 1; fi
if [ "$FLASHINFER_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --flashinfer-ref"; exit 1; fi
if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --tf5"; exit 1; fi
if [ "$REBUILD_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-flashinfer"; exit 1; fi
if [ "$REBUILD_VLLM" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-vllm"; exit 1; fi
@@ -267,6 +436,30 @@ if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
exit 1
fi
# Handle cleanup mode
if [[ "$CLEANUP_MODE" == "true" ]]; then
WHEELS_DIR="./wheels"
echo "Cleaning up wheels directory..."
# Remove all .whl files
if compgen -G "$WHEELS_DIR/*.whl" > /dev/null 2>&1; then
rm -f "$WHEELS_DIR"/*.whl
echo "Removed *.whl files from $WHEELS_DIR"
else
echo "No *.whl files found in $WHEELS_DIR"
fi
# Remove all .-commit files
if compgen -G "$WHEELS_DIR/.*-commit" > /dev/null 2>&1; then
rm -f "$WHEELS_DIR"/.*-commit
echo "Removed .*-commit files from $WHEELS_DIR"
else
echo "No .*-commit files found in $WHEELS_DIR"
fi
echo "Cleanup complete."
fi
# Ensure wheels directory exists
mkdir -p ./wheels
@@ -278,6 +471,9 @@ fi
COMMON_BUILD_FLAGS+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS")
COMMON_BUILD_FLAGS+=("--build-arg" "TORCH_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
COMMON_BUILD_FLAGS+=("--build-arg" "FLASHINFER_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
if [ -n "$NETWORK_ARG" ]; then
COMMON_BUILD_FLAGS+=("--network" "$NETWORK_ARG")
fi
# =====================================================
# Build image (unless --no-build or --exp-mxfp4)
@@ -289,6 +485,13 @@ RUNNER_BUILD_TIME=0
if [ "$NO_BUILD" = false ]; then
if [ "$EXP_MXFP4" = true ]; then
echo "Building with experimental MXFP4 support..."
# Generate build metadata YAML for mxfp4 build
MXFP4_VLLM_SHA=$(grep -m1 '^ARG VLLM_SHA=' Dockerfile.mxfp4 | cut -d= -f2)
MXFP4_FLASHINFER_SHA=$(grep -m1 '^ARG FLASHINFER_SHA=' Dockerfile.mxfp4 | cut -d= -f2)
generate_build_metadata Dockerfile.mxfp4 "unknown" "$MXFP4_VLLM_SHA" "$MXFP4_FLASHINFER_SHA" \
"mxfp4-pinned" "false" "true" ""
CMD=("docker" "build" "-t" "$IMAGE_TAG" "${COMMON_BUILD_FLAGS[@]}" "-f" "Dockerfile.mxfp4" ".")
echo "Building image with command: ${CMD[*]}"
BUILD_START=$(date +%s)
@@ -299,9 +502,21 @@ if [ "$NO_BUILD" = false ]; then
# ----------------------------------------------------------
# Phase 1: FlashInfer wheels
# ----------------------------------------------------------
if [ "$FLASHINFER_REF_SET" = true ] || [ -n "$FLASHINFER_PRS" ]; then
REBUILD_FLASHINFER=true
fi
BUILD_FLASHINFER=false
if [ "$REBUILD_FLASHINFER" = true ]; then
echo "Rebuilding FlashInfer wheels (--rebuild-flashinfer specified)..."
if [ "$FLASHINFER_REF_SET" = true ] && [ -n "$FLASHINFER_PRS" ]; then
echo "Rebuilding FlashInfer wheels (--flashinfer-ref and --apply-flashinfer-pr specified)..."
elif [ "$FLASHINFER_REF_SET" = true ]; then
echo "Rebuilding FlashInfer wheels (--flashinfer-ref specified)..."
elif [ -n "$FLASHINFER_PRS" ]; then
echo "Rebuilding FlashInfer wheels (--apply-flashinfer-pr specified)..."
else
echo "Rebuilding FlashInfer wheels (--rebuild-flashinfer specified)..."
fi
BUILD_FLASHINFER=true
elif try_download_wheels "$FLASHINFER_RELEASE_TAG" "flashinfer"; then
echo "FlashInfer wheels ready."
@@ -323,12 +538,18 @@ if [ "$NO_BUILD" = false ]; then
FI_CMD=("docker" "build"
"--target" "flashinfer-export"
"--output" "type=local,dest=./wheels"
"${COMMON_BUILD_FLAGS[@]}")
"${COMMON_BUILD_FLAGS[@]}"
"--build-arg" "FLASHINFER_REF=$FLASHINFER_REF")
if [ "$REBUILD_FLASHINFER" = true ]; then
FI_CMD+=("--build-arg" "CACHEBUST_FLASHINFER=$(date +%s)")
fi
if [ -n "$FLASHINFER_PRS" ]; then
echo "Applying FlashInfer PRs: $FLASHINFER_PRS"
FI_CMD+=("--build-arg" "FLASHINFER_PRS=$FLASHINFER_PRS")
fi
FI_CMD+=(".")
echo "FlashInfer build command: ${FI_CMD[*]}"
@@ -420,6 +641,15 @@ if [ "$NO_BUILD" = false ]; then
exit 1
fi
# Generate build metadata YAML
VLLM_VERSION=$(ls ./wheels/vllm-*.whl 2>/dev/null | head -1 | sed 's|.*/vllm-||;s|-.*||')
VLLM_COMMIT=""
[ -f "./wheels/.vllm-commit" ] && VLLM_COMMIT=$(cat ./wheels/.vllm-commit)
FLASHINFER_COMMIT=""
[ -f "./wheels/.flashinfer-commit" ] && FLASHINFER_COMMIT=$(cat ./wheels/.flashinfer-commit)
generate_build_metadata Dockerfile "$VLLM_VERSION" "$VLLM_COMMIT" "$FLASHINFER_COMMIT" \
"$VLLM_REF" "$PRE_TRANSFORMERS" "false" "$VLLM_PRS"
RUNNER_CMD=("docker" "build"
"-t" "$IMAGE_TAG"
"${COMMON_BUILD_FLAGS[@]}")

View File

@@ -42,13 +42,54 @@ However, in order to get full bandwidth in NCCL RDMA mode, we need to utilize **
Also, note that connecting two Sparks using **both** ports won't give you any noticeable advantage in bandwidth, so a single connection is sufficient.
If you connect 3 Sparks by daisy-chaining them, you will only be able to sustain 100G between each pair of Sparks.
## Connecting more than 2 Sparks in the cluster
## Connecting 3 Sparks in a mesh cluster without a switch
Three Sparks can be connected together in a cluster without using a separate RoCE switch.
However, all three Sparks need to be on the same wired network using their 10G Ethernet ports (RJ-45, not QSFP). Being on the same wireless network should work too, but it is not recommended and was not tested.
You need to make sure they are connected the following way: port 0 on one Spark should connect to port 1 on another Spark (unlike the non-mesh configuration).
Example diagram:
```mermaid
block-beta
columns 1
block:Spark3
columns 2
Title3["Spark 3"]:2
s3p0["Port 0<br>192.168.187.13<br>192.168.188.13"] s3p1["Port 1<br>192.168.197.13<br>192.168.198.13"]
end
space
block:Spark2
columns 2
Title2["Spark 2"]:2
s2p0["Port 0<br>192.168.197.12<br>192.168.198.12"] s2p1["Port 1<br>192.168.177.12<br>192.168.178.13"]
end
space
block:Spark1
columns 2
Title1["Spark 1"]:2
s1p0["Port 0<br>192.168.177.11<br>192.168.178.11"] s1p1["Port 1<br>192.168.187.11<br>192.168.188.11"]
end
s1p0 <--> s2p1
s2p0 <--> s3p1
s3p0 <--> s1p1
```
## Connecting more than 2 Sparks in the cluster using a switch
To connect more than 2 Sparks, you will need a proper switch, for example [MikroTik CRS812-DDQ](https://mikrotik.com/product/crs812_ddq).
Please refer to [this post](https://forums.developer.nvidia.com/t/6x-spark-setup/354399/56) for an example of setting up a 6-8 node Spark cluster.
## Network setup
### For dual Sparks or multiple Sparks using a QSFP switch
Assuming both are connected using the rightmost QSFP port (when looking from the back).
Create `/etc/netplan/40-cx7.yaml` on `spark`:
@@ -58,15 +99,16 @@ network:
ethernets:
enp1s0f1np1:
dhcp4: no
dhcp6: no # Explicitly disable DHCPv6
link-local: [ ipv4 ] # Restrict link-local addresses to IPv4 only
dhcp6: no # Explicitly disable DHCPv6
link-local: [] # Restrict link-local addresses to static IPv4 only
mtu: 9000
addresses: [192.168.177.11/24]
enP2p1s0f1np1:
dhcp4: no
dhcp6: no
link-local: [ ipv4 ]
link-local: []
mtu: 9000
addresses: [192.168.178.11/24]
```
Create `/etc/netplan/40-cx7.yaml` on `spark2`:
@@ -76,23 +118,19 @@ network:
ethernets:
enp1s0f1np1:
dhcp4: no
dhcp6: no # Explicitly disable DHCPv6
link-local: [ ipv4 ] # Restrict link-local addresses to IPv4 only
dhcp6: no # Explicitly disable DHCPv6
link-local: [] # Restrict link-local addresses to static IPv4 only
mtu: 9000
addresses: [192.168.177.12/24]
enP2p1s0f1np1:
dhcp4: no
dhcp6: no
link-local: [ ipv4 ]
link-local: []
mtu: 9000
addresses: [192.168.178.12/24]
```
Please note that only one interface of the "twin" pair needs an IP address, but MTU needs to be set on both.
You can also assign a separate address to another "twin" if you want to utilize the second interface independently, but make sure you assign an IP address from a different subnet.
For instance, for the example above, if you want to assign an IP to `enP2p1s0f1np1`, you need to use `192.168.177.12` on `spark`. **DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing.
This will not affect vLLM performance as it will use RDMA over RoCE using both "twins", even if the IP is only set on one.
**DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing.
Then run on each node:
@@ -115,6 +153,122 @@ MTU setting (testing):
sudo ip link set dev enp1s0f1np1 mtu 9000
```
### For 3-node mesh
A 3-node mesh is configured differently from dual clusters or clusters using a QSFP switch.
Assuming your Sparks are connected according to the diagram above:
Create `/etc/netplan/40-cx7.yaml` on `spark1`:
```yaml
network:
version: 2
ethernets:
enp1s0f0np0:
dhcp4: no
dhcp6: no # Explicitly disable DHCPv6
link-local: [] # Restrict link-local addresses to static IPv4 only
mtu: 9000
addresses: [192.168.177.11/24]
enP2p1s0f0np0:
dhcp4: no
dhcp6: no
link-local: []
mtu: 9000
addresses: [192.168.178.11/24]
enp1s0f1np1:
dhcp4: no
dhcp6: no # Explicitly disable DHCPv6
link-local: [] # Restrict link-local addresses to static IPv4 only
mtu: 9000
addresses: [192.168.187.11/24]
enP2p1s0f1np1:
dhcp4: no
dhcp6: no
link-local: []
mtu: 9000
addresses: [192.168.188.11/24]
```
Create `/etc/netplan/40-cx7.yaml` on `spark2`:
```yaml
network:
version: 2
ethernets:
enp1s0f0np0:
dhcp4: no
dhcp6: no # Explicitly disable DHCPv6
link-local: [] # Restrict link-local addresses to static IPv4 only
mtu: 9000
addresses: [192.168.197.12/24]
enP2p1s0f0np0:
dhcp4: no
dhcp6: no
link-local: []
mtu: 9000
addresses: [192.168.198.12/24]
enp1s0f1np1:
dhcp4: no
dhcp6: no # Explicitly disable DHCPv6
link-local: [] # Restrict link-local addresses to static IPv4 only
mtu: 9000
addresses: [192.168.177.12/24]
enP2p1s0f1np1:
dhcp4: no
dhcp6: no
link-local: []
mtu: 9000
addresses: [192.168.178.12/24]
```
Create `/etc/netplan/40-cx7.yaml` on `spark3`:
```yaml
network:
version: 2
ethernets:
enp1s0f0np0:
dhcp4: no
dhcp6: no # Explicitly disable DHCPv6
link-local: [] # Restrict link-local addresses to static IPv4 only
mtu: 9000
addresses: [192.168.187.13/24]
enP2p1s0f0np0:
dhcp4: no
dhcp6: no
link-local: []
mtu: 9000
addresses: [192.168.188.13/24]
enp1s0f1np1:
dhcp4: no
dhcp6: no # Explicitly disable DHCPv6
link-local: [] # Restrict link-local addresses to static IPv4 only
mtu: 9000
addresses: [192.168.197.13/24]
enP2p1s0f1np1:
dhcp4: no
dhcp6: no
link-local: []
mtu: 9000
addresses: [192.168.198.13/24]
```
Then run (on each Spark):
```bash
sudo chmod 600 /etc/netplan/40-cx7.yaml
sudo netplan apply
```
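To confirm the addresses and jumbo MTU are in place on each node, a quick sanity check with standard iproute2 commands (repeat for each CX7 interface):
```bash
# Show assigned IPs and confirm MTU 9000 on a CX7 port
ip -br addr show enp1s0f0np0
ip link show enp1s0f0np0 | grep mtu
```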
### Passwordless SSH and benchmarks
Set up passwordless SSH. On the first Spark:
```bash
wget https://raw.githubusercontent.com/NVIDIA/dgx-spark-playbooks/refs/heads/main/nvidia/connect-two-sparks/assets/discover-sparks
chmod +x discover-sparks
./discover-sparks
```
**Benchmark connection (use perftest package):**
Run the receiver on the `spark2` node:
@@ -196,7 +350,9 @@ ib_write_lat 192.168.177.12 -d rocep1s0f1 --report_gbits -R --force-link IB
---------------------------------------------------------------------------------------
```
## NCCL Setup
## NCCL Tests
### Dual Sparks or Sparks via QSFP switch
From https://build.nvidia.com/spark/nccl/stacked-sparks
@@ -239,4 +395,52 @@ mpirun -np 2 -H 192.168.177.11:1,192.168.177.12:1 \
-x LD_LIBRARY_PATH=$LD_LIBRARY_PATH \
$HOME/nccl-tests/build/all_gather_perf -b 16G -e 16G -f 2
```
### 3-node mesh
```bash
# Install dependencies and build NCCL
sudo apt-get update && sudo apt-get install -y libopenmpi-dev
git clone -b dgxspark-3node-ring https://github.com/zyang-dev/nccl.git ~/nccl/
cd ~/nccl/
make -j src.build NVCC_GENCODE="-gencode=arch=compute_121,code=sm_121"
# Set environment variables
export CUDA_HOME="/usr/local/cuda"
export MPI_HOME="/usr/lib/aarch64-linux-gnu/openmpi"
export NCCL_HOME="$HOME/nccl/build/"
export LD_LIBRARY_PATH="$NCCL_HOME/lib:$CUDA_HOME/lib64/:$MPI_HOME/lib:$LD_LIBRARY_PATH"
```
Build NCCL Test Suite:
```bash
# Clone and build NCCL tests
git clone https://github.com/NVIDIA/nccl-tests.git ~/nccl-tests/
cd ~/nccl-tests/
make MPI=1
```
Test on all three nodes (replace spark1, spark2, spark3 with the actual hostnames or IP addresses on the non-QSFP interface):
```bash
# Set environment variables
export CUDA_HOME="/usr/local/cuda"
export MPI_HOME="/usr/lib/aarch64-linux-gnu/openmpi"
export NCCL_HOME="$HOME/nccl_spark_cluster/build/"
export LD_LIBRARY_PATH="$NCCL_HOME/lib:$CUDA_HOME/lib64/:$MPI_HOME/lib:$LD_LIBRARY_PATH"
# For the 3-node mesh we have to use the 10G interface for OOB communication!
export UCX_NET_DEVICES=enP7s7
export NCCL_SOCKET_IFNAME=enP7s7
export OMPI_MCA_btl_tcp_if_include=enP7s7
export NCCL_IB_HCA=rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1
export NCCL_IB_DISABLE=0
# Run the all_gather performance test across all three nodes
mpirun -np 3 -H spark1:1,spark2:1,spark3:1 \
--mca plm_rsh_agent "ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" \
-x LD_LIBRARY_PATH=$LD_LIBRARY_PATH -x NCCL_IB_MERGE_NICS=0 -x NCCL_NET_PLUGIN=none -x NCCL_IB_SUBNET_AWARE_ROUTING=1 \
$HOME/nccl-tests/build/all_gather_perf -b 16G -e 16G -f 3
```

View File

@@ -7,6 +7,8 @@ HUB_PATH="${HF_HOME:-$HOME/.cache/huggingface}/hub"
COPY_HOSTS=()
SSH_USER="$USER"
PARALLEL_COPY=false
CONFIG_FILE=""
CONFIG_FILE_SET=false
# Help function
usage() {
@@ -16,6 +18,7 @@ usage() {
echo " --copy-to-host : Alias for --copy-to (backwards compatibility)."
echo " --copy-parallel : Copy to all hosts in parallel instead of serially."
echo " -u, --user <user> : Username for ssh commands (default: \$USER)"
echo " --config <file> : Path to .env configuration file (default: .env in script directory)"
echo " -h, --help : Show this help message"
exit 1
}
@@ -37,11 +40,11 @@ copy_model_to_host() {
local host="$1"
local model_name="$2"
local model_dir="$3"
echo "Copying model '$model_name' to ${SSH_USER}@${host}..."
local host_copy_start host_copy_end host_copy_time
host_copy_start=$(date +%s)
if rsync -av --mkpath --progress "$model_dir" "${SSH_USER}@${host}:$HUB_PATH/"; then
host_copy_end=$(date +%s)
host_copy_time=$((host_copy_end - host_copy_start))
@@ -53,44 +56,24 @@ copy_model_to_host() {
}
# Argument parsing
COPY_TO_FLAG=false
while [[ "$#" -gt 0 ]]; do
case $1 in
-c|--copy-to|--copy-to-host|--copy-to-hosts)
COPY_TO_FLAG=true
shift
# Consume arguments until the next flag or end of args
while [[ "$#" -gt 0 && "$1" != -* ]]; do
add_copy_hosts "$1"
shift
done
# If no hosts specified, use autodiscovery
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "No hosts specified. Using autodiscovery..."
source "$(dirname "$0")/autodiscover.sh"
detect_nodes
if [ $? -ne 0 ]; then
echo "Error: Autodiscovery failed."
exit 1
fi
# Use PEER_NODES directly
if [ ${#PEER_NODES[@]} -gt 0 ]; then
COPY_HOSTS=("${PEER_NODES[@]}")
fi
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "Error: Autodiscovery found no other nodes."
exit 1
fi
echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
fi
continue
;;
--copy-parallel) PARALLEL_COPY=true ;;
-u|--user) SSH_USER="$2"; shift ;;
--config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
-h|--help) usage ;;
*)
*)
# If positional argument is provided
if [ -z "${MODEL_NAME:-}" ]; then
MODEL_NAME="$1"
@@ -103,12 +86,47 @@ while [[ "$#" -gt 0 ]]; do
shift
done
# Export config so autodiscover.sh picks it up
export CONFIG_FILE CONFIG_FILE_SET
# Source autodiscover.sh to load .env (for DOTENV_COPY_HOSTS) and make detection functions available
source "$(dirname "$0")/autodiscover.sh"
# Validate model name is provided
if [ -z "${MODEL_NAME:-}" ]; then
echo "Error: Model name is required."
usage
fi
# Resolve COPY_HOSTS if --copy-to was given without hosts, or use .env
if [ "$COPY_TO_FLAG" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
# --copy-to was specified but no hosts given: use .env or autodiscover
if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
else
echo "No hosts specified. Using autodiscovery..."
detect_interfaces || { echo "Error: Interface detection failed."; exit 1; }
detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; }
detect_nodes || { echo "Error: Node detection failed."; exit 1; }
detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; }
if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then
COPY_HOSTS=("${COPY_PEER_NODES[@]}")
fi
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "Error: Autodiscovery found no other nodes."
exit 1
fi
echo "Autodiscovered copy hosts: ${COPY_HOSTS[*]}"
fi
elif [ "$COPY_TO_FLAG" = false ] && [ "${#COPY_HOSTS[@]}" -eq 0 ] && [[ -n "$DOTENV_COPY_HOSTS" ]]; then
# No --copy-to flag but .env has COPY_HOSTS — don't auto-copy; user must request it explicitly
: # intentional no-op; user didn't ask for copy
fi
# Check if uvx is installed
if ! command -v uvx &> /dev/null; then
echo "Error: 'uvx' command not found."
@@ -231,4 +249,4 @@ if [ "$COPY_TIME" -gt 0 ]; then
fi
echo "Total: $(printf '%02d:%02d:%02d' $((TOTAL_TIME/3600)) $((TOTAL_TIME%3600/60)) $((TOTAL_TIME%60)))"
echo "========================================="
echo "Done downloading $MODEL_NAME."
echo "Done downloading $MODEL_NAME."

View File

@@ -16,6 +16,7 @@ fi
ETH_IF=""
IB_IF=""
NCCL_DEBUG_VAL=""
MASTER_PORT="29501"
# Initialize variables
NODES_ARG=""
@@ -23,15 +24,18 @@ CONTAINER_NAME="$DEFAULT_CONTAINER_NAME"
COMMAND_TO_RUN=""
DAEMON_MODE="false"
CHECK_CONFIG="false"
ACTION="start"
ACTION=""
CLUSTER_WAS_RUNNING="false"
MOD_PATHS=()
MOD_TYPES=()
LAUNCH_SCRIPT_PATH=""
SCRIPT_DIR="$(dirname "$(realpath "$0")")"
CONFIG_FILE="" # Will be set to default after argument parsing
ACTIONS_ARG=""
SOLO_MODE="false"
NO_RAY_MODE="false"
LAUNCH_SCRIPT_MODE="false"
MOUNT_CACHE_DIRS="true"
BUILD_JOBS=""
NON_PRIVILEGED_MODE="false"
@@ -55,6 +59,8 @@ usage() {
echo " --launch-script Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted."
echo " --check-config Check configuration and auto-detection without launching"
echo " --solo Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster"
echo " --master-port Port for cluster coordination: Ray head port or PyTorch distributed master port (default: 29501)"
echo " --no-ray No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)"
echo " --no-cache-dirs Do not mount default cache directories (~/.cache/vllm, ~/.cache/flashinfer, ~/.triton)"
echo " -d Daemon mode (only for 'start' action)"
echo " --non-privileged Run in non-privileged mode (removes --privileged and --ipc=host)"
@@ -62,9 +68,31 @@ usage() {
echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)"
echo " --pids-limit Process limit (default: 4096, only with --non-privileged)"
echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)"
echo " --config Path to .env configuration file (default: .env in script directory)
--setup/--discover Force autodiscovery and save configuration (even if .env exists)"
echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script."
echo " command Command to run (only for 'exec' action). Not compatible with --launch-script."
echo ""
echo "Supported .env file variables:"
echo " CLUSTER_NODES Comma-separated list of node IPs"
echo " ETH_IF Ethernet interface name"
echo " IB_IF InfiniBand interface name"
echo " MASTER_PORT Port for cluster coordination (default: 29501)"
echo " CONTAINER_NAME Container name (default: vllm_node)"
echo " LOCAL_IP Local IP address (for solo mode or override auto-detection)"
echo " CONTAINER_* Any variable starting with CONTAINER_ (except CONTAINER_NAME)"
echo " becomes -e flag. Example: CONTAINER_NCCL_DEBUG=INFO -> -e NCCL_DEBUG=INFO"
echo ""
echo "Example .env file:"
echo " CLUSTER_NODES=192.168.1.1,192.168.1.2"
echo " ETH_IF=eth0"
echo " IB_IF=ib0"
echo " MASTER_PORT=29501"
echo " CONTAINER_NAME=vllm_node"
echo " LOCAL_IP=192.168.1.1"
echo " CONTAINER_NCCL_DEBUG=INFO"
echo " CONTAINER_HF_TOKEN=abc123"
echo ""
echo "Launch Script Usage:"
echo " $0 --launch-script examples/my-script.sh # Script copied to container and executed"
echo " $0 --launch-script /path/to/script.sh # Uses absolute path to script"
@@ -91,8 +119,10 @@ while [[ "$#" -gt 0 ]]; do
NCCL_DEBUG_VAL="INFO"
fi
;;
--master-port|--head-port) MASTER_PORT="$2"; shift ;;
--check-config) CHECK_CONFIG="true" ;;
--solo) SOLO_MODE="true" ;;
--no-ray) NO_RAY_MODE="true" ;;
--no-cache-dirs) MOUNT_CACHE_DIRS="false" ;;
--non-privileged) NON_PRIVILEGED_MODE="true" ;;
--mem-limit-gb) MEM_LIMIT_GB="$2"; shift ;;
@@ -101,6 +131,8 @@ while [[ "$#" -gt 0 ]]; do
--shm-size-gb) SHM_SIZE_GB="$2"; shift ;;
-d) DAEMON_MODE="true" ;;
-h|--help) usage ;;
--config) CONFIG_FILE="$2"; shift ;;
--setup|--discover) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
start|stop|status)
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
@@ -115,7 +147,7 @@ while [[ "$#" -gt 0 ]]; do
fi
ACTION="exec"
shift
COMMAND_TO_RUN="$@"
COMMAND_TO_RUN=$(printf "%q " "$@")
break
;;
*)
@@ -126,6 +158,115 @@ while [[ "$#" -gt 0 ]]; do
shift
done
# Set .env file path (use default if not specified)
if [[ -z "$CONFIG_FILE" ]]; then
CONFIG_FILE="$SCRIPT_DIR/.env"
CONFIG_FILE_SET=false
else
CONFIG_FILE_SET=true
fi
# Load .env file
if [[ -f "$CONFIG_FILE" ]]; then
echo "Loading configuration from .env file..."
# Validate .env file syntax
if ! python3 -c "
import sys
import re
env_file = '$CONFIG_FILE'
seen_keys = set()
with open(env_file, 'r') as f:
for line_num, line in enumerate(f, 1):
line = line.strip()
# Skip empty lines and comments
if not line or line.startswith('#'):
continue
# Check for key=value format
if '=' not in line:
print(f'Error: Invalid syntax at line {line_num}: missing \"=\"')
sys.exit(1)
key = line.split('=', 1)[0].strip()
# Validate key format (alphanumeric + underscore)
if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', key):
print(f'Error: Invalid key format at line {line_num}: {key}')
sys.exit(1)
# Check for duplicates
if key in seen_keys:
print(f'Error: Duplicate key at line {line_num}: {key}')
sys.exit(1)
seen_keys.add(key)
sys.exit(0)
" 2>/dev/null; then
echo "Error: Invalid .env file syntax. Aborting."
exit 1
fi
# Load .env variables with DOTENV_ prefix
while IFS='=' read -r key value || [[ -n "$key" ]]; do
# Skip comments and empty lines
[[ "$key" =~ ^[[:space:]]*# ]] && continue
[[ -z "$key" ]] && continue
# Remove leading/trailing whitespace from key
key=$(echo "$key" | xargs)
# Skip if key is empty after trimming
[[ -z "$key" ]] && continue
# Remove quotes and whitespace from value using Python for proper shlex handling
value=$(python3 -c "
import shlex
import sys
value = '''$value'''
# Strip whitespace
value = value.strip()
# Remove surrounding quotes if present
if (value.startswith('\"') and value.endswith('\"')) or (value.startswith(\"'\" ) and value.endswith(\"'\")):
value = value[1:-1]
print(value)
")
# Export with DOTENV_ prefix
export "DOTENV_$key=$value"
done < "$CONFIG_FILE"
echo "Loaded .env variables: $(compgen -v DOTENV_ | tr '\n' ' ')"
fi
# Apply .env configuration (CLI args take precedence)
if [[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]]; then
NODES_ARG="$DOTENV_CLUSTER_NODES"
fi
if [[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]]; then
ETH_IF="$DOTENV_ETH_IF"
fi
if [[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]]; then
IB_IF="$DOTENV_IB_IF"
fi
if [[ -z "$MASTER_PORT" || "$MASTER_PORT" == "29501" ]] && [[ -n "$DOTENV_MASTER_PORT" ]]; then
MASTER_PORT="$DOTENV_MASTER_PORT"
fi
if [[ -z "$CONTAINER_NAME" || "$CONTAINER_NAME" == "vllm_node" ]] && [[ -n "$DOTENV_CONTAINER_NAME" ]]; then
CONTAINER_NAME="$DOTENV_CONTAINER_NAME"
fi
if [[ -n "$DOTENV_LOCAL_IP" ]]; then
export LOCAL_IP="$DOTENV_LOCAL_IP"
fi
# Validate non-privileged mode flags
if [[ "$NON_PRIVILEGED_MODE" == "true" ]]; then
# Set default swap limit if not specified
@@ -156,6 +297,26 @@ if [[ -n "$NCCL_DEBUG_VAL" ]]; then
esac
fi
# Add container environment variables from .env (CONTAINER_* pattern)
# Excludes CONTAINER_NAME which is a configuration variable, not an env var
for env_var in $(compgen -v DOTENV_CONTAINER_); do
# Skip CONTAINER_NAME as it's a configuration variable
[[ "$env_var" == "DOTENV_CONTAINER_NAME" ]] && continue
# Get the value
value="${!env_var}"
# Extract the actual env var name (remove DOTENV_CONTAINER_ prefix)
actual_var="${env_var#DOTENV_CONTAINER_}"
# Properly escape the value for shell using Python
escaped_value=$(python3 -c "import shlex; print(shlex.quote('$value'))")
# Add to docker args
DOCKER_ARGS="$DOCKER_ARGS -e $actual_var=$escaped_value"
echo "Adding container env: $actual_var"
done
# Add build job parallelization environment variables if BUILD_JOBS is set
if [[ -n "$BUILD_JOBS" ]]; then
DOCKER_ARGS="$DOCKER_ARGS -e MAX_JOBS=$BUILD_JOBS"
@@ -204,9 +365,10 @@ if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
# Set command to run the copied script (use absolute path since docker exec may not be in /workspace)
COMMAND_TO_RUN="/workspace/exec-script.sh"
LAUNCH_SCRIPT_MODE="true"
# If launch script is specified, default action to exec unless explicitly set to stop/status
if [[ "$ACTION" == "start" ]]; then
if [[ -z "$ACTION" || "$ACTION" == "start" ]]; then
ACTION="exec"
fi
fi
@@ -251,13 +413,33 @@ done
# Source autodiscover module
source "$(dirname "$0")/autodiscover.sh"
if [[ "$SOLO_MODE" == "true" ]]; then
if [[ -n "$NODES_ARG" ]]; then
echo "Error: --solo is incompatible with -n/--nodes."
exit 1
if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
# --setup: force full autodiscovery and save configuration
echo "Running full autodiscovery (--setup)..."
# Clear pre-loaded values so detect functions run fresh instead of short-circuiting
ETH_IF="" IB_IF="" NODES_ARG="" LOCAL_IP=""
detect_interfaces || exit 1
detect_local_ip || exit 1
detect_nodes || exit 1
detect_copy_hosts || exit 1
save_config || exit 1
# Reload .env so DOTENV_* variables reflect saved config
load_env_if_exists
[[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]] && NODES_ARG="$DOTENV_CLUSTER_NODES"
[[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]] && ETH_IF="$DOTENV_ETH_IF"
[[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]] && IB_IF="$DOTENV_IB_IF"
# If no action was specified, setup was the only intent — exit cleanly
if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" ]]; then
exit 0
fi
fi
if [[ "$SOLO_MODE" == "true" ]]; then
# Solo mode: skip node detection, just get local IP
LOCAL_IP="127.0.0.1"
# Use LOCAL_IP from .env if set, otherwise default to 127.0.0.1
if [[ -z "$LOCAL_IP" ]]; then
LOCAL_IP="127.0.0.1"
fi
NODES_ARG="$LOCAL_IP"
PEER_NODES=()
echo "Solo mode enabled. Skipping node detection."
@@ -303,6 +485,11 @@ if [[ "$SOLO_MODE" == "false" && ${#PEER_NODES[@]} -eq 0 ]]; then
SOLO_MODE="true"
fi
if [[ "$NO_RAY_MODE" == "true" && "$SOLO_MODE" == "true" ]]; then
echo "Warning: Only one node detected; --no-ray has no effect in solo mode. Proceeding normally."
NO_RAY_MODE="false"
fi
echo "Head Node: $HEAD_IP"
echo "Worker Nodes: ${PEER_NODES[*]}"
echo "Container Name: $CONTAINER_NAME"
@@ -324,6 +511,12 @@ if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ]
fi
fi
if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" && "$CHECK_CONFIG" != "true" ]]; then
echo "Error: No action specified. Use: start | stop | status | exec"
usage
exit 1
fi
if [[ "$CHECK_CONFIG" == "true" ]]; then
echo "Configuration Check Complete."
echo " Image Name: $IMAGE_NAME"
@@ -377,9 +570,11 @@ if [[ "$ACTION" == "status" ]]; then
# Check Head
if docker ps | grep -q "$CONTAINER_NAME"; then
echo "[HEAD] $HEAD_IP: Container '$CONTAINER_NAME' is RUNNING."
echo "--- Ray Status ---"
docker exec "$CONTAINER_NAME" ray status || echo "Failed to get ray status."
echo "------------------"
if [[ "$NO_RAY_MODE" == "false" ]]; then
echo "--- Ray Status ---"
docker exec "$CONTAINER_NAME" ray status || echo "Failed to get ray status."
echo "------------------"
fi
else
echo "[HEAD] $HEAD_IP: Container '$CONTAINER_NAME' is NOT running."
fi
@@ -537,23 +732,109 @@ apply_mod_to_container() {
fi
}
# Copy Launch Script to Container Function
copy_launch_script_to_container() {
local container="$1"
local script_path="$2"
# Parse -tp/-pp/-dp (and long forms) from a text string (command or script content).
# Sets TP_SIZE, PP_SIZE, DP_SIZE, PARALLELISM_FOUND globals.
# Only acts when at least one parallelism flag is present.
parse_parallelism_from_text() {
local text="$1"
TP_SIZE=1; PP_SIZE=1; DP_SIZE=1
PARALLELISM_FOUND=false
echo "Copying launch script to head node..."
# Normalize --flag=value to --flag value for uniform word-by-word parsing
local normalized
normalized=$(echo "$text" | sed 's/\(--[a-z-]*\)=/\1 /g')
local target_script_path="$script_path"
local prev=""
for word in $normalized; do
case "$prev" in
-tp|--tensor-parallel-size)
[[ "$word" =~ ^[0-9]+$ ]] && TP_SIZE="$word" && PARALLELISM_FOUND=true ;;
-pp|--pipeline-parallel-size)
[[ "$word" =~ ^[0-9]+$ ]] && PP_SIZE="$word" && PARALLELISM_FOUND=true ;;
-dp|--data-parallel-size)
[[ "$word" =~ ^[0-9]+$ ]] && DP_SIZE="$word" && PARALLELISM_FOUND=true ;;
esac
prev="$word"
done
}
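# Illustrative example (hypothetical command text, not executed by this script):
#   parse_parallelism_from_text 'vllm serve my-model --tensor-parallel-size 2 -pp 2 --data-parallel-size=1'
# normalizes the "=" form, walks the words pairwise, and leaves
#   TP_SIZE=2  PP_SIZE=2  DP_SIZE=1  PARALLELISM_FOUND=true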
# Copy script into container as /workspace/exec-script.sh
echo " Copying script into container..."
docker cp "$target_script_path" "$container:/workspace/exec-script.sh"
# Build a patched copy of the launch script on the host for a specific node.
# Strips --distributed-executor-backend and appends multi-node args.
# Prints the path of the temp file (caller must delete it).
make_node_script() {
local script_path="$1"; local nnodes="$2"; local node_rank="$3"; local master_addr="$4"
local extra="--nnodes $nnodes --node-rank $node_rank --master-addr $master_addr --master-port $MASTER_PORT"
[[ "$node_rank" -gt 0 ]] && extra="$extra --headless"
# Make executable
local tmp; tmp=$(mktemp /tmp/vllm_node_script_XXXXXX.sh)
# Remove just the flag and its value (not the whole line), then filter empty/backslash-only lines
sed 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//' "$script_path" | \
grep -Ev '^[[:space:]\\]*$' > "$tmp"
# Strip trailing backslash from last line before appending multi-node args
sed -i "$ s/[[:space:]]*\\\\[[:space:]]*$//" "$tmp"
sed -i "$ s/$/ $extra/" "$tmp"
chmod +x "$tmp"
echo "$tmp"
}
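# Sketch of the transformation (assumed 2-node cluster, HEAD_IP=192.168.1.1, MASTER_PORT=29501,
# model name hypothetical): a launch script whose last line reads
#   vllm serve my-model --distributed-executor-backend ray -tp 2
# is rewritten by `make_node_script "$LAUNCH_SCRIPT_PATH" 2 1 192.168.1.1` into a temp copy ending in
#   vllm serve my-model -tp 2 --nnodes 2 --node-rank 1 --master-addr 192.168.1.1 --master-port 29501 --headless
# (rank 0 gets the same line without --headless).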
# Copy a script file into a local container as /workspace/exec-script.sh
copy_script_to_container() {
local container="$1"; local script_path="$2"; local label="${3:-node}"
echo "Copying launch script to $label..."
docker cp "$script_path" "$container:/workspace/exec-script.sh" || { echo "Error: docker cp to $label failed"; exit 1; }
docker exec "$container" chmod +x /workspace/exec-script.sh
}
echo " Launch script copied to head node"
# Copy a script file to a remote container via scp + docker cp
copy_script_to_worker() {
local worker_ip="$1"; local container="$2"; local script_path="$3"
echo "Copying launch script to worker $worker_ip..."
local remote_tmp="/tmp/vllm_script_$(date +%s)_$RANDOM.sh"
scp -o BatchMode=yes -o StrictHostKeyChecking=no "$script_path" "$worker_ip:$remote_tmp" || { echo "Error: scp to $worker_ip failed"; exit 1; }
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker_ip" \
"docker cp $remote_tmp $container:/workspace/exec-script.sh && \
docker exec $container chmod +x /workspace/exec-script.sh && \
rm -f $remote_tmp" || { echo "Error: docker cp to worker $worker_ip failed"; exit 1; }
}
# Build -e KEY=VALUE flags for a given node IP (used in docker run and docker exec)
get_env_flags() {
local node_ip="$1"
printf -- '-e %s ' \
"VLLM_HOST_IP=$node_ip" \
"RAY_NODE_IP_ADDRESS=$node_ip" \
"RAY_OVERRIDE_NODE_IP_ADDRESS=$node_ip" \
"MN_IF_NAME=$ETH_IF" \
"UCX_NET_DEVICES=$ETH_IF" \
"NCCL_SOCKET_IFNAME=$ETH_IF" \
"NCCL_IB_HCA=$IB_IF" \
"NCCL_IB_DISABLE=0" \
"OMPI_MCA_btl_tcp_if_include=$ETH_IF" \
"GLOO_SOCKET_IFNAME=$ETH_IF" \
"TP_SOCKET_IFNAME=$ETH_IF" \
"RAY_memory_monitor_refresh_ms=0" \
"RAY_num_prestart_python_workers=0" \
"RAY_object_store_memory=1073741824"
}
# Start Ray head node inside the container
start_ray_head() {
local container="$1"
echo "Starting Ray HEAD node on $HEAD_IP..."
docker exec -d "$container" bash -c \
"ray start --block --head --port $MASTER_PORT --object-store-memory 1073741824 --num-cpus 2 \
--node-ip-address $HEAD_IP --include-dashboard=false --disable-usage-stats \
>> /proc/1/fd/1 2>&1"
}
# Start Ray worker node inside the container on a remote host
start_ray_worker() {
local worker_ip="$1"; local container="$2"
echo "Starting Ray WORKER node on $worker_ip..."
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker_ip" \
"docker exec -d $container bash -c \
'ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats \
--address=$HEAD_IP:$MASTER_PORT --node-ip-address $worker_ip >> /proc/1/fd/1 2>&1'"
}
# Start Cluster Function
@@ -564,31 +845,6 @@ start_cluster() {
return
fi
# Start Head Node
echo "Starting Head Node on $HEAD_IP..."
# Ensure cache dirs exist on head
if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then
for dir in "${CACHE_DIRS_TO_CREATE[@]}"; do
mkdir -p "$dir"
done
fi
local head_cmd_args=()
if [[ "$SOLO_MODE" == "true" ]]; then
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
head_cmd_args=(bash -c "echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting container...; exec sleep infinity")
else
head_cmd_args=(sleep infinity)
fi
else
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
head_cmd_args=(bash -c "echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting node...; exec ./run-cluster-node.sh --role head --host-ip $HEAD_IP --eth-if $ETH_IF --ib-if $IB_IF")
else
head_cmd_args=(./run-cluster-node.sh --role head --host-ip "$HEAD_IP" --eth-if "$ETH_IF" --ib-if "$IB_IF")
fi
fi
# Build docker run arguments based on mode
local docker_args_common="--gpus all -d --rm --network host --name $CONTAINER_NAME $DOCKER_ARGS $IMAGE_NAME"
local docker_caps_args=""
@@ -603,62 +859,68 @@ start_cluster() {
docker_resource_args="--ipc=host"
fi
# Start Head Node
echo "Starting Head Node on $HEAD_IP..."
if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then
for dir in "${CACHE_DIRS_TO_CREATE[@]}"; do
mkdir -p "$dir"
done
fi
docker run $docker_caps_args $docker_resource_args \
$docker_args_common \
"${head_cmd_args[@]}"
$(get_env_flags "$HEAD_IP") $docker_args_common sleep infinity
# Start Worker Nodes
for worker in "${PEER_NODES[@]}"; do
echo "Starting Worker Node on $worker..."
# Ensure cache dirs exist on worker
if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then
# Create string of dirs to create
dirs_str="${CACHE_DIRS_TO_CREATE[*]}"
ssh "$worker" "mkdir -p $dirs_str"
fi
local docker_run_cmd="docker run $docker_caps_args $docker_resource_args $docker_args_common"
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
local inner_script="echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting node...; exec ./run-cluster-node.sh --role node --host-ip $worker --eth-if $ETH_IF --ib-if $IB_IF --head-ip $HEAD_IP"
ssh "$worker" "$docker_run_cmd bash -c \"$inner_script\""
else
ssh "$worker" "$docker_run_cmd ./run-cluster-node.sh --role node --host-ip $worker --eth-if $ETH_IF --ib-if $IB_IF --head-ip $HEAD_IP"
ssh "$worker" "mkdir -p ${CACHE_DIRS_TO_CREATE[*]}"
fi
local docker_run_cmd="docker run $docker_caps_args $docker_resource_args $(get_env_flags "$worker") $docker_args_common"
ssh "$worker" "$docker_run_cmd sleep infinity"
done
# Apply mods if requested
# Apply mods (containers are idle — no mod_done sync needed)
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
echo "Applying modifications to cluster nodes..."
# Apply to Head
for i in "${!MOD_PATHS[@]}"; do
apply_mod_to_container "$HEAD_IP" "$CONTAINER_NAME" "true" "${MOD_PATHS[$i]}" "${MOD_TYPES[$i]}"
done
# Signal completion on Head
docker exec "$CONTAINER_NAME" touch /tmp/mod_done
# Apply to Workers
for worker in "${PEER_NODES[@]}"; do
for i in "${!MOD_PATHS[@]}"; do
apply_mod_to_container "$worker" "$CONTAINER_NAME" "false" "${MOD_PATHS[$i]}" "${MOD_TYPES[$i]}"
done
# Signal completion on Worker
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker" "docker exec $CONTAINER_NAME touch /tmp/mod_done"
done
fi
# Copy launch script to head node only (workers don't need it - they just run Ray)
# Copy (and patch for no-ray) launch script
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
copy_launch_script_to_container "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH"
local total_nodes=$(( 1 + ${#PEER_NODES[@]} ))
if [[ "$NO_RAY_MODE" == "true" ]]; then
# Build per-node patched scripts on the host, then copy
local head_script; head_script=$(make_node_script "$LAUNCH_SCRIPT_PATH" "$total_nodes" "0" "$HEAD_IP")
copy_script_to_container "$CONTAINER_NAME" "$head_script" "head node ($HEAD_IP)"
rm -f "$head_script"
local rank=1
for worker in "${PEER_NODES[@]}"; do
local worker_script; worker_script=$(make_node_script "$LAUNCH_SCRIPT_PATH" "$total_nodes" "$rank" "$HEAD_IP")
copy_script_to_worker "$worker" "$CONTAINER_NAME" "$worker_script"
rm -f "$worker_script"
(( rank++ ))
done
else
copy_script_to_container "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH" "head node"
fi
fi
if [[ "$SOLO_MODE" == "false" ]]; then
# Start Ray cluster (unless solo or no-ray)
if [[ "$SOLO_MODE" == "false" && "$NO_RAY_MODE" == "false" ]]; then
start_ray_head "$CONTAINER_NAME"
for worker in "${PEER_NODES[@]}"; do
start_ray_worker "$worker" "$CONTAINER_NAME"
done
wait_for_cluster
else
echo "Solo mode active: Skipping Ray cluster readiness check."
# Give container a moment to start up
sleep 2
fi
}
@@ -686,25 +948,97 @@ wait_for_cluster() {
exit 1
}
if [[ "$ACTION" == "exec" ]]; then
start_cluster
echo "Executing command on head node: $COMMAND_TO_RUN"
# Execute command on head node (daemon or interactive)
_exec_on_head() {
local cmd="$1"
if [[ "$DAEMON_MODE" == "true" ]]; then
# Daemon mode: run command detached inside the container and exit immediately
# Extract env vars starting from VLLM_HOST_IP to avoid interactive check in .bashrc
# Redirect output to PID 1 stdout/stderr so it shows up in docker logs
docker exec -d "$CONTAINER_NAME" bash -c "eval \"\$(sed -n '/export VLLM_HOST_IP/,\$p' /root/.bashrc)\" && { $COMMAND_TO_RUN; } >> /proc/1/fd/1 2>> /proc/1/fd/2"
docker exec -d "$CONTAINER_NAME" bash -c "$cmd >> /proc/1/fd/1 2>&1"
echo "Command dispatched in background (Daemon mode). Container: $CONTAINER_NAME"
else
# Check if running in a TTY to avoid "input device is not a TTY" error
if [ -t 0 ]; then
DOCKER_EXEC_FLAGS="-it"
else
DOCKER_EXEC_FLAGS="-i"
fi
if [ -t 0 ]; then DOCKER_EXEC_FLAGS="-it"; else DOCKER_EXEC_FLAGS="-i"; fi
docker exec $DOCKER_EXEC_FLAGS "$CONTAINER_NAME" bash -c "$cmd"
fi
}
docker exec $DOCKER_EXEC_FLAGS "$CONTAINER_NAME" bash -i -c "$COMMAND_TO_RUN"
# Execute a no-ray multi-node command: workers (background) then head
exec_no_ray_cluster() {
local base_cmd="$1"
local total_nodes=$(( 1 + ${#PEER_NODES[@]} ))
# Launch workers first (always background)
local rank=1
for worker in "${PEER_NODES[@]}"; do
local worker_cmd
if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then
worker_cmd="$base_cmd" # script already patched per-node in start_cluster()
else
local clean
clean=$(echo "$base_cmd" | sed 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//')
worker_cmd="$clean --nnodes $total_nodes --node-rank $rank --master-addr $HEAD_IP --master-port $MASTER_PORT --headless"
fi
echo "Launching worker (rank $rank) on $worker..."
local remote_payload remote_cmd
remote_payload="$worker_cmd >> /proc/1/fd/1 2>&1"
printf -v remote_cmd 'docker exec -d %q bash -c %q' "$CONTAINER_NAME" "$remote_payload"
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker" "$remote_cmd"
(( rank++ ))
done
# Launch head (rank 0) last
local head_cmd
if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then
head_cmd="$base_cmd"
else
local clean
clean=$(echo "$base_cmd" | sed 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//')
head_cmd="$clean --nnodes $total_nodes --node-rank 0 --master-addr $HEAD_IP --master-port $MASTER_PORT"
fi
echo "Executing command on head node (rank 0): $head_cmd"
if [[ "$DAEMON_MODE" == "true" ]]; then
docker exec -d "$CONTAINER_NAME" bash -c "$head_cmd >> /proc/1/fd/1 2>&1"
echo "Command dispatched in background (Daemon mode). Container: $CONTAINER_NAME"
else
if [ -t 0 ]; then DOCKER_EXEC_FLAGS="-it"; else DOCKER_EXEC_FLAGS="-i"; fi
docker exec $DOCKER_EXEC_FLAGS "$CONTAINER_NAME" bash -c "$head_cmd"
fi
}
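# Illustrative run (assumed one peer node, hypothetical model name): with
#   COMMAND_TO_RUN='vllm serve my-model -tp 2'
# exec_no_ray_cluster first dispatches the worker detached (rank 1):
#   vllm serve my-model -tp 2 --nnodes 2 --node-rank 1 --master-addr $HEAD_IP --master-port $MASTER_PORT --headless
# then runs the head in the foreground (rank 0):
#   vllm serve my-model -tp 2 --nnodes 2 --node-rank 0 --master-addr $HEAD_IP --master-port $MASTER_PORT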
if [[ "$ACTION" == "exec" ]]; then
# Trim (or error on) PEER_NODES based on declared parallelism, for any multi-node exec
if [[ "$SOLO_MODE" != "true" && ${#PEER_NODES[@]} -gt 0 ]]; then
if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then
cmd_text=$(cat "$LAUNCH_SCRIPT_PATH" 2>/dev/null || true)
else
cmd_text="$COMMAND_TO_RUN"
fi
parse_parallelism_from_text "$cmd_text"
if [[ "$PARALLELISM_FOUND" == "true" ]]; then
required_nodes=$(( TP_SIZE * PP_SIZE * DP_SIZE ))
total_nodes=$(( 1 + ${#PEER_NODES[@]} ))
if [[ "$required_nodes" -gt "$total_nodes" ]]; then
echo "Error: Command requires $required_nodes nodes (tp=$TP_SIZE * pp=$PP_SIZE * dp=$DP_SIZE) but only $total_nodes node(s) are configured."
exit 1
elif [[ "$required_nodes" -lt "$total_nodes" ]]; then
echo "Note: Command requires $required_nodes node(s) (tp=$TP_SIZE * pp=$PP_SIZE * dp=$DP_SIZE); using $required_nodes of $total_nodes configured node(s)."
PEER_NODES=("${PEER_NODES[@]:0:$(( required_nodes - 1 ))}")
fi
fi
fi
start_cluster
echo "Executing command: $COMMAND_TO_RUN"
if [[ "$NO_RAY_MODE" == "true" && ${#PEER_NODES[@]} -gt 0 ]]; then
if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]] || echo "$COMMAND_TO_RUN" | grep -q "vllm serve"; then
exec_no_ray_cluster "$COMMAND_TO_RUN"
else
_exec_on_head "$COMMAND_TO_RUN"
fi
else
_exec_on_head "$COMMAND_TO_RUN"
fi
elif [[ "$ACTION" == "start" ]]; then
start_cluster

17
mods/drop-caches/run.sh Normal file

@@ -0,0 +1,17 @@
#!/bin/bash
# This mod drops the FS caches every minute - useful to unstick Qwen3.5-397B or similar models that hang during loading
CMD='sync; echo 3 > /proc/sys/vm/drop_caches'
LOG="/tmp/drop_caches.log"
PIDFILE="/tmp/drop_caches.pid"
nohup bash -c '
while true; do
'"$CMD"' >> "'"$LOG"'" 2>&1
sleep 60
done
' >/dev/null 2>&1 &
echo $! > "$PIDFILE"
echo "Started drop_caches loop with PID $(cat "$PIDFILE"); log is available in $LOG"

116
mods/exp-b12x/run.sh Normal file

@@ -0,0 +1,116 @@
#!/bin/bash
set -e
SITE_PACKAGES="/usr/local/lib/python3.12/dist-packages"
echo "=== EXPERIMENTAL b12x-patches mod ==="
# 0a. Check if b12x support is present in vLLM
if [ ! -f "$SITE_PACKAGES/vllm/model_executor/layers/fused_moe/experts/flashinfer_b12x_moe.py" ]; then
echo "[b12x ERROR] No b12x support detected; please rebuild with --apply-vllm-pr 40082, e.g.:"
echo "./build-and-copy.sh -t vllm-node-40082 --apply-vllm-pr 40082"
exit 1
fi
# 0b. Check if environment variables are set
if [[ "$VLLM_NVFP4_GEMM_BACKEND" != "flashinfer-b12x" ]]; then
echo "[b12x ERROR] Please set required environment variables to use b12x backend"
echo "*** Add the following arguments to launch-cluster.sh:"
echo " -e FLASHINFER_DISABLE_VERSION_CHECK=1 -e VLLM_USE_FLASHINFER_MOE_FP16=1 -e VLLM_NVFP4_GEMM_BACKEND=flashinfer-b12x -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 -e VLLM_FLASHINFER_ALLREDUCE_BACKEND=trtllm -e VLLM_USE_FLASHINFER_MOE_FP4=1"
echo "*** also set the following vLLM parameters:"
echo " --moe-backend flashinfer_b12x --attention-backend flashinfer"
exit 1
fi
# ---------------------------------------------------------------
# 1. Pin nvidia-cutlass-dsl + companion libs to 4.4.2
# (4.5.x generates bad PTX on SM121 — `_mma` rejected by ptxas).
# All THREE packages must match: the python frontend, the base libs,
# and the CUDA 13 libs (which contain the MLIR compiler).
# ---------------------------------------------------------------
DSL_VER=$(pip show nvidia-cutlass-dsl 2>/dev/null | grep '^Version:' | awk '{print $2}' || true)
LIBS_BASE_VER=$(pip show nvidia-cutlass-dsl-libs-base 2>/dev/null | grep '^Version:' | awk '{print $2}' || true)
LIBS_CU13_VER=$(pip show nvidia-cutlass-dsl-libs-cu13 2>/dev/null | grep '^Version:' | awk '{print $2}' || true)
if [ "$DSL_VER" != "4.4.2" ] || [ "$LIBS_BASE_VER" != "4.4.2" ] || [ "$LIBS_CU13_VER" != "4.4.2" ]; then
echo "[b12x] Pinning nvidia-cutlass-dsl{,-libs-base,-libs-cu13} to 4.4.2"
echo "[b12x] current: dsl=${DSL_VER:-none} libs-base=${LIBS_BASE_VER:-none} libs-cu13=${LIBS_CU13_VER:-none}"
uv pip install \
nvidia-cutlass-dsl==4.4.2 \
nvidia-cutlass-dsl-libs-base==4.4.2 \
nvidia-cutlass-dsl-libs-cu13==4.4.2 \
-q 2>/dev/null || echo "[b12x] WARNING: cutlass-dsl pin install returned non-zero"
else
echo "[b12x] nvidia-cutlass-dsl + libs already at 4.4.2"
fi
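# Optional sanity check (illustrative, not part of the mod logic):
#   pip show nvidia-cutlass-dsl nvidia-cutlass-dsl-libs-base nvidia-cutlass-dsl-libs-cu13 | grep -E '^(Name|Version)'
# should report 4.4.2 for all three packages after the pin.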
# ---------------------------------------------------------------
# 2. Apply cutlass-dsl SM121 patches
# FlashInfer/vLLM install wipes vendored cutlass, so re-apply every time
# ---------------------------------------------------------------
echo "[b12x] Applying cutlass-dsl SM121 patches..."
# 2a. warp/mma.py: allow sm_121a alongside sm_120a in both the runtime
# arch check and the `admissible_archs` string list (used in error msgs)
for f in $(find "$SITE_PACKAGES" -name "mma.py" -path "*/warp/*" 2>/dev/null); do
if grep -q "if not arch == Arch.sm_120a:" "$f" 2>/dev/null; then
sed -i "s/if not arch == Arch.sm_120a:/if arch not in (Arch.sm_120a, Arch.sm_121a):/" "$f"
echo " patched $f (warp sm_121a runtime check)"
fi
# Add sm_121a to the admissible_archs list if missing
if grep -q '"sm_120a",' "$f" 2>/dev/null && ! grep -q '"sm_121a"' "$f" 2>/dev/null; then
sed -i 's/^\(\s*\)"sm_120a",$/\1"sm_120a",\n\1"sm_121a",/' "$f"
echo " patched $f (warp sm_121a admissible_archs)"
fi
done
# 2b. tcgen05/mma.py: add sm_120a and sm_121a to supported arch list
for f in $(find "$SITE_PACKAGES" -name "mma.py" -path "*/tcgen05/*" 2>/dev/null); do
if ! grep -q "Arch.sm_121a" "$f" 2>/dev/null; then
sed -i "/Arch.sm_103a,/a\\ Arch.sm_120a,\n Arch.sm_121a," "$f"
echo " patched $f (tcgen05 mma sm_121a)"
fi
done
# 2c. tcgen05/copy.py: allow sm_120f family
for f in $(find "$SITE_PACKAGES" -name "copy.py" -path "*/tcgen05/*" 2>/dev/null); do
if ! grep -q "sm_120f" "$f" 2>/dev/null; then
sed -i "s/arch.is_family_of(Arch.sm_110f)/arch.is_family_of(Arch.sm_110f) or arch.is_family_of(Arch.sm_120f)/" "$f"
echo " patched $f (tcgen05 copy sm_120f)"
fi
done
# Clear pycache so patched code takes effect
find "$SITE_PACKAGES" -name "__pycache__" -path "*/cutlass*" -exec rm -rf {} + 2>/dev/null || true
find "$SITE_PACKAGES" -name "__pycache__" -path "*/flashinfer*" -exec rm -rf {} + 2>/dev/null || true
# ---------------------------------------------------------------
# 3. Patch FlashInfer's blackwell_sm12x __init__.py to drop the
# broken `sm120_moe_dispatch_context` import (FlashInfer main
# has a stale __init__ that references a function that no
# longer exists in moe_dispatch.py — but the symbol isn't
# actually used by anything, so we just remove it from the
# import + __all__ list).
# ---------------------------------------------------------------
SM12X_INIT="$SITE_PACKAGES/flashinfer/fused_moe/cute_dsl/blackwell_sm12x/__init__.py"
if [ -f "$SM12X_INIT" ]; then
if grep -q "sm120_moe_dispatch_context" "$SM12X_INIT"; then
# Drop the line that imports/exports the missing symbol
sed -i '/sm120_moe_dispatch_context/d' "$SM12X_INIT"
find "$SITE_PACKAGES/flashinfer" -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
echo "[b12x] patched $SM12X_INIT (dropped stale sm120_moe_dispatch_context references)"
else
echo "[b12x] $SM12X_INIT already cleaned"
fi
else
echo "[b12x] $SM12X_INIT not found (older FlashInfer?), skipping"
fi
if grep -q "if current_platform.has_device_capability(120) and has_flashinfer_b12x_gemm():" $SITE_PACKAGES/vllm/model_executor/kernels/linear/nvfp4/flashinfer.py; then
echo "[b12x] Patching vLLM PR 40080 to enable sm121 cap"
sed -i "s/if current_platform.has_device_capability(120) and has_flashinfer_b12x_gemm():/if True:/" $SITE_PACKAGES/vllm/model_executor/kernels/linear/nvfp4/flashinfer.py
fi


@@ -0,0 +1,10 @@
#!/bin/bash
set -e
cd /usr/local/lib/python3.12/dist-packages
echo "Applying PR #38909"
if curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38909.diff | git apply --exclude="tests/*"; then
echo "- PR #38909 applied successfully"
else
echo "- PR #38909 can't be applied, skipping"
fi


@@ -1,3 +1,3 @@
#!/bin/bash
set -e
patch -p1 -d /usr/local/lib/python3.12/dist-packages < transformers.patch
patch -p1 -d /usr/local/lib/python3.12/dist-packages < transformers.patch || echo "Patch is not applicable, skipping..."


@@ -0,0 +1,155 @@
{%- set image_count = namespace(value=0) %}
{%- set video_count = namespace(value=0) %}
{%- macro render_content(content, do_vision_count, is_system_content=false) %}
{%- if content is string %}
{{- content }}
{%- elif content is iterable and content is not mapping %}
{%- for item in content %}
{%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
{%- if is_system_content %}
{{- raise_exception('System message cannot contain images.') }}
{%- endif %}
{%- if do_vision_count %}
{%- set image_count.value = image_count.value + 1 %}
{%- endif %}
{%- if add_vision_id %}
{{- 'Picture ' ~ image_count.value ~ ': ' }}
{%- endif %}
{{- '<|vision_start|><|image_pad|><|vision_end|>' }}
{%- elif 'video' in item or item.type == 'video' %}
{%- if is_system_content %}
{{- raise_exception('System message cannot contain videos.') }}
{%- endif %}
{%- if do_vision_count %}
{%- set video_count.value = video_count.value + 1 %}
{%- endif %}
{%- if add_vision_id %}
{{- 'Video ' ~ video_count.value ~ ': ' }}
{%- endif %}
{{- '<|vision_start|><|video_pad|><|vision_end|>' }}
{%- elif 'text' in item %}
{{- item.text }}
{%- else %}
{{- raise_exception('Unexpected item type in content.') }}
{%- endif %}
{%- endfor %}
{%- elif content is none or content is undefined %}
{{- '' }}
{%- else %}
{{- raise_exception('Unexpected content type.') }}
{%- endif %}
{%- endmacro %}
{%- if not messages %}
{{- raise_exception('No messages provided.') }}
{%- endif %}
{%- if tools and tools is iterable and tools is not mapping %}
{{- '<|im_start|>system\n' }}
{{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\n</tools>" }}
{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
{%- if messages[0].role == 'system' %}
{%- set content = render_content(messages[0].content, false, true)|trim %}
{%- if content %}
{{- '\n\n' + content }}
{%- endif %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- else %}
{%- if messages[0].role == 'system' %}
{%- set content = render_content(messages[0].content, false, true)|trim %}
{{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
{%- set index = (messages|length - 1) - loop.index0 %}
{%- if ns.multi_step_tool and message.role == "user" %}
{%- set content = render_content(message.content, false)|trim %}
{%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
{%- set ns.multi_step_tool = false %}
{%- set ns.last_query_index = index %}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if ns.multi_step_tool %}
{{- raise_exception('No user query found in messages.') }}
{%- endif %}
{%- for message in messages %}
{%- set content = render_content(message.content, true)|trim %}
{%- if message.role == "system" %}
{%- if not loop.first %}
{{- raise_exception('System message must be at the beginning.') }}
{%- endif %}
{%- elif message.role == "user" %}
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{%- set reasoning_content = '' %}
{%- if message.reasoning_content is string %}
{%- set reasoning_content = message.reasoning_content %}
{%- else %}
{%- if '</think>' in content %}
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- set reasoning_content = reasoning_content|trim %}
{%- if loop.index0 > ns.last_query_index %}
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{%- if loop.first %}
{%- if content|trim %}
{{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
{%- else %}
{{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
{%- endif %}
{%- else %}
{{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
{%- endif %}
{%- if tool_call.arguments is mapping %}
{%- for args_name in tool_call.arguments %}
{%- set args_value = tool_call.arguments[args_name] %}
{{- '<parameter=' + args_name + '>\n' }}
{%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
{{- args_value }}
{{- '\n</parameter>\n' }}
{%- endfor %}
{%- endif %}
{{- '</function>\n</tool_call>' }}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if loop.previtem and loop.previtem.role != "tool" %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- content }}
{{- '\n</tool_response>' }}
{%- if not loop.last and loop.nextitem.role != "tool" %}
{{- '<|im_end|>\n' }}
{%- elif loop.last %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- else %}
{{- raise_exception('Unexpected message role.') }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- if enable_thinking is defined and enable_thinking is false %}
{{- '<think>\n\n</think>\n\n' }}
{%- else %}
{{- '<think>\n' }}
{%- endif %}
{%- endif %}


@@ -0,0 +1,4 @@
#!/bin/bash
set -e
cp chat_template.jinja $WORKSPACE_DIR/unsloth.jinja
echo "=======> to apply chat template, use --chat-template unsloth.jinja"


@@ -0,0 +1,223 @@
{%- set image_count = namespace(value=0) %}
{%- set video_count = namespace(value=0) %}
{%- macro render_content(content, do_vision_count, is_system_content=false) %}
{%- if content is string %}
{{- content }}
{%- elif content is iterable and content is not mapping %}
{%- for item in content %}
{%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
{%- if is_system_content %}
{{- raise_exception('System message cannot contain images.') }}
{%- endif %}
{%- if do_vision_count %}
{%- set image_count.value = image_count.value + 1 %}
{%- endif %}
{%- if add_vision_id is defined and add_vision_id %}
{{- 'Picture ' ~ image_count.value ~ ': ' }}
{%- endif %}
{{- '<|vision_start|><|image_pad|><|vision_end|>' }}
{%- elif 'video' in item or item.type == 'video' %}
{%- if is_system_content %}
{{- raise_exception('System message cannot contain videos.') }}
{%- endif %}
{%- if do_vision_count %}
{%- set video_count.value = video_count.value + 1 %}
{%- endif %}
{%- if add_vision_id is defined and add_vision_id %}
{{- 'Video ' ~ video_count.value ~ ': ' }}
{%- endif %}
{{- '<|vision_start|><|video_pad|><|vision_end|>' }}
{%- elif 'text' in item %}
{{- item.text }}
{%- else %}
{{- raise_exception('Unexpected item type in content.') }}
{%- endif %}
{%- endfor %}
{%- elif content is none or content is undefined %}
{{- '' }}
{%- else %}
{{- raise_exception('Unexpected content type.') }}
{%- endif %}
{%- endmacro %}
{%- set ns_flags = namespace(enable_thinking=true) %}
{%- if enable_thinking is defined %}
{%- set ns_flags.enable_thinking = enable_thinking %}
{%- endif %}
{%- if not messages %}
{{- raise_exception('No messages provided.') }}
{%- endif %}
{%- if tools and tools is iterable and tools is not mapping %}
{{- '<|im_start|>system\n' }}
{{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\n</tools>" }}
{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
{%- if messages[0].role == 'system' or messages[0].role == 'developer' %}
{%- set content = render_content(messages[0].content, false, true)|trim %}
{%- if '<|think_off|>' in content %}
{%- set ns_flags.enable_thinking = false %}
{%- set content = content.replace('<|think_off|>', '') %}
{%- endif %}
{%- if '<|think_on|>' in content %}
{%- set ns_flags.enable_thinking = true %}
{%- set content = content.replace('<|think_on|>', '') %}
{%- endif %}
{%- set content = content.strip() %}
{%- if content %}
{{- '\n\n' + content }}
{%- endif %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- else %}
{%- if messages[0].role == 'system' or messages[0].role == 'developer' %}
{%- set content = render_content(messages[0].content, false, true)|trim %}
{%- if '<|think_off|>' in content %}
{%- set ns_flags.enable_thinking = false %}
{%- set content = content.replace('<|think_off|>', '') %}
{%- endif %}
{%- if '<|think_on|>' in content %}
{%- set ns_flags.enable_thinking = true %}
{%- set content = content.replace('<|think_on|>', '') %}
{%- endif %}
{%- set content = content.strip() %}
{{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
{%- set index = (messages|length - 1) - loop.index0 %}
{%- if ns.multi_step_tool and message.role == "user" %}
{%- set content = render_content(message.content, false)|trim %}
{%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
{%- set ns.multi_step_tool = false %}
{%- set ns.last_query_index = index %}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if ns.multi_step_tool %}
{%- set ns.last_query_index = messages|length - 1 %}
{%- endif %}
{%- for message in messages %}
{%- set content = render_content(message.content, true)|trim %}
{%- if '<|think_off|>' in content %}
{%- set ns_flags.enable_thinking = false %}
{%- set content = content.replace('<|think_off|>', '') %}
{%- endif %}
{%- if '<|think_on|>' in content %}
{%- set ns_flags.enable_thinking = true %}
{%- set content = content.replace('<|think_on|>', '') %}
{%- endif %}
{%- set content = content.strip() %}
{%- if message.role == "system" or message.role == "developer" %}
{%- if not loop.first %}
{{- raise_exception('System message must be at the beginning.') }}
{%- endif %}
{%- elif message.role == "user" %}
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{%- set reasoning_content = '' %}
{#- Auto-close unclosed think before tool_call -#}
{%- if '<think>' in content and '<tool_call>' in content %}
{%- set last_think = content.rfind('<think>') %}
{%- set last_close = content.rfind('</think>') %}
{%- set tool_pos = content.find('<tool_call>') %}
{%- if last_close < last_think or last_close == -1 %}
{%- if tool_pos > last_think %}
{%- set content = content[:tool_pos] + '</think>' + content[tool_pos:] %}
{%- else %}
{%- set content = content + '</think>' %}
{%- endif %}
{%- endif %}
{%- endif %}
{%- if message.reasoning_content is string %}
{%- set reasoning_content = message.reasoning_content %}
{%- else %}
{%- set has_think_tag = false %}
{%- set think_end_token = '</think>' %}
{%- if '</think>' in content %}
{%- set has_think_tag = true %}
{%- elif '</thinking>' in content %}
{%- set has_think_tag = true %}
{%- set think_end_token = '</thinking>' %}
{%- elif '<think>' in content %}
{%- set reasoning_content = content.split('<think>')[-1].lstrip('\n') %}
{%- set content = '' %}
{%- endif %}
{%- if has_think_tag %}
{%- set reasoning_content = content.split(think_end_token)[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- set content = content.split(think_end_token)[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- set reasoning_content = reasoning_content|trim %}
{%- set show_think = false %}
{%- if loop.index0 > ns.last_query_index %}
{%- set show_think = true %}
{%- elif ns_flags.enable_thinking and (preserve_thinking is undefined or preserve_thinking is true) and reasoning_content|length > 0 %}
{%- set show_think = true %}
{%- endif %}
{%- if show_think %}
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{%- if loop.first %}
{%- if content|trim %}
{{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
{%- else %}
{{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
{%- endif %}
{%- else %}
{{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
{%- endif %}
{%- if tool_call.arguments is defined and tool_call.arguments is mapping %}
{%- if tool_call.arguments|length > 0 %}
{%- for args_name in tool_call.arguments %}
{%- set args_value = tool_call.arguments[args_name] %}
{{- '<parameter=' + args_name + '>\n' }}
{%- set args_value = args_value | string if args_value is string else args_value | tojson %}
{{- args_value }}
{{- '\n</parameter>\n' }}
{%- endfor %}
{%- endif %}
{%- elif tool_call.arguments is defined and tool_call.arguments is string %}
{%- if tool_call.arguments|trim|length > 0 %}
{{- tool_call.arguments }}
{{- '\n' }}
{%- endif %}
{%- endif %}
{{- '</function>\n</tool_call>' }}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if loop.previtem and loop.previtem.role != "tool" %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- content }}
{{- '\n</tool_response>' }}
{%- if not loop.last and loop.nextitem.role != "tool" %}
{{- '<|im_end|>\n' }}
{%- elif loop.last %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- else %}
{{- raise_exception('Unexpected message role.') }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- if ns_flags.enable_thinking is false %}
{{- '<think>\n\n</think>\n\n' }}
{%- else %}
{{- '<think>\n' }}
{%- endif %}
{%- endif %}


@@ -0,0 +1,4 @@
#!/bin/bash
set -e
cp chat_template.jinja $WORKSPACE_DIR/fixed_chat_template.jinja
echo "=======> to apply chat template, use --chat-template fixed_chat_template.jinja"


@@ -0,0 +1,23 @@
# Fix: ignore_keys_at_rope_validation is a list but transformers uses | (set union)
import re
path = "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/configs/qwen3_5_moe.py"
with open(path) as f:
content = f.read()
old = """kwargs["ignore_keys_at_rope_validation"] = [
"mrope_section",
"mrope_interleaved",
]"""
new = """kwargs["ignore_keys_at_rope_validation"] = {
"mrope_section",
"mrope_interleaved",
}"""
content = content.replace(old, new)
with open(path, "w") as f:
f.write(content)
print("Fixed ignore_keys_at_rope_validation: list -> set")


@@ -0,0 +1,46 @@
--- qwen3_5.py.orig 2026-03-03 00:00:00.000000000 +0000
+++ qwen3_5.py 2026-03-03 00:00:00.000000000 +0000
@@ -166,11 +166,13 @@
z_size = self.value_dim // self.tp_size
mixed_qkv, z = mixed_qkvz.split([qkv_size, z_size], dim=-1)
z = z.reshape(z.size(0), -1, self.head_v_dim)
- ba, _ = self.in_proj_ba(hidden_states)
- b, a = ba.chunk(2, dim=-1)
-
- b = b.contiguous()
- a = a.contiguous()
+ # Replicated B/A projections — full output, sliced to local TP partition
+ b_full, _ = self.in_proj_b(hidden_states)
+ a_full, _ = self.in_proj_a(hidden_states)
+ _ba_chunk = self.num_v_heads // self.tp_size
+ _ba_start = self.tp_rank * _ba_chunk
+ b = b_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
+ a = a_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
# ============================================================
# Part 2: Core Attention (Custom Op)
@@ -374,8 +376,6 @@
# GDN
("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)),
("in_proj_qkvz", "in_proj_z", 3),
- ("in_proj_ba", "in_proj_b", 0),
- ("in_proj_ba", "in_proj_a", 1),
]
params_dict = dict(self.named_parameters())
@@ -530,7 +530,6 @@
"gate_up_proj": ["gate_proj", "up_proj"],
# GDN fused projections.
"in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
- "in_proj_ba": ["in_proj_b", "in_proj_a"],
}
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -630,7 +629,6 @@
class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid):
packed_modules_mapping = Qwen3VLForConditionalGeneration.packed_modules_mapping | {
"in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
- "in_proj_ba": ["in_proj_b", "in_proj_a"],
}
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):


@@ -0,0 +1,56 @@
--- qwen3_next.py.orig 2026-03-03 00:00:00.000000000 +0000
+++ qwen3_next.py 2026-03-03 00:00:00.000000000 +0000
@@ -411,15 +411,22 @@
quant_config=quant_config,
prefix=f"{prefix}.in_proj_qkvz",
)
- # ba_proj doesn't support blockwise fp8 quantization.
- # # in_proj_ba is defined as MergedColumnParallelLinear for
- # compatibility with Qwen3_5.
- self.in_proj_ba = MergedColumnParallelLinear(
+ # ba_proj: Use ReplicatedLinear to avoid Marlin TP split constraint
+ # (num_v_heads=64 is too small for TP=4 Marlin min_thread_n=64).
+ # Each rank loads full weights and slices in forward.
+ self.in_proj_b = ReplicatedLinear(
input_size=self.hidden_size,
- output_sizes=[self.num_v_heads] * 2,
+ output_size=self.num_v_heads,
bias=False,
quant_config=quant_config,
- prefix=f"{prefix}.in_proj_ba",
+ prefix=f"{prefix}.in_proj_b",
+ )
+ self.in_proj_a = ReplicatedLinear(
+ input_size=self.hidden_size,
+ output_size=self.num_v_heads,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.in_proj_a",
)
query_key_settings = (self.key_dim, 0, False)
@@ -584,7 +591,15 @@
# Part 1: Input Projection
# ============================================================
projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
- projected_states_ba, _ = self.in_proj_ba(hidden_states)
+ # Replicated B/A projections — full output, sliced to local TP partition
+ b_full, _ = self.in_proj_b(hidden_states)
+ a_full, _ = self.in_proj_a(hidden_states)
+ _ba_chunk = self.num_v_heads // self.tp_size
+ _ba_start = self.tp_rank * _ba_chunk
+ projected_states_ba = torch.cat([
+ b_full[:, _ba_start:_ba_start+_ba_chunk],
+ a_full[:, _ba_start:_ba_start+_ba_chunk],
+ ], dim=-1)
query, key, value, z, b, a = self.fix_query_key_value_ordering(
projected_states_qkvz, projected_states_ba
)
@@ -1326,7 +1341,6 @@
],
"gate_up_proj": ["gate_proj", "up_proj"],
"in_proj_qkvz": ["in_proj_qkvz"],
- "in_proj_ba": ["in_proj_ba"],
}
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):


@@ -0,0 +1,23 @@
#!/bin/bash
# Fix Marlin TP=4 constraint for Qwen3.5-397B: in_proj_ba output_size=128 / TP=4 = 32 < min_thread_n=64
# Solution: Replace MergedColumnParallelLinear with two ReplicatedLinear for B/A projections
# Delivery: unified diff patches (portable across vLLM versions)
set -e
MOD_DIR="$(dirname "$0")"
MODELS_DIR="/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models"
echo "[fix-qwen35-tp4-marlin] Applying patches..."
# Apply patches with --forward (skip if already applied)
patch --forward --batch -p0 -d "$MODELS_DIR" < "$MOD_DIR/qwen3_next.patch" || {
echo "[fix-qwen35-tp4-marlin] qwen3_next.patch already applied or failed"
}
patch --forward --batch -p0 -d "$MODELS_DIR" < "$MOD_DIR/qwen3_5.patch" || {
echo "[fix-qwen35-tp4-marlin] qwen3_5.patch already applied or failed"
}
# Fix rope validation (idempotent)
python3 "$MOD_DIR/fix_rope.py"
echo "[fix-qwen35-tp4-marlin] Done."


@@ -0,0 +1,255 @@
diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 3796265ff..b6dcfb54c 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -45,6 +45,11 @@ class CacheConfig:
not matter if you have another vLLM instance running on the same GPU. For
example, if you have two vLLM instances running on the same GPU, you can
set the GPU memory utilization to 0.5 for each instance."""
+ gpu_memory_utilization_gb: float | None = Field(default=None, gt=0)
+ """Amount of GPU memory to be used in GiB. This provides fine-grained control
+ over GPU memory usage and is particularly useful on unified memory systems
+ where available memory changes dynamically. If specified, it overrides
+ gpu_memory_utilization. Cannot be used simultaneously with kv_cache_memory_bytes."""
cache_dtype: CacheDType = "auto"
"""Data type for kv cache storage. If "auto", will use model data type.
CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
@@ -204,6 +209,18 @@ class CacheConfig:
object.__setattr__(self, "user_specified_block_size", True)
return self
+ @model_validator(mode="after")
+ def _validate_memory_params(self) -> "CacheConfig":
+ if (
+ self.gpu_memory_utilization_gb is not None
+ and self.kv_cache_memory_bytes is not None
+ ):
+ raise ValueError(
+ "Cannot specify both gpu_memory_utilization_gb and "
+ "kv_cache_memory_bytes. Please use only one of them."
+ )
+ return self
+
@field_validator("cache_dtype", mode="after")
@classmethod
def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 56bbb7bf5..db5012608 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -454,6 +454,7 @@ class EngineArgs:
offload_prefetch_step: int = PrefetchOffloadConfig.offload_prefetch_step
offload_params: set[str] = get_field(PrefetchOffloadConfig, "offload_params")
gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
+ gpu_memory_utilization_gb: float | None = CacheConfig.gpu_memory_utilization_gb
kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes
max_num_batched_tokens: int | None = None
max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
@@ -954,6 +955,9 @@ class EngineArgs:
cache_group.add_argument(
"--gpu-memory-utilization", **cache_kwargs["gpu_memory_utilization"]
)
+ cache_group.add_argument(
+ "--gpu-memory-utilization-gb", **cache_kwargs["gpu_memory_utilization_gb"]
+ )
cache_group.add_argument(
"--kv-cache-memory-bytes", **cache_kwargs["kv_cache_memory_bytes"]
)
@@ -1512,6 +1516,7 @@ class EngineArgs:
cache_config = CacheConfig(
block_size=self.block_size, # type: ignore[arg-type]
gpu_memory_utilization=self.gpu_memory_utilization,
+ gpu_memory_utilization_gb=self.gpu_memory_utilization_gb,
kv_cache_memory_bytes=self.kv_cache_memory_bytes,
cache_dtype=resolved_cache_dtype, # type: ignore[arg-type]
is_attention_free=model_config.is_attention_free,
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 5909b3043..c2607df6a 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -156,6 +156,11 @@ class LLM:
values will increase the KV cache size and thus improve the model's
throughput. However, if the value is too high, it may cause out-of-
memory (OOM) errors.
+ gpu_memory_utilization_gb: Amount of GPU memory to reserve in GiB.
+ This provides fine-grained control over GPU memory usage and is
+ particularly useful on unified memory systems where available memory
+ changes dynamically. If specified, it overrides gpu_memory_utilization.
+ Cannot be used simultaneously with kv_cache_memory_bytes.
kv_cache_memory_bytes: Size of KV Cache per GPU in bytes. By default,
this is set to None and vllm can automatically infer the kv cache
size based on gpu_memory_utilization. However, users may want to
@@ -234,6 +239,7 @@ class LLM:
chat_template: Path | str | None = None,
seed: int = 0,
gpu_memory_utilization: float = 0.92,
+ gpu_memory_utilization_gb: float | None = None,
cpu_offload_gb: float = 0,
offload_group_size: int = 0,
offload_num_in_group: int = 1,
@@ -356,6 +362,7 @@ class LLM:
tokenizer_revision=tokenizer_revision,
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
+ gpu_memory_utilization_gb=gpu_memory_utilization_gb,
kv_cache_memory_bytes=kv_cache_memory_bytes,
cpu_offload_gb=cpu_offload_gb,
offload_group_size=offload_group_size,
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 2ed7ef7e0..806830b17 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -622,7 +622,8 @@ def _check_enough_kv_cache_memory(
if available_memory <= 0:
raise ValueError(
"No available memory for the cache blocks. "
- "Try increasing `gpu_memory_utilization` when initializing the engine. "
+ "Try increasing `gpu_memory_utilization` or `gpu_memory_utilization_gb` "
+ "when initializing the engine. "
"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
"for more details."
)
@@ -643,8 +644,8 @@ def _check_enough_kv_cache_memory(
f"({max_model_len}), ({format_gib(needed_memory)} GiB KV "
f"cache is needed, which is larger than the available KV cache "
f"memory ({format_gib(available_memory)} GiB). {estimated_msg}"
- f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
- f"when initializing the engine. "
+ f"Try increasing `gpu_memory_utilization` or `gpu_memory_utilization_gb`, "
+ f"or decreasing `max_model_len` when initializing the engine. "
f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
f"for more details."
)
@@ -1438,7 +1439,8 @@ def _auto_fit_max_model_len(
if auto_fit_max <= 0:
raise ValueError(
"Cannot auto-fit max_model_len: not enough GPU memory available "
- "to serve even a single token. Try increasing `gpu_memory_utilization`."
+ "to serve even a single token. Try increasing `gpu_memory_utilization` "
+ "or `gpu_memory_utilization_gb`."
)
if auto_fit_max >= original_max:
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 3d065927e..e8cef2ceb 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -358,6 +358,7 @@ def report_usage_stats(
"dtype": str(vllm_config.model_config.dtype),
"block_size": vllm_config.cache_config.block_size,
"gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization,
+ "gpu_memory_utilization_gb": vllm_config.cache_config.gpu_memory_utilization_gb,
"kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes,
# Quantization
"quantization": vllm_config.model_config.quantization,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b53bd71a1..d28821328 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -5355,8 +5355,8 @@ class GPUModelRunner(
raise RuntimeError(
"CUDA out of memory occurred when warming up sampler with "
f"{num_reqs} dummy requests. Please try lowering "
- "`max_num_seqs` or `gpu_memory_utilization` when "
- "initializing the engine."
+ "`max_num_seqs`, `gpu_memory_utilization`, or "
+ "`gpu_memory_utilization_gb` when initializing the engine."
) from e
else:
raise e
@@ -5434,8 +5434,8 @@ class GPUModelRunner(
raise RuntimeError(
"CUDA out of memory occurred when warming up pooler "
f"({task=}) with {num_reqs} dummy requests. Please try "
- "lowering `max_num_seqs` or `gpu_memory_utilization` when "
- "initializing the engine."
+ "lowering `max_num_seqs`, `gpu_memory_utilization`, or "
+ "`gpu_memory_utilization_gb` when initializing the engine."
) from e
else:
raise e
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 842e76549..bf3bb359b 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -357,7 +357,8 @@ class Worker(WorkerBase):
Tip:
You may limit the usage of GPU memory
- by adjusting the `gpu_memory_utilization` parameter.
+ by adjusting the `gpu_memory_utilization` or
+ `gpu_memory_utilization_gb` parameter.
"""
if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes:
# still need a profile run which compiles the model for
@@ -369,7 +370,8 @@ class Worker(WorkerBase):
f"GiB, reserved {format_gib(kv_cache_memory_bytes)} GiB memory for "
"KV Cache as specified by kv_cache_memory_bytes config and "
"skipped memory profiling. This does not respect the "
- "gpu_memory_utilization config. Only use kv_cache_memory_bytes "
+ "gpu_memory_utilization or gpu_memory_utilization_gb config. "
+ "Only use kv_cache_memory_bytes "
"config when you want manual control of KV cache memory "
"size. If OOM'ed, check the difference of initial free "
"memory between the current run and the previous run "
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index d06c40ed6..89c94e641 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -405,21 +405,43 @@ def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) ->
Calculate the amount of memory required by vLLM, then validate
that the current amount of free memory is sufficient for that.
"""
- requested_memory = math.ceil(
- init_snapshot.total_memory * cache_config.gpu_memory_utilization
- )
-
- if init_snapshot.free_memory < requested_memory:
- raise ValueError(
- f"Free memory on device {init_snapshot.device_} "
- f"({format_gib(init_snapshot.free_memory)}/"
- f"{format_gib(init_snapshot.total_memory)} GiB) on startup "
- f"is less than desired GPU memory utilization "
- f"({cache_config.gpu_memory_utilization}, "
- f"{format_gib(requested_memory)} GiB). Decrease GPU memory "
- f"utilization or reduce GPU memory used by other processes."
+ if cache_config.gpu_memory_utilization_gb is not None:
+ requested_memory = math.ceil(cache_config.gpu_memory_utilization_gb * 1024**3)
+ if requested_memory <= 0:
+ raise ValueError(
+ f"gpu_memory_utilization_gb must be positive, got "
+ f"{cache_config.gpu_memory_utilization_gb} GiB."
+ )
+ if requested_memory > init_snapshot.total_memory:
+ raise ValueError(
+ f"Requested memory ({format_gib(requested_memory)} GiB) exceeds "
+ f"total GPU memory ({format_gib(init_snapshot.total_memory)} GiB). "
+ f"Reduce gpu_memory_utilization_gb or use a smaller value."
+ )
+ safety_margin = 0.5 * 1024**3
+ if requested_memory > init_snapshot.free_memory + safety_margin:
+ raise ValueError(
+ f"Requested memory ({format_gib(requested_memory)} GiB) exceeds "
+ f"available memory ({format_gib(init_snapshot.free_memory)} GiB) "
+ f"with safety margin ({format_gib(safety_margin)} GiB). "
+ f"Reduce gpu_memory_utilization_gb or free up GPU memory."
+ )
+ else:
+ requested_memory = math.ceil(
+ init_snapshot.total_memory * cache_config.gpu_memory_utilization
)
+ if init_snapshot.free_memory < requested_memory:
+ raise ValueError(
+ f"Free memory on device {init_snapshot.device_} "
+ f"({format_gib(init_snapshot.free_memory)}/"
+ f"{format_gib(init_snapshot.total_memory)} GiB) on startup "
+ f"is less than desired GPU memory utilization "
+ f"({cache_config.gpu_memory_utilization}, "
+ f"{format_gib(requested_memory)} GiB). Decrease GPU memory "
+ f"utilization or reduce GPU memory used by other processes."
+ )
+
return requested_memory
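For orientation, a small hedged illustration of the two code paths in the patched `request_memory` above: with `gpu_memory_utilization_gb` set, the reservation is an absolute size in GiB; otherwise it remains a fraction of total device memory. The 128 GiB device total below is an assumed example value, not something taken from the patch.

```bash
# Illustrative integer arithmetic (in MiB) mirroring the two paths above; all numbers are examples.
gib_in_mib=1024
total_mib=$((128 * gib_in_mib))                     # assumed device total: 128 GiB

# Absolute path: gpu_memory_utilization_gb = 96 -> fixed reservation, independent of device size
echo "absolute:   $((96 * gib_in_mib)) MiB reserved"

# Fractional path: gpu_memory_utilization = 0.9 -> 90% of total device memory
echo "fractional: $((total_mib * 90 / 100)) MiB reserved"
```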

View File

@@ -0,0 +1,6 @@
#!/bin/bash
set -e
patch -p1 -d /usr/local/lib/python3.12/dist-packages < gpu_mem.patch \
&& echo "=====> You can now use --gpu-memory-utilization-gb parameter to specify reserved memory in GiB"
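Once the mod has patched vLLM inside the container, the new flag can be passed straight to `vllm serve`. A minimal sketch follows; the model name, the 96 GiB value, and the accompanying flags are illustrative only, not part of the mod.

```bash
# Hypothetical invocation inside a patched container: reserve an absolute 96 GiB for vLLM
# instead of a fraction of total device memory. All values are examples.
vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
  --gpu-memory-utilization-gb 96 \
  --max-model-len 32768 \
  --host 0.0.0.0 \
  --port 8000
```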

View File

@@ -0,0 +1,4 @@
#!/bin/bash
set -e
cd $WORKSPACE_DIR
wget https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/resolve/main/super_v3_reasoning_parser.py

View File

@@ -1,117 +0,0 @@
#!/bin/bash
set -e
# Define a function to export immediately AND save to .bashrc for future sessions
export_persist() {
local var_name="$1"
local var_value="$2"
# 1. Export for the current running process
export "$var_name"="$var_value"
# 2. Append to .bashrc (idempotent check to avoid duplicate lines)
if ! grep -q "export $var_name=" ~/.bashrc; then
echo "export $var_name=\"$var_value\"" >> ~/.bashrc
else
# Optional: Update the existing line if it exists
sed -i "s|export $var_name=.*|export $var_name=\"$var_value\"|" ~/.bashrc
fi
}
# --- Help Function ---
usage() {
echo "Usage: $0 [OPTIONS]"
echo ""
echo "Required Arguments:"
echo " -r, --role <head|node> : Set the node type"
echo " -h, --host-ip <ip> : IP address of this interface (Host IP)"
echo " -e, --eth-if <name> : Ethernet interface name (e.g., eth0)"
echo " -i, --ib-if <name> : InfiniBand/RDMA interface name"
echo ""
echo "Conditional Arguments:"
echo " -m, --head-ip <ip> : IP of the head node (REQUIRED if role is 'node')"
echo ""
echo "Example:"
echo " $0 --role head --host-ip 192.168.1.10 --eth-if eth0 --ib-if ib0"
echo " $0 --role node --host-ip 192.168.1.20 --eth-if eth0 --ib-if ib0 --head-ip 192.168.1.10"
exit 1
}
# --- Argument Parsing ---
# Initialize variables to empty
NODE_TYPE=""
HOST_IP=""
ETH_IF_NAME=""
IB_IF_NAME=""
HEAD_IP=""
while [[ "$#" -gt 0 ]]; do
case $1 in
-r|--role) NODE_TYPE="$2"; shift ;;
-h|--host-ip) HOST_IP="$2"; shift ;;
-e|--eth-if) ETH_IF_NAME="$2"; shift ;;
-i|--ib-if) IB_IF_NAME="$2"; shift ;;
-m|--head-ip) HEAD_IP="$2"; shift ;;
*) echo "Unknown parameter passed: $1"; usage ;;
esac
shift
done
# --- Validation ---
# 1. Check if all common required arguments are present
if [[ -z "$NODE_TYPE" || -z "$HOST_IP" || -z "$ETH_IF_NAME" || -z "$IB_IF_NAME" ]]; then
echo "Error: Missing required arguments."
usage
fi
# 2. Validate Role
if [[ "$NODE_TYPE" != "head" && "$NODE_TYPE" != "node" ]]; then
echo "Error: --role must be 'head' or 'node'."
exit 1
fi
# 3. Conditional Check for Head IP
if [[ "$NODE_TYPE" == "node" && -z "$HEAD_IP" ]]; then
echo "Error: When --role is 'node', you must provide --head-ip."
exit 1
fi
# --- Environment Configuration ---
echo "Configuring environment for [$NODE_TYPE] at $HOST_IP..."
export_persist VLLM_HOST_IP "$HOST_IP"
export_persist RAY_NODE_IP_ADDRESS "$HOST_IP"
export_persist RAY_OVERRIDE_NODE_IP_ADDRESS "$HOST_IP"
# Network Interface
export_persist MN_IF_NAME "$ETH_IF_NAME"
export_persist UCX_NET_DEVICES "$ETH_IF_NAME"
export_persist NCCL_SOCKET_IFNAME "$ETH_IF_NAME"
# InfiniBand
export_persist NCCL_IB_HCA "$IB_IF_NAME"
export_persist NCCL_IB_DISABLE "0"
# Sockets/Transport
export_persist OMPI_MCA_btl_tcp_if_include "$ETH_IF_NAME"
export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist RAY_memory_monitor_refresh_ms "0"
# --- Execution ---
if [ "${NODE_TYPE}" == "head" ]; then
echo "Starting Ray HEAD node..."
exec ray start --block --head --port 6379 \
--node-ip-address "$VLLM_HOST_IP" \
--disable-usage-stats
else
echo "Starting Ray WORKER node connecting to $HEAD_IP..."
exec ray start --block \
--address="$HEAD_IP:6379" \
--node-ip-address "$VLLM_HOST_IP"
fi

View File

@@ -1,6 +1,5 @@
#!/bin/bash
set -e
echo "Setting up cluster initialization script..."
cp run-cluster-node.sh $WORKSPACE_DIR/run-cluster-node.sh
chmod +x $WORKSPACE_DIR/run-cluster-node.sh
# NGC vLLM mod: container initialization is now handled by launch-cluster.sh
echo "NGC vLLM mod applied."

View File

@@ -0,0 +1,58 @@
# Recipe: Qwen3.5-397B-A17B-INT4-Autoround (PP=3)
# Qwen3.5-397B model in Intel INT4-Autoround quantization
# Important: set memory utilization in GB, not percentage! Requires --no-ray to fit full context on two sparks.
# If you experience node shutdown, please limit GPU clocks on the affected node (or both): `sudo nvidia-smi -lgc 200,2150`
recipe_version: "1"
name: Qwen3.5-397B-INT4-Autoround (PP=3)
description: Recipe for Qwen3.5-397B-INT4-Autoround to run on 3-node mesh in pipeline-parallel mode
# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3.5-397B-A17B-int4-AutoRound
cluster_only: true
# Container image to use
container: vllm-node-tf5
build_args:
- --tf5
mods:
- mods/fix-qwen3.5-chat-template
# Default settings (can be overridden via CLI)
defaults:
port: 8000
host: 0.0.0.0
pipeline_parallel: 3
gpu_memory_utilization: 0.7
max_model_len: 262144
max_num_batched_tokens: 16384
# Environment variables
env:
VLLM_MARLIN_USE_ATOMIC_ADD: 1
# The vLLM serve command template
command: |
vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
--max-model-len {max_model_len} \
--max-num-seqs 10 \
--kv-cache-dtype fp8 \
--gpu-memory-utilization {gpu_memory_utilization} \
--port {port} \
--host {host} \
--enable-prefix-caching \
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--reasoning-parser qwen3 \
--max-num-batched-tokens {max_num_batched_tokens} \
--trust-remote-code \
--chat-template unsloth.jinja \
-tp 1 \
-pp {pipeline_parallel} \
--load-format instanttensor \
--distributed-executor-backend ray

View File

@@ -0,0 +1,45 @@
# Recipe: MiniMax-M2.5
# MiniMaxAI/MiniMax-M2.5
recipe_version: "1"
name: MiniMax-M2.5
description: vLLM serving MiniMax-M2.5 with Ray distributed backend
# HuggingFace model to download (optional, for --download-model)
model: MiniMaxAI/MiniMax-M2.5
# Container image to use
container: vllm-node
# Can only be run in a cluster
cluster_only: true
# No mods required
mods: []
# Default settings (can be overridden via CLI)
defaults:
port: 8000
host: 0.0.0.0
tensor_parallel: 4
gpu_memory_utilization: 0.90
max_model_len: 128000
# Environment variables
env:
VLLM_DISTRIBUTED_EXECUTOR_CONFIG: '{"placement_group_options":{"strategy":"SPREAD"}}'
# The vLLM serve command template
command: |
vllm serve MiniMaxAI/MiniMax-M2.5 \
--trust-remote-code \
--port {port} \
--host {host} \
--gpu-memory-utilization {gpu_memory_utilization} \
-tp {tensor_parallel} \
--distributed-executor-backend ray \
--max-model-len {max_model_len} \
--load-format fastsafetensors \
--enable-auto-tool-choice \
--tool-call-parser minimax_m2 \
--reasoning-parser minimax_m2_append_think

View File

@@ -0,0 +1,61 @@
# Recipe: Qwen3.5-397B-A17B-FP8
# Qwen3.5-397B-A17B model in FP8 precision
# Multi-modal input
recipe_version: "1"
name: Qwen3.5-397B-A17B-FP8
description: vLLM serving Qwen3.5-397B-A17B-FP8
# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3.5-397B-A17B-FP8
#solo_only: true
# Container image to use
container: vllm-node-tf5
build_args:
- --tf5
# Mod required to fix ROPE syntax error
mods:
- mods/fix-qwen3.5-autoround
# Default settings (can be overridden via CLI)
defaults:
port: 8000
host: 0.0.0.0
tensor_parallel: 4
gpu_memory_utilization: 0.85
max_model_len: 262144
max_num_batched_tokens: 8192
# Environment variables
env:
VLLM_USE_DEEP_GEMM: 0
VLLM_USE_FLASHINFER_MOE_FP16: 1
VLLM_USE_FLASHINFER_SAMPLER: 0
OMP_NUM_THREADS: 4
# The vLLM serve command template
command: |
vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \
--max-model-len {max_model_len} \
--gpu-memory-utilization {gpu_memory_utilization} \
--port {port} \
--host {host} \
--load-format fastsafetensors \
--enable-prefix-caching \
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--reasoning-parser qwen3 \
--max-num-batched-tokens {max_num_batched_tokens} \
--trust-remote-code \
-tp {tensor_parallel} \
--distributed-executor-backend ray \
--mm-encoder-tp-mode data \
--kv-cache-dtype fp8 \
--compilation-config.cudagraph_mode none \
--max-num-seqs 32 \
--attention-backend flashinfer

View File

@@ -0,0 +1,53 @@
# Recipe: Qwen3.5-397B-A17B-INT4-Autoround
# Qwen3.5-397B model in Intel INT4-Autoround quantization, TP=4 across 4 DGX Spark nodes
# Benchmarked at 37 tok/s single-user, 103 tok/s aggregate (4 concurrent) on 4× DGX Spark
# Requires NVIDIA driver 580.x (590.x has CUDAGraph deadlock bug on GB10)
recipe_version: "1"
name: Qwen3.5-397B-INT4-Autoround
description: Qwen3.5-397B with TP=4 across 4 DGX Spark nodes (Marlin fix applied)
# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3.5-397B-A17B-int4-AutoRound
# Container image to use
container: vllm-node-tf5
build_args:
- --tf5
# Mods required: coder-next tool/reasoning parser + Marlin TP fix
mods:
- mods/fix-qwen3-coder-next
- mods/fix-qwen35-tp4-marlin
# Environment variables
env:
VLLM_MARLIN_USE_ATOMIC_ADD: 1
PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
# Default settings (can be overridden via CLI, e.g. --tensor_parallel 2)
defaults:
port: 8000
host: 0.0.0.0
tensor_parallel: 4
gpu_memory_utilization: 0.78
max_model_len: 32768
max_num_batched_tokens: 8192
# The vLLM serve command template
command: |
vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
--tool-call-parser qwen3_coder \
--reasoning-parser qwen3 \
--enable-auto-tool-choice \
--tensor-parallel-size {tensor_parallel} \
--distributed-executor-backend ray \
--kv-cache-dtype fp8 \
--gpu-memory-utilization {gpu_memory_utilization} \
--max-model-len {max_model_len} \
--max-num-batched-tokens {max_num_batched_tokens} \
--enable-prefix-caching \
--trust-remote-code \
--host {host} \
--port {port}

View File

@@ -44,12 +44,16 @@ The recipe runner can automatically discover cluster nodes:
```
When you run `--discover`, it:
1. Scans the network for nodes with SSH access
2. Prompts you to select which nodes to include
3. Saves the configuration to `.env`
1. Detects active CX7 interfaces and determines mesh vs. standard topology.
2. Scans the network for peers that are both SSH-reachable **and** have an NVIDIA GB10 GPU.
3. In mesh mode, separately discovers `COPY_HOSTS` on the direct IB-attached interfaces.
4. Prompts for per-node confirmation for `CLUSTER_NODES` and `COPY_HOSTS`.
5. Saves the full configuration (including mesh NCCL settings if applicable) to `.env`.
Future recipe runs will automatically use nodes from `.env` unless you specify `-n` or `--solo`.
When distributing the container image or model files, the runner uses `COPY_HOSTS` from `.env` (which may differ from `CLUSTER_NODES` in mesh mode) to ensure transfers go over the fastest available path.
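For illustration, a hypothetical `.env` produced by discovery might look like the sketch below; the exact variable layout and the addresses are assumptions based on the names used in this README, not a verbatim sample.

```bash
# Hypothetical .env written by --discover (all addresses made up).
# CLUSTER_NODES: SSH-reachable GB10 peers; the first entry acts as the head node.
CLUSTER_NODES=192.168.1.10,192.168.1.11,192.168.1.12
# COPY_HOSTS: direct IB-attached addresses used for image/model transfers in mesh mode.
COPY_HOSTS=10.10.0.11,10.10.0.12
```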
## Workflow Modes
### Solo Mode (Single Node)
@@ -169,6 +173,7 @@ Usage: ./run-recipe.sh [OPTIONS] [RECIPE]
Cluster discovery:
--discover Auto-detect cluster nodes and save to .env
--show-env Show current .env configuration
--config FILE Path to .env configuration file (default: .env in repo directory)
Recipe overrides:
--port PORT Override port
@@ -186,10 +191,25 @@ Setup options:
Launch options:
--solo Run in solo mode (single node, no Ray)
--no-ray Multi-node without Ray (PyTorch distributed backend)
-n, --nodes IPS Comma-separated node IPs (first = head)
-d, --daemon Run in daemon mode
-t, --container IMAGE Override container from recipe
--name NAME Override container name
--nccl-debug LEVEL NCCL debug level (VERSION, WARN, INFO, TRACE)
--master-port PORT Cluster coordination port: Ray head port or PyTorch
distributed master port (default: 29501).
Alias: --head-port
--eth-if IFACE Override Ethernet interface
--ib-if IFACE Override InfiniBand interface
-e VAR=VALUE Pass environment variable to container (repeatable)
-j N Number of parallel build jobs
--no-cache-dirs Do not mount ~/.cache/vllm, ~/.cache/flashinfer, ~/.triton
--non-privileged Run container without --privileged
--mem-limit-gb N Memory limit in GB (only with --non-privileged)
--mem-swap-limit-gb N Memory+swap limit in GB (only with --non-privileged)
--pids-limit N Process limit (only with --non-privileged)
--shm-size-gb N Shared memory size in GB (only with --non-privileged)
Extra vLLM arguments:
-- ARGS... Pass additional arguments directly to vLLM
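To tie the options together, a hedged example invocation; the recipe name and addresses are placeholders, and only flags documented above are used.

```bash
# Illustrative only: run a recipe on two explicitly listed nodes, with an env var
# passed into the container and extra arguments forwarded to vLLM.
./run-recipe.sh qwen3.5-397b-a17b-int4 \
  -n 192.168.1.10,192.168.1.11 \
  --nccl-debug WARN \
  -e HF_TOKEN=hf_example \
  -- --max-num-seqs 4
```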
@@ -261,10 +281,18 @@ command: |
```
┌─────────────────────────────────────────────────────────┐
│ autodiscover.sh │
│ - Interface detection (standard / mesh topology) │
│ - GB10 peer verification via SSH │
│ - CLUSTER_NODES and COPY_HOSTS discovery │
│ - Interactive .env save with per-node confirmation │
└──────────────────────────┬──────────────────────────────┘
│ sourced by
┌─────────────────────────────────────────────────────────┐
│ run-recipe.sh / run-recipe.py │
│ - Parses YAML recipe │
│ - Auto-discovers cluster nodes (--discover)
│ - Loads nodes from .env │
│ - Loads / triggers cluster discovery (--discover) │
│ - Handles --setup (build + download + run) │
│ - Generates launch script from template │
│ - Applies CLI overrides │
@@ -274,15 +302,15 @@ command: |
┌──────────────────────┐ ┌───────────────────────────────┐
│ build-and-copy.sh │ │ hf-download.sh │
│ - Docker build │ │ - HuggingFace model download │
│ - Copy to workers │ │ - Rsync to workers
│ - Copy to COPY_HOSTS│ │ - Rsync to COPY_HOSTS
└──────────────────────┘ └───────────────────────────────┘
│ then calls (for run)
┌─────────────────────────────────────────────────────────┐
│ launch-cluster.sh │
│ - Cluster orchestration │
│ - Container lifecycle
│ - Container lifecycle (trimmed to required node count)
│ - Mod application │
│ - Launch script execution │
└─────────────────────────────────────────────────────────┘

View File

@@ -0,0 +1,53 @@
# Recipe: Gemma4-26B-A4B
# Gemma4-26B-A4B model in online FP8 quantization
recipe_version: "1"
name: Gemma4-26B-A4B
description: vLLM serving Gemma4-26B-A4B
# HuggingFace model to download (optional, for --download-model)
model: google/gemma-4-26B-A4B-it
# Only cluster is supported
cluster_only: false
solo_only: false
# Container image to use
container: vllm-node-tf5
build_args:
- --tf5
# Mods
# mods:
# - mods/fix-gemma4-tool-parser
# Default settings (can be overridden via CLI)
defaults:
port: 8000
host: 0.0.0.0
tensor_parallel: 2
gpu_memory_utilization: 0.7
max_model_len: 262144
max_num_batched_tokens: 8192
# Environment variables
env: {}
# The vLLM serve command template
command: |
vllm serve google/gemma-4-26B-A4B-it \
--max-model-len {max_model_len} \
--gpu-memory-utilization {gpu_memory_utilization} \
--port {port} \
--host {host} \
--load-format safetensors \
--enable-prefix-caching \
--enable-auto-tool-choice \
--tool-call-parser gemma4 \
--reasoning-parser gemma4 \
--quantization fp8 \
--kv-cache-dtype fp8 \
--max-num-batched-tokens {max_num_batched_tokens} \
-tp {tensor_parallel} --distributed-executor-backend ray

View File

@@ -30,8 +30,8 @@ build_args:
# Mods to apply before running (paths relative to repo root)
# This mod prevents severe inference speed degradation
mods:
- mods/fix-glm-4.7-flash-AWQ
# mods:
# - mods/fix-glm-4.7-flash-AWQ
# Default settings (can be overridden via CLI)
defaults:

View File

@@ -0,0 +1,44 @@
# Recipe: MiniMax-M2.7-AWQ
# MiniMax M2.7 model with AWQ quantization
recipe_version: "1"
name: MiniMax-M2.7-AWQ
description: vLLM serving MiniMax-M2.7-AWQ with Ray distributed backend
# HuggingFace model to download (optional, for --download-model)
model: cyankiwi/MiniMax-M2.7-AWQ-4bit
# Container image to use
container: vllm-node
# Can only be run in a cluster
cluster_only: true
# No mods required
mods: []
# Default settings (can be overridden via CLI)
defaults:
port: 8000
host: 0.0.0.0
tensor_parallel: 2
gpu_memory_utilization: 0.8
max_model_len: 196608
# Environment variables
env: {}
# The vLLM serve command template
command: |
vllm serve cyankiwi/MiniMax-M2.7-AWQ-4bit \
--trust-remote-code \
--port {port} \
--host {host} \
--gpu-memory-utilization {gpu_memory_utilization} \
-tp {tensor_parallel} \
--distributed-executor-backend ray \
--max-model-len {max_model_len} \
--load-format fastsafetensors \
--enable-auto-tool-choice \
--tool-call-parser minimax_m2 \
--reasoning-parser minimax_m2

View File

@@ -25,16 +25,12 @@ defaults:
host: 0.0.0.0
tensor_parallel: 1
gpu_memory_utilization: 0.7
max_model_len: 131072
# Environment variables
env:
VLLM_USE_FLASHINFER_MOE_FP4: 1
VLLM_FLASHINFER_MOE_BACKEND: "throughput"
max_model_len: 262144
# The vLLM serve command template
command: |
vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
--moe-backend cutlass \
--max-model-len {max_model_len} \
--port {port} --host {host} \
--trust-remote-code \
@@ -44,6 +40,5 @@ command: |
--reasoning-parser nano_v3 \
--kv-cache-dtype fp8 \
--enable-prefix-caching \
--attention-backend flashinfer \
--load-format fastsafetensors \
--gpu-memory-utilization {gpu_memory_utilization}

View File

@@ -0,0 +1,46 @@
# Recipe: Nemotron-3-Super-NVFP4
# Uses VLLM_CUTLASS for NVFP4
recipe_version: "1"
name: Nemotron-3-Super-NVFP4-CUTLASS-Optimized
description: vLLM serving Nemotron-3-Super-120B using CUTLASS kernels
model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
container: vllm-node
cluster_only: false
solo_only: false
# mods:
# - mods/nemotron-super
env:
VLLM_FLASHINFER_ALLREDUCE_BACKEND: trtllm
VLLM_ALLOW_LONG_MAX_MODEL_LEN: 1
defaults:
port: 8000
host: 0.0.0.0
tensor_parallel: 2
gpu_memory_utilization: 0.7
max_model_len: 262144
max_num_seqs: 10
command: |
vllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \
--kv-cache-dtype fp8 \
--moe-backend cutlass \
--trust-remote-code \
--gpu-memory-utilization {gpu_memory_utilization} \
--max-model-len {max_model_len} \
--max-num-seqs {max_num_seqs} \
--enable-prefix-caching \
--host {host} \
--port {port} \
--enable-auto-tool-choice \
--load-format fastsafetensors \
--tool-call-parser qwen3_coder \
--reasoning-parser nemotron_v3 \
--mamba_ssm_cache_dtype float32 \
--tensor-parallel-size {tensor_parallel} \
--attention-backend TRITON_ATTN \
--distributed-executor-backend ray

View File

@@ -11,6 +11,9 @@ model: openai/gpt-oss-120b
# Container image to use
container: vllm-node-mxfp4
# Only solo now
solo_only: true
# Build arguments for build-and-copy.sh
build_args:
- --exp-mxfp4
@@ -22,7 +25,7 @@ mods: []
defaults:
port: 8000
host: 0.0.0.0
tensor_parallel: 2
tensor_parallel: 1
gpu_memory_utilization: 0.70
max_num_batched_tokens: 8192
@@ -37,8 +40,6 @@ command: |
--tool-call-parser openai \
--reasoning-parser openai_gptoss \
--enable-auto-tool-choice \
--tensor-parallel-size {tensor_parallel} \
--distributed-executor-backend ray \
--gpu-memory-utilization {gpu_memory_utilization} \
--enable-prefix-caching \
--load-format fastsafetensors \

View File

@@ -15,8 +15,8 @@ model: Qwen/Qwen3-Coder-Next-FP8
container: vllm-node
# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857)
mods:
- mods/fix-qwen3-coder-next
# mods:
# - mods/fix-qwen3-coder-next
# Default settings (can be overridden via CLI)
defaults:

View File

@@ -0,0 +1,43 @@
# Recipe: Qwen3-Coder-Next-int4-Autoround
# Qwen3-Coder-Next model in Intel int4-Autoround format
recipe_version: "1"
name: Qwen3-Coder-Next-int4-Autoround
description: Qwen3-Coder-Next-int4-Autoround
# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3-Coder-Next-int4-AutoRound
solo_only: true
# Container image to use
container: vllm-node
# Mod required to fix autoround weight loading issues
mods:
- mods/fix-qwen3-next-autoround
# Default settings (can be overridden via CLI)
defaults:
port: 8000
host: 0.0.0.0
gpu_memory_utilization: 0.7
max_model_len: 262144
# Environment variables
env:
VLLM_MARLIN_USE_ATOMIC_ADD: 1
# The vLLM serve command template
command: |
vllm serve Intel/Qwen3-Coder-Next-int4-AutoRound \
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--gpu-memory-utilization {gpu_memory_utilization} \
--host {host} \
--port {port} \
--load-format fastsafetensors \
--enable-prefix-caching \
--max-model-len {max_model_len}

View File

@@ -15,7 +15,8 @@ cluster_only: true
container: vllm-node
# No mods required
mods: []
mods:
- mods/fix-qwen3.5-chat-template
# Default settings (can be overridden via CLI)
defaults:
@@ -41,5 +42,6 @@ command: |
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--reasoning-parser qwen3 \
--chat-template unsloth.jinja \
-tp {tensor_parallel} --distributed-executor-backend ray \
--max-num-batched-tokens {max_num_batched_tokens}

View File

@@ -18,7 +18,8 @@ build_args:
# Mod required to fix ROPE syntax error
mods:
- mods/fix-qwen3.5-autoround
# - mods/fix-qwen3.5-autoround
- mods/fix-qwen3.5-chat-template
# Default settings (can be overridden via CLI)
defaults:
@@ -43,10 +44,11 @@ command: |
--load-format fastsafetensors \
--enable-prefix-caching \
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--tool-call-parser qwen3_xml \
--reasoning-parser qwen3 \
--max-num-batched-tokens {max_num_batched_tokens} \
--trust-remote-code \
--chat-template unsloth.jinja \
-tp {tensor_parallel} \
--distributed-executor-backend ray

View File

@@ -0,0 +1,51 @@
# Recipe: Qwen/Qwen3.5-35B-A3B-FP8
# Qwen/Qwen3.5-35B-A3B model in native FP8 format
recipe_version: "1"
name: Qwen35-35B-A3B
description: vLLM serving Qwen3.5-35B-A3B-FP8
# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3.5-35B-A3B-FP8
#solo_only: true
# Container image to use
container: vllm-node
# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857)
mods:
- mods/fix-qwen3-coder-next
- mods/fix-qwen3.5-chat-template
# Default settings (can be overridden via CLI)
defaults:
port: 8000
host: 0.0.0.0
tensor_parallel: 2
gpu_memory_utilization: 0.7
max_model_len: 262144
max_num_batched_tokens: 16384
# Environment variables
env:
VLLM_MARLIN_USE_ATOMIC_ADD: 1
# The vLLM serve command template
command: |
vllm serve Qwen/Qwen3.5-35B-A3B-FP8 \
--host {host} \
--port {port} \
--max-model-len {max_model_len} \
--max-num-batched-tokens {max_num_batched_tokens} \
--gpu-memory-utilization {gpu_memory_utilization} \
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--kv-cache-dtype fp8 \
--load-format fastsafetensors \
--attention-backend flashinfer \
--enable-prefix-caching \
--chat-template unsloth.jinja \
-tp {tensor_parallel} \
--distributed-executor-backend ray

View File

@@ -0,0 +1,59 @@
# Recipe: Qwen3.5-397B-A17B-INT4-Autoround (experimental)
# Qwen3.5-397B model in Intel INT4-Autoround quantization
# Important: set memory utilization in GB, not percentage! Requires --no-ray to fit full context on two sparks.
# If you experience node shutdown, please limit GPU clocks on the affected node (or both): `sudo nvidia-smi -lgc 200,2150`
recipe_version: "1"
name: Qwen3.5-397B-INT4-Autoround
description: EXPERIMENTAL recipe for Qwen3.5-397B-INT4-Autoround (please refer to README for details! Use with `--no-ray` parameter!)
# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3.5-397B-A17B-int4-AutoRound
cluster_only: true
# Container image to use
container: vllm-node-tf5
build_args:
- --tf5
mods:
- mods/fix-qwen3.5-chat-template
#- mods/gpu-mem-util-gb
# - mods/drop-caches
# Default settings (can be overridden via CLI)
defaults:
port: 8000
host: 0.0.0.0
tensor_parallel: 2
gpu_memory_utilization: 0.9
max_model_len: 262144
max_num_batched_tokens: 4176
# Environment variables
env:
VLLM_MARLIN_USE_ATOMIC_ADD: 1
# The vLLM serve command template
command: |
vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
--max-model-len {max_model_len} \
--max-num-seqs 2 \
--kv-cache-dtype fp8 \
--gpu-memory-utilization {gpu_memory_utilization} \
--port {port} \
--host {host} \
--enable-prefix-caching \
--enable-auto-tool-choice \
--tool-call-parser qwen3_xml \
--reasoning-parser qwen3 \
--max-num-batched-tokens {max_num_batched_tokens} \
--trust-remote-code \
--chat-template unsloth.jinja \
--load-format instanttensor \
-tp {tensor_parallel} \
--distributed-executor-backend ray

View File

@@ -0,0 +1,51 @@
# Recipe: Qwen/Qwen3.6-35B-A3B-FP8
# Qwen/Qwen3.6-35B-A3B model in native FP8 format
recipe_version: "1"
name: Qwen36-35B-A3B
description: vLLM serving Qwen3.6-35B-A3B-FP8
# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3.6-35B-A3B-FP8
#solo_only: true
# Container image to use
container: vllm-node
# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857)
mods:
- mods/fix-qwen3.6-chat-template
# Default settings (can be overridden via CLI)
defaults:
port: 8000
host: 0.0.0.0
tensor_parallel: 2
gpu_memory_utilization: 0.7
max_model_len: 262144
max_num_batched_tokens: 16384
# Environment variables
env:
VLLM_MARLIN_USE_ATOMIC_ADD: 1
# The vLLM serve command template
command: |
vllm serve Qwen/Qwen3.6-35B-A3B-FP8 \
--host {host} \
--port {port} \
--max-model-len {max_model_len} \
--max-num-batched-tokens {max_num_batched_tokens} \
--gpu-memory-utilization {gpu_memory_utilization} \
--enable-auto-tool-choice \
--tool-call-parser qwen3_xml \
--reasoning-parser qwen3 \
--load-format fastsafetensors \
--attention-backend flash_attn \
--enable-prefix-caching \
--chat-template fixed_chat_template.jinja \
--speculative-config '{{"method": "dflash", "model": "z-lab/Qwen3.6-35B-A3B-DFlash", "num_speculative_tokens": 15}}' \
-tp {tensor_parallel} \
--distributed-executor-backend ray

View File

@@ -0,0 +1,49 @@
# Recipe: Qwen/Qwen3.6-35B-A3B-FP8
# Qwen/Qwen3.6-35B-A3B model in native FP8 format
recipe_version: "1"
name: Qwen36-35B-A3B
description: vLLM serving Qwen3.6-35B-A3B-FP8
# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3.6-35B-A3B-FP8
#solo_only: true
# Container image to use
container: vllm-node
mods:
- mods/fix-qwen3.6-chat-template
# Default settings (can be overridden via CLI)
defaults:
port: 8000
host: 0.0.0.0
tensor_parallel: 2
gpu_memory_utilization: 0.7
max_model_len: 262144
max_num_batched_tokens: 16384
# Environment variables
env:
VLLM_MARLIN_USE_ATOMIC_ADD: 1
# The vLLM serve command template
command: |
vllm serve Qwen/Qwen3.6-35B-A3B-FP8 \
--host {host} \
--port {port} \
--max-model-len {max_model_len} \
--max-num-batched-tokens {max_num_batched_tokens} \
--gpu-memory-utilization {gpu_memory_utilization} \
--enable-auto-tool-choice \
--tool-call-parser qwen3_xml \
--kv-cache-dtype fp8 \
--load-format fastsafetensors \
--attention-backend flashinfer \
--enable-prefix-caching \
--chat-template fixed_chat_template.jinja \
-tp {tensor_parallel} \
--distributed-executor-backend ray

View File

@@ -1,120 +0,0 @@
#!/bin/bash
set -e
# Define a function to export immediately AND save to .bashrc for future sessions
export_persist() {
local var_name="$1"
local var_value="$2"
# 1. Export for the current running process
export "$var_name"="$var_value"
# 2. Append to .bashrc (idempotent check to avoid duplicate lines)
if ! grep -q "export $var_name=" ~/.bashrc; then
echo "export $var_name=\"$var_value\"" >> ~/.bashrc
else
# Optional: Update the existing line if it exists
sed -i "s|export $var_name=.*|export $var_name=\"$var_value\"|" ~/.bashrc
fi
}
# --- Help Function ---
usage() {
echo "Usage: $0 [OPTIONS]"
echo ""
echo "Required Arguments:"
echo " -r, --role <head|node> : Set the node type"
echo " -h, --host-ip <ip> : IP address of this interface (Host IP)"
echo " -e, --eth-if <name> : Ethernet interface name (e.g., eth0)"
echo " -i, --ib-if <name> : InfiniBand/RDMA interface name"
echo ""
echo "Conditional Arguments:"
echo " -m, --head-ip <ip> : IP of the head node (REQUIRED if role is 'node')"
echo ""
echo "Example:"
echo " $0 --role head --host-ip 192.168.1.10 --eth-if eth0 --ib-if ib0"
echo " $0 --role node --host-ip 192.168.1.20 --eth-if eth0 --ib-if ib0 --head-ip 192.168.1.10"
exit 1
}
# --- Argument Parsing ---
# Initialize variables to empty
NODE_TYPE=""
HOST_IP=""
ETH_IF_NAME=""
IB_IF_NAME=""
HEAD_IP=""
while [[ "$#" -gt 0 ]]; do
case $1 in
-r|--role) NODE_TYPE="$2"; shift ;;
-h|--host-ip) HOST_IP="$2"; shift ;;
-e|--eth-if) ETH_IF_NAME="$2"; shift ;;
-i|--ib-if) IB_IF_NAME="$2"; shift ;;
-m|--head-ip) HEAD_IP="$2"; shift ;;
*) echo "Unknown parameter passed: $1"; usage ;;
esac
shift
done
# --- Validation ---
# 1. Check if all common required arguments are present
if [[ -z "$NODE_TYPE" || -z "$HOST_IP" || -z "$ETH_IF_NAME" || -z "$IB_IF_NAME" ]]; then
echo "Error: Missing required arguments."
usage
fi
# 2. Validate Role
if [[ "$NODE_TYPE" != "head" && "$NODE_TYPE" != "node" ]]; then
echo "Error: --role must be 'head' or 'node'."
exit 1
fi
# 3. Conditional Check for Head IP
if [[ "$NODE_TYPE" == "node" && -z "$HEAD_IP" ]]; then
echo "Error: When --role is 'node', you must provide --head-ip."
exit 1
fi
# --- Environment Configuration ---
echo "Configuring environment for [$NODE_TYPE] at $HOST_IP..."
export_persist VLLM_HOST_IP "$HOST_IP"
export_persist RAY_NODE_IP_ADDRESS "$HOST_IP"
export_persist RAY_OVERRIDE_NODE_IP_ADDRESS "$HOST_IP"
# Network Interface
export_persist MN_IF_NAME "$ETH_IF_NAME"
export_persist UCX_NET_DEVICES "$ETH_IF_NAME"
export_persist NCCL_SOCKET_IFNAME "$ETH_IF_NAME"
# InfiniBand
export_persist NCCL_IB_HCA "$IB_IF_NAME"
export_persist NCCL_IB_DISABLE "0"
# Sockets/Transport
export_persist OMPI_MCA_btl_tcp_if_include "$ETH_IF_NAME"
export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist RAY_memory_monitor_refresh_ms "0"
# --- Execution ---
if [ "${NODE_TYPE}" == "head" ]; then
echo "Starting Ray HEAD node..."
exec ray start --block --head --port 6379 \
--node-ip-address "$VLLM_HOST_IP" \
--include-dashboard=True \
--dashboard-host "0.0.0.0" \
--dashboard-port 8265 \
--disable-usage-stats
else
echo "Starting Ray WORKER node connecting to $HEAD_IP..."
exec ray start --block \
--address="$HEAD_IP:6379" \
--node-ip-address "$VLLM_HOST_IP"
fi

File diff suppressed because it is too large.

View File

@@ -728,6 +728,48 @@ test_launch_cmd_no_solo_in_cluster() {
fi
}
# Test: -e / --env passthrough to launch-cluster.sh
test_launch_cmd_env_passthrough() {
log_test "Launch command includes -e env vars"
recipe_name=$(find_solo_recipe)
if [[ -z "$recipe_name" ]]; then
log_skip "No solo-capable recipes found"
return
fi
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -e HF_TOKEN=test123 -e MY_VAR=hello 2>&1)
launch_cmd=$(extract_launch_cmd "$output")
if echo "$launch_cmd" | grep -q "\-e HF_TOKEN=test123" && echo "$launch_cmd" | grep -q "\-e MY_VAR=hello"; then
log_pass "Launch command includes -e env vars"
else
log_fail "-e env vars not found in launch command"
log_verbose "Launch cmd: $launch_cmd"
fi
}
# Test: no -e flags when none specified
test_launch_cmd_no_env_by_default() {
log_test "Launch command omits -e when no env vars specified"
recipe_name=$(find_solo_recipe)
if [[ -z "$recipe_name" ]]; then
log_skip "No solo-capable recipes found"
return
fi
output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)
launch_cmd=$(extract_launch_cmd "$output")
if echo "$launch_cmd" | grep -q " -e "; then
log_fail "Unexpected -e flag in launch command"
log_verbose "Launch cmd: $launch_cmd"
else
log_pass "Launch command correctly omits -e when none specified"
fi
}
# ==============================================================================
# README Documentation Verification Tests
# ==============================================================================
@@ -1203,6 +1245,8 @@ main() {
test_launch_cmd_launch_script
test_launch_cmd_container_override
test_launch_cmd_no_solo_in_cluster
test_launch_cmd_env_passthrough
test_launch_cmd_no_env_by_default
echo ""
# README documentation verification tests

wheels/.gitignore (vendored, new file, 2 lines)
View File

@@ -0,0 +1,2 @@
*
!.gitignore