Compare commits: `staging-cu...prebuilt-f` (179 commits)
| SHA1 |
| :--- |
| ba9dde963f |
| ae8ac815ac |
| 83a680c87b |
| 69ea62294f |
| 8e548ce664 |
| bca64f9a53 |
| 29d5904b80 |
| b87854fd4c |
| c67c5b5c1e |
| 9fbed882bc |
| 97e51d5d23 |
| 87cb9f6e1e |
| e3243bf555 |
| 43a00ed90f |
| ef9b0e50f4 |
| c1e952de2e |
| b13a3600d3 |
| 7dea11bbf0 |
| c187912e23 |
| caa28c8e12 |
| 5415c1fe9e |
| d49fac1b8b |
| 6b7f8dace6 |
| 76fbf0d0be |
| b7830469be |
| b50fa426c8 |
| 2c13e1ce25 |
| c026c92bd0 |
| cf4cb35356 |
| 1ad85442ac |
| 30919581ee |
| b7c8616743 |
| 8e8e850ef1 |
| fc08740fba |
| 288da8e911 |
| 7bc4e4ce5e |
| 49d6d9fefd |
| 4afca860a5 |
| ed32612cdd |
| 44808f7018 |
| 12caec228e |
| 27eb35f08d |
| 3335540972 |
| ae25d64ac0 |
| a770865834 |
| 7b47235463 |
| 3a3ab98b3e |
| 23fb7dcc20 |
| c4860b86a2 |
| 044557943c |
| ead749239d |
| a889fed254 |
| e89104d91b |
| 15a04ada32 |
| a467a7a0bd |
| 48318380f9 |
| 287d3c72e5 |
| 9370b2bb34 |
| bb177383ff |
| 7f0be29fcc |
| 41c0ce2c9a |
| 45494688d1 |
| a3201f8873 |
| e471ca2436 |
| 32674c2619 |
| 47f5f931b5 |
| d37217bad0 |
| e70c87b4f6 |
| c1a6cec074 |
| 51d69c5c17 |
| e7f2ee692f |
| 101ae6fd56 |
| f4ca15ce18 |
| 3d918e0b82 |
| 47a896d722 |
| 0fa585f909 |
| cecec74828 |
| c8ee2a2511 |
| ce293b5f05 |
| f872cc17a8 |
| 00c16746e5 |
| f163ca69de |
| a78e221de3 |
| e6ee108cdf |
| 174de6f0a8 |
| 83a74bccec |
| ff18a9ad5b |
| c08b34a218 |
| 23cca2a11a |
| c2fe579ccc |
| 8b7c02aa25 |
| 73fec1bdf8 |
| 2f5ff0211e |
| 63ee72e729 |
| 4a0feea6c3 |
| 429042b7dc |
| ef95336937 |
| b8930b05a1 |
| 49d505ad14 |
| 1755dfd114 |
| 3d4dc4c82e |
| 07fac71dac |
| 1702f47df6 |
| ad2cd3373f |
| 1fd8c7afc3 |
| 3dcd2a90c1 |
| efacbd69f2 |
| c4b078b868 |
| 3be2fb24a8 |
| 7fa69187df |
| 8298c3d7f8 |
| f8c2653fd3 |
| 990a7b3837 |
| 9e089acf2b |
| 2d749742e4 |
| 7a54657abf |
| 926dd57a87 |
| 6e8d85c914 |
| d6e76f8e2f |
| 8385506c5e |
| 8caebe3155 |
| 919a881cb1 |
| 8ddc259619 |
| 22f3fa6c21 |
| 15d295887c |
| 7e4150feed |
| 7b752c31c5 |
| bdd2b10f54 |
| 2755b62d12 |
| f327b92abe |
| 57b458570e |
| 57ed099465 |
| fb0687cd1b |
| ccea2ba861 |
| 957605498c |
| b1eeefc0eb |
| b879b7748f |
| fa645f3e4b |
| dedbd0a01d |
| caa83d9e5b |
| 4bcbbaa25a |
| d08266a123 |
| 03b055d7f0 |
| d609fecef3 |
| 7c198b1ceb |
| 8ae51192e5 |
| 8fec9bed06 |
| 6a323cc6f5 |
| 6f9a2f981c |
| 122edc8229 |
| 7ceea85647 |
| 45066e2b16 |
| f2cf11b047 |
| 3baca14eb1 |
| 66b5c85907 |
| 0019bdf5ed |
| 006734910c |
| e225c709fb |
| 63b2a8dbed |
| 9724619dbd |
| d42c4199fa |
| b9fc32ec34 |
| 9dc09bd04b |
| e88426646b |
| f95beba566 |
| eb8abcca7f |
| d148d95a19 |
| 5346372f14 |
| 5f8f988d91 |
| 3fabd3fb1c |
| 2d03bc138d |
| a749fcce87 |
| 505a060a7d |
| ca34ebcffc |
| 4303f8b6d0 |
| 2152ef127d |
| 50b3ca60f3 |
| 163f23d85b |
| e8f94d6b8b |
**.env.example** (new file, 38 additions)

```diff
@@ -0,0 +1,38 @@
+# Example .env configuration file for spark-vllm-docker
+# Copy this file to .env and customize for your environment
+
+# Cluster configuration
+# CLUSTER_NODES: Comma-separated list of node IPs (first node is the head node)
+CLUSTER_NODES="192.168.177.11,192.168.177.12"
+
+# ETH_IF: Ethernet interface name (optional, auto-detected if not specified)
+ETH_IF="enp1s0f1np1"
+
+# IB_IF: InfiniBand interface name (optional, auto-detected if not specified)
+IB_IF="rocep1s0f1,roceP2p1s0f1"
+
+# LOCAL_IP: Local IP address (optional, auto-detected if not specified)
+# Useful for solo mode or overriding auto-detection
+LOCAL_IP="192.168.177.11"
+
+# MASTER_PORT: Port for cluster coordination (default: 29501)
+MASTER_PORT="29501"
+
+# CONTAINER_NAME: Container name (default: vllm_node)
+# Note: This is a configuration variable, NOT passed as env var to container
+CONTAINER_NAME="vllm_node"
+
+# Container environment variables
+# Any variable starting with CONTAINER_ (except CONTAINER_NAME) will be converted to -e flags
+# Example: CONTAINER_NCCL_DEBUG=INFO becomes -e NCCL_DEBUG=INFO
+CONTAINER_NCCL_DEBUG="INFO"
+CONTAINER_HF_TOKEN="your_huggingface_token_here"
+CONTAINER_NCCL_IGNORE_CPU_AFFINITY="1"
+
+# COPY_HOSTS: Comma-separated list of hosts for build-and-copy.sh (optional)
+# Used by build-and-copy.sh to distribute images across cluster
+COPY_HOSTS="192.168.177.12"
+
+# Additional container environment variables
+# CONTAINER_MAX_JOBS="16"
+# CONTAINER_CUDA_VISIBLE_DEVICES="0,1"
```
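The `CONTAINER_` convention in the comments above is easy to replicate outside the launch scripts. Here is a minimal bash sketch of the described conversion; the loop is an illustration, not the scripts' actual code, and it assumes the image is tagged `vllm-node`:

```bash
#!/bin/bash
# Illustrative only: turn CONTAINER_* settings from .env into docker -e flags,
# skipping CONTAINER_NAME, which configures the script itself rather than the container.
set -a; source .env; set +a          # export everything defined in .env
env_flags=()
while IFS='=' read -r name value; do
  [[ "$name" == "CONTAINER_NAME" ]] && continue
  env_flags+=(-e "${name#CONTAINER_}=${value}")   # CONTAINER_NCCL_DEBUG -> NCCL_DEBUG
done < <(env | grep '^CONTAINER_')
docker run --rm --name "$CONTAINER_NAME" "${env_flags[@]}" vllm-node \
  env | grep NCCL_DEBUG               # should print NCCL_DEBUG=INFO
```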
**.gitignore** (vendored, 3 changes)

```diff
@@ -1 +1,2 @@
-.env
+.env
+build-metadata.yaml
```
**Dockerfile** (170 changes)

```diff
@@ -4,9 +4,9 @@
 ARG BUILD_JOBS=16
 
 # =========================================================
-# STAGE 1: Base Image (Installs Dependencies)
+# STAGE 1: Base Build Image
 # =========================================================
-FROM nvcr.io/nvidia/pytorch:26.01-py3 AS base
+FROM nvidia/cuda:13.2.0-devel-ubuntu24.04 AS base
 
 # Build parallelism
 ARG BUILD_JOBS
@@ -14,6 +14,8 @@ ENV MAX_JOBS=${BUILD_JOBS}
 ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
 ENV NINJAFLAGS="-j${BUILD_JOBS}"
 ENV MAKEFLAGS="-j${BUILD_JOBS}"
+ENV DG_JIT_USE_NVRTC=1
+ENV USE_CUDNN=1
 
 # Set non-interactive frontend to prevent apt prompts
 ENV DEBIAN_FRONTEND=noninteractive
@@ -27,6 +29,9 @@ ENV UV_CACHE_DIR=/root/.cache/uv
 ENV UV_SYSTEM_PYTHON=1
 ENV UV_BREAK_SYSTEM_PACKAGES=1
 ENV UV_LINK_MODE=copy
+# Set timeouts
+ENV UV_HTTP_TIMEOUT=600
+ENV UV_HTTP_RETRIES=10
 
 # Set the base directory environment variable
 ENV VLLM_BASE_DIR=/workspace/vllm
@@ -35,10 +40,18 @@ ENV VLLM_BASE_DIR=/workspace/vllm
 
 # Added ccache to enable incremental compilation caching
 RUN apt update && \
     apt install -y --no-install-recommends \
-    curl vim ninja-build git \
-    ccache \
+    curl vim cmake build-essential ninja-build \
+    libcudnn9-cuda-13 libcudnn9-dev-cuda-13 \
+    python3-dev python3-pip git wget \
+    libibverbs1 libibverbs-dev rdma-core \
+    ccache devscripts debhelper fakeroot \
     && rm -rf /var/lib/apt/lists/* \
-    && pip install uv && pip uninstall -y flash-attn
+    && pip install uv
+
+# Additional deps
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install torch==2.11.0 torchvision torchaudio triton --index-url https://download.pytorch.org/whl/cu130 && \
+    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2" filelock pynvml requests tqdm
 
 # Configure Ccache for CUDA/C++
 ENV PATH=/usr/lib/ccache:$PATH
@@ -51,14 +64,19 @@ ENV CCACHE_COMPRESS=1
 ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
 ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache
 
-# Setup Workspace
-WORKDIR $VLLM_BASE_DIR
-
 # 2. Set Environment Variables
 ARG TORCH_CUDA_ARCH_LIST="12.1a"
 ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
+ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
 
+# Setup Workspace
+WORKDIR $VLLM_BASE_DIR
+
+# Build NCCL with mesh support (TODO: only do it if arch is 12.1) - artifacts will be in /workspace/nccl/build/pkg/deb
+RUN git clone -b dgxspark-3node-ring https://github.com/zyang-dev/nccl.git && \
+    cd nccl && make -j ${BUILD_JOBS} src.build NVCC_GENCODE="-gencode=arch=compute_121,code=sm_121" && \
+    make pkg.debian.build && apt install -y --no-install-recommends --allow-downgrades ./build/pkg/deb/*.deb
 
 # =========================================================
 # STAGE 2: FlashInfer Builder
 # =========================================================
@@ -73,8 +91,9 @@ ARG FLASHINFER_REF=main
 # Change this argument to force a re-download of FlashInfer
 ARG CACHEBUST_FLASHINFER=1
 
 # Additional deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
+    uv pip install packaging
 
 # Smart Git Clone (Fetch changes instead of full re-clone)
 RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
@@ -100,6 +119,31 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \
 
 WORKDIR /workspace/flashinfer
 
+ARG FLASHINFER_PRS=""
+
+RUN if [ -n "$FLASHINFER_PRS" ]; then \
+    # Git requires a user identity to create merge commits
+    git config --global user.email "builder@example.com"; \
+    git config --global user.name "Docker Builder"; \
+    \
+    echo "Applying PRs: $FLASHINFER_PRS"; \
+    for pr in $FLASHINFER_PRS; do \
+        echo "Fetching and merging PR #$pr..."; \
+        git fetch origin pull/${pr}/head:pr-${pr}; \
+        git merge pr-${pr} --no-edit; \
+    done; \
+    fi
+
+# TEMPORARY patch for flashinfer autotune and other improvements (PR 2927) - MERGED 4/3
+# RUN curl -fsL https://github.com/flashinfer-ai/flashinfer/pull/2927.diff -o pr2927.diff \
+#     && if git apply --reverse --check pr2927.diff 2>/dev/null; then \
+#         echo "PR #2927 already applied, skipping."; \
+#     else \
+#         echo "Applying FI PR #2927..."; \
+#         git apply -v pr2927.diff; \
+#     fi \
+#     && rm pr2927.diff
+
 # Apply patch to avoid re-downloading existing cubins
 COPY flashinfer_cache.patch .
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
@@ -113,7 +157,9 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     cd flashinfer-cubin && uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
     # flashinfer-jit-cache
     cd ../flashinfer-jit-cache && \
-    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
+    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
+    # dump git ref in the wheels dir
+    cd .. && git rev-parse HEAD > /workspace/wheels/.flashinfer-commit
 
 # =========================================================
 # STAGE 3: FlashInfer Wheel Export
@@ -130,9 +176,6 @@ ARG TORCH_CUDA_ARCH_LIST="12.1a"
 ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 WORKDIR $VLLM_BASE_DIR
 
-RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
-
 # --- VLLM SOURCE CACHE BUSTER ---
 ARG CACHEBUST_VLLM=1
 
@@ -166,20 +209,56 @@ WORKDIR $VLLM_BASE_DIR/vllm
 ARG VLLM_PRS=""
 
 RUN if [ -n "$VLLM_PRS" ]; then \
+    # Git requires a user identity to create merge commits
     git config --global user.email "builder@example.com"; \
     git config --global user.name "Docker Builder"; \
     \
     echo "Applying PRs: $VLLM_PRS"; \
     for pr in $VLLM_PRS; do \
-        echo "Fetching and applying PR #$pr..."; \
-        curl -fL "https://github.com/vllm-project/vllm/pull/${pr}.diff" | git apply -v; \
+        echo "Fetching and merging PR #$pr..."; \
+        git fetch origin pull/${pr}/head:pr-${pr}; \
+        git merge pr-${pr} --no-edit; \
     done; \
     fi
 
+# TEMPORARY PATCH for broken FP8 kernels - https://github.com/vllm-project/vllm/pull/35568
+RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/35568.diff -o pr35568.diff \
+    && if git apply --reverse --check pr35568.diff 2>/dev/null; then \
+        echo "PR 35568 already applied, skipping."; \
+    else \
+        echo "Applying PR 35568..."; \
+        git apply -v --exclude="tests/*" pr35568.diff; \
+    fi \
+    && rm pr35568.diff
+
+# TEMPORARY PATCH: revert vLLM PR #41524 / commit c51df430,
+# which disables FlashInfer autotune and regresses DGX Spark throughput.
+RUN set -eux; \
+    patch_commit="c51df43005726a09c6eb7348e8c1b00501c70a8e"; \
+    target="vllm/config/vllm.py"; \
+    marker="https://github.com/flashinfer-ai/flashinfer/issues/3197"; \
+    if grep -q "$marker" "$target"; then \
+        echo "PR #41524 regression found; reverting ${patch_commit}"; \
+        if ! git revert --no-commit "$patch_commit"; then \
+            git revert --abort 2>/dev/null || true; \
+            echo "ERROR: PR #41524 appears present but could not be reverted"; \
+            exit 1; \
+        fi; \
+        if grep -q "$marker" "$target"; then \
+            echo "ERROR: revert completed but PR #41524 marker is still present"; \
+            exit 1; \
+        fi; \
+    else \
+        echo "PR #41524 regression marker not present; skipping revert"; \
+    fi
+
 # Prepare build requirements
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
     python3 use_existing_torch.py && \
     sed -i "/flashinfer/d" requirements/cuda.txt && \
-    sed -i '/^triton\b/d' requirements/test.txt && \
-    sed -i '/^fastsafetensors\b/d' requirements/test.txt && \
-    uv pip install -r requirements/build.txt
+    sed -i '/^triton\b/d' requirements/test/cuda.txt && \
+    sed -i '/^fastsafetensors\b/d' requirements/test/cuda.txt && \
+    uv pip install -r requirements/build/cuda.txt
 
 # Apply Patches
 # TEMPORARY PATCH for fastsafetensors loading in cluster setup - tracking https://github.com/vllm-project/vllm/issues/34180
@@ -190,13 +269,15 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
 #     patch -p1 < fastsafetensors.patch; \
 # fi
 # TEMPORARY PATCH for broken vLLM build (unguarded Hopper code) - reverting PR #34758 and #34302
-RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34758.diff | patch -p1 -R || echo "Cannot revert PR #34758, skipping"
-RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34302.diff | patch -p1 -R || echo "Cannot revert PR #34302, skipping"
+# RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34758.diff | patch -p1 -R || echo "Cannot revert PR #34758, skipping"
+# RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34302.diff | patch -p1 -R || echo "Cannot revert PR #34302, skipping"
 
 # Final Compilation
 RUN --mount=type=cache,id=ccache,target=/root/.ccache \
     --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v
+    uv build --no-build-isolation --wheel . --out-dir=/workspace/wheels -v && \
+    # dump git ref in the wheels dir
+    git rev-parse HEAD > /workspace/wheels/.vllm-commit
 
 # =========================================================
 # STAGE 5: vLLM Wheel Export
@@ -207,7 +288,7 @@ COPY --from=vllm-builder /workspace/wheels /
 # =========================================================
 # STAGE 6: Runner (Installs wheels from host ./wheels/)
 # =========================================================
-FROM nvcr.io/nvidia/pytorch:26.01-py3 AS runner
+FROM nvidia/cuda:13.2.0-devel-ubuntu24.04 AS runner
 
 # Transferring build settings from build image because of ptxas/jit compilation during vLLM startup
 # Build parallelism
@@ -216,6 +297,8 @@ ENV MAX_JOBS=${BUILD_JOBS}
 ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}
 ENV NINJAFLAGS="-j${BUILD_JOBS}"
 ENV MAKEFLAGS="-j${BUILD_JOBS}"
+ENV DG_JIT_USE_NVRTC=1
+ENV USE_CUDNN=1
 
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PIP_BREAK_SYSTEM_PACKAGES=1
@@ -228,13 +311,18 @@ ENV UV_SYSTEM_PYTHON=1
 ENV UV_BREAK_SYSTEM_PACKAGES=1
 ENV UV_LINK_MODE=copy
 
+# Mount additional packages from base builder image
 # Install runtime dependencies
-RUN apt update && \
+RUN --mount=type=bind,from=base,source=/workspace/vllm/nccl/build/pkg/deb,target=/workspace/nccl-pkg \
+    apt update && \
     apt install -y --no-install-recommends \
-    curl vim git \
+    python3 python3-pip python3-dev vim curl git wget \
+    libcudnn9-cuda-13 \
+    libibverbs1 libibverbs-dev rdma-core \
     libxcb1 \
+    && cd /workspace/nccl-pkg && apt install -y --no-install-recommends --allow-downgrades ./*.deb \
     && rm -rf /var/lib/apt/lists/* \
-    && pip install uv && pip uninstall -y flash-attn # triton-kernels pytorch-triton
+    && pip install uv
 
 # Set final working directory
 WORKDIR $VLLM_BASE_DIR
@@ -246,6 +334,11 @@ RUN mkdir -p tiktoken_encodings && \
 
 ARG PRE_TRANSFORMERS=0
 
+# Install deps
+RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
+    uv pip install torch==2.11.0 torchvision torchaudio triton --index-url https://download.pytorch.org/whl/cu130 && \
+    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
+
 # Install wheels from host ./wheels/ (bind-mounted from build context — no layer bloat)
 # With --tf5: override vLLM's transformers<5 constraint to get transformers>=5
 RUN --mount=type=bind,source=wheels,target=/workspace/wheels \
@@ -266,27 +359,14 @@ ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
 ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
 ENV PATH=$VLLM_BASE_DIR:$PATH
 
-# Copy scripts
-COPY run-cluster-node.sh $VLLM_BASE_DIR/
-RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh
-
 # Final extra deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install ray[default] fastsafetensors nvidia-nvshmem-cu13
+    uv pip install ray[default] fastsafetensors instanttensor
 
-# Cleanup
-
-# Keeping it here for reference - this won't work as is without squashing layers
-# RUN uv pip uninstall absl-py apex argon2-cffi \
-#     argon2-cffi-bindings arrow asttokens astunparse async-lru audioread babel beautifulsoup4 \
-#     black bleach comm contourpy cycler datasets debugpy decorator defusedxml dllist dm-tree \
-#     execnet executing expecttest fastjsonschema fonttools fqdn gast hypothesis \
-#     ipykernel ipython ipython_pygments_lexers isoduration isort jedi joblib jupyter-events \
-#     jupyter-lsp jupyter_client jupyter_core jupyter_server jupyter_server_terminals jupyterlab \
-#     jupyterlab_code_formatter jupyterlab_code_formatter jupyterlab_pygments jupyterlab_server \
-#     jupyterlab_tensorboard_pro jupytext kiwisolver matplotlib matplotlib-inline matplotlib-inline \
-#     mistune ml_dtypes mock nbclient nbconvert nbformat nest-asyncio notebook notebook_shim \
-#     opt_einsum optree outlines_core overrides pandas pandocfilters parso pexpect polygraphy pooch \
-#     pyarrow pycocotools pytest-flakefinder pytest-rerunfailures pytest-shard pytest-xdist \
-#     scikit-learn scipy Send2Trash soundfile soupsieve soxr spin stack-data \
-#     wcwidth webcolors xdoctest Werkzeug
+# Fix NCCL
+RUN rm /usr/local/lib/python3.12/dist-packages/nvidia/nccl/lib/libnccl.so.2 && \
+    ln -s /usr/lib/aarch64-linux-gnu/libnccl.so.2 /usr/local/lib/python3.12/dist-packages/nvidia/nccl/lib/libnccl.so.2
 
+# Build metadata (generated by build-and-copy.sh)
+COPY build-metadata.yaml /workspace/build-metadata.yaml
@@ -98,10 +98,10 @@ ARG FLASHINFER_REPO=https://github.com/christopherowen/flashinfer.git
 ARG CUTLASS_REPO=https://github.com/christopherowen/cutlass.git
 
 ARG FLASHINFER_SHA=f349e52496a72a00d8c4ac02c7a1e38523ff7194
-ARG CUTLASS_SHA=c7516ad20f3d022fdbc93e9468643bf3b577e02c
+ARG CUTLASS_SHA=fede53000a962b46e05bafe0c86311778caeb380
 
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install nvidia-nvshmem-cu13 "apache-tvm-ffi<0.2"
+    uv pip install "nvidia-nvshmem-cu13<3.6" "apache-tvm-ffi<0.2"
 
 # Clone FlashInfer (cached for faster rebuilds)
 RUN --mount=type=cache,id=git-flashinfer,target=/git-cache/flashinfer \
@@ -270,13 +270,12 @@ ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
 ENV TIKTOKEN_ENCODINGS_BASE=$VLLM_BASE_DIR/tiktoken_encodings
 ENV PATH=$VLLM_BASE_DIR:$PATH
 
-# Copy scripts
-COPY run-cluster-node.sh $VLLM_BASE_DIR/
-RUN chmod +x $VLLM_BASE_DIR/run-cluster-node.sh
-
 # Final extra deps
 RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \
-    uv pip install ray[default] fastsafetensors nvidia-nvshmem-cu13
+    uv pip install ray[default] fastsafetensors "nvidia-nvshmem-cu13<3.6"
 
 # Build metadata (generated by build-and-copy.sh)
 COPY build-metadata.yaml /workspace/build-metadata.yaml
 
 # If not compiling Triton
 # remove triton-kernels as they are not compatible with this vLLM version yet
```
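For reference, the build arguments surfaced in this Dockerfile can also be supplied by hand. A hedged sketch follows; normally `build-and-copy.sh` assembles this command, the flag values here are illustrative, and it assumes the build context already contains the `wheels/` directory and `build-metadata.yaml` that the script prepares:

```bash
# Illustrative direct build; prefer ./build-and-copy.sh in practice.
docker build \
  --build-arg TORCH_CUDA_ARCH_LIST="12.1a" \
  --build-arg BUILD_JOBS=16 \
  --build-arg VLLM_PRS="35568" \
  --build-arg CACHEBUST_VLLM="$(date +%s)" \
  -t vllm-node .
```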
**README.md** (471 changes)

````diff
@@ -2,6 +2,7 @@
 # vLLM Docker Optimized for DGX Spark (single or multi-node)
 
 This repository contains the Docker configuration and startup scripts to run a multi-node vLLM inference cluster using Ray. It supports InfiniBand/RDMA (NCCL) and custom environment configuration for high-performance setups.
+Cluster setup supports direct connection between dual Sparks, connection via a QSFP/RoCE switch, and a 3-node mesh configuration.
 
 While it was primarily developed to support multi-node inference, it works just as well on single-node setups.
@@ -26,7 +27,12 @@
 
 This repository is not affiliated with NVIDIA or their subsidiaries. This is a community effort aimed at helping DGX Spark users set up and run the most recent versions of vLLM on Spark clusters or single nodes.
 
-The Dockerfile builds from the main branch of vLLM, so depending on when you run the build process, it may not be in a fully functioning state. You can target a specific vLLM release by setting the `--vllm-ref` parameter.
+Unless `--rebuild-vllm`, `--vllm-ref`, or `--apply-vllm-pr` is specified, the builder will fetch the latest precompiled vLLM wheels from the repository. They are built nightly and tested on multiple models in both cluster and solo configurations before publishing.
+We will expand the selection of models we test in the pipeline, but since vLLM is a rapidly developing platform, some things may break.
+
+If you want to build the latest from the main branch, you can specify the `--rebuild-vllm` flag. Or you can target a specific vLLM release by setting the `--vllm-ref` parameter.
+
+Similarly, `--rebuild-flashinfer`, `--flashinfer-ref`, and `--apply-flashinfer-pr` control the FlashInfer build in the same way.
 
 ## QUICK START
@@ -49,8 +55,8 @@
 
 **On DGX Spark cluster:**
 
-Make sure you connect your Sparks together and enable passwordless SSH as described in NVIDIA's [Connect Two Sparks Playbook](https://build.nvidia.com/spark/connect-two-sparks/stacked-sparks).
-You can also check out our new [Networking Guide](docs/NETWORKING.md).
+Make sure you connect your Sparks together and enable passwordless SSH as described in our [Networking Guide](docs/NETWORKING.md). You can also check out NVIDIA's [Connect Two Sparks Playbook](https://build.nvidia.com/spark/connect-two-sparks/stacked-sparks), but using our guide is the best way to get started.
+**NEW**: the guide now includes instructions on setting up a 3-node Spark mesh!
 
 Then run the following command that will build and distribute the image across the cluster.
@@ -58,13 +64,13 @@
 ./build-and-copy.sh -c
 ```
 
-An initial build will take around 20-30 minutes, but subsequent builds will be faster. Precompiled vLLM wheels for DGX Spark will also be available soon.
+Initial build speed depends on your Internet connection and whether the base image is already present on your machine. After the base image pull, the build should take only 2-3 minutes. If `--rebuild-vllm` and/or `--rebuild-flashinfer` is used to trigger a build from source, it will take 20-40 minutes, but subsequent builds will be faster. Prebuilt FlashInfer and vLLM wheels are downloaded automatically from GitHub releases, so compilation from source is usually not required.
 
 ### Run
 
 **On a single node**:
 
-**NEW** - `launch-cluster.sh` now supports solo mode, which is now a recommended way to run the container on a single Spark:
+`launch-cluster.sh` supports solo mode, which is now the recommended way to run the container on a single Spark:
 
 ```bash
 ./launch-cluster.sh --solo exec \
@@ -75,23 +81,6 @@
   --load-format fastsafetensors
 ```
 
-**To launch using regular `docker run`**
-
-```bash
-docker run \
-  --privileged \
-  --gpus all \
-  -it --rm \
-  --network host --ipc=host \
-  -v ~/.cache/huggingface:/root/.cache/huggingface \
-  vllm-node \
-  bash -c -i "vllm serve \
-    QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ \
-    --port 8000 --host 0.0.0.0 \
-    --gpu-memory-utilization 0.7 \
-    --load-format fastsafetensors"
-```
-
 **On a cluster**
 
 It's recommended to download the model on one node and distribute it across the cluster over the ConnectX interconnect before launching, to avoid re-downloading the model from the Internet on every node in the cluster.
@@ -120,15 +109,13 @@ To launch the model:
 
 This will run the model on all available cluster nodes.
 
-**NOTE:** do not use `--load-format fastsafetensors` if you are loading models that would take >0.8 of available RAM (without KV cache) as it may result in an out-of-memory situation.
+**NOTE:** do not use `--load-format fastsafetensors` if you are loading models that would take >0.85 of available RAM (without KV cache) as it may result in an out-of-memory situation.
 
 **Also:** You can use any vLLM container that has "bash" as its default entrypoint with the launch script. It was tested with NGC vLLM, but can work with others too. To use such a container in the cluster, you need to specify the `--apply-mod use-ngc-vllm` argument to `./launch-cluster.sh`. However, it's recommended to build the container using this repository for best compatibility and the most up-to-date features.
 
-## CHANGELOG
-
 **IMPORTANT**
 
 You may want to prune your build cache every once in a while, especially if you've been using these container builds since the beginning.
 
 You can check the build cache size by running:
@@ -146,6 +133,325 @@ Don't do it every time you rebuild, because it will slow down compilation times.
 
 For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
 
+## CHANGELOG
+
+### 2026-04-14
+
+Added `--load-format instanttensor` support to vLLM - thanks @SeraphimSerapis.
+An experimental option for now, but it allows for faster loading than the current fastsafetensors default. You need to rebuild the container to start using the option, but you don't have to trigger the source build.
+
+### 2026-04-12
+
+#### Drop-caches mod for Qwen3.5-397B
+
+Updated the Qwen3.5-397B recipe (for the dual-node configuration) to use the new mod `mods/drop-caches`, which clears filesystem caches every minute while the container is running, resolving fastsafetensors getting stuck during loading and a few other bugs when operating close to the max memory limit.
````
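The mod's behaviour can be approximated with a small watchdog loop. A sketch, assuming the container is named `vllm_node` (the default `CONTAINER_NAME`); the mod's actual implementation may differ:

```bash
# Assumed approximation of mods/drop-caches: flush the page cache once a minute
# for as long as the vLLM container is running.
while docker ps --format '{{.Names}}' | grep -qx 'vllm_node'; do
  sudo sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches'
  sleep 60
done
```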
````diff
+### 2026-04-11
+
+#### Pinned PyTorch Version
+
+Pinned PyTorch to version 2.11.0 (previously using nightly builds) to fix incompatibility with transformers 5.x and avoid a torch version mismatch in builds.
+
+### 2026-04-02
+
+A new recipe for Gemma4-26B-A4B with "on-the-fly" FP8 quantization:
+
+Single Spark:
+
+```bash
+./run-recipe.sh gemma4-26b-a4b --solo
+```
+
+Dual Sparks:
+
+```bash
+./run-recipe.sh gemma4-26b-a4b --no-ray
+```
+
+### 2026-03-31
+
+#### Flags to specify FlashInfer ref and apply PRs
+
+`build-and-copy.sh` gains two new flags that mirror the existing vLLM equivalents:
+
+- `--flashinfer-ref <ref>` — build FlashInfer from a specific commit SHA, branch, or tag instead of `main`. Forces a local FlashInfer build (skips prebuilt wheel download).
+- `--apply-flashinfer-pr <pr-num>` — fetch and apply a FlashInfer GitHub PR patch before building. Can be specified multiple times. Forces a local FlashInfer build.
+
+Both flags are incompatible with `--exp-mxfp4`.
+
+#### Default image tag in `build-and-copy.sh`
+
+`build-and-copy.sh` now automatically sets a sensible default image tag when `-t` is not specified:
+
+- `--tf5` / `--pre-tf` - tag defaults to `vllm-node-tf5`
+- `--exp-mxfp4` - tag defaults to `vllm-node-mxfp4`
+- in all other cases - tag defaults to `vllm-node` (no change)
+
+An explicit `-t <tag>` always takes precedence.
````
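In pseudocode, this tag resolution reduces to a simple precedence chain. A sketch only; variable names are illustrative, not the script's internals:

```bash
# Illustrative tag resolution: an explicit -t wins, then feature-specific defaults.
if [[ -z "$tag" ]]; then            # no -t/--tag given
  if [[ "$tf5" == "1" ]]; then
    tag="vllm-node-tf5"
  elif [[ "$mxfp4" == "1" ]]; then
    tag="vllm-node-mxfp4"
  else
    tag="vllm-node"
  fi
fi
```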
````diff
+#### Support for 3-node mesh setups
+
+Added initial support for setups where 3 Sparks are connected in a ring-like mesh without an additional switch.
+See the [Networking Guide](docs/NETWORKING.md) for instructions on how to connect and set up networking in such a cluster.
+
+The autodiscover function in both `launch-cluster.sh` and `run-recipe.sh` can now detect mesh setups and configure parameters accordingly.
+
+You can try running a model on all 3 nodes in a pipeline-parallel configuration using the following recipe:
+
+```bash
+./run-recipe.sh --discover # force mesh discovery
+./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml --setup --no-ray --force-build # you can drop --setup and --force-build on subsequent calls
+```
+
+Please note that `--tensor-parallel-size 3` or `-tp 3` is not supported by any commonly used model, so the only two viable options to utilize all three nodes for a single model are:
+
+- `--pipeline-parallel 3` will let you run a model that can't fit on dual Sparks, but without additional speed improvements (total throughput may improve though).
+- `--data-parallel 3` (possibly with `--enable-expert-parallel`) will let you run a model that can fit on a single Spark, but allows for better concurrency.
+
+You can also run models with `--tensor-parallel 2` in a 3-node configuration - in this case only the first two nodes (from autodiscovery/.env or from the CLI parameters) will be utilized.
+
+#### GB10 Verification During Node Discovery
+
+Node discovery now confirms each SSH-reachable peer is a GB10 system before adding it to the cluster:
+Only hosts reporting `NVIDIA GB10` are included. This prevents accidentally adding non-Spark machines that happen to be on the same subnet.
````
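A single peer check of this kind can be reproduced by hand with the same `nvidia-smi` query; the user and host below are placeholders:

```bash
# Succeeds only if the remote host reports a GB10 GPU.
ssh user@192.168.177.12 \
  "nvidia-smi --query-gpu=name --format=csv,noheader" | grep -q "NVIDIA GB10"
```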
````diff
+#### Separate COPY_HOSTS Discovery
+
+Autodiscover now determines the host list used for image and model distribution separately from `CLUSTER_NODES`:
+
+- **Non-mesh**: `COPY_HOSTS` mirrors `CLUSTER_NODES` (no change in behaviour).
+- **Mesh**: scans the direct IB-attached `enp1s0f0np0` and `enp1s0f1np1` interfaces (not the OOB ETH interface), so large file transfers use the faster direct InfiniBand path.
+
+`COPY_HOSTS` is saved to `.env` and respected by `build-and-copy.sh`, `hf-download.sh`, and `run-recipe.py`.
+
+#### Interactive Configuration Save in `autodiscover.sh`
+
+`autodiscover.sh` now handles `.env` creation with a guided interactive flow, replacing the previous logic in `run-recipe.py`:
+
+- Runs automatically when `.env` is absent.
+- Asks per-node confirmation for both `CLUSTER_NODES` and `COPY_HOSTS`.
+- Skips if `.env` already exists (use `--setup` to force).
+
+`run-recipe.py` no longer contains its own `.env`-save prompt — it delegates entirely to `autodiscover.sh`.
+
+#### `--setup` Flag in `launch-cluster.sh` and `build-and-copy.sh`
+
+Both scripts now accept `--setup` to force a full autodiscovery run and overwrite the existing `.env`:
+
+```bash
+./launch-cluster.sh --setup exec vllm serve ...
+./build-and-copy.sh --setup -c
+```
+
+This is equivalent to the existing `--setup` in `run-recipe.sh`.
+
+#### `--config` Flag
+
+`hf-download.sh`, `build-and-copy.sh` and `launch-cluster.sh` now accept `--config <file>` to load a custom `.env` configuration file. `COPY_HOSTS` from the config is used for model distribution:
+
+```bash
+./hf-download.sh QuantTrio/MiniMax-M2-AWQ --config /path/to/cluster.env -c --copy-parallel
+```
+
+#### Parallelism-Aware Node Trimming
+
+`launch-cluster.sh` now parses `-tp` / `--tensor-parallel-size`, `-pp` / `--pipeline-parallel-size`, and `-dp` / `--data-parallel-size` from the exec command or launch script and adjusts the active node count accordingly — for both Ray and no-Ray modes.
+
+- If **fewer nodes are needed** than configured, only the required nodes get containers started (excess nodes are left idle).
+- If **more nodes are needed** than available, an error is raised before anything starts.
+
+```
+Note: Command requires 2 node(s) (tp=2 * pp=1 * dp=1); using 2 of 3 configured node(s).
+Error: Command requires 4 nodes (tp=4 * pp=1 * dp=1) but only 3 node(s) are configured.
+```
+
+No flags required — the check is automatic whenever parallelism arguments are present in the command.
````
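The required node count follows directly from the parallelism product shown in those messages. A minimal sketch of the same arithmetic, with illustrative variable names:

```bash
tp=2; pp=1; dp=1                      # parsed from the vllm serve command
configured=3                          # nodes listed in CLUSTER_NODES
required=$(( tp * pp * dp ))
if (( required > configured )); then
  echo "Error: Command requires $required nodes but only $configured node(s) are configured."
elif (( required < configured )); then
  echo "Note: using $required of $configured configured node(s)."
fi
```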
````diff
+### 2026-03-18
+
+#### `--master-port` / `--head-port` Parameter
+
+Added `--master-port` (synonym: `--head-port`) to both `launch-cluster.sh` and `run-recipe.sh` to configure the port used for cluster coordination:
+
+- In **Ray mode**: sets the Ray head node port (previously hardcoded to 6379)
+- In **No-Ray mode**: sets the PyTorch distributed `--master-port` passed to vLLM
+
+Default is `29501`.
+
+```bash
+./launch-cluster.sh --master-port 29501 --no-ray exec vllm serve ...
+./run-recipe.sh qwen3.5-122b-fp8 --no-ray --master-port 29501
+```
+
+#### `--network` Parameter in Build Arguments
+
+Added `--network <name>` to `build-and-copy.sh` to allow using host networking during builds.
+Thanks @apairmont for the PR.
+
+### 2026-03-17
+
+#### EXPERIMENTAL Intel/Qwen3.5-397B-A17B-int4-AutoRound Recipe
+
+You can run the full 397B Qwen3.5 model on just two Sparks with vision and full context; however, you need to make sure your Sparks don't run anything extra that can take a lot of RAM. That means you don't want to log into the graphical interface or use remote desktop. Connect to the head node via SSH.
+
+Alternatively, you can run in non-graphical mode (runlevel 3) by using `sudo systemctl isolate multi-user.target` to switch (and `sudo systemctl set-default graphical.target` to switch back to graphical mode); however, this is known to reduce performance a bit.
+
+You can run the model with the following command on the head node:
+
+```bash
+./run-recipe.sh qwen3.5-397b-int4-autoround.yaml --no-ray
+```
+
+Please note that `--no-ray` is necessary to fit the full context. It also improves inference speed by ~1 t/s.
+By default it will try to allocate 112 GB for vLLM on each node. You can change this via `--gpu-memory-utilization` (e.g. `--gpu-memory-utilization 113`), but please be aware that it uses GB instead of a percentage **for this recipe**.
+
+**KNOWN ISSUES**:
+
+1. The current firmware may cause a sudden shutdown event on one or both Sparks during heavy inference. If you have this issue, you will need to lower the GPU clock frequency on the affected unit(s), e.g. `sudo nvidia-smi -lgc 200,2150`. This command will reduce the max GPU frequency to 2150 MHz. You can play with higher values to see what works for you (the default is 2411 MHz, but it can boost to 3000 MHz). Please note that this setting only survives until the next reboot, but can be applied at any time.
+2. You will need to use the new `--no-ray` argument to fit the full context.
+3. If the model gets stuck loading weights, clearing the cache on both nodes can "unstuck" it. Use `sudo sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches'` to clear the cache.
+
+
+#### Major Cluster Orchestration Refactoring
+
+Significantly refactored the internal cluster startup logic in `launch-cluster.sh`:
+- Removed the standalone `run-cluster-node.sh` script; its logic is now fully integrated into `launch-cluster.sh`.
+- Ray head/worker startup, environment variable injection, and launch script distribution are now handled by `launch-cluster.sh` directly.
+- Worker containers are started with proper per-node environment variables (`VLLM_HOST_IP`, `NCCL_SOCKET_IFNAME`, etc.) injected via `docker run`/`docker exec` instead of relying on `.bashrc`.
+- You can now run other vLLM containers without applying the `use-ngc-vllm` mod (the current version is just an empty stub).
+
+#### No-Ray Multi-Node Mode
+
+Added a `--no-ray` flag to `launch-cluster.sh` to run multi-node vLLM clusters without Ray, using PyTorch's native distributed backend instead. It slightly improves inference performance for most models and reduces memory requirements.
+
+```bash
+./launch-cluster.sh --no-ray exec vllm serve ...
+```
+
+`--no-ray` is incompatible with `--solo` (which already runs without Ray).
+
+#### `run-recipe.sh` No-Ray Mode and Extended Flag Passthrough
+
+`run-recipe.sh` now supports the `--no-ray` flag for running multi-node inference without Ray (uses the PyTorch distributed backend instead):
+
+```bash
+./run-recipe.sh qwen3.5-122b-fp8 --no-ray
+```
+
+The following `launch-cluster.sh` flags are now also passed through from `run-recipe.sh`:
+`--master-port`, `--name`, `--eth-if`, `--ib-if`, `-j`, `--no-cache-dirs`, `--non-privileged`, `--mem-limit-gb`, `--mem-swap-limit-gb`, `--pids-limit`, `--shm-size-gb`.
+
+#### Nemotron-3-Nano-NVFP4 Switched to Marlin Backend
+
+The `nemotron-3-nano-nvfp4` recipe has been updated to use the Marlin backend for better performance and reliability (until FlashInfer fully supports NVFP4 on sm121).
+
+### 2026-03-12
+
+#### Experimental `--gpu-memory-utilization-gb` Mod
+
+Added a new mod `mods/gpu-mem-util-gb` that adds a `--gpu-memory-utilization-gb` flag to vLLM, allowing you to specify GPU memory reservation in GiB instead of as a fraction. This is particularly useful on DGX Spark's unified memory architecture, where available memory changes dynamically.
+
+```bash
+./launch-cluster.sh --apply-mod mods/gpu-mem-util-gb exec vllm serve ... \
+  --gpu-memory-utilization-gb 110
+```
+
+Cannot be used simultaneously with `--kv-cache-memory-bytes`.
````
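To relate the GiB flag to the stock fractional flag, the fraction is just the reservation divided by the memory visible to vLLM. For example, assuming roughly 119 GiB visible on a Spark (an illustrative figure, not a measured one):

```bash
# 110 GiB on an assumed 119 GiB visible ≈ --gpu-memory-utilization 0.92
awk 'BEGIN { printf "%.2f\n", 110 / 119 }'
```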
````diff
+#### Qwen3.5-397B INT4-AutoRound TP=4 Recipe (4× Spark Cluster)
+
+Added `recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml` for running Intel/Qwen3.5-397B-A17B-int4-AutoRound across 4 DGX Spark nodes with tensor parallelism (TP=4).
+
+Benchmarked at ~37 tok/s single-user, ~103 tok/s aggregate (4 concurrent users).
+
+Includes a new mod `mods/fix-qwen35-tp4-marlin` that resolves a Marlin kernel constraint (`MIN_THREAD_N=64`) that breaks certain projection layers at TP=4.
+
+**Note:** Requires NVIDIA driver 580.x. Driver 590.x has a CUDAGraph capture deadlock on GB10 unified memory.
+
+```bash
+./run-recipe.sh 4x-spark-cluster/qwen3.5-397b-int4-autoround
+```
+Thanks @sonusflow for the contribution.
+
+#### Nemotron-3-Super-120B NVFP4 Recipe
+
+Added a new recipe `nemotron-3-super-nvfp4` for running `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4` with Marlin kernels, in both single (solo) and dual Spark (cluster) configurations. Includes a custom reasoning parser (`super_v3_reasoning_parser.py`) fetched from the model repository.
+
+```bash
+./run-recipe.sh nemotron-3-super-nvfp4
+```
+
+### 2026-03-11
+
+#### Qwen3-Coder-Next INT4-AutoRound Recipe
+
+Added a new recipe `qwen3-coder-next-int4-autoround` for running Intel/Qwen3-Coder-Next-int4-AutoRound. Supports a single Spark only (use with the `--solo` switch), since the split weights are too small for the Marlin kernel.
+
+```bash
+./run-recipe.sh qwen3-coder-next-int4-autoround --solo
+```
+
+### 2026-03-06
+
+#### `-e/--env` Passthrough in `run-recipe.py`
+
+`run-recipe.sh` now accepts one or more `-e VAR=VALUE` flags to pass environment variables directly to the container, mirroring the existing behaviour of `launch-cluster.sh`.
+
+```bash
+./run-recipe.sh qwen3.5-122b-int4-autoround --solo -e HF_TOKEN=$HF_TOKEN
+```
+
+#### Unsloth Chat Template for Qwen3.5
+
+Added a new mod `mods/fix-qwen3.5-chat-template` that applies the Unsloth chat template to Qwen3.5 models for better compatibility with modern clients. The template is now included in the `qwen3.5-122b-fp8`, `qwen3.5-122b-int4-autoround`, and `qwen3.5-35b-a3b-fp8` recipes.
+
+#### Fix Shell Quoting for Exec Command Arguments
+
+Fixed shell quoting for exec command arguments in `launch-cluster.sh` and `run-recipe.py` to correctly handle arguments containing spaces or special characters.
+
+### 2026-03-05
+
+#### Qwen3.5-35B-A3B-FP8 Recipe
+
+Added a new recipe `qwen3.5-35b-a3b-fp8` for running Qwen3.5-35B-A3B in FP8 format.
+
+```bash
+./run-recipe.sh qwen3.5-35b-a3b-fp8
+```
+
+#### 4× Spark Cluster Recipes
+
+Added a `recipes/4x-spark-cluster/` subdirectory with recipes optimised for a 4-node Spark cluster:
+- `minimax-m2.5` — MiniMax M2.5 on 4× Spark
+- `qwen3.5-397b-a17B-fp8` — Qwen3.5-397B-A17B in FP8 on 4× Spark
+
+#### More Robust Wheels Check Before Download
+
+Improved the wheels availability check in `build-and-copy.sh` to be more reliable when deciding whether to download remote wheels.
+
+### 2026-03-04
+
+#### Prebuilt vLLM Wheels via GitHub Releases
+
+`build-and-copy.sh` now automatically downloads prebuilt vLLM wheels from the [GitHub releases](https://github.com/eugr/spark-vllm-docker/releases/tag/prebuilt-vllm-current) before falling back to a local build — identical to the existing FlashInfer download mechanism. This eliminates the need to compile vLLM from source on first use.
+
+The download logic mirrors the FlashInfer behaviour:
+- If prebuilt wheels are available and newer than any locally cached version, they are downloaded automatically.
+- If the download fails (e.g. no network, release not found, GPU arch not supported), the script falls back to building locally, or reuses existing local wheels if present.
+- `--rebuild-vllm`, `--vllm-ref`, or `--apply-vllm-pr` skip the download entirely and force a local build.
+
+No new flags are required — the download happens transparently.
+
+All prebuilt wheels are now tested with multiple models in both solo and cluster configurations as part of an automated deployment pipeline, which now runs nightly. The wheels are released only if they pass all the tests and no significant performance regressions are detected.
````
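You can inspect the same release the script pulls from using the standard GitHub API; this is a manual check, not something the scripts require:

```bash
# List downloadable wheel assets on the prebuilt-vllm-current release.
curl -fsSL \
  https://api.github.com/repos/eugr/spark-vllm-docker/releases/tags/prebuilt-vllm-current \
  | grep browser_download_url
```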
````diff
+#### Qwen3.5-122B-FP8 Recipe
+
+Added a new recipe `qwen3.5-122b-fp8` for running Qwen3.5-122B in FP8 format.
+
+```bash
+./run-recipe.sh qwen3.5-122b-fp8
+```
+
+### 2026-03-02
+
+#### Qwen3.5-122B-INT4-Autoround Support
@@ -178,7 +484,6 @@ Added a new mod for Intel/Qwen3-Coder-Next-INT4-Autoround model support: mods/f
 
 Changed the reasoning parser in MiniMax for better compatibility with modern clients (like coding tools).
 
-
 ### 2026-02-18
 
 #### Completely Redesigned Build Process
@@ -404,7 +709,8 @@ See [this post on NVIDIA forums](https://forums.developer.nvidia.com/t/make-glm-
 To use the mod, first build the container with the Transformers 5 support flag (`--pre-tf`), e.g.:
 
 ```bash
-./build-and-copy.sh -t vllm-node-tf5 --pre-tf -c
+# Image tag defaults to vllm-node-tf5 when --tf5/--pre-tf is used
+./build-and-copy.sh --pre-tf -c
 ```
 
 Then, to run on a single node:
@@ -454,7 +760,8 @@ It is currently the fastest way to run GPT-OSS on DGX Spark, achieving 60 t/s on
 To use this build, first build the container with the `--exp-mxfp4` flag. I recommend using a separate label as it is currently not recommended to use this build for models other than gpt-oss:
 
 ```bash
-./build-and-copy.sh -t vllm-node-mxfp4 --exp-mxfp4 -c
+# Image tag defaults to vllm-node-mxfp4 when --exp-mxfp4 is used
+./build-and-copy.sh --exp-mxfp4 -c
 ```
 
 Then, to run on a single Spark:
@@ -698,12 +1005,14 @@ Using a different username:
 
 | Flag | Description |
 | :--- | :--- |
-| `-t, --tag <tag>` | Image tag (default: `vllm-node`) |
+| `-t, --tag <tag>` | Image tag (default: `vllm-node`; auto-set to `vllm-node-tf5` with `--tf5`, `vllm-node-mxfp4` with `--exp-mxfp4`) |
 | `--gpu-arch <arch>` | Target GPU architecture (default: `12.1a`) |
 | `--rebuild-flashinfer` | Skip prebuilt wheel download; force a fresh local FlashInfer build |
 | `--rebuild-vllm` | Force rebuild vLLM from source |
 | `--vllm-ref <ref>` | vLLM commit SHA, branch or tag (default: `main`) |
+| `--flashinfer-ref <ref>` | FlashInfer commit SHA, branch or tag (default: `main`) |
 | `--apply-vllm-pr <pr-num>` | Apply a vLLM PR patch during build. Can be specified multiple times. |
+| `--apply-flashinfer-pr <pr-num>` | Apply a FlashInfer PR patch during build. Can be specified multiple times. |
 | `--tf5` | Install transformers v5 (5.0.0 or higher). Aliases: `--pre-tf, --pre-transformers`. |
 | `--exp-mxfp4` | Build with experimental native MXFP4 support. Alias: `--experimental-mxfp4`. |
 | `-c, --copy-to <hosts>` | Host(s) to copy the image to after building (space- or comma-separated). |
@@ -713,9 +1022,13 @@ Using a different username:
 | `-u, --user <user>` | Username for SSH connection (default: current user) |
 | `--full-log` | Enable full Docker build output (`--progress=plain`) |
 | `--no-build` | Skip building, only copy existing image (requires `--copy-to`) |
+| `--network <name>` | Docker network to use during build (e.g. `host`). |
+| `--cleanup` | Remove all cached `.whl` and `*-commit` files from the `wheels/` directory. |
+| `--config <file>` | Path to `.env` configuration file (default: `.env` in script directory) |
+| `--setup` | Force autodiscovery and save configuration to `.env` (even if `.env` already exists) |
 | `-h, --help` | Show help message |
 
-**IMPORTANT**: When copying to another node, make sure you use the Spark IP assigned to its ConnectX 7 interface (enp1s0f1np1), and not the 10G interface (enP7s7)! If you omit the IP address and use `-c` without addresses, it will use autodiscovery to detect a proper IP address.
+**IMPORTANT**: When copying to another node manually, use the IP assigned to a ConnectX 7 interface (`enp1s0f*`), not the 10G/wireless interfaces. When using `-c` without addresses, autodiscovery selects the correct interface automatically — in mesh mode it uses the direct IB-attached interfaces (`enp1s0f0np0`, `enp1s0f1np1`) for maximum transfer speed.
 
 ### Copying the container to another Spark node (Manual Method)
@@ -784,9 +1097,12 @@ Assumptions and limitations:
 ### Auto-Detection
 
 The script attempts to automatically detect:
-* **Ethernet Interface:** The interface associated with the active InfiniBand device that has an IP address.
-* **InfiniBand Interface:** The active InfiniBand devices. By default both active RoCE interfaces that correspond to active IB port(s) will be utilized.
-* **Node Role:** Based on the detected IP address and the list of nodes (defaults to `192.168.177.11` as head and `192.168.177.12` as worker).
+* **Ethernet Interface (`ETH_IF`):** Determined by the number of active CX7 interfaces:
+  - **2 active** (standard): the `enp*` interface (no capital P) that has an IP address.
+  - **4 active** (mesh topology): `enP7s7` (preferred) or `wlP9s9` (wireless, shown with a warning) — the cluster coordination interface is separate from the CX7 ports in this configuration.
+* **InfiniBand Interface (`IB_IF`):** All active RoCE devices. In mesh mode this is always `rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1`.
+* **Cluster peers:** Discovered by scanning the `ETH_IF` subnet for hosts with SSH access **and** a GB10 GPU (`nvidia-smi --query-gpu=name` must return `NVIDIA GB10`).
+* **Copy hosts (`COPY_HOSTS`):** In standard mode, same as cluster peers. In mesh mode, scanned separately on `enp1s0f0np0` and `enp1s0f1np1` subnets so that image/model transfers use the direct InfiniBand path.
 
 ### Manual Overrides
@@ -809,6 +1125,8 @@ You can override the auto-detected values if needed:
 | `--nccl-debug` | NCCL debug level (e.g., INFO, WARN). Defaults to INFO if flag is present but value is omitted. |
 | `--check-config` | Check configuration and auto-detection without launching. |
 | `--solo` | Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster |
+| `--no-ray` | No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend). |
+| `--master-port` / `--head-port` | Port for cluster coordination: Ray head port or PyTorch distributed master port (default: 29501). |
 | `--no-cache-dirs` | Do not mount default cache directories (~/.cache/vllm, ~/.cache/flashinfer, ~/.triton). |
 | `--launch-script` | Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted. |
 | `-d` | Run in daemon mode (detached). |
@@ -817,6 +1135,10 @@ You can override the auto-detected values if needed:
 | `--mem-swap-limit-gb` | Memory+swap limit in GB (default: mem-limit + 10, only with `--non-privileged`). |
 | `--pids-limit` | Process limit (default: 4096, only with `--non-privileged`). |
 | `--shm-size-gb` | Shared memory size in GB (default: 64, only with `--non-privileged`). |
+| `--config <file>` | Path to `.env` configuration file (default: `.env` in script directory). |
+| `--setup` | Force autodiscovery and save configuration to `.env` (even if `.env` already exists). |
 | `start \| stop \| status \| exec` | Action to perform (default: `start`). Not compatible with `--launch-script`. |
 | `command` | Command to execute inside the container (only for `exec` action). |
 
 ### Non-Privileged Mode
@@ -960,6 +1282,61 @@ You need to make sure you allocate IP addresses to them (no need to allocate IP
 
 ## 5\. Configuration Details
 
+### Cluster Configuration (`.env` file)
+
+The scripts share a `.env` file (default: `.env` in the repo directory) for persistent cluster configuration. It is created automatically by autodiscovery — run `--discover` (via `run-recipe.sh`) or `--setup` (via `launch-cluster.sh` / `build-and-copy.sh`) on first use.
+
+**Supported variables:**
+
+| Variable | Description |
+| :--- | :--- |
+| `CLUSTER_NODES` | Comma-separated node IPs used for the Ray/vLLM cluster (head node first). |
+| `COPY_HOSTS` | Comma-separated node IPs used for image and model distribution. In mesh mode these are the IPs on the direct IB-attached interfaces, which may differ from `CLUSTER_NODES`. |
+| `LOCAL_IP` | IP address of the local node. |
+| `ETH_IF` | Ethernet interface for cluster coordination (e.g. `enp1s0f1np1` or `enP7s7`). |
+| `IB_IF` | Comma-separated RoCE/IB device names (e.g. `rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1`). |
+| `CONTAINER_*` | Any variable prefixed with `CONTAINER_` (except `CONTAINER_NAME`) is passed as `-e VAR=VALUE` to the container. Example: `CONTAINER_NCCL_DEBUG=INFO` → `-e NCCL_DEBUG=INFO`. |
+
+**Mesh-mode NCCL variables** (written automatically when mesh topology is detected):
+
+```
+CONTAINER_NCCL_NET_PLUGIN=none
+CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1
+CONTAINER_NCCL_IB_MERGE_NICS=0
+```
````
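Because of the `CONTAINER_` convention described in the table above, those three entries reach the container as plain NCCL settings; equivalently (an illustration of the resulting flags, not a command the scripts print):

```bash
docker run ... \
  -e NCCL_NET_PLUGIN=none \
  -e NCCL_IB_SUBNET_AWARE_ROUTING=1 \
  -e NCCL_IB_MERGE_NICS=0 \
  ...
```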
**Example `.env` for a standard 2-node cluster:**
|
||||
|
||||
```
|
||||
CLUSTER_NODES=192.168.177.11,192.168.177.12
|
||||
COPY_HOSTS=192.168.177.12
|
||||
LOCAL_IP=192.168.177.11
|
||||
ETH_IF=enp1s0f1np1
|
||||
IB_IF=rocep1s0f1,roceP2p1s0f1
|
||||
```
|
||||
|
||||
To use a custom config file path, pass `--config /path/to/file.env` to any script.
|
||||
|
||||
### Autodiscovery Workflow
|
||||
|
||||
On first run, if no `.env` is present, the scripts will automatically trigger autodiscovery. You can also run it explicitly:
|
||||
|
||||
```bash
|
||||
# Via run-recipe.sh
|
||||
./run-recipe.sh --discover
|
||||
|
||||
# Via launch-cluster.sh or build-and-copy.sh (force re-run even if .env exists)
|
||||
./launch-cluster.sh --setup exec vllm serve ...
|
||||
./build-and-copy.sh --setup -c
|
||||
```
|
||||
|
||||
Autodiscovery:
|
||||
1. Detects active CX7 interfaces and determines mesh vs. standard topology.
|
||||
2. Scans the network for SSH-reachable GB10 peers.
|
||||
3. In mesh mode, separately discovers `COPY_HOSTS` on direct IB-attached interfaces.
|
||||
4. Prompts for per-node confirmation for both `CLUSTER_NODES` and `COPY_HOSTS`.
|
||||
5. Saves the result to `.env`.
|
||||
|
||||
### Environment Persistence

The script automatically appends exported variables to `~/.bashrc`. If you need to open a second terminal into the running container for debugging, simply run:

@@ -1133,6 +1510,32 @@ The `hf-download.sh` script provides a convenient way to download models from Hu

./hf-download.sh -c --copy-parallel QuantTrio/MiniMax-M2-AWQ
```
**Use nodes from `.env` (respects `COPY_HOSTS`):**

```bash
./hf-download.sh -c QuantTrio/MiniMax-M2-AWQ
```

When `-c` is given without explicit hosts, the script checks `COPY_HOSTS` in `.env` first, then falls back to autodiscovery. In mesh mode this means transfers go over the direct IB-attached interfaces automatically.
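A condensed sketch of that resolution order (the full logic lives in `hf-download.sh` and `autodiscover.sh`, reproduced later on this page):

```bash
# Sketch: how -c without hosts resolves COPY_HOSTS
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
    if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
        # 1. COPY_HOSTS from .env wins
        IFS=',' read -ra COPY_HOSTS <<< "$DOTENV_COPY_HOSTS"
    else
        # 2. Otherwise fall back to autodiscovery
        detect_copy_hosts && COPY_HOSTS=("${COPY_PEER_NODES[@]}")
    fi
fi
```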
**Use a custom config file:**

```bash
./hf-download.sh --config /path/to/cluster.env -c QuantTrio/MiniMax-M2-AWQ
```
**Available options:**

| Flag | Description |
| :--- | :--- |
| `<model-name>` | HuggingFace model ID (e.g. `QuantTrio/MiniMax-M2-AWQ`). Required. |
| `-c, --copy-to <hosts>` | Host(s) to copy the model to after download (space- or comma-separated). Omit hosts to use `COPY_HOSTS` from `.env` or autodiscovery. |
| `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). |
| `--copy-parallel` | Copy to all hosts concurrently instead of serially. |
| `-u, --user <user>` | SSH username for remote copies (default: current user). |
| `--config <file>` | Path to `.env` configuration file (default: `.env` in script directory). |
| `-h, --help` | Show help message. |
### Hardware Architecture

**Note:** This project targets the `12.1a` GPU architecture (NVIDIA GB10 / DGX Spark). If you are using different hardware, pass the `--gpu-arch` flag to `./build-and-copy.sh`.
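For example, to build for a different GPU, pass the architecture explicitly (the value below is illustrative; use your GPU's compute capability):

```bash
./build-and-copy.sh --gpu-arch 10.0a
```

Note that prebuilt wheels are only published for the architectures listed in `PREBUILT_WHEELS_SUPPORTED_ARCHS` (currently `12.1a`), so other architectures will build from source.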
autodiscover.sh

@@ -1,5 +1,57 @@
#!/bin/bash

SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"

# Load .env file if exists (for shared configuration)
# This is called early so that DOTENV_* variables are available to all functions
load_env_if_exists() {
    local env_file="${CONFIG_FILE:-}"
    local config_explicit="${CONFIG_FILE_SET:-false}"

    # If CONFIG_FILE is not set, check default location
    if [[ -z "$env_file" ]]; then
        env_file="$SCRIPT_DIR/.env"
        config_explicit="false"
    fi

    # Validate config file exists if explicitly specified
    # Exception: if --setup is also specified, the file will be created by the setup procedure
    if [[ "$config_explicit" == "true" ]] && [[ ! -f "$env_file" ]] && [[ "${FORCE_DISCOVER:-false}" != "true" ]]; then
        echo "Error: Config file not found: $env_file"
        exit 1
    fi

    if [[ -f "$env_file" ]]; then
        # Load .env variables with DOTENV_ prefix
        while IFS='=' read -r key value || [[ -n "$key" ]]; do
            # Skip comments and empty lines
            [[ "$key" =~ ^[[:space:]]*# ]] && continue
            [[ -z "$key" ]] && continue

            # Remove leading/trailing whitespace from key
            key=$(echo "$key" | xargs)

            # Skip if key is empty after trimming
            [[ -z "$key" ]] && continue

            # Remove quotes from value
            value="${value%\"}"
            value="${value#\"}"
            value="${value%\'}"
            value="${value#\'}"

            # Export with DOTENV_ prefix
            export "DOTENV_$key=$value"
        done < "$env_file"
    fi
}

# Load .env file
load_env_if_exists

# Mesh mode flag (set by detect_interfaces)
MESH_MODE="false"

# Function to detect IB and Ethernet interfaces
detect_interfaces() {
    # If both interfaces are already set, nothing to do

@@ -14,60 +66,132 @@ detect_interfaces() {
    fi

    echo "Auto-detecting interfaces..."

    # Get all Up interfaces: "rocep1s0f1 port 1 ==> enp1s0f1np1 (Up)"
    # We capture: IB_DEV, NET_DEV
    mapfile -t IB_NET_PAIRS < <(ibdev2netdev | awk '/Up\)/ {print $1 " " $5}')

    if [ ${#IB_NET_PAIRS[@]} -eq 0 ]; then
        echo "Error: No active IB interfaces found."
        return 1
    fi

    DETECTED_IB_IFS=()
    ALL_NET_IFS=()

    for pair in "${IB_NET_PAIRS[@]}"; do
        ib_dev=$(echo "$pair" | awk '{print $1}')
        net_dev=$(echo "$pair" | awk '{print $2}')

        DETECTED_IB_IFS+=("$ib_dev")
        ALL_NET_IFS+=("$net_dev")
    done

    local num_up="${#IB_NET_PAIRS[@]}"

    # --- Sanity checks ---

    # 1. enp* (no capital P) interfaces MUST have an IP
    for net_dev in "${ALL_NET_IFS[@]}"; do
        if [[ "$net_dev" =~ ^enp[^P] ]] || [[ "$net_dev" == enp* && "$net_dev" != *P* ]]; then
            if ! ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
                echo "Error: Interface $net_dev (enp*, no capital P) is Up but has no IP address assigned."
                return 1
            fi
        fi
    done

    # 2. No two interfaces with IPs should share the same subnet
    declare -A SEEN_SUBNETS
    for net_dev in "${ALL_NET_IFS[@]}"; do
        local cidr
        cidr=$(ip -o -f inet addr show "$net_dev" 2>/dev/null | awk '{print $4}' | head -n1)
        [[ -z "$cidr" ]] && continue
        # Compute network address using python3
        local net_addr
        net_addr=$(python3 -c "import ipaddress; print(str(ipaddress.ip_network('$cidr', strict=False)))" 2>/dev/null)
        if [[ -n "${SEEN_SUBNETS[$net_addr]}" ]]; then
            echo "Error: Interfaces $net_dev and ${SEEN_SUBNETS[$net_addr]} share the same subnet ($net_addr). Check network configuration."
            return 1
        fi
        SEEN_SUBNETS["$net_addr"]="$net_dev"
    done

    # --- Mode selection ---

    if [[ "$num_up" -eq 2 ]]; then
        # Non-mesh configuration
        MESH_MODE="false"
        echo " Non-mesh mode: 2 CX7 interfaces active."

        # Set IB_IF if not provided
        if [[ -z "$IB_IF" ]]; then
            IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}")
            echo " Detected IB_IF: $IB_IF"
        fi

        # Set ETH_IF if not provided: prefer interface without capital 'P'
        if [[ -z "$ETH_IF" ]]; then
            local selected_eth=""
            for net_dev in "${ALL_NET_IFS[@]}"; do
                if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
                    if [[ "$net_dev" != *P* ]]; then
                        selected_eth="$net_dev"
                        break
                    fi
                fi
            done
            # Fallback: first interface with an IP
            if [[ -z "$selected_eth" ]]; then
                for net_dev in "${ALL_NET_IFS[@]}"; do
                    if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
                        selected_eth="$net_dev"
                        break
                    fi
                done
            fi
            if [[ -z "$selected_eth" ]]; then
                echo "Error: No active IB-associated interfaces have IP addresses."
                return 1
            fi
            ETH_IF="$selected_eth"
            echo " Detected ETH_IF: $ETH_IF"
        fi

    elif [[ "$num_up" -eq 4 ]]; then
        # Mesh configuration
        MESH_MODE="true"
        echo " Mesh mode: all 4 CX7 interfaces active."

        # Set IB_IF to all four RoCE interfaces (hardcoded for mesh)
        if [[ -z "$IB_IF" ]]; then
            IB_IF="rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1"
            echo " Detected IB_IF: $IB_IF"
        fi

        # Set ETH_IF: check enP7s7 first, then wlP9s9
        if [[ -z "$ETH_IF" ]]; then
            if ip addr show enP7s7 2>/dev/null | grep -q "inet "; then
                ETH_IF="enP7s7"
                echo " Detected ETH_IF: $ETH_IF"
            elif ip addr show wlP9s9 2>/dev/null | grep -q "inet "; then
                ETH_IF="wlP9s9"
                echo " Detected ETH_IF: $ETH_IF"
                echo " Warning: using wireless interface (wlP9s9) for cluster coordination. Performance may be limited."
            else
                echo "Error: Mesh mode requires enP7s7 or wlP9s9 to be up with an IP address for cluster coordination."
                return 1
            fi
        fi

        # Export mesh NCCL settings directly so launch-cluster.sh picks them up
        # even if the user declines to save config to .env
        export DOTENV_CONTAINER_NCCL_NET_PLUGIN=none
        export DOTENV_CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1
        export DOTENV_CONTAINER_NCCL_IB_MERGE_NICS=0

    else
        echo "Error: Unexpected number of active CX7 interfaces ($num_up). Expected 2 (non-mesh) or 4 (mesh)."
        return 1
    fi
}
@@ -84,16 +208,51 @@ detect_local_ip() {

    # Get CIDR of the selected ETH_IF
    CIDR=$(ip -o -f inet addr show "$ETH_IF" | awk '{print $4}' | head -n 1)

    if [[ -z "$CIDR" ]]; then
        echo "Error: Could not determine IP/CIDR for interface $ETH_IF"
        return 1
    fi

    LOCAL_IP=${CIDR%/*}
    echo " Detected Local IP: $LOCAL_IP ($CIDR)"
}

# Scan a subnet for GB10-capable peers via SSH
# Usage: _scan_subnet_for_gb10 <cidr> <local_ip_to_exclude> <output_file>
_scan_subnet_for_gb10() {
    local cidr="$1"
    local exclude_ip="$2"
    local out_file="$3"

    if ! command -v python3 &> /dev/null; then
        echo "Error: python3 not found."
        return 1
    fi
    if ! command -v nc &> /dev/null; then
        echo "Error: nc (netcat) not found."
        return 1
    fi

    local all_ips
    all_ips=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$cidr")

    for ip in $all_ips; do
        [[ "$ip" == "$exclude_ip" ]] && continue
        (
            if nc -z -w 1 "$ip" 22 &>/dev/null; then
                # Check if remote is a GB10 system
                if ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o BatchMode=yes "$ip" \
                    "nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null" \
                    2>/dev/null | grep -q "NVIDIA GB10"; then
                    echo "$ip" >> "$out_file"
                fi
            fi
        ) &
    done
    wait
}

# Function to detect cluster nodes
detect_nodes() {
    detect_local_ip || return 1

@@ -111,58 +270,182 @@ detect_nodes() {
        return 0
    fi

    # Try to use CLUSTER_NODES from .env
    if [[ -n "$DOTENV_CLUSTER_NODES" ]]; then
        echo " Using CLUSTER_NODES from .env: $DOTENV_CLUSTER_NODES"
        PEER_NODES=()
        IFS=',' read -ra ALL_NODES <<< "$DOTENV_CLUSTER_NODES"
        for node in "${ALL_NODES[@]}"; do
            node=$(echo "$node" | xargs)
            [[ "$node" != "$LOCAL_IP" ]] && PEER_NODES+=("$node")
        done
        NODES_ARG="$DOTENV_CLUSTER_NODES"
        return 0
    fi

    echo "Auto-detecting nodes on $CIDR (checking for NVIDIA GB10)..."

    local temp_file
    temp_file=$(mktemp)

    _scan_subnet_for_gb10 "$CIDR" "$LOCAL_IP" "$temp_file"

    PEER_NODES=()
    local detected_ips=("$LOCAL_IP")
    if [[ -f "$temp_file" ]]; then
        while read -r ip; do
            PEER_NODES+=("$ip")
            detected_ips+=("$ip")
            echo " Found GB10 peer: $ip"
        done < <(sort "$temp_file")
        rm -f "$temp_file"
    fi

    # Sort and set NODES_ARG
    IFS=$'\n' SORTED_IPS=($(sort <<<"${detected_ips[*]}"))
    unset IFS

    NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}")
    echo " Cluster Nodes: $NODES_ARG"
}
# Function to detect COPY_HOSTS for build/model distribution
# In non-mesh mode: COPY_PEER_NODES = PEER_NODES (same network)
# In mesh mode: scan enp* interfaces (direct IB-attached) for GB10 peers
detect_copy_hosts() {
    if [[ "$MESH_MODE" == "false" ]]; then
        COPY_PEER_NODES=("${PEER_NODES[@]}")
        return 0
    fi

    # Mesh mode: scan enp1s0f0np0 and enp1s0f1np1 subnets
    echo "Auto-detecting COPY_HOSTS on direct IB interfaces (mesh mode)..."

    local temp_file
    temp_file=$(mktemp)

    for iface in enp1s0f0np0 enp1s0f1np1; do
        local cidr
        cidr=$(ip -o -f inet addr show "$iface" 2>/dev/null | awk '{print $4}' | head -n1)
        [[ -z "$cidr" ]] && continue
        local local_iface_ip="${cidr%/*}"
        echo " Scanning $iface ($cidr)..."
        _scan_subnet_for_gb10 "$cidr" "$local_iface_ip" "$temp_file"
    done

    # Deduplicate and collect results.
    # On two-cable setups two IB IPs may belong to the same host; deduplicate by
    # querying each host's ETH_IF IP as a canonical identity.
    COPY_PEER_NODES=()
    declare -A _SEEN_COPY   # keyed by IB IP
    declare -A _SEEN_HOST   # keyed by ETH_IF IP → first IB IP seen for that host
    if [[ -f "$temp_file" ]]; then
        while read -r ip; do
            [[ -n "${_SEEN_COPY[$ip]}" ]] && continue
            _SEEN_COPY["$ip"]=1
            # Resolve canonical host identity via ETH_IF IP
            local host_ip
            host_ip=$(ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o BatchMode=yes "$ip" \
                "ip -o -f inet addr show $ETH_IF 2>/dev/null | awk '{print \$4}' | head -n1 | cut -d/ -f1" \
                </dev/null 2>/dev/null)
            if [[ -n "$host_ip" && -n "${_SEEN_HOST[$host_ip]}" ]]; then
                echo " Skipping $ip (same host as ${_SEEN_HOST[$host_ip]}, ETH_IF: $host_ip)"
                continue
            fi
            [[ -n "$host_ip" ]] && _SEEN_HOST["$host_ip"]="$ip"
            COPY_PEER_NODES+=("$ip")
            echo " Found GB10 copy host: $ip"
        done < <(sort "$temp_file")
        rm -f "$temp_file"
    fi
}

# Save discovered configuration to .env
# Skips if .env already exists unless FORCE_DISCOVER=true
save_config() {
    local env_file="${CONFIG_FILE:-$SCRIPT_DIR/.env}"

    # Skip if .env exists and not forced
    if [[ -f "$env_file" && "${FORCE_DISCOVER:-false}" != "true" ]]; then
        return 0
    fi

    echo ""
    local save_prompt="Save discovered configuration to $env_file?"
    if [[ -f "$env_file" ]]; then
        save_prompt="Overwrite existing configuration in $env_file?"
    fi
    read -r -p "$save_prompt [Y/n]: " response
    response="${response,,}"
    if [[ "$response" =~ ^(n|no)$ ]]; then
        return 0
    fi

    # Build list of all cluster nodes (local + peers)
    local all_cluster_nodes=()
    if [[ -n "$LOCAL_IP" ]]; then
        all_cluster_nodes+=("$LOCAL_IP")
    fi
    for node in "${PEER_NODES[@]}"; do
        all_cluster_nodes+=("$node")
    done

    # Per-node confirmation for CLUSTER_NODES
    echo ""
    echo "Select nodes for CLUSTER_NODES:"
    local selected_cluster=()
    for node in "${all_cluster_nodes[@]}"; do
        local label="$node"
        [[ "$node" == "$LOCAL_IP" ]] && label="$node (this machine)"
        read -r -p " Include $label? [Y/n]: " r
        r="${r,,}"
        if [[ ! "$r" =~ ^(n|no)$ ]]; then
            selected_cluster+=("$node")
        fi
    done

    if [[ "${#selected_cluster[@]}" -eq 0 ]]; then
        echo "No nodes selected. Aborting save."
        return 1
    fi

    # Per-node confirmation for COPY_HOSTS
    echo ""
    echo "Select nodes for COPY_HOSTS (build/model distribution):"
    local selected_copy=()
    for node in "${COPY_PEER_NODES[@]}"; do
        read -r -p " Include $node in COPY_HOSTS? [Y/n]: " r
        r="${r,,}"
        if [[ ! "$r" =~ ^(n|no)$ ]]; then
            selected_copy+=("$node")
        fi
    done

    # Write .env
    {
        echo "# Auto-generated by autodiscover.sh"
        echo "CLUSTER_NODES=$(IFS=,; echo "${selected_cluster[*]}")"
        if [[ "${#selected_copy[@]}" -gt 0 ]]; then
            echo "COPY_HOSTS=$(IFS=,; echo "${selected_copy[*]}")"
        fi
        echo "LOCAL_IP=$LOCAL_IP"
        echo "ETH_IF=$ETH_IF"
        echo "IB_IF=$IB_IF"
        if [[ "$MESH_MODE" == "true" ]]; then
            echo "# Mesh mode NCCL settings"
            echo "CONTAINER_NCCL_NET_PLUGIN=none"
            echo "CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1"
            echo "CONTAINER_NCCL_IB_MERGE_NICS=0"
        fi
    } > "$env_file"
    echo ""
    echo "Saved to $env_file"
}

# Convenience function: run full autodiscovery pipeline
run_autodiscover() {
    detect_interfaces || return 1
    detect_local_ip || return 1
    detect_nodes || return 1
    detect_copy_hosts || return 1
    save_config
}
build-and-copy.sh

@@ -6,35 +6,76 @@ START_TIME=$(date +%s)

# Default values
IMAGE_TAG="vllm-node"
IMAGE_TAG_SET=false
REBUILD_FLASHINFER=false
REBUILD_VLLM=false
COPY_HOSTS=()
COPY_TO_FLAG=false
SSH_USER="$USER"
NO_BUILD=false
VLLM_REF="main"
VLLM_REF_SET=false
FLASHINFER_REF="main"
FLASHINFER_REF_SET=false
TMP_IMAGE=""
PARALLEL_COPY=false
EXP_MXFP4=false
VLLM_PRS=""
FLASHINFER_PRS=""
PRE_TRANSFORMERS=false
FULL_LOG=false
BUILD_JOBS="16"
GPU_ARCH_LIST="12.1a"
NETWORK_ARG=""
WHEELS_REPO="eugr/spark-vllm-docker"
FLASHINFER_RELEASE_TAG="prebuilt-flashinfer-current"
VLLM_RELEASE_TAG="prebuilt-vllm-current"
# Space-separated list of GPU architectures for which prebuilt wheels are available
PREBUILT_WHEELS_SUPPORTED_ARCHS="12.1a"
CLEANUP_MODE="false"
CONFIG_FILE=""

cleanup() {
    if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
        echo "Cleaning up temporary image $TMP_IMAGE"
        rm -f "$TMP_IMAGE"
    fi
    rm -f ./build-metadata.yaml
}

trap cleanup EXIT

generate_build_metadata() {
    local dockerfile="$1"
    local vllm_version="$2"
    local vllm_commit="$3"
    local flashinfer_commit="$4"
    local vllm_ref="$5"
    local pre_transformers="$6"
    local exp_mxfp4="$7"
    local vllm_prs="$8"

    local base_image
    base_image=$(grep -m1 '^FROM .* AS runner' "$dockerfile" | awk '{print $2}')

    cat > ./build-metadata.yaml <<EOF
build_date: $(date -u +"%Y-%m-%dT%H:%M:%SZ")
build_script_commit: $(git rev-parse HEAD 2>/dev/null || echo "unknown")
vllm_version: ${vllm_version:-unknown}
vllm_commit: ${vllm_commit:-unknown}
flashinfer_commit: ${flashinfer_commit:-unknown}
gpu_arch: ${GPU_ARCH_LIST}
base_image: ${base_image:-unknown}
build_args:
  vllm_ref: ${vllm_ref}
  transformers_5: ${pre_transformers}
  exp_mxfp4: ${exp_mxfp4}
  vllm_prs: "${vllm_prs}"
build_jobs: ${BUILD_JOBS}
EOF
    echo "Generated build-metadata.yaml"
}

add_copy_hosts() {
    local token part
    for token in "$@"; do

@@ -65,7 +106,12 @@ copy_to_host() {
# try_download_wheels TAG PREFIX
# Downloads wheels matching PREFIX*.whl from a GitHub release.
# Skip conditions (either is sufficient):
#   1. Commit hash in release name matches wheels/.{PREFIX}-commit (primary check).
#   2. All local wheels are newer than the latest GitHub asset (freshly built).
# Only downloads a file when the remote asset is newer than the local copy AND
# the above skip conditions are not met.
# On success, persists the release commit hash to wheels/.{PREFIX}-commit.
# Returns 0 if all matching wheels are now available, 1 on any error.
try_download_wheels() {
    local TAG="$1"
@@ -91,7 +137,7 @@ try_download_wheels() {

    local DOWNLOAD_LIST
    DOWNLOAD_LIST=$(echo "$RELEASE_JSON" | python3 -c '
import json, sys, os, re
from datetime import datetime, timezone

wheels_dir, prefix = sys.argv[1], sys.argv[2]

@@ -103,6 +149,31 @@ if not assets:

    print("No assets found matching prefix: " + prefix, file=sys.stderr)
    sys.exit(1)

# Extract commit hash from the release name:
#   FlashInfer: "Prebuilt FlashInfer Wheels (0.6.5-124a2d32-d20260305) - DGX Spark Only"
#   vLLM: "Prebuilt vLLM Wheels (0.16.1rc1.dev296+ga73af584f.d20260305.cu131) - DGX Spark only"
release_name = data.get("name", "")
commit_hash = None
if prefix.startswith("flashinfer"):
    m = re.search(r"\([\d.]+\w*-([0-9a-f]{6,})-d\d{8}\)", release_name, re.IGNORECASE)
    if m:
        commit_hash = m.group(1)
else:
    m = re.search(r"\+g([0-9a-f]{6,})\.", release_name, re.IGNORECASE)
    if m:
        commit_hash = m.group(1)

# Compare against the locally stored commit hash
commit_file = os.path.join(wheels_dir, "." + prefix + "-commit")
local_commit = None
if os.path.exists(commit_file):
    with open(commit_file) as f:
        local_commit = f.read().strip()

if commit_hash and local_commit and local_commit[:len(commit_hash)] == commit_hash:
    print("Commit hash matches (" + commit_hash + ") — wheels are up to date.", file=sys.stderr)
    sys.exit(0)

newest_remote_ts = max(
    datetime.strptime(a["updated_at"], "%Y-%m-%dT%H:%M:%SZ")
    .replace(tzinfo=timezone.utc).timestamp()

@@ -118,12 +189,19 @@ local_wheels = [

if local_wheels and all(os.path.getmtime(p) >= newest_remote_ts for p in local_wheels):
    sys.exit(0)

downloads = []
for a in assets:
    local_path = os.path.join(wheels_dir, a["name"])
    remote_ts = datetime.strptime(a["updated_at"], "%Y-%m-%dT%H:%M:%SZ") \
        .replace(tzinfo=timezone.utc).timestamp()
    if not os.path.exists(local_path) or remote_ts > os.path.getmtime(local_path):
        downloads.append(a["browser_download_url"] + " " + a["name"])

if downloads:
    if commit_hash:
        print("#commit:" + commit_hash)
    for d in downloads:
        print(d)
' "$WHEELS_DIR" "$PREFIX") || return 1
if [ -z "$DOWNLOAD_LIST" ]; then
|
||||
@@ -131,16 +209,36 @@ for a in assets:
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Parse the optional '#commit:HASH' sentinel emitted by the Python script
|
||||
local REMOTE_COMMIT=""
|
||||
local DOWNLOAD_ENTRIES=""
|
||||
while IFS= read -r LINE; do
|
||||
if [[ "$LINE" == "#commit:"* ]]; then
|
||||
REMOTE_COMMIT="${LINE#"#commit:"}"
|
||||
elif [[ -n "$LINE" ]]; then
|
||||
DOWNLOAD_ENTRIES+="$LINE"$'\n'
|
||||
fi
|
||||
done <<< "$DOWNLOAD_LIST"
|
||||
|
||||
if [ -z "$DOWNLOAD_ENTRIES" ]; then
|
||||
echo "All $PREFIX wheels are up to date — skipping download."
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Back up existing wheels so we never leave a mix of old and new on failure
|
||||
local DL_BACKUP="$WHEELS_DIR/.backup-download-${PREFIX}"
|
||||
rm -rf "$DL_BACKUP" && mkdir -p "$DL_BACKUP"
|
||||
for f in "$WHEELS_DIR/${PREFIX}"*.whl; do
|
||||
[ -f "$f" ] && mv "$f" "$DL_BACKUP/"
|
||||
done
|
||||
for f in "$WHEELS_DIR/.${PREFIX}"*; do
|
||||
[ -f "$f" ] && mv "$f" "$DL_BACKUP/"
|
||||
done
|
||||
|
||||
local URL NAME TMP_WHL
|
||||
local DOWNLOADED=()
|
||||
while IFS=' ' read -r URL NAME; do
|
||||
[ -z "$URL" ] && continue
|
||||
echo "Downloading $NAME..."
|
||||
TMP_WHL=$(mktemp "$WHEELS_DIR/${NAME}.XXXXXX")
|
||||
if curl -L --progress-bar --connect-timeout 30 "$URL" -o "$TMP_WHL"; then
|
||||
@@ -153,24 +251,30 @@ for a in assets:
|
||||
if compgen -G "$DL_BACKUP/${PREFIX}*.whl" > /dev/null 2>&1; then
|
||||
echo "Restoring previous $PREFIX wheels..."
|
||||
mv "$DL_BACKUP/${PREFIX}"*.whl "$WHEELS_DIR/"
|
||||
mv "$DL_BACKUP/.${PREFIX}"* "$WHEELS_DIR/"
|
||||
fi
|
||||
rm -rf "$DL_BACKUP"
|
||||
return 1
|
||||
fi
|
||||
done <<< "$DOWNLOAD_LIST"
|
||||
done <<< "$DOWNLOAD_ENTRIES"
|
||||
|
||||
rm -rf "$DL_BACKUP"
|
||||
if [ -n "$REMOTE_COMMIT" ]; then
|
||||
echo "$REMOTE_COMMIT" > "$WHEELS_DIR/.${PREFIX}-commit"
|
||||
echo "Recorded $PREFIX commit hash: $REMOTE_COMMIT"
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
# Help function
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo " -t, --tag <tag>               : Image tag (default: 'vllm-node', 'vllm-node-tf5' with --tf5, 'vllm-node-mxfp4' with --exp-mxfp4)"
    echo " --gpu-arch <arch>             : GPU architecture (default: '12.1a')"
    echo " --rebuild-flashinfer          : Force rebuild of FlashInfer wheels (ignore cached wheels)"
    echo " --rebuild-vllm                : Force rebuild of vLLM wheels (ignore cached wheels)"
    echo " --vllm-ref <ref>              : vLLM commit SHA, branch or tag (default: 'main')"
    echo " --flashinfer-ref <ref>        : FlashInfer commit SHA, branch or tag (default: 'main')"
    echo " -c, --copy-to <hosts>         : Host(s) to copy the image to. Accepts comma or space-delimited lists."
    echo " --copy-to-host                : Alias for --copy-to (backwards compatibility)."
    echo " --copy-parallel               : Copy to all hosts in parallel instead of serially."

@@ -179,47 +283,34 @@ usage() {

    echo " --tf5                         : Install transformers>=5 (aliases: --pre-tf, --pre-transformers)"
    echo " --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support"
    echo " --apply-vllm-pr <pr-num>      : Apply a specific PR patch to vLLM source. Can be specified multiple times."
    echo " --apply-flashinfer-pr <pr-num>: Apply a specific PR patch to FlashInfer source. Can be specified multiple times."
    echo " --full-log                    : Enable full build logging (--progress=plain)"
    echo " --no-build                    : Skip building, only copy image (requires --copy-to)"
    echo " --network <network>           : Docker network to use during build"
    echo " --cleanup                     : Remove all *.whl and *.-commit files in wheels directory"
    echo " --config                      : Path to .env configuration file (default: .env in script directory)"
    echo " --setup                       : Force autodiscovery and save configuration (even if .env exists)"
    echo " -h, --help                    : Show this help message"
    exit 1
}

# Parse all arguments
CONFIG_FILE_SET=false
while [[ "$#" -gt 0 ]]; do
    case $1 in
        -t|--tag) IMAGE_TAG="$2"; IMAGE_TAG_SET=true; shift ;;
        --gpu-arch) GPU_ARCH_LIST="$2"; shift ;;
        --rebuild-flashinfer) REBUILD_FLASHINFER=true ;;
        --rebuild-vllm) REBUILD_VLLM=true ;;
        --vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;;
        --flashinfer-ref) FLASHINFER_REF="$2"; FLASHINFER_REF_SET=true; shift ;;
        -c|--copy-to|--copy-to-host|--copy-to-hosts)
            COPY_TO_FLAG=true
            shift
            while [[ "$#" -gt 0 && "$1" != -* ]]; do
                add_copy_hosts "$1"
                shift
            done
            continue
            ;;
        -j|--build-jobs) BUILD_JOBS="$2"; shift ;;

@@ -240,21 +331,100 @@ while [[ "$#" -gt 0 ]]; do
                exit 1
            fi
            ;;
        --apply-flashinfer-pr)
            if [ -n "$2" ] && [[ "$2" != -* ]]; then
                if [ -n "$FLASHINFER_PRS" ]; then
                    FLASHINFER_PRS="$FLASHINFER_PRS $2"
                else
                    FLASHINFER_PRS="$2"
                fi
                shift
            else
                echo "Error: --apply-flashinfer-pr requires a PR number."
                exit 1
            fi
            ;;
        --full-log) FULL_LOG=true ;;
        --no-build) NO_BUILD=true ;;
        --cleanup) CLEANUP_MODE=true ;;
        --network)
            if [ -n "$2" ] && [[ "$2" != -* ]]; then
                NETWORK_ARG="$2"
                shift
            else
                echo "Error: --network requires a network name."
                exit 1
            fi
            ;;
        --config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
        --setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
        -h|--help) usage ;;
        *) echo "Unknown parameter passed: $1"; usage ;;
    esac
    shift
done

# Apply default IMAGE_TAG based on flags if -t was not specified
if [ "$IMAGE_TAG_SET" = false ]; then
    if [ "$PRE_TRANSFORMERS" = true ]; then
        IMAGE_TAG="vllm-node-tf5"
    elif [ "$EXP_MXFP4" = true ]; then
        IMAGE_TAG="vllm-node-mxfp4"
    fi
fi

# Source autodiscover.sh to load .env file
source "$(dirname "$0")/autodiscover.sh"

# If --setup: force full autodiscovery and save configuration
if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
    echo "Running full autodiscovery (--setup)..."
    detect_interfaces || exit 1
    detect_local_ip || exit 1
    detect_nodes || exit 1
    detect_copy_hosts || exit 1
    save_config || exit 1
    # Reload .env so DOTENV_* variables reflect saved config
    load_env_if_exists
fi

# Handle COPY_HOSTS from .env or autodiscovery only if -c was explicitly specified
if [ "$COPY_TO_FLAG" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
    if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
        echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
        IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
        COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
    else
        echo "No hosts specified. Using autodiscovery..."
        detect_interfaces || { echo "Error: Interface detection failed."; exit 1; }
        detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; }
        detect_nodes || { echo "Error: Node detection failed."; exit 1; }
        detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; }

        if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then
            COPY_HOSTS=("${COPY_PEER_NODES[@]}")
        fi

        if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
            echo "Error: Autodiscovery found no other nodes."
            exit 1
        fi
        echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
    fi
fi
# Validate flag combinations
if [ -n "$VLLM_PRS" ]; then
    if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi
fi

if [ -n "$FLASHINFER_PRS" ]; then
    if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-flashinfer-pr is incompatible with --exp-mxfp4"; exit 1; fi
fi

if [ "$EXP_MXFP4" = true ]; then
    if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref"; exit 1; fi
    if [ "$FLASHINFER_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --flashinfer-ref"; exit 1; fi
    if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --tf5"; exit 1; fi
    if [ "$REBUILD_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-flashinfer"; exit 1; fi
    if [ "$REBUILD_VLLM" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-vllm"; exit 1; fi

@@ -266,6 +436,30 @@ if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then

    exit 1
fi

# Handle cleanup mode
if [[ "$CLEANUP_MODE" == "true" ]]; then
    WHEELS_DIR="./wheels"
    echo "Cleaning up wheels directory..."

    # Remove all .whl files
    if compgen -G "$WHEELS_DIR/*.whl" > /dev/null 2>&1; then
        rm -f "$WHEELS_DIR"/*.whl
        echo "Removed *.whl files from $WHEELS_DIR"
    else
        echo "No *.whl files found in $WHEELS_DIR"
    fi

    # Remove all .*-commit files
    if compgen -G "$WHEELS_DIR/.*-commit" > /dev/null 2>&1; then
        rm -f "$WHEELS_DIR"/.*-commit
        echo "Removed .*-commit files from $WHEELS_DIR"
    else
        echo "No .*-commit files found in $WHEELS_DIR"
    fi

    echo "Cleanup complete."
fi

# Ensure wheels directory exists
mkdir -p ./wheels

@@ -277,6 +471,9 @@ fi

COMMON_BUILD_FLAGS+=("--build-arg" "BUILD_JOBS=$BUILD_JOBS")
COMMON_BUILD_FLAGS+=("--build-arg" "TORCH_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
COMMON_BUILD_FLAGS+=("--build-arg" "FLASHINFER_CUDA_ARCH_LIST=$GPU_ARCH_LIST")
if [ -n "$NETWORK_ARG" ]; then
    COMMON_BUILD_FLAGS+=("--network" "$NETWORK_ARG")
fi

# =====================================================
# Build image (unless --no-build or --exp-mxfp4)

@@ -288,6 +485,13 @@ RUNNER_BUILD_TIME=0
if [ "$NO_BUILD" = false ]; then
|
||||
if [ "$EXP_MXFP4" = true ]; then
|
||||
echo "Building with experimental MXFP4 support..."
|
||||
|
||||
# Generate build metadata YAML for mxfp4 build
|
||||
MXFP4_VLLM_SHA=$(grep -m1 '^ARG VLLM_SHA=' Dockerfile.mxfp4 | cut -d= -f2)
|
||||
MXFP4_FLASHINFER_SHA=$(grep -m1 '^ARG FLASHINFER_SHA=' Dockerfile.mxfp4 | cut -d= -f2)
|
||||
generate_build_metadata Dockerfile.mxfp4 "unknown" "$MXFP4_VLLM_SHA" "$MXFP4_FLASHINFER_SHA" \
|
||||
"mxfp4-pinned" "false" "true" ""
|
||||
|
||||
CMD=("docker" "build" "-t" "$IMAGE_TAG" "${COMMON_BUILD_FLAGS[@]}" "-f" "Dockerfile.mxfp4" ".")
|
||||
echo "Building image with command: ${CMD[*]}"
|
||||
BUILD_START=$(date +%s)
|
||||
@@ -298,9 +502,21 @@ if [ "$NO_BUILD" = false ]; then
|
||||
# ----------------------------------------------------------
|
||||
# Phase 1: FlashInfer wheels
|
||||
# ----------------------------------------------------------
|
||||
if [ "$FLASHINFER_REF_SET" = true ] || [ -n "$FLASHINFER_PRS" ]; then
|
||||
REBUILD_FLASHINFER=true
|
||||
fi
|
||||
|
||||
BUILD_FLASHINFER=false
|
||||
if [ "$REBUILD_FLASHINFER" = true ]; then
|
||||
echo "Rebuilding FlashInfer wheels (--rebuild-flashinfer specified)..."
|
||||
if [ "$FLASHINFER_REF_SET" = true ] && [ -n "$FLASHINFER_PRS" ]; then
|
||||
echo "Rebuilding FlashInfer wheels (--flashinfer-ref and --apply-flashinfer-pr specified)..."
|
||||
elif [ "$FLASHINFER_REF_SET" = true ]; then
|
||||
echo "Rebuilding FlashInfer wheels (--flashinfer-ref specified)..."
|
||||
elif [ -n "$FLASHINFER_PRS" ]; then
|
||||
echo "Rebuilding FlashInfer wheels (--apply-flashinfer-pr specified)..."
|
||||
else
|
||||
echo "Rebuilding FlashInfer wheels (--rebuild-flashinfer specified)..."
|
||||
fi
|
||||
BUILD_FLASHINFER=true
|
||||
elif try_download_wheels "$FLASHINFER_RELEASE_TAG" "flashinfer"; then
|
||||
echo "FlashInfer wheels ready."
|
||||
@@ -322,12 +538,18 @@ if [ "$NO_BUILD" = false ]; then

        FI_CMD=("docker" "build"
            "--target" "flashinfer-export"
            "--output" "type=local,dest=./wheels"
            "${COMMON_BUILD_FLAGS[@]}"
            "--build-arg" "FLASHINFER_REF=$FLASHINFER_REF")

        if [ "$REBUILD_FLASHINFER" = true ]; then
            FI_CMD+=("--build-arg" "CACHEBUST_FLASHINFER=$(date +%s)")
        fi

        if [ -n "$FLASHINFER_PRS" ]; then
            echo "Applying FlashInfer PRs: $FLASHINFER_PRS"
            FI_CMD+=("--build-arg" "FLASHINFER_PRS=$FLASHINFER_PRS")
        fi

        FI_CMD+=(".")

        echo "FlashInfer build command: ${FI_CMD[*]}"

@@ -347,30 +569,32 @@ if [ "$NO_BUILD" = false ]; then

    # ----------------------------------------------------------
    # Phase 2: vLLM wheels
    # ----------------------------------------------------------
    VLLM_WHEELS_EXIST=false
    if compgen -G "./wheels/vllm*.whl" > /dev/null 2>&1; then
        VLLM_WHEELS_EXIST=true
    fi

    if [ "$VLLM_REF_SET" = true ] || [ -n "$VLLM_PRS" ]; then
        REBUILD_VLLM=true
    fi

    BUILD_VLLM=false
    if [ "$REBUILD_VLLM" = true ]; then
        if [ "$VLLM_REF_SET" = true ] && [ -n "$VLLM_PRS" ]; then
            echo "Rebuilding vLLM wheels (--vllm-ref and --apply-vllm-pr specified)..."
        elif [ "$VLLM_REF_SET" = true ]; then
            echo "Rebuilding vLLM wheels (--vllm-ref specified)..."
        elif [ -n "$VLLM_PRS" ]; then
            echo "Rebuilding vLLM wheels (--apply-vllm-pr specified)..."
        else
            echo "Rebuilding vLLM wheels (--rebuild-vllm specified)..."
        fi
        BUILD_VLLM=true
    elif try_download_wheels "$VLLM_RELEASE_TAG" "vllm"; then
        echo "vLLM wheels ready."
    elif compgen -G "./wheels/vllm*.whl" > /dev/null 2>&1; then
        echo "Download failed — using existing local vLLM wheels."
    else
        echo "No vLLM wheels available (download failed) — building..."
        BUILD_VLLM=true
    fi

    if [ "$BUILD_VLLM" = true ]; then
        # Back up existing vllm wheels; restore them if the build fails
        VLLM_BACKUP="./wheels/.backup-vllm"
        rm -rf "$VLLM_BACKUP" && mkdir -p "$VLLM_BACKUP"
@@ -393,7 +617,6 @@ if [ "$NO_BUILD" = false ]; then

            VLLM_CMD+=("--build-arg" "VLLM_PRS=$VLLM_PRS")
        fi

        VLLM_CMD+=(".")

        echo "vLLM build command: ${VLLM_CMD[*]}"

@@ -408,8 +631,6 @@ if [ "$NO_BUILD" = false ]; then

            rm -rf "$VLLM_BACKUP"
            exit 1
        fi
    fi

    # ----------------------------------------------------------

@@ -420,6 +641,15 @@ if [ "$NO_BUILD" = false ]; then

        exit 1
    fi

    # Generate build metadata YAML
    VLLM_VERSION=$(ls ./wheels/vllm-*.whl 2>/dev/null | head -1 | sed 's|.*/vllm-||;s|-.*||')
    VLLM_COMMIT=""
    [ -f "./wheels/.vllm-commit" ] && VLLM_COMMIT=$(cat ./wheels/.vllm-commit)
    FLASHINFER_COMMIT=""
    [ -f "./wheels/.flashinfer-commit" ] && FLASHINFER_COMMIT=$(cat ./wheels/.flashinfer-commit)
    generate_build_metadata Dockerfile "$VLLM_VERSION" "$VLLM_COMMIT" "$FLASHINFER_COMMIT" \
        "$VLLM_REF" "$PRE_TRANSFORMERS" "false" "$VLLM_PRS"

    RUNNER_CMD=("docker" "build"
        "-t" "$IMAGE_TAG"
        "${COMMON_BUILD_FLAGS[@]}")
@@ -42,13 +42,54 @@ However, in order to get full bandwidth in NCCL RDMA mode, we need to utilize **

Also, note that connecting two Sparks using **both** ports won't give you any noticeable advantage in bandwidth, so a single connection is sufficient.
If you connect 3 Sparks by daisy-chaining them, you will only be able to sustain 100G between each pair of Sparks.

## Connecting 3 Sparks in a mesh cluster without a switch

Three Sparks can be connected together in a cluster without using a separate RoCE switch.
However, all three Sparks need to be on the same wired network using their 10G Ethernet port (RJ-45, not QSFP). Being on the same wireless network should work too, but it is not recommended and was not tested.

You need to make sure they are connected the following way: port 0 on one Spark should connect to port 1 on another Spark (unlike the non-mesh configuration).
Example diagram:
```mermaid
block-beta
  columns 1

  block:Spark3
    columns 2
    Title3["Spark 3"]:2
    s3p0["Port 0<br>192.168.187.13<br>192.168.188.13"] s3p1["Port 1<br>192.168.197.13<br>192.168.198.13"]
  end

  space

  block:Spark2
    columns 2
    Title2["Spark 2"]:2
    s2p0["Port 0<br>192.168.197.12<br>192.168.198.12"] s2p1["Port 1<br>192.168.177.12<br>192.168.178.12"]
  end

  space

  block:Spark1
    columns 2
    Title1["Spark 1"]:2
    s1p0["Port 0<br>192.168.177.11<br>192.168.178.11"] s1p1["Port 1<br>192.168.187.11<br>192.168.188.11"]
  end

  s1p0 <--> s2p1
  s2p0 <--> s3p1
  s3p0 <--> s1p1
```
## Connecting more than 2 Sparks in the cluster using a switch

To connect more than 2 Sparks, you will need a proper switch, for example the [MikroTik CRS812-DDQ](https://mikrotik.com/product/crs812_ddq).
Please refer to [this post](https://forums.developer.nvidia.com/t/6x-spark-setup/354399/56) for an example of setting up a 6-8 node Spark cluster.

## Network setup

### For dual Sparks or multiple Sparks using a QSFP switch

Assuming both are connected using the rightmost QSFP port (when looking from the back).

Create `/etc/netplan/40-cx7.yaml` on `spark`:
@@ -58,15 +99,16 @@ network:

  ethernets:
    enp1s0f1np1:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.177.11/24]
    enP2p1s0f1np1:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.178.11/24]
```
Create `/etc/netplan/40-cx7.yaml` on `spark2`:

@@ -76,23 +118,19 @@ network:

  ethernets:
    enp1s0f1np1:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.177.12/24]
    enP2p1s0f1np1:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.178.12/24]
```
Please note that only one interface of the "twin" pair strictly needs an IP address, but MTU needs to be set on both.
You can also assign a separate address to the other "twin" if you want to utilize the second interface independently, but make sure you assign it an IP address from a different subnet, as in the examples above.

**DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing.
This will not affect vLLM performance, as vLLM will use RDMA over RoCE across both "twins" even if the IP is only set on one.
Then run on each node:

@@ -115,6 +153,122 @@ MTU setting (testing):

sudo ip link set dev enp1s0f1np1 mtu 9000
```
### For 3-node mesh

A 3-node mesh is configured differently than dual clusters or clusters using a QSFP switch.

Assuming your Sparks are connected according to the diagram above:

Create `/etc/netplan/40-cx7.yaml` on `spark1`:
```yaml
network:
  version: 2
  ethernets:
    enp1s0f0np0:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.177.11/24]
    enP2p1s0f0np0:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.178.11/24]
    enp1s0f1np1:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.187.11/24]
    enP2p1s0f1np1:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.188.11/24]
```

Create `/etc/netplan/40-cx7.yaml` on `spark2`:
```yaml
network:
  version: 2
  ethernets:
    enp1s0f0np0:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.197.12/24]
    enP2p1s0f0np0:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.198.12/24]
    enp1s0f1np1:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.177.12/24]
    enP2p1s0f1np1:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.178.12/24]
```

Create `/etc/netplan/40-cx7.yaml` on `spark3`:
```yaml
network:
  version: 2
  ethernets:
    enp1s0f0np0:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.187.13/24]
    enP2p1s0f0np0:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.188.13/24]
    enp1s0f1np1:
      dhcp4: no
      dhcp6: no          # Explicitly disable DHCPv6
      link-local: []     # Restrict link-local addresses to static IPv4 only
      mtu: 9000
      addresses: [192.168.197.13/24]
    enP2p1s0f1np1:
      dhcp4: no
      dhcp6: no
      link-local: []
      mtu: 9000
      addresses: [192.168.198.13/24]
```
Then run (on each Spark):

```bash
sudo chmod 600 /etc/netplan/40-cx7.yaml
sudo netplan apply
```

### Passwordless SSH and benchmarks

Set up passwordless ssh. On the first spark:

```bash
wget https://raw.githubusercontent.com/NVIDIA/dgx-spark-playbooks/refs/heads/main/nvidia/connect-two-sparks/assets/discover-sparks
chmod +x discover-sparks
./discover-sparks
```

**Benchmark connection (use perftest package):**

Run the receiver on `spark2` node:

@@ -196,7 +350,9 @@ ib_write_lat 192.168.177.12 -d rocep1s0f1 --report_gbits -R --force-link IB

---------------------------------------------------------------------------------------
```
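The receiver-side command and its output were truncated in this diff view. Based on the client invocation preserved in the hunk header above, a plausible receiver invocation is the same command without the peer IP (the usual perftest server convention):

```bash
# On spark2 (server side); assumption: perftest servers take the same flags minus the peer IP
ib_write_lat -d rocep1s0f1 --report_gbits -R --force-link IB
```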
## NCCL Tests

### Dual Sparks or Sparks via QSFP switch

From https://build.nvidia.com/spark/nccl/stacked-sparks
@@ -239,4 +395,52 @@ mpirun -np 2 -H 192.168.177.11:1,192.168.177.12:1 \

    -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH \
    $HOME/nccl-tests/build/all_gather_perf -b 16G -e 16G -f 2

```
### 3-node mesh

```bash
# Install dependencies and build NCCL
sudo apt-get update && sudo apt-get install -y libopenmpi-dev
git clone -b dgxspark-3node-ring https://github.com/zyang-dev/nccl.git ~/nccl/
cd ~/nccl/
make -j src.build NVCC_GENCODE="-gencode=arch=compute_121,code=sm_121"

# Set environment variables
export CUDA_HOME="/usr/local/cuda"
export MPI_HOME="/usr/lib/aarch64-linux-gnu/openmpi"
export NCCL_HOME="$HOME/nccl/build/"
export LD_LIBRARY_PATH="$NCCL_HOME/lib:$CUDA_HOME/lib64/:$MPI_HOME/lib:$LD_LIBRARY_PATH"
```

Build NCCL Test Suite:

```bash
# Clone and build NCCL tests
git clone https://github.com/NVIDIA/nccl-tests.git ~/nccl-tests/
cd ~/nccl-tests/
make MPI=1
```

Test on all three nodes (replace spark1, spark2, spark3 with the actual hostnames or IP addresses on the non-QSFP interface):

```bash
# Set environment variables (NCCL_HOME points at the fork built above)
export CUDA_HOME="/usr/local/cuda"
export MPI_HOME="/usr/lib/aarch64-linux-gnu/openmpi"
export NCCL_HOME="$HOME/nccl/build/"
export LD_LIBRARY_PATH="$NCCL_HOME/lib:$CUDA_HOME/lib64/:$MPI_HOME/lib:$LD_LIBRARY_PATH"

# For 3-node mesh we have to use the 10G interface for OOB communication!
export UCX_NET_DEVICES=enP7s7
export NCCL_SOCKET_IFNAME=enP7s7
export OMPI_MCA_btl_tcp_if_include=enP7s7
export NCCL_IB_HCA=rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1
export NCCL_IB_DISABLE=0

# Run the all_gather performance test across all three nodes
mpirun -np 3 -H spark1:1,spark2:1,spark3:1 \
    --mca plm_rsh_agent "ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" \
    -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH -x NCCL_IB_MERGE_NICS=0 -x NCCL_NET_PLUGIN=none -x NCCL_IB_SUBNET_AWARE_ROUTING=1 \
    $HOME/nccl-tests/build/all_gather_perf -b 16G -e 16G -f 3
```
hf-download.sh

@@ -7,6 +7,8 @@ HUB_PATH="${HF_HOME:-$HOME/.cache/huggingface}/hub"

COPY_HOSTS=()
SSH_USER="$USER"
PARALLEL_COPY=false
CONFIG_FILE=""
CONFIG_FILE_SET=false

# Help function
usage() {
@@ -16,6 +18,7 @@ usage() {

    echo " --copy-to-host       : Alias for --copy-to (backwards compatibility)."
    echo " --copy-parallel      : Copy to all hosts in parallel instead of serially."
    echo " -u, --user <user>    : Username for ssh commands (default: \$USER)"
    echo " --config <file>      : Path to .env configuration file (default: .env in script directory)"
    echo " -h, --help           : Show this help message"
    exit 1
}

@@ -37,11 +40,11 @@ copy_model_to_host() {

    local host="$1"
    local model_name="$2"
    local model_dir="$3"

    echo "Copying model '$model_name' to ${SSH_USER}@${host}..."
    local host_copy_start host_copy_end host_copy_time
    host_copy_start=$(date +%s)

    if rsync -av --mkpath --progress "$model_dir" "${SSH_USER}@${host}:$HUB_PATH/"; then
        host_copy_end=$(date +%s)
        host_copy_time=$((host_copy_end - host_copy_start))

@@ -53,44 +56,24 @@ copy_model_to_host() {

}
# Argument parsing
COPY_TO_FLAG=false
while [[ "$#" -gt 0 ]]; do
    case $1 in
        -c|--copy-to|--copy-to-host|--copy-to-hosts)
            COPY_TO_FLAG=true
            shift
            # Consume arguments until the next flag or end of args
            while [[ "$#" -gt 0 && "$1" != -* ]]; do
                add_copy_hosts "$1"
                shift
            done
            continue
            ;;
        --copy-parallel) PARALLEL_COPY=true ;;
        -u|--user) SSH_USER="$2"; shift ;;
        --config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
        -h|--help) usage ;;
        *)
            # If positional argument is provided
            if [ -z "${MODEL_NAME:-}" ]; then
                MODEL_NAME="$1"

@@ -103,12 +86,47 @@ while [[ "$#" -gt 0 ]]; do

    shift
done
# Export config so autodiscover.sh picks it up
export CONFIG_FILE CONFIG_FILE_SET

# Source autodiscover.sh to load .env (for DOTENV_COPY_HOSTS) and make detection functions available
source "$(dirname "$0")/autodiscover.sh"

# Validate model name is provided
if [ -z "${MODEL_NAME:-}" ]; then
    echo "Error: Model name is required."
    usage
fi

# Resolve COPY_HOSTS if --copy-to was given without hosts, or use .env
if [ "$COPY_TO_FLAG" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
    # --copy-to was specified but no hosts given: use .env or autodiscover
    if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
        echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
        IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
        COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
    else
        echo "No hosts specified. Using autodiscovery..."
        detect_interfaces || { echo "Error: Interface detection failed."; exit 1; }
        detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; }
        detect_nodes || { echo "Error: Node detection failed."; exit 1; }
        detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; }

        if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then
            COPY_HOSTS=("${COPY_PEER_NODES[@]}")
        fi

        if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
            echo "Error: Autodiscovery found no other nodes."
            exit 1
        fi
        echo "Autodiscovered copy hosts: ${COPY_HOSTS[*]}"
    fi
elif [ "$COPY_TO_FLAG" = false ] && [ "${#COPY_HOSTS[@]}" -eq 0 ] && [[ -n "$DOTENV_COPY_HOSTS" ]]; then
    # No --copy-to flag but .env has COPY_HOSTS — don't auto-copy; user must request it explicitly
    : # intentional no-op; user didn't ask for copy
fi

# Check if uvx is installed
if ! command -v uvx &> /dev/null; then
    echo "Error: 'uvx' command not found."

@@ -231,4 +249,4 @@ if [ "$COPY_TIME" -gt 0 ]; then

fi
echo "Total: $(printf '%02d:%02d:%02d' $((TOTAL_TIME/3600)) $((TOTAL_TIME%3600/60)) $((TOTAL_TIME%60)))"
echo "========================================="
echo "Done downloading $MODEL_NAME."
launch-cluster.sh

@@ -16,6 +16,7 @@ fi

ETH_IF=""
IB_IF=""
NCCL_DEBUG_VAL=""
MASTER_PORT="29501"

# Initialize variables
NODES_ARG=""

@@ -23,15 +24,18 @@ CONTAINER_NAME="$DEFAULT_CONTAINER_NAME"

COMMAND_TO_RUN=""
DAEMON_MODE="false"
CHECK_CONFIG="false"
ACTION=""
CLUSTER_WAS_RUNNING="false"
MOD_PATHS=()
MOD_TYPES=()
LAUNCH_SCRIPT_PATH=""
SCRIPT_DIR="$(dirname "$(realpath "$0")")"
CONFIG_FILE=""  # Will be set to default after argument parsing

ACTIONS_ARG=""
SOLO_MODE="false"
NO_RAY_MODE="false"
LAUNCH_SCRIPT_MODE="false"
MOUNT_CACHE_DIRS="true"
BUILD_JOBS=""
NON_PRIVILEGED_MODE="false"

@@ -55,6 +59,8 @@ usage() {

    echo " --launch-script      Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted."
    echo " --check-config       Check configuration and auto-detection without launching"
    echo " --solo               Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster"
    echo " --master-port        Port for cluster coordination: Ray head port or PyTorch distributed master port (default: 29501)"
    echo " --no-ray             No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)"
    echo " --no-cache-dirs      Do not mount default cache directories (~/.cache/vllm, ~/.cache/flashinfer, ~/.triton)"
    echo " -d                   Daemon mode (only for 'start' action)"
    echo " --non-privileged     Run in non-privileged mode (removes --privileged and --ipc=host)"

@@ -62,9 +68,31 @@ usage() {

    echo " --mem-swap-limit-gb  Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)"
    echo " --pids-limit         Process limit (default: 4096, only with --non-privileged)"
    echo " --shm-size-gb        Shared memory size in GB (default: 64, only with --non-privileged)"
    echo " --config             Path to .env configuration file (default: .env in script directory)"
    echo " --setup/--discover   Force autodiscovery and save configuration (even if .env exists)"
    echo " action               start | stop | status | exec (Default: start). Not compatible with --launch-script."
    echo " command              Command to run (only for 'exec' action). Not compatible with --launch-script."
    echo ""
    echo "Supported .env file variables:"
    echo " CLUSTER_NODES        Comma-separated list of node IPs"
    echo " ETH_IF               Ethernet interface name"
    echo " IB_IF                InfiniBand interface name"
    echo " MASTER_PORT          Port for cluster coordination (default: 29501)"
    echo " CONTAINER_NAME       Container name (default: vllm_node)"
    echo " LOCAL_IP             Local IP address (for solo mode or override auto-detection)"
    echo " CONTAINER_*          Any variable starting with CONTAINER_ (except CONTAINER_NAME)"
    echo "                      becomes -e flag. Example: CONTAINER_NCCL_DEBUG=INFO -> -e NCCL_DEBUG=INFO"
    echo ""
    echo "Example .env file:"
    echo " CLUSTER_NODES=192.168.1.1,192.168.1.2"
    echo " ETH_IF=eth0"
    echo " IB_IF=ib0"
    echo " MASTER_PORT=29501"
    echo " CONTAINER_NAME=vllm_node"
    echo " LOCAL_IP=192.168.1.1"
    echo " CONTAINER_NCCL_DEBUG=INFO"
    echo " CONTAINER_HF_TOKEN=abc123"
    echo ""
    echo "Launch Script Usage:"
    echo " $0 --launch-script examples/my-script.sh  # Script copied to container and executed"
    echo " $0 --launch-script /path/to/script.sh     # Uses absolute path to script"

@@ -91,8 +119,10 @@ while [[ "$#" -gt 0 ]]; do
|
||||
NCCL_DEBUG_VAL="INFO"
|
||||
fi
|
||||
;;
|
||||
--master-port|--head-port) MASTER_PORT="$2"; shift ;;
|
||||
--check-config) CHECK_CONFIG="true" ;;
|
||||
--solo) SOLO_MODE="true" ;;
|
||||
--no-ray) NO_RAY_MODE="true" ;;
|
||||
--no-cache-dirs) MOUNT_CACHE_DIRS="false" ;;
|
||||
--non-privileged) NON_PRIVILEGED_MODE="true" ;;
|
||||
--mem-limit-gb) MEM_LIMIT_GB="$2"; shift ;;
|
||||
@@ -101,6 +131,8 @@ while [[ "$#" -gt 0 ]]; do
|
||||
--shm-size-gb) SHM_SIZE_GB="$2"; shift ;;
|
||||
-d) DAEMON_MODE="true" ;;
|
||||
-h|--help) usage ;;
|
||||
--config) CONFIG_FILE="$2"; shift ;;
|
||||
--setup|--discover) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
|
||||
start|stop|status)
|
||||
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
||||
echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
|
||||
@@ -115,7 +147,7 @@ while [[ "$#" -gt 0 ]]; do
|
||||
fi
|
||||
ACTION="exec"
|
||||
shift
|
||||
COMMAND_TO_RUN="$@"
|
||||
COMMAND_TO_RUN=$(printf "%q " "$@")
|
||||
break
|
||||
;;
|
||||
*)
|
||||
@@ -126,6 +158,115 @@ while [[ "$#" -gt 0 ]]; do
|
||||
shift
|
||||
done
|
||||
|
||||
# Set .env file path (use default if not specified)
|
||||
if [[ -z "$CONFIG_FILE" ]]; then
|
||||
CONFIG_FILE="$SCRIPT_DIR/.env"
|
||||
CONFIG_FILE_SET=false
|
||||
else
|
||||
CONFIG_FILE_SET=true
|
||||
fi
|
||||
|
||||
# Load .env file
|
||||
if [[ -f "$CONFIG_FILE" ]]; then
|
||||
echo "Loading configuration from .env file..."
|
||||
|
||||
# Validate .env file syntax
|
||||
if ! python3 -c "
|
||||
import sys
|
||||
import re
|
||||
|
||||
env_file = '$CONFIG_FILE'
|
||||
seen_keys = set()
|
||||
|
||||
with open(env_file, 'r') as f:
|
||||
for line_num, line in enumerate(f, 1):
|
||||
line = line.strip()
|
||||
# Skip empty lines and comments
|
||||
if not line or line.startswith('#'):
|
||||
continue
|
||||
|
||||
# Check for key=value format
|
||||
if '=' not in line:
|
||||
print(f'Error: Invalid syntax at line {line_num}: missing \"=\"')
|
||||
sys.exit(1)
|
||||
|
||||
key = line.split('=', 1)[0].strip()
|
||||
|
||||
# Validate key format (alphanumeric + underscore)
|
||||
if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', key):
|
||||
print(f'Error: Invalid key format at line {line_num}: {key}')
|
||||
sys.exit(1)
|
||||
|
||||
# Check for duplicates
|
||||
if key in seen_keys:
|
||||
print(f'Error: Duplicate key at line {line_num}: {key}')
|
||||
sys.exit(1)
|
||||
|
||||
seen_keys.add(key)
|
||||
|
||||
sys.exit(0)
|
||||
" 2>/dev/null; then
|
||||
echo "Error: Invalid .env file syntax. Aborting."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Load .env variables with DOTENV_ prefix
|
||||
while IFS='=' read -r key value || [[ -n "$key" ]]; do
|
||||
# Skip comments and empty lines
|
||||
[[ "$key" =~ ^[[:space:]]*# ]] && continue
|
||||
[[ -z "$key" ]] && continue
|
||||
|
||||
# Remove leading/trailing whitespace from key
|
||||
key=$(echo "$key" | xargs)
|
||||
|
||||
# Skip if key is empty after trimming
|
||||
[[ -z "$key" ]] && continue
|
||||
|
||||
# Remove quotes and whitespace from value using Python for proper shlex handling
|
||||
value=$(python3 -c "
|
||||
import shlex
|
||||
import sys
|
||||
value = '''$value'''
|
||||
# Strip whitespace
|
||||
value = value.strip()
|
||||
# Remove surrounding quotes if present
|
||||
if (value.startswith('\"') and value.endswith('\"')) or (value.startswith(\"'\" ) and value.endswith(\"'\")):
|
||||
value = value[1:-1]
|
||||
print(value)
|
||||
")
|
||||
|
||||
# Export with DOTENV_ prefix
|
||||
export "DOTENV_$key=$value"
|
||||
done < "$CONFIG_FILE"
|
||||
|
||||
echo "Loaded .env variables: $(compgen -v DOTENV_ | tr '\n' ' ')"
|
||||
fi
|
||||
|
||||
# Apply .env configuration (CLI args take precedence)
|
||||
if [[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]]; then
|
||||
NODES_ARG="$DOTENV_CLUSTER_NODES"
|
||||
fi
|
||||
|
||||
if [[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]]; then
|
||||
ETH_IF="$DOTENV_ETH_IF"
|
||||
fi
|
||||
|
||||
if [[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]]; then
|
||||
IB_IF="$DOTENV_IB_IF"
|
||||
fi
|
||||
|
||||
if [[ -z "$MASTER_PORT" || "$MASTER_PORT" == "29501" ]] && [[ -n "$DOTENV_MASTER_PORT" ]]; then
|
||||
MASTER_PORT="$DOTENV_MASTER_PORT"
|
||||
fi
|
||||
|
||||
if [[ -z "$CONTAINER_NAME" || "$CONTAINER_NAME" == "vllm_node" ]] && [[ -n "$DOTENV_CONTAINER_NAME" ]]; then
|
||||
CONTAINER_NAME="$DOTENV_CONTAINER_NAME"
|
||||
fi
|
||||
|
||||
if [[ -n "$DOTENV_LOCAL_IP" ]]; then
|
||||
export LOCAL_IP="$DOTENV_LOCAL_IP"
|
||||
fi
|
||||
|
||||
# Validate non-privileged mode flags
|
||||
if [[ "$NON_PRIVILEGED_MODE" == "true" ]]; then
|
||||
# Set default swap limit if not specified
|
||||
@@ -156,6 +297,26 @@ if [[ -n "$NCCL_DEBUG_VAL" ]]; then
|
||||
esac
|
||||
fi
|
||||
|
||||
# Add container environment variables from .env (CONTAINER_* pattern)
|
||||
# Excludes CONTAINER_NAME which is a configuration variable, not an env var
|
||||
for env_var in $(compgen -v DOTENV_CONTAINER_); do
|
||||
# Skip CONTAINER_NAME as it's a configuration variable
|
||||
[[ "$env_var" == "DOTENV_CONTAINER_NAME" ]] && continue
|
||||
|
||||
# Get the value
|
||||
value="${!env_var}"
|
||||
|
||||
# Extract the actual env var name (remove DOTENV_CONTAINER_ prefix)
|
||||
actual_var="${env_var#DOTENV_CONTAINER_}"
|
||||
|
||||
# Properly escape the value for shell using Python
|
||||
escaped_value=$(python3 -c "import shlex; print(shlex.quote('$value'))")
|
||||
|
||||
# Add to docker args
|
||||
DOCKER_ARGS="$DOCKER_ARGS -e $actual_var=$escaped_value"
|
||||
echo "Adding container env: $actual_var"
|
||||
done
|
||||
|
||||
# Add build job parallelization environment variables if BUILD_JOBS is set
|
||||
if [[ -n "$BUILD_JOBS" ]]; then
|
||||
DOCKER_ARGS="$DOCKER_ARGS -e MAX_JOBS=$BUILD_JOBS"
|
||||
@@ -204,9 +365,10 @@ if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
||||
|
||||
# Set command to run the copied script (use absolute path since docker exec may not be in /workspace)
|
||||
COMMAND_TO_RUN="/workspace/exec-script.sh"
|
||||
|
||||
LAUNCH_SCRIPT_MODE="true"
|
||||
|
||||
# If launch script is specified, default action to exec unless explicitly set to stop/status
|
||||
if [[ "$ACTION" == "start" ]]; then
|
||||
if [[ -z "$ACTION" || "$ACTION" == "start" ]]; then
|
||||
ACTION="exec"
|
||||
fi
|
||||
fi
|
||||
@@ -251,13 +413,33 @@ done
|
||||
# Source autodiscover module
|
||||
source "$(dirname "$0")/autodiscover.sh"
|
||||
|
||||
if [[ "$SOLO_MODE" == "true" ]]; then
|
||||
if [[ -n "$NODES_ARG" ]]; then
|
||||
echo "Error: --solo is incompatible with -n/--nodes."
|
||||
exit 1
|
||||
if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
|
||||
# --setup: force full autodiscovery and save configuration
|
||||
echo "Running full autodiscovery (--setup)..."
|
||||
# Clear pre-loaded values so detect functions run fresh instead of short-circuiting
|
||||
ETH_IF="" IB_IF="" NODES_ARG="" LOCAL_IP=""
|
||||
detect_interfaces || exit 1
|
||||
detect_local_ip || exit 1
|
||||
detect_nodes || exit 1
|
||||
detect_copy_hosts || exit 1
|
||||
save_config || exit 1
|
||||
# Reload .env so DOTENV_* variables reflect saved config
|
||||
load_env_if_exists
|
||||
[[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]] && NODES_ARG="$DOTENV_CLUSTER_NODES"
|
||||
[[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]] && ETH_IF="$DOTENV_ETH_IF"
|
||||
[[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]] && IB_IF="$DOTENV_IB_IF"
|
||||
# If no action was specified, setup was the only intent — exit cleanly
|
||||
if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" ]]; then
|
||||
exit 0
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "$SOLO_MODE" == "true" ]]; then
|
||||
# Solo mode: skip node detection, just get local IP
|
||||
LOCAL_IP="127.0.0.1"
|
||||
# Use LOCAL_IP from .env if set, otherwise default to 127.0.0.1
|
||||
if [[ -z "$LOCAL_IP" ]]; then
|
||||
LOCAL_IP="127.0.0.1"
|
||||
fi
|
||||
NODES_ARG="$LOCAL_IP"
|
||||
PEER_NODES=()
|
||||
echo "Solo mode enabled. Skipping node detection."
|
||||
@@ -303,6 +485,11 @@ if [[ "$SOLO_MODE" == "false" && ${#PEER_NODES[@]} -eq 0 ]]; then
|
||||
SOLO_MODE="true"
|
||||
fi
|
||||
|
||||
if [[ "$NO_RAY_MODE" == "true" && "$SOLO_MODE" == "true" ]]; then
|
||||
echo "Warning: Only one node detected; --no-ray has no effect in solo mode. Proceeding normally."
|
||||
NO_RAY_MODE="false"
|
||||
fi
|
||||
|
||||
echo "Head Node: $HEAD_IP"
|
||||
echo "Worker Nodes: ${PEER_NODES[*]}"
|
||||
echo "Container Name: $CONTAINER_NAME"
|
||||
@@ -324,6 +511,12 @@ if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ]
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" && "$CHECK_CONFIG" != "true" ]]; then
|
||||
echo "Error: No action specified. Use: start | stop | status | exec"
|
||||
usage
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ "$CHECK_CONFIG" == "true" ]]; then
|
||||
echo "Configuration Check Complete."
|
||||
echo " Image Name: $IMAGE_NAME"
|
||||
@@ -377,9 +570,11 @@ if [[ "$ACTION" == "status" ]]; then
|
||||
# Check Head
|
||||
if docker ps | grep -q "$CONTAINER_NAME"; then
|
||||
echo "[HEAD] $HEAD_IP: Container '$CONTAINER_NAME' is RUNNING."
|
||||
echo "--- Ray Status ---"
|
||||
docker exec "$CONTAINER_NAME" ray status || echo "Failed to get ray status."
|
||||
echo "------------------"
|
||||
if [[ "$NO_RAY_MODE" == "false" ]]; then
|
||||
echo "--- Ray Status ---"
|
||||
docker exec "$CONTAINER_NAME" ray status || echo "Failed to get ray status."
|
||||
echo "------------------"
|
||||
fi
|
||||
else
|
||||
echo "[HEAD] $HEAD_IP: Container '$CONTAINER_NAME' is NOT running."
|
||||
fi
|
||||
@@ -537,23 +732,109 @@ apply_mod_to_container() {
|
||||
fi
|
||||
}
|
||||
|
||||
# Copy Launch Script to Container Function
|
||||
copy_launch_script_to_container() {
|
||||
local container="$1"
|
||||
local script_path="$2"
|
||||
# Parse -tp/-pp/-dp (and long forms) from a text string (command or script content).
|
||||
# Sets TP_SIZE, PP_SIZE, DP_SIZE, PARALLELISM_FOUND globals.
|
||||
# Only acts when at least one parallelism flag is present.
|
||||
parse_parallelism_from_text() {
|
||||
local text="$1"
|
||||
TP_SIZE=1; PP_SIZE=1; DP_SIZE=1
|
||||
PARALLELISM_FOUND=false
|
||||
|
||||
echo "Copying launch script to head node..."
|
||||
# Normalize --flag=value to --flag value for uniform word-by-word parsing
|
||||
local normalized
|
||||
normalized=$(echo "$text" | sed 's/\(--[a-z-]*\)=/\1 /g')
|
||||
|
||||
local target_script_path="$script_path"
|
||||
local prev=""
|
||||
for word in $normalized; do
|
||||
case "$prev" in
|
||||
-tp|--tensor-parallel-size)
|
||||
[[ "$word" =~ ^[0-9]+$ ]] && TP_SIZE="$word" && PARALLELISM_FOUND=true ;;
|
||||
-pp|--pipeline-parallel-size)
|
||||
[[ "$word" =~ ^[0-9]+$ ]] && PP_SIZE="$word" && PARALLELISM_FOUND=true ;;
|
||||
-dp|--data-parallel-size)
|
||||
[[ "$word" =~ ^[0-9]+$ ]] && DP_SIZE="$word" && PARALLELISM_FOUND=true ;;
|
||||
esac
|
||||
prev="$word"
|
||||
done
|
||||
}
|
||||
|
||||
# Copy script into container as /workspace/exec-script.sh
|
||||
echo " Copying script into container..."
|
||||
docker cp "$target_script_path" "$container:/workspace/exec-script.sh"
|
||||
# Build a patched copy of the launch script on the host for a specific node.
|
||||
# Strips --distributed-executor-backend and appends multi-node args.
|
||||
# Prints the path of the temp file (caller must delete it).
|
||||
make_node_script() {
|
||||
local script_path="$1"; local nnodes="$2"; local node_rank="$3"; local master_addr="$4"
|
||||
local extra="--nnodes $nnodes --node-rank $node_rank --master-addr $master_addr --master-port $MASTER_PORT"
|
||||
[[ "$node_rank" -gt 0 ]] && extra="$extra --headless"
|
||||
|
||||
# Make executable
|
||||
local tmp; tmp=$(mktemp /tmp/vllm_node_script_XXXXXX.sh)
|
||||
# Remove just the flag and its value (not the whole line), then filter empty/backslash-only lines
|
||||
sed 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//' "$script_path" | \
|
||||
grep -Ev '^[[:space:]\\]*$' > "$tmp"
|
||||
# Strip trailing backslash from last line before appending multi-node args
|
||||
sed -i "$ s/[[:space:]]*\\\\[[:space:]]*$//" "$tmp"
|
||||
sed -i "$ s/$/ $extra/" "$tmp"
|
||||
chmod +x "$tmp"
|
||||
echo "$tmp"
|
||||
}
|
||||
|
||||
# Copy a script file into a local container as /workspace/exec-script.sh
|
||||
copy_script_to_container() {
|
||||
local container="$1"; local script_path="$2"; local label="${3:-node}"
|
||||
echo "Copying launch script to $label..."
|
||||
docker cp "$script_path" "$container:/workspace/exec-script.sh" || { echo "Error: docker cp to $label failed"; exit 1; }
|
||||
docker exec "$container" chmod +x /workspace/exec-script.sh
|
||||
}
|
||||
|
||||
echo " Launch script copied to head node"
|
||||
# Copy a script file to a remote container via scp + docker cp
|
||||
copy_script_to_worker() {
|
||||
local worker_ip="$1"; local container="$2"; local script_path="$3"
|
||||
echo "Copying launch script to worker $worker_ip..."
|
||||
local remote_tmp="/tmp/vllm_script_$(date +%s)_$RANDOM.sh"
|
||||
scp -o BatchMode=yes -o StrictHostKeyChecking=no "$script_path" "$worker_ip:$remote_tmp" || { echo "Error: scp to $worker_ip failed"; exit 1; }
|
||||
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker_ip" \
|
||||
"docker cp $remote_tmp $container:/workspace/exec-script.sh && \
|
||||
docker exec $container chmod +x /workspace/exec-script.sh && \
|
||||
rm -f $remote_tmp" || { echo "Error: docker cp to worker $worker_ip failed"; exit 1; }
|
||||
}
|
||||
|
||||
# Build -e KEY=VALUE flags for a given node IP (used in docker run and docker exec)
|
||||
get_env_flags() {
|
||||
local node_ip="$1"
|
||||
printf -- '-e %s ' \
|
||||
"VLLM_HOST_IP=$node_ip" \
|
||||
"RAY_NODE_IP_ADDRESS=$node_ip" \
|
||||
"RAY_OVERRIDE_NODE_IP_ADDRESS=$node_ip" \
|
||||
"MN_IF_NAME=$ETH_IF" \
|
||||
"UCX_NET_DEVICES=$ETH_IF" \
|
||||
"NCCL_SOCKET_IFNAME=$ETH_IF" \
|
||||
"NCCL_IB_HCA=$IB_IF" \
|
||||
"NCCL_IB_DISABLE=0" \
|
||||
"OMPI_MCA_btl_tcp_if_include=$ETH_IF" \
|
||||
"GLOO_SOCKET_IFNAME=$ETH_IF" \
|
||||
"TP_SOCKET_IFNAME=$ETH_IF" \
|
||||
"RAY_memory_monitor_refresh_ms=0" \
|
||||
"RAY_num_prestart_python_workers=0" \
|
||||
"RAY_object_store_memory=1073741824"
|
||||
}
|
||||
|
||||
# Start Ray head node inside the container
|
||||
start_ray_head() {
|
||||
local container="$1"
|
||||
echo "Starting Ray HEAD node on $HEAD_IP..."
|
||||
docker exec -d "$container" bash -c \
|
||||
"ray start --block --head --port $MASTER_PORT --object-store-memory 1073741824 --num-cpus 2 \
|
||||
--node-ip-address $HEAD_IP --include-dashboard=false --disable-usage-stats \
|
||||
>> /proc/1/fd/1 2>&1"
|
||||
}
|
||||
|
||||
# Start Ray worker node inside the container on a remote host
|
||||
start_ray_worker() {
|
||||
local worker_ip="$1"; local container="$2"
|
||||
echo "Starting Ray WORKER node on $worker_ip..."
|
||||
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker_ip" \
|
||||
"docker exec -d $container bash -c \
|
||||
'ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats \
|
||||
--address=$HEAD_IP:$MASTER_PORT --node-ip-address $worker_ip >> /proc/1/fd/1 2>&1'"
|
||||
}
|
||||
|
||||
# Start Cluster Function
|
||||
@@ -564,31 +845,6 @@ start_cluster() {
|
||||
return
|
||||
fi
|
||||
|
||||
# Start Head Node
|
||||
echo "Starting Head Node on $HEAD_IP..."
|
||||
|
||||
# Ensure cache dirs exist on head
|
||||
if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then
|
||||
for dir in "${CACHE_DIRS_TO_CREATE[@]}"; do
|
||||
mkdir -p "$dir"
|
||||
done
|
||||
fi
|
||||
|
||||
local head_cmd_args=()
|
||||
if [[ "$SOLO_MODE" == "true" ]]; then
|
||||
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
|
||||
head_cmd_args=(bash -c "echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting container...; exec sleep infinity")
|
||||
else
|
||||
head_cmd_args=(sleep infinity)
|
||||
fi
|
||||
else
|
||||
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
|
||||
head_cmd_args=(bash -c "echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting node...; exec ./run-cluster-node.sh --role head --host-ip $HEAD_IP --eth-if $ETH_IF --ib-if $IB_IF")
|
||||
else
|
||||
head_cmd_args=(./run-cluster-node.sh --role head --host-ip "$HEAD_IP" --eth-if "$ETH_IF" --ib-if "$IB_IF")
|
||||
fi
|
||||
fi
|
||||
|
||||
# Build docker run arguments based on mode
|
||||
local docker_args_common="--gpus all -d --rm --network host --name $CONTAINER_NAME $DOCKER_ARGS $IMAGE_NAME"
|
||||
local docker_caps_args=""
|
||||
@@ -603,62 +859,68 @@ start_cluster() {
|
||||
docker_resource_args="--ipc=host"
|
||||
fi
|
||||
|
||||
# Start Head Node
|
||||
echo "Starting Head Node on $HEAD_IP..."
|
||||
if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then
|
||||
for dir in "${CACHE_DIRS_TO_CREATE[@]}"; do
|
||||
mkdir -p "$dir"
|
||||
done
|
||||
fi
|
||||
docker run $docker_caps_args $docker_resource_args \
|
||||
$docker_args_common \
|
||||
"${head_cmd_args[@]}"
|
||||
$(get_env_flags "$HEAD_IP") $docker_args_common sleep infinity
|
||||
|
||||
# Start Worker Nodes
|
||||
for worker in "${PEER_NODES[@]}"; do
|
||||
echo "Starting Worker Node on $worker..."
|
||||
|
||||
# Ensure cache dirs exist on worker
|
||||
if [[ "$MOUNT_CACHE_DIRS" == "true" ]]; then
|
||||
# Create string of dirs to create
|
||||
dirs_str="${CACHE_DIRS_TO_CREATE[*]}"
|
||||
ssh "$worker" "mkdir -p $dirs_str"
|
||||
fi
|
||||
|
||||
local docker_run_cmd="docker run $docker_caps_args $docker_resource_args $docker_args_common"
|
||||
|
||||
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
|
||||
local inner_script="echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting node...; exec ./run-cluster-node.sh --role node --host-ip $worker --eth-if $ETH_IF --ib-if $IB_IF --head-ip $HEAD_IP"
|
||||
ssh "$worker" "$docker_run_cmd bash -c \"$inner_script\""
|
||||
else
|
||||
ssh "$worker" "$docker_run_cmd ./run-cluster-node.sh --role node --host-ip $worker --eth-if $ETH_IF --ib-if $IB_IF --head-ip $HEAD_IP"
|
||||
ssh "$worker" "mkdir -p ${CACHE_DIRS_TO_CREATE[*]}"
|
||||
fi
|
||||
local docker_run_cmd="docker run $docker_caps_args $docker_resource_args $(get_env_flags "$worker") $docker_args_common"
|
||||
ssh "$worker" "$docker_run_cmd sleep infinity"
|
||||
done
|
||||
|
||||
# Apply mods if requested
|
||||
# Apply mods (containers are idle — no mod_done sync needed)
|
||||
if [[ ${#MOD_PATHS[@]} -gt 0 ]]; then
|
||||
echo "Applying modifications to cluster nodes..."
|
||||
|
||||
# Apply to Head
|
||||
for i in "${!MOD_PATHS[@]}"; do
|
||||
apply_mod_to_container "$HEAD_IP" "$CONTAINER_NAME" "true" "${MOD_PATHS[$i]}" "${MOD_TYPES[$i]}"
|
||||
done
|
||||
# Signal completion on Head
|
||||
docker exec "$CONTAINER_NAME" touch /tmp/mod_done
|
||||
|
||||
# Apply to Workers
|
||||
for worker in "${PEER_NODES[@]}"; do
|
||||
for i in "${!MOD_PATHS[@]}"; do
|
||||
apply_mod_to_container "$worker" "$CONTAINER_NAME" "false" "${MOD_PATHS[$i]}" "${MOD_TYPES[$i]}"
|
||||
done
|
||||
# Signal completion on Worker
|
||||
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker" "docker exec $CONTAINER_NAME touch /tmp/mod_done"
|
||||
done
|
||||
fi
|
||||
|
||||
# Copy launch script to head node only (workers don't need it - they just run Ray)
|
||||
# Copy (and patch for no-ray) launch script
|
||||
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
||||
copy_launch_script_to_container "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH"
|
||||
local total_nodes=$(( 1 + ${#PEER_NODES[@]} ))
|
||||
if [[ "$NO_RAY_MODE" == "true" ]]; then
|
||||
# Build per-node patched scripts on the host, then copy
|
||||
local head_script; head_script=$(make_node_script "$LAUNCH_SCRIPT_PATH" "$total_nodes" "0" "$HEAD_IP")
|
||||
copy_script_to_container "$CONTAINER_NAME" "$head_script" "head node ($HEAD_IP)"
|
||||
rm -f "$head_script"
|
||||
|
||||
local rank=1
|
||||
for worker in "${PEER_NODES[@]}"; do
|
||||
local worker_script; worker_script=$(make_node_script "$LAUNCH_SCRIPT_PATH" "$total_nodes" "$rank" "$HEAD_IP")
|
||||
copy_script_to_worker "$worker" "$CONTAINER_NAME" "$worker_script"
|
||||
rm -f "$worker_script"
|
||||
(( rank++ ))
|
||||
done
|
||||
else
|
||||
copy_script_to_container "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH" "head node"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "$SOLO_MODE" == "false" ]]; then
|
||||
# Start Ray cluster (unless solo or no-ray)
|
||||
if [[ "$SOLO_MODE" == "false" && "$NO_RAY_MODE" == "false" ]]; then
|
||||
start_ray_head "$CONTAINER_NAME"
|
||||
for worker in "${PEER_NODES[@]}"; do
|
||||
start_ray_worker "$worker" "$CONTAINER_NAME"
|
||||
done
|
||||
wait_for_cluster
|
||||
else
|
||||
echo "Solo mode active: Skipping Ray cluster readiness check."
|
||||
# Give container a moment to start up
|
||||
sleep 2
|
||||
fi
|
||||
}
|
||||
@@ -686,25 +948,97 @@ wait_for_cluster() {
|
||||
exit 1
|
||||
}
|
||||
|
||||
if [[ "$ACTION" == "exec" ]]; then
|
||||
start_cluster
|
||||
echo "Executing command on head node: $COMMAND_TO_RUN"
|
||||
|
||||
# Execute command on head node (daemon or interactive)
|
||||
_exec_on_head() {
|
||||
local cmd="$1"
|
||||
if [[ "$DAEMON_MODE" == "true" ]]; then
|
||||
# Daemon mode: run command detached inside the container and exit immediately
|
||||
# Extract env vars starting from VLLM_HOST_IP to avoid interactive check in .bashrc
|
||||
# Redirect output to PID 1 stdout/stderr so it shows up in docker logs
|
||||
docker exec -d "$CONTAINER_NAME" bash -c "eval \"\$(sed -n '/export VLLM_HOST_IP/,\$p' /root/.bashrc)\" && { $COMMAND_TO_RUN; } >> /proc/1/fd/1 2>> /proc/1/fd/2"
|
||||
docker exec -d "$CONTAINER_NAME" bash -c "$cmd >> /proc/1/fd/1 2>&1"
|
||||
echo "Command dispatched in background (Daemon mode). Container: $CONTAINER_NAME"
|
||||
else
|
||||
# Check if running in a TTY to avoid "input device is not a TTY" error
|
||||
if [ -t 0 ]; then
|
||||
DOCKER_EXEC_FLAGS="-it"
|
||||
else
|
||||
DOCKER_EXEC_FLAGS="-i"
|
||||
fi
|
||||
if [ -t 0 ]; then DOCKER_EXEC_FLAGS="-it"; else DOCKER_EXEC_FLAGS="-i"; fi
|
||||
docker exec $DOCKER_EXEC_FLAGS "$CONTAINER_NAME" bash -c "$cmd"
|
||||
fi
|
||||
}
|
||||
|
||||
docker exec $DOCKER_EXEC_FLAGS "$CONTAINER_NAME" bash -i -c "$COMMAND_TO_RUN"
|
||||
# Execute a no-ray multi-node command: workers (background) then head
|
||||
exec_no_ray_cluster() {
|
||||
local base_cmd="$1"
|
||||
local total_nodes=$(( 1 + ${#PEER_NODES[@]} ))
|
||||
|
||||
# Launch workers first (always background)
|
||||
local rank=1
|
||||
for worker in "${PEER_NODES[@]}"; do
|
||||
local worker_cmd
|
||||
if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then
|
||||
worker_cmd="$base_cmd" # script already patched per-node in start_cluster()
|
||||
else
|
||||
local clean
|
||||
clean=$(echo "$base_cmd" | sed 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//')
|
||||
worker_cmd="$clean --nnodes $total_nodes --node-rank $rank --master-addr $HEAD_IP --master-port $MASTER_PORT --headless"
|
||||
fi
|
||||
echo "Launching worker (rank $rank) on $worker..."
|
||||
local remote_payload remote_cmd
|
||||
remote_payload="$worker_cmd >> /proc/1/fd/1 2>&1"
|
||||
printf -v remote_cmd 'docker exec -d %q bash -c %q' "$CONTAINER_NAME" "$remote_payload"
|
||||
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker" "$remote_cmd"
|
||||
(( rank++ ))
|
||||
done
|
||||
|
||||
# Launch head (rank 0) last
|
||||
local head_cmd
|
||||
if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then
|
||||
head_cmd="$base_cmd"
|
||||
else
|
||||
local clean
|
||||
clean=$(echo "$base_cmd" | sed 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//')
|
||||
head_cmd="$clean --nnodes $total_nodes --node-rank 0 --master-addr $HEAD_IP --master-port $MASTER_PORT"
|
||||
fi
|
||||
|
||||
echo "Executing command on head node (rank 0): $head_cmd"
|
||||
if [[ "$DAEMON_MODE" == "true" ]]; then
|
||||
docker exec -d "$CONTAINER_NAME" bash -c "$head_cmd >> /proc/1/fd/1 2>&1"
|
||||
echo "Command dispatched in background (Daemon mode). Container: $CONTAINER_NAME"
|
||||
else
|
||||
if [ -t 0 ]; then DOCKER_EXEC_FLAGS="-it"; else DOCKER_EXEC_FLAGS="-i"; fi
|
||||
docker exec $DOCKER_EXEC_FLAGS "$CONTAINER_NAME" bash -c "$head_cmd"
|
||||
fi
|
||||
}
|
||||
|
||||
if [[ "$ACTION" == "exec" ]]; then
|
||||
# Trim (or error on) PEER_NODES based on declared parallelism, for any multi-node exec
|
||||
if [[ "$SOLO_MODE" != "true" && ${#PEER_NODES[@]} -gt 0 ]]; then
|
||||
if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then
|
||||
cmd_text=$(cat "$LAUNCH_SCRIPT_PATH" 2>/dev/null || true)
|
||||
else
|
||||
cmd_text="$COMMAND_TO_RUN"
|
||||
fi
|
||||
parse_parallelism_from_text "$cmd_text"
|
||||
|
||||
if [[ "$PARALLELISM_FOUND" == "true" ]]; then
|
||||
required_nodes=$(( TP_SIZE * PP_SIZE * DP_SIZE ))
|
||||
total_nodes=$(( 1 + ${#PEER_NODES[@]} ))
|
||||
|
||||
if [[ "$required_nodes" -gt "$total_nodes" ]]; then
|
||||
echo "Error: Command requires $required_nodes nodes (tp=$TP_SIZE * pp=$PP_SIZE * dp=$DP_SIZE) but only $total_nodes node(s) are configured."
|
||||
exit 1
|
||||
elif [[ "$required_nodes" -lt "$total_nodes" ]]; then
|
||||
echo "Note: Command requires $required_nodes node(s) (tp=$TP_SIZE * pp=$PP_SIZE * dp=$DP_SIZE); using $required_nodes of $total_nodes configured node(s)."
|
||||
PEER_NODES=("${PEER_NODES[@]:0:$(( required_nodes - 1 ))}")
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
start_cluster
|
||||
echo "Executing command: $COMMAND_TO_RUN"
|
||||
|
||||
if [[ "$NO_RAY_MODE" == "true" && ${#PEER_NODES[@]} -gt 0 ]]; then
|
||||
if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]] || echo "$COMMAND_TO_RUN" | grep -q "vllm serve"; then
|
||||
exec_no_ray_cluster "$COMMAND_TO_RUN"
|
||||
else
|
||||
_exec_on_head "$COMMAND_TO_RUN"
|
||||
fi
|
||||
else
|
||||
_exec_on_head "$COMMAND_TO_RUN"
|
||||
fi
|
||||
elif [[ "$ACTION" == "start" ]]; then
|
||||
start_cluster
|
||||
|
||||
17
mods/drop-caches/run.sh
Normal file
17
mods/drop-caches/run.sh
Normal file
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
|
||||
# This mod will drop the FS caches every minute - useful to unstuck Qwen3.5-397B or other similar models during loading
|
||||
|
||||
CMD='sync; echo 3 > /proc/sys/vm/drop_caches'
|
||||
LOG="/tmp/drop_caches.log"
|
||||
PIDFILE="/tmp/drop_caches.pid"
|
||||
|
||||
nohup bash -c '
|
||||
while true; do
|
||||
'"$CMD"' >> "'"$LOG"'" 2>&1
|
||||
sleep 60
|
||||
done
|
||||
' >/dev/null 2>&1 &
|
||||
|
||||
echo $! > "$PIDFILE"
|
||||
echo "Started drop_caches loop with PID $(cat "$PIDFILE"); log is available in $LOG"
|
||||
116
mods/exp-b12x/run.sh
Normal file
116
mods/exp-b12x/run.sh
Normal file
@@ -0,0 +1,116 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
SITE_PACKAGES="/usr/local/lib/python3.12/dist-packages"
|
||||
|
||||
echo "=== EXPERIMENTAL b12x-patches mod ==="
|
||||
|
||||
# 0a. Check if b12x support is present in vLLM
|
||||
if [ ! -f "$SITE_PACKAGES/vllm/model_executor/layers/fused_moe/experts/flashinfer_b12x_moe.py" ]; then
|
||||
echo "[b12x ERROR] No b12x support detected; please rebuild with --apply-vllm-pr 40082, e.g.:"
|
||||
echo "./build-and-copy.sh -t vllm-node-40082 --apply-vllm-pr 40082"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 0b. Check if environment variables are set
|
||||
|
||||
if [[ "$VLLM_NVFP4_GEMM_BACKEND" != "flashinfer-b12x" ]]; then
|
||||
echo "[b12x ERROR] Please set required environment variables to use b12x backend"
|
||||
echo "*** Add the following arguments to launch-cluster.sh:"
|
||||
echo " -e FLASHINFER_DISABLE_VERSION_CHECK=1 -e VLLM_USE_FLASHINFER_MOE_FP16=1 -e VLLM_NVFP4_GEMM_BACKEND=flashinfer-b12x -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 -e VLLM_FLASHINFER_ALLREDUCE_BACKEND=trtllm -e VLLM_USE_FLASHINFER_MOE_FP4=1"
|
||||
echo "*** also set the following vLLM parameters:"
|
||||
echo " --moe-backend flashinfer_b12x --attention-backend flashinfer"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# 1. Pin nvidia-cutlass-dsl + companion libs to 4.4.2
|
||||
# (4.5.x generates bad PTX on SM121 — `_mma` rejected by ptxas).
|
||||
# All THREE packages must match: the python frontend, the base libs,
|
||||
# and the CUDA 13 libs (which contain the MLIR compiler).
|
||||
# ---------------------------------------------------------------
|
||||
DSL_VER=$(pip show nvidia-cutlass-dsl 2>/dev/null | grep '^Version:' | awk '{print $2}' || true)
|
||||
LIBS_BASE_VER=$(pip show nvidia-cutlass-dsl-libs-base 2>/dev/null | grep '^Version:' | awk '{print $2}' || true)
|
||||
# LIBS_CU13_VER=$(pip show nvidia-cutlass-dsl-libs-cu13 2>/dev/null | grep '^Version:' | awk '{print $2}' || true)
|
||||
if [ "$DSL_VER" != "4.4.2" ] || [ "$LIBS_BASE_VER" != "4.4.2" ] || [ "$LIBS_CU13_VER" != "4.4.2" ]; then
|
||||
echo "[b12x] Pinning nvidia-cutlass-dsl{,-libs-base,-libs-cu13} to 4.4.2"
|
||||
echo "[b12x] current: dsl=${DSL_VER:-none} libs-base=${LIBS_BASE_VER:-none} libs-cu13=${LIBS_CU13_VER:-none}"
|
||||
uv pip install \
|
||||
nvidia-cutlass-dsl==4.4.2 \
|
||||
nvidia-cutlass-dsl-libs-base==4.4.2 \
|
||||
nvidia-cutlass-dsl-libs-cu13==4.4.2 \
|
||||
-q 2>/dev/null || echo "[b12x] WARNING: cutlass-dsl pin install returned non-zero"
|
||||
else
|
||||
echo "[b12x] nvidia-cutlass-dsl + libs already at 4.4.2"
|
||||
fi
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# 2. Apply cutlass-dsl SM121 patches
|
||||
# FlashInfer/vLLM install wipes vendored cutlass, so re-apply every time
|
||||
# ---------------------------------------------------------------
|
||||
echo "[b12x] Applying cutlass-dsl SM121 patches..."
|
||||
|
||||
# 2a. warp/mma.py: allow sm_121a alongside sm_120a in both the runtime
|
||||
# arch check and the `admissible_archs` string list (used in error msgs)
|
||||
for f in $(find "$SITE_PACKAGES" -name "mma.py" -path "*/warp/*" 2>/dev/null); do
|
||||
if grep -q "if not arch == Arch.sm_120a:" "$f" 2>/dev/null; then
|
||||
sed -i "s/if not arch == Arch.sm_120a:/if arch not in (Arch.sm_120a, Arch.sm_121a):/" "$f"
|
||||
echo " patched $f (warp sm_121a runtime check)"
|
||||
fi
|
||||
# Add sm_121a to the admissible_archs list if missing
|
||||
if grep -q '"sm_120a",' "$f" 2>/dev/null && ! grep -q '"sm_121a"' "$f" 2>/dev/null; then
|
||||
sed -i 's/^\(\s*\)"sm_120a",$/\1"sm_120a",\n\1"sm_121a",/' "$f"
|
||||
echo " patched $f (warp sm_121a admissible_archs)"
|
||||
fi
|
||||
done
|
||||
|
||||
# 2b. tcgen05/mma.py: add sm_120a and sm_121a to supported arch list
|
||||
for f in $(find "$SITE_PACKAGES" -name "mma.py" -path "*/tcgen05/*" 2>/dev/null); do
|
||||
if ! grep -q "Arch.sm_121a" "$f" 2>/dev/null; then
|
||||
sed -i "/Arch.sm_103a,/a\\ Arch.sm_120a,\n Arch.sm_121a," "$f"
|
||||
echo " patched $f (tcgen05 mma sm_121a)"
|
||||
fi
|
||||
done
|
||||
|
||||
# 2c. tcgen05/copy.py: allow sm_120f family
|
||||
for f in $(find "$SITE_PACKAGES" -name "copy.py" -path "*/tcgen05/*" 2>/dev/null); do
|
||||
if ! grep -q "sm_120f" "$f" 2>/dev/null; then
|
||||
sed -i "s/arch.is_family_of(Arch.sm_110f)/arch.is_family_of(Arch.sm_110f) or arch.is_family_of(Arch.sm_120f)/" "$f"
|
||||
echo " patched $f (tcgen05 copy sm_120f)"
|
||||
fi
|
||||
done
|
||||
|
||||
# Clear pycache so patched code takes effect
|
||||
find "$SITE_PACKAGES" -name "__pycache__" -path "*/cutlass*" -exec rm -rf {} + 2>/dev/null || true
|
||||
find "$SITE_PACKAGES" -name "__pycache__" -path "*/flashinfer*" -exec rm -rf {} + 2>/dev/null || true
|
||||
|
||||
# ---------------------------------------------------------------
|
||||
# 3 Patch FlashInfer's blackwell_sm12x __init__.py to drop the
|
||||
# broken `sm120_moe_dispatch_context` import (FlashInfer main
|
||||
# has a stale __init__ that references a function that no
|
||||
# longer exists in moe_dispatch.py — but the symbol isn't
|
||||
# actually used by anything, so we just remove it from the
|
||||
# import + __all__ list).
|
||||
# ---------------------------------------------------------------
|
||||
SM12X_INIT="$SITE_PACKAGES/flashinfer/fused_moe/cute_dsl/blackwell_sm12x/__init__.py"
|
||||
if [ -f "$SM12X_INIT" ]; then
|
||||
if grep -q "sm120_moe_dispatch_context" "$SM12X_INIT"; then
|
||||
# Drop the line that imports/exports the missing symbol
|
||||
sed -i '/sm120_moe_dispatch_context/d' "$SM12X_INIT"
|
||||
find "$SITE_PACKAGES/flashinfer" -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
|
||||
echo "[b12x] patched $SM12X_INIT (dropped stale sm120_moe_dispatch_context references)"
|
||||
else
|
||||
echo "[b12x] $SM12X_INIT already cleaned"
|
||||
fi
|
||||
else
|
||||
echo "[b12x] $SM12X_INIT not found (older FlashInfer?), skipping"
|
||||
fi
|
||||
|
||||
if grep -q "if current_platform.has_device_capability(120) and has_flashinfer_b12x_gemm():" $SITE_PACKAGES/vllm/model_executor/kernels/linear/nvfp4/flashinfer.py; then
|
||||
echo "[b12x] Patching vLLM PR 40080 to enable sm121 cap"
|
||||
sed -i "s/if current_platform.has_device_capability(120) and has_flashinfer_b12x_gemm():/if True:/" $SITE_PACKAGES/vllm/model_executor/kernels/linear/nvfp4/flashinfer.py
|
||||
fi
|
||||
|
||||
|
||||
10
mods/fix-gemma4-tool-parser/run.sh
Normal file
10
mods/fix-gemma4-tool-parser/run.sh
Normal file
@@ -0,0 +1,10 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
cd /usr/local/lib/python3.12/dist-packages
|
||||
echo "Applying PR #38909"
|
||||
if curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38909.diff | git apply --exclude="tests/*"; then
|
||||
echo "- PR #38909 applied successfully"
|
||||
else
|
||||
echo "- PR #38909 can't be applied, skipping"
|
||||
fi
|
||||
@@ -1,3 +1,3 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
patch -p1 -d /usr/local/lib/python3.12/dist-packages < transformers.patch
|
||||
patch -p1 -d /usr/local/lib/python3.12/dist-packages < transformers.patch || echo "Patch is not applicable, skipping..."
|
||||
155
mods/fix-qwen3.5-chat-template/chat_template.jinja
Normal file
155
mods/fix-qwen3.5-chat-template/chat_template.jinja
Normal file
@@ -0,0 +1,155 @@
|
||||
{%- set image_count = namespace(value=0) %}
|
||||
{%- set video_count = namespace(value=0) %}
|
||||
{%- macro render_content(content, do_vision_count, is_system_content=false) %}
|
||||
{%- if content is string %}
|
||||
{{- content }}
|
||||
{%- elif content is iterable and content is not mapping %}
|
||||
{%- for item in content %}
|
||||
{%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
|
||||
{%- if is_system_content %}
|
||||
{{- raise_exception('System message cannot contain images.') }}
|
||||
{%- endif %}
|
||||
{%- if do_vision_count %}
|
||||
{%- set image_count.value = image_count.value + 1 %}
|
||||
{%- endif %}
|
||||
{%- if add_vision_id %}
|
||||
{{- 'Picture ' ~ image_count.value ~ ': ' }}
|
||||
{%- endif %}
|
||||
{{- '<|vision_start|><|image_pad|><|vision_end|>' }}
|
||||
{%- elif 'video' in item or item.type == 'video' %}
|
||||
{%- if is_system_content %}
|
||||
{{- raise_exception('System message cannot contain videos.') }}
|
||||
{%- endif %}
|
||||
{%- if do_vision_count %}
|
||||
{%- set video_count.value = video_count.value + 1 %}
|
||||
{%- endif %}
|
||||
{%- if add_vision_id %}
|
||||
{{- 'Video ' ~ video_count.value ~ ': ' }}
|
||||
{%- endif %}
|
||||
{{- '<|vision_start|><|video_pad|><|vision_end|>' }}
|
||||
{%- elif 'text' in item %}
|
||||
{{- item.text }}
|
||||
{%- else %}
|
||||
{{- raise_exception('Unexpected item type in content.') }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- elif content is none or content is undefined %}
|
||||
{{- '' }}
|
||||
{%- else %}
|
||||
{{- raise_exception('Unexpected content type.') }}
|
||||
{%- endif %}
|
||||
{%- endmacro %}
|
||||
{%- if not messages %}
|
||||
{{- raise_exception('No messages provided.') }}
|
||||
{%- endif %}
|
||||
{%- if tools and tools is iterable and tools is not mapping %}
|
||||
{{- '<|im_start|>system\n' }}
|
||||
{{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
|
||||
{%- for tool in tools %}
|
||||
{{- "\n" }}
|
||||
{{- tool | tojson }}
|
||||
{%- endfor %}
|
||||
{{- "\n</tools>" }}
|
||||
{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
|
||||
{%- if messages[0].role == 'system' %}
|
||||
{%- set content = render_content(messages[0].content, false, true)|trim %}
|
||||
{%- if content %}
|
||||
{{- '\n\n' + content }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- else %}
|
||||
{%- if messages[0].role == 'system' %}
|
||||
{%- set content = render_content(messages[0].content, false, true)|trim %}
|
||||
{{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
||||
{%- for message in messages[::-1] %}
|
||||
{%- set index = (messages|length - 1) - loop.index0 %}
|
||||
{%- if ns.multi_step_tool and message.role == "user" %}
|
||||
{%- set content = render_content(message.content, false)|trim %}
|
||||
{%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
|
||||
{%- set ns.multi_step_tool = false %}
|
||||
{%- set ns.last_query_index = index %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if ns.multi_step_tool %}
|
||||
{{- raise_exception('No user query found in messages.') }}
|
||||
{%- endif %}
|
||||
{%- for message in messages %}
|
||||
{%- set content = render_content(message.content, true)|trim %}
|
||||
{%- if message.role == "system" %}
|
||||
{%- if not loop.first %}
|
||||
{{- raise_exception('System message must be at the beginning.') }}
|
||||
{%- endif %}
|
||||
{%- elif message.role == "user" %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
|
||||
{%- elif message.role == "assistant" %}
|
||||
{%- set reasoning_content = '' %}
|
||||
{%- if message.reasoning_content is string %}
|
||||
{%- set reasoning_content = message.reasoning_content %}
|
||||
{%- else %}
|
||||
{%- if '</think>' in content %}
|
||||
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
||||
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- set reasoning_content = reasoning_content|trim %}
|
||||
{%- if loop.index0 > ns.last_query_index %}
|
||||
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
|
||||
{%- else %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||
{%- endif %}
|
||||
{%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
|
||||
{%- for tool_call in message.tool_calls %}
|
||||
{%- if tool_call.function is defined %}
|
||||
{%- set tool_call = tool_call.function %}
|
||||
{%- endif %}
|
||||
{%- if loop.first %}
|
||||
{%- if content|trim %}
|
||||
{{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- else %}
|
||||
{{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- endif %}
|
||||
{%- else %}
|
||||
{{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- endif %}
|
||||
{%- if tool_call.arguments is mapping %}
|
||||
{%- for args_name in tool_call.arguments %}
|
||||
{%- set args_value = tool_call.arguments[args_name] %}
|
||||
{{- '<parameter=' + args_name + '>\n' }}
|
||||
{%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
|
||||
{{- args_value }}
|
||||
{{- '\n</parameter>\n' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- '</function>\n</tool_call>' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- elif message.role == "tool" %}
|
||||
{%- if loop.previtem and loop.previtem.role != "tool" %}
|
||||
{{- '<|im_start|>user' }}
|
||||
{%- endif %}
|
||||
{{- '\n<tool_response>\n' }}
|
||||
{{- content }}
|
||||
{{- '\n</tool_response>' }}
|
||||
{%- if not loop.last and loop.nextitem.role != "tool" %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- elif loop.last %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- else %}
|
||||
{{- raise_exception('Unexpected message role.') }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|im_start|>assistant\n' }}
|
||||
{%- if enable_thinking is defined and enable_thinking is false %}
|
||||
{{- '<think>\n\n</think>\n\n' }}
|
||||
{%- else %}
|
||||
{{- '<think>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
4
mods/fix-qwen3.5-chat-template/run.sh
Normal file
4
mods/fix-qwen3.5-chat-template/run.sh
Normal file
@@ -0,0 +1,4 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
cp chat_template.jinja $WORKSPACE_DIR/unsloth.jinja
|
||||
echo "=======> to apply chat template, use --chat-template unsloth.jinja"
|
||||
223
mods/fix-qwen3.6-chat-template/chat_template.jinja
Normal file
223
mods/fix-qwen3.6-chat-template/chat_template.jinja
Normal file
@@ -0,0 +1,223 @@
|
||||
{%- set image_count = namespace(value=0) %}
|
||||
{%- set video_count = namespace(value=0) %}
|
||||
{%- macro render_content(content, do_vision_count, is_system_content=false) %}
|
||||
{%- if content is string %}
|
||||
{{- content }}
|
||||
{%- elif content is iterable and content is not mapping %}
|
||||
{%- for item in content %}
|
||||
{%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
|
||||
{%- if is_system_content %}
|
||||
{{- raise_exception('System message cannot contain images.') }}
|
||||
{%- endif %}
|
||||
{%- if do_vision_count %}
|
||||
{%- set image_count.value = image_count.value + 1 %}
|
||||
{%- endif %}
|
||||
{%- if add_vision_id is defined and add_vision_id %}
|
||||
{{- 'Picture ' ~ image_count.value ~ ': ' }}
|
||||
{%- endif %}
|
||||
{{- '<|vision_start|><|image_pad|><|vision_end|>' }}
|
||||
{%- elif 'video' in item or item.type == 'video' %}
|
||||
{%- if is_system_content %}
|
||||
{{- raise_exception('System message cannot contain videos.') }}
|
||||
{%- endif %}
|
||||
{%- if do_vision_count %}
|
||||
{%- set video_count.value = video_count.value + 1 %}
|
||||
{%- endif %}
|
||||
{%- if add_vision_id is defined and add_vision_id %}
|
||||
{{- 'Video ' ~ video_count.value ~ ': ' }}
|
||||
{%- endif %}
|
||||
{{- '<|vision_start|><|video_pad|><|vision_end|>' }}
|
||||
{%- elif 'text' in item %}
|
||||
{{- item.text }}
|
||||
{%- else %}
|
||||
{{- raise_exception('Unexpected item type in content.') }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- elif content is none or content is undefined %}
|
||||
{{- '' }}
|
||||
{%- else %}
|
||||
{{- raise_exception('Unexpected content type.') }}
|
||||
{%- endif %}
|
||||
{%- endmacro %}
|
||||
{%- set ns_flags = namespace(enable_thinking=true) %}
|
||||
{%- if enable_thinking is defined %}
|
||||
{%- set ns_flags.enable_thinking = enable_thinking %}
|
||||
{%- endif %}
|
||||
{%- if not messages %}
|
||||
{{- raise_exception('No messages provided.') }}
|
||||
{%- endif %}
|
||||
{%- if tools and tools is iterable and tools is not mapping %}
|
||||
{{- '<|im_start|>system\n' }}
|
||||
{{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
|
||||
{%- for tool in tools %}
|
||||
{{- "\n" }}
|
||||
{{- tool | tojson }}
|
||||
{%- endfor %}
|
||||
{{- "\n</tools>" }}
|
||||
{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
|
||||
{%- if messages[0].role == 'system' or messages[0].role == 'developer' %}
|
||||
{%- set content = render_content(messages[0].content, false, true)|trim %}
|
||||
{%- if '<|think_off|>' in content %}
|
||||
{%- set ns_flags.enable_thinking = false %}
|
||||
{%- set content = content.replace('<|think_off|>', '') %}
|
||||
{%- endif %}
|
||||
{%- if '<|think_on|>' in content %}
|
||||
{%- set ns_flags.enable_thinking = true %}
|
||||
{%- set content = content.replace('<|think_on|>', '') %}
|
||||
{%- endif %}
|
||||
{%- set content = content.strip() %}
|
||||
{%- if content %}
|
||||
{{- '\n\n' + content }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- else %}
|
||||
{%- if messages[0].role == 'system' or messages[0].role == 'developer' %}
|
||||
{%- set content = render_content(messages[0].content, false, true)|trim %}
|
||||
{%- if '<|think_off|>' in content %}
|
||||
{%- set ns_flags.enable_thinking = false %}
|
||||
{%- set content = content.replace('<|think_off|>', '') %}
|
||||
{%- endif %}
|
||||
{%- if '<|think_on|>' in content %}
|
||||
{%- set ns_flags.enable_thinking = true %}
|
||||
{%- set content = content.replace('<|think_on|>', '') %}
|
||||
{%- endif %}
|
||||
{%- set content = content.strip() %}
|
||||
{{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
||||
{%- for message in messages[::-1] %}
|
||||
{%- set index = (messages|length - 1) - loop.index0 %}
|
||||
{%- if ns.multi_step_tool and message.role == "user" %}
|
||||
{%- set content = render_content(message.content, false)|trim %}
|
||||
{%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
|
||||
{%- set ns.multi_step_tool = false %}
|
||||
{%- set ns.last_query_index = index %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if ns.multi_step_tool %}
|
||||
{%- set ns.last_query_index = messages|length - 1 %}
|
||||
{%- endif %}
|
||||
{%- for message in messages %}
|
||||
{%- set content = render_content(message.content, true)|trim %}
|
||||
{%- if '<|think_off|>' in content %}
|
||||
{%- set ns_flags.enable_thinking = false %}
|
||||
{%- set content = content.replace('<|think_off|>', '') %}
|
||||
{%- endif %}
|
||||
{%- if '<|think_on|>' in content %}
|
||||
{%- set ns_flags.enable_thinking = true %}
|
||||
{%- set content = content.replace('<|think_on|>', '') %}
|
||||
{%- endif %}
|
||||
{%- set content = content.strip() %}
|
||||
{%- if message.role == "system" or message.role == "developer" %}
|
||||
{%- if not loop.first %}
|
||||
{{- raise_exception('System message must be at the beginning.') }}
|
||||
{%- endif %}
|
||||
{%- elif message.role == "user" %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
|
||||
{%- elif message.role == "assistant" %}
|
||||
{%- set reasoning_content = '' %}
|
||||
{#- Auto-close unclosed think before tool_call -#}
|
||||
{%- if '<think>' in content and '<tool_call>' in content %}
|
||||
{%- set last_think = content.rfind('<think>') %}
|
||||
{%- set last_close = content.rfind('</think>') %}
|
||||
{%- set tool_pos = content.find('<tool_call>') %}
|
||||
{%- if last_close < last_think or last_close == -1 %}
|
||||
{%- if tool_pos > last_think %}
|
||||
{%- set content = content[:tool_pos] + '</think>' + content[tool_pos:] %}
|
||||
{%- else %}
|
||||
{%- set content = content + '</think>' %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- if message.reasoning_content is string %}
|
||||
{%- set reasoning_content = message.reasoning_content %}
|
||||
{%- else %}
|
||||
{%- set has_think_tag = false %}
|
||||
{%- set think_end_token = '</think>' %}
|
||||
{%- if '</think>' in content %}
|
||||
{%- set has_think_tag = true %}
|
||||
{%- elif '</thinking>' in content %}
|
||||
{%- set has_think_tag = true %}
|
||||
{%- set think_end_token = '</thinking>' %}
|
||||
{%- elif '<think>' in content %}
|
||||
{%- set reasoning_content = content.split('<think>')[-1].lstrip('\n') %}
|
||||
{%- set content = '' %}
|
||||
{%- endif %}
|
||||
{%- if has_think_tag %}
|
||||
{%- set reasoning_content = content.split(think_end_token)[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
||||
{%- set content = content.split(think_end_token)[-1].lstrip('\n') %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- set reasoning_content = reasoning_content|trim %}
|
||||
{%- set show_think = false %}
|
||||
{%- if loop.index0 > ns.last_query_index %}
|
||||
{%- set show_think = true %}
|
||||
{%- elif ns_flags.enable_thinking and (preserve_thinking is undefined or preserve_thinking is true) and reasoning_content|length > 0 %}
|
||||
{%- set show_think = true %}
|
||||
{%- endif %}
|
||||
{%- if show_think %}
|
||||
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
|
||||
{%- else %}
|
||||
{{- '<|im_start|>' + message.role + '\n' + content }}
|
||||
{%- endif %}
|
||||
{%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
|
||||
{%- for tool_call in message.tool_calls %}
|
||||
{%- if tool_call.function is defined %}
|
||||
{%- set tool_call = tool_call.function %}
|
||||
{%- endif %}
|
||||
{%- if loop.first %}
|
||||
{%- if content|trim %}
|
||||
{{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- else %}
|
||||
{{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- endif %}
|
||||
{%- else %}
|
||||
{{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
|
||||
{%- endif %}
|
||||
{%- if tool_call.arguments is defined and tool_call.arguments is mapping %}
|
||||
{%- if tool_call.arguments|length > 0 %}
|
||||
{%- for args_name in tool_call.arguments %}
|
||||
{%- set args_value = tool_call.arguments[args_name] %}
|
||||
{{- '<parameter=' + args_name + '>\n' }}
|
||||
{%- set args_value = args_value | string if args_value is string else args_value | tojson %}
|
||||
{{- args_value }}
|
||||
{{- '\n</parameter>\n' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{%- elif tool_call.arguments is defined and tool_call.arguments is string %}
|
||||
{%- if tool_call.arguments|trim|length > 0 %}
|
||||
{{- tool_call.arguments }}
|
||||
{{- '\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{{- '</function>\n</tool_call>' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- elif message.role == "tool" %}
|
||||
{%- if loop.previtem and loop.previtem.role != "tool" %}
|
||||
{{- '<|im_start|>user' }}
|
||||
{%- endif %}
|
||||
{{- '\n<tool_response>\n' }}
|
||||
{{- content }}
|
||||
{{- '\n</tool_response>' }}
|
||||
{%- if not loop.last and loop.nextitem.role != "tool" %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- elif loop.last %}
|
||||
{{- '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- else %}
|
||||
{{- raise_exception('Unexpected message role.') }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|im_start|>assistant\n' }}
|
||||
{%- if ns_flags.enable_thinking is false %}
|
||||
{{- '<think>\n\n</think>\n\n' }}
|
||||
{%- else %}
|
||||
{{- '<think>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
4
mods/fix-qwen3.6-chat-template/run.sh
Normal file
4
mods/fix-qwen3.6-chat-template/run.sh
Normal file
@@ -0,0 +1,4 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
cp chat_template.jinja $WORKSPACE_DIR/fixed_chat_template.jinja
|
||||
echo "=======> to apply chat template, use --chat-template fixed_chat_template.jinja"
|
||||
23
mods/fix-qwen35-tp4-marlin/fix_rope.py
Normal file
23
mods/fix-qwen35-tp4-marlin/fix_rope.py
Normal file
@@ -0,0 +1,23 @@
# Fix: ignore_keys_at_rope_validation is a list but transformers uses | (set union)
import re

path = "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/configs/qwen3_5_moe.py"
with open(path) as f:
    content = f.read()

old = """kwargs["ignore_keys_at_rope_validation"] = [
"mrope_section",
"mrope_interleaved",
]"""

new = """kwargs["ignore_keys_at_rope_validation"] = {
"mrope_section",
"mrope_interleaved",
}"""

content = content.replace(old, new)

with open(path, "w") as f:
    f.write(content)

print("Fixed ignore_keys_at_rope_validation: list -> set")
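The incompatibility this script works around is plain Python semantics: the `|` union operator is defined between two sets, not between a set and a list, which is why the swap from a `[...]` literal to a `{...}` literal matters. A quick standalone check (the base key name here is an illustrative stand-in, not taken from transformers):

```
# "|" works set-to-set but raises TypeError set-to-list, which is why
# the mod rewrites the list literal into a set literal.
base = {"rope_theta"}  # illustrative stand-in for the library's own keys
try:
    base | ["mrope_section", "mrope_interleaved"]
except TypeError as exc:
    print("list operand fails:", exc)
print("set operand works:", base | {"mrope_section", "mrope_interleaved"})
```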
46
mods/fix-qwen35-tp4-marlin/qwen3_5.patch
Normal file
@@ -0,0 +1,46 @@
--- qwen3_5.py.orig	2026-03-03 00:00:00.000000000 +0000
+++ qwen3_5.py	2026-03-03 00:00:00.000000000 +0000
@@ -166,11 +166,13 @@
z_size = self.value_dim // self.tp_size
mixed_qkv, z = mixed_qkvz.split([qkv_size, z_size], dim=-1)
z = z.reshape(z.size(0), -1, self.head_v_dim)
- ba, _ = self.in_proj_ba(hidden_states)
- b, a = ba.chunk(2, dim=-1)
-
- b = b.contiguous()
- a = a.contiguous()
+ # Replicated B/A projections — full output, sliced to local TP partition
+ b_full, _ = self.in_proj_b(hidden_states)
+ a_full, _ = self.in_proj_a(hidden_states)
+ _ba_chunk = self.num_v_heads // self.tp_size
+ _ba_start = self.tp_rank * _ba_chunk
+ b = b_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
+ a = a_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()

# ============================================================
# Part 2: Core Attention (Custom Op)
@@ -374,8 +376,6 @@
# GDN
("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)),
("in_proj_qkvz", "in_proj_z", 3),
- ("in_proj_ba", "in_proj_b", 0),
- ("in_proj_ba", "in_proj_a", 1),
]

params_dict = dict(self.named_parameters())
@@ -530,7 +530,6 @@
"gate_up_proj": ["gate_proj", "up_proj"],
# GDN fused projections.
"in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
- "in_proj_ba": ["in_proj_b", "in_proj_a"],
}

def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -630,7 +629,6 @@
class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid):
packed_modules_mapping = Qwen3VLForConditionalGeneration.packed_modules_mapping | {
"in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
- "in_proj_ba": ["in_proj_b", "in_proj_a"],
}

def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
56
mods/fix-qwen35-tp4-marlin/qwen3_next.patch
Normal file
@@ -0,0 +1,56 @@
--- qwen3_next.py.orig	2026-03-03 00:00:00.000000000 +0000
+++ qwen3_next.py	2026-03-03 00:00:00.000000000 +0000
@@ -411,15 +411,22 @@
quant_config=quant_config,
prefix=f"{prefix}.in_proj_qkvz",
)
- # ba_proj doesn't support blockwise fp8 quantization.
- # # in_proj_ba is defined as MergedColumnParallelLinear for
- # compatibility with Qwen3_5.
- self.in_proj_ba = MergedColumnParallelLinear(
+ # ba_proj: Use ReplicatedLinear to avoid Marlin TP split constraint
+ # (num_v_heads=64 is too small for TP=4 Marlin min_thread_n=64).
+ # Each rank loads full weights and slices in forward.
+ self.in_proj_b = ReplicatedLinear(
input_size=self.hidden_size,
- output_sizes=[self.num_v_heads] * 2,
+ output_size=self.num_v_heads,
bias=False,
quant_config=quant_config,
- prefix=f"{prefix}.in_proj_ba",
+ prefix=f"{prefix}.in_proj_b",
+ )
+ self.in_proj_a = ReplicatedLinear(
+ input_size=self.hidden_size,
+ output_size=self.num_v_heads,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.in_proj_a",
)

query_key_settings = (self.key_dim, 0, False)
@@ -584,7 +591,15 @@
# Part 1: Input Projection
# ============================================================
projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
- projected_states_ba, _ = self.in_proj_ba(hidden_states)
+ # Replicated B/A projections — full output, sliced to local TP partition
+ b_full, _ = self.in_proj_b(hidden_states)
+ a_full, _ = self.in_proj_a(hidden_states)
+ _ba_chunk = self.num_v_heads // self.tp_size
+ _ba_start = self.tp_rank * _ba_chunk
+ projected_states_ba = torch.cat([
+ b_full[:, _ba_start:_ba_start+_ba_chunk],
+ a_full[:, _ba_start:_ba_start+_ba_chunk],
+ ], dim=-1)
query, key, value, z, b, a = self.fix_query_key_value_ordering(
projected_states_qkvz, projected_states_ba
)
@@ -1326,7 +1341,6 @@
],
"gate_up_proj": ["gate_proj", "up_proj"],
"in_proj_qkvz": ["in_proj_qkvz"],
- "in_proj_ba": ["in_proj_ba"],
}

def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
23
mods/fix-qwen35-tp4-marlin/run.sh
Executable file
@@ -0,0 +1,23 @@
#!/bin/bash
# Fix Marlin TP=4 constraint for Qwen3.5-397B: in_proj_ba output_size=128 / TP=4 = 32 < min_thread_n=64
# Solution: Replace MergedColumnParallelLinear with two ReplicatedLinear for B/A projections
# Delivery: unified diff patches (portable across vLLM versions)

set -e
MOD_DIR="$(dirname "$0")"
MODELS_DIR="/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models"

echo "[fix-qwen35-tp4-marlin] Applying patches..."

# Apply patches with --forward (skip if already applied)
patch --forward --batch -p0 -d "$MODELS_DIR" < "$MOD_DIR/qwen3_next.patch" || {
    echo "[fix-qwen35-tp4-marlin] qwen3_next.patch already applied or failed"
}
patch --forward --batch -p0 -d "$MODELS_DIR" < "$MOD_DIR/qwen3_5.patch" || {
    echo "[fix-qwen35-tp4-marlin] qwen3_5.patch already applied or failed"
}

# Fix rope validation (idempotent)
python3 "$MOD_DIR/fix_rope.py"

echo "[fix-qwen35-tp4-marlin] Done."
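Both patches rely on the same replicate-then-slice pattern: every rank computes the full B/A projections and keeps only its TP partition. A standalone sketch of just that slicing, with made-up sizes and plain torch Linear layers standing in for vLLM's ReplicatedLinear:

```
import torch

# Illustrative sizes only; the real model uses num_v_heads=64 and tp_size=4.
tokens, hidden_size, num_v_heads, tp_size, tp_rank = 3, 16, 8, 4, 1

# ReplicatedLinear stand-ins: each rank holds the full weight matrix.
in_proj_b = torch.nn.Linear(hidden_size, num_v_heads, bias=False)
in_proj_a = torch.nn.Linear(hidden_size, num_v_heads, bias=False)

hidden_states = torch.randn(tokens, hidden_size)
b_full = in_proj_b(hidden_states)  # full output on every rank
a_full = in_proj_a(hidden_states)

# Slice out this rank's partition, exactly as the patched forward does.
chunk = num_v_heads // tp_size
start = tp_rank * chunk
b = b_full[:, start:start + chunk].contiguous()
a = a_full[:, start:start + chunk].contiguous()
print(b.shape, a.shape)  # torch.Size([3, 2]) torch.Size([3, 2])
```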
255
mods/gpu-mem-util-gb/gpu_mem.patch
Normal file
@@ -0,0 +1,255 @@
diff --git a/vllm/config/cache.py b/vllm/config/cache.py
index 3796265ff..b6dcfb54c 100644
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -45,6 +45,11 @@ class CacheConfig:
not matter if you have another vLLM instance running on the same GPU. For
example, if you have two vLLM instances running on the same GPU, you can
set the GPU memory utilization to 0.5 for each instance."""
+ gpu_memory_utilization_gb: float | None = Field(default=None, gt=0)
+ """Amount of GPU memory to be used in GiB. This provides fine-grained control
+ over GPU memory usage and is particularly useful on unified memory systems
+ where available memory changes dynamically. If specified, it overrides
+ gpu_memory_utilization. Cannot be used simultaneously with kv_cache_memory_bytes."""
cache_dtype: CacheDType = "auto"
"""Data type for kv cache storage. If "auto", will use model data type.
CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
@@ -204,6 +209,18 @@ class CacheConfig:
object.__setattr__(self, "user_specified_block_size", True)
return self

+ @model_validator(mode="after")
+ def _validate_memory_params(self) -> "CacheConfig":
+     if (
+         self.gpu_memory_utilization_gb is not None
+         and self.kv_cache_memory_bytes is not None
+     ):
+         raise ValueError(
+             "Cannot specify both gpu_memory_utilization_gb and "
+             "kv_cache_memory_bytes. Please use only one of them."
+         )
+     return self
+
@field_validator("cache_dtype", mode="after")
@classmethod
def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 56bbb7bf5..db5012608 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -454,6 +454,7 @@ class EngineArgs:
offload_prefetch_step: int = PrefetchOffloadConfig.offload_prefetch_step
offload_params: set[str] = get_field(PrefetchOffloadConfig, "offload_params")
gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
+ gpu_memory_utilization_gb: float | None = CacheConfig.gpu_memory_utilization_gb
kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes
max_num_batched_tokens: int | None = None
max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
@@ -954,6 +955,9 @@ class EngineArgs:
cache_group.add_argument(
"--gpu-memory-utilization", **cache_kwargs["gpu_memory_utilization"]
)
+ cache_group.add_argument(
+     "--gpu-memory-utilization-gb", **cache_kwargs["gpu_memory_utilization_gb"]
+ )
cache_group.add_argument(
"--kv-cache-memory-bytes", **cache_kwargs["kv_cache_memory_bytes"]
)
@@ -1512,6 +1516,7 @@ class EngineArgs:
cache_config = CacheConfig(
block_size=self.block_size, # type: ignore[arg-type]
gpu_memory_utilization=self.gpu_memory_utilization,
+ gpu_memory_utilization_gb=self.gpu_memory_utilization_gb,
kv_cache_memory_bytes=self.kv_cache_memory_bytes,
cache_dtype=resolved_cache_dtype, # type: ignore[arg-type]
is_attention_free=model_config.is_attention_free,
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 5909b3043..c2607df6a 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -156,6 +156,11 @@ class LLM:
values will increase the KV cache size and thus improve the model's
throughput. However, if the value is too high, it may cause out-of-
memory (OOM) errors.
+ gpu_memory_utilization_gb: Amount of GPU memory to reserve in GiB.
+     This provides fine-grained control over GPU memory usage and is
+     particularly useful on unified memory systems where available memory
+     changes dynamically. If specified, it overrides gpu_memory_utilization.
+     Cannot be used simultaneously with kv_cache_memory_bytes.
kv_cache_memory_bytes: Size of KV Cache per GPU in bytes. By default,
this is set to None and vllm can automatically infer the kv cache
size based on gpu_memory_utilization. However, users may want to
@@ -234,6 +239,7 @@ class LLM:
chat_template: Path | str | None = None,
seed: int = 0,
gpu_memory_utilization: float = 0.92,
+ gpu_memory_utilization_gb: float | None = None,
cpu_offload_gb: float = 0,
offload_group_size: int = 0,
offload_num_in_group: int = 1,
@@ -356,6 +362,7 @@ class LLM:
tokenizer_revision=tokenizer_revision,
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
+ gpu_memory_utilization_gb=gpu_memory_utilization_gb,
kv_cache_memory_bytes=kv_cache_memory_bytes,
cpu_offload_gb=cpu_offload_gb,
offload_group_size=offload_group_size,
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 2ed7ef7e0..806830b17 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -622,7 +622,8 @@ def _check_enough_kv_cache_memory(
if available_memory <= 0:
raise ValueError(
"No available memory for the cache blocks. "
- "Try increasing `gpu_memory_utilization` when initializing the engine. "
+ "Try increasing `gpu_memory_utilization` or `gpu_memory_utilization_gb` "
+ "when initializing the engine. "
"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
"for more details."
)
@@ -643,8 +644,8 @@ def _check_enough_kv_cache_memory(
f"({max_model_len}), ({format_gib(needed_memory)} GiB KV "
f"cache is needed, which is larger than the available KV cache "
f"memory ({format_gib(available_memory)} GiB). {estimated_msg}"
- f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
- f"when initializing the engine. "
+ f"Try increasing `gpu_memory_utilization` or `gpu_memory_utilization_gb`, "
+ f"or decreasing `max_model_len` when initializing the engine. "
f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
f"for more details."
)
@@ -1438,7 +1439,8 @@ def _auto_fit_max_model_len(
if auto_fit_max <= 0:
raise ValueError(
"Cannot auto-fit max_model_len: not enough GPU memory available "
- "to serve even a single token. Try increasing `gpu_memory_utilization`."
+ "to serve even a single token. Try increasing `gpu_memory_utilization` "
+ "or `gpu_memory_utilization_gb`."
)

if auto_fit_max >= original_max:
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 3d065927e..e8cef2ceb 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -358,6 +358,7 @@ def report_usage_stats(
"dtype": str(vllm_config.model_config.dtype),
"block_size": vllm_config.cache_config.block_size,
"gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization,
+ "gpu_memory_utilization_gb": vllm_config.cache_config.gpu_memory_utilization_gb,
"kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes,
# Quantization
"quantization": vllm_config.model_config.quantization,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b53bd71a1..d28821328 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -5355,8 +5355,8 @@ class GPUModelRunner(
raise RuntimeError(
"CUDA out of memory occurred when warming up sampler with "
f"{num_reqs} dummy requests. Please try lowering "
- "`max_num_seqs` or `gpu_memory_utilization` when "
- "initializing the engine."
+ "`max_num_seqs`, `gpu_memory_utilization`, or "
+ "`gpu_memory_utilization_gb` when initializing the engine."
) from e
else:
raise e
@@ -5434,8 +5434,8 @@ class GPUModelRunner(
raise RuntimeError(
"CUDA out of memory occurred when warming up pooler "
f"({task=}) with {num_reqs} dummy requests. Please try "
- "lowering `max_num_seqs` or `gpu_memory_utilization` when "
- "initializing the engine."
+ "lowering `max_num_seqs`, `gpu_memory_utilization`, or "
+ "`gpu_memory_utilization_gb` when initializing the engine."
) from e
else:
raise e
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 842e76549..bf3bb359b 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -357,7 +357,8 @@ class Worker(WorkerBase):

Tip:
You may limit the usage of GPU memory
- by adjusting the `gpu_memory_utilization` parameter.
+ by adjusting the `gpu_memory_utilization` or
+ `gpu_memory_utilization_gb` parameter.
"""
if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes:
# still need a profile run which compiles the model for
@@ -369,7 +370,8 @@ class Worker(WorkerBase):
f"GiB, reserved {format_gib(kv_cache_memory_bytes)} GiB memory for "
"KV Cache as specified by kv_cache_memory_bytes config and "
"skipped memory profiling. This does not respect the "
- "gpu_memory_utilization config. Only use kv_cache_memory_bytes "
+ "gpu_memory_utilization or gpu_memory_utilization_gb config. "
+ "Only use kv_cache_memory_bytes "
"config when you want manual control of KV cache memory "
"size. If OOM'ed, check the difference of initial free "
"memory between the current run and the previous run "
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index d06c40ed6..89c94e641 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -405,21 +405,43 @@ def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) ->
Calculate the amount of memory required by vLLM, then validate
that the current amount of free memory is sufficient for that.
"""
- requested_memory = math.ceil(
-     init_snapshot.total_memory * cache_config.gpu_memory_utilization
- )
-
- if init_snapshot.free_memory < requested_memory:
-     raise ValueError(
-         f"Free memory on device {init_snapshot.device_} "
-         f"({format_gib(init_snapshot.free_memory)}/"
-         f"{format_gib(init_snapshot.total_memory)} GiB) on startup "
-         f"is less than desired GPU memory utilization "
-         f"({cache_config.gpu_memory_utilization}, "
-         f"{format_gib(requested_memory)} GiB). Decrease GPU memory "
-         f"utilization or reduce GPU memory used by other processes."
+ if cache_config.gpu_memory_utilization_gb is not None:
+     requested_memory = math.ceil(cache_config.gpu_memory_utilization_gb * 1024**3)
+     if requested_memory <= 0:
+         raise ValueError(
+             f"gpu_memory_utilization_gb must be positive, got "
+             f"{cache_config.gpu_memory_utilization_gb} GiB."
+         )
+     if requested_memory > init_snapshot.total_memory:
+         raise ValueError(
+             f"Requested memory ({format_gib(requested_memory)} GiB) exceeds "
+             f"total GPU memory ({format_gib(init_snapshot.total_memory)} GiB). "
+             f"Reduce gpu_memory_utilization_gb or use a smaller value."
+         )
+     safety_margin = 0.5 * 1024**3
+     if requested_memory > init_snapshot.free_memory + safety_margin:
+         raise ValueError(
+             f"Requested memory ({format_gib(requested_memory)} GiB) exceeds "
+             f"available memory ({format_gib(init_snapshot.free_memory)} GiB) "
+             f"with safety margin ({format_gib(safety_margin)} GiB). "
+             f"Reduce gpu_memory_utilization_gb or free up GPU memory."
+         )
+ else:
+     requested_memory = math.ceil(
+         init_snapshot.total_memory * cache_config.gpu_memory_utilization
)

+     if init_snapshot.free_memory < requested_memory:
+         raise ValueError(
+             f"Free memory on device {init_snapshot.device_} "
+             f"({format_gib(init_snapshot.free_memory)}/"
+             f"{format_gib(init_snapshot.total_memory)} GiB) on startup "
+             f"is less than desired GPU memory utilization "
+             f"({cache_config.gpu_memory_utilization}, "
+             f"{format_gib(requested_memory)} GiB). Decrease GPU memory "
+             f"utilization or reduce GPU memory used by other processes."
+         )
+
return requested_memory
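Stripped of vLLM plumbing, the new code path in `request_memory` reduces to a few lines of arithmetic. This simplified restatement (the function and argument names are condensed for illustration and are not the patched API) mirrors the checks in the hunk above:

```
import math

GIB = 1024**3

def requested_bytes(total, free, gb=None, utilization=0.9):
    # GiB path: exact byte count, capped by total and by free + 0.5 GiB margin.
    if gb is not None:
        requested = math.ceil(gb * GIB)
        if requested > total:
            raise ValueError("exceeds total GPU memory")
        if requested > free + 0.5 * GIB:
            raise ValueError("exceeds free memory plus safety margin")
        return requested
    # Fractional path: unchanged legacy behaviour.
    requested = math.ceil(total * utilization)
    if free < requested:
        raise ValueError("free memory below desired utilization")
    return requested

# Reserve exactly 96 GiB on a 128 GiB unified-memory device:
print(requested_bytes(total=128 * GIB, free=100 * GIB, gb=96) // GIB)  # 96
```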
6
mods/gpu-mem-util-gb/run.sh
Normal file
@@ -0,0 +1,6 @@
#!/bin/bash

set -e

patch -p1 -d /usr/local/lib/python3.12/dist-packages < gpu_mem.patch \
  && echo "=====> You can now use --gpu-memory-utilization-gb parameter to specify reserved memory in GiB"
4
mods/nemotron-super/run.sh
Normal file
@@ -0,0 +1,4 @@
#!/bin/bash
set -e
cd $WORKSPACE_DIR
wget https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4/resolve/main/super_v3_reasoning_parser.py
@@ -1,117 +0,0 @@
#!/bin/bash
set -e

# Define a function to export immediately AND save to .bashrc for future sessions
export_persist() {
    local var_name="$1"
    local var_value="$2"

    # 1. Export for the current running process
    export "$var_name"="$var_value"

    # 2. Append to .bashrc (idempotent check to avoid duplicate lines)
    if ! grep -q "export $var_name=" ~/.bashrc; then
        echo "export $var_name=\"$var_value\"" >> ~/.bashrc
    else
        # Optional: Update the existing line if it exists
        sed -i "s|export $var_name=.*|export $var_name=\"$var_value\"|" ~/.bashrc
    fi
}

# --- Help Function ---
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Required Arguments:"
    echo "  -r, --role <head|node> : Set the node type"
    echo "  -h, --host-ip <ip>     : IP address of this interface (Host IP)"
    echo "  -e, --eth-if <name>    : Ethernet interface name (e.g., eth0)"
    echo "  -i, --ib-if <name>     : InfiniBand/RDMA interface name"
    echo ""
    echo "Conditional Arguments:"
    echo "  -m, --head-ip <ip>     : IP of the head node (REQUIRED if role is 'node')"
    echo ""
    echo "Example:"
    echo "  $0 --role head --host-ip 192.168.1.10 --eth-if eth0 --ib-if ib0"
    echo "  $0 --role node --host-ip 192.168.1.20 --eth-if eth0 --ib-if ib0 --head-ip 192.168.1.10"
    exit 1
}

# --- Argument Parsing ---

# Initialize variables to empty
NODE_TYPE=""
HOST_IP=""
ETH_IF_NAME=""
IB_IF_NAME=""
HEAD_IP=""

while [[ "$#" -gt 0 ]]; do
    case $1 in
        -r|--role) NODE_TYPE="$2"; shift ;;
        -h|--host-ip) HOST_IP="$2"; shift ;;
        -e|--eth-if) ETH_IF_NAME="$2"; shift ;;
        -i|--ib-if) IB_IF_NAME="$2"; shift ;;
        -m|--head-ip) HEAD_IP="$2"; shift ;;
        *) echo "Unknown parameter passed: $1"; usage ;;
    esac
    shift
done

# --- Validation ---

# 1. Check if all common required arguments are present
if [[ -z "$NODE_TYPE" || -z "$HOST_IP" || -z "$ETH_IF_NAME" || -z "$IB_IF_NAME" ]]; then
    echo "Error: Missing required arguments."
    usage
fi

# 2. Validate Role
if [[ "$NODE_TYPE" != "head" && "$NODE_TYPE" != "node" ]]; then
    echo "Error: --role must be 'head' or 'node'."
    exit 1
fi

# 3. Conditional Check for Head IP
if [[ "$NODE_TYPE" == "node" && -z "$HEAD_IP" ]]; then
    echo "Error: When --role is 'node', you must provide --head-ip."
    exit 1
fi

# --- Environment Configuration ---

echo "Configuring environment for [$NODE_TYPE] at $HOST_IP..."

export_persist VLLM_HOST_IP "$HOST_IP"
export_persist RAY_NODE_IP_ADDRESS "$HOST_IP"
export_persist RAY_OVERRIDE_NODE_IP_ADDRESS "$HOST_IP"

# Network Interface
export_persist MN_IF_NAME "$ETH_IF_NAME"
export_persist UCX_NET_DEVICES "$ETH_IF_NAME"
export_persist NCCL_SOCKET_IFNAME "$ETH_IF_NAME"

# InfiniBand
export_persist NCCL_IB_HCA "$IB_IF_NAME"
export_persist NCCL_IB_DISABLE "0"

# Sockets/Transport
export_persist OMPI_MCA_btl_tcp_if_include "$ETH_IF_NAME"
export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist RAY_memory_monitor_refresh_ms "0"

# --- Execution ---

if [ "${NODE_TYPE}" == "head" ]; then
    echo "Starting Ray HEAD node..."
    exec ray start --block --head --port 6379 \
        --node-ip-address "$VLLM_HOST_IP" \
        --disable-usage-stats
else
    echo "Starting Ray WORKER node connecting to $HEAD_IP..."
    exec ray start --block \
        --address="$HEAD_IP:6379" \
        --node-ip-address "$VLLM_HOST_IP"
fi
@@ -1,6 +1,5 @@
#!/bin/bash
set -e

echo "Setting up cluster initialization script..."
cp run-cluster-node.sh $WORKSPACE_DIR/run-cluster-node.sh
chmod +x $WORKSPACE_DIR/run-cluster-node.sh
# NGC vLLM mod: container initialization is now handled by launch-cluster.sh
echo "NGC vLLM mod applied."
58
recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
Normal file
@@ -0,0 +1,58 @@
# Recipe: Qwen3.5-397B-A17B-INT4-Autoround
# Qwen3.5-397B model in Intel INT4-Autoround quantization
# Important: set memory utilization in GB, not percentage! Requires --no-ray to fit full context on three sparks.
# If you experience node shutdown, please limit GPU clocks on the affected node (or all of them): `sudo nvidia-smi -lgc 200,2150`

recipe_version: "1"
name: Qwen3.5-397B-INT4-Autoround (PP=3)
description: Recipe for Qwen3.5-397B-INT4-Autoround to run on 3-node mesh in pipeline-parallel mode

# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3.5-397B-A17B-int4-AutoRound

cluster_only: true

# Container image to use
container: vllm-node-tf5

build_args:
  - --tf5

mods:
  - mods/fix-qwen3.5-chat-template

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  pipeline_parallel: 3
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_batched_tokens: 16384

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1

# The vLLM serve command template
command: |
  vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
    --max-model-len {max_model_len} \
    --max-num-seqs 10 \
    --kv-cache-dtype fp8 \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --port {port} \
    --host {host} \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --trust-remote-code \
    --chat-template unsloth.jinja \
    -tp 1 \
    -pp {pipeline_parallel} \
    --load-format instanttensor \
    --distributed-executor-backend ray
45
recipes/4x-spark-cluster/minimax-m2.5.yaml
Normal file
@@ -0,0 +1,45 @@
# Recipe: MiniMax-M2.5
# MiniMaxAI/MiniMax-M2.5

recipe_version: "1"
name: MiniMax-M2.5
description: vLLM serving MiniMax-M2.5 with Ray distributed backend

# HuggingFace model to download (optional, for --download-model)
model: MiniMaxAI/MiniMax-M2.5

# Container image to use
container: vllm-node

# Can only be run in a cluster
cluster_only: true

# No mods required
mods: []

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 4
  gpu_memory_utilization: 0.90
  max_model_len: 128000

# Environment variables
env:
  VLLM_DISTRIBUTED_EXECUTOR_CONFIG: '{"placement_group_options":{"strategy":"SPREAD"}}'

# The vLLM serve command template
command: |
  vllm serve MiniMaxAI/MiniMax-M2.5 \
    --trust-remote-code \
    --port {port} \
    --host {host} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray \
    --max-model-len {max_model_len} \
    --load-format fastsafetensors \
    --enable-auto-tool-choice \
    --tool-call-parser minimax_m2 \
    --reasoning-parser minimax_m2_append_think
61
recipes/4x-spark-cluster/qwen3.5-397b-a17B-fp8.yaml
Normal file
@@ -0,0 +1,61 @@
# Recipe: Qwen3.5-397B-A17B-FP8
# Qwen3.5-397B-A17B model in FP8 precision
# Multi-modal input

recipe_version: "1"
name: Qwen3.5-397B-A17B-FP8
description: vLLM serving Qwen3.5-397B-A17B-FP8

# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3.5-397B-A17B-FP8

#solo_only: true

# Container image to use
container: vllm-node-tf5

build_args:
  - --tf5

# Mod required to fix ROPE syntax error
mods:
  - mods/fix-qwen3.5-autoround

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 4
  gpu_memory_utilization: 0.85
  max_model_len: 262144
  max_num_batched_tokens: 8192

# Environment variables
env:
  VLLM_USE_DEEP_GEMM: 0
  VLLM_USE_FLASHINFER_MOE_FP16: 1
  VLLM_USE_FLASHINFER_SAMPLER: 0
  OMP_NUM_THREADS: 4

# The vLLM serve command template
command: |
  vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \
    --max-model-len {max_model_len} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --port {port} \
    --host {host} \
    --load-format fastsafetensors \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --trust-remote-code \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray \
    --mm-encoder-tp-mode data \
    --kv-cache-dtype fp8 \
    --compilation-config.cudagraph_mode none \
    --max-num-seqs 32 \
    --attention-backend flashinfer
53
recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
Normal file
@@ -0,0 +1,53 @@
# Recipe: Qwen3.5-397B-A17B-INT4-Autoround
# Qwen3.5-397B model in Intel INT4-Autoround quantization, TP=4 across 4 DGX Spark nodes
# Benchmarked at 37 tok/s single-user, 103 tok/s aggregate (4 concurrent) on 4× DGX Spark
# Requires NVIDIA driver 580.x (590.x has CUDAGraph deadlock bug on GB10)

recipe_version: "1"
name: Qwen3.5-397B-INT4-Autoround
description: Qwen3.5-397B with TP=4 across 4 DGX Spark nodes (Marlin fix applied)

# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3.5-397B-A17B-int4-AutoRound

# Container image to use
container: vllm-node-tf5

build_args:
  - --tf5

# Mods required: coder-next tool/reasoning parser + Marlin TP fix
mods:
  - mods/fix-qwen3-coder-next
  - mods/fix-qwen35-tp4-marlin

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1
  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True

# Default settings (can be overridden via CLI, e.g. --tensor_parallel 2)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 4
  gpu_memory_utilization: 0.78
  max_model_len: 32768
  max_num_batched_tokens: 8192

# The vLLM serve command template
command: |
  vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    --enable-auto-tool-choice \
    --tensor-parallel-size {tensor_parallel} \
    --distributed-executor-backend ray \
    --kv-cache-dtype fp8 \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --max-model-len {max_model_len} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --enable-prefix-caching \
    --trust-remote-code \
    --host {host} \
    --port {port}
@@ -44,12 +44,16 @@ The recipe runner can automatically discover cluster nodes:
```

When you run `--discover`, it:
1. Scans the network for nodes with SSH access
2. Prompts you to select which nodes to include
3. Saves the configuration to `.env`
1. Detects active CX7 interfaces and determines mesh vs. standard topology.
2. Scans the network for peers that are both SSH-reachable **and** have an NVIDIA GB10 GPU.
3. In mesh mode, separately discovers `COPY_HOSTS` on the direct IB-attached interfaces.
4. Prompts for per-node confirmation for `CLUSTER_NODES` and `COPY_HOSTS`.
5. Saves the full configuration (including mesh NCCL settings if applicable) to `.env`.

Future recipe runs will automatically use nodes from `.env` unless you specify `-n` or `--solo`.

When distributing the container image or model files, the runner uses `COPY_HOSTS` from `.env` (which may differ from `CLUSTER_NODES` in mesh mode) to ensure transfers go over the fastest available path.

## Workflow Modes

### Solo Mode (Single Node)
@@ -169,6 +173,7 @@ Usage: ./run-recipe.sh [OPTIONS] [RECIPE]
Cluster discovery:
  --discover              Auto-detect cluster nodes and save to .env
  --show-env              Show current .env configuration
  --config FILE           Path to .env configuration file (default: .env in repo directory)

Recipe overrides:
  --port PORT             Override port
@@ -186,10 +191,25 @@ Setup options:

Launch options:
  --solo                  Run in solo mode (single node, no Ray)
  --no-ray                Multi-node without Ray (PyTorch distributed backend)
  -n, --nodes IPS         Comma-separated node IPs (first = head)
  -d, --daemon            Run in daemon mode
  -t, --container IMAGE   Override container from recipe
  --name NAME             Override container name
  --nccl-debug LEVEL      NCCL debug level (VERSION, WARN, INFO, TRACE)
  --master-port PORT      Cluster coordination port: Ray head port or PyTorch
                          distributed master port (default: 29501).
                          Alias: --head-port
  --eth-if IFACE          Override Ethernet interface
  --ib-if IFACE           Override InfiniBand interface
  -e VAR=VALUE            Pass environment variable to container (repeatable)
  -j N                    Number of parallel build jobs
  --no-cache-dirs         Do not mount ~/.cache/vllm, ~/.cache/flashinfer, ~/.triton
  --non-privileged        Run container without --privileged
  --mem-limit-gb N        Memory limit in GB (only with --non-privileged)
  --mem-swap-limit-gb N   Memory+swap limit in GB (only with --non-privileged)
  --pids-limit N          Process limit (only with --non-privileged)
  --shm-size-gb N         Shared memory size in GB (only with --non-privileged)

Extra vLLM arguments:
  -- ARGS...              Pass additional arguments directly to vLLM
@@ -261,10 +281,18 @@ command: |

```
┌─────────────────────────────────────────────────────────┐
│ autodiscover.sh                                         │
│ - Interface detection (standard / mesh topology)        │
│ - GB10 peer verification via SSH                        │
│ - CLUSTER_NODES and COPY_HOSTS discovery                │
│ - Interactive .env save with per-node confirmation      │
└──────────────────────────┬──────────────────────────────┘
                           │ sourced by
                           ▼
┌─────────────────────────────────────────────────────────┐
│ run-recipe.sh / run-recipe.py                           │
│ - Parses YAML recipe                                    │
│ - Auto-discovers cluster nodes (--discover)             │
│ - Loads nodes from .env                                 │
│ - Loads / triggers cluster discovery (--discover)       │
│ - Handles --setup (build + download + run)              │
│ - Generates launch script from template                 │
│ - Applies CLI overrides                                 │
@@ -274,15 +302,15 @@ command: |
┌──────────────────────┐   ┌───────────────────────────────┐
│ build-and-copy.sh    │   │ hf-download.sh                │
│ - Docker build       │   │ - HuggingFace model download  │
│ - Copy to workers    │   │ - Rsync to workers            │
│ - Copy to COPY_HOSTS │   │ - Rsync to COPY_HOSTS         │
└──────────────────────┘   └───────────────────────────────┘
           │
           │
           │ then calls (for run)
           ▼
┌─────────────────────────────────────────────────────────┐
│ launch-cluster.sh                                       │
│ - Cluster orchestration                                 │
│ - Container lifecycle                                   │
│ - Container lifecycle (trimmed to required node count)  │
│ - Mod application                                       │
│ - Launch script execution                               │
└─────────────────────────────────────────────────────────┘
53
recipes/gemma4-26b-a4b.yaml
Normal file
@@ -0,0 +1,53 @@
# Recipe: Gemma4-26B-A4B
# Gemma4-26B-A4B model in online FP8 quantization

recipe_version: "1"
name: Gemma4-26B-A4B
description: vLLM serving Gemma4-26B-A4B

# HuggingFace model to download (optional, for --download-model)
model: google/gemma-4-26B-A4B-it

# Solo and cluster are both supported
cluster_only: false
solo_only: false

# Container image to use
container: vllm-node-tf5

build_args:
  - --tf5

# Mods
# mods:
#   - mods/fix-gemma4-tool-parser

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_batched_tokens: 8192

# Environment variables
env: {}

# The vLLM serve command template
command: |
  vllm serve google/gemma-4-26B-A4B-it \
    --max-model-len {max_model_len} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --port {port} \
    --host {host} \
    --load-format safetensors \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser gemma4 \
    --reasoning-parser gemma4 \
    --quantization fp8 \
    --kv-cache-dtype fp8 \
    --max-num-batched-tokens {max_num_batched_tokens} \
    -tp {tensor_parallel} --distributed-executor-backend ray
@@ -30,8 +30,8 @@ build_args:

# Mods to apply before running (paths relative to repo root)
# This mod prevents severe inference speed degradation
mods:
  - mods/fix-glm-4.7-flash-AWQ
# mods:
#   - mods/fix-glm-4.7-flash-AWQ

# Default settings (can be overridden via CLI)
defaults:
44
recipes/minimax-m2.7-awq.yaml
Normal file
@@ -0,0 +1,44 @@
# Recipe: MiniMax-M2.7-AWQ
# MiniMax M2.7 model with AWQ quantization

recipe_version: "1"
name: MiniMax-M2.7-AWQ
description: vLLM serving MiniMax-M2.7-AWQ with Ray distributed backend

# HuggingFace model to download (optional, for --download-model)
model: cyankiwi/MiniMax-M2.7-AWQ-4bit

# Container image to use
container: vllm-node

# Can only be run in a cluster
cluster_only: true

# No mods required
mods: []

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.8
  max_model_len: 196608

# Environment variables
env: {}

# The vLLM serve command template
command: |
  vllm serve cyankiwi/MiniMax-M2.7-AWQ-4bit \
    --trust-remote-code \
    --port {port} \
    --host {host} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray \
    --max-model-len {max_model_len} \
    --load-format fastsafetensors \
    --enable-auto-tool-choice \
    --tool-call-parser minimax_m2 \
    --reasoning-parser minimax_m2
@@ -25,16 +25,12 @@ defaults:
  host: 0.0.0.0
  tensor_parallel: 1
  gpu_memory_utilization: 0.7
  max_model_len: 131072

# Environment variables
env:
  VLLM_USE_FLASHINFER_MOE_FP4: 1
  VLLM_FLASHINFER_MOE_BACKEND: "throughput"
  max_model_len: 262144

# The vLLM serve command template
command: |
  vllm serve nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4 \
    --moe-backend cutlass \
    --max-model-len {max_model_len} \
    --port {port} --host {host} \
    --trust-remote-code \
@@ -44,6 +40,5 @@ command: |
    --reasoning-parser nano_v3 \
    --kv-cache-dtype fp8 \
    --enable-prefix-caching \
    --attention-backend flashinfer \
    --load-format fastsafetensors \
    --gpu-memory-utilization {gpu_memory_utilization}
46
recipes/nemotron-3-super-nvfp4.yaml
Normal file
@@ -0,0 +1,46 @@
# Recipe: Nemotron-3-Super-NVFP4
# Uses VLLM_CUTLASS for NVFP4
recipe_version: "1"
name: Nemotron-3-Super-NVFP4-CUTLASS-Optimized
description: vLLM serving Nemotron-3-Super-120B using CUTLASS kernels

model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
container: vllm-node
cluster_only: false
solo_only: false

# mods:
#   - mods/nemotron-super

env:
  VLLM_FLASHINFER_ALLREDUCE_BACKEND: trtllm
  VLLM_ALLOW_LONG_MAX_MODEL_LEN: 1

defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_seqs: 10

command: |
  vllm serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \
    --kv-cache-dtype fp8 \
    --moe-backend cutlass \
    --trust-remote-code \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --max-model-len {max_model_len} \
    --max-num-seqs {max_num_seqs} \
    --enable-prefix-caching \
    --host {host} \
    --port {port} \
    --enable-auto-tool-choice \
    --load-format fastsafetensors \
    --tool-call-parser qwen3_coder \
    --reasoning-parser nemotron_v3 \
    --mamba_ssm_cache_dtype float32 \
    --tensor-parallel-size {tensor_parallel} \
    --attention-backend TRITON_ATTN \
    --distributed-executor-backend ray
@@ -11,6 +11,9 @@ model: openai/gpt-oss-120b
# Container image to use
container: vllm-node-mxfp4

# Only solo now
solo_only: true

# Build arguments for build-and-copy.sh
build_args:
  - --exp-mxfp4
@@ -22,7 +25,7 @@ mods: []
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  tensor_parallel: 1
  gpu_memory_utilization: 0.70
  max_num_batched_tokens: 8192

@@ -37,8 +40,6 @@ command: |
    --tool-call-parser openai \
    --reasoning-parser openai_gptoss \
    --enable-auto-tool-choice \
    --tensor-parallel-size {tensor_parallel} \
    --distributed-executor-backend ray \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --enable-prefix-caching \
    --load-format fastsafetensors \
@@ -15,8 +15,8 @@ model: Qwen/Qwen3-Coder-Next-FP8
container: vllm-node

# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857)
mods:
  - mods/fix-qwen3-coder-next
# mods:
#   - mods/fix-qwen3-coder-next

# Default settings (can be overridden via CLI)
defaults:
43
recipes/qwen3-coder-next-int4-autoround.yaml
Normal file
@@ -0,0 +1,43 @@
# Recipe: Qwen3-Coder-Next-int4-Autoround
# Qwen3-Coder-Next model in Intel int4-Autoround format

recipe_version: "1"
name: Qwen3-Coder-Next-int4-Autoround
description: Qwen3-Coder-Next-int4-Autoround

# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3-Coder-Next-int4-AutoRound

solo_only: true

# Container image to use
container: vllm-node

# Mod required to fix autoround weight loading issues
mods:
  - mods/fix-qwen3-next-autoround

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  gpu_memory_utilization: 0.7
  max_model_len: 262144

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1

# The vLLM serve command template
command: |
  vllm serve Intel/Qwen3-Coder-Next-int4-AutoRound \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --host {host} \
    --port {port} \
    --load-format fastsafetensors \
    --enable-prefix-caching \
    --max-model-len {max_model_len}
47
recipes/qwen3.5-122b-fp8.yaml
Normal file
@@ -0,0 +1,47 @@
# Recipe: Qwen3.5-122B-A10B-FP8
# Qwen3.5-122B model in native FP8 quantization

recipe_version: "1"
name: Qwen3.5-122B-FP8
description: vLLM serving Qwen3.5-122B-FP8

# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3.5-122B-A10B-FP8

# Only cluster is supported
cluster_only: true

# Container image to use
container: vllm-node

# Mod required to fix the chat template
mods:
  - mods/fix-qwen3.5-chat-template

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_batched_tokens: 8192

# Environment variables
env: {}

# The vLLM serve command template
command: |
  vllm serve Qwen/Qwen3.5-122B-A10B-FP8 \
    --max-model-len {max_model_len} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --port {port} \
    --host {host} \
    --load-format fastsafetensors \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    --chat-template unsloth.jinja \
    -tp {tensor_parallel} --distributed-executor-backend ray \
    --max-num-batched-tokens {max_num_batched_tokens}
@@ -18,7 +18,8 @@ build_args:

# Mod required to fix ROPE syntax error
mods:
  - mods/fix-qwen3.5-autoround
#  - mods/fix-qwen3.5-autoround
  - mods/fix-qwen3.5-chat-template

# Default settings (can be overridden via CLI)
defaults:
@@ -43,10 +44,11 @@ command: |
    --load-format fastsafetensors \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --tool-call-parser qwen3_xml \
    --reasoning-parser qwen3 \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --trust-remote-code \
    --chat-template unsloth.jinja \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray
51
recipes/qwen3.5-35b-a3b-fp8.yaml
Normal file
@@ -0,0 +1,51 @@
# Recipe: Qwen/Qwen3.5-35B-A3B-FP8
# Qwen/Qwen3.5-35B-A3B model in native FP8 format

recipe_version: "1"
name: Qwen35-35B-A3B
description: vLLM serving Qwen3.5-35B-A3B-FP8

# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3.5-35B-A3B-FP8

#solo_only: true

# Container image to use
container: vllm-node

# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857)
mods:
  - mods/fix-qwen3-coder-next
  - mods/fix-qwen3.5-chat-template

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_batched_tokens: 16384

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1

# The vLLM serve command template
command: |
  vllm serve Qwen/Qwen3.5-35B-A3B-FP8 \
    --host {host} \
    --port {port} \
    --max-model-len {max_model_len} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --kv-cache-dtype fp8 \
    --load-format fastsafetensors \
    --attention-backend flashinfer \
    --enable-prefix-caching \
    --chat-template unsloth.jinja \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray
59
recipes/qwen3.5-397b-int4-autoround.yaml
Normal file
@@ -0,0 +1,59 @@
# Recipe: Qwen3.5-397B-A17B-INT4-Autoround
# Qwen3.5-397B model in Intel INT4-Autoround quantization
# Important: set memory utilization in GB, not percentage! Requires --no-ray to fit full context on two sparks.
# If you experience node shutdown, please limit GPU clocks on the affected node (or both): `sudo nvidia-smi -lgc 200,2150`

recipe_version: "1"
name: Qwen3.5-397B-INT4-Autoround
description: EXPERIMENTAL recipe for Qwen3.5-397B-INT4-Autoround (please refer to README for details! Use with `--no-ray` parameter!)

# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3.5-397B-A17B-int4-AutoRound

cluster_only: true

# Container image to use
container: vllm-node-tf5

build_args:
  - --tf5

mods:
  - mods/fix-qwen3.5-chat-template
  #- mods/gpu-mem-util-gb
  # - mods/drop-caches

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.9
  max_model_len: 262144
  max_num_batched_tokens: 4176

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1

# The vLLM serve command template
command: |
  vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
    --max-model-len {max_model_len} \
    --max-num-seqs 2 \
    --kv-cache-dtype fp8 \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --port {port} \
    --host {host} \
    --enable-prefix-caching \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_xml \
    --reasoning-parser qwen3 \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --trust-remote-code \
    --chat-template unsloth.jinja \
    --load-format instanttensor \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray
51
recipes/qwen3.6-35b-a3b-fp8-dflash.yaml
Normal file
@@ -0,0 +1,51 @@
# Recipe: Qwen/Qwen3.6-35B-A3B-FP8
# Qwen/Qwen3.6-35B-A3B model in native FP8 format

recipe_version: "1"
name: Qwen36-35B-A3B
description: vLLM serving Qwen3.6-35B-A3B-FP8

# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3.6-35B-A3B-FP8

#solo_only: true

# Container image to use
container: vllm-node

# Mod required to fix the bundled chat template (the recipe passes --chat-template fixed_chat_template.jinja)
mods:
  - mods/fix-qwen3.6-chat-template

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_batched_tokens: 16384

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1

# The vLLM serve command template
command: |
  vllm serve Qwen/Qwen3.6-35B-A3B-FP8 \
    --host {host} \
    --port {port} \
    --max-model-len {max_model_len} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_xml \
    --reasoning-parser qwen3 \
    --load-format fastsafetensors \
    --attention-backend flash_attn \
    --enable-prefix-caching \
    --chat-template fixed_chat_template.jinja \
    --speculative-config '{{"method": "dflash", "model": "z-lab/Qwen3.6-35B-A3B-DFlash", "num_speculative_tokens": 15}}' \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray
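One detail worth noting in this recipe: the doubled braces in the `--speculative-config` JSON exist because the command block appears to pass through Python `str.format` to fill the `{port}`-style placeholders, so literal braces must be escaped. A quick sketch of that assumption:

```
# {port} is substituted; {{...}} collapses to literal braces for the JSON.
template = "--port {port} --speculative-config '{{\"method\": \"dflash\"}}'"
print(template.format(port=8000))
# --port 8000 --speculative-config '{"method": "dflash"}'
```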
49
recipes/qwen3.6-35b-a3b-fp8.yaml
Normal file
@@ -0,0 +1,49 @@
# Recipe: Qwen/Qwen3.6-35B-A3B-FP8
# Qwen/Qwen3.6-35B-A3B model in native FP8 format

recipe_version: "1"
name: Qwen36-35B-A3B
description: vLLM serving Qwen3.6-35B-A3B-FP8

# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3.6-35B-A3B-FP8

#solo_only: true

# Container image to use
container: vllm-node

mods:
  - mods/fix-qwen3.6-chat-template

# Default settings (can be overridden via CLI)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 2
  gpu_memory_utilization: 0.7
  max_model_len: 262144
  max_num_batched_tokens: 16384

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1

# The vLLM serve command template
command: |
  vllm serve Qwen/Qwen3.6-35B-A3B-FP8 \
    --host {host} \
    --port {port} \
    --max-model-len {max_model_len} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_xml \
    --kv-cache-dtype fp8 \
    --load-format fastsafetensors \
    --attention-backend flashinfer \
    --enable-prefix-caching \
    --chat-template fixed_chat_template.jinja \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray
@@ -1,120 +0,0 @@
#!/bin/bash
set -e

# Define a function to export immediately AND save to .bashrc for future sessions
export_persist() {
    local var_name="$1"
    local var_value="$2"

    # 1. Export for the current running process
    export "$var_name"="$var_value"

    # 2. Append to .bashrc (idempotent check to avoid duplicate lines)
    if ! grep -q "export $var_name=" ~/.bashrc; then
        echo "export $var_name=\"$var_value\"" >> ~/.bashrc
    else
        # Optional: Update the existing line if it exists
        sed -i "s|export $var_name=.*|export $var_name=\"$var_value\"|" ~/.bashrc
    fi
}

# --- Help Function ---
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Required Arguments:"
    echo "  -r, --role <head|node>  : Set the node type"
    echo "  -h, --host-ip <ip>      : IP address of this interface (Host IP)"
    echo "  -e, --eth-if <name>     : Ethernet interface name (e.g., eth0)"
    echo "  -i, --ib-if <name>      : InfiniBand/RDMA interface name"
    echo ""
    echo "Conditional Arguments:"
    echo "  -m, --head-ip <ip>      : IP of the head node (REQUIRED if role is 'node')"
    echo ""
    echo "Example:"
    echo "  $0 --role head --host-ip 192.168.1.10 --eth-if eth0 --ib-if ib0"
    echo "  $0 --role node --host-ip 192.168.1.20 --eth-if eth0 --ib-if ib0 --head-ip 192.168.1.10"
    exit 1
}

# --- Argument Parsing ---

# Initialize variables to empty
NODE_TYPE=""
HOST_IP=""
ETH_IF_NAME=""
IB_IF_NAME=""
HEAD_IP=""

while [[ "$#" -gt 0 ]]; do
    case $1 in
        -r|--role) NODE_TYPE="$2"; shift ;;
        -h|--host-ip) HOST_IP="$2"; shift ;;
        -e|--eth-if) ETH_IF_NAME="$2"; shift ;;
        -i|--ib-if) IB_IF_NAME="$2"; shift ;;
        -m|--head-ip) HEAD_IP="$2"; shift ;;
        *) echo "Unknown parameter passed: $1"; usage ;;
    esac
    shift
done

# --- Validation ---

# 1. Check if all common required arguments are present
if [[ -z "$NODE_TYPE" || -z "$HOST_IP" || -z "$ETH_IF_NAME" || -z "$IB_IF_NAME" ]]; then
    echo "Error: Missing required arguments."
    usage
fi

# 2. Validate Role
if [[ "$NODE_TYPE" != "head" && "$NODE_TYPE" != "node" ]]; then
    echo "Error: --role must be 'head' or 'node'."
    exit 1
fi

# 3. Conditional Check for Head IP
if [[ "$NODE_TYPE" == "node" && -z "$HEAD_IP" ]]; then
    echo "Error: When --role is 'node', you must provide --head-ip."
    exit 1
fi

# --- Environment Configuration ---

echo "Configuring environment for [$NODE_TYPE] at $HOST_IP..."

export_persist VLLM_HOST_IP "$HOST_IP"
export_persist RAY_NODE_IP_ADDRESS "$HOST_IP"
export_persist RAY_OVERRIDE_NODE_IP_ADDRESS "$HOST_IP"

# Network Interface
export_persist MN_IF_NAME "$ETH_IF_NAME"
export_persist UCX_NET_DEVICES "$ETH_IF_NAME"
export_persist NCCL_SOCKET_IFNAME "$ETH_IF_NAME"

# InfiniBand
export_persist NCCL_IB_HCA "$IB_IF_NAME"
export_persist NCCL_IB_DISABLE "0"

# Sockets/Transport
export_persist OMPI_MCA_btl_tcp_if_include "$ETH_IF_NAME"
export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist RAY_memory_monitor_refresh_ms "0"

# --- Execution ---

if [ "${NODE_TYPE}" == "head" ]; then
    echo "Starting Ray HEAD node..."
    exec ray start --block --head --port 6379 \
        --node-ip-address "$VLLM_HOST_IP" \
        --include-dashboard=True \
        --dashboard-host "0.0.0.0" \
        --dashboard-port 8265 \
        --disable-usage-stats
else
    echo "Starting Ray WORKER node connecting to $HEAD_IP..."
    exec ray start --block \
        --address="$HEAD_IP:6379" \
        --node-ip-address "$VLLM_HOST_IP"
fi
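For context on what this removed script did: every variable went through export_persist, so the networking configuration survived new shells via ~/.bashrc before the node joined Ray. A minimal sanity check on a configured node (a sketch, assuming bash is the login shell; ray status is the standard Ray CLI) would be:

    # Confirm the persisted networking variables and that the node joined the cluster
    grep -E 'VLLM_HOST_IP|NCCL_SOCKET_IFNAME|NCCL_IB_HCA' ~/.bashrc
    ray status    # lists head and worker nodes once `ray start` has run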
run-recipe.py (837 lines changed)
File diff suppressed because it is too large
@@ -728,6 +728,48 @@ test_launch_cmd_no_solo_in_cluster() {
    fi
}

# Test: -e / --env passthrough to launch-cluster.sh
test_launch_cmd_env_passthrough() {
    log_test "Launch command includes -e env vars"

    recipe_name=$(find_solo_recipe)
    if [[ -z "$recipe_name" ]]; then
        log_skip "No solo-capable recipes found"
        return
    fi

    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo -e HF_TOKEN=test123 -e MY_VAR=hello 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q "\-e HF_TOKEN=test123" && echo "$launch_cmd" | grep -q "\-e MY_VAR=hello"; then
        log_pass "Launch command includes -e env vars"
    else
        log_fail "-e env vars not found in launch command"
        log_verbose "Launch cmd: $launch_cmd"
    fi
}

# Test: no -e flags when none specified
test_launch_cmd_no_env_by_default() {
    log_test "Launch command omits -e when no env vars specified"

    recipe_name=$(find_solo_recipe)
    if [[ -z "$recipe_name" ]]; then
        log_skip "No solo-capable recipes found"
        return
    fi

    output=$("$PROJECT_DIR/run-recipe.py" "$recipe_name" --dry-run --solo 2>&1)
    launch_cmd=$(extract_launch_cmd "$output")

    if echo "$launch_cmd" | grep -q " -e "; then
        log_fail "Unexpected -e flag in launch command"
        log_verbose "Launch cmd: $launch_cmd"
    else
        log_pass "Launch command correctly omits -e when none specified"
    fi
}

# ==============================================================================
# README Documentation Verification Tests
# ==============================================================================
@@ -1203,6 +1245,8 @@ main() {
    test_launch_cmd_launch_script
    test_launch_cmd_container_override
    test_launch_cmd_no_solo_in_cluster
    test_launch_cmd_env_passthrough
    test_launch_cmd_no_env_by_default
    echo ""

    # README documentation verification tests
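The two new checks are wired into main() alongside the other launch-command tests, so they run with the rest of the suite. The diff header does not show the test file's name, so the path below is hypothetical:

    # Hypothetical path: this diff does not name the test script
    ./tests/test-run-recipe.sh    # main() now also runs the two -e passthrough tests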
wheels/.gitignore (new vendored file, 2 lines)
@@ -0,0 +1,2 @@
*
!.gitignore
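This two-line pattern is the standard keep-the-directory idiom: "*" ignores everything inside wheels/, and "!.gitignore" re-includes the ignore file itself, so the directory stays in the tree while locally built wheels are never committed. It can be verified with plain git (the .whl name below is just an example):

    # -v shows which rule matched; check-ignore exits non-zero when a path is not ignored
    git check-ignore -v wheels/example-0.1-py3-none-any.whl
    git check-ignore -v wheels/.gitignore || echo "wheels/.gitignore is tracked"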