From 006734910c08d50f45059da526ac7507ee7bdc6d Mon Sep 17 00:00:00 2001
From: sonusflow
Date: Mon, 9 Mar 2026 21:30:28 +0000
Subject: [PATCH 1/2] Add Qwen3.5-397B INT4-AutoRound TP=4 recipe and Marlin
 fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Production-tested recipe for running Qwen3.5-397B-A17B with INT4
AutoRound quantization across 4 DGX Spark nodes using tensor
parallelism.

Performance (4× DGX Spark, driver 580.126.09):
- Single user: 37 tok/s
- 4 concurrent: ~26 tok/s per user, ~103 tok/s aggregate

The Marlin TP fix works around the MIN_THREAD_N=64 constraint that
breaks the in_proj_ba layers at TP=4 (output_size=128/4=32 < 64).
Solution: ReplicatedLinear for the B/A projections, applied via diff
patches (shape math sketched below).

Key config:
- VLLM_MARLIN_USE_ATOMIC_ADD=1 (required for Marlin correctness)
- KV cache FP8, prefix caching enabled
- gpu_memory_utilization 0.78 (UMA safe margin)
- CUDAGraphs enabled (default, requires driver 580.x)

Note: driver 590.x has a CUDAGraph capture deadlock on GB10 unified
memory. Stay on driver 580.126.09.
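For reference, a minimal sketch of the shape math behind the fix
(illustrative only; the constants mirror the patched modules):

    # in_proj_ba fuses the B and A projections: 2 * num_v_heads = 128 outputs.
    # Marlin needs each TP shard's output dim to be >= MIN_THREAD_N (64).
    num_v_heads, tp_size, MIN_THREAD_N = 64, 4, 64
    per_rank = (2 * num_v_heads) // tp_size
    assert per_rank < MIN_THREAD_N  # 32 < 64: column-parallel sharding fails
    # Split into two ReplicatedLinear layers instead: each rank runs the
    # full 64-wide projection (64 >= MIN_THREAD_N, so Marlin is happy),
    # then slices out its own num_v_heads // tp_size = 16 heads in forward().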
---
 mods/fix-qwen35-tp4-marlin/fix_rope.py      | 23 +++++++++
 mods/fix-qwen35-tp4-marlin/qwen3_5.patch    | 46 ++++++++++++++++++
 mods/fix-qwen35-tp4-marlin/qwen3_next.patch | 56 +++++++++++++++++++++++
 mods/fix-qwen35-tp4-marlin/run.sh           | 23 +++++++++
 recipes/qwen3.5-397b-int4-autoround.yaml    | 52 ++++++++++++++++++++
 5 files changed, 200 insertions(+)
 create mode 100644 mods/fix-qwen35-tp4-marlin/fix_rope.py
 create mode 100644 mods/fix-qwen35-tp4-marlin/qwen3_5.patch
 create mode 100644 mods/fix-qwen35-tp4-marlin/qwen3_next.patch
 create mode 100755 mods/fix-qwen35-tp4-marlin/run.sh
 create mode 100644 recipes/qwen3.5-397b-int4-autoround.yaml

diff --git a/mods/fix-qwen35-tp4-marlin/fix_rope.py b/mods/fix-qwen35-tp4-marlin/fix_rope.py
new file mode 100644
index 0000000..56cbe7a
--- /dev/null
+++ b/mods/fix-qwen35-tp4-marlin/fix_rope.py
@@ -0,0 +1,23 @@
+# Fix: ignore_keys_at_rope_validation is a list but transformers uses | (set union)
+import re
+
+path = "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/configs/qwen3_5_moe.py"
+with open(path) as f:
+    content = f.read()
+
+old = """kwargs["ignore_keys_at_rope_validation"] = [
+    "mrope_section",
+    "mrope_interleaved",
+]"""
+
+new = """kwargs["ignore_keys_at_rope_validation"] = {
+    "mrope_section",
+    "mrope_interleaved",
+}"""
+
+content = content.replace(old, new)
+
+with open(path, "w") as f:
+    f.write(content)
+
+print("Fixed ignore_keys_at_rope_validation: list -> set")
diff --git a/mods/fix-qwen35-tp4-marlin/qwen3_5.patch b/mods/fix-qwen35-tp4-marlin/qwen3_5.patch
new file mode 100644
index 0000000..835c856
--- /dev/null
+++ b/mods/fix-qwen35-tp4-marlin/qwen3_5.patch
@@ -0,0 +1,46 @@
+--- qwen3_5.py.orig	2026-03-03 00:00:00.000000000 +0000
++++ qwen3_5.py	2026-03-03 00:00:00.000000000 +0000
+@@ -166,11 +166,13 @@
+         z_size = self.value_dim // self.tp_size
+         mixed_qkv, z = mixed_qkvz.split([qkv_size, z_size], dim=-1)
+         z = z.reshape(z.size(0), -1, self.head_v_dim)
+-        ba, _ = self.in_proj_ba(hidden_states)
+-        b, a = ba.chunk(2, dim=-1)
+-
+-        b = b.contiguous()
+-        a = a.contiguous()
++        # Replicated B/A projections — full output, sliced to local TP partition
++        b_full, _ = self.in_proj_b(hidden_states)
++        a_full, _ = self.in_proj_a(hidden_states)
++        _ba_chunk = self.num_v_heads // self.tp_size
++        _ba_start = self.tp_rank * _ba_chunk
++        b = b_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
++        a = a_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
+ 
+         # ============================================================
+         # Part 2: Core Attention (Custom Op)
+@@ -374,8 +376,6 @@
+             # GDN
+             ("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)),
+             ("in_proj_qkvz", "in_proj_z", 3),
+-            ("in_proj_ba", "in_proj_b", 0),
+-            ("in_proj_ba", "in_proj_a", 1),
+         ]
+ 
+         params_dict = dict(self.named_parameters())
+@@ -530,7 +530,6 @@
+         "gate_up_proj": ["gate_proj", "up_proj"],
+         # GDN fused projections.
+         "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
+-        "in_proj_ba": ["in_proj_b", "in_proj_a"],
+     }
+ 
+     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+@@ -630,7 +629,6 @@
+ class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid):
+     packed_modules_mapping = Qwen3VLForConditionalGeneration.packed_modules_mapping | {
+         "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
+-        "in_proj_ba": ["in_proj_b", "in_proj_a"],
+     }
+ 
+     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
diff --git a/mods/fix-qwen35-tp4-marlin/qwen3_next.patch b/mods/fix-qwen35-tp4-marlin/qwen3_next.patch
new file mode 100644
index 0000000..1681194
--- /dev/null
+++ b/mods/fix-qwen35-tp4-marlin/qwen3_next.patch
@@ -0,0 +1,56 @@
+--- qwen3_next.py.orig	2026-03-03 00:00:00.000000000 +0000
++++ qwen3_next.py	2026-03-03 00:00:00.000000000 +0000
+@@ -411,15 +411,22 @@
+             quant_config=quant_config,
+             prefix=f"{prefix}.in_proj_qkvz",
+         )
+-        # ba_proj doesn't support blockwise fp8 quantization.
+-        # # in_proj_ba is defined as MergedColumnParallelLinear for
+-        # compatibility with Qwen3_5.
+-        self.in_proj_ba = MergedColumnParallelLinear(
++        # ba_proj: Use ReplicatedLinear to avoid Marlin TP split constraint
++        # (num_v_heads=64 is too small for TP=4 Marlin min_thread_n=64).
++        # Each rank loads full weights and slices in forward.
++        self.in_proj_b = ReplicatedLinear(
+             input_size=self.hidden_size,
+-            output_sizes=[self.num_v_heads] * 2,
++            output_size=self.num_v_heads,
+             bias=False,
+             quant_config=quant_config,
+-            prefix=f"{prefix}.in_proj_ba",
++            prefix=f"{prefix}.in_proj_b",
++        )
++        self.in_proj_a = ReplicatedLinear(
++            input_size=self.hidden_size,
++            output_size=self.num_v_heads,
++            bias=False,
++            quant_config=quant_config,
++            prefix=f"{prefix}.in_proj_a",
+         )
+ 
+         query_key_settings = (self.key_dim, 0, False)
+@@ -584,7 +591,15 @@
+         # Part 1: Input Projection
+         # ============================================================
+         projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
+-        projected_states_ba, _ = self.in_proj_ba(hidden_states)
++        # Replicated B/A projections — full output, sliced to local TP partition
++        b_full, _ = self.in_proj_b(hidden_states)
++        a_full, _ = self.in_proj_a(hidden_states)
++        _ba_chunk = self.num_v_heads // self.tp_size
++        _ba_start = self.tp_rank * _ba_chunk
++        projected_states_ba = torch.cat([
++            b_full[:, _ba_start:_ba_start+_ba_chunk],
++            a_full[:, _ba_start:_ba_start+_ba_chunk],
++        ], dim=-1)
+         query, key, value, z, b, a = self.fix_query_key_value_ordering(
+             projected_states_qkvz, projected_states_ba
+         )
+@@ -1326,7 +1341,6 @@
+         ],
+         "gate_up_proj": ["gate_proj", "up_proj"],
+         "in_proj_qkvz": ["in_proj_qkvz"],
+-        "in_proj_ba": ["in_proj_ba"],
+     }
+ 
+     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
diff --git a/mods/fix-qwen35-tp4-marlin/run.sh b/mods/fix-qwen35-tp4-marlin/run.sh
new file mode 100755
index 0000000..372b1bd
--- /dev/null
+++ b/mods/fix-qwen35-tp4-marlin/run.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Fix Marlin TP=4 constraint for Qwen3.5-397B: in_proj_ba output_size=128 / TP=4 = 32 < min_thread_n=64
+# Solution: Replace MergedColumnParallelLinear with two ReplicatedLinear for B/A projections
+# Delivery: unified diff patches (portable across vLLM versions)
+
+set -e
+MOD_DIR="$(dirname "$0")"
+MODELS_DIR="/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models"
+
+echo "[fix-qwen35-tp4-marlin] Applying patches..."
+
+# Apply patches with --forward (skip if already applied)
+patch --forward --batch -p0 -d "$MODELS_DIR" < "$MOD_DIR/qwen3_next.patch" || {
+    echo "[fix-qwen35-tp4-marlin] qwen3_next.patch already applied or failed"
+}
+patch --forward --batch -p0 -d "$MODELS_DIR" < "$MOD_DIR/qwen3_5.patch" || {
+    echo "[fix-qwen35-tp4-marlin] qwen3_5.patch already applied or failed"
+}
+
+# Fix rope validation (idempotent)
+python3 "$MOD_DIR/fix_rope.py"
+
+echo "[fix-qwen35-tp4-marlin] Done."
diff --git a/recipes/qwen3.5-397b-int4-autoround.yaml b/recipes/qwen3.5-397b-int4-autoround.yaml
new file mode 100644
index 0000000..073741a
--- /dev/null
+++ b/recipes/qwen3.5-397b-int4-autoround.yaml
@@ -0,0 +1,52 @@
+# Recipe: Qwen3.5-397B-A17B-INT4-Autoround
+# Qwen3.5-397B model in Intel INT4-Autoround quantization, TP=4 across 4 DGX Spark nodes
+# Benchmarked at 37 tok/s single-user, 103 tok/s aggregate (4 concurrent) on 4× DGX Spark
+# Requires NVIDIA driver 580.x (590.x has CUDAGraph deadlock bug on GB10)
+
+recipe_version: "1"
+name: Qwen3.5-397B-INT4-Autoround
+description: Qwen3.5-397B with TP=4 across 4 DGX Spark nodes (Marlin fix applied)
+
+# HuggingFace model to download (optional, for --download-model)
+model: Intel/Qwen3.5-397B-A17B-int4-AutoRound
+
+# Container image to use
+container: vllm-node-tf5
+
+build_args:
+  - --tf5
+
+# Mods required: coder-next tool/reasoning parser + Marlin TP fix
+mods:
+  - mods/fix-qwen3-coder-next
+  - mods/fix-qwen35-tp4-marlin
+
+# Environment variables
+env:
+  VLLM_MARLIN_USE_ATOMIC_ADD: 1
+
+# Default settings (can be overridden via CLI, e.g. --tensor_parallel 2)
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  tensor_parallel: 4
+  gpu_memory_utilization: 0.78
+  max_model_len: 32768
+  max_num_batched_tokens: 8192
+
+# The vLLM serve command template
+command: |
+  vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
+    --tool-call-parser qwen3_coder \
+    --reasoning-parser qwen3 \
+    --enable-auto-tool-choice \
+    --tensor-parallel-size {tensor_parallel} \
+    --distributed-executor-backend ray \
+    --kv-cache-dtype fp8 \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    --max-model-len {max_model_len} \
+    --max-num-batched-tokens {max_num_batched_tokens} \
+    --enable-prefix-caching \
+    --trust-remote-code \
+    --host {host} \
+    --port {port}

From 3baca14eb1364b2cb039fa5339d09a36b577896d Mon Sep 17 00:00:00 2001
From: sonusflow
Date: Wed, 11 Mar 2026 07:29:45 +0000
Subject: [PATCH 2/2] Move recipe to 4x-spark-cluster/ and add UMA memory
 optimizations

- Move qwen3.5-397b-int4-autoround.yaml to recipes/4x-spark-cluster/
  per maintainer request (multi-node recipes live in a separate directory)
- Add PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to the recipe env
- Optimize Ray for GB10 UMA (128GB shared CPU/GPU memory):
  - Disable the Ray dashboard (saves ~1.2 GiB per node)
  - Limit the Ray object store to 1 GiB (default is 30% of RAM = 33 GiB)
  - Disable pre-started idle workers (saves ~8 GiB on the head node)
  - Set --num-cpus 2 and --disable-usage-stats on all nodes
  - Net effect: ~40+ GiB freed across the 4-node cluster for model
    weights and KV cache (verification sketch below)
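A quick way to confirm the object-store cap took effect on every node
(a sketch; the resource key and its units follow recent Ray releases
and may vary by version):

    # Run inside the container once the cluster is up (after `ray start`).
    import ray

    ray.init(address="auto")
    for node in ray.nodes():
        # Expect ~1 GiB per node instead of the ~33 GiB default.
        store_bytes = node["Resources"].get("object_store_memory", 0)
        print(node["NodeManagerAddress"], f"{store_bytes / 2**30:.1f} GiB")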
---
 .../qwen3.5-397b-int4-autoround.yaml |  1 +
 run-cluster-node.sh                  | 14 +++++++++-----
 2 files changed, 10 insertions(+), 5 deletions(-)
 rename recipes/{ => 4x-spark-cluster}/qwen3.5-397b-int4-autoround.yaml (96%)

diff --git a/recipes/qwen3.5-397b-int4-autoround.yaml b/recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
similarity index 96%
rename from recipes/qwen3.5-397b-int4-autoround.yaml
rename to recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
index 073741a..c1d4d85 100644
--- a/recipes/qwen3.5-397b-int4-autoround.yaml
+++ b/recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
@@ -24,6 +24,7 @@ mods:
 # Environment variables
 env:
   VLLM_MARLIN_USE_ATOMIC_ADD: 1
+  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
 
 # Default settings (can be overridden via CLI, e.g. --tensor_parallel 2)
 defaults:
diff --git a/run-cluster-node.sh b/run-cluster-node.sh
index 2ec9049..796fe06 100755
--- a/run-cluster-node.sh
+++ b/run-cluster-node.sh
@@ -101,19 +101,23 @@ export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
 export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
 export_persist RAY_memory_monitor_refresh_ms "0"
 
+# UMA Memory Optimization (DGX Spark 128GB shared CPU/GPU memory)
+# Disable pre-started idle workers (saves ~8 GiB on head node)
+export_persist RAY_num_prestart_python_workers "0"
+# Limit object store to 1 GiB (default 30% of RAM = 33 GiB, wastes UMA)
+export_persist RAY_object_store_memory "1073741824"
+
 # --- Execution ---
 if [ "${NODE_TYPE}" == "head" ]; then
     echo "Starting Ray HEAD node..."
-    exec ray start --block --head --port 6379 \
+    exec ray start --block --head --port 6379 --object-store-memory 1073741824 --num-cpus 2 \
         --node-ip-address "$VLLM_HOST_IP" \
-        --include-dashboard=True \
-        --dashboard-host "0.0.0.0" \
-        --dashboard-port 8265 \
+        --include-dashboard=false \
         --disable-usage-stats
 else
     echo "Starting Ray WORKER node connecting to $HEAD_IP..."
-    exec ray start --block \
+    exec ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats \
         --address="$HEAD_IP:6379" \
         --node-ip-address "$VLLM_HOST_IP"
 fi
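
Once the head node is serving, a minimal smoke test against the
OpenAI-compatible endpoint (illustrative; substitute your head node's
address for localhost, and the recipe's port if you changed it):

    import json, urllib.request

    req = urllib.request.Request(
        "http://localhost:8000/v1/chat/completions",
        data=json.dumps({
            "model": "Intel/Qwen3.5-397B-A17B-int4-AutoRound",
            "messages": [{"role": "user", "content": "Say hello."}],
            "max_tokens": 32,
        }).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        print(json.loads(resp.read())["choices"][0]["message"]["content"])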