From 006734910c08d50f45059da526ac7507ee7bdc6d Mon Sep 17 00:00:00 2001
From: sonusflow
Date: Mon, 9 Mar 2026 21:30:28 +0000
Subject: [PATCH 1/2] Add Qwen3.5-397B INT4-AutoRound TP=4 recipe and Marlin
 fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Production-tested recipe for running Qwen3.5-397B-A17B with INT4
AutoRound quantization across 4 DGX Spark nodes using tensor
parallelism.

Performance (4× DGX Spark, driver 580.126.09):
- Single user: 37 tok/s
- 4 concurrent: ~26 tok/s per user, ~103 tok/s aggregate

The Marlin TP fix works around the MIN_THREAD_N=64 constraint that
breaks the in_proj_ba layers at TP=4 (output_size=128/4=32 < 64).
Solution: ReplicatedLinear for the B/A projections, applied via diff
patches (shape math sketched below).

Key config:
- VLLM_MARLIN_USE_ATOMIC_ADD=1 (required for Marlin correctness)
- KV cache FP8, prefix caching enabled
- gpu_memory_utilization 0.78 (UMA safe margin)
- CUDAGraphs enabled (default, requires driver 580.x)

Note: driver 590.x has a CUDAGraph capture deadlock on GB10 unified
memory. Stay on driver 580.126.09.
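For reference, a minimal sketch of the shape math behind the fix
(illustrative only; the constants mirror the patched modules):

    # in_proj_ba fuses the B and A projections: 2 * num_v_heads = 128 outputs.
    # Marlin needs each TP shard's output dim to be >= MIN_THREAD_N (64).
    num_v_heads, tp_size, MIN_THREAD_N = 64, 4, 64
    per_rank = (2 * num_v_heads) // tp_size
    assert per_rank < MIN_THREAD_N  # 32 < 64: column-parallel sharding fails
    # Split into two ReplicatedLinear layers instead: each rank runs the
    # full 64-wide projection (64 >= MIN_THREAD_N, so Marlin is happy),
    # then slices out its own num_v_heads // tp_size = 16 heads in forward().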
---
 mods/fix-qwen35-tp4-marlin/fix_rope.py      | 23 +++++++++
 mods/fix-qwen35-tp4-marlin/qwen3_5.patch    | 46 ++++++++++++++++++
 mods/fix-qwen35-tp4-marlin/qwen3_next.patch | 56 +++++++++++++++++++++++
 mods/fix-qwen35-tp4-marlin/run.sh           | 23 +++++++++
 recipes/qwen3.5-397b-int4-autoround.yaml    | 52 ++++++++++++++++++++
 5 files changed, 200 insertions(+)
 create mode 100644 mods/fix-qwen35-tp4-marlin/fix_rope.py
 create mode 100644 mods/fix-qwen35-tp4-marlin/qwen3_5.patch
 create mode 100644 mods/fix-qwen35-tp4-marlin/qwen3_next.patch
 create mode 100755 mods/fix-qwen35-tp4-marlin/run.sh
 create mode 100644 recipes/qwen3.5-397b-int4-autoround.yaml

diff --git a/mods/fix-qwen35-tp4-marlin/fix_rope.py b/mods/fix-qwen35-tp4-marlin/fix_rope.py
new file mode 100644
index 0000000..56cbe7a
--- /dev/null
+++ b/mods/fix-qwen35-tp4-marlin/fix_rope.py
@@ -0,0 +1,23 @@
+# Fix: ignore_keys_at_rope_validation is a list but transformers uses | (set union)
+import re
+
+path = "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/configs/qwen3_5_moe.py"
+with open(path) as f:
+    content = f.read()
+
+old = """kwargs["ignore_keys_at_rope_validation"] = [
+    "mrope_section",
+    "mrope_interleaved",
+]"""
+
+new = """kwargs["ignore_keys_at_rope_validation"] = {
+    "mrope_section",
+    "mrope_interleaved",
+}"""
+
+content = content.replace(old, new)
+
+with open(path, "w") as f:
+    f.write(content)
+
+print("Fixed ignore_keys_at_rope_validation: list -> set")
diff --git a/mods/fix-qwen35-tp4-marlin/qwen3_5.patch b/mods/fix-qwen35-tp4-marlin/qwen3_5.patch
new file mode 100644
index 0000000..835c856
--- /dev/null
+++ b/mods/fix-qwen35-tp4-marlin/qwen3_5.patch
@@ -0,0 +1,46 @@
+--- qwen3_5.py.orig	2026-03-03 00:00:00.000000000 +0000
++++ qwen3_5.py	2026-03-03 00:00:00.000000000 +0000
+@@ -166,11 +166,13 @@
+         z_size = self.value_dim // self.tp_size
+         mixed_qkv, z = mixed_qkvz.split([qkv_size, z_size], dim=-1)
+         z = z.reshape(z.size(0), -1, self.head_v_dim)
+-        ba, _ = self.in_proj_ba(hidden_states)
+-        b, a = ba.chunk(2, dim=-1)
+-
+-        b = b.contiguous()
+-        a = a.contiguous()
++        # Replicated B/A projections — full output, sliced to local TP partition
++        b_full, _ = self.in_proj_b(hidden_states)
++        a_full, _ = self.in_proj_a(hidden_states)
++        _ba_chunk = self.num_v_heads // self.tp_size
++        _ba_start = self.tp_rank * _ba_chunk
++        b = b_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
++        a = a_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
+ 
+         # ============================================================
+         # Part 2: Core Attention (Custom Op)
+@@ -374,8 +376,6 @@
+             # GDN
+             ("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)),
+             ("in_proj_qkvz", "in_proj_z", 3),
+-            ("in_proj_ba", "in_proj_b", 0),
+-            ("in_proj_ba", "in_proj_a", 1),
+         ]
+ 
+         params_dict = dict(self.named_parameters())
+@@ -530,7 +530,6 @@
+         "gate_up_proj": ["gate_proj", "up_proj"],
+         # GDN fused projections.
+         "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
+-        "in_proj_ba": ["in_proj_b", "in_proj_a"],
+     }
+ 
+     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+@@ -630,7 +629,6 @@
+ class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid):
+     packed_modules_mapping = Qwen3VLForConditionalGeneration.packed_modules_mapping | {
+         "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
+-        "in_proj_ba": ["in_proj_b", "in_proj_a"],
+     }
+ 
+     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
diff --git a/mods/fix-qwen35-tp4-marlin/qwen3_next.patch b/mods/fix-qwen35-tp4-marlin/qwen3_next.patch
new file mode 100644
index 0000000..1681194
--- /dev/null
+++ b/mods/fix-qwen35-tp4-marlin/qwen3_next.patch
@@ -0,0 +1,56 @@
+--- qwen3_next.py.orig	2026-03-03 00:00:00.000000000 +0000
++++ qwen3_next.py	2026-03-03 00:00:00.000000000 +0000
+@@ -411,15 +411,22 @@
+             quant_config=quant_config,
+             prefix=f"{prefix}.in_proj_qkvz",
+         )
+-        # ba_proj doesn't support blockwise fp8 quantization.
+-        # # in_proj_ba is defined as MergedColumnParallelLinear for
+-        # compatibility with Qwen3_5.
+-        self.in_proj_ba = MergedColumnParallelLinear(
++        # ba_proj: Use ReplicatedLinear to avoid Marlin TP split constraint
++        # (num_v_heads=64 is too small for TP=4 Marlin min_thread_n=64).
++        # Each rank loads full weights and slices in forward.
++        self.in_proj_b = ReplicatedLinear(
+             input_size=self.hidden_size,
+-            output_sizes=[self.num_v_heads] * 2,
++            output_size=self.num_v_heads,
+             bias=False,
+             quant_config=quant_config,
+-            prefix=f"{prefix}.in_proj_ba",
++            prefix=f"{prefix}.in_proj_b",
++        )
++        self.in_proj_a = ReplicatedLinear(
++            input_size=self.hidden_size,
++            output_size=self.num_v_heads,
++            bias=False,
++            quant_config=quant_config,
++            prefix=f"{prefix}.in_proj_a",
+         )
+ 
+         query_key_settings = (self.key_dim, 0, False)
+@@ -584,7 +591,15 @@
+         # Part 1: Input Projection
+         # ============================================================
+         projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
+-        projected_states_ba, _ = self.in_proj_ba(hidden_states)
++        # Replicated B/A projections — full output, sliced to local TP partition
++        b_full, _ = self.in_proj_b(hidden_states)
++        a_full, _ = self.in_proj_a(hidden_states)
++        _ba_chunk = self.num_v_heads // self.tp_size
++        _ba_start = self.tp_rank * _ba_chunk
++        projected_states_ba = torch.cat([
++            b_full[:, _ba_start:_ba_start+_ba_chunk],
++            a_full[:, _ba_start:_ba_start+_ba_chunk],
++        ], dim=-1)
+         query, key, value, z, b, a = self.fix_query_key_value_ordering(
+             projected_states_qkvz, projected_states_ba
+         )
+@@ -1326,7 +1341,6 @@
+         ],
+         "gate_up_proj": ["gate_proj", "up_proj"],
+         "in_proj_qkvz": ["in_proj_qkvz"],
+-        "in_proj_ba": ["in_proj_ba"],
+     }
+ 
+     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
diff --git a/mods/fix-qwen35-tp4-marlin/run.sh b/mods/fix-qwen35-tp4-marlin/run.sh
new file mode 100755
index 0000000..372b1bd
--- /dev/null
+++ b/mods/fix-qwen35-tp4-marlin/run.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Fix Marlin TP=4 constraint for Qwen3.5-397B: in_proj_ba output_size=128 / TP=4 = 32 < min_thread_n=64
+# Solution: Replace MergedColumnParallelLinear with two ReplicatedLinear for B/A projections
+# Delivery: unified diff patches (portable across vLLM versions)
+
+set -e
+MOD_DIR="$(dirname "$0")"
+MODELS_DIR="/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models"
+
+echo "[fix-qwen35-tp4-marlin] Applying patches..."
+
+# Apply patches with --forward (skip if already applied)
+patch --forward --batch -p0 -d "$MODELS_DIR" < "$MOD_DIR/qwen3_next.patch" || {
+    echo "[fix-qwen35-tp4-marlin] qwen3_next.patch already applied or failed"
+}
+patch --forward --batch -p0 -d "$MODELS_DIR" < "$MOD_DIR/qwen3_5.patch" || {
+    echo "[fix-qwen35-tp4-marlin] qwen3_5.patch already applied or failed"
+}
+
+# Fix rope validation (idempotent)
+python3 "$MOD_DIR/fix_rope.py"
+
+echo "[fix-qwen35-tp4-marlin] Done."
diff --git a/recipes/qwen3.5-397b-int4-autoround.yaml b/recipes/qwen3.5-397b-int4-autoround.yaml
new file mode 100644
index 0000000..073741a
--- /dev/null
+++ b/recipes/qwen3.5-397b-int4-autoround.yaml
@@ -0,0 +1,52 @@
+# Recipe: Qwen3.5-397B-A17B-INT4-Autoround
+# Qwen3.5-397B model in Intel INT4-Autoround quantization, TP=4 across 4 DGX Spark nodes
+# Benchmarked at 37 tok/s single-user, 103 tok/s aggregate (4 concurrent) on 4× DGX Spark
+# Requires NVIDIA driver 580.x (590.x has CUDAGraph deadlock bug on GB10)
+
+recipe_version: "1"
+name: Qwen3.5-397B-INT4-Autoround
+description: Qwen3.5-397B with TP=4 across 4 DGX Spark nodes (Marlin fix applied)
+
+# HuggingFace model to download (optional, for --download-model)
+model: Intel/Qwen3.5-397B-A17B-int4-AutoRound
+
+# Container image to use
+container: vllm-node-tf5
+
+build_args:
+  - --tf5
+
+# Mods required: coder-next tool/reasoning parser + Marlin TP fix
+mods:
+  - mods/fix-qwen3-coder-next
+  - mods/fix-qwen35-tp4-marlin
+
+# Environment variables
+env:
+  VLLM_MARLIN_USE_ATOMIC_ADD: 1
+
+# Default settings (can be overridden via CLI, e.g. --tensor_parallel 2)
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  tensor_parallel: 4
+  gpu_memory_utilization: 0.78
+  max_model_len: 32768
+  max_num_batched_tokens: 8192
+
+# The vLLM serve command template
+command: |
+  vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
+    --tool-call-parser qwen3_coder \
+    --reasoning-parser qwen3 \
+    --enable-auto-tool-choice \
+    --tensor-parallel-size {tensor_parallel} \
+    --distributed-executor-backend ray \
+    --kv-cache-dtype fp8 \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    --max-model-len {max_model_len} \
+    --max-num-batched-tokens {max_num_batched_tokens} \
+    --enable-prefix-caching \
+    --trust-remote-code \
+    --host {host} \
+    --port {port}

From 3baca14eb1364b2cb039fa5339d09a36b577896d Mon Sep 17 00:00:00 2001
From: sonusflow
Date: Wed, 11 Mar 2026 07:29:45 +0000
Subject: [PATCH 2/2] Move recipe to 4x-spark-cluster/ and add UMA memory
 optimizations

- Move qwen3.5-397b-int4-autoround.yaml to recipes/4x-spark-cluster/
  per maintainer request (multi-node recipes live in a separate directory)
- Add PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to the recipe env
- Optimize Ray for GB10 UMA (128GB shared CPU/GPU memory):
  - Disable the Ray dashboard (saves ~1.2 GiB per node)
  - Limit the Ray object store to 1 GiB (default is 30% of RAM = 33 GiB)
  - Disable pre-started idle workers (saves ~8 GiB on the head node)
  - Set --num-cpus 2 and --disable-usage-stats on all nodes
  - Net effect: ~40+ GiB freed across the 4-node cluster for model
    weights and KV cache (verification sketch below)
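A quick way to confirm the object-store cap took effect on every node
(a sketch; the resource key and its units follow recent Ray releases
and may vary by version):

    # Run inside the container once the cluster is up (after `ray start`).
    import ray

    ray.init(address="auto")
    for node in ray.nodes():
        # Expect ~1 GiB per node instead of the ~33 GiB default.
        store_bytes = node["Resources"].get("object_store_memory", 0)
        print(node["NodeManagerAddress"], f"{store_bytes / 2**30:.1f} GiB")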
---
 .../qwen3.5-397b-int4-autoround.yaml |  1 +
 run-cluster-node.sh                  | 14 +++++++++-----
 2 files changed, 10 insertions(+), 5 deletions(-)
 rename recipes/{ => 4x-spark-cluster}/qwen3.5-397b-int4-autoround.yaml (96%)

diff --git a/recipes/qwen3.5-397b-int4-autoround.yaml b/recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
similarity index 96%
rename from recipes/qwen3.5-397b-int4-autoround.yaml
rename to recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
index 073741a..c1d4d85 100644
--- a/recipes/qwen3.5-397b-int4-autoround.yaml
+++ b/recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
@@ -24,6 +24,7 @@ mods:
 # Environment variables
 env:
   VLLM_MARLIN_USE_ATOMIC_ADD: 1
+  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
 
 # Default settings (can be overridden via CLI, e.g. --tensor_parallel 2)
 defaults:
diff --git a/run-cluster-node.sh b/run-cluster-node.sh
index 2ec9049..796fe06 100755
--- a/run-cluster-node.sh
+++ b/run-cluster-node.sh
@@ -101,19 +101,23 @@ export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
 export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
 export_persist RAY_memory_monitor_refresh_ms "0"
 
+# UMA Memory Optimization (DGX Spark 128GB shared CPU/GPU memory)
+# Disable pre-started idle workers (saves ~8 GiB on head node)
+export_persist RAY_num_prestart_python_workers "0"
+# Limit object store to 1 GiB (default 30% of RAM = 33 GiB, wastes UMA)
+export_persist RAY_object_store_memory "1073741824"
+
 # --- Execution ---
 if [ "${NODE_TYPE}" == "head" ]; then
     echo "Starting Ray HEAD node..."
-    exec ray start --block --head --port 6379 \
+    exec ray start --block --head --port 6379 --object-store-memory 1073741824 --num-cpus 2 \
         --node-ip-address "$VLLM_HOST_IP" \
-        --include-dashboard=True \
-        --dashboard-host "0.0.0.0" \
-        --dashboard-port 8265 \
+        --include-dashboard=false \
         --disable-usage-stats
 else
     echo "Starting Ray WORKER node connecting to $HEAD_IP..."
-    exec ray start --block \
+    exec ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats \
         --address="$HEAD_IP:6379" \
         --node-ip-address "$VLLM_HOST_IP"
 fi
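
Once the head node is serving, a minimal smoke test against the
OpenAI-compatible endpoint (illustrative; substitute your head node's
address for localhost, and the recipe's port if you changed it):

    import json, urllib.request

    req = urllib.request.Request(
        "http://localhost:8000/v1/chat/completions",
        data=json.dumps({
            "model": "Intel/Qwen3.5-397B-A17B-int4-AutoRound",
            "messages": [{"role": "user", "content": "Say hello."}],
            "max_tokens": 32,
        }).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        print(json.loads(resp.read())["choices"][0]["message"]["content"])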