Add Qwen3.5-397B INT4-AutoRound TP=4 recipe and Marlin fix
Production-tested recipe for running Qwen3.5-397B-A17B with INT4 AutoRound quantization across 4 DGX Spark nodes using tensor parallelism.

Performance (4× DGX Spark, driver 580.126.09):
- Single user: 37 tok/s
- 4 concurrent: ~26 tok/s per user, ~103 tok/s aggregate

The Marlin TP fix resolves the MIN_THREAD_N=64 constraint that breaks in_proj_ba layers at TP=4 (output_size=128/4=32 < 64). Solution: ReplicatedLinear for B/A projections, applied via diff patches.

Key config:
- VLLM_MARLIN_USE_ATOMIC_ADD=1 (required for Marlin correctness)
- KV cache FP8, prefix caching enabled
- gpu_memory_utilization 0.78 (UMA safe margin)
- CUDAGraphs enabled (default, requires driver 580.x)

Note: Driver 590.x has a CUDAGraph capture deadlock on GB10 unified memory. Stay on driver 580.126.09.
This commit is contained in:
23
mods/fix-qwen35-tp4-marlin/fix_rope.py
Normal file
23
mods/fix-qwen35-tp4-marlin/fix_rope.py
Normal file
@@ -0,0 +1,23 @@
"""Patch vLLM's qwen3_5_moe config so ignore_keys_at_rope_validation is a set.

transformers combines ``ignore_keys_at_rope_validation`` with other key sets
via the ``|`` (set-union) operator, which raises ``TypeError`` when the value
is a list.  This script rewrites the list literal to a set literal in place.

Idempotent: re-running after the fix has been applied is a no-op.  Exits
non-zero when neither the old nor the new form is found, which usually means
the installed vLLM version has changed and the patch needs updating.
"""

# Default install location inside the vLLM container image.
DEFAULT_PATH = "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/configs/qwen3_5_moe.py"

# NOTE(review): the embedded indentation must match the target file exactly
# for substring replacement to find it — confirm against the installed vLLM.
OLD_SNIPPET = """kwargs["ignore_keys_at_rope_validation"] = [
"mrope_section",
"mrope_interleaved",
]"""

NEW_SNIPPET = """kwargs["ignore_keys_at_rope_validation"] = {
"mrope_section",
"mrope_interleaved",
}"""


def apply_fix(path: str = DEFAULT_PATH) -> bool:
    """Rewrite the list literal to a set literal in *path*.

    Returns True when the file was modified, False when the fix was already
    present.  Raises SystemExit when neither form is found, so callers (and
    run.sh via ``set -e``) see a hard failure instead of a silent no-op.
    """
    with open(path) as f:
        content = f.read()

    if NEW_SNIPPET in content:
        # Already patched on a previous run — nothing to do.
        print("ignore_keys_at_rope_validation is already a set; nothing to do")
        return False

    if OLD_SNIPPET not in content:
        # The original script printed "Fixed ..." unconditionally even when
        # replace() matched nothing; fail loudly instead.
        raise SystemExit(
            f"fix_rope: expected snippet not found in {path}; "
            "the installed vLLM version may have changed"
        )

    with open(path, "w") as f:
        f.write(content.replace(OLD_SNIPPET, NEW_SNIPPET))

    print("Fixed ignore_keys_at_rope_validation: list -> set")
    return True


if __name__ == "__main__":
    apply_fix()
46
mods/fix-qwen35-tp4-marlin/qwen3_5.patch
Normal file
46
mods/fix-qwen35-tp4-marlin/qwen3_5.patch
Normal file
@@ -0,0 +1,46 @@
--- qwen3_5.py.orig	2026-03-03 00:00:00.000000000 +0000
+++ qwen3_5.py	2026-03-03 00:00:00.000000000 +0000
@@ -166,11 +166,13 @@
         z_size = self.value_dim // self.tp_size
         mixed_qkv, z = mixed_qkvz.split([qkv_size, z_size], dim=-1)
         z = z.reshape(z.size(0), -1, self.head_v_dim)
-        ba, _ = self.in_proj_ba(hidden_states)
-        b, a = ba.chunk(2, dim=-1)
-
-        b = b.contiguous()
-        a = a.contiguous()
+        # Replicated B/A projections — full output, sliced to local TP partition
+        b_full, _ = self.in_proj_b(hidden_states)
+        a_full, _ = self.in_proj_a(hidden_states)
+        _ba_chunk = self.num_v_heads // self.tp_size
+        _ba_start = self.tp_rank * _ba_chunk
+        b = b_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
+        a = a_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
 
 # ============================================================
 # Part 2: Core Attention (Custom Op)
@@ -374,8 +376,6 @@
             # GDN
             ("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)),
             ("in_proj_qkvz", "in_proj_z", 3),
-            ("in_proj_ba", "in_proj_b", 0),
-            ("in_proj_ba", "in_proj_a", 1),
         ]
 
         params_dict = dict(self.named_parameters())
@@ -530,7 +530,6 @@
         "gate_up_proj": ["gate_proj", "up_proj"],
         # GDN fused projections.
         "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
-        "in_proj_ba": ["in_proj_b", "in_proj_a"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -630,7 +629,6 @@
 class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid):
     packed_modules_mapping = Qwen3VLForConditionalGeneration.packed_modules_mapping | {
         "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
-        "in_proj_ba": ["in_proj_b", "in_proj_a"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
56
mods/fix-qwen35-tp4-marlin/qwen3_next.patch
Normal file
56
mods/fix-qwen35-tp4-marlin/qwen3_next.patch
Normal file
@@ -0,0 +1,56 @@
--- qwen3_next.py.orig	2026-03-03 00:00:00.000000000 +0000
+++ qwen3_next.py	2026-03-03 00:00:00.000000000 +0000
@@ -411,15 +411,22 @@
             quant_config=quant_config,
             prefix=f"{prefix}.in_proj_qkvz",
         )
-        # ba_proj doesn't support blockwise fp8 quantization.
-        # # in_proj_ba is defined as MergedColumnParallelLinear for
-        # compatibility with Qwen3_5.
-        self.in_proj_ba = MergedColumnParallelLinear(
+        # ba_proj: Use ReplicatedLinear to avoid Marlin TP split constraint
+        # (num_v_heads=64 is too small for TP=4 Marlin min_thread_n=64).
+        # Each rank loads full weights and slices in forward.
+        self.in_proj_b = ReplicatedLinear(
             input_size=self.hidden_size,
-            output_sizes=[self.num_v_heads] * 2,
+            output_size=self.num_v_heads,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.in_proj_ba",
+            prefix=f"{prefix}.in_proj_b",
+        )
+        self.in_proj_a = ReplicatedLinear(
+            input_size=self.hidden_size,
+            output_size=self.num_v_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.in_proj_a",
         )
 
         query_key_settings = (self.key_dim, 0, False)
@@ -584,7 +591,15 @@
         # Part 1: Input Projection
         # ============================================================
         projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
-        projected_states_ba, _ = self.in_proj_ba(hidden_states)
+        # Replicated B/A projections — full output, sliced to local TP partition
+        b_full, _ = self.in_proj_b(hidden_states)
+        a_full, _ = self.in_proj_a(hidden_states)
+        _ba_chunk = self.num_v_heads // self.tp_size
+        _ba_start = self.tp_rank * _ba_chunk
+        projected_states_ba = torch.cat([
+            b_full[:, _ba_start:_ba_start+_ba_chunk],
+            a_full[:, _ba_start:_ba_start+_ba_chunk],
+        ], dim=-1)
         query, key, value, z, b, a = self.fix_query_key_value_ordering(
             projected_states_qkvz, projected_states_ba
         )
@@ -1326,7 +1341,6 @@
         ],
         "gate_up_proj": ["gate_proj", "up_proj"],
         "in_proj_qkvz": ["in_proj_qkvz"],
-        "in_proj_ba": ["in_proj_ba"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
23
mods/fix-qwen35-tp4-marlin/run.sh
Executable file
23
mods/fix-qwen35-tp4-marlin/run.sh
Executable file
@@ -0,0 +1,23 @@
#!/bin/bash
# Fix Marlin TP=4 constraint for Qwen3.5-397B: in_proj_ba output_size=128 / TP=4 = 32 < min_thread_n=64
# Solution: Replace MergedColumnParallelLinear with two ReplicatedLinear for B/A projections
# Delivery: unified diff patches (portable across vLLM versions)

set -e
MOD_DIR="$(dirname "$0")"
MODELS_DIR="/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models"

echo "[fix-qwen35-tp4-marlin] Applying patches..."

# Apply one patch file, tolerating only the "already applied" case.
# GNU patch with --forward exits 1 when every hunk is already applied
# (benign re-run) and >1 on a real failure (rejects, corrupt patch).
# The previous `patch ... || { echo ...; }` form swallowed BOTH, so a
# genuinely failed patch went unnoticed until vLLM crashed at load time.
apply_patch() {
    local patch_file="$1"
    local rc=0
    patch --forward --batch -p0 -d "$MODELS_DIR" < "$patch_file" || rc=$?
    if [ "$rc" -eq 1 ]; then
        echo "[fix-qwen35-tp4-marlin] $(basename "$patch_file") already applied; skipping"
    elif [ "$rc" -gt 1 ]; then
        echo "[fix-qwen35-tp4-marlin] ERROR: $(basename "$patch_file") failed to apply (exit $rc)" >&2
        exit "$rc"
    fi
}

apply_patch "$MOD_DIR/qwen3_next.patch"
apply_patch "$MOD_DIR/qwen3_5.patch"

# Fix rope validation (idempotent)
python3 "$MOD_DIR/fix_rope.py"

echo "[fix-qwen35-tp4-marlin] Done."
52
recipes/qwen3.5-397b-int4-autoround.yaml
Normal file
52
recipes/qwen3.5-397b-int4-autoround.yaml
Normal file
@@ -0,0 +1,52 @@
# Recipe: Qwen3.5-397B-A17B-INT4-Autoround
# Qwen3.5-397B model in Intel INT4-Autoround quantization, TP=4 across 4 DGX Spark nodes
# Benchmarked at 37 tok/s single-user, 103 tok/s aggregate (4 concurrent) on 4× DGX Spark
# Requires NVIDIA driver 580.x (590.x has CUDAGraph deadlock bug on GB10)

recipe_version: "1"
name: Qwen3.5-397B-INT4-Autoround
description: Qwen3.5-397B with TP=4 across 4 DGX Spark nodes (Marlin fix applied)

# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3.5-397B-A17B-int4-AutoRound

# Container image to use
container: vllm-node-tf5

build_args:
  - --tf5

# Mods required: coder-next tool/reasoning parser + Marlin TP fix
mods:
  - mods/fix-qwen3-coder-next
  - mods/fix-qwen35-tp4-marlin

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1

# Default settings (can be overridden via CLI, e.g. --tensor_parallel 2)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 4
  gpu_memory_utilization: 0.78
  max_model_len: 32768
  max_num_batched_tokens: 8192

# The vLLM serve command template
command: |
  vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    --enable-auto-tool-choice \
    --tensor-parallel-size {tensor_parallel} \
    --distributed-executor-backend ray \
    --kv-cache-dtype fp8 \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --max-model-len {max_model_len} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --enable-prefix-caching \
    --trust-remote-code \
    --host {host} \
    --port {port}
Reference in New Issue
Block a user