Add Qwen3.5-397B INT4-AutoRound TP=4 recipe and Marlin fix
Production-tested recipe for running Qwen3.5-397B-A17B with INT4 AutoRound quantization across 4 DGX Spark nodes using tensor parallelism.

Performance (4× DGX Spark, driver 580.126.09):
- Single user: 37 tok/s
- 4 concurrent: ~26 tok/s per user, ~103 tok/s aggregate

The Marlin TP fix resolves the MIN_THREAD_N=64 constraint that breaks in_proj_ba layers at TP=4 (output_size=128/4=32 < 64). Solution: ReplicatedLinear for B/A projections, applied via diff patches.

Key config:
- VLLM_MARLIN_USE_ATOMIC_ADD=1 (required for Marlin correctness)
- KV cache FP8, prefix caching enabled
- gpu_memory_utilization 0.78 (UMA safe margin)
- CUDAGraphs enabled (default, requires driver 580.x)

Note: Driver 590.x has a CUDAGraph capture deadlock on GB10 unified memory. Stay on driver 580.126.09.
This commit is contained in:
23
mods/fix-qwen35-tp4-marlin/fix_rope.py
Normal file
23
mods/fix-qwen35-tp4-marlin/fix_rope.py
Normal file
@@ -0,0 +1,23 @@
"""Patch vLLM's qwen3_5_moe config so ignore_keys_at_rope_validation is a set.

transformers combines ``ignore_keys_at_rope_validation`` with other key sets
via the ``|`` (set-union) operator, which raises ``TypeError`` when the value
is a list.  This script rewrites the list literal to a set literal in place.

Idempotent: re-running after the fix has been applied is a no-op.  Exits
non-zero when neither the old nor the new form is found, which usually means
the installed vLLM version has changed and the patch needs updating.
"""

# Default install location inside the vLLM container image.
DEFAULT_PATH = "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/configs/qwen3_5_moe.py"

# NOTE(review): the embedded indentation must match the target file exactly
# for substring replacement to find it — confirm against the installed vLLM.
OLD_SNIPPET = """kwargs["ignore_keys_at_rope_validation"] = [
"mrope_section",
"mrope_interleaved",
]"""

NEW_SNIPPET = """kwargs["ignore_keys_at_rope_validation"] = {
"mrope_section",
"mrope_interleaved",
}"""


def apply_fix(path: str = DEFAULT_PATH) -> bool:
    """Rewrite the list literal to a set literal in *path*.

    Returns True when the file was modified, False when the fix was already
    present.  Raises SystemExit when neither form is found, so callers (and
    run.sh via ``set -e``) see a hard failure instead of a silent no-op.
    """
    with open(path) as f:
        content = f.read()

    if NEW_SNIPPET in content:
        # Already patched on a previous run — nothing to do.
        print("ignore_keys_at_rope_validation is already a set; nothing to do")
        return False

    if OLD_SNIPPET not in content:
        # The original script printed "Fixed ..." unconditionally even when
        # replace() matched nothing; fail loudly instead.
        raise SystemExit(
            f"fix_rope: expected snippet not found in {path}; "
            "the installed vLLM version may have changed"
        )

    with open(path, "w") as f:
        f.write(content.replace(OLD_SNIPPET, NEW_SNIPPET))

    print("Fixed ignore_keys_at_rope_validation: list -> set")
    return True


if __name__ == "__main__":
    apply_fix()
46
mods/fix-qwen35-tp4-marlin/qwen3_5.patch
Normal file
46
mods/fix-qwen35-tp4-marlin/qwen3_5.patch
Normal file
@@ -0,0 +1,46 @@
--- qwen3_5.py.orig	2026-03-03 00:00:00.000000000 +0000
+++ qwen3_5.py	2026-03-03 00:00:00.000000000 +0000
@@ -166,11 +166,13 @@
         z_size = self.value_dim // self.tp_size
         mixed_qkv, z = mixed_qkvz.split([qkv_size, z_size], dim=-1)
         z = z.reshape(z.size(0), -1, self.head_v_dim)
-        ba, _ = self.in_proj_ba(hidden_states)
-        b, a = ba.chunk(2, dim=-1)
-
-        b = b.contiguous()
-        a = a.contiguous()
+        # Replicated B/A projections — full output, sliced to local TP partition
+        b_full, _ = self.in_proj_b(hidden_states)
+        a_full, _ = self.in_proj_a(hidden_states)
+        _ba_chunk = self.num_v_heads // self.tp_size
+        _ba_start = self.tp_rank * _ba_chunk
+        b = b_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
+        a = a_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
 
 # ============================================================
 # Part 2: Core Attention (Custom Op)
@@ -374,8 +376,6 @@
             # GDN
             ("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)),
             ("in_proj_qkvz", "in_proj_z", 3),
-            ("in_proj_ba", "in_proj_b", 0),
-            ("in_proj_ba", "in_proj_a", 1),
         ]
 
         params_dict = dict(self.named_parameters())
@@ -530,7 +530,6 @@
         "gate_up_proj": ["gate_proj", "up_proj"],
         # GDN fused projections.
         "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
-        "in_proj_ba": ["in_proj_b", "in_proj_a"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -630,7 +629,6 @@
 class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid):
     packed_modules_mapping = Qwen3VLForConditionalGeneration.packed_modules_mapping | {
         "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
-        "in_proj_ba": ["in_proj_b", "in_proj_a"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
56
mods/fix-qwen35-tp4-marlin/qwen3_next.patch
Normal file
56
mods/fix-qwen35-tp4-marlin/qwen3_next.patch
Normal file
@@ -0,0 +1,56 @@
--- qwen3_next.py.orig	2026-03-03 00:00:00.000000000 +0000
+++ qwen3_next.py	2026-03-03 00:00:00.000000000 +0000
@@ -411,15 +411,22 @@
             quant_config=quant_config,
             prefix=f"{prefix}.in_proj_qkvz",
         )
-        # ba_proj doesn't support blockwise fp8 quantization.
-        # # in_proj_ba is defined as MergedColumnParallelLinear for
-        # compatibility with Qwen3_5.
-        self.in_proj_ba = MergedColumnParallelLinear(
+        # ba_proj: Use ReplicatedLinear to avoid Marlin TP split constraint
+        # (num_v_heads=64 is too small for TP=4 Marlin min_thread_n=64).
+        # Each rank loads full weights and slices in forward.
+        self.in_proj_b = ReplicatedLinear(
             input_size=self.hidden_size,
-            output_sizes=[self.num_v_heads] * 2,
+            output_size=self.num_v_heads,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.in_proj_ba",
+            prefix=f"{prefix}.in_proj_b",
+        )
+        self.in_proj_a = ReplicatedLinear(
+            input_size=self.hidden_size,
+            output_size=self.num_v_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.in_proj_a",
         )
 
         query_key_settings = (self.key_dim, 0, False)
@@ -584,7 +591,15 @@
         # Part 1: Input Projection
         # ============================================================
         projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
-        projected_states_ba, _ = self.in_proj_ba(hidden_states)
+        # Replicated B/A projections — full output, sliced to local TP partition
+        b_full, _ = self.in_proj_b(hidden_states)
+        a_full, _ = self.in_proj_a(hidden_states)
+        _ba_chunk = self.num_v_heads // self.tp_size
+        _ba_start = self.tp_rank * _ba_chunk
+        projected_states_ba = torch.cat([
+            b_full[:, _ba_start:_ba_start+_ba_chunk],
+            a_full[:, _ba_start:_ba_start+_ba_chunk],
+        ], dim=-1)
         query, key, value, z, b, a = self.fix_query_key_value_ordering(
             projected_states_qkvz, projected_states_ba
         )
@@ -1326,7 +1341,6 @@
         ],
         "gate_up_proj": ["gate_proj", "up_proj"],
         "in_proj_qkvz": ["in_proj_qkvz"],
-        "in_proj_ba": ["in_proj_ba"],
     }
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
23
mods/fix-qwen35-tp4-marlin/run.sh
Executable file
23
mods/fix-qwen35-tp4-marlin/run.sh
Executable file
@@ -0,0 +1,23 @@
#!/bin/bash
# Fix Marlin TP=4 constraint for Qwen3.5-397B: in_proj_ba output_size=128 / TP=4 = 32 < min_thread_n=64
# Solution: Replace MergedColumnParallelLinear with two ReplicatedLinear for B/A projections
# Delivery: unified diff patches (portable across vLLM versions)

set -e
MOD_DIR="$(dirname "$0")"
MODELS_DIR="/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models"

echo "[fix-qwen35-tp4-marlin] Applying patches..."

# Apply one patch file, tolerating only the "already applied" case.
# GNU patch with --forward exits 1 when every hunk is already applied
# (benign re-run) and >1 on a real failure (rejects, corrupt patch).
# The previous `patch ... || { echo ...; }` form swallowed BOTH, so a
# genuinely failed patch went unnoticed until vLLM crashed at load time.
apply_patch() {
    local patch_file="$1"
    local rc=0
    patch --forward --batch -p0 -d "$MODELS_DIR" < "$patch_file" || rc=$?
    if [ "$rc" -eq 1 ]; then
        echo "[fix-qwen35-tp4-marlin] $(basename "$patch_file") already applied; skipping"
    elif [ "$rc" -gt 1 ]; then
        echo "[fix-qwen35-tp4-marlin] ERROR: $(basename "$patch_file") failed to apply (exit $rc)" >&2
        exit "$rc"
    fi
}

apply_patch "$MOD_DIR/qwen3_next.patch"
apply_patch "$MOD_DIR/qwen3_5.patch"

# Fix rope validation (idempotent)
python3 "$MOD_DIR/fix_rope.py"

echo "[fix-qwen35-tp4-marlin] Done."
52
recipes/qwen3.5-397b-int4-autoround.yaml
Normal file
52
recipes/qwen3.5-397b-int4-autoround.yaml
Normal file
@@ -0,0 +1,52 @@
# Recipe: Qwen3.5-397B-A17B-INT4-Autoround
# Qwen3.5-397B model in Intel INT4-Autoround quantization, TP=4 across 4 DGX Spark nodes
# Benchmarked at 37 tok/s single-user, 103 tok/s aggregate (4 concurrent) on 4× DGX Spark
# Requires NVIDIA driver 580.x (590.x has CUDAGraph deadlock bug on GB10)

recipe_version: "1"
name: Qwen3.5-397B-INT4-Autoround
description: Qwen3.5-397B with TP=4 across 4 DGX Spark nodes (Marlin fix applied)

# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3.5-397B-A17B-int4-AutoRound

# Container image to use
container: vllm-node-tf5

build_args:
  - --tf5

# Mods required: coder-next tool/reasoning parser + Marlin TP fix
mods:
  - mods/fix-qwen3-coder-next
  - mods/fix-qwen35-tp4-marlin

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1

# Default settings (can be overridden via CLI, e.g. --tensor_parallel 2)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 4
  gpu_memory_utilization: 0.78
  max_model_len: 32768
  max_num_batched_tokens: 8192

# The vLLM serve command template
command: |
  vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    --enable-auto-tool-choice \
    --tensor-parallel-size {tensor_parallel} \
    --distributed-executor-backend ray \
    --kv-cache-dtype fp8 \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --max-model-len {max_model_len} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --enable-prefix-caching \
    --trust-remote-code \
    --host {host} \
    --port {port}
Reference in New Issue
Block a user