Merge pull request #90 from sonusflow/pr/qwen35-397b-tp4

Add Qwen3.5-397B INT4-AutoRound TP=4 recipe (37 tok/s)
eugr
2026-03-12 15:04:23 -07:00
committed by GitHub
6 changed files with 210 additions and 5 deletions

View File

@@ -0,0 +1,23 @@
# Fix: vLLM passes ignore_keys_at_rope_validation as a list, but transformers
# combines it with | (set union), which raises TypeError on a list.
path = "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/configs/qwen3_5_moe.py"
with open(path) as f:
    content = f.read()
old = """kwargs["ignore_keys_at_rope_validation"] = [
            "mrope_section",
            "mrope_interleaved",
        ]"""
new = """kwargs["ignore_keys_at_rope_validation"] = {
            "mrope_section",
            "mrope_interleaved",
        }"""
if old in content:
    with open(path, "w") as f:
        f.write(content.replace(old, new))
    print("Fixed ignore_keys_at_rope_validation: list -> set")
else:
    print("ignore_keys_at_rope_validation already patched or source changed; skipping")

View File

@@ -0,0 +1,46 @@
--- qwen3_5.py.orig 2026-03-03 00:00:00.000000000 +0000
+++ qwen3_5.py 2026-03-03 00:00:00.000000000 +0000
@@ -166,11 +166,13 @@
z_size = self.value_dim // self.tp_size
mixed_qkv, z = mixed_qkvz.split([qkv_size, z_size], dim=-1)
z = z.reshape(z.size(0), -1, self.head_v_dim)
- ba, _ = self.in_proj_ba(hidden_states)
- b, a = ba.chunk(2, dim=-1)
-
- b = b.contiguous()
- a = a.contiguous()
+ # Replicated B/A projections — full output, sliced to local TP partition
+ b_full, _ = self.in_proj_b(hidden_states)
+ a_full, _ = self.in_proj_a(hidden_states)
+ _ba_chunk = self.num_v_heads // self.tp_size
+ _ba_start = self.tp_rank * _ba_chunk
+ b = b_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
+ a = a_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
# ============================================================
# Part 2: Core Attention (Custom Op)
@@ -374,8 +376,6 @@
# GDN
("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)),
("in_proj_qkvz", "in_proj_z", 3),
- ("in_proj_ba", "in_proj_b", 0),
- ("in_proj_ba", "in_proj_a", 1),
]
params_dict = dict(self.named_parameters())
@@ -530,7 +530,6 @@
"gate_up_proj": ["gate_proj", "up_proj"],
# GDN fused projections.
"in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
- "in_proj_ba": ["in_proj_b", "in_proj_a"],
}
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -630,7 +629,6 @@
class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid):
packed_modules_mapping = Qwen3VLForConditionalGeneration.packed_modules_mapping | {
"in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
- "in_proj_ba": ["in_proj_b", "in_proj_a"],
}
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
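The replacement forward logic is ordinary tensor slicing: every rank computes the full B (and A) projection, then keeps its own num_v_heads // tp_size columns. A toy sketch with the shapes implied by the patch comments (num_v_heads=64, TP=4; the token count of 2 is arbitrary):

import torch

num_v_heads, tp_size = 64, 4            # values cited in this PR's comments
b_full = torch.randn(2, num_v_heads)    # (tokens, num_v_heads), replicated on every rank
chunk = num_v_heads // tp_size          # 16 heads per rank
slices = []
for tp_rank in range(tp_size):
    start = tp_rank * chunk
    slices.append(b_full[:, start:start + chunk].contiguous())
# The per-rank slices tile the full projection exactly, so the math matches
# the column-parallel layout the fused in_proj_ba used to produce.
assert torch.equal(torch.cat(slices, dim=-1), b_full)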

View File

@@ -0,0 +1,56 @@
--- qwen3_next.py.orig 2026-03-03 00:00:00.000000000 +0000
+++ qwen3_next.py 2026-03-03 00:00:00.000000000 +0000
@@ -411,15 +411,22 @@
quant_config=quant_config,
prefix=f"{prefix}.in_proj_qkvz",
)
- # ba_proj doesn't support blockwise fp8 quantization.
- # # in_proj_ba is defined as MergedColumnParallelLinear for
- # compatibility with Qwen3_5.
- self.in_proj_ba = MergedColumnParallelLinear(
+ # ba_proj: Use ReplicatedLinear to avoid Marlin TP split constraint
+ # (num_v_heads=64 is too small for TP=4 Marlin min_thread_n=64).
+ # Each rank loads full weights and slices in forward.
+ self.in_proj_b = ReplicatedLinear(
input_size=self.hidden_size,
- output_sizes=[self.num_v_heads] * 2,
+ output_size=self.num_v_heads,
bias=False,
quant_config=quant_config,
- prefix=f"{prefix}.in_proj_ba",
+ prefix=f"{prefix}.in_proj_b",
+ )
+ self.in_proj_a = ReplicatedLinear(
+ input_size=self.hidden_size,
+ output_size=self.num_v_heads,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.in_proj_a",
)
query_key_settings = (self.key_dim, 0, False)
@@ -584,7 +591,15 @@
# Part 1: Input Projection
# ============================================================
projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
- projected_states_ba, _ = self.in_proj_ba(hidden_states)
+ # Replicated B/A projections — full output, sliced to local TP partition
+ b_full, _ = self.in_proj_b(hidden_states)
+ a_full, _ = self.in_proj_a(hidden_states)
+ _ba_chunk = self.num_v_heads // self.tp_size
+ _ba_start = self.tp_rank * _ba_chunk
+ projected_states_ba = torch.cat([
+ b_full[:, _ba_start:_ba_start+_ba_chunk],
+ a_full[:, _ba_start:_ba_start+_ba_chunk],
+ ], dim=-1)
query, key, value, z, b, a = self.fix_query_key_value_ordering(
projected_states_qkvz, projected_states_ba
)
@@ -1326,7 +1341,6 @@
],
"gate_up_proj": ["gate_proj", "up_proj"],
"in_proj_qkvz": ["in_proj_qkvz"],
- "in_proj_ba": ["in_proj_ba"],
}
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
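The constraint arithmetic stated in the comments above, spelled out (min_thread_n=64 is taken from those comments, not read out of vLLM):

MIN_THREAD_N = 64                 # Marlin limit per the patch comments
num_v_heads = 64
tp_size = 4

# Old layout: fused B+A output of 2 * 64 = 128 columns, column-split across TP ranks.
per_rank = (num_v_heads * 2) // tp_size
print(per_rank, per_rank >= MIN_THREAD_N)    # 32 False, so Marlin rejects it

# New layout: two ReplicatedLinear layers, each keeping the full 64-column
# output on every rank, so no TP split ever drops below the limit.
print(num_v_heads >= MIN_THREAD_N)           # True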

View File

@@ -0,0 +1,23 @@
#!/bin/bash
# Fix Marlin TP=4 constraint for Qwen3.5-397B: in_proj_ba output_size=128 / TP=4 = 32 < min_thread_n=64
# Solution: Replace MergedColumnParallelLinear with two ReplicatedLinear for B/A projections
# Delivery: unified diff patches (portable across vLLM versions)
set -e
MOD_DIR="$(dirname "$0")"
MODELS_DIR="/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models"
echo "[fix-qwen35-tp4-marlin] Applying patches..."
# Apply patches with --forward (skip if already applied)
patch --forward --batch -p0 -d "$MODELS_DIR" < "$MOD_DIR/qwen3_next.patch" || {
    echo "[fix-qwen35-tp4-marlin] qwen3_next.patch already applied or failed"
}
patch --forward --batch -p0 -d "$MODELS_DIR" < "$MOD_DIR/qwen3_5.patch" || {
    echo "[fix-qwen35-tp4-marlin] qwen3_5.patch already applied or failed"
}
# Fix rope validation (idempotent)
python3 "$MOD_DIR/fix_rope.py"
echo "[fix-qwen35-tp4-marlin] Done."

View File

@@ -0,0 +1,53 @@
# Recipe: Qwen3.5-397B-A17B-INT4-AutoRound
# Qwen3.5-397B in Intel's INT4 AutoRound quantization, TP=4 across 4 DGX Spark nodes
# Benchmarked at 37 tok/s single-user, 103 tok/s aggregate (4 concurrent) on 4× DGX Spark
# Requires NVIDIA driver 580.x (590.x has a CUDAGraph deadlock bug on GB10)
recipe_version: "1"
name: Qwen3.5-397B-INT4-Autoround
description: Qwen3.5-397B with TP=4 across 4 DGX Spark nodes (Marlin fix applied)

# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3.5-397B-A17B-int4-AutoRound

# Container image to use
container: vllm-node-tf5
build_args:
  - --tf5

# Mods required: coder-next tool/reasoning parser + Marlin TP fix
mods:
  - mods/fix-qwen3-coder-next
  - mods/fix-qwen35-tp4-marlin

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1
  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True

# Default settings (can be overridden via CLI, e.g. --tensor_parallel 2)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 4
  gpu_memory_utilization: 0.78
  max_model_len: 32768
  max_num_batched_tokens: 8192

# The vLLM serve command template
command: |
  vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    --enable-auto-tool-choice \
    --tensor-parallel-size {tensor_parallel} \
    --distributed-executor-backend ray \
    --kv-cache-dtype fp8 \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --max-model-len {max_model_len} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --enable-prefix-caching \
    --trust-remote-code \
    --host {host} \
    --port {port}
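The {placeholder} names in the command template mirror the keys under defaults, so a renderer only needs str.format. A minimal sketch, assuming plain format substitution is all the recipe runner does (the real tool may layer CLI overrides and validation on top):

# Render the serve command from the defaults; a CLI override such as
# --tensor_parallel 2 would just replace that key before formatting.
defaults = {
    "port": 8000,
    "host": "0.0.0.0",
    "tensor_parallel": 4,
    "gpu_memory_utilization": 0.78,
    "max_model_len": 32768,
    "max_num_batched_tokens": 8192,
}
template = (
    "vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound "
    "--tensor-parallel-size {tensor_parallel} "
    "--gpu-memory-utilization {gpu_memory_utilization} "
    "--max-model-len {max_model_len} "
    "--max-num-batched-tokens {max_num_batched_tokens} "
    "--host {host} --port {port}"
)
print(template.format(**defaults))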

View File

@@ -101,19 +101,23 @@ export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist RAY_memory_monitor_refresh_ms "0"
+ # UMA Memory Optimization (DGX Spark 128GB shared CPU/GPU memory)
+ # Disable pre-started idle workers (saves ~8 GiB on head node)
+ export_persist RAY_num_prestart_python_workers "0"
+ # Limit object store to 1 GiB (default 30% of RAM = 33 GiB, wastes UMA)
+ export_persist RAY_object_store_memory "1073741824"
# --- Execution ---
if [ "${NODE_TYPE}" == "head" ]; then
echo "Starting Ray HEAD node..."
- exec ray start --block --head --port 6379 \
+ exec ray start --block --head --port 6379 --object-store-memory 1073741824 --num-cpus 2 \
--node-ip-address "$VLLM_HOST_IP" \
- --include-dashboard=True \
- --dashboard-host "0.0.0.0" \
- --dashboard-port 8265 \
+ --include-dashboard=false \
--disable-usage-stats
else
echo "Starting Ray WORKER node connecting to $HEAD_IP..."
- exec ray start --block \
+ exec ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats \
--address="$HEAD_IP:6379" \
--node-ip-address "$VLLM_HOST_IP"
fi
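For scale, the override pins Ray's object store to exactly 1 GiB, versus the roughly 33 GiB the comment above says the 30%-of-RAM default would claim from the 128 GB UMA pool (figures taken from that comment, not queried from Ray):

GIB = 1024 ** 3
assert 1073741824 == 1 * GIB    # the --object-store-memory value is exactly 1 GiB
# Reclaiming the ~32 GiB difference leaves that UMA memory for model weights
# and KV cache instead of Ray's object store.
print(f"object store capped at {1073741824 // GIB} GiB")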