Merge branch 'main' of github.com:eugr/spark-vllm-docker
This commit is contained in:
23
mods/fix-qwen35-tp4-marlin/fix_rope.py
Normal file
23
mods/fix-qwen35-tp4-marlin/fix_rope.py
Normal file
@@ -0,0 +1,23 @@
|
||||
# Fix: ignore_keys_at_rope_validation is a list but transformers uses | (set union)
#
# transformers combines this attribute with another set via the | operator, so a
# list value fails at rope-config validation time. Patch the installed vLLM
# config module in place so the value is built as a set literal instead.

import sys

# Installed module to patch (path fixed by the container's Python layout).
PATH = "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/configs/qwen3_5_moe.py"

# NOTE(review): these snippets must match the target file byte-for-byte
# (including any internal indentation) for str.replace to find them — verify
# against the installed vLLM version.
OLD = """kwargs["ignore_keys_at_rope_validation"] = [
"mrope_section",
"mrope_interleaved",
]"""

NEW = """kwargs["ignore_keys_at_rope_validation"] = {
"mrope_section",
"mrope_interleaved",
}"""


def apply_fix(content: str) -> str:
    """Return *content* with the list literal rewritten as a set literal.

    Idempotent: if the list form is absent (e.g. the set form is already in
    place) the text is returned unchanged.
    """
    return content.replace(OLD, NEW)


def main() -> None:
    """Patch the module on disk, reporting honestly whether anything changed."""
    with open(PATH) as f:
        content = f.read()

    fixed = apply_fix(content)

    if fixed != content:
        with open(PATH, "w") as f:
            f.write(fixed)
        print("Fixed ignore_keys_at_rope_validation: list -> set")
    elif NEW in content:
        # Re-run after a successful patch: nothing to do.
        print("Fixed ignore_keys_at_rope_validation: list -> set (already applied)")
    else:
        # Neither form found: the installed vLLM source no longer matches this
        # mod. Warn loudly, but keep exit status 0 so run.sh (which uses
        # `set -e` and calls this script unguarded) still succeeds, matching
        # the original script's always-succeed behavior.
        print(
            "WARNING: ignore_keys_at_rope_validation pattern not found in " + PATH,
            file=sys.stderr,
        )


if __name__ == "__main__":
    main()
|
||||
46
mods/fix-qwen35-tp4-marlin/qwen3_5.patch
Normal file
46
mods/fix-qwen35-tp4-marlin/qwen3_5.patch
Normal file
@@ -0,0 +1,46 @@
|
||||
--- qwen3_5.py.orig 2026-03-03 00:00:00.000000000 +0000
|
||||
+++ qwen3_5.py 2026-03-03 00:00:00.000000000 +0000
|
||||
@@ -166,11 +166,13 @@
|
||||
z_size = self.value_dim // self.tp_size
|
||||
mixed_qkv, z = mixed_qkvz.split([qkv_size, z_size], dim=-1)
|
||||
z = z.reshape(z.size(0), -1, self.head_v_dim)
|
||||
- ba, _ = self.in_proj_ba(hidden_states)
|
||||
- b, a = ba.chunk(2, dim=-1)
|
||||
-
|
||||
- b = b.contiguous()
|
||||
- a = a.contiguous()
|
||||
+ # Replicated B/A projections — full output, sliced to local TP partition
|
||||
+ b_full, _ = self.in_proj_b(hidden_states)
|
||||
+ a_full, _ = self.in_proj_a(hidden_states)
|
||||
+ _ba_chunk = self.num_v_heads // self.tp_size
|
||||
+ _ba_start = self.tp_rank * _ba_chunk
|
||||
+ b = b_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
|
||||
+ a = a_full[:, _ba_start:_ba_start+_ba_chunk].contiguous()
|
||||
|
||||
# ============================================================
|
||||
# Part 2: Core Attention (Custom Op)
|
||||
@@ -374,8 +376,6 @@
|
||||
# GDN
|
||||
("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)),
|
||||
("in_proj_qkvz", "in_proj_z", 3),
|
||||
- ("in_proj_ba", "in_proj_b", 0),
|
||||
- ("in_proj_ba", "in_proj_a", 1),
|
||||
]
|
||||
|
||||
params_dict = dict(self.named_parameters())
|
||||
@@ -530,7 +530,6 @@
|
||||
"gate_up_proj": ["gate_proj", "up_proj"],
|
||||
# GDN fused projections.
|
||||
"in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
|
||||
- "in_proj_ba": ["in_proj_b", "in_proj_a"],
|
||||
}
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
@@ -630,7 +629,6 @@
|
||||
class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid):
|
||||
packed_modules_mapping = Qwen3VLForConditionalGeneration.packed_modules_mapping | {
|
||||
"in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
|
||||
- "in_proj_ba": ["in_proj_b", "in_proj_a"],
|
||||
}
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
|
||||
56
mods/fix-qwen35-tp4-marlin/qwen3_next.patch
Normal file
56
mods/fix-qwen35-tp4-marlin/qwen3_next.patch
Normal file
@@ -0,0 +1,56 @@
|
||||
--- qwen3_next.py.orig 2026-03-03 00:00:00.000000000 +0000
|
||||
+++ qwen3_next.py 2026-03-03 00:00:00.000000000 +0000
|
||||
@@ -411,15 +411,22 @@
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.in_proj_qkvz",
|
||||
)
|
||||
- # ba_proj doesn't support blockwise fp8 quantization.
|
||||
- # # in_proj_ba is defined as MergedColumnParallelLinear for
|
||||
- # compatibility with Qwen3_5.
|
||||
- self.in_proj_ba = MergedColumnParallelLinear(
|
||||
+ # ba_proj: Use ReplicatedLinear to avoid Marlin TP split constraint
|
||||
+ # (num_v_heads=64 is too small for TP=4 Marlin min_thread_n=64).
|
||||
+ # Each rank loads full weights and slices in forward.
|
||||
+ self.in_proj_b = ReplicatedLinear(
|
||||
input_size=self.hidden_size,
|
||||
- output_sizes=[self.num_v_heads] * 2,
|
||||
+ output_size=self.num_v_heads,
|
||||
bias=False,
|
||||
quant_config=quant_config,
|
||||
- prefix=f"{prefix}.in_proj_ba",
|
||||
+ prefix=f"{prefix}.in_proj_b",
|
||||
+ )
|
||||
+ self.in_proj_a = ReplicatedLinear(
|
||||
+ input_size=self.hidden_size,
|
||||
+ output_size=self.num_v_heads,
|
||||
+ bias=False,
|
||||
+ quant_config=quant_config,
|
||||
+ prefix=f"{prefix}.in_proj_a",
|
||||
)
|
||||
|
||||
query_key_settings = (self.key_dim, 0, False)
|
||||
@@ -584,7 +591,15 @@
|
||||
# Part 1: Input Projection
|
||||
# ============================================================
|
||||
projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
|
||||
- projected_states_ba, _ = self.in_proj_ba(hidden_states)
|
||||
+ # Replicated B/A projections — full output, sliced to local TP partition
|
||||
+ b_full, _ = self.in_proj_b(hidden_states)
|
||||
+ a_full, _ = self.in_proj_a(hidden_states)
|
||||
+ _ba_chunk = self.num_v_heads // self.tp_size
|
||||
+ _ba_start = self.tp_rank * _ba_chunk
|
||||
+ projected_states_ba = torch.cat([
|
||||
+ b_full[:, _ba_start:_ba_start+_ba_chunk],
|
||||
+ a_full[:, _ba_start:_ba_start+_ba_chunk],
|
||||
+ ], dim=-1)
|
||||
query, key, value, z, b, a = self.fix_query_key_value_ordering(
|
||||
projected_states_qkvz, projected_states_ba
|
||||
)
|
||||
@@ -1326,7 +1341,6 @@
|
||||
],
|
||||
"gate_up_proj": ["gate_proj", "up_proj"],
|
||||
"in_proj_qkvz": ["in_proj_qkvz"],
|
||||
- "in_proj_ba": ["in_proj_ba"],
|
||||
}
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
23
mods/fix-qwen35-tp4-marlin/run.sh
Executable file
23
mods/fix-qwen35-tp4-marlin/run.sh
Executable file
@@ -0,0 +1,23 @@
|
||||
#!/bin/bash
# Fix Marlin TP=4 constraint for Qwen3.5-397B: in_proj_ba output_size=128 / TP=4 = 32 < min_thread_n=64
# Solution: Replace MergedColumnParallelLinear with two ReplicatedLinear for B/A projections
# Delivery: unified diff patches (portable across vLLM versions)

set -e
MOD_DIR="$(dirname "$0")"
MODELS_DIR="/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models"

echo "[fix-qwen35-tp4-marlin] Applying patches..."

# Apply each patch with --forward (skip if already applied). `patch --forward`
# exits nonzero on an already-patched tree, so the `|| { ... }` guard keeps
# `set -e` from aborting the script on re-runs; order matters (qwen3_next
# first, then qwen3_5).
for patch_file in qwen3_next.patch qwen3_5.patch; do
    patch --forward --batch -p0 -d "$MODELS_DIR" < "$MOD_DIR/$patch_file" || {
        echo "[fix-qwen35-tp4-marlin] $patch_file already applied or failed"
    }
done

# Fix rope validation (idempotent)
python3 "$MOD_DIR/fix_rope.py"

echo "[fix-qwen35-tp4-marlin] Done."
|
||||
Reference in New Issue
Block a user