Merge branch 'main' of github.com:eugr/spark-vllm-docker
mods/fix-qwen35-tp4-marlin/fix_rope.py (new file, 23 lines)
@@ -0,0 +1,23 @@
# Fix: ignore_keys_at_rope_validation is built as a list, but transformers
# combines it with another set via | (set union), which fails on a list.

path = "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/configs/qwen3_5_moe.py"
with open(path) as f:
    content = f.read()

old = """kwargs["ignore_keys_at_rope_validation"] = [
            "mrope_section",
            "mrope_interleaved",
        ]"""

new = """kwargs["ignore_keys_at_rope_validation"] = {
            "mrope_section",
            "mrope_interleaved",
        }"""

content = content.replace(old, new)

with open(path, "w") as f:
    f.write(content)

print("Fixed ignore_keys_at_rope_validation: list -> set")
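Note on why the one-character change works: Python defines | (union) for sets but not for lists, so when transformers unions its own ignore set with the config-supplied value, a list raises TypeError. A minimal standalone reproduction; the "rope_theta" default below is a made-up stand-in for whatever keys transformers actually ignores:

default_ignore = {"rope_theta"}  # hypothetical stand-in for transformers' built-in ignore set

print(default_ignore | {"mrope_section", "mrope_interleaved"})   # set | set: union works

try:
    default_ignore | ["mrope_section", "mrope_interleaved"]      # set | list: the bug
except TypeError as e:
    print(e)  # unsupported operand type(s) for |: 'set' and 'list'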
mods/fix-qwen35-tp4-marlin/qwen3_5.patch (new file, 46 lines)
@@ -0,0 +1,46 @@
--- qwen3_5.py.orig	2026-03-03 00:00:00.000000000 +0000
+++ qwen3_5.py	2026-03-03 00:00:00.000000000 +0000
@@ -166,11 +166,13 @@
         z_size = self.value_dim // self.tp_size
         mixed_qkv, z = mixed_qkvz.split([qkv_size, z_size], dim=-1)
         z = z.reshape(z.size(0), -1, self.head_v_dim)
-        ba, _ = self.in_proj_ba(hidden_states)
-        b, a = ba.chunk(2, dim=-1)
-
-        b = b.contiguous()
-        a = a.contiguous()
+        # Replicated B/A projections: full output, sliced to the local TP partition
+        b_full, _ = self.in_proj_b(hidden_states)
+        a_full, _ = self.in_proj_a(hidden_states)
+        _ba_chunk = self.num_v_heads // self.tp_size
+        _ba_start = self.tp_rank * _ba_chunk
+        b = b_full[:, _ba_start:_ba_start + _ba_chunk].contiguous()
+        a = a_full[:, _ba_start:_ba_start + _ba_chunk].contiguous()

         # ============================================================
         # Part 2: Core Attention (Custom Op)
@@ -374,8 +376,6 @@
             # GDN
             ("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)),
             ("in_proj_qkvz", "in_proj_z", 3),
-            ("in_proj_ba", "in_proj_b", 0),
-            ("in_proj_ba", "in_proj_a", 1),
         ]

         params_dict = dict(self.named_parameters())
@@ -530,7 +530,6 @@
         "gate_up_proj": ["gate_proj", "up_proj"],
         # GDN fused projections.
         "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
-        "in_proj_ba": ["in_proj_b", "in_proj_a"],
     }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -630,7 +629,6 @@
 class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid):
     packed_modules_mapping = Qwen3VLForConditionalGeneration.packed_modules_mapping | {
         "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
-        "in_proj_ba": ["in_proj_b", "in_proj_a"],
     }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
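The slicing arithmetic introduced above can be checked in isolation. A toy sketch in plain torch, not vLLM code: nn.Linear stands in for ReplicatedLinear, hidden_size and the token count are illustrative, while num_v_heads=64 and tp_size=4 match the fix:

import torch
import torch.nn as nn

# Every TP rank computes the full projection, then keeps only its own heads.
hidden_size, num_v_heads, tp_size = 512, 64, 4
in_proj_b = nn.Linear(hidden_size, num_v_heads, bias=False)  # stand-in for ReplicatedLinear

x = torch.randn(8, hidden_size)     # [num_tokens, hidden_size]
b_full = in_proj_b(x)               # [8, 64], identical on every rank

ba_chunk = num_v_heads // tp_size   # 64 // 4 = 16 heads per rank
for tp_rank in range(tp_size):
    ba_start = tp_rank * ba_chunk
    b = b_full[:, ba_start:ba_start + ba_chunk].contiguous()  # this rank's [8, 16] slice
    assert b.shape == (8, ba_chunk)

The ranks cover all 64 heads exactly once, reproducing what the old column-parallel layer computed, at the cost of redundant compute for these two small projections.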
mods/fix-qwen35-tp4-marlin/qwen3_next.patch (new file, 56 lines)
@@ -0,0 +1,56 @@
--- qwen3_next.py.orig	2026-03-03 00:00:00.000000000 +0000
+++ qwen3_next.py	2026-03-03 00:00:00.000000000 +0000
@@ -411,15 +411,22 @@
             quant_config=quant_config,
             prefix=f"{prefix}.in_proj_qkvz",
         )
-        # ba_proj doesn't support blockwise fp8 quantization.
-        # # in_proj_ba is defined as MergedColumnParallelLinear for
-        # compatibility with Qwen3_5.
-        self.in_proj_ba = MergedColumnParallelLinear(
+        # ba_proj: Use ReplicatedLinear to avoid Marlin TP split constraint
+        # (num_v_heads=64 is too small for TP=4 Marlin min_thread_n=64).
+        # Each rank loads full weights and slices in forward.
+        self.in_proj_b = ReplicatedLinear(
             input_size=self.hidden_size,
-            output_sizes=[self.num_v_heads] * 2,
+            output_size=self.num_v_heads,
             bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.in_proj_ba",
+            prefix=f"{prefix}.in_proj_b",
+        )
+        self.in_proj_a = ReplicatedLinear(
+            input_size=self.hidden_size,
+            output_size=self.num_v_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.in_proj_a",
         )

         query_key_settings = (self.key_dim, 0, False)
@@ -584,7 +591,15 @@
         # Part 1: Input Projection
         # ============================================================
         projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
-        projected_states_ba, _ = self.in_proj_ba(hidden_states)
+        # Replicated B/A projections: full output, sliced to the local TP partition
+        b_full, _ = self.in_proj_b(hidden_states)
+        a_full, _ = self.in_proj_a(hidden_states)
+        _ba_chunk = self.num_v_heads // self.tp_size
+        _ba_start = self.tp_rank * _ba_chunk
+        projected_states_ba = torch.cat([
+            b_full[:, _ba_start:_ba_start + _ba_chunk],
+            a_full[:, _ba_start:_ba_start + _ba_chunk],
+        ], dim=-1)
         query, key, value, z, b, a = self.fix_query_key_value_ordering(
             projected_states_qkvz, projected_states_ba
         )
@@ -1326,7 +1341,6 @@
         ],
         "gate_up_proj": ["gate_proj", "up_proj"],
         "in_proj_qkvz": ["in_proj_qkvz"],
-        "in_proj_ba": ["in_proj_ba"],
     }

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
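The constraint both patches work around is plain arithmetic: a column-parallel layer shards its output dimension across TP ranks, and the Marlin kernel needs each shard to be at least min_thread_n columns wide. A quick check with the numbers quoted in the comments above and in run.sh below:

# Fused b/a output is num_v_heads * 2 = 128 columns, sharded across TP ranks.
num_v_heads = 64
min_thread_n = 64  # Marlin's minimum shard width, per the patch comment

for tp_size in (1, 2, 4):
    shard = num_v_heads * 2 // tp_size
    status = "ok" if shard >= min_thread_n else "violates Marlin"
    print(f"TP={tp_size}: shard = {shard:3d} columns -> {status}")
# TP=4 yields 32 < 64, hence ReplicatedLinear (no output sharding) instead.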
mods/fix-qwen35-tp4-marlin/run.sh (new executable file, 23 lines)
@@ -0,0 +1,23 @@
#!/bin/bash
# Fix Marlin TP=4 constraint for Qwen3.5-397B: in_proj_ba output_size=128 / TP=4 = 32 < min_thread_n=64
# Solution: Replace MergedColumnParallelLinear with two ReplicatedLinear for B/A projections
# Delivery: unified diff patches (portable across vLLM versions)

set -e
MOD_DIR="$(dirname "$0")"
MODELS_DIR="/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models"

echo "[fix-qwen35-tp4-marlin] Applying patches..."

# Apply patches with --forward (skip if already applied)
patch --forward --batch -p0 -d "$MODELS_DIR" < "$MOD_DIR/qwen3_next.patch" || {
    echo "[fix-qwen35-tp4-marlin] qwen3_next.patch already applied or failed"
}
patch --forward --batch -p0 -d "$MODELS_DIR" < "$MOD_DIR/qwen3_5.patch" || {
    echo "[fix-qwen35-tp4-marlin] qwen3_5.patch already applied or failed"
}

# Fix rope validation (idempotent)
python3 "$MOD_DIR/fix_rope.py"

echo "[fix-qwen35-tp4-marlin] Done."
recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml (new file, 53 lines)
@@ -0,0 +1,53 @@
# Recipe: Qwen3.5-397B-A17B-INT4-AutoRound
# Qwen3.5-397B in Intel INT4 AutoRound quantization, TP=4 across 4 DGX Spark nodes
# Benchmarked at 37 tok/s single-user, 103 tok/s aggregate (4 concurrent) on 4× DGX Spark
# Requires NVIDIA driver 580.x (590.x has a CUDAGraph deadlock bug on GB10)

recipe_version: "1"
name: Qwen3.5-397B-INT4-Autoround
description: Qwen3.5-397B with TP=4 across 4 DGX Spark nodes (Marlin fix applied)

# HuggingFace model to download (optional, for --download-model)
model: Intel/Qwen3.5-397B-A17B-int4-AutoRound

# Container image to use
container: vllm-node-tf5

build_args:
  - --tf5

# Mods required: coder-next tool/reasoning parser + Marlin TP fix
mods:
  - mods/fix-qwen3-coder-next
  - mods/fix-qwen35-tp4-marlin

# Environment variables
env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1
  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True

# Default settings (can be overridden via CLI, e.g. --tensor_parallel 2)
defaults:
  port: 8000
  host: 0.0.0.0
  tensor_parallel: 4
  gpu_memory_utilization: 0.78
  max_model_len: 32768
  max_num_batched_tokens: 8192

# The vLLM serve command template
command: |
  vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
    --tool-call-parser qwen3_coder \
    --reasoning-parser qwen3 \
    --enable-auto-tool-choice \
    --tensor-parallel-size {tensor_parallel} \
    --distributed-executor-backend ray \
    --kv-cache-dtype fp8 \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --max-model-len {max_model_len} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --enable-prefix-caching \
    --trust-remote-code \
    --host {host} \
    --port {port}
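The commit does not show how the {placeholder} fields are filled in. A hypothetical sketch of the mechanism, assuming the recipe runner merges defaults with CLI overrides and renders the command via Python's str.format (the names and merge order below are illustrative, not the repo's actual runner):

# Hypothetical recipe rendering; the real runner in this repo may differ.
defaults = {"port": 8000, "host": "0.0.0.0", "tensor_parallel": 4,
            "gpu_memory_utilization": 0.78, "max_model_len": 32768,
            "max_num_batched_tokens": 8192}
cli_overrides = {"tensor_parallel": 2}    # e.g. invoked with --tensor_parallel 2

settings = {**defaults, **cli_overrides}  # CLI overrides win over recipe defaults
template = ("vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound "
            "--tensor-parallel-size {tensor_parallel} "
            "--max-model-len {max_model_len} --host {host} --port {port}")
print(template.format(**settings))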
@@ -101,19 +101,23 @@ export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
 export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
 export_persist RAY_memory_monitor_refresh_ms "0"

+# UMA Memory Optimization (DGX Spark 128GB shared CPU/GPU memory)
+# Disable pre-started idle workers (saves ~8 GiB on head node)
+export_persist RAY_num_prestart_python_workers "0"
+# Limit object store to 1 GiB (default 30% of RAM = 33 GiB, wastes UMA)
+export_persist RAY_object_store_memory "1073741824"

 # --- Execution ---

 if [ "${NODE_TYPE}" == "head" ]; then
     echo "Starting Ray HEAD node..."
-    exec ray start --block --head --port 6379 \
+    exec ray start --block --head --port 6379 --object-store-memory 1073741824 --num-cpus 2 \
         --node-ip-address "$VLLM_HOST_IP" \
-        --include-dashboard=True \
-        --dashboard-host "0.0.0.0" \
-        --dashboard-port 8265 \
+        --include-dashboard=false \
         --disable-usage-stats
 else
     echo "Starting Ray WORKER node connecting to $HEAD_IP..."
-    exec ray start --block \
+    exec ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats \
         --address="$HEAD_IP:6379" \
         --node-ip-address "$VLLM_HOST_IP"
 fi
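Once the head and workers are up, the UMA-motivated caps can be sanity-checked from inside any container. A small check, assuming the standard ray Python API is available there:

import ray

ray.init(address="auto")  # attach to the cluster started above

alive = [n for n in ray.nodes() if n["Alive"]]
print(f"{len(alive)} node(s) alive")  # expect 4 on this cluster

# cluster_resources() reports object_store_memory in bytes; with the 1 GiB
# cap applied on every node, the cluster total should be about 4 * 1073741824.
print(ray.cluster_resources().get("object_store_memory"))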