Qwen3-Coder-Next fixes and updated recipe

Eugene Rakhmatulin
2026-02-12 15:56:32 -08:00
parent da4185cb12
commit 701147b1eb
7 changed files with 129 additions and 9 deletions


@@ -164,6 +164,16 @@ Don't do it every time you rebuild, because it will slow down compilation times.
For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
### 2026-02-12
Added a mod for Qwen3-Coder-Next-FP8 that fixes:
- A bug in the Triton allocator (https://github.com/vllm-project/vllm/issues/33857) that prevented the model from running in a cluster.
- A bug that caused a crash when `--enable-prefix-caching` is enabled (https://github.com/vllm-project/vllm/issues/34361).
- A bug that significantly degraded performance on Spark (https://github.com/vllm-project/vllm/issues/34413).
This mod is included in the `qwen3-coder-next-fp8` recipe.
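
For a quick sanity check, a minimal sketch, assuming the shim added by this commit is already installed into site-packages; `_triton_alloc_setup` is the module introduced below, and the call signature mirrors the lambda in the shim:

# Smoke test (illustrative): with the shim active, NullAllocator returns a
# CUDA pointer from PyTorch's caching allocator instead of raising.
import torch
import _triton_alloc_setup  # noqa: F401 -- importing applies the monkeypatch
import triton.runtime._allocation as _alloc

stream = torch.cuda.current_stream().cuda_stream
ptr = _alloc.NullAllocator.__call__(1024, 128, stream)  # size, alignment, stream
assert ptr != 0
torch.cuda.caching_allocator_delete(ptr)  # return the block to the pool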
### 2026-02-11
#### Configurable GPU Architecture


@@ -0,0 +1 @@
import _triton_alloc_setup
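
My reading of this one-line file, since its name is not shown in this view: it is presumably a `.pth` file that the setup script below copies into site-packages. Python's `site` module executes lines beginning with `import` in `.pth` files at interpreter startup, so the allocator patch takes effect in every Python process in the container without modifying vLLM itself.

# Illustrative: .pth hooks live in site-packages and run at interpreter startup.
import site
print(site.getsitepackages())  # e.g. ['/usr/local/lib/python3.12/dist-packages']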


@@ -0,0 +1,9 @@
# Work around https://github.com/vllm-project/vllm/issues/33857: route
# Triton's fallback allocator through PyTorch's CUDA caching allocator.
try:
    import triton.runtime._allocation as _alloc
    import torch

    # NullAllocator normally raises; make it hand back a device pointer instead.
    _alloc.NullAllocator.__call__ = staticmethod(
        lambda size, alignment, stream:
            torch.cuda.caching_allocator_alloc(size, stream=stream))
except Exception:
    # Triton is missing or its internals changed; leave the runtime untouched.
    pass
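
A note on the design: the whole patch sits in a try/except so that images without Triton (or with a Triton whose internals have moved) still start cleanly, and the replacement allocator draws from PyTorch's caching allocator, so Triton's workspace allocations presumably share the same GPU memory pool that `gpu_memory_utilization` already budgets.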


@@ -0,0 +1,14 @@
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index 0b6b7ed42ac1..b6e0305a312d 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -1000,7 +1000,8 @@ def cache_blocks(self, request: Request, num_tokens: int) -> None:
         for block in self.req_to_blocks[request.request_id][
             num_cached_blocks_before:num_cached_blocks_after
         ]:
-            assert block.block_hash is not None
+            if block.is_null:
+                continue
             self.cached_blocks_this_step.add(block.block_hash)

     def new_step_starts(self) -> None:
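
For context on the fix: with prefix caching enabled, a request's block list can contain placeholder ("null") blocks that carry no hash, and the old assert fired on them (issue 34361). A toy model of the patched loop, using stand-in classes rather than vLLM's real ones:

from dataclasses import dataclass

@dataclass
class Block:                 # stand-in for vLLM's KV cache block
    block_hash: int | None = None
    is_null: bool = False

cached_blocks_this_step: set[int] = set()
blocks = [Block(block_hash=11), Block(is_null=True), Block(block_hash=42)]
for block in blocks:
    if block.is_null:        # the fix: skip placeholder blocks ...
        continue
    cached_blocks_this_step.add(block.block_hash)  # ... instead of asserting
print(cached_blocks_this_step)  # {11, 42}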


@@ -0,0 +1,72 @@
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 63aae43c3ddf..6ca3213fbd8d 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -95,19 +95,19 @@ def fused_moe_kernel_gptq_awq(
     # moving by 1 element in a particular dimension. E.g. `stride_am` is
     # how much to increase `a_ptr` by to get the element one row down
     # (A has M rows).
-    stride_am,
-    stride_ak,
-    stride_be,
-    stride_bk,
-    stride_bn,
-    stride_cm,
-    stride_cn,
-    stride_bse,
-    stride_bsk,
-    stride_bsn,
-    stride_bze,
-    stride_bzk,
-    stride_bzn,
+    stride_am: tl.int64,
+    stride_ak: tl.int64,
+    stride_be: tl.int64,
+    stride_bk: tl.int64,
+    stride_bn: tl.int64,
+    stride_cm: tl.int64,
+    stride_cn: tl.int64,
+    stride_bse: tl.int64,
+    stride_bsk: tl.int64,
+    stride_bsn: tl.int64,
+    stride_bze: tl.int64,
+    stride_bzk: tl.int64,
+    stride_bzn: tl.int64,
     block_k_diviable: tl.constexpr,
     group_size: tl.constexpr,
     # Meta-parameters
@@ -329,20 +329,20 @@ def fused_moe_kernel(
     # moving by 1 element in a particular dimension. E.g. `stride_am` is
     # how much to increase `a_ptr` by to get the element one row down
     # (A has M rows).
-    stride_am,
-    stride_ak,
-    stride_be,
-    stride_bk,
-    stride_bn,
-    stride_cm,
-    stride_cn,
-    stride_asm,
-    stride_ask,
-    stride_bse,
-    stride_bsk,
-    stride_bsn,
-    stride_bbe,  # bias expert stride
-    stride_bbn,  # bias N stride
+    stride_am: tl.int64,
+    stride_ak: tl.int64,
+    stride_be: tl.int64,
+    stride_bk: tl.int64,
+    stride_bn: tl.int64,
+    stride_cm: tl.int64,
+    stride_cn: tl.int64,
+    stride_asm: tl.int64,
+    stride_ask: tl.int64,
+    stride_bse: tl.int64,
+    stride_bsk: tl.int64,
+    stride_bsn: tl.int64,
+    stride_bbe: tl.int64,  # bias expert stride
+    stride_bbn: tl.int64,  # bias N stride
     # Block size for block-wise quantization
     group_n: tl.constexpr,
     group_k: tl.constexpr,
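
Why this diff exists and why the setup script applies it with `-R`: PR #34279 annotated these stride arguments as `tl.int64` (presumably to guard against 32-bit overflow on very large tensors), and reverse-applying the diff reverts that change. As far as I understand, leaving an integer kernel argument unannotated lets Triton specialize the compiled kernel on its runtime value (a stride of 1 can become a compile-time constant, and divisibility hints enable better addressing), whereas an explicit `tl.int64` annotation pins it to a generic 64-bit argument; on Spark that generality apparently costs real throughput (issue 34413). A toy kernel, not vLLM's, showing the style in question:

import torch
import triton
import triton.language as tl

@triton.jit
def double_rows(x_ptr, out_ptr,
                stride_row,          # unannotated: Triton may specialize on it
                N: tl.constexpr):
    row = tl.program_id(0)
    offs = tl.arange(0, N)
    x = tl.load(x_ptr + row * stride_row + offs)
    tl.store(out_ptr + row * stride_row + offs, x * 2.0)

x = torch.randn(4, 64, device="cuda")
out = torch.empty_like(x)
double_rows[(4,)](x, out, x.stride(0), N=64)
# Writing `stride_row: tl.int64` instead would force a 64-bit runtime argument
# and skip value-based specialization, which is the change this mod reverts.
assert torch.allclose(out, 2 * x)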


@@ -0,0 +1,11 @@
#!/bin/bash
set -e
echo "Patching Qwen3-Coder-Next crashing on start"
patch -p1 -d /usr/local/lib/python3.12/dist-packages < fix_crash.diff || echo "Patch is not applicable, skipping"
echo "Reverting PR #34279 that causes slowness"
patch -p1 -R -d /usr/local/lib/python3.12/dist-packages < fix_slowness.diff || echo "Reverting PR #34279 failed, skipping"
echo "Fixing Triton allocator bug"
cp _triton* /usr/local/lib/python3.12/dist-packages/
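
Note the error handling: the script sets `set -e`, so an unguarded `patch` failure would abort the container build; the `|| echo` fallbacks keep the mod usable on base images where these fixes have already landed upstream and the patches simply no longer apply.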


@@ -1,30 +1,30 @@
 # Recipe: Qwen3-Coder-Next-FP8
 # Qwen3-Coder-Next model in native FP8 format
-# Currently can only be run in solo mode, cluster mode fails with error - tracking https://github.com/vllm-project/vllm/issues/33857
 recipe_version: "1"
 name: Qwen3-Coder-Next-FP8
-description: vLLM serving Qwen3-Coder-Next-FP8 on a SINGLE NODE ONLY!
+description: vLLM serving Qwen3-Coder-Next-FP8
 # HuggingFace model to download (optional, for --download-model)
 model: Qwen/Qwen3-Coder-Next-FP8
-# This model can only run on single node (solo)
-solo_only: true
+#solo_only: true
 # Container image to use
 container: vllm-node
-# No mods required
-mods: []
+# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857)
+mods:
+  - mods/fix-qwen3-coder-next
 # Default settings (can be overridden via CLI)
 defaults:
   port: 8000
   host: 0.0.0.0
-  tensor_parallel: 1
+  tensor_parallel: 2
   gpu_memory_utilization: 0.7
-  max_model_len: 131072
+  max_model_len: 262144
 # Environment variables
 env: {}
@@ -40,4 +40,7 @@ command: |
   --load-format fastsafetensors \
   --attention-backend flashinfer \
   --enable-prefix-caching \
-  --max-model-len {max_model_len}
+  --max-model-len {max_model_len} \
+  -tp {tensor_parallel} \
+  --distributed-executor-backend ray
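
Net effect of the recipe changes: the single-node restriction tied to issue 33857 is lifted (`solo_only` is left commented out rather than deleted, presumably so it is easy to restore if the upstream bugs regress), the defaults move to tensor parallelism of 2 with Ray as the distributed executor backend, and the default context length doubles from 131072 to 262144 tokens.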