Qwen3-Coder-Next fixes and updated recipe
This commit is contained in:
10
README.md
10
README.md
@@ -164,6 +164,16 @@ Don't do it every time you rebuild, because it will slow down compilation times.
|
|||||||
|
|
||||||
For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
|
For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
|
||||||
|
|
||||||
|
### 2026-02-12
|
||||||
|
|
||||||
|
Added a mod for Qwen3-Coder-Next-FP8 that fixes:
|
||||||
|
|
||||||
|
- A bug with the Triton allocator (https://github.com/vllm-project/vllm/issues/33857) that prevented the model from running in a cluster.
|
||||||
|
- A bug that caused a crash when `--enable-prefix-caching` is on (https://github.com/vllm-project/vllm/issues/34361).
|
||||||
|
- A bug that significantly impacted the performance on Spark (https://github.com/vllm-project/vllm/issues/34413).
|
||||||
|
|
||||||
|
This mod was included in `qwen3-coder-next-fp8` recipe.
|
||||||
|
|
||||||
### 2026-02-11
|
### 2026-02-11
|
||||||
|
|
||||||
#### Configurable GPU Architecture
|
#### Configurable GPU Architecture
|
||||||
|
|||||||
1
mods/fix-qwen3-coder-next/_triton_alloc_setup.pth
Normal file
1
mods/fix-qwen3-coder-next/_triton_alloc_setup.pth
Normal file
@@ -0,0 +1 @@
|
|||||||
|
import _triton_alloc_setup
|
||||||
9
mods/fix-qwen3-coder-next/_triton_alloc_setup.py
Normal file
9
mods/fix-qwen3-coder-next/_triton_alloc_setup.py
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
try:
|
||||||
|
import triton.runtime._allocation as _alloc
|
||||||
|
import torch
|
||||||
|
|
||||||
|
_alloc.NullAllocator.__call__ = staticmethod(
|
||||||
|
lambda size, alignment, stream:
|
||||||
|
torch.cuda.caching_allocator_alloc(size, stream=stream))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
14
mods/fix-qwen3-coder-next/fix_crash.diff
Normal file
14
mods/fix-qwen3-coder-next/fix_crash.diff
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
|
||||||
|
index 0b6b7ed42ac1..b6e0305a312d 100644
|
||||||
|
--- a/vllm/v1/core/single_type_kv_cache_manager.py
|
||||||
|
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
|
||||||
|
@@ -1000,7 +1000,8 @@ def cache_blocks(self, request: Request, num_tokens: int) -> None:
|
||||||
|
for block in self.req_to_blocks[request.request_id][
|
||||||
|
num_cached_blocks_before:num_cached_blocks_after
|
||||||
|
]:
|
||||||
|
- assert block.block_hash is not None
|
||||||
|
+ if block.is_null:
|
||||||
|
+ continue
|
||||||
|
self.cached_blocks_this_step.add(block.block_hash)
|
||||||
|
|
||||||
|
def new_step_starts(self) -> None:
|
||||||
72
mods/fix-qwen3-coder-next/fix_slowness.diff
Normal file
72
mods/fix-qwen3-coder-next/fix_slowness.diff
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
|
||||||
|
index 63aae43c3ddf..6ca3213fbd8d 100644
|
||||||
|
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
|
||||||
|
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
|
||||||
|
@@ -95,19 +95,19 @@ def fused_moe_kernel_gptq_awq(
|
||||||
|
# moving by 1 element in a particular dimension. E.g. `stride_am` is
|
||||||
|
# how much to increase `a_ptr` by to get the element one row down
|
||||||
|
# (A has M rows).
|
||||||
|
- stride_am,
|
||||||
|
- stride_ak,
|
||||||
|
- stride_be,
|
||||||
|
- stride_bk,
|
||||||
|
- stride_bn,
|
||||||
|
- stride_cm,
|
||||||
|
- stride_cn,
|
||||||
|
- stride_bse,
|
||||||
|
- stride_bsk,
|
||||||
|
- stride_bsn,
|
||||||
|
- stride_bze,
|
||||||
|
- stride_bzk,
|
||||||
|
- stride_bzn,
|
||||||
|
+ stride_am: tl.int64,
|
||||||
|
+ stride_ak: tl.int64,
|
||||||
|
+ stride_be: tl.int64,
|
||||||
|
+ stride_bk: tl.int64,
|
||||||
|
+ stride_bn: tl.int64,
|
||||||
|
+ stride_cm: tl.int64,
|
||||||
|
+ stride_cn: tl.int64,
|
||||||
|
+ stride_bse: tl.int64,
|
||||||
|
+ stride_bsk: tl.int64,
|
||||||
|
+ stride_bsn: tl.int64,
|
||||||
|
+ stride_bze: tl.int64,
|
||||||
|
+ stride_bzk: tl.int64,
|
||||||
|
+ stride_bzn: tl.int64,
|
||||||
|
block_k_diviable: tl.constexpr,
|
||||||
|
group_size: tl.constexpr,
|
||||||
|
# Meta-parameters
|
||||||
|
@@ -329,20 +329,20 @@ def fused_moe_kernel(
|
||||||
|
# moving by 1 element in a particular dimension. E.g. `stride_am` is
|
||||||
|
# how much to increase `a_ptr` by to get the element one row down
|
||||||
|
# (A has M rows).
|
||||||
|
- stride_am,
|
||||||
|
- stride_ak,
|
||||||
|
- stride_be,
|
||||||
|
- stride_bk,
|
||||||
|
- stride_bn,
|
||||||
|
- stride_cm,
|
||||||
|
- stride_cn,
|
||||||
|
- stride_asm,
|
||||||
|
- stride_ask,
|
||||||
|
- stride_bse,
|
||||||
|
- stride_bsk,
|
||||||
|
- stride_bsn,
|
||||||
|
- stride_bbe, # bias expert stride
|
||||||
|
- stride_bbn, # bias N stride
|
||||||
|
+ stride_am: tl.int64,
|
||||||
|
+ stride_ak: tl.int64,
|
||||||
|
+ stride_be: tl.int64,
|
||||||
|
+ stride_bk: tl.int64,
|
||||||
|
+ stride_bn: tl.int64,
|
||||||
|
+ stride_cm: tl.int64,
|
||||||
|
+ stride_cn: tl.int64,
|
||||||
|
+ stride_asm: tl.int64,
|
||||||
|
+ stride_ask: tl.int64,
|
||||||
|
+ stride_bse: tl.int64,
|
||||||
|
+ stride_bsk: tl.int64,
|
||||||
|
+ stride_bsn: tl.int64,
|
||||||
|
+ stride_bbe: tl.int64, # bias expert stride
|
||||||
|
+ stride_bbn: tl.int64, # bias N stride
|
||||||
|
# Block size for block-wise quantization
|
||||||
|
group_n: tl.constexpr,
|
||||||
|
group_k: tl.constexpr,
|
||||||
11
mods/fix-qwen3-coder-next/run.sh
Normal file
11
mods/fix-qwen3-coder-next/run.sh
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
echo "Patching Qwen3-Coder-Next crashing on start"
|
||||||
|
patch -p1 -d /usr/local/lib/python3.12/dist-packages < fix_crash.diff || echo "Patch is not applicable, skipping"
|
||||||
|
|
||||||
|
echo "Reverting PR #34279 that causes slowness"
|
||||||
|
patch -p1 -R -d /usr/local/lib/python3.12/dist-packages < fix_slowness.diff || echo "Reversing PR #34279 failed, skipping"
|
||||||
|
|
||||||
|
echo "Fixing Triton allocator bug"
|
||||||
|
cp _triton* /usr/local/lib/python3.12/dist-packages/
|
||||||
@@ -1,30 +1,30 @@
|
|||||||
# Recipe: Qwen3-Coder-Next-FP8
|
# Recipe: Qwen3-Coder-Next-FP8
|
||||||
# Qwen3-Coder-Next model in native FP8 format
|
# Qwen3-Coder-Next model in native FP8 format
|
||||||
# Currently can only be run in solo mode; cluster mode fails with an error - tracking https://github.com/vllm-project/vllm/issues/33857
|
|
||||||
|
|
||||||
recipe_version: "1"
|
recipe_version: "1"
|
||||||
name: Qwen3-Coder-Next-FP8
|
name: Qwen3-Coder-Next-FP8
|
||||||
description: vLLM serving Qwen3-Coder-Next-FP8 on a SINGLE NODE ONLY!
|
description: vLLM serving Qwen3-Coder-Next-FP8
|
||||||
|
|
||||||
# HuggingFace model to download (optional, for --download-model)
|
# HuggingFace model to download (optional, for --download-model)
|
||||||
model: Qwen/Qwen3-Coder-Next-FP8
|
model: Qwen/Qwen3-Coder-Next-FP8
|
||||||
|
|
||||||
# This model can only run on single node (solo)
|
#solo_only: true
|
||||||
solo_only: true
|
|
||||||
|
|
||||||
# Container image to use
|
# Container image to use
|
||||||
container: vllm-node
|
container: vllm-node
|
||||||
|
|
||||||
# No mods required
|
# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857)
|
||||||
mods: []
|
mods:
|
||||||
|
- mods/fix-qwen3-coder-next
|
||||||
|
|
||||||
# Default settings (can be overridden via CLI)
|
# Default settings (can be overridden via CLI)
|
||||||
defaults:
|
defaults:
|
||||||
port: 8000
|
port: 8000
|
||||||
host: 0.0.0.0
|
host: 0.0.0.0
|
||||||
tensor_parallel: 1
|
tensor_parallel: 2
|
||||||
gpu_memory_utilization: 0.7
|
gpu_memory_utilization: 0.7
|
||||||
max_model_len: 131072
|
max_model_len: 262144
|
||||||
|
|
||||||
# Environment variables
|
# Environment variables
|
||||||
env: {}
|
env: {}
|
||||||
@@ -40,4 +40,7 @@ command: |
|
|||||||
--load-format fastsafetensors \
|
--load-format fastsafetensors \
|
||||||
--attention-backend flashinfer \
|
--attention-backend flashinfer \
|
||||||
--enable-prefix-caching \
|
--enable-prefix-caching \
|
||||||
--max-model-len {max_model_len}
|
--max-model-len {max_model_len} \
|
||||||
|
-tp {tensor_parallel} \
|
||||||
|
--distributed-executor-backend ray
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user