Qwen3-Coder-Next fixes and updated recipe

2026-02-12 15:56:32 -08:00
parent da4185cb12
commit 701147b1eb
7 changed files with 129 additions and 9 deletions
--- a/mods/fix-qwen3-coder-next/_triton_alloc_setup.pth
+++ b/mods/fix-qwen3-coder-next/_triton_alloc_setup.pth
@@ -0,0 +1 @@
+import _triton_alloc_setup
--- a/mods/fix-qwen3-coder-next/_triton_alloc_setup.py
+++ b/mods/fix-qwen3-coder-next/_triton_alloc_setup.py
@@ -0,0 +1,29 @@
+"""Install a default Triton scratch-memory allocator at interpreter startup.
+
+Imported by ``_triton_alloc_setup.pth``. Replaces ``NullAllocator.__call__``
+so allocation requests are served by PyTorch's CUDA caching allocator.
+"""
+import sys
+
+
+def _caching_alloc(size, alignment, stream):
+    """Return ``torch.cuda.caching_allocator_alloc(size, stream=stream)``.
+
+    ``alignment`` is accepted for signature compatibility and not forwarded.
+    """
+    # NOTE(review): PyTorch documents that memory from caching_allocator_alloc
+    # is released through caching_allocator_delete; nothing here frees it, so
+    # confirm the lifetime Triton expects for the returned pointer.
+    return torch.cuda.caching_allocator_alloc(size, stream=stream)
+
+
+try:
+    import triton.runtime._allocation as _alloc
+    import torch
+
+    _alloc.NullAllocator.__call__ = staticmethod(_caching_alloc)
+except ImportError:
+    # Triton or PyTorch is not installed: there is nothing to patch.
+    pass
+except Exception as exc:  # best-effort hook: report, never abort startup
+    sys.stderr.write(f"_triton_alloc_setup: allocator patch skipped: {exc!r}\n")
--- a/mods/fix-qwen3-coder-next/fix_crash.diff
+++ b/mods/fix-qwen3-coder-next/fix_crash.diff
@@ -0,0 +1,14 @@
+diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
+index 0b6b7ed42ac1..b6e0305a312d 100644
+--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
+@@ -1000,7 +1000,8 @@ def cache_blocks(self, request: Request, num_tokens: int) -> None:
+             for block in self.req_to_blocks[request.request_id][
+                 num_cached_blocks_before:num_cached_blocks_after
+             ]:
+-                assert block.block_hash is not None
++                if block.is_null:
++                    continue
+                 self.cached_blocks_this_step.add(block.block_hash)
+ 
+     def new_step_starts(self) -> None:
--- a/mods/fix-qwen3-coder-next/fix_slowness.diff
+++ b/mods/fix-qwen3-coder-next/fix_slowness.diff
@@ -0,0 +1,72 @@
+diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
+index 63aae43c3ddf..6ca3213fbd8d 100644
+--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
+@@ -95,19 +95,19 @@ def fused_moe_kernel_gptq_awq(
+     # moving by 1 element in a particular dimension. E.g. `stride_am` is
+     # how much to increase `a_ptr` by to get the element one row down
+     # (A has M rows).
+-    stride_am,
+-    stride_ak,
+-    stride_be,
+-    stride_bk,
+-    stride_bn,
+-    stride_cm,
+-    stride_cn,
+-    stride_bse,
+-    stride_bsk,
+-    stride_bsn,
+-    stride_bze,
+-    stride_bzk,
+-    stride_bzn,
++    stride_am: tl.int64,
++    stride_ak: tl.int64,
++    stride_be: tl.int64,
++    stride_bk: tl.int64,
++    stride_bn: tl.int64,
++    stride_cm: tl.int64,
++    stride_cn: tl.int64,
++    stride_bse: tl.int64,
++    stride_bsk: tl.int64,
++    stride_bsn: tl.int64,
++    stride_bze: tl.int64,
++    stride_bzk: tl.int64,
++    stride_bzn: tl.int64,
+     block_k_diviable: tl.constexpr,
+     group_size: tl.constexpr,
+     # Meta-parameters
+@@ -329,20 +329,20 @@ def fused_moe_kernel(
+     # moving by 1 element in a particular dimension. E.g. `stride_am` is
+     # how much to increase `a_ptr` by to get the element one row down
+     # (A has M rows).
+-    stride_am,
+-    stride_ak,
+-    stride_be,
+-    stride_bk,
+-    stride_bn,
+-    stride_cm,
+-    stride_cn,
+-    stride_asm,
+-    stride_ask,
+-    stride_bse,
+-    stride_bsk,
+-    stride_bsn,
+-    stride_bbe,  # bias expert stride
+-    stride_bbn,  # bias N stride
++    stride_am: tl.int64,
++    stride_ak: tl.int64,
++    stride_be: tl.int64,
++    stride_bk: tl.int64,
++    stride_bn: tl.int64,
++    stride_cm: tl.int64,
++    stride_cn: tl.int64,
++    stride_asm: tl.int64,
++    stride_ask: tl.int64,
++    stride_bse: tl.int64,
++    stride_bsk: tl.int64,
++    stride_bsn: tl.int64,
++    stride_bbe: tl.int64,  # bias expert stride
++    stride_bbn: tl.int64,  # bias N stride
+     # Block size for block-wise quantization
+     group_n: tl.constexpr,
+     group_k: tl.constexpr,
--- a/mods/fix-qwen3-coder-next/run.sh
+++ b/mods/fix-qwen3-coder-next/run.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Applies the Qwen3-Coder-Next workarounds to the installed Python packages.
+# Expects fix_crash.diff, fix_slowness.diff and the _triton* files in the
+# current working directory.
+set -e
+
+SITE_PACKAGES=/usr/local/lib/python3.12/dist-packages
+
+# try_patch <diff-file> <skip-message> [extra patch options...]
+# Dry-runs the patch first so a diff that does not apply cleanly leaves the
+# tree untouched instead of half-patched; prints <skip-message> in that case.
+try_patch() {
+    local diff_file=$1 skip_msg=$2
+    shift 2
+    if patch -p1 --force --dry-run "$@" -d "$SITE_PACKAGES" < "$diff_file"; then
+        patch -p1 --force "$@" -d "$SITE_PACKAGES" < "$diff_file"
+    else
+        echo "$skip_msg"
+    fi
+}
+
+echo "Patching Qwen3-Coder-Next crashing on start"
+try_patch fix_crash.diff "Patch is not applicable, skipping"
+
+echo "Reverting PR #34279 that causes slowness"
+try_patch fix_slowness.diff "Reversing PR #34279 failed, skipping" -R
+
+echo "Fixing Triton allocator bug"
+cp _triton* "$SITE_PACKAGES"/