Qwen3-Coder-Next fixes and updated recipe
This commit is contained in:
1
mods/fix-qwen3-coder-next/_triton_alloc_setup.pth
Normal file
1
mods/fix-qwen3-coder-next/_triton_alloc_setup.pth
Normal file
@@ -0,0 +1 @@
|
||||
import _triton_alloc_setup
|
||||
9
mods/fix-qwen3-coder-next/_triton_alloc_setup.py
Normal file
9
mods/fix-qwen3-coder-next/_triton_alloc_setup.py
Normal file
@@ -0,0 +1,9 @@
|
||||
# Startup hook: swap the behavior of Triton's NullAllocator for an allocator
# backed by PyTorch's CUDA caching allocator.
#
# This module is meant to be imported for its side effect only (the companion
# .pth file contains a bare `import _triton_alloc_setup`). The whole setup is
# best-effort: if Triton or PyTorch cannot be imported, or the patch cannot be
# applied for any other reason, the failure is swallowed so that importing
# this module never raises.
try:
    # Triton is imported first on purpose: when it is missing, the import
    # fails before torch is loaded at all.
    import triton.runtime._allocation as _alloc
    import torch

    def _cuda_caching_alloc(size, alignment, stream):
        """Allocate `size` bytes on `stream` via torch's caching allocator.

        `alignment` is accepted to match the allocator call signature but is
        not forwarded; the result of `caching_allocator_alloc` is returned
        unchanged.
        """
        # NOTE(review): nothing in this module releases the allocation
        # (e.g. via torch.cuda.caching_allocator_delete) -- confirm that the
        # lifetime of these buffers is acceptable for the workloads involved.
        return torch.cuda.caching_allocator_alloc(size, stream=stream)

    # staticmethod keeps the NullAllocator instance from being passed as an
    # implicit first argument when the allocator object is called.
    _alloc.NullAllocator.__call__ = staticmethod(_cuda_caching_alloc)
except Exception:
    # Deliberately silent: this runs as an import side effect and must not
    # break processes where the patch does not apply.
    pass
|
||||
14
mods/fix-qwen3-coder-next/fix_crash.diff
Normal file
14
mods/fix-qwen3-coder-next/fix_crash.diff
Normal file
@@ -0,0 +1,14 @@
|
||||
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
|
||||
index 0b6b7ed42ac1..b6e0305a312d 100644
|
||||
--- a/vllm/v1/core/single_type_kv_cache_manager.py
|
||||
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
|
||||
@@ -1000,7 +1000,8 @@ def cache_blocks(self, request: Request, num_tokens: int) -> None:
|
||||
for block in self.req_to_blocks[request.request_id][
|
||||
num_cached_blocks_before:num_cached_blocks_after
|
||||
]:
|
||||
- assert block.block_hash is not None
|
||||
+ if block.is_null:
|
||||
+ continue
|
||||
self.cached_blocks_this_step.add(block.block_hash)
|
||||
|
||||
def new_step_starts(self) -> None:
|
||||
72
mods/fix-qwen3-coder-next/fix_slowness.diff
Normal file
72
mods/fix-qwen3-coder-next/fix_slowness.diff
Normal file
@@ -0,0 +1,72 @@
|
||||
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
|
||||
index 63aae43c3ddf..6ca3213fbd8d 100644
|
||||
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
|
||||
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
|
||||
@@ -95,19 +95,19 @@ def fused_moe_kernel_gptq_awq(
|
||||
# moving by 1 element in a particular dimension. E.g. `stride_am` is
|
||||
# how much to increase `a_ptr` by to get the element one row down
|
||||
# (A has M rows).
|
||||
- stride_am,
|
||||
- stride_ak,
|
||||
- stride_be,
|
||||
- stride_bk,
|
||||
- stride_bn,
|
||||
- stride_cm,
|
||||
- stride_cn,
|
||||
- stride_bse,
|
||||
- stride_bsk,
|
||||
- stride_bsn,
|
||||
- stride_bze,
|
||||
- stride_bzk,
|
||||
- stride_bzn,
|
||||
+ stride_am: tl.int64,
|
||||
+ stride_ak: tl.int64,
|
||||
+ stride_be: tl.int64,
|
||||
+ stride_bk: tl.int64,
|
||||
+ stride_bn: tl.int64,
|
||||
+ stride_cm: tl.int64,
|
||||
+ stride_cn: tl.int64,
|
||||
+ stride_bse: tl.int64,
|
||||
+ stride_bsk: tl.int64,
|
||||
+ stride_bsn: tl.int64,
|
||||
+ stride_bze: tl.int64,
|
||||
+ stride_bzk: tl.int64,
|
||||
+ stride_bzn: tl.int64,
|
||||
block_k_diviable: tl.constexpr,
|
||||
group_size: tl.constexpr,
|
||||
# Meta-parameters
|
||||
@@ -329,20 +329,20 @@ def fused_moe_kernel(
|
||||
# moving by 1 element in a particular dimension. E.g. `stride_am` is
|
||||
# how much to increase `a_ptr` by to get the element one row down
|
||||
# (A has M rows).
|
||||
- stride_am,
|
||||
- stride_ak,
|
||||
- stride_be,
|
||||
- stride_bk,
|
||||
- stride_bn,
|
||||
- stride_cm,
|
||||
- stride_cn,
|
||||
- stride_asm,
|
||||
- stride_ask,
|
||||
- stride_bse,
|
||||
- stride_bsk,
|
||||
- stride_bsn,
|
||||
- stride_bbe, # bias expert stride
|
||||
- stride_bbn, # bias N stride
|
||||
+ stride_am: tl.int64,
|
||||
+ stride_ak: tl.int64,
|
||||
+ stride_be: tl.int64,
|
||||
+ stride_bk: tl.int64,
|
||||
+ stride_bn: tl.int64,
|
||||
+ stride_cm: tl.int64,
|
||||
+ stride_cn: tl.int64,
|
||||
+ stride_asm: tl.int64,
|
||||
+ stride_ask: tl.int64,
|
||||
+ stride_bse: tl.int64,
|
||||
+ stride_bsk: tl.int64,
|
||||
+ stride_bsn: tl.int64,
|
||||
+ stride_bbe: tl.int64, # bias expert stride
|
||||
+ stride_bbn: tl.int64, # bias N stride
|
||||
# Block size for block-wise quantization
|
||||
group_n: tl.constexpr,
|
||||
group_k: tl.constexpr,
|
||||
11
mods/fix-qwen3-coder-next/run.sh
Normal file
11
mods/fix-qwen3-coder-next/run.sh
Normal file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
# Installs the Qwen3-Coder-Next workarounds into the system Python packages:
#   1. applies fix_crash.diff,
#   2. reverse-applies fix_slowness.diff (undoing PR #34279),
#   3. copies the _triton* files into the same directory.
#
# The diff files and the _triton* files are read from the current working
# directory. Both patch steps are best-effort: a patch that does not apply is
# reported and skipped. Only the final copy can abort the script (set -e).
set -e

# Directory the patches are applied in and the Triton files are copied to.
SITE_PACKAGES=/usr/local/lib/python3.12/dist-packages

# try_patch DIFF_FILE SKIP_MESSAGE [EXTRA_PATCH_OPTIONS...]
#
# Applies DIFF_FILE under $SITE_PACKAGES, or prints SKIP_MESSAGE when it does
# not apply. The patch is validated with --dry-run first so that a diff whose
# hunks only partly match never leaves the installed files half-modified.
# --forward stops patch from asking whether an already-applied patch should
# be reversed; such a patch is simply treated as not applicable.
try_patch() {
    local diff_file=$1
    local skip_message=$2
    shift 2

    if patch -p1 --forward --dry-run "$@" -d "$SITE_PACKAGES" < "$diff_file"; then
        # The dry run succeeded, so this is expected to apply cleanly; the
        # fallback keeps the step best-effort if it fails anyway.
        patch -p1 --forward "$@" -d "$SITE_PACKAGES" < "$diff_file" || echo "$skip_message"
    else
        echo "$skip_message"
    fi
}

echo "Patching Qwen3-Coder-Next crashing on start"
try_patch fix_crash.diff "Patch is not applicable, skipping"

echo "Reverting PR #34279 that causes slowness"
try_patch fix_slowness.diff "Reversing PR #34279 failed, skipping" -R

echo "Fixing Triton allocator bug"
cp _triton* "$SITE_PACKAGES"/
|
||||
Reference in New Issue
Block a user