Qwen3-Coder-Next fixes and updated recipe

Eugene Rakhmatulin
2026-02-12 15:56:32 -08:00
parent da4185cb12
commit 701147b1eb
7 changed files with 129 additions and 9 deletions


@@ -164,6 +164,16 @@ Don't do it every time you rebuild, because it will slow down compilation times.
For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
### 2026-02-12
Added a mod for Qwen3-Coder-Next-FP8 that fixes:
- A bug in the Triton allocator (https://github.com/vllm-project/vllm/issues/33857) that prevented the model from running in a cluster.
- A bug that caused a crash when `--enable-prefix-caching` is enabled (https://github.com/vllm-project/vllm/issues/34361).
- A bug that significantly degraded performance on Spark (https://github.com/vllm-project/vllm/issues/34413).
This mod is included in the `qwen3-coder-next-fp8` recipe.
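
For a quick sanity check, a minimal sketch, assuming the shim added by this commit is already installed into site-packages; `_triton_alloc_setup` is the module introduced below, and the call signature mirrors the lambda in the shim:

# Smoke test (illustrative): with the shim active, NullAllocator returns a
# CUDA pointer from PyTorch's caching allocator instead of raising.
import torch
import _triton_alloc_setup  # noqa: F401 -- importing applies the monkeypatch
import triton.runtime._allocation as _alloc

stream = torch.cuda.current_stream().cuda_stream
ptr = _alloc.NullAllocator.__call__(1024, 128, stream)  # size, alignment, stream
assert ptr != 0
torch.cuda.caching_allocator_delete(ptr)  # return the block to the pool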
### 2026-02-11
#### Configurable GPU Architecture


@@ -0,0 +1 @@
import _triton_alloc_setup
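
My reading of this one-line file, since its name is not shown in this view: it is presumably a `.pth` file that the setup script below copies into site-packages. Python's `site` module executes lines beginning with `import` in `.pth` files at interpreter startup, so the allocator patch takes effect in every Python process in the container without modifying vLLM itself.

# Illustrative: .pth hooks live in site-packages and run at interpreter startup.
import site
print(site.getsitepackages())  # e.g. ['/usr/local/lib/python3.12/dist-packages']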


@@ -0,0 +1,9 @@
# Work around https://github.com/vllm-project/vllm/issues/33857: route
# Triton's fallback allocator through PyTorch's CUDA caching allocator.
try:
    import triton.runtime._allocation as _alloc
    import torch

    # NullAllocator normally raises; make it hand back a device pointer instead.
    _alloc.NullAllocator.__call__ = staticmethod(
        lambda size, alignment, stream:
            torch.cuda.caching_allocator_alloc(size, stream=stream))
except Exception:
    # Triton is missing or its internals changed; leave the runtime untouched.
    pass
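
A note on the design: the whole patch sits in a try/except so that images without Triton (or with a Triton whose internals have moved) still start cleanly, and the replacement allocator draws from PyTorch's caching allocator, so Triton's workspace allocations presumably share the same GPU memory pool that `gpu_memory_utilization` already budgets.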


@@ -0,0 +1,14 @@
diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index 0b6b7ed42ac1..b6e0305a312d 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -1000,7 +1000,8 @@ def cache_blocks(self, request: Request, num_tokens: int) -> None:
         for block in self.req_to_blocks[request.request_id][
             num_cached_blocks_before:num_cached_blocks_after
         ]:
-            assert block.block_hash is not None
+            if block.is_null:
+                continue
             self.cached_blocks_this_step.add(block.block_hash)

     def new_step_starts(self) -> None:
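
For context on the fix: with prefix caching enabled, a request's block list can contain placeholder ("null") blocks that carry no hash, and the old assert fired on them (issue 34361). A toy model of the patched loop, using stand-in classes rather than vLLM's real ones:

from dataclasses import dataclass

@dataclass
class Block:                 # stand-in for vLLM's KV cache block
    block_hash: int | None = None
    is_null: bool = False

cached_blocks_this_step: set[int] = set()
blocks = [Block(block_hash=11), Block(is_null=True), Block(block_hash=42)]
for block in blocks:
    if block.is_null:        # the fix: skip placeholder blocks ...
        continue
    cached_blocks_this_step.add(block.block_hash)  # ... instead of asserting
print(cached_blocks_this_step)  # {11, 42}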


@@ -0,0 +1,72 @@
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 63aae43c3ddf..6ca3213fbd8d 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -95,19 +95,19 @@ def fused_moe_kernel_gptq_awq(
     # moving by 1 element in a particular dimension. E.g. `stride_am` is
     # how much to increase `a_ptr` by to get the element one row down
     # (A has M rows).
-    stride_am,
-    stride_ak,
-    stride_be,
-    stride_bk,
-    stride_bn,
-    stride_cm,
-    stride_cn,
-    stride_bse,
-    stride_bsk,
-    stride_bsn,
-    stride_bze,
-    stride_bzk,
-    stride_bzn,
+    stride_am: tl.int64,
+    stride_ak: tl.int64,
+    stride_be: tl.int64,
+    stride_bk: tl.int64,
+    stride_bn: tl.int64,
+    stride_cm: tl.int64,
+    stride_cn: tl.int64,
+    stride_bse: tl.int64,
+    stride_bsk: tl.int64,
+    stride_bsn: tl.int64,
+    stride_bze: tl.int64,
+    stride_bzk: tl.int64,
+    stride_bzn: tl.int64,
     block_k_diviable: tl.constexpr,
     group_size: tl.constexpr,
     # Meta-parameters
@@ -329,20 +329,20 @@ def fused_moe_kernel(
     # moving by 1 element in a particular dimension. E.g. `stride_am` is
     # how much to increase `a_ptr` by to get the element one row down
     # (A has M rows).
-    stride_am,
-    stride_ak,
-    stride_be,
-    stride_bk,
-    stride_bn,
-    stride_cm,
-    stride_cn,
-    stride_asm,
-    stride_ask,
-    stride_bse,
-    stride_bsk,
-    stride_bsn,
-    stride_bbe,  # bias expert stride
-    stride_bbn,  # bias N stride
+    stride_am: tl.int64,
+    stride_ak: tl.int64,
+    stride_be: tl.int64,
+    stride_bk: tl.int64,
+    stride_bn: tl.int64,
+    stride_cm: tl.int64,
+    stride_cn: tl.int64,
+    stride_asm: tl.int64,
+    stride_ask: tl.int64,
+    stride_bse: tl.int64,
+    stride_bsk: tl.int64,
+    stride_bsn: tl.int64,
+    stride_bbe: tl.int64,  # bias expert stride
+    stride_bbn: tl.int64,  # bias N stride
     # Block size for block-wise quantization
     group_n: tl.constexpr,
     group_k: tl.constexpr,
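
Why this diff exists and why the setup script applies it with `-R`: PR #34279 annotated these stride arguments as `tl.int64` (presumably to guard against 32-bit overflow on very large tensors), and reverse-applying the diff reverts that change. As far as I understand, leaving an integer kernel argument unannotated lets Triton specialize the compiled kernel on its runtime value (a stride of 1 can become a compile-time constant, and divisibility hints enable better addressing), whereas an explicit `tl.int64` annotation pins it to a generic 64-bit argument; on Spark that generality apparently costs real throughput (issue 34413). A toy kernel, not vLLM's, showing the style in question:

import torch
import triton
import triton.language as tl

@triton.jit
def double_rows(x_ptr, out_ptr,
                stride_row,          # unannotated: Triton may specialize on it
                N: tl.constexpr):
    row = tl.program_id(0)
    offs = tl.arange(0, N)
    x = tl.load(x_ptr + row * stride_row + offs)
    tl.store(out_ptr + row * stride_row + offs, x * 2.0)

x = torch.randn(4, 64, device="cuda")
out = torch.empty_like(x)
double_rows[(4,)](x, out, x.stride(0), N=64)
# Writing `stride_row: tl.int64` instead would force a 64-bit runtime argument
# and skip value-based specialization, which is the change this mod reverts.
assert torch.allclose(out, 2 * x)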


@@ -0,0 +1,11 @@
#!/bin/bash
set -e
echo "Patching Qwen3-Coder-Next crashing on start"
patch -p1 -d /usr/local/lib/python3.12/dist-packages < fix_crash.diff || echo "Patch is not applicable, skipping"
echo "Reverting PR #34279 that causes slowness"
patch -p1 -R -d /usr/local/lib/python3.12/dist-packages < fix_slowness.diff || echo "Reverting PR #34279 failed, skipping"
echo "Fixing Triton allocator bug"
cp _triton* /usr/local/lib/python3.12/dist-packages/
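
Note the error handling: the script sets `set -e`, so an unguarded `patch` failure would abort the container build; the `|| echo` fallbacks keep the mod usable on base images where these fixes have already landed upstream and the patches simply no longer apply.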


@@ -1,30 +1,30 @@
 # Recipe: Qwen3-Coder-Next-FP8
 # Qwen3-Coder-Next model in native FP8 format
-# Currently can only be run in solo mode, cluster mode fails with error - tracking https://github.com/vllm-project/vllm/issues/33857
 recipe_version: "1"
 name: Qwen3-Coder-Next-FP8
-description: vLLM serving Qwen3-Coder-Next-FP8 on a SINGLE NODE ONLY!
+description: vLLM serving Qwen3-Coder-Next-FP8
 # HuggingFace model to download (optional, for --download-model)
 model: Qwen/Qwen3-Coder-Next-FP8
-# This model can only run on single node (solo)
-solo_only: true
+#solo_only: true
 # Container image to use
 container: vllm-node
-# No mods required
-mods: []
+# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857)
+mods:
+  - mods/fix-qwen3-coder-next
 # Default settings (can be overridden via CLI)
 defaults:
   port: 8000
   host: 0.0.0.0
-  tensor_parallel: 1
+  tensor_parallel: 2
   gpu_memory_utilization: 0.7
-  max_model_len: 131072
+  max_model_len: 262144
 # Environment variables
 env: {}
@@ -40,4 +40,7 @@ command: |
   --load-format fastsafetensors \
   --attention-backend flashinfer \
   --enable-prefix-caching \
-  --max-model-len {max_model_len}
+  --max-model-len {max_model_len} \
+  -tp {tensor_parallel} \
+  --distributed-executor-backend ray
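
Net effect of the recipe changes: the single-node restriction tied to issue 33857 is lifted (`solo_only` is left commented out rather than deleted, presumably so it is easy to restore if the upstream bugs regress), the defaults move to tensor parallelism of 2 with Ray as the distributed executor backend, and the default context length doubles from 131072 to 262144 tokens.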