diff --git a/README.md b/README.md index 80d463b..0fc55f9 100644 --- a/README.md +++ b/README.md @@ -164,6 +164,16 @@ Don't do it every time you rebuild, because it will slow down compilation times. For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h` +### 2026-02-12 + +Added a mod for Qwen3-Coder-Next-FP8 that fixes: + +- A bug with the Triton allocator (https://github.com/vllm-project/vllm/issues/33857) that prevented the model from running in a cluster. +- A bug that caused a crash when `--enable-prefix-caching` is on (https://github.com/vllm-project/vllm/issues/34361). +- A bug that significantly impacted the performance on Spark (https://github.com/vllm-project/vllm/issues/34413). + +This mod is included in the `qwen3-coder-next-fp8` recipe. + ### 2026-02-11 #### Configurable GPU Architecture diff --git a/mods/fix-qwen3-coder-next/_triton_alloc_setup.pth b/mods/fix-qwen3-coder-next/_triton_alloc_setup.pth new file mode 100644 index 0000000..e658dd1 --- /dev/null +++ b/mods/fix-qwen3-coder-next/_triton_alloc_setup.pth @@ -0,0 +1 @@ +import _triton_alloc_setup diff --git a/mods/fix-qwen3-coder-next/_triton_alloc_setup.py b/mods/fix-qwen3-coder-next/_triton_alloc_setup.py new file mode 100644 index 0000000..8ade2bf --- /dev/null +++ b/mods/fix-qwen3-coder-next/_triton_alloc_setup.py @@ -0,0 +1,9 @@ +try: + import triton.runtime._allocation as _alloc + import torch + + _alloc.NullAllocator.__call__ = staticmethod( + lambda size, alignment, stream: + torch.cuda.caching_allocator_alloc(size, stream=stream)) +except Exception: + pass diff --git a/mods/fix-qwen3-coder-next/fix_crash.diff b/mods/fix-qwen3-coder-next/fix_crash.diff new file mode 100644 index 0000000..5b5dcc0 --- /dev/null +++ b/mods/fix-qwen3-coder-next/fix_crash.diff @@ -0,0 +1,14 @@ +diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py +index 0b6b7ed42ac1..b6e0305a312d 100644 +--- 
a/vllm/v1/core/single_type_kv_cache_manager.py ++++ b/vllm/v1/core/single_type_kv_cache_manager.py +@@ -1000,7 +1000,8 @@ def cache_blocks(self, request: Request, num_tokens: int) -> None: + for block in self.req_to_blocks[request.request_id][ + num_cached_blocks_before:num_cached_blocks_after + ]: +- assert block.block_hash is not None ++ if block.is_null: ++ continue + self.cached_blocks_this_step.add(block.block_hash) + + def new_step_starts(self) -> None: diff --git a/mods/fix-qwen3-coder-next/fix_slowness.diff b/mods/fix-qwen3-coder-next/fix_slowness.diff new file mode 100644 index 0000000..d3111bf --- /dev/null +++ b/mods/fix-qwen3-coder-next/fix_slowness.diff @@ -0,0 +1,72 @@ +diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py +index 63aae43c3ddf..6ca3213fbd8d 100644 +--- a/vllm/model_executor/layers/fused_moe/fused_moe.py ++++ b/vllm/model_executor/layers/fused_moe/fused_moe.py +@@ -95,19 +95,19 @@ def fused_moe_kernel_gptq_awq( + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). +- stride_am, +- stride_ak, +- stride_be, +- stride_bk, +- stride_bn, +- stride_cm, +- stride_cn, +- stride_bse, +- stride_bsk, +- stride_bsn, +- stride_bze, +- stride_bzk, +- stride_bzn, ++ stride_am: tl.int64, ++ stride_ak: tl.int64, ++ stride_be: tl.int64, ++ stride_bk: tl.int64, ++ stride_bn: tl.int64, ++ stride_cm: tl.int64, ++ stride_cn: tl.int64, ++ stride_bse: tl.int64, ++ stride_bsk: tl.int64, ++ stride_bsn: tl.int64, ++ stride_bze: tl.int64, ++ stride_bzk: tl.int64, ++ stride_bzn: tl.int64, + block_k_diviable: tl.constexpr, + group_size: tl.constexpr, + # Meta-parameters +@@ -329,20 +329,20 @@ def fused_moe_kernel( + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). 
+- stride_am, +- stride_ak, +- stride_be, +- stride_bk, +- stride_bn, +- stride_cm, +- stride_cn, +- stride_asm, +- stride_ask, +- stride_bse, +- stride_bsk, +- stride_bsn, +- stride_bbe, # bias expert stride +- stride_bbn, # bias N stride ++ stride_am: tl.int64, ++ stride_ak: tl.int64, ++ stride_be: tl.int64, ++ stride_bk: tl.int64, ++ stride_bn: tl.int64, ++ stride_cm: tl.int64, ++ stride_cn: tl.int64, ++ stride_asm: tl.int64, ++ stride_ask: tl.int64, ++ stride_bse: tl.int64, ++ stride_bsk: tl.int64, ++ stride_bsn: tl.int64, ++ stride_bbe: tl.int64, # bias expert stride ++ stride_bbn: tl.int64, # bias N stride + # Block size for block-wise quantization + group_n: tl.constexpr, + group_k: tl.constexpr, diff --git a/mods/fix-qwen3-coder-next/run.sh b/mods/fix-qwen3-coder-next/run.sh new file mode 100644 index 0000000..b0f3499 --- /dev/null +++ b/mods/fix-qwen3-coder-next/run.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +echo "Patching Qwen3-Coder-Next crashing on start" +patch -p1 -d /usr/local/lib/python3.12/dist-packages < fix_crash.diff || echo "Patch is not applicable, skipping" + +echo "Reverting PR #34279 that causes slowness" +patch -p1 -R -d /usr/local/lib/python3.12/dist-packages < fix_slowness.diff || echo "Reversing PR #34279 failed, skipping" + +echo "Fixing Triton allocator bug" +cp _triton* /usr/local/lib/python3.12/dist-packages/ diff --git a/recipes/qwen3-coder-next-fp8.yaml b/recipes/qwen3-coder-next-fp8.yaml index 0200c0c..192db84 100644 --- a/recipes/qwen3-coder-next-fp8.yaml +++ b/recipes/qwen3-coder-next-fp8.yaml @@ -1,30 +1,30 @@ # Recipe: Qwen3-Coder-Next-FP8 # Qwen3-Coder-Next model in native FP8 format -# Currently can only be run in solo mode, cluster mode fails with error - tracking https://github.com/vllm-project/vllm/issues/33857 + recipe_version: "1" name: Qwen3-Coder-Next-FP8 -description: vLLM serving Qwen3-Coder-Next-FP8 on a SINGLE NODE ONLY! 
+description: vLLM serving Qwen3-Coder-Next-FP8 # HuggingFace model to download (optional, for --download-model) model: Qwen/Qwen3-Coder-Next-FP8 -# This model can only run on single node (solo) -solo_only: true +#solo_only: true # Container image to use container: vllm-node -# No mods required -mods: [] +# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857) +mods: + - mods/fix-qwen3-coder-next # Default settings (can be overridden via CLI) defaults: port: 8000 host: 0.0.0.0 - tensor_parallel: 1 + tensor_parallel: 2 gpu_memory_utilization: 0.7 - max_model_len: 131072 + max_model_len: 262144 # Environment variables env: {} @@ -40,4 +40,7 @@ command: | --load-format fastsafetensors \ --attention-backend flashinfer \ --enable-prefix-caching \ - --max-model-len {max_model_len} + --max-model-len {max_model_len} \ + -tp {tensor_parallel} \ + --distributed-executor-backend ray +