Qwen3-Coder-Next fixes and updated recipe

This commit is contained in:
Eugene Rakhmatulin
2026-02-12 15:56:32 -08:00
parent da4185cb12
commit 701147b1eb
7 changed files with 129 additions and 9 deletions

View File

@@ -1,30 +1,30 @@
# Recipe: Qwen3-Coder-Next-FP8
# Qwen3-Coder-Next model in native FP8 format
# Currently can only be run in solo mode; cluster mode fails with an error — tracking https://github.com/vllm-project/vllm/issues/33857
recipe_version: "1"
name: Qwen3-Coder-Next-FP8
description: vLLM serving Qwen3-Coder-Next-FP8 on a SINGLE NODE ONLY!
description: vLLM serving Qwen3-Coder-Next-FP8
# HuggingFace model to download (optional, for --download-model)
model: Qwen/Qwen3-Coder-Next-FP8
# This model can only run on single node (solo)
solo_only: true
#solo_only: true
# Container image to use
container: vllm-node
# No mods required
mods: []
# Mod required to fix slowness and a crash in cluster mode (tracking https://github.com/vllm-project/vllm/issues/33857)
mods:
- mods/fix-qwen3-coder-next
# Default settings (can be overridden via CLI)
defaults:
port: 8000
host: 0.0.0.0
tensor_parallel: 1
tensor_parallel: 2
gpu_memory_utilization: 0.7
max_model_len: 131072
max_model_len: 262144
# Environment variables
env: {}
@@ -40,4 +40,7 @@ command: |
--load-format fastsafetensors \
--attention-backend flashinfer \
--enable-prefix-caching \
--max-model-len {max_model_len}
--max-model-len {max_model_len} \
-tp {tensor_parallel} \
--distributed-executor-backend ray