Added a recipe for qwen3-coder-next-int4-autoround

2026-03-11 09:23:23 -07:00
parent 66b5c85907
commit f2cf11b047
1 changed files with 46 additions and 0 deletions
--- a/recipes/qwen3-coder-next-int4-autoround.yaml
+++ b/recipes/qwen3-coder-next-int4-autoround.yaml
@@ -0,0 +1,46 @@
 # Recipe: Qwen3-Coder-Next-int4-Autoround
 # Runs the Qwen3-Coder-Next model published in Intel's int4-AutoRound format.
 recipe_version: "1"
 name: "Qwen3-Coder-Next-int4-Autoround"
 description: "Qwen3-Coder-Next-int4-Autoround"
 # HuggingFace repository to fetch (optional; only used with --download-model)
 model: "Intel/Qwen3-Coder-Next-int4-AutoRound"
 # Marks the recipe as solo-only (presumably single-node; confirm against the recipe runner)
 solo_only: true
 # Container image the recipe runs in
 container: "vllm-node"
 # Mod needed to work around slowness and a crash seen in the cluster
 # (upstream tracking issue: https://github.com/vllm-project/vllm/issues/33857)
 mods:
  - "mods/fix-qwen3-next-autoround"
 # Baseline values for the {placeholders} in the command template;
 # each one can be overridden from the CLI.
 defaults:
  # Fills {port} for --port
  port: 8000
  # Fills {host} for --host
  host: "0.0.0.0"
  # Fills {tensor_parallel} for -tp
  tensor_parallel: 2
  # Fills {gpu_memory_utilization} for --gpu-memory-utilization
  gpu_memory_utilization: 0.7
  # Fills {max_model_len} for --max-model-len
  max_model_len: 262144
 # Environment variables set for this recipe
 env:
  # Quoted on purpose: environment values are strings, and a bare 1 is loaded
  # by YAML as an integer rather than the string "1".
  # NOTE(review): per vLLM's environment-variable docs this switches the Marlin
  # kernels to an atomicAdd reduction - confirm against the vLLM version in the image.
  VLLM_MARLIN_USE_ATOMIC_ADD: "1"
 # The vLLM serve command template.
 # {name} placeholders match the keys under `defaults` (presumably substituted by
 # the recipe runner, with CLI overrides applied - confirm against the runner).
 # The body is a literal block scalar: every line below is part of the command
 # string, so comments cannot be placed inside it.
 # The model id is written out here as well as in `model:`; keep the two in sync.
 # NOTE(review): this recipe sets `solo_only: true` yet the command passes
 # -tp {tensor_parallel} (default 2) and the ray distributed executor backend -
 # confirm how the runner treats these flags for solo-only recipes.
 command: |
  vllm serve Intel/Qwen3-Coder-Next-int4-AutoRound \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --host {host} \
    --port {port} \
    --load-format fastsafetensors \
    --enable-prefix-caching \
    --max-model-len {max_model_len} \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray