From cecec7482896cd91d5c0a492904d7e7e73165136 Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Thu, 26 Mar 2026 18:41:57 -0700
Subject: [PATCH] Add recipe for Qwen3.5-397B-INT4-Autoround in pipeline-parallel mode

---
 .../qwen3.5-397b-int4-autoround.yaml          | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml

diff --git a/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml b/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
new file mode 100644
index 0000000..bff1f23
--- /dev/null
+++ b/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
@@ -0,0 +1,61 @@
+# Recipe: Qwen3.5-397B-A17B-INT4-Autoround (PP=3)
+# Qwen3.5-397B model in Intel INT4-Autoround quantization, pipeline-parallel across a 3-node mesh.
+# NOTE(review): an older note here said to set memory utilization in GB and to use --no-ray on two sparks;
+# this recipe sets gpu_memory_utilization to 0.6 and uses the Ray executor backend - confirm which is intended.
+# If you experience node shutdown, please limit GPU clocks on the affected node (or all nodes): `sudo nvidia-smi -lgc 200,2150`
+
+recipe_version: "1"
+name: Qwen3.5-397B-INT4-Autoround (PP=3)
+description: Recipe for Qwen3.5-397B-INT4-Autoround to run on 3-node mesh in pipeline-parallel mode
+
+# HuggingFace model to download (optional, for --download-model)
+model: Intel/Qwen3.5-397B-A17B-int4-AutoRound
+
+cluster_only: true
+
+# Container image to use
+container: vllm-node-tf5
+
+build_args:
+  - --tf5
+
+# Mods required (includes the fix for the ROPE syntax error)
+mods:
+  - mods/fix-qwen3.5-autoround
+  - mods/fix-qwen3.5-chat-template
+
+# Default settings (can be overridden via CLI)
+defaults:
+  port: 8000
+  host: 0.0.0.0
+  # Key name must match the {pipeline_parallel} placeholder passed to `-pp` in the command below
+  pipeline_parallel: 3
+  gpu_memory_utilization: 0.6
+  max_model_len: 262144
+  max_num_batched_tokens: 4176
+
+# Environment variables
+env:
+  PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
+  VLLM_MARLIN_USE_ATOMIC_ADD: "1"
+
+# The vLLM serve command template; {name} placeholders are filled from `defaults` (or CLI overrides)
+command: |
+  vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \
+    --max-model-len {max_model_len} \
+    --max-num-seqs 10 \
+    --kv-cache-dtype fp8 \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    --port {port} \
+    --host {host} \
+    --enable-prefix-caching \
+    --enable-auto-tool-choice \
+    --tool-call-parser qwen3_coder \
+    --reasoning-parser qwen3 \
+    --max-num-batched-tokens {max_num_batched_tokens} \
+    --trust-remote-code \
+    --chat-template unsloth.jinja \
+    -tp 1 \
+    -pp {pipeline_parallel} \
+    --enable-expert-parallel \
+    --distributed-executor-backend ray