From e8f94d6b8b5b1a4d4d191890f8169880580cefa4 Mon Sep 17 00:00:00 2001 From: Erik Vullings Date: Fri, 27 Feb 2026 17:46:06 +0100 Subject: [PATCH 1/2] Add Qwen35-35B-A3B recipe in FP8 format --- recipes/qwen35-35b-a3b-fp8.yaml | 47 +++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 recipes/qwen35-35b-a3b-fp8.yaml diff --git a/recipes/qwen35-35b-a3b-fp8.yaml b/recipes/qwen35-35b-a3b-fp8.yaml new file mode 100644 index 0000000..1f0c70c --- /dev/null +++ b/recipes/qwen35-35b-a3b-fp8.yaml @@ -0,0 +1,47 @@ +# Recipe: Qwen/Qwen3.5-35B-A3B-FP8 +# Qwen/Qwen3.5-35B-A3B model in native FP8 format + + +recipe_version: "1" +name: Qwen35-35B-A3B +description: vLLM serving Qwen3.5-35B-A3B-FP8 + +# HuggingFace model to download (optional, for --download-model) +model: Qwen/Qwen3.5-35B-A3B-FP8 + +#solo_only: true + +# Container image to use +container: vllm-node + +# Mod required to fix slowness and crash in the cluster (tracking https://github.com/vllm-project/vllm/issues/33857) +mods: + - mods/fix-qwen3-coder-next + +# Default settings (can be overridden via CLI) +defaults: + port: 8000 + host: 0.0.0.0 + tensor_parallel: 2 + gpu_memory_utilization: 0.7 + max_model_len: 131072 + +# Environment variables +env: {} + +# The vLLM serve command template +command: | + vllm serve Qwen/Qwen3.5-35B-A3B-FP8 \ + --max-num-batched-tokens 16384 \ + --enable-auto-tool-choice \ + --tool-call-parser qwen3_coder \ + --gpu-memory-utilization {gpu_memory_utilization} \ + --host {host} \ + --port {port} \ + --kv-cache-dtype fp8 \ + --load-format fastsafetensors \ + --attention-backend flashinfer \ + --enable-prefix-caching \ + --max-model-len {max_model_len} \ + -tp {tensor_parallel} \ + --distributed-executor-backend ray From 163f23d85ba031cf23c1572d0326822b5733b82d Mon Sep 17 00:00:00 2001 From: Erik Vullings Date: Tue, 3 Mar 2026 12:46:12 +0100 Subject: [PATCH 2/2] Update qwen35-35b-a3b-fp8.yaml --max_num_batched_tokens is a default variable now, which can be 
overridden via the CLI --- recipes/qwen35-35b-a3b-fp8.yaml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/recipes/qwen35-35b-a3b-fp8.yaml b/recipes/qwen35-35b-a3b-fp8.yaml index 1f0c70c..296146a 100644 --- a/recipes/qwen35-35b-a3b-fp8.yaml +++ b/recipes/qwen35-35b-a3b-fp8.yaml @@ -25,23 +25,25 @@ defaults: tensor_parallel: 2 gpu_memory_utilization: 0.7 max_model_len: 131072 + max_num_batched_tokens: 16384 # Environment variables -env: {} +env: + VLLM_MARLIN_USE_ATOMIC_ADD: 1 # The vLLM serve command template command: | vllm serve Qwen/Qwen3.5-35B-A3B-FP8 \ - --max-num-batched-tokens 16384 \ - --enable-auto-tool-choice \ - --tool-call-parser qwen3_coder \ - --gpu-memory-utilization {gpu_memory_utilization} \ --host {host} \ --port {port} \ + --max-model-len {max_model_len} \ + --max-num-batched-tokens {max_num_batched_tokens} \ + --gpu-memory-utilization {gpu_memory_utilization} \ + --enable-auto-tool-choice \ + --tool-call-parser qwen3_coder \ --kv-cache-dtype fp8 \ --load-format fastsafetensors \ --attention-backend flashinfer \ --enable-prefix-caching \ - --max-model-len {max_model_len} \ -tp {tensor_parallel} \ --distributed-executor-backend ray