Update qwen35-35b-a3b-fp8.yaml

max_num_batched_tokens is now a default variable that can be overridden via the CLI; previously it was hardcoded as --max-num-batched-tokens 16384 in the command template.
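
In practice, the {max_num_batched_tokens} placeholder in the command template resolves to the value under defaults: unless a CLI override is supplied. A minimal sketch of both cases (launch-vllm is a hypothetical launcher name and the override flag spelling is an assumption; this commit only defines the YAML default):

# launch-vllm is hypothetical; only the YAML default is defined by this commit.
launch-vllm qwen35-35b-a3b-fp8                                # renders --max-num-batched-tokens 16384
launch-vllm qwen35-35b-a3b-fp8 --max-num-batched-tokens 8192  # CLI override wins over the default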
Erik Vullings
2026-03-03 12:46:12 +01:00
committed by GitHub
parent e8f94d6b8b
commit 163f23d85b


@@ -25,23 +25,25 @@ defaults:
   tensor_parallel: 2
   gpu_memory_utilization: 0.7
   max_model_len: 131072
+  max_num_batched_tokens: 16384
 
 # Environment variables
-env: {}
+env:
+  VLLM_MARLIN_USE_ATOMIC_ADD: 1
 
 # The vLLM serve command template
 command: |
   vllm serve Qwen/Qwen3.5-35B-A3B-FP8 \
-    --max-num-batched-tokens 16384 \
-    --enable-auto-tool-choice \
-    --tool-call-parser qwen3_coder \
-    --gpu-memory-utilization {gpu_memory_utilization} \
     --host {host} \
     --port {port} \
+    --max-model-len {max_model_len} \
+    --max-num-batched-tokens {max_num_batched_tokens} \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    --enable-auto-tool-choice \
+    --tool-call-parser qwen3_coder \
     --kv-cache-dtype fp8 \
     --load-format fastsafetensors \
     --attention-backend flashinfer \
     --enable-prefix-caching \
-    --max-model-len {max_model_len} \
     -tp {tensor_parallel} \
     --distributed-executor-backend ray
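
For reference, below is a sketch of the command this template renders to once the defaults above are substituted. The host/port values (0.0.0.0, 8000) are illustrative assumptions, and prefixing the environment variable assumes the launcher exports the env: map into the serve process:

# Rendered command template; host/port values and env handling are assumptions.
VLLM_MARLIN_USE_ATOMIC_ADD=1 vllm serve Qwen/Qwen3.5-35B-A3B-FP8 \
    --host 0.0.0.0 \
    --port 8000 \
    --max-model-len 131072 \
    --max-num-batched-tokens 16384 \
    --gpu-memory-utilization 0.7 \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --kv-cache-dtype fp8 \
    --load-format fastsafetensors \
    --attention-backend flashinfer \
    --enable-prefix-caching \
    -tp 2 \
    --distributed-executor-backend ray

The only behavioral change from the previous revision is that 16384 now comes from defaults.max_num_batched_tokens, and can therefore be overridden via the CLI, rather than being hardcoded in the template.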