diff --git a/recipes/qwen35-35b-a3b-fp8.yaml b/recipes/qwen35-35b-a3b-fp8.yaml
index 1f0c70c..296146a 100644
--- a/recipes/qwen35-35b-a3b-fp8.yaml
+++ b/recipes/qwen35-35b-a3b-fp8.yaml
@@ -25,23 +25,25 @@ defaults:
   tensor_parallel: 2
   gpu_memory_utilization: 0.7
   max_model_len: 131072
+  max_num_batched_tokens: 16384
 
 # Environment variables
-env: {}
+env:
+  VLLM_MARLIN_USE_ATOMIC_ADD: 1
 
 # The vLLM serve command template
 command: |
   vllm serve Qwen/Qwen3.5-35B-A3B-FP8 \
-    --max-num-batched-tokens 16384 \
-    --enable-auto-tool-choice \
-    --tool-call-parser qwen3_coder \
-    --gpu-memory-utilization {gpu_memory_utilization} \
     --host {host} \
     --port {port} \
+    --max-model-len {max_model_len} \
+    --max-num-batched-tokens {max_num_batched_tokens} \
+    --gpu-memory-utilization {gpu_memory_utilization} \
+    --enable-auto-tool-choice \
+    --tool-call-parser qwen3_coder \
     --kv-cache-dtype fp8 \
     --load-format fastsafetensors \
     --attention-backend flashinfer \
     --enable-prefix-caching \
-    --max-model-len {max_model_len} \
     -tp {tensor_parallel} \
     --distributed-executor-backend ray
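
For reference, a sketch of the command this template renders to once the `defaults:` values are substituted, assuming the runner exports the `env:` entries into the process environment. The host and port shown (0.0.0.0:8000) are illustrative placeholders, not values taken from the recipe:

    # assumed host/port for illustration; remaining values come from defaults above
    VLLM_MARLIN_USE_ATOMIC_ADD=1 vllm serve Qwen/Qwen3.5-35B-A3B-FP8 \
        --host 0.0.0.0 \
        --port 8000 \
        --max-model-len 131072 \
        --max-num-batched-tokens 16384 \
        --gpu-memory-utilization 0.7 \
        --enable-auto-tool-choice \
        --tool-call-parser qwen3_coder \
        --kv-cache-dtype fp8 \
        --load-format fastsafetensors \
        --attention-backend flashinfer \
        --enable-prefix-caching \
        -tp 2 \
        --distributed-executor-backend ray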