Update qwen35-35b-a3b-fp8.yaml
--max_num_batched_tokens is now a default variable, which can be overridden via the CLI
This commit is contained in:
# Serving defaults for Qwen/Qwen3.5-35B-A3B-FP8.
# NOTE(review): reconstructed from merge-conflict/diff residue. Post-commit
# intent (per commit message): max_num_batched_tokens is a default variable
# that can be overridden via the CLI, and the serve command templates it
# instead of hard-coding 16384.
tensor_parallel: 2
gpu_memory_utilization: 0.7
max_model_len: 131072
# Default token-batching budget; overridable via the CLI.
max_num_batched_tokens: 16384

# Environment variables
env:
  # Quoted: environment-variable values are strings; an unquoted 1 parses as
  # a YAML integer, which some consumers reject — TODO confirm the launcher
  # accepts/coerces this.
  VLLM_MARLIN_USE_ATOMIC_ADD: "1"

# The vLLM serve command template
command: |
  vllm serve Qwen/Qwen3.5-35B-A3B-FP8 \
    --host {host} \
    --port {port} \
    --max-model-len {max_model_len} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --kv-cache-dtype fp8 \
    --load-format fastsafetensors \
    --attention-backend flashinfer \
    --enable-prefix-caching \
    -tp {tensor_parallel} \
    --distributed-executor-backend ray
|||||||
Reference in New Issue
Block a user