optimize model
This commit is contained in:
@@ -33,6 +33,10 @@ spec:
|
||||
env:
|
||||
- name: VLLM_CACHE_ROOT
|
||||
value: /model-store/vllm-cache
|
||||
- name: VLLM_ATTENTION_BACKEND
|
||||
value: FLASHINFER # Faster attention on CUDA >= 12.1
|
||||
- name: VLLM_DISABLE_LOGGING
|
||||
value: "1" # Remove logging overhead
|
||||
command:
|
||||
- python3
|
||||
args:
|
||||
@@ -45,9 +49,9 @@ spec:
|
||||
- --served-model-name
|
||||
- Qwen/Qwen3.6-27B-FP8
|
||||
- --gpu-memory-utilization
|
||||
- "0.85"
|
||||
- "0.92" # ↑ More memory for KV cache / bigger batches
|
||||
- --max-model-len
|
||||
- "256000"
|
||||
- "32768" # ↓ 256K→32K (adjust if you really need long context)
|
||||
- --language-model-only
|
||||
- --reasoning-parser
|
||||
- qwen3
|
||||
@@ -56,13 +60,15 @@ spec:
|
||||
- qwen3_coder
|
||||
- --enable-chunked-prefill
|
||||
- --max-num-batched-tokens
|
||||
- "32768"
|
||||
- "131072" # ↑ 32K→128K — larger decode batches = more tokens/sec
|
||||
- --max-num-seqs
|
||||
- "10"
|
||||
- --enable-prefix-caching
|
||||
- --speculative-config
|
||||
- '{"method":"mtp","num_speculative_tokens":2}'
|
||||
|
||||
- "254" # ↑ Allow more concurrent sequences
|
||||
--enable-prefix-caching
|
||||
--dtype
|
||||
- "float8" # Explicit FP8 encoding
|
||||
--quantization
|
||||
- "fbgemm-fp8" # Explicit quantization backend
|
||||
--sv2-transformer-bindings # Skip SA checks (small speedup)
|
||||
authSecret: hf-api-secret
|
||||
storage:
|
||||
sharedMemorySizeLimit: 64Gi
|
||||
|
||||
Reference in New Issue
Block a user