optimize model
This commit is contained in:
@@ -49,19 +49,32 @@ spec:
|
|||||||
- --gpu-memory-utilization
|
- --gpu-memory-utilization
|
||||||
- "0.80" # ↑ More memory for KV cache / bigger batches
|
- "0.80" # ↑ More memory for KV cache / bigger batches
|
||||||
- --max-model-len
|
- --max-model-len
|
||||||
- "32768" # ↓ 256K→32K (adjust if you really need long context)
|
- "262144"
|
||||||
- --language-model-only
|
- --max-num-batched-tokens
|
||||||
- --reasoning-parser
|
- "16384"
|
||||||
- qwen3
|
- --max-num-seqs
|
||||||
|
- "4"
|
||||||
|
- --enable-prefix-caching
|
||||||
|
- --enable-chunked-prefill
|
||||||
|
- --load-format
|
||||||
|
- instanttensor
|
||||||
|
- --attention-backend
|
||||||
|
- flashinfer
|
||||||
|
- --dtype
|
||||||
|
- auto
|
||||||
|
- --kv-cache-dtype
|
||||||
|
- fp8
|
||||||
|
- --trust-remote-code
|
||||||
- --enable-auto-tool-choice
|
- --enable-auto-tool-choice
|
||||||
- --tool-call-parser
|
- --tool-call-parser
|
||||||
- qwen3_coder
|
- qwen3_coder
|
||||||
- --enable-chunked-prefill
|
- --reasoning-parser
|
||||||
- --max-num-batched-tokens
|
- qwen3
|
||||||
- "131072" # ↑ 32K→128K — larger decode batches = more tokens/sec
|
- --default-chat-template-kwargs
|
||||||
- --max-num-seqs
|
- '{"preserve_thinking": true}'
|
||||||
- "254" # ↑ Allow more concurrent sequences
|
- --override-generation-config
|
||||||
- --enable-prefix-caching
|
- '{"temperature": 0.6, "top_p": 0.95, "top_k": 20, "min_p": 0.0, "presence_penalty": 0.0, "repetition_penalty": 1.0}'
|
||||||
|
- --disable-log-requests
|
||||||
- --attention-backend
|
- --attention-backend
|
||||||
- FLASHINFER
|
- FLASHINFER
|
||||||
authSecret: hf-api-secret
|
authSecret: hf-api-secret
|
||||||
|
|||||||
Reference in New Issue
Block a user