optimize model

2026-05-11 13:03:44 -05:00
parent 689f7665af
commit e991993636

@@ -49,19 +49,32 @@ spec:
 - --gpu-memory-utilization
 - "0.80" # ↑ More memory for KV cache / bigger batches
 - --max-model-len
-- "32768" # ↓ 256K→32K (adjust if you really need long context)
-- --language-model-only
-- --reasoning-parser
-- qwen3
+- "262144"
+- --max-num-batched-tokens
+- "16384"
+- --max-num-seqs
+- "4"
+- --enable-prefix-caching
+- --enable-chunked-prefill
+- --load-format
+- instanttensor
+- --attention-backend
+- flashinfer
+- --dtype
+- auto
+- --kv-cache-dtype
+- fp8
+- --trust-remote-code
 - --enable-auto-tool-choice
 - --tool-call-parser
 - qwen3_coder
-- --enable-chunked-prefill
-- --max-num-batched-tokens
-- "131072" # ↑ 32K→128K — larger decode batches = more tokens/sec
-- --max-num-seqs
-- "254" # ↑ Allow more concurrent sequences
-- --enable-prefix-caching
+- --reasoning-parser
+- qwen3
+- --default-chat-template-kwargs
+- '{"preserve_thinking": true}'
+- --override-generation-config
+- '{"temperature": 0.6, "top_p": 0.95, "top_k": 20, "min_p": 0.0, "presence_penalty": 0.0, "repetition_penalty": 1.0}'
+- --disable-log-requests
 - --attention-backend
 - FLASHINFER
 authSecret: hf-api-secret
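
Why --max-num-seqs drops from 254 to 4: at --max-model-len 262144 the KV cache per sequence is very large, even with --kv-cache-dtype fp8 halving it relative to fp16. A rough sizing sketch follows; the layer/head/dim numbers are assumed placeholders, not values read from this model's config.json:

# kv_sizing.py: back-of-envelope KV-cache math for the new settings.
# The model dimensions below are assumptions for illustration; substitute
# the real values from the served model's config.json.
num_layers = 48          # assumed
num_kv_heads = 4         # assumed (GQA)
head_dim = 128           # assumed
max_model_len = 262_144  # from --max-model-len
kv_bytes = 1             # fp8 KV cache (--kv-cache-dtype fp8); fp16 would be 2

# K and V each hold num_kv_heads * head_dim values per layer per token.
per_token_bytes = 2 * num_layers * num_kv_heads * head_dim * kv_bytes
per_seq_gib = per_token_bytes * max_model_len / 2**30
print(f"{per_token_bytes} bytes/token, {per_seq_gib:.1f} GiB per full-length sequence")
# Under these assumptions: 49152 bytes/token and 12.0 GiB per 256K sequence,
# so only a few concurrent sequences fit, hence --max-num-seqs 4.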
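With --override-generation-config in place, requests that omit sampling parameters should pick up the server-side defaults (temperature 0.6, top_p 0.95, top_k 20). A minimal smoke test against the OpenAI-compatible endpoint; the base_url and model name are assumptions, so point them at the actual Service for this deployment:

# smoke_test.py: check that the deployment answers and falls back to the
# server-side sampling defaults. base_url and model are assumed placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://vllm-qwen3-coder:8000/v1", api_key="unused")

# Deliberately no temperature/top_p here: the request should inherit the
# defaults injected via --override-generation-config.
resp = client.chat.completions.create(
    model="qwen3-coder",  # assumed served model name
    messages=[{"role": "user", "content": "Print hello world in Python."}],
    max_tokens=64,
)
print(resp.choices[0].message.content)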