Tune vLLM serving configuration for throughput
This commit is contained in:
@@ -33,6 +33,10 @@ spec:
|
|||||||
env:
|
env:
|
||||||
- name: VLLM_CACHE_ROOT
|
- name: VLLM_CACHE_ROOT
|
||||||
value: /model-store/vllm-cache
|
value: /model-store/vllm-cache
|
||||||
|
- name: VLLM_ATTENTION_BACKEND
|
||||||
|
value: FLASHINFER # Faster attention on CUDA >= 12.1
|
||||||
|
- name: VLLM_DISABLE_LOGGING
|
||||||
|
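value: "1" # NOTE(review): VLLM_DISABLE_LOGGING is not among vLLM's documented environment variables (cf. VLLM_CONFIGURE_LOGGING / VLLM_LOGGING_LEVEL) — confirm it has any effect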
|
||||||
command:
|
command:
|
||||||
- python3
|
- python3
|
||||||
args:
|
args:
|
||||||
@@ -45,9 +49,9 @@ spec:
|
|||||||
- --served-model-name
|
- --served-model-name
|
||||||
- Qwen/Qwen3.6-27B-FP8
|
- Qwen/Qwen3.6-27B-FP8
|
||||||
- --gpu-memory-utilization
|
- --gpu-memory-utilization
|
||||||
- "0.85"
|
- "0.92" # ↑ More memory for KV cache / bigger batches
|
||||||
- --max-model-len
|
- --max-model-len
|
||||||
- "256000"
|
- "32768" # ↓ 256K→32K (adjust if you really need long context)
|
||||||
- --language-model-only
|
- --language-model-only
|
||||||
- --reasoning-parser
|
- --reasoning-parser
|
||||||
- qwen3
|
- qwen3
|
||||||
@@ -56,13 +60,15 @@ spec:
|
|||||||
- qwen3_coder
|
- qwen3_coder
|
||||||
- --enable-chunked-prefill
|
- --enable-chunked-prefill
|
||||||
- --max-num-batched-tokens
|
- --max-num-batched-tokens
|
||||||
- "32768"
|
- "131072" # ↑ 32K→128K — larger per-iteration token budget (mainly bigger prefill chunks; decode concurrency is capped by --max-num-seqs)
|
||||||
- --max-num-seqs
|
- --max-num-seqs
|
||||||
- "10"
|
- "254" # ↑ Allow more concurrent sequences
|
||||||
- --enable-prefix-caching
|
--enable-prefix-caching
|
||||||
- --speculative-config
|
--dtype
|
||||||
- '{"method":"mtp","num_speculative_tokens":2}'
|
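- "float8" # NOTE(review): "float8" is not among vLLM's documented --dtype choices (auto, half, float16, bfloat16, float, float32) — confirm before deploying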
|
||||||
|
--quantization
|
||||||
|
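- "fbgemm-fp8" # NOTE(review): vLLM appears to spell this method fbgemm_fp8, and an FP8 checkpoint normally carries its own quantization config — confirm this override is valid and needed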
|
||||||
|
--sv2-transformer-bindings # NOTE(review): could not match this flag to any documented vLLM engine argument, and "SA" is undefined — confirm or remove
|
||||||
authSecret: hf-api-secret
|
authSecret: hf-api-secret
|
||||||
storage:
|
storage:
|
||||||
sharedMemorySizeLimit: 64Gi
|
sharedMemorySizeLimit: 64Gi
|
||||||
|
|||||||
Reference in New Issue
Block a user