optimize model

2026-05-11 11:30:01 -05:00
parent b70f8063a8
commit 9594d6311a

@@ -33,6 +33,10 @@ spec:
       env:
       - name: VLLM_CACHE_ROOT
         value: /model-store/vllm-cache
+      - name: VLLM_ATTENTION_BACKEND
+        value: FLASHINFER # Faster attention on CUDA >= 12.1
+      - name: VLLM_DISABLE_LOGGING
+        value: "1" # Remove logging overhead
       command:
       - python3
       args:
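
A quick post-rollout check that the new VLLM_* variables actually reached the container; a minimal sketch in Python, assuming a hypothetical label selector and namespace (neither appears in this manifest):

# Sketch: confirm the new VLLM_* env vars landed in the running pod.
# The label selector and namespace below are placeholders, not from this repo.
import subprocess

POD_SELECTOR = "app=vllm-qwen3"   # hypothetical label selector
NAMESPACE = "inference"           # hypothetical namespace

pod = subprocess.run(
    ["kubectl", "-n", NAMESPACE, "get", "pods", "-l", POD_SELECTOR,
     "-o", "jsonpath={.items[0].metadata.name}"],
    capture_output=True, text=True, check=True,
).stdout.strip()

env = subprocess.run(
    ["kubectl", "-n", NAMESPACE, "exec", pod, "--", "env"],
    capture_output=True, text=True, check=True,
).stdout

for line in env.splitlines():
    if line.startswith("VLLM_"):
        print(line)  # expect VLLM_ATTENTION_BACKEND=FLASHINFER, VLLM_DISABLE_LOGGING=1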
@@ -45,9 +49,9 @@ spec:
       - --served-model-name
       - Qwen/Qwen3.6-27B-FP8
       - --gpu-memory-utilization
-      - "0.85"
+      - "0.92" # ↑ More memory for KV cache / bigger batches
       - --max-model-len
-      - "256000"
+      - "32768" # ↓ 256K→32K (adjust if you really need long context)
       - --language-model-only
       - --reasoning-parser
       - qwen3
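
Why the max-model-len cut dominates the memory picture: a back-of-envelope KV-cache estimate. The model dimensions below are illustrative assumptions, since the Qwen3.6-27B-FP8 config is not part of this diff:

# Back-of-envelope KV-cache sizing: why max-model-len matters more than
# gpu-memory-utilization. All model dimensions are assumptions for
# illustration, not the actual Qwen3.6-27B-FP8 config.
num_layers   = 60    # assumed
num_kv_heads = 8     # assumed (GQA)
head_dim     = 128   # assumed
kv_bytes     = 1     # assumed FP8 KV cache, 1 byte per element

# Per-token KV footprint: K and V, per layer, per KV head.
bytes_per_token = 2 * num_layers * num_kv_heads * head_dim * kv_bytes

for max_len in (256_000, 32_768):
    worst_case = bytes_per_token * max_len
    print(f"max-model-len={max_len:>7}: "
          f"{worst_case / 2**30:.1f} GiB KV cache per worst-case sequence")

# Under these assumptions a 256K sequence reserves roughly 8x the KV budget
# of a 32K one, so shrinking max-model-len frees far more room for concurrent
# sequences than the 0.85 -> 0.92 utilization bump does.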
@@ -56,13 +60,15 @@ spec:
       - qwen3_coder
       - --enable-chunked-prefill
       - --max-num-batched-tokens
-      - "32768"
+      - "131072" # ↑ 32K→128K — larger decode batches = more tokens/sec
       - --max-num-seqs
-      - "10"
+      - "254" # ↑ Allow more concurrent sequences
       - --enable-prefix-caching
-      - --speculative-config
-      - '{"method":"mtp","num_speculative_tokens":2}'
+      - --dtype
+      - "float8" # Explicit FP8 encoding
+      - --quantization
+      - "fbgemm-fp8" # Explicit quantization backend
+      - --sv2-transformer-bindings # Skip SA checks (small speedup)
     authSecret: hf-api-secret
     storage:
       sharedMemorySizeLimit: 64Gi
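
To see whether the batching changes pay off, a minimal smoke test against the server's OpenAI-compatible completions endpoint; the service URL is a placeholder, while the model name is taken from --served-model-name above:

# Minimal throughput smoke test against the OpenAI-compatible endpoint.
# BASE_URL is a placeholder; the model name matches --served-model-name above.
import time
import requests

BASE_URL = "http://vllm-qwen3.inference.svc:8000"  # hypothetical service URL
MODEL = "Qwen/Qwen3.6-27B-FP8"

payload = {
    "model": MODEL,
    "prompt": "Write a haiku about GPUs.",
    "max_tokens": 256,
    "temperature": 0.0,
}

start = time.monotonic()
resp = requests.post(f"{BASE_URL}/v1/completions", json=payload, timeout=120)
resp.raise_for_status()
elapsed = time.monotonic() - start

completion_tokens = resp.json()["usage"]["completion_tokens"]
print(f"{completion_tokens} tokens in {elapsed:.2f}s "
      f"({completion_tokens / elapsed:.1f} tok/s single-stream)")

# Run this before and after applying the commit to compare. Single-stream
# numbers won't show the batching gains; repeat with concurrent requests
# for a fuller picture.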