Use less GPU memory and add swap space

This commit is contained in:
2026-05-11 11:53:13 -05:00
parent 9ea69aedb4
commit 82a411166a

View File

@@ -49,7 +49,7 @@ spec:
- --served-model-name - --served-model-name
- Qwen/Qwen3.6-27B-FP8 - Qwen/Qwen3.6-27B-FP8
- --gpu-memory-utilization - --gpu-memory-utilization
- "0.90" # ↑ More memory for KV cache / bigger batches - "0.80" # ↓ Lowered from 0.90 to reduce GPU memory use (frees VRAM headroom)
- --max-model-len - --max-model-len
- "32768" # ↓ 256K→32K (adjust if you really need long context) - "32768" # ↓ 256K→32K (adjust if you really need long context)
- --language-model-only - --language-model-only
@@ -64,6 +64,8 @@ spec:
- --max-num-seqs - --max-num-seqs
- "254" # ↑ Allow more concurrent sequences - "254" # ↑ Allow more concurrent sequences
- --enable-prefix-caching - --enable-prefix-caching
- --swap-space
- "8"
authSecret: hf-api-secret authSecret: hf-api-secret
storage: storage:
sharedMemorySizeLimit: 64Gi sharedMemorySizeLimit: 64Gi