Use less GPU memory and add swap space
This commit is contained in:
@@ -49,7 +49,7 @@ spec:
|
|||||||
- --served-model-name
|
- --served-model-name
|
||||||
- Qwen/Qwen3.6-27B-FP8
|
- Qwen/Qwen3.6-27B-FP8
|
||||||
- --gpu-memory-utilization
|
- --gpu-memory-utilization
|
||||||
- "0.90" # ↑ More memory for KV cache / bigger batches
|
- "0.80" # ↓ 0.90→0.80: leave more GPU memory free (smaller KV cache budget / smaller batches)
|
||||||
- --max-model-len
|
- --max-model-len
|
||||||
- "32768" # ↓ 256K→32K (adjust if you really need long context)
|
- "32768" # ↓ 256K→32K (adjust if you really need long context)
|
||||||
- --language-model-only
|
- --language-model-only
|
||||||
@@ -64,6 +64,8 @@ spec:
|
|||||||
- --max-num-seqs
|
- --max-num-seqs
|
||||||
- "254" # ↑ Allow more concurrent sequences
|
- "254" # ↑ Allow more concurrent sequences
|
||||||
- --enable-prefix-caching
|
- --enable-prefix-caching
|
||||||
|
- --swap-space
|
||||||
|
- "8" # CPU swap space per GPU, in GiB (per vLLM docs; confirm for the deployed vLLM version)
|
||||||
authSecret: hf-api-secret
|
authSecret: hf-api-secret
|
||||||
storage:
|
storage:
|
||||||
sharedMemorySizeLimit: 64Gi
|
sharedMemorySizeLimit: 64Gi
|
||||||
|
|||||||
Reference in New Issue
Block a user