Lower --gpu-memory-utilization to 0.80 and add --swap-space 8

2026-05-11 11:53:13 -05:00
parent 9ea69aedb4
commit 82a411166a
1 changed file with 3 additions and 1 deletion
--- a/clusters/k3s-dgx/nim-service/qwen.yaml
+++ b/clusters/k3s-dgx/nim-service/qwen.yaml
@@ -49,7 +49,7 @@ spec:
    - --served-model-name
    - Qwen/Qwen3.6-27B-FP8
    - --gpu-memory-utilization
-    - "0.90"                  # ↑ More memory for KV cache / bigger batches
+    - "0.80"                  # ↓ 0.90→0.80: leave GPU memory headroom (less room for KV cache)
    - --max-model-len
    - "32768"                 # ↓ 256K→32K (adjust if you really need long context)
    - --language-model-only
@@ -64,6 +64,8 @@ spec:
    - --max-num-seqs
    - "254"                   # ↑ Allow more concurrent sequences
    - --enable-prefix-caching
+    - --swap-space
+    - "8"                     # CPU swap space per GPU, in GiB
  authSecret: hf-api-secret
  storage:
    sharedMemorySizeLimit: 64Gi