From 82a411166aa01f6a07ce9c0202f5059271df0dc4 Mon Sep 17 00:00:00 2001 From: HaimKortovich Date: Mon, 11 May 2026 11:53:13 -0500 Subject: [PATCH] use less gpu and add swap --- clusters/k3s-dgx/nim-service/qwen.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/clusters/k3s-dgx/nim-service/qwen.yaml b/clusters/k3s-dgx/nim-service/qwen.yaml index ef146df..db19c0b 100644 --- a/clusters/k3s-dgx/nim-service/qwen.yaml +++ b/clusters/k3s-dgx/nim-service/qwen.yaml @@ -49,7 +49,7 @@ spec: - --served-model-name - Qwen/Qwen3.6-27B-FP8 - --gpu-memory-utilization - - "0.90" # ↑ More memory for KV cache / bigger batches + - "0.80" # ↓ 0.90→0.80 to leave GPU memory headroom (less room for KV cache) - --max-model-len - "32768" # ↓ 256K→32K (adjust if you really need long context) - --language-model-only @@ -64,6 +64,8 @@ spec: - --max-num-seqs - "254" # ↑ Allow more concurrent sequences - --enable-prefix-caching + - --swap-space + - "8" authSecret: hf-api-secret storage: sharedMemorySizeLimit: 64Gi