diff --git a/clusters/k3s-dgx/nim-service/qwen.yaml b/clusters/k3s-dgx/nim-service/qwen.yaml index 872b8e9..ef146df 100644 --- a/clusters/k3s-dgx/nim-service/qwen.yaml +++ b/clusters/k3s-dgx/nim-service/qwen.yaml @@ -49,7 +49,7 @@ spec: - --served-model-name - Qwen/Qwen3.6-27B-FP8 - --gpu-memory-utilization - - "0.92" # ↑ More memory for KV cache / bigger batches + - "0.90" # ↓ 0.92→0.90 (less memory for KV cache / batches, more GPU headroom) - --max-model-len - "32768" # ↓ 256K→32K (adjust if you really need long context) - --language-model-only