From e99199363687e690a65f8db0a656b92f18b9a0ab Mon Sep 17 00:00:00 2001
From: HaimKortovich
Date: Mon, 11 May 2026 13:03:44 -0500
Subject: [PATCH] optimize model

---
 clusters/k3s-dgx/nim-service/qwen.yaml | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/clusters/k3s-dgx/nim-service/qwen.yaml b/clusters/k3s-dgx/nim-service/qwen.yaml
index 932f138..f3319b6 100644
--- a/clusters/k3s-dgx/nim-service/qwen.yaml
+++ b/clusters/k3s-dgx/nim-service/qwen.yaml
@@ -49,19 +49,32 @@ spec:
     - --gpu-memory-utilization
     - "0.80"  # ↑ More memory for KV cache / bigger batches
     - --max-model-len
-    - "32768"  # ↓ 256K→32K (adjust if you really need long context)
-    - --language-model-only
-    - --reasoning-parser
-    - qwen3
+    - "262144"
+    - --max-num-batched-tokens
+    - "16384"
+    - --max-num-seqs
+    - "4"
+    - --enable-prefix-caching
+    - --enable-chunked-prefill
+    - --load-format
+    - instanttensor
+    - --attention-backend
+    - flashinfer
+    - --dtype
+    - auto
+    - --kv-cache-dtype
+    - fp8
+    - --trust-remote-code
     - --enable-auto-tool-choice
     - --tool-call-parser
     - qwen3_coder
-    - --enable-chunked-prefill
-    - --max-num-batched-tokens
-    - "131072"  # ↑ 32K→128K — larger decode batches = more tokens/sec
-    - --max-num-seqs
-    - "254"  # ↑ Allow more concurrent sequences
-    - --enable-prefix-caching
+    - --reasoning-parser
+    - qwen3
+    - --default-chat-template-kwargs
+    - '{"preserve_thinking": true}'
+    - --override-generation-config
+    - '{"temperature": 0.6, "top_p": 0.95, "top_k": 20, "min_p": 0.0, "presence_penalty": 0.0, "repetition_penalty": 1.0}'
+    - --disable-log-requests
     - --attention-backend
     - FLASHINFER
   authSecret: hf-api-secret
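
Usage note (not part of the patch): because the patch bakes sampling defaults into the server via --override-generation-config, clients can call the OpenAI-compatible endpoint without passing temperature or top_p themselves. Below is a minimal sketch of such a request; the base_url and model id are assumptions, not values taken from this repo, so substitute whatever your NIMService actually exposes (GET /v1/models lists the served model id).

# Minimal usage sketch, assuming an in-cluster OpenAI-compatible endpoint.
# base_url and model id are hypothetical; check your own deployment.
from openai import OpenAI

client = OpenAI(
    base_url="http://qwen.nim-service.svc.cluster.local:8000/v1",  # assumed service URL
    api_key="unused",  # placeholder; the client library requires a value
)

resp = client.chat.completions.create(
    model="qwen3-coder",  # assumed model id
    messages=[{"role": "user", "content": "Write a k8s CronJob that prunes old images."}],
    max_tokens=512,
    # temperature/top_p intentionally omitted: the patch sets server-side
    # defaults via --override-generation-config.
)
print(resp.choices[0].message.content)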