From e99199363687e690a65f8db0a656b92f18b9a0ab Mon Sep 17 00:00:00 2001
From: HaimKortovich
Date: Mon, 11 May 2026 13:03:44 -0500
Subject: [PATCH] optimize model

---
 clusters/k3s-dgx/nim-service/qwen.yaml | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/clusters/k3s-dgx/nim-service/qwen.yaml b/clusters/k3s-dgx/nim-service/qwen.yaml
index 932f138..f3319b6 100644
--- a/clusters/k3s-dgx/nim-service/qwen.yaml
+++ b/clusters/k3s-dgx/nim-service/qwen.yaml
@@ -49,19 +49,32 @@ spec:
     - --gpu-memory-utilization
     - "0.80"  # ↑ More memory for KV cache / bigger batches
     - --max-model-len
-    - "32768"  # ↓ 256K→32K (adjust if you really need long context)
-    - --language-model-only
-    - --reasoning-parser
-    - qwen3
+    - "262144"
+    - --max-num-batched-tokens
+    - "16384"
+    - --max-num-seqs
+    - "4"
+    - --enable-prefix-caching
+    - --enable-chunked-prefill
+    - --load-format
+    - instanttensor
+    - --attention-backend
+    - flashinfer
+    - --dtype
+    - auto
+    - --kv-cache-dtype
+    - fp8
+    - --trust-remote-code
     - --enable-auto-tool-choice
     - --tool-call-parser
     - qwen3_coder
-    - --enable-chunked-prefill
-    - --max-num-batched-tokens
-    - "131072"  # ↑ 32K→128K — larger decode batches = more tokens/sec
-    - --max-num-seqs
-    - "254"  # ↑ Allow more concurrent sequences
-    - --enable-prefix-caching
+    - --reasoning-parser
+    - qwen3
+    - --default-chat-template-kwargs
+    - '{"preserve_thinking": true}'
+    - --override-generation-config
+    - '{"temperature": 0.6, "top_p": 0.95, "top_k": 20, "min_p": 0.0, "presence_penalty": 0.0, "repetition_penalty": 1.0}'
+    - --disable-log-requests
     - --attention-backend
     - FLASHINFER
   authSecret: hf-api-secret
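
Usage note (not part of the patch): because the patch bakes sampling defaults into the server via --override-generation-config, clients can call the OpenAI-compatible endpoint without passing temperature or top_p themselves. Below is a minimal sketch of such a request; the base_url and model id are assumptions, not values taken from this repo, so substitute whatever your NIMService actually exposes (GET /v1/models lists the served model id).

# Minimal usage sketch, assuming an in-cluster OpenAI-compatible endpoint.
# base_url and model id are hypothetical; check your own deployment.
from openai import OpenAI

client = OpenAI(
    base_url="http://qwen.nim-service.svc.cluster.local:8000/v1",  # assumed service URL
    api_key="unused",  # placeholder; the client library requires a value
)

resp = client.chat.completions.create(
    model="qwen3-coder",  # assumed model id
    messages=[{"role": "user", "content": "Write a k8s CronJob that prunes old images."}],
    max_tokens=512,
    # temperature/top_p intentionally omitted: the patch sets server-side
    # defaults via --override-generation-config.
)
print(resp.choices[0].message.content)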