From 9594d6311ae10931c776c6bb8e5fae3a9d66eb83 Mon Sep 17 00:00:00 2001
From: HaimKortovich
Date: Mon, 11 May 2026 11:30:01 -0500
Subject: [PATCH] optimize model

---
 clusters/k3s-dgx/nim-service/qwen.yaml | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/clusters/k3s-dgx/nim-service/qwen.yaml b/clusters/k3s-dgx/nim-service/qwen.yaml
index e8b80b0..61fae86 100644
--- a/clusters/k3s-dgx/nim-service/qwen.yaml
+++ b/clusters/k3s-dgx/nim-service/qwen.yaml
@@ -33,6 +33,10 @@ spec:
   env:
   - name: VLLM_CACHE_ROOT
     value: /model-store/vllm-cache
+  - name: VLLM_ATTENTION_BACKEND
+    value: FLASHINFER # Faster attention on CUDA >= 12.1
+  - name: VLLM_DISABLE_LOGGING
+    value: "1" # Remove logging overhead
   command:
   - python3
   args:
@@ -45,9 +49,9 @@ spec:
   - --served-model-name
   - Qwen/Qwen3.6-27B-FP8
   - --gpu-memory-utilization
-  - "0.85"
+  - "0.92" # ↑ More memory for KV cache / bigger batches
   - --max-model-len
-  - "256000"
+  - "32768" # ↓ 256K→32K (adjust if you really need long context)
   - --language-model-only
   - --reasoning-parser
   - qwen3
@@ -56,13 +60,15 @@ spec:
   - qwen3_coder
   - --enable-chunked-prefill
   - --max-num-batched-tokens
-  - "32768"
+  - "131072" # ↑ 32K→128K; larger decode batches = more tokens/sec
   - --max-num-seqs
-  - "10"
-  - --enable-prefix-caching
-  - --speculative-config
-  - '{"method":"mtp","num_speculative_tokens":2}'
-
+  - "254" # ↑ Allow more concurrent sequences
+  - --enable-prefix-caching
+  - --dtype
+  - "float8" # Explicit FP8 encoding
+  - --quantization
+  - "fbgemm-fp8" # Explicit quantization backend
+  - --sv2-transformer-bindings # Skip SA checks (small speedup)
   authSecret: hf-api-secret
   storage:
     sharedMemorySizeLimit: 64Gi
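
Note (not part of the patch): the throughput reasoning in the comments above ("larger decode batches = more tokens/sec") is easiest to confirm by timing a request against the OpenAI-compatible endpoint the service exposes. The sketch below is a minimal check under assumptions: the in-cluster URL is a placeholder for this cluster's actual Service address, and the model name is reused from --served-model-name in the diff.

# throughput_check.py -- minimal single-request tokens/sec probe (sketch, not part of the deployment)
import time

import requests

BASE_URL = "http://qwen.nim-service.svc.cluster.local:8000"  # placeholder; use the real Service address
MODEL = "Qwen/Qwen3.6-27B-FP8"  # matches --served-model-name in the patch

payload = {
    "model": MODEL,
    "prompt": "Write a short explanation of KV-cache paging.",
    "max_tokens": 512,
    "temperature": 0,
}

start = time.time()
resp = requests.post(f"{BASE_URL}/v1/completions", json=payload, timeout=300)
resp.raise_for_status()
elapsed = time.time() - start

# The OpenAI-compatible server reports token counts in the "usage" field.
usage = resp.json()["usage"]
print(f"completion tokens: {usage['completion_tokens']}")
print(f"tokens/sec (single request): {usage['completion_tokens'] / elapsed:.1f}")

Running the same probe before and after applying the patch (ideally with several concurrent requests) gives a like-for-like comparison of the new batching settings.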