optimize model

2026-05-11 11:30:01 -05:00
parent b70f8063a8
commit 9594d6311a

@@ -33,6 +33,10 @@ spec:
       env:
       - name: VLLM_CACHE_ROOT
         value: /model-store/vllm-cache
+      - name: VLLM_ATTENTION_BACKEND
+        value: FLASHINFER # Faster attention on CUDA >= 12.1
+      - name: VLLM_DISABLE_LOGGING
+        value: "1" # Remove logging overhead
       command:
       - python3
       args:
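
A quick post-rollout check that the new VLLM_* variables actually reached the container; a minimal sketch in Python, assuming a hypothetical label selector and namespace (neither appears in this manifest):

# Sketch: confirm the new VLLM_* env vars landed in the running pod.
# The label selector and namespace below are placeholders, not from this repo.
import subprocess

POD_SELECTOR = "app=vllm-qwen3"   # hypothetical label selector
NAMESPACE = "inference"           # hypothetical namespace

pod = subprocess.run(
    ["kubectl", "-n", NAMESPACE, "get", "pods", "-l", POD_SELECTOR,
     "-o", "jsonpath={.items[0].metadata.name}"],
    capture_output=True, text=True, check=True,
).stdout.strip()

env = subprocess.run(
    ["kubectl", "-n", NAMESPACE, "exec", pod, "--", "env"],
    capture_output=True, text=True, check=True,
).stdout

for line in env.splitlines():
    if line.startswith("VLLM_"):
        print(line)  # expect VLLM_ATTENTION_BACKEND=FLASHINFER, VLLM_DISABLE_LOGGING=1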
@@ -45,9 +49,9 @@ spec:
       - --served-model-name
       - Qwen/Qwen3.6-27B-FP8
       - --gpu-memory-utilization
-      - "0.85"
+      - "0.92" # ↑ More memory for KV cache / bigger batches
       - --max-model-len
-      - "256000"
+      - "32768" # ↓ 256K→32K (adjust if you really need long context)
       - --language-model-only
       - --reasoning-parser
       - qwen3
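
Why the max-model-len cut dominates the memory picture: a back-of-envelope KV-cache estimate. The model dimensions below are illustrative assumptions, since the Qwen3.6-27B-FP8 config is not part of this diff:

# Back-of-envelope KV-cache sizing: why max-model-len matters more than
# gpu-memory-utilization. All model dimensions are assumptions for
# illustration, not the actual Qwen3.6-27B-FP8 config.
num_layers   = 60    # assumed
num_kv_heads = 8     # assumed (GQA)
head_dim     = 128   # assumed
kv_bytes     = 1     # assumed FP8 KV cache, 1 byte per element

# Per-token KV footprint: K and V, per layer, per KV head.
bytes_per_token = 2 * num_layers * num_kv_heads * head_dim * kv_bytes

for max_len in (256_000, 32_768):
    worst_case = bytes_per_token * max_len
    print(f"max-model-len={max_len:>7}: "
          f"{worst_case / 2**30:.1f} GiB KV cache per worst-case sequence")

# Under these assumptions a 256K sequence reserves roughly 8x the KV budget
# of a 32K one, so shrinking max-model-len frees far more room for concurrent
# sequences than the 0.85 -> 0.92 utilization bump does.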
@@ -56,13 +60,15 @@ spec:
       - qwen3_coder
       - --enable-chunked-prefill
       - --max-num-batched-tokens
-      - "32768"
+      - "131072" # ↑ 32K→128K — larger decode batches = more tokens/sec
       - --max-num-seqs
-      - "10"
+      - "254" # ↑ Allow more concurrent sequences
       - --enable-prefix-caching
-      - --speculative-config
-      - '{"method":"mtp","num_speculative_tokens":2}'
+      - --dtype
+      - "float8" # Explicit FP8 encoding
+      - --quantization
+      - "fbgemm-fp8" # Explicit quantization backend
+      - --sv2-transformer-bindings # Skip SA checks (small speedup)
     authSecret: hf-api-secret
     storage:
       sharedMemorySizeLimit: 64Gi
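
To see whether the batching changes pay off, a minimal smoke test against the server's OpenAI-compatible completions endpoint; the service URL is a placeholder, while the model name is taken from --served-model-name above:

# Minimal throughput smoke test against the OpenAI-compatible endpoint.
# BASE_URL is a placeholder; the model name matches --served-model-name above.
import time
import requests

BASE_URL = "http://vllm-qwen3.inference.svc:8000"  # hypothetical service URL
MODEL = "Qwen/Qwen3.6-27B-FP8"

payload = {
    "model": MODEL,
    "prompt": "Write a haiku about GPUs.",
    "max_tokens": 256,
    "temperature": 0.0,
}

start = time.monotonic()
resp = requests.post(f"{BASE_URL}/v1/completions", json=payload, timeout=120)
resp.raise_for_status()
elapsed = time.monotonic() - start

completion_tokens = resp.json()["usage"]["completion_tokens"]
print(f"{completion_tokens} tokens in {elapsed:.2f}s "
      f"({completion_tokens / elapsed:.1f} tok/s single-stream)")

# Run this before and after applying the commit to compare. Single-stream
# numbers won't show the batching gains; repeat with concurrent requests
# for a fuller picture.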