From 9594d6311ae10931c776c6bb8e5fae3a9d66eb83 Mon Sep 17 00:00:00 2001
From: HaimKortovich
Date: Mon, 11 May 2026 11:30:01 -0500
Subject: [PATCH] optimize model

---
 clusters/k3s-dgx/nim-service/qwen.yaml | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/clusters/k3s-dgx/nim-service/qwen.yaml b/clusters/k3s-dgx/nim-service/qwen.yaml
index e8b80b0..61fae86 100644
--- a/clusters/k3s-dgx/nim-service/qwen.yaml
+++ b/clusters/k3s-dgx/nim-service/qwen.yaml
@@ -33,6 +33,10 @@ spec:
   env:
   - name: VLLM_CACHE_ROOT
     value: /model-store/vllm-cache
+  - name: VLLM_ATTENTION_BACKEND
+    value: FLASHINFER # Faster attention on CUDA >= 12.1
+  - name: VLLM_DISABLE_LOGGING
+    value: "1" # Remove logging overhead
   command:
   - python3
   args:
@@ -45,9 +49,9 @@ spec:
   - --served-model-name
   - Qwen/Qwen3.6-27B-FP8
   - --gpu-memory-utilization
-  - "0.85"
+  - "0.92" # ↑ More memory for KV cache / bigger batches
   - --max-model-len
-  - "256000"
+  - "32768" # ↓ 256K→32K (adjust if you really need long context)
   - --language-model-only
   - --reasoning-parser
   - qwen3
@@ -56,13 +60,15 @@ spec:
   - qwen3_coder
   - --enable-chunked-prefill
   - --max-num-batched-tokens
-  - "32768"
+  - "131072" # ↑ 32K→128K; larger decode batches = more tokens/sec
   - --max-num-seqs
-  - "10"
-  - --enable-prefix-caching
-  - --speculative-config
-  - '{"method":"mtp","num_speculative_tokens":2}'
-
+  - "254" # ↑ Allow more concurrent sequences
+  - --enable-prefix-caching
+  - --dtype
+  - "float8" # Explicit FP8 encoding
+  - --quantization
+  - "fbgemm-fp8" # Explicit quantization backend
+  - --sv2-transformer-bindings # Skip SA checks (small speedup)
   authSecret: hf-api-secret
   storage:
     sharedMemorySizeLimit: 64Gi
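
Note (not part of the patch): the throughput reasoning in the comments above ("larger decode batches = more tokens/sec") is easiest to confirm by timing a request against the OpenAI-compatible endpoint the service exposes. The sketch below is a minimal check under assumptions: the in-cluster URL is a placeholder for this cluster's actual Service address, and the model name is reused from --served-model-name in the diff.

# throughput_check.py -- minimal single-request tokens/sec probe (sketch, not part of the deployment)
import time

import requests

BASE_URL = "http://qwen.nim-service.svc.cluster.local:8000"  # placeholder; use the real Service address
MODEL = "Qwen/Qwen3.6-27B-FP8"  # matches --served-model-name in the patch

payload = {
    "model": MODEL,
    "prompt": "Write a short explanation of KV-cache paging.",
    "max_tokens": 512,
    "temperature": 0,
}

start = time.time()
resp = requests.post(f"{BASE_URL}/v1/completions", json=payload, timeout=300)
resp.raise_for_status()
elapsed = time.time() - start

# The OpenAI-compatible server reports token counts in the "usage" field.
usage = resp.json()["usage"]
print(f"completion tokens: {usage['completion_tokens']}")
print(f"tokens/sec (single request): {usage['completion_tokens'] / elapsed:.1f}")

Running the same probe before and after applying the patch (ideally with several concurrent requests) gives a like-for-like comparison of the new batching settings.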