From 2e1ab2ea2d172fe6736e55e459730bd4596f9411 Mon Sep 17 00:00:00 2001 From: HaimKortovich Date: Thu, 7 May 2026 14:54:30 -0500 Subject: [PATCH] try beefier model --- clusters/k3s-dgx/nim-service/qwen.yaml | 42 +++++++++++--------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/clusters/k3s-dgx/nim-service/qwen.yaml b/clusters/k3s-dgx/nim-service/qwen.yaml index 4835811..4e68ddf 100644 --- a/clusters/k3s-dgx/nim-service/qwen.yaml +++ b/clusters/k3s-dgx/nim-service/qwen.yaml @@ -43,40 +43,32 @@ spec: apiVersion: apps.nvidia.com/v1alpha1 kind: NIMService metadata: - name: qwen3-32b-instruct + name: qwen36-27b-fp8 namespace: nim-service spec: - env: - - name: GPU_MEMORY_UTILIZATION - value: "0.90" - - name: NIM_MAX_NUM_SEQS - value: "32" - - name: NIM_MAX_NUM_BATCHED_TOKENS - value: "16384" - - name: ENABLE_AUTO_TOOL_CHOICE - value: "true" - - name: ENABLE_PREFIX_CACHING - value: "true" - - name: TRUNCATION_SIDE - value: "left" - - name: VLLM_LOGGING_LEVEL - value: "INFO" - - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN - value: "true" - - name: NIM_MAX_MODEL_LEN - value: "32768" - - name: NIM_ENABLE_KV_CACHE_REUSE - value: "1" + args: + - "--gpu-memory-utilization=0.85" + - "--served-model-name=qwen36" + - "--max-model-len=256K" + - "--language-model-only" + - "--reasoning-parser=qwen3" + - "--enable-auto-tool-choice" + - "--tool-call-parser=qwen3_coder" + - "--enable-chunked-prefill" + - "--max-num-batched-tokens=32768" + - "--max-num-seqs=10" + - "--enable-prefix-caching" + - '--speculative-config={"method":"mtp","num_speculative_tokens":2}' image: - repository: nvcr.io/nim/qwen/qwen3-32b-dgx-spark - tag: "1.1.0-variant" + repository: nvcr.io/nim/nvidia/llm-nim + tag: "1.12" pullPolicy: IfNotPresent pullSecrets: - ngc-secret authSecret: ngc-api-secret storage: nimCache: - name: qwen3-32b-instruct + name: qwen36-27b-fp8 replicas: 1 resources: limits: