From c5db9144a90a1884f7c271783baecceac1172c5d Mon Sep 17 00:00:00 2001 From: HaimKortovich Date: Thu, 7 May 2026 15:03:56 -0500 Subject: [PATCH] pass args --- clusters/k3s-dgx/nim-service/qwen.yaml | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/clusters/k3s-dgx/nim-service/qwen.yaml b/clusters/k3s-dgx/nim-service/qwen.yaml index ee491cd..aeb5088 100644 --- a/clusters/k3s-dgx/nim-service/qwen.yaml +++ b/clusters/k3s-dgx/nim-service/qwen.yaml @@ -46,23 +46,15 @@ metadata: name: qwen36-27b-fp8 namespace: nim-service spec: - args: - - "--gpu-memory-utilization=0.85" - - "--served-model-name=qwen36" - - "--max-model-len=256K" - - "--language-model-only" - - "--reasoning-parser=qwen3" - - "--enable-auto-tool-choice" - - "--tool-call-parser=qwen3_coder" - - "--enable-chunked-prefill" - - "--max-num-batched-tokens=32768" - - "--max-num-seqs=10" - - "--enable-prefix-caching" - - '--speculative-config={"method":"mtp","num_speculative_tokens":2}' + env: + - name: NIM_PASSTHROUGH_ARGS + value: "--gpu-memory-utilization 0.85 --served-model-name qwen36 --max-model-len 256K --language-model-only --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder --enable-chunked-prefill --max-num-batched-tokens 32768 --max-num-seqs 10 --enable-prefix-caching --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":2}'" image: - repository: vllm/vllm-openai - tag: "v0.19.1-cu130" + repository: nvcr.io/nim/nvidia/llm-nim + tag: "1.12" pullPolicy: IfNotPresent + pullSecrets: + - ngc-secret authSecret: hf-api-secret storage: nimCache: