From 62a42ed8f08d0efbc553a9c74e7036ebea3af8c0 Mon Sep 17 00:00:00 2001
From: HaimKortovich
Date: Thu, 7 May 2026 15:57:11 -0500
Subject: [PATCH] change backend

---
 clusters/k3s-dgx/nim-service/qwen.yaml | 32 +++++++++++++++-----------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/clusters/k3s-dgx/nim-service/qwen.yaml b/clusters/k3s-dgx/nim-service/qwen.yaml
index fa12962..3ff7d57 100644
--- a/clusters/k3s-dgx/nim-service/qwen.yaml
+++ b/clusters/k3s-dgx/nim-service/qwen.yaml
@@ -46,26 +46,30 @@ metadata:
   name: qwen36-27b-fp8
   namespace: nim-service
 spec:
-  env:
-    - name: NIM_DISABLE_MODEL_DOWNLOAD
-      value: "1"
-    - name: HF_HUB_OFFLINE
-      value: "1"
-    - name: NIM_BACKEND
-      value: vllm
-    - name: NIM_PASSTHROUGH_ARGS
-      value: "--gpu-memory-utilization 0.85 --served-model-name qwen36 --max-model-len 256K --language-model-only --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder --enable-chunked-prefill --max-num-batched-tokens 32768 --max-num-seqs 10 --enable-prefix-caching --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":2}'"
   image:
-    repository: nvcr.io/nim/nvidia/llm-nim
-    tag: "1.12"
+    repository: vllm/vllm-openai
+    tag: latest
     pullPolicy: IfNotPresent
-    pullSecrets:
-      - ngc-secret
+
+  command:
+    - python3
+
+  args:
+    - -m
+    - vllm.entrypoints.openai.api_server
+    - --model
+    - /model-store
+    - --served-model-name
+    - qwen36
+    - --gpu-memory-utilization
+    - "0.85"
+    - --max-model-len
+    - "262144"
+    - --enable-prefix-caching
   authSecret: hf-api-secret
   storage:
     nimCache:
       name: qwen36-27b-fp8
-      profile: Qwen3_5ForConditionalGeneration
   replicas: 1
   resources:
     limits: