Patch for the qwen manifest in the nim-service namespace.
Adds the NIM_BACKEND=vllm entry to spec.env and removes the
"profile: vllm" key from spec.storage.nimCache.
NOTE(review): this patch was recovered from a single-line copy; the
indentation of the context lines below was reconstructed from the key
nesting and must be confirmed against the target file before applying.

diff --git a/clusters/k3s-dgx/nim-service/qwen.yaml b/clusters/k3s-dgx/nim-service/qwen.yaml
index fc06171..bf29956 100644
--- a/clusters/k3s-dgx/nim-service/qwen.yaml
+++ b/clusters/k3s-dgx/nim-service/qwen.yaml
@@ -47,6 +47,8 @@ metadata:
   namespace: nim-service
 spec:
   env:
+    - name: NIM_BACKEND
+      value: vllm
     - name: NIM_PASSTHROUGH_ARGS
       value: "--gpu-memory-utilization 0.85 --served-model-name qwen36 --max-model-len 256K --language-model-only --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder --enable-chunked-prefill --max-num-batched-tokens 32768 --max-num-seqs 10 --enable-prefix-caching --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":2}'"
   image:
@@ -59,7 +61,6 @@ spec:
   storage:
     nimCache:
       name: qwen36-27b-fp8
-      profile: vllm
   replicas: 1
   resources:
     limits: