change backend

2026-05-07 15:57:11 -05:00
parent f9d81c3a17
commit 62a42ed8f0

@@ -46,26 +46,30 @@ metadata:
   name: qwen36-27b-fp8
   namespace: nim-service
 spec:
-  env:
-    - name: NIM_DISABLE_MODEL_DOWNLOAD
-      value: "1"
-    - name: HF_HUB_OFFLINE
-      value: "1"
-    - name: NIM_BACKEND
-      value: vllm
-    - name: NIM_PASSTHROUGH_ARGS
-      value: "--gpu-memory-utilization 0.85 --served-model-name qwen36 --max-model-len 256K --language-model-only --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder --enable-chunked-prefill --max-num-batched-tokens 32768 --max-num-seqs 10 --enable-prefix-caching --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":2}'"
   image:
-    repository: nvcr.io/nim/nvidia/llm-nim
-    tag: "1.12"
+    repository: vllm/vllm-openai
+    tag: latest
     pullPolicy: IfNotPresent
     pullSecrets:
       - ngc-secret
+  command:
+    - python3
+  args:
+    - -m
+    - vllm.entrypoints.openai.api_server
+    - --model
+    - /model-store
+    - --served-model-name
+    - qwen36
+    - --gpu-memory-utilization
+    - "0.85"
+    - --max-model-len
+    - "262144"
+    - --enable-prefix-caching
-  authSecret: hf-api-secret
   storage:
     nimCache:
       name: qwen36-27b-fp8
       profile: Qwen3_5ForConditionalGeneration
   replicas: 1
   resources:
     limits:
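
The explicit command/args carry over only part of the old NIM_PASSTHROUGH_ARGS: the reasoning and tool-call parsers, chunked prefill, batching limits, and the speculative-decoding config are dropped, while --max-model-len 256K becomes the literal token count 262144 (256 * 1024). Both the old and new configurations serve an OpenAI-compatible API, so a quick smoke test is one way to confirm the backend swap took. Below is a minimal sketch; the in-cluster hostname and vLLM's default port 8000 are assumptions, not part of this commit, and should be adjusted to the Service actually fronting this NIMService.

    # Minimal smoke test for the vLLM OpenAI-compatible endpoint.
    # Host and port are assumptions (vLLM's api_server listens on 8000
    # by default); adjust to the Service created for qwen36-27b-fp8.
    import requests

    BASE_URL = "http://qwen36-27b-fp8.nim-service.svc.cluster.local:8000"

    # The "model" field must match --served-model-name from the manifest.
    resp = requests.post(
        f"{BASE_URL}/v1/chat/completions",
        json={
            "model": "qwen36",
            "messages": [{"role": "user", "content": "Say hello."}],
            "max_tokens": 32,
        },
        timeout=60,
    )
    resp.raise_for_status()
    print(resp.json()["choices"][0]["message"]["content"])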