Add more GPUs and change the served model name

This commit is contained in:
2026-05-08 09:30:48 -05:00
parent de12558021
commit 2f922279ef

View File

@@ -1,27 +1,5 @@
# NIMCache: pre-pulls and caches the Qwen3-32B NIM model profile into a PVC
# so NIMService pods can start without re-downloading from NGC.
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMCache
metadata:
  name: qwen3-32b-instruct
  namespace: nim-service
spec:
  source:
    ngc:
      # NIM container that performs the model pull from NGC.
      modelPuller: nvcr.io/nim/qwen/qwen3-32b-dgx-spark:1.1.0-variant
      pullSecret: ngc-secret        # image pull secret for nvcr.io
      authSecret: ngc-api-secret    # NGC API key secret for model download
      model:
        engine: "vllm"
        # Quoted: the CRD expects a string, and an unquoted 1 would parse as int.
        tensorParallelism: "1"
        # Specific model profile hash to cache (selects engine/precision variant).
        profiles:
          - c4f105d92c72ab56200884dfacde9d2128b139755c06b9c883eeb3e287b7408a
  storage:
    pvc:
      create: true
      size: "100Gi"
      # NOTE(review): ReadWriteOnce limits the cache to pods on one node —
      # confirm this matches how NIMService schedules its replicas.
      volumeAccessMode: ReadWriteOnce
---
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMCache
metadata:
name: qwen36-27b-fp8
namespace: nim-service
@@ -65,9 +43,9 @@ spec:
- --host
- 0.0.0.0
- --served-model-name
- qwen36
- Qwen/Qwen3.6-27B-FP8
- --gpu-memory-utilization
- "0.85"
- "0.90"
- --max-model-len
- "256000"
- --language-model-only