Increase GPU memory utilization and change the served model name
This commit is contained in:
@@ -1,27 +1,5 @@
|
||||
apiVersion: apps.nvidia.com/v1alpha1
|
||||
kind: NIMCache
|
||||
metadata:
|
||||
name: qwen3-32b-instruct
|
||||
namespace: nim-service
|
||||
spec:
|
||||
source:
|
||||
ngc:
|
||||
modelPuller: nvcr.io/nim/qwen/qwen3-32b-dgx-spark:1.1.0-variant
|
||||
pullSecret: ngc-secret
|
||||
authSecret: ngc-api-secret
|
||||
model:
|
||||
engine: "vllm"
|
||||
tensorParallelism: "1"
|
||||
profiles:
|
||||
- c4f105d92c72ab56200884dfacde9d2128b139755c06b9c883eeb3e287b7408a
|
||||
storage:
|
||||
pvc:
|
||||
create: true
|
||||
size: "100Gi"
|
||||
volumeAccessMode: ReadWriteOnce
|
||||
---
|
||||
apiVersion: apps.nvidia.com/v1alpha1
|
||||
kind: NIMCache
|
||||
metadata:
|
||||
name: qwen36-27b-fp8
|
||||
namespace: nim-service
|
||||
@@ -65,9 +43,9 @@ spec:
|
||||
- --host
|
||||
- 0.0.0.0
|
||||
- --served-model-name
|
||||
- qwen36
|
||||
- Qwen/Qwen3.6-27B-FP8
|
||||
- --gpu-memory-utilization
|
||||
- "0.85"
|
||||
- "0.90"
|
||||
- --max-model-len
|
||||
- "256000"
|
||||
- --language-model-only
|
||||
|
||||
Reference in New Issue
Block a user