add more gpu and change model name served
This commit is contained in:
@@ -1,27 +1,5 @@
|
|||||||
apiVersion: apps.nvidia.com/v1alpha1
|
apiVersion: apps.nvidia.com/v1alpha1
|
||||||
kind: NIMCache
|
kind: NIMCache
|
||||||
metadata:
|
|
||||||
name: qwen3-32b-instruct
|
|
||||||
namespace: nim-service
|
|
||||||
spec:
|
|
||||||
source:
|
|
||||||
ngc:
|
|
||||||
modelPuller: nvcr.io/nim/qwen/qwen3-32b-dgx-spark:1.1.0-variant
|
|
||||||
pullSecret: ngc-secret
|
|
||||||
authSecret: ngc-api-secret
|
|
||||||
model:
|
|
||||||
engine: "vllm"
|
|
||||||
tensorParallelism: "1"
|
|
||||||
profiles:
|
|
||||||
- c4f105d92c72ab56200884dfacde9d2128b139755c06b9c883eeb3e287b7408a
|
|
||||||
storage:
|
|
||||||
pvc:
|
|
||||||
create: true
|
|
||||||
size: "100Gi"
|
|
||||||
volumeAccessMode: ReadWriteOnce
|
|
||||||
---
|
|
||||||
apiVersion: apps.nvidia.com/v1alpha1
|
|
||||||
kind: NIMCache
|
|
||||||
metadata:
|
metadata:
|
||||||
name: qwen36-27b-fp8
|
name: qwen36-27b-fp8
|
||||||
namespace: nim-service
|
namespace: nim-service
|
||||||
@@ -65,9 +43,9 @@ spec:
|
|||||||
- --host
|
- --host
|
||||||
- 0.0.0.0
|
- 0.0.0.0
|
||||||
- --served-model-name
|
- --served-model-name
|
||||||
- qwen36
|
- Qwen/Qwen3.6-27B-FP8
|
||||||
- --gpu-memory-utilization
|
- --gpu-memory-utilization
|
||||||
- "0.85"
|
- "0.90"
|
||||||
- --max-model-len
|
- --max-model-len
|
||||||
- "256000"
|
- "256000"
|
||||||
- --language-model-only
|
- --language-model-only
|
||||||
|
|||||||
Reference in New Issue
Block a user