diff --git a/clusters/k3s-dgx/nim-service/llama.yaml b/clusters/k3s-dgx/nim-service/llama.yaml
index 7b37660..64b44e8 100644
--- a/clusters/k3s-dgx/nim-service/llama.yaml
+++ b/clusters/k3s-dgx/nim-service/llama.yaml
@@ -1,21 +1,21 @@
 apiVersion: apps.nvidia.com/v1alpha1
 kind: NIMCache
 metadata:
-  name: meta-llama-3-2-1b-instruct
+  name: meta-llama-3-1-8b-instruct
   namespace: nim-service
 spec:
   source:
     ngc:
-      modelPuller: nvcr.io/nim/meta/llama-3.2-1b-instruct:1.12.0
+      modelPuller: nvcr.io/nim/meta/llama-3_1-8b-instruct:2.0.3
       pullSecret: ngc-secret
       authSecret: ngc-api-secret
       model:
-        engine: tensorrt_llm
+        engine: "vllm"
         tensorParallelism: "1"
   storage:
     pvc:
       create: true
-      size: "50Gi"
+      size: "100Gi"
       volumeAccessMode: ReadWriteOnce
 ---
 apiVersion: apps.nvidia.com/v1alpha1
@@ -25,16 +25,16 @@ metadata:
   namespace: nim-service
 spec:
   image:
-    repository: nvcr.io/nim/meta/llama-3.2-1b-instruct
-    tag: "1.12.0"
+    repository: nvcr.io/nim/meta/llama-3_1-8b-instruct
+    tag: "2.0.3"
     pullPolicy: IfNotPresent
     pullSecrets:
       - ngc-secret
   authSecret: ngc-api-secret
   storage:
     nimCache:
-      name: meta-llama-3-2-1b-instruct
-      profile: ''
+      name: meta-llama-3-1-8b-instruct
+      readOnly: true
   replicas: 1
   resources:
     limits: