# Two resources in the apps.nvidia.com/v1alpha1 API group, both created in the
# nim-service namespace:
#   1. NIMCache   - declares the NGC model source and a PVC to hold it.
#   2. NIMService - runs the matching image and mounts that cache by name.
---
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMCache
metadata:
  name: meta-llama-3-1-8b-instruct
  namespace: nim-service
spec:
  source:
    ngc:
      # Same image and tag as the NIMService below (repository + tag "2.0.3").
      # NOTE(review): presumably these must be bumped together - confirm.
      modelPuller: nvcr.io/nim/meta/llama-3.1-8b-instruct:2.0.3
      # Names of Secrets expected to exist in this namespace; they are not
      # defined in this manifest.
      pullSecret: ngc-secret
      authSecret: ngc-api-secret
      model:
        engine: "vllm"
        # Quoted on purpose: kept as the string "1", not the integer 1.
        tensorParallelism: "1"
  storage:
    pvc:
      # No storageClass is set here.
      # NOTE(review): presumably the cluster default class is used - confirm.
      create: true
      size: "100Gi"
      volumeAccessMode: ReadWriteOnce
---
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMService
metadata:
  name: meta-llama-3-1-8b-instruct
  namespace: nim-service
spec:
  image:
    repository: nvcr.io/nim/meta/llama-3.1-8b-instruct
    # Quoted so the version is read as a string.
    tag: "2.0.3"
    pullPolicy: IfNotPresent
    pullSecrets:
      - ngc-secret
  authSecret: ngc-api-secret
  storage:
    nimCache:
      # Must equal metadata.name of the NIMCache document above.
      name: meta-llama-3-1-8b-instruct
  replicas: 1
  resources:
    limits:
      nvidia.com/gpu: 1
  expose:
    service:
      type: ClusterIP
      port: 8000