diff --git a/clusters/k3s-dgx/nim-service/kustomization.yaml b/clusters/k3s-dgx/nim-service/kustomization.yaml index bdea9fc..e8a542a 100644 --- a/clusters/k3s-dgx/nim-service/kustomization.yaml +++ b/clusters/k3s-dgx/nim-service/kustomization.yaml @@ -3,5 +3,4 @@ kind: Kustomization namespace: nim-service resources: - namespace.yaml - - llama.yaml - qwen.yaml diff --git a/clusters/k3s-dgx/nim-service/llama.yaml b/clusters/k3s-dgx/nim-service/llama.yaml deleted file mode 100644 index f088dac..0000000 --- a/clusters/k3s-dgx/nim-service/llama.yaml +++ /dev/null @@ -1,68 +0,0 @@ -apiVersion: apps.nvidia.com/v1alpha1 -kind: NIMCache -metadata: - name: meta-llama-3-1-8b-instruct - namespace: nim-service -spec: - source: - ngc: - modelPuller: nvcr.io/nim/meta/llama-3.1-8b-instruct:2.0.3 - pullSecret: ngc-secret - authSecret: ngc-api-secret - model: - engine: "vllm" - tensorParallelism: "1" - storage: - pvc: - create: true - size: "100Gi" - volumeAccessMode: ReadWriteOnce ---- -apiVersion: apps.nvidia.com/v1alpha1 -kind: NIMService -metadata: - name: meta-llama-3-1-8b-instruct - namespace: nim-service -spec: - args: - - --enable-auto-tool-choice - - --tool-call-parser - - llama3_json - image: - repository: nvcr.io/nim/meta/llama-3.1-8b-instruct - tag: "2.0.3" - pullPolicy: IfNotPresent - pullSecrets: - - ngc-secret - authSecret: ngc-api-secret - storage: - nimCache: - name: meta-llama-3-1-8b-instruct - replicas: 1 - resources: - limits: - nvidia.com/gpu: 1 - expose: - service: - type: ClusterIP - port: 8000 ---- -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: llm-route - namespace: nim-service -spec: - parentRefs: - - name: envoy-gateway - namespace: default - hostnames: - - "mcp.corredorconect.com" - rules: - - matches: - - path: - type: PathPrefix - value: / - backendRefs: - - name: meta-llama-3-1-8b-instruct - port: 8000 diff --git a/clusters/k3s-dgx/nim-service/qwen.yaml b/clusters/k3s-dgx/nim-service/qwen.yaml index 362c127..c2086bb 
100644 --- a/clusters/k3s-dgx/nim-service/qwen.yaml +++ b/clusters/k3s-dgx/nim-service/qwen.yaml @@ -18,3 +18,51 @@ spec: size: "100Gi" volumeAccessMode: ReadWriteOnce --- +apiVersion: apps.nvidia.com/v1alpha1 +kind: NIMService +metadata: + name: qwen3-32b-instruct + namespace: nim-service +spec: + args: + - --enable-auto-tool-choice + - --tool-call-parser + - hermes + image: + repository: nvcr.io/nim/qwen/qwen3-32b-dgx-spark + tag: "1.1.0-variant" + pullPolicy: IfNotPresent + pullSecrets: + - ngc-secret + authSecret: ngc-api-secret + storage: + nimCache: + name: qwen3-32b-instruct + replicas: 1 + resources: + limits: + nvidia.com/gpu: 1 + expose: + service: + type: ClusterIP + port: 8000 +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-route + namespace: nim-service +spec: + parentRefs: + - name: envoy-gateway + namespace: default + hostnames: + - "mcp.corredorconect.com" + rules: + - matches: + - path: + type: PathPrefix + value: / + backendRefs: + - name: qwen3-32b-instruct + port: 8000