# Manifests for the `nim-service` namespace: two NIMCache resources, one
# NIMService, an HTTPRoute, and a SecurityPolicy attached to that route.
---
# NIMCache sourced from NGC; storage is a newly created 100Gi RWO PVC.
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMCache
metadata:
  name: qwen3-32b-instruct
  namespace: nim-service
spec:
  source:
    ngc:
      modelPuller: nvcr.io/nim/qwen/qwen3-32b-dgx-spark:1.1.0-variant
      pullSecret: ngc-secret
      authSecret: ngc-api-secret
      model:
        engine: "vllm"
        tensorParallelism: "1"
        profiles:
          - "c4f105d92c72ab56200884dfacde9d2128b139755c06b9c883eeb3e287b7408a"
  storage:
    pvc:
      create: true
      size: "100Gi"
      volumeAccessMode: ReadWriteOnce
---
# NIMCache sourced from Hugging Face (namespace "Qwen", model
# "Qwen3.6-27B-FP8"); storage is a newly created 50Gi RWO PVC.
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMCache
metadata:
  name: qwen36-27b-fp8
  namespace: nim-service
spec:
  source:
    hf:
      endpoint: "https://huggingface.co"
      namespace: "Qwen"
      authSecret: hf-api-secret
      modelPuller: nvcr.io/nim/nvidia/llm-nim:1.12
      pullSecret: ngc-secret
      modelName: "Qwen3.6-27B-FP8"
  storage:
    pvc:
      create: true
      size: "50Gi"
      volumeAccessMode: ReadWriteOnce
---
# NIMService that mounts the `qwen36-27b-fp8` NIMCache defined above and is
# exposed as a ClusterIP service on port 8000.
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMService
metadata:
  name: qwen36-27b-fp8
  namespace: nim-service
spec:
  # Extra command-line arguments passed to the container.
  args:
    - "--gpu-memory-utilization=0.85"
    - "--served-model-name=qwen36"
    - "--max-model-len=256K"
    - "--language-model-only"
    - "--reasoning-parser=qwen3"
    - "--enable-auto-tool-choice"
    - "--tool-call-parser=qwen3_coder"
    - "--enable-chunked-prefill"
    - "--max-num-batched-tokens=32768"
    - "--max-num-seqs=10"
    - "--enable-prefix-caching"
    # Single-quoted so the embedded JSON double quotes stay literal.
    - '--speculative-config={"method":"mtp","num_speculative_tokens":2}'
  image:
    repository: nvcr.io/nim/nvidia/llm-nim
    # Quoted so the tag stays the string "1.12" rather than a float.
    tag: "1.12"
    pullPolicy: IfNotPresent
    pullSecrets:
      - ngc-secret
  authSecret: ngc-api-secret
  storage:
    nimCache:
      # Must match metadata.name of the NIMCache to mount.
      name: qwen36-27b-fp8
  replicas: 1
  resources:
    limits:
      nvidia.com/gpu: 1
  expose:
    service:
      type: ClusterIP
      port: 8000
---
# HTTPRoute attaching host mcp.corredorconect.com to the `envoy-gateway`
# Gateway in the `default` namespace; all paths go to one backend.
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: llm-route
  namespace: nim-service
spec:
  parentRefs:
    - name: envoy-gateway
      namespace: default
  hostnames:
    - "mcp.corredorconect.com"
  rules:
    - matches:
        - path:
            type: PathPrefix
            value: /
      backendRefs:
        # NOTE(review): this file defines only a NIMCache named
        # `qwen3-32b-instruct`; the only NIMService defined here is
        # `qwen36-27b-fp8`. Confirm that a Service with this name exists
        # in the namespace, or point the route at the intended backend.
        - name: qwen3-32b-instruct
          port: 8000
      timeouts:
        request: 600s
        backendRequest: 600s
---
# SecurityPolicy applying API-key auth to the `llm-route` HTTPRoute.
# Credentials come from the `mcp-bearer-token` Secret and are read from
# the `authorization` request header.
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: SecurityPolicy
metadata:
  name: llm-bearer-auth
  namespace: nim-service
spec:
  targetRefs:
    - group: gateway.networking.k8s.io
      kind: HTTPRoute
      name: llm-route
  apiKeyAuth:
    credentialRefs:
      # Empty group selects the core API group (Secret).
      - group: ""
        kind: Secret
        name: mcp-bearer-token
    extractFrom:
      - headers:
          - authorization