---
# NIMCache: pre-downloads the Qwen3-32B NIM model from NGC into a PVC so
# NIMService pods start without re-pulling weights.
# NOTE(review): original file was flattened to one line; nesting below follows
# the NVIDIA NIM Operator CRD schema (spec.source.ngc.model, spec.storage.pvc)
# — confirm against the operator version in use.
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMCache
metadata:
  name: qwen3-32b-instruct
  namespace: nim-service
spec:
  source:
    ngc:
      modelPuller: nvcr.io/nim/qwen/qwen3-32b-dgx-spark:1.1.0-variant
      pullSecret: ngc-secret        # docker-registry secret for nvcr.io
      authSecret: ngc-api-secret    # NGC API key secret
      model:
        engine: "vllm"
        # Quoted: the CRD field takes a string, and an unquoted 1 would
        # parse as an int.
        tensorParallelism: "1"
        profiles:
          # Quoted defensively: an all-hex profile ID is an opaque string,
          # not a number.
          - "c4f105d92c72ab56200884dfacde9d2128b139755c06b9c883eeb3e287b7408a"
  storage:
    pvc:
      create: true
      size: "100Gi"
      # ReadWriteOnce: cache PVC is mounted by pods on a single node.
      volumeAccessMode: ReadWriteOnce
---
# NIMService: serves the cached Qwen3-32B model on port 8000 (OpenAI-compatible
# API), consuming the NIMCache above. env values tune the embedded vLLM engine.
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMService
metadata:
  name: qwen3-32b-instruct
  namespace: nim-service
spec:
  env:
    # All env values are quoted: Kubernetes env vars must be strings, and
    # unquoted 0.90 / 32768 / true would be retyped by the YAML parser.
    - name: GPU_MEMORY_UTILIZATION
      value: "0.90"
    - name: NIM_MAX_NUM_SEQS
      value: "32"
    - name: NIM_MAX_NUM_BATCHED_TOKENS
      value: "16384"
    - name: ENABLE_AUTO_TOOL_CHOICE
      value: "true"
    - name: ENABLE_PREFIX_CACHING
      value: "true"
    - name: TRUNCATION_SIDE
      value: "left"
    - name: VLLM_LOGGING_LEVEL
      value: "INFO"
    - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
      value: "true"
    - name: NIM_MAX_MODEL_LEN
      value: "32768"
    - name: NIM_ENABLE_KV_CACHE_REUSE
      value: "1"
  image:
    repository: nvcr.io/nim/qwen/qwen3-32b-dgx-spark
    tag: "1.1.0-variant"
    pullPolicy: IfNotPresent
    pullSecrets:
      - ngc-secret
  authSecret: ngc-api-secret
  storage:
    # Reuse the pre-populated cache instead of pulling weights at pod start.
    nimCache:
      name: qwen3-32b-instruct
  replicas: 1
  resources:
    limits:
      nvidia.com/gpu: 1
  expose:
    service:
      type: ClusterIP
      port: 8000
---
# HTTPRoute: exposes the NIMService through an Envoy Gateway listener under
# the given hostname, routing all paths (PathPrefix /) to port 8000.
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: llm-route
  namespace: nim-service
spec:
  parentRefs:
    # NOTE(review): assumes a Gateway named envoy-gateway exists in the
    # default namespace — verify.
    - name: envoy-gateway
      namespace: default
  hostnames:
    # NOTE(review): "corredorconect" may be a typo for "corredorconnect" —
    # confirm the intended domain before changing.
    - "mcp.corredorconect.com"
  rules:
    - matches:
        - path:
            type: PathPrefix
            value: /
      backendRefs:
        - name: qwen3-32b-instruct
          port: 8000