---
# NIMCache: pre-pulls Qwen3.6-27B-FP8 from Hugging Face into a PVC-backed model store.
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMCache
metadata:
  name: qwen36-27b-fp8
  namespace: nim-service
spec:
  source:
    hf:
      endpoint: "https://huggingface.co"
      namespace: "Qwen"
      authSecret: hf-api-secret
      modelPuller: nvcr.io/nim/nvidia/llm-nim:1.12
      pullSecret: ngc-secret
      modelName: "Qwen3.6-27B-FP8"
  storage:
    pvc:
      create: true
      size: "50Gi"
      volumeAccessMode: ReadWriteOnce
---
# NIMService: serves the cached model via vLLM's OpenAI-compatible API server.
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMService
metadata:
  name: qwen36-27b-fp8
  namespace: nim-service
spec:
  # NOTE(review): runs as root (uid/gid 0) — confirm this is required by the image.
  userID: 0
  groupID: 0
  image:
    repository: scitrera/dgx-spark-vllm
    # Quoted: version-like tags must stay strings.
    tag: "0.17.0-t4"
    pullPolicy: IfNotPresent
  env:
    - name: VLLM_CACHE_ROOT
      value: /model-store/vllm-cache
    # Remove logging overhead.
    # NOTE(review): VLLM_DISABLE_LOGGING is not a standard vLLM env var
    # (upstream uses VLLM_CONFIGURE_LOGGING / VLLM_LOGGING_LEVEL) — confirm
    # the custom image honors it.
    - name: VLLM_DISABLE_LOGGING
      value: "1"
  command:
    - python3
  args:
    - -m
    - vllm.entrypoints.openai.api_server
    - --model
    - /model-store
    - --host
    - "0.0.0.0"
    - --served-model-name
    - Qwen/Qwen3.6-27B-FP8
    # ↑ More memory for KV cache / bigger batches.
    - --gpu-memory-utilization
    - "0.80"
    - --max-model-len
    - "262144"
    - --max-num-batched-tokens
    - "16384"
    - --max-num-seqs
    - "4"
    - --enable-prefix-caching
    - --enable-chunked-prefill
    - --load-format
    # NOTE(review): "instanttensor" is not a standard vLLM load format
    # (auto/pt/safetensors/tensorizer/runai_streamer/...) — confirm the
    # custom image adds it.
    - instanttensor
    # Deduplicated: this flag appeared twice (flashinfer, then FLASHINFER).
    # argparse last-wins made FLASHINFER the effective value; kept it.
    - --attention-backend
    - FLASHINFER
    - --dtype
    - auto
    - --kv-cache-dtype
    - fp8
    - --trust-remote-code
    - --enable-auto-tool-choice
    - --tool-call-parser
    - qwen3_coder
    - --reasoning-parser
    - qwen3
    - --default-chat-template-kwargs
    - '{"preserve_thinking": true}'
    - --override-generation-config
    - '{"temperature": 0.6, "top_p": 0.95, "top_k": 20, "min_p": 0.0, "presence_penalty": 0.0, "repetition_penalty": 1.0}'
  authSecret: hf-api-secret
  storage:
    sharedMemorySizeLimit: 64Gi
    nimCache:
      name: qwen36-27b-fp8
  replicas: 1
  resources:
    limits:
      nvidia.com/gpu: 1
  livenessProbe:
    enabled: true
    probe:
      httpGet:
        path: /health
        port: 8000
      initialDelaySeconds: 120
      periodSeconds: 30
      timeoutSeconds: 10
      failureThreshold: 10
  readinessProbe:
    enabled: true
    probe:
      httpGet:
        path: /health
        port: 8000
      initialDelaySeconds: 30
      periodSeconds: 15
      timeoutSeconds: 10
      failureThreshold: 20
  startupProbe:
    enabled: true
    probe:
      httpGet:
        path: /health
        port: 8000
      initialDelaySeconds: 10
      periodSeconds: 20
      timeoutSeconds: 10
      # Allows up to 60 * 20s = 20 min for model load before restart.
      failureThreshold: 60
  expose:
    service:
      type: ClusterIP
      port: 8000
---
# HTTPRoute: routes all traffic on the hostname to the NIMService backend.
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: llm-route
  namespace: nim-service
spec:
  parentRefs:
    - name: envoy-gateway
      namespace: default
  hostnames:
    - "mcp.corredorconect.com"
  rules:
    - matches:
        - path:
            type: PathPrefix
            value: /
      backendRefs:
        - name: qwen36-27b-fp8
          port: 8000
      # Long timeouts to accommodate slow LLM generations.
      timeouts:
        request: "600s"
        backendRequest: "600s"
---
# SecurityPolicy: requires a bearer token (read from the Authorization
# header, validated against the referenced Secret) on the route above.
apiVersion: gateway.envoyproxy.io/v1alpha1
kind: SecurityPolicy
metadata:
  name: llm-bearer-auth
  namespace: nim-service
spec:
  targetRefs:
    - group: gateway.networking.k8s.io
      kind: HTTPRoute
      name: llm-route
  apiKeyAuth:
    credentialRefs:
      - group: ""
        kind: Secret
        name: mcp-bearer-token
    extractFrom:
      - headers:
          - authorization