---
# KServe LLMInferenceService that serves the Hugging Face model
# huihui-ai/Huihui-granite-4.1-30b-abliterated under the resource name
# "huihui-granite".
apiVersion: serving.kserve.io/v1alpha1
kind: LLMInferenceService
metadata:
  name: huihui-granite
spec:
  model:
    # Model source (hf:// URI) and the model name declared for this service.
    uri: hf://huihui-ai/Huihui-granite-4.1-30b-abliterated
    name: huihui-ai/Huihui-granite-4.1-30b-abliterated
  # Single replica.
  # NOTE(review): the previous comment here read "Three replicas for load
  # balancing" while the value was 1 - raise this to 3 if that was the intent.
  replicas: 1
  router:
    # Default scheduler with default load balancing.
    scheduler: {}
    # Empty mappings: no route or gateway settings are overridden here.
    route: {}
    gateway: {}
  template:
    containers:
      - name: main
        resources:
          # CPU and GPU quantities are quoted so they stay strings.
          limits:
            cpu: "4"
            memory: 16Gi
            nvidia.com/gpu: "1"
          requests:
            cpu: "2"
            memory: 8Gi
            nvidia.com/gpu: "1"
        # Liveness check: HTTPS GET /health on port 8000. The first probe
        # waits 120 s, then runs every 30 s with a 30 s timeout; 5 failures
        # in a row mark the container as failed.
        livenessProbe:
          httpGet:
            path: /health
            port: 8000
            scheme: HTTPS
          initialDelaySeconds: 120
          periodSeconds: 30
          timeoutSeconds: 30
          failureThreshold: 5