Files
edge-gitops/clusters/k3s-dgx/apps/huihui-granite-inference.yaml
2026-05-05 15:21:10 -05:00

36 lines
887 B
YAML

---
# LLMInferenceService that serves the Huihui granite 4.1 30B (abliterated)
# model from a single GPU-backed container.
apiVersion: serving.kserve.io/v1alpha1
kind: LLMInferenceService
metadata:
  name: huihui-granite
spec:
  model:
    # NOTE(review): the hf:// scheme presumably points at the Hugging Face Hub
    # repository of the same name; confirm against the KServe storage docs.
    uri: hf://huihui-ai/Huihui-granite-4.1-30b-abliterated
    name: huihui-ai/Huihui-granite-4.1-30b-abliterated
  # Single replica. Every replica requests one GPU (see resources below), so
  # check GPU capacity before raising this for load balancing.
  replicas: 1
  router:
    scheduler: {}  # Default scheduler with default load balancing
    # Empty mappings: no route/gateway overrides are specified here.
    route: {}
    gateway: {}
  template:
    containers:
      - name: main
        resources:
          limits:
            cpu: "4"
            memory: 16Gi
            nvidia.com/gpu: "1"
          requests:
            cpu: "2"
            memory: 8Gi
            nvidia.com/gpu: "1"
        # Liveness check: HTTPS GET /health on port 8000. The first probe is
        # delayed by 120 s; after that it runs every 30 s with a 30 s timeout
        # and tolerates 5 consecutive failures.
        livenessProbe:
          httpGet:
            path: /health
            port: 8000
            scheme: HTTPS
          initialDelaySeconds: 120
          periodSeconds: 30
          timeoutSeconds: 30
          failureThreshold: 5