36 lines
887 B
YAML
36 lines
887 B
YAML
# KServe LLMInferenceService that serves the
# huihui-ai/Huihui-granite-4.1-30b-abliterated model from a single replica.
apiVersion: serving.kserve.io/v1alpha1
kind: LLMInferenceService
metadata:
  name: huihui-granite
spec:
  model:
    # Where the model weights are fetched from (hf:// URI scheme).
    uri: hf://huihui-ai/Huihui-granite-4.1-30b-abliterated
    # Model name; kept identical to the repository path in the URI above.
    name: huihui-ai/Huihui-granite-4.1-30b-abliterated
  # Single replica. Each replica requests one GPU (see resources below), so
  # raising this value multiplies the GPU demand accordingly.
  replicas: 1
  router:
    scheduler: {}  # Default scheduler with default load balancing
    # Empty mappings: no explicit route/gateway settings are provided here.
    route: {}
    gateway: {}
  template:
    containers:
      - name: main
        resources:
          # NOTE(review): 16Gi of host memory for a 30B-parameter model may be
          # tight depending on how weights are loaded -- confirm sizing.
          limits:
            cpu: "4"
            memory: 16Gi
            nvidia.com/gpu: "1"
          requests:
            cpu: "2"
            memory: 8Gi
            nvidia.com/gpu: "1"
        livenessProbe:
          # NOTE(review): the probe uses HTTPS, so the container is expected
          # to serve TLS on port 8000 -- confirm against the runtime config.
          httpGet:
            path: /health
            port: 8000
            scheme: HTTPS
          # First probe after 120s, then every 30s with a 30s timeout; five
          # consecutive failures mark the container as unhealthy.
          # NOTE(review): verify this window is long enough for the model to
          # finish loading; a startupProbe may be a better fit if it is not.
          initialDelaySeconds: 120
          periodSeconds: 30
          timeoutSeconds: 30
          failureThreshold: 5