---
# KServe LLMInferenceService serving the Huihui Granite model from Hugging Face
# via a vLLM container. Reconstructed into valid block-style YAML (the original
# file had been flattened onto a single line, which is not parseable YAML).
apiVersion: serving.kserve.io/v1alpha1
kind: LLMInferenceService
metadata:
  name: huihui-granite
spec:
  model:
    modelFormat:
      name: huggingface
    # hf:// URI — model weights are pulled from the Hugging Face Hub at startup.
    storageUri: "hf://huihui-ai/Huihui-granite-4.1-30b-abliterated"
  replicas: 1
  template:
    containers:
      - name: main
        # NOTE(review): this is a CPU build of vLLM, yet the resources below
        # request an NVIDIA GPU — confirm which is intended and drop the other.
        image: quay.io/pierdipi/vllm-cpu:latest
        args:
          # NOTE(review): vLLM's standard flag is --served-model-name (and
          # --model for the weights path); confirm --model_name is accepted
          # by this image before relying on it.
          - --model_name=huihui-granite
        securityContext:
          # NOTE(review): running as root is a security risk; prefer
          # runAsNonRoot: true unless the image genuinely requires root.
          runAsNonRoot: false
        env:
          - name: VLLM_LOGGING_LEVEL
            value: DEBUG
        resources:
          limits:
            cpu: "4"
            memory: 16Gi
            nvidia.com/gpu: "1"
          requests:
            cpu: "2"
            memory: 8Gi
            nvidia.com/gpu: "1"
        # Generous probe timings: large-model servers can take minutes to load
        # weights before answering health checks.
        livenessProbe:
          initialDelaySeconds: 30
          periodSeconds: 30
          timeoutSeconds: 30
          failureThreshold: 5
  # Empty mappings opt in to the default gateway/route/scheduler managed by
  # the KServe controller ({} is deliberate — a bare key would parse as null).
  router:
    gateway: {}
    route: {}
    scheduler: {}