From 778083b9f647a1b63692192f2531ed5da88f477b Mon Sep 17 00:00:00 2001
From: HaimKortovich <haimkortovich88@gmail.com>
Date: Tue, 5 May 2026 15:13:59 -0500
Subject: [PATCH] use correctly

---
 .../apps/huihui-granite-inference.yaml        | 51 ++++++++++++-------
 1 file changed, 34 insertions(+), 17 deletions(-)

diff --git a/clusters/k3s-dgx/apps/huihui-granite-inference.yaml b/clusters/k3s-dgx/apps/huihui-granite-inference.yaml
index d661648..114be8e 100644
--- a/clusters/k3s-dgx/apps/huihui-granite-inference.yaml
+++ b/clusters/k3s-dgx/apps/huihui-granite-inference.yaml
@@ -2,21 +2,38 @@ apiVersion: serving.kserve.io/v1alpha1
 kind: LLMInferenceService
 metadata:
   name: huihui-granite
-  namespace: kserve
 spec:
-  predictor:
-    model:
-      modelFormat:
-        name: huggingface
-      args:
-        - --model_name=huihui-granite
-      storageUri: "hf://huihui-ai/Huihui-granite-4.1-30b-abliterated"
-      resources:
-        limits:
-          cpu: "4"
-          memory: 16Gi
-          nvidia.com/gpu: "1"
-        requests:
-          cpu: "2"
-          memory: 8Gi
-          nvidia.com/gpu: "1"
+  model:
+    modelFormat:
+      name: huggingface
+    storageUri: "hf://huihui-ai/Huihui-granite-4.1-30b-abliterated"
+  replicas: 1
+  template:
+    containers:
+      - name: main
+        image: quay.io/pierdipi/vllm-cpu:latest
+        args:
+          - --model_name=huihui-granite
+        securityContext:
+          runAsNonRoot: false
+        env:
+          - name: VLLM_LOGGING_LEVEL
+            value: DEBUG
+        resources:
+          limits:
+            cpu: "4"
+            memory: 16Gi
+            nvidia.com/gpu: "1"
+          requests:
+            cpu: "2"
+            memory: 8Gi
+            nvidia.com/gpu: "1"
+        livenessProbe:
+          initialDelaySeconds: 30
+          periodSeconds: 30
+          timeoutSeconds: 30
+          failureThreshold: 5
+  router:
+    gateway: {}
+    route: {}
+    scheduler: {}