use correctly

2026-05-05 15:13:59 -05:00
parent 87f98fbca6
commit 778083b9f6
1 changed files with 34 additions and 17 deletions
--- a/clusters/k3s-dgx/apps/huihui-granite-inference.yaml
+++ b/clusters/k3s-dgx/apps/huihui-granite-inference.yaml
@@ -2,21 +2,38 @@ apiVersion: serving.kserve.io/v1alpha1
 kind: LLMInferenceService
 metadata:
  name: huihui-granite
  namespace: kserve
 spec:
-  predictor:
+  model:
-    model:
+    modelFormat:
-      modelFormat:
+      name: huggingface
-        name: huggingface
+    storageUri: "hf://huihui-ai/Huihui-granite-4.1-30b-abliterated"
-      args:
+  replicas: 1
-        - --model_name=huihui-granite
+  template:
-      storageUri: "hf://huihui-ai/Huihui-granite-4.1-30b-abliterated"
+    containers:
-      resources:
+      - name: main
-        limits:
+        image: quay.io/pierdipi/vllm-cpu:latest
-          cpu: "4"
+        args:
-          memory: 16Gi
+          - --model_name=huihui-granite
-          nvidia.com/gpu: "1"
+        securityContext:
-        requests:
+          runAsNonRoot: false
-          cpu: "2"
+        env:
-          memory: 8Gi
+          - name: VLLM_LOGGING_LEVEL
-          nvidia.com/gpu: "1"
+            value: DEBUG
+        resources:
+          limits:
+            cpu: "4"
+            memory: 16Gi
+            nvidia.com/gpu: "1"
+          requests:
+            cpu: "2"
+            memory: 8Gi
+            nvidia.com/gpu: "1"
+        livenessProbe:
+          initialDelaySeconds: 30
+          periodSeconds: 30
+          timeoutSeconds: 30
+          failureThreshold: 5
+  router:
+    gateway: {}
+    route: {}
+    scheduler: {}