use correctly
This commit is contained in:
@@ -2,21 +2,38 @@ apiVersion: serving.kserve.io/v1alpha1
|
|||||||
kind: LLMInferenceService
|
kind: LLMInferenceService
|
||||||
metadata:
|
metadata:
|
||||||
name: huihui-granite
|
name: huihui-granite
|
||||||
namespace: kserve
|
|
||||||
spec:
|
spec:
|
||||||
predictor:
|
model:
|
||||||
model:
|
modelFormat:
|
||||||
modelFormat:
|
name: huggingface
|
||||||
name: huggingface
|
storageUri: "hf://huihui-ai/Huihui-granite-4.1-30b-abliterated"
|
||||||
args:
|
replicas: 1
|
||||||
- --model_name=huihui-granite
|
template:
|
||||||
storageUri: "hf://huihui-ai/Huihui-granite-4.1-30b-abliterated"
|
containers:
|
||||||
resources:
|
- name: main
|
||||||
limits:
|
image: quay.io/pierdipi/vllm-cpu:latest
|
||||||
cpu: "4"
|
args:
|
||||||
memory: 16Gi
|
- --model_name=huihui-granite
|
||||||
nvidia.com/gpu: "1"
|
securityContext:
|
||||||
requests:
|
runAsNonRoot: false
|
||||||
cpu: "2"
|
env:
|
||||||
memory: 8Gi
|
- name: VLLM_LOGGING_LEVEL
|
||||||
nvidia.com/gpu: "1"
|
value: DEBUG
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: "4"
|
||||||
|
memory: 16Gi
|
||||||
|
nvidia.com/gpu: "1"
|
||||||
|
requests:
|
||||||
|
cpu: "2"
|
||||||
|
memory: 8Gi
|
||||||
|
nvidia.com/gpu: "1"
|
||||||
|
livenessProbe:
|
||||||
|
initialDelaySeconds: 30
|
||||||
|
periodSeconds: 30
|
||||||
|
timeoutSeconds: 30
|
||||||
|
failureThreshold: 5
|
||||||
|
router:
|
||||||
|
gateway: {}
|
||||||
|
route: {}
|
||||||
|
scheduler: {}
|
||||||
|
|||||||
Reference in New Issue
Block a user