apiVersion: "serving.kserve.io/v1beta1"
kind: "InferenceService"
metadata:
  name: "qwen-llm"
  namespace: kserve
spec:
  predictor:
    model:
      modelFormat:
        name: huggingface
      args:
        - --model_name=qwen
      storageUri: "hf://Qwen/Qwen2.5-0.5B-Instruct"
      resources:
        limits:
          cpu: "2"
          memory: 6Gi
          nvidia.com/gpu: "1"
        requests:
          cpu: "1"
          memory: 4Gi
          nvidia.com/gpu: "1"
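
Once the manifest is applied (for example with kubectl apply -f), the KServe Hugging Face runtime typically exposes an OpenAI-compatible endpoint for generative models, and the model is addressed by the name passed via --model_name (here "qwen"). The sketch below is one way to query it; the ingress address and service hostname are placeholders that depend on your cluster's gateway setup, not values taken from the manifest.

# A minimal sketch of calling the deployed InferenceService, assuming the
# runtime serves the OpenAI-compatible chat completions route. INGRESS_HOST
# and SERVICE_HOSTNAME are placeholders you must resolve for your cluster
# (e.g. from `kubectl get inferenceservice qwen-llm -n kserve`).
import requests

INGRESS_HOST = "http://<ingress-ip>:<ingress-port>"   # placeholder: cluster ingress address
SERVICE_HOSTNAME = "qwen-llm.kserve.example.com"       # placeholder: InferenceService URL host

payload = {
    "model": "qwen",  # matches --model_name in the manifest above
    "messages": [{"role": "user", "content": "Write a haiku about Kubernetes."}],
    "max_tokens": 64,
}

resp = requests.post(
    f"{INGRESS_HOST}/openai/v1/chat/completions",
    json=payload,
    headers={"Host": SERVICE_HOSTNAME, "Content-Type": "application/json"},
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])

If the predictor is reachable directly inside the cluster, the Host header trick can be dropped and the service's cluster-local URL used instead; the request body stays the same.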