cleanup

2026-05-05 11:31:59 -05:00
parent a1b1d936bd
commit f132fc5f0c
9 changed files with 41 additions and 136 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,40 @@
 # FluxCD secrets
 *.yaml.enc
 *.yaml.dec
 flux-gitea-key
 flux-gitea-key.pub
 # Kubernetes secrets
 *-secret.yaml
 secrets/
 # Temporary files
 *.tmp
 *.bak
 *.swp
 *~
 # IDE files
 .vscode/
 .idea/
 *.iml
 # OS files
 .DS_Store
 Thumbs.db
 # Logs
 *.log
 logs/
 # Build artifacts
 dist/
 build/
 *.tar.gz
 # Environment files
 .env
 .env.local
 # Helm
 charts/*.tgz
--- a/clusters/k3s-dgx/gpu-support/nvidia-device-plugin.yaml
+++ b/clusters/k3s-dgx/gpu-support/nvidia-device-plugin.yaml
@@ -1,39 +0,0 @@
 apiVersion: v1
 kind: Namespace
 metadata:
  name: gpu-operator
 ---
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
  name: nvidia-device-plugin-daemonset
  namespace: gpu-operator
 spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.0
          name: nvidia-device-plugin-ctr
          args: ["--fail-on-init-error=false"]
          env:
            - name: NVIDIA_VISIBLE_DEVICES
              value: "all"
            - name: MIG_STRATEGY
              value: "single"
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
--- a/clusters/k3s-dgx/gpu-support/runtime-class.yaml
+++ b/clusters/k3s-dgx/gpu-support/runtime-class.yaml
@@ -1,16 +0,0 @@
 apiVersion: node.k8s.io/v1
 kind: RuntimeClass
 metadata:
  name: nvidia
 handler: nvidia
 overhead:
  podFixed:
    memory: "1Gi"
    cpu: "500m"
 scheduling:
  nodeSelector:
    nvidia.com/gpu.present: "true"
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
--- a/clusters/k3s-dgx/kserve/gpu-serving-runtime.yaml
+++ b/clusters/k3s-dgx/kserve/gpu-serving-runtime.yaml
@@ -1,42 +0,0 @@
 apiVersion: serving.kserve.io/v1beta1
 kind: ServingRuntime
 metadata:
  name: kserve-gpu-runtime
  namespace: kserve
 spec:
  supportedModelFormats:
    - name: tensorflow
      version: "2"
      autoSelect: true
    - name: pytorch
      version: "2"
      autoSelect: true
    - name: sklearn
      version: "1"
      autoSelect: true
    - name: xgboost
      version: "1"
      autoSelect: true
  protocol: v1
  protocolVersions:
    - v1
    - v2
  containers:
    - name: kserve-container
      image: kserve/sklearnserver-gpu:latest
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "4"
          memory: "8Gi"
          nvidia.com/gpu: "1"
      env:
        - name: NVIDIA_VISIBLE_DEVICES
          value: "all"
        - name: NVIDIA_DRIVER_CAPABILITIES
          value: "compute,utility"
  multiModel: false
  disabled: false
--- a/clusters/k3s-dgx/kserve/kserve-controller.yaml
+++ b/clusters/k3s-dgx/kserve/kserve-controller.yaml
@@ -34,9 +34,6 @@ spec:
    config:
      ingress:
        className: istio
      storage:
        initialCapacity: 10Gi
        storageClassName: local-path
    knative:
      enabled: true
    istio:
--- a/clusters/k3s-dgx/kserve/kustomization.yaml
+++ b/clusters/k3s-dgx/kserve/kustomization.yaml
@@ -5,6 +5,3 @@ resources:
  - kserve-namespace.yaml
  - kserve-controller.yaml
  - istio-gateway.yaml
  # - gpu-serving-runtime.yaml
  # - model-storage-pvc.yaml
  # - storage-config.yaml
--- a/clusters/k3s-dgx/kserve/model-storage-pvc.yaml
+++ b/clusters/k3s-dgx/kserve/model-storage-pvc.yaml
@@ -1,12 +0,0 @@
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
  name: model-storage-pvc
  namespace: kserve
 spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 50Gi
  storageClassName: local-path
--- a/clusters/k3s-dgx/kserve/storage-config.yaml
+++ b/clusters/k3s-dgx/kserve/storage-config.yaml
@@ -1,20 +0,0 @@
 apiVersion: v1
 kind: ConfigMap
 metadata:
  name: storage-config
  namespace: kserve
 data:
  storageConfig.yaml: |
    defaultStorageUri: "pvc://model-storage"
    storageSpec:
      - name: model-storage
        type: pvc
        pvcSpec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 50Gi
          storageClassName: local-path
    modelCacheSize: 10Gi
    modelCacheMemory: 2Gi