init commit

This commit is contained in:
2026-05-05 11:15:49 -05:00
commit 06f52750ac
24 changed files with 1158 additions and 0 deletions

View File

@@ -0,0 +1,13 @@
# ConfigMap holding the label set for GPU nodes. The labels are stored as an
# embedded YAML list under the `labels.yaml` data key.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: gpu-operator
  name: gpu-node-labels
data:
  # Each list entry is a `key`/`value` pair.
  # NOTE(review): nothing visible here shows which component reads this
  # document or applies the labels to nodes; confirm the consumer and the
  # schema it expects.
  labels.yaml: |
    - key: accelerator
      value: nvidia-tesla
    - key: nvidia.com/gpu.present
      value: "true"
    - key: topology.kubernetes.io/zone
      value: "dgx-spark"

View File

@@ -0,0 +1,70 @@
# Flux HelmRepository that the gpu-operator HelmRelease resolves its chart
# from. The index is re-fetched every 10 minutes.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: nvidia
  namespace: gpu-operator
spec:
  interval: 10m
  # NVIDIA publishes the gpu-operator chart through the NGC Helm registry;
  # this is the repository URL given in the GPU Operator install guide.
  url: https://helm.ngc.nvidia.com/nvidia
---
# Flux HelmRelease installing the NVIDIA GPU Operator chart (pinned to
# v23.9.1) from the `nvidia` HelmRepository in the same namespace. The release
# is reconciled every 10 minutes.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: gpu-operator
  namespace: gpu-operator
spec:
  interval: 10m
  chart:
    spec:
      chart: gpu-operator
      version: "v23.9.1"
      sourceRef:
        kind: HelmRepository
        name: nvidia
        namespace: gpu-operator
  values:
    # The chart builds every operand image as
    # `<repository>/<image>:<version>`, so `image` must be the bare image
    # name and the registry path belongs in `repository`.
    driver:
      enabled: true
      repository: nvcr.io/nvidia
      image: driver
      version: "535.129.03"
    operator:
      # Accepted values are docker, crio and containerd. The toolkit env
      # below targets the k3s-managed containerd.
      defaultRuntime: containerd
    toolkit:
      enabled: true
      # No image/version override: the chart's default container-toolkit
      # image for this chart version is used. (The device-plugin image must
      # not be used for this operand.)
      env:
        # Point the toolkit at the k3s containerd config and socket rather
        # than the stock /etc/containerd and /run/containerd locations.
        - name: CONTAINERD_CONFIG
          value: /var/lib/rancher/k3s/agent/etc/containerd/config.toml
        - name: CONTAINERD_SOCKET
          value: /run/k3s/containerd/containerd.sock
        # Name of the containerd runtime class the toolkit registers.
        - name: CONTAINERD_RUNTIME_CLASS
          value: nvidia
    devicePlugin:
      enabled: true
      repository: nvcr.io/nvidia
      image: k8s-device-plugin
      version: "v0.14.0"
      resources:
        requests:
          cpu: 100m
          memory: 100Mi
        limits:
          cpu: 500m
          memory: 500Mi
    # `dcgmExporter` configures the metrics exporter; the separate `dcgm` key
    # controls the standalone DCGM host engine, which uses a different image.
    dcgmExporter:
      enabled: true
      repository: nvcr.io/nvidia/k8s
      image: dcgm-exporter
      # NOTE(review): tag carried over unchanged; confirm it exists in the
      # registry.
      version: "3.3.3-3.1.0-ubuntu22.04"
      resources:
        requests:
          cpu: 100m
          memory: 100Mi
        limits:
          cpu: 500m
          memory: 500Mi
    migManager:
      enabled: true
    gfd:
      enabled: true
    # The Node Feature Discovery sub-chart is toggled through `nfd.enabled`;
    # the `node-feature-discovery` key only passes values into the sub-chart.
    nfd:
      enabled: true

View File

@@ -0,0 +1,6 @@
# Namespace that hosts the GPU Operator resources.
kind: Namespace
apiVersion: v1
metadata:
  name: gpu-operator
  labels:
    # Label key from the OpenShift monitoring convention.
    # NOTE(review): confirm it is needed on the target cluster.
    openshift.io/cluster-monitoring: "true"

View File

@@ -0,0 +1,7 @@
# Kustomize entry point bundling the GPU Operator manifests.
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1

# Manifests included in the build, listed relative to this file.
resources:
  - gpu-operator-namespace.yaml
  - gpu-operator-helmrelease.yaml
  - gpu-node-labels.yaml

# Namespace applied to the resources above.
namespace: gpu-operator

View File

@@ -0,0 +1,39 @@
# Namespace for the standalone device-plugin DaemonSet declared in the next
# document of this file.
# NOTE(review): a Namespace with the same name, carrying an extra label, is
# declared in another manifest of this commit; confirm which one should own it.
kind: Namespace
apiVersion: v1
metadata:
  name: gpu-operator
---
# Standalone DaemonSet that runs the NVIDIA k8s-device-plugin container and
# gives it access to the kubelet device-plugin directory on the host.
# NOTE(review): the gpu-operator HelmRelease in this commit also enables its
# own device plugin; confirm both are not meant to run on the same nodes.
kind: DaemonSet
apiVersion: apps/v1
metadata:
  namespace: gpu-operator
  name: nvidia-device-plugin-daemonset
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      volumes:
        # Host directory holding the kubelet device-plugin sockets.
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
      containers:
        - name: nvidia-device-plugin-ctr
          image: nvcr.io/nvidia/k8s-device-plugin:v0.14.0
          args:
            - "--fail-on-init-error=false"
          env:
            - name: NVIDIA_VISIBLE_DEVICES
              value: "all"
            - name: MIG_STRATEGY
              value: "single"
          volumeMounts:
            - mountPath: /var/lib/kubelet/device-plugins
              name: device-plugin
      # Allow scheduling onto nodes tainted with nvidia.com/gpu:NoSchedule.
      tolerations:
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists

View File

@@ -0,0 +1,16 @@
# RuntimeClass `nvidia`: pods that set `runtimeClassName: nvidia` run under
# the container runtime handler named `nvidia`.
kind: RuntimeClass
apiVersion: node.k8s.io/v1
metadata:
  name: nvidia
handler: nvidia
scheduling:
  # Pods using this class are restricted to nodes labelled as having a GPU
  # and tolerate the nvidia.com/gpu:NoSchedule taint.
  nodeSelector:
    nvidia.com/gpu.present: "true"
  tolerations:
    - effect: NoSchedule
      key: nvidia.com/gpu
      operator: Exists
overhead:
  # Fixed per-pod overhead accounted for by the scheduler on top of the
  # containers' own requests.
  # NOTE(review): confirm these figures are intended for every pod of this
  # class.
  podFixed:
    cpu: "500m"
    memory: "1Gi"