init commit
This commit is contained in:
13
clusters/k3s-dgx/gpu-support/gpu-node-labels.yaml
Normal file
13
clusters/k3s-dgx/gpu-support/gpu-node-labels.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
---
# ConfigMap "gpu-node-labels" in the gpu-operator namespace.
# It stores a single key, labels.yaml, whose value is an embedded YAML list of
# key/value label pairs.
# NOTE(review): none of the manifests visible alongside this one reference the
# ConfigMap; confirm which component is expected to read it.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: gpu-operator
  name: gpu-node-labels
data:
  # Literal block scalar: everything below is kept verbatim as one string.
  labels.yaml: |
    - key: accelerator
      value: nvidia-tesla
    - key: nvidia.com/gpu.present
      value: "true"
    - key: topology.kubernetes.io/zone
      value: "dgx-spark"
|
||||
70
clusters/k3s-dgx/gpu-support/gpu-operator-helmrelease.yaml
Normal file
70
clusters/k3s-dgx/gpu-support/gpu-operator-helmrelease.yaml
Normal file
@@ -0,0 +1,70 @@
|
||||
---
# Flux HelmRepository "nvidia": the chart source referenced by the
# gpu-operator HelmRelease via sourceRef.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  namespace: gpu-operator
  name: nvidia
spec:
  # Chart index location.
  url: https://nvidia.github.io/gpu-operator
  # How often Flux re-fetches the repository index.
  interval: 10m
|
||||
---
|
||||
# Flux HelmRelease "gpu-operator": installs the NVIDIA GPU Operator chart
# (pinned to v23.9.1) from the "nvidia" HelmRepository into the gpu-operator
# namespace, with the container toolkit pointed at k3s' embedded containerd.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: gpu-operator
  namespace: gpu-operator
spec:
  interval: 10m
  chart:
    spec:
      chart: gpu-operator
      version: "v23.9.1"
      sourceRef:
        kind: HelmRepository
        name: nvidia
        namespace: gpu-operator
  values:
    operator:
      # The chart expects the name of the node's container runtime here
      # (docker, crio or containerd). k3s ships containerd.
      defaultRuntime: containerd

    driver:
      enabled: true
      # Operand images are assembled as <repository>/<image>:<version>, so the
      # registry path goes in `repository` and only the image name in `image`.
      repository: nvcr.io/nvidia
      image: driver
      version: "535.129.03"
      # NOTE(review): if the node OS already ships an NVIDIA driver (DGX OS
      # images typically do), this should probably be `enabled: false` —
      # confirm against the target nodes.

    toolkit:
      enabled: true
      # No image/version override: the chart's default container-toolkit image
      # is used. The device-plugin image must not be configured here.
      env:
        # Point the toolkit at k3s' containerd config and socket rather than
        # the stock /etc/containerd and /run/containerd locations.
        - name: CONTAINERD_CONFIG
          value: /var/lib/rancher/k3s/agent/etc/containerd/config.toml
        - name: CONTAINERD_SOCKET
          value: /run/k3s/containerd/containerd.sock
        - name: CONTAINERD_RUNTIME_CLASS
          value: nvidia

    devicePlugin:
      enabled: true
      repository: nvcr.io/nvidia
      image: k8s-device-plugin
      version: "v0.14.0"
      resources:
        requests:
          cpu: 100m
          memory: 100Mi
        limits:
          cpu: 500m
          memory: 500Mi

    # The metrics exporter is configured under `dcgmExporter`; the `dcgm` key
    # controls the separate standalone DCGM host engine and is left at its
    # chart default.
    dcgmExporter:
      enabled: true
      repository: nvcr.io/nvidia/k8s
      image: dcgm-exporter
      # NOTE(review): confirm this tag exists in the registry before rollout.
      version: "3.3.3-3.1.0-ubuntu22.04"
      resources:
        requests:
          cpu: 100m
          memory: 100Mi
        limits:
          cpu: 500m
          memory: 500Mi

    migManager:
      enabled: true

    gfd:
      enabled: true

    # Node Feature Discovery is toggled through `nfd.enabled`; the
    # `node-feature-discovery` key only carries values for the subchart.
    nfd:
      enabled: true
|
||||
6
clusters/k3s-dgx/gpu-support/gpu-operator-namespace.yaml
Normal file
6
clusters/k3s-dgx/gpu-support/gpu-operator-namespace.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
---
# Namespace that holds every GPU Operator resource in this directory.
kind: Namespace
apiVersion: v1
metadata:
  name: gpu-operator
  labels:
    # NOTE(review): this label key carries the openshift.io prefix; presumably
    # it has no effect on a k3s cluster — confirm whether it is still wanted.
    openshift.io/cluster-monitoring: 'true'
|
||||
7
clusters/k3s-dgx/gpu-support/kustomization.yaml
Normal file
7
clusters/k3s-dgx/gpu-support/kustomization.yaml
Normal file
@@ -0,0 +1,7 @@
|
||||
---
# Kustomize entry point for the gpu-support directory.
# Only the three manifests listed under `resources` are built; other files in
# this directory (runtime-class.yaml, nvidia-device-plugin.yaml) are not
# included by this Kustomization.
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1

resources:
  - gpu-operator-namespace.yaml
  - gpu-operator-helmrelease.yaml
  - gpu-node-labels.yaml

# Namespace applied to the resources above.
namespace: gpu-operator
|
||||
39
clusters/k3s-dgx/gpu-support/nvidia-device-plugin.yaml
Normal file
39
clusters/k3s-dgx/gpu-support/nvidia-device-plugin.yaml
Normal file
@@ -0,0 +1,39 @@
|
||||
---
# Namespace for the standalone device-plugin DaemonSet defined in the next
# document of this file.
kind: Namespace
apiVersion: v1
metadata:
  name: gpu-operator
|
||||
---
|
||||
# DaemonSet running the NVIDIA Kubernetes device plugin (v0.14.0) in the
# gpu-operator namespace. Pods tolerate the nvidia.com/gpu NoSchedule taint and
# mount the kubelet device-plugin directory from the host.
kind: DaemonSet
apiVersion: apps/v1
metadata:
  namespace: gpu-operator
  name: nvidia-device-plugin-daemonset
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  template:
    metadata:
      labels:
        # Must match spec.selector.matchLabels above.
        name: nvidia-device-plugin-ds
    spec:
      containers:
        - name: nvidia-device-plugin-ctr
          image: nvcr.io/nvidia/k8s-device-plugin:v0.14.0
          args:
            - '--fail-on-init-error=false'
          env:
            - name: NVIDIA_VISIBLE_DEVICES
              value: 'all'
            - name: MIG_STRATEGY
              value: 'single'
          volumeMounts:
            # Host directory shared with the kubelet (see `volumes` below).
            - mountPath: /var/lib/kubelet/device-plugins
              name: device-plugin
      tolerations:
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
|
||||
16
clusters/k3s-dgx/gpu-support/runtime-class.yaml
Normal file
16
clusters/k3s-dgx/gpu-support/runtime-class.yaml
Normal file
@@ -0,0 +1,16 @@
|
||||
---
# RuntimeClass "nvidia": maps pods that set runtimeClassName: nvidia onto the
# container runtime handler named "nvidia".
kind: RuntimeClass
apiVersion: node.k8s.io/v1
metadata:
  name: nvidia
handler: nvidia

# Placement constraints merged into pods that use this RuntimeClass.
scheduling:
  nodeSelector:
    nvidia.com/gpu.present: 'true'
  tolerations:
    - effect: NoSchedule
      key: nvidia.com/gpu
      operator: Exists

# Fixed per-pod resource overhead accounted for pods using this RuntimeClass.
overhead:
  podFixed:
    cpu: '500m'
    memory: '1Gi'
|
||||
Reference in New Issue
Block a user