commit 06f52750ac10f2348e0528f785877bdd11e2dd22 Author: HaimKortovich Date: Tue May 5 11:15:49 2026 -0500 init commit diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..460be6c --- /dev/null +++ b/Makefile @@ -0,0 +1,61 @@ +.PHONY: help bootstrap sync status logs clean test + +help: ## Show this help message + @echo 'Usage: make [target]' + @echo '' + @echo 'Available targets:' + @awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf " %-20s %s\n", $$1, $$2}' $(MAKEFILE_LIST) + +bootstrap: ## Bootstrap FluxCD on the cluster + @./bootstrap.sh + +sync: ## Force sync all FluxCD resources + @flux reconcile kustomization flux-system --with-source + @kubectl annotate helmreleases --all --all-namespaces reconcile.fluxcd.io/requestedAt="$$(date +%s)" --overwrite + +status: ## Show status of all FluxCD resources + @flux get all --all-namespaces + +logs: ## Show FluxCD logs + @flux logs --all-namespaces + +clean: ## Remove FluxCD from cluster + @flux uninstall --namespace=flux-system --silent + +test: ## Test the cluster connectivity + @echo "Testing cluster connectivity..." + @kubectl cluster-info + @echo "" + @echo "Testing GPU availability..." + @kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}' + @echo "" + @echo "Testing FluxCD..." 
+ @flux check + +apply: ## Apply all manifests directly (for testing) + @kubectl apply -k clusters/k3s-dgx + +diff: ## Show diff between cluster and git + @flux diff kustomization flux-system --with-source + +restart: ## Restart all FluxCD controllers + @kubectl rollout restart deployment/source-controller -n flux-system + @kubectl rollout restart deployment/kustomize-controller -n flux-system + @kubectl rollout restart deployment/helm-controller -n flux-system + @kubectl rollout restart deployment/notification-controller -n flux-system + +gpu-status: ## Show GPU status + @kubectl describe nodes | grep -A 5 "nvidia.com/gpu" + +kserve-status: ## Show KServe status + @kubectl get inferenceservices -n kserve + @kubectl get pods -n kserve + +model-logs: ## Show model inference logs + @kubectl logs -n kserve -l serving.kserve.io/inferenceservice=huihui-granite --tail=100 -f + +secrets: ## Generate example secrets (DO NOT COMMIT) + @echo "Creating example secrets directory..." + @mkdir -p secrets + @echo "# Add your secrets here" > secrets/README.md + @echo "# DO NOT commit actual secrets to git" >> secrets/README.md \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..bb5df18 --- /dev/null +++ b/README.md @@ -0,0 +1,165 @@ +# Edge GitOps - KServe on k3s with GPU + +GitOps setup for deploying ML models using KServe on a k3s cluster with GPU support (DGX Spark). + +## Prerequisites + +- k3s cluster with GPU support +- kubectl configured to access the cluster +- Gitea instance for GitOps repository +- FluxCD CLI installed + +## Architecture + +``` +edge-gitops/ +├── clusters/ +│ └── k3s-dgx/ +│ ├── flux-system/ # FluxCD installation +│ ├── gpu-support/ # NVIDIA GPU Operator +│ ├── kserve/ # KServe installation +│ └── apps/ # ML model deployments +├── apps/ # Reusable app manifests +└── infrastructure/ # Base infrastructure +``` + +## Setup Instructions + +### 1. 
Bootstrap FluxCD + +```bash +flux bootstrap git \ + --url=ssh://git@gitea.example.com/edge-gitops/edge-gitops.git \ + --branch=main \ + --path=clusters/k3s-dgx \ + --components=source-controller,kustomize-controller,helm-controller,notification-controller +``` + +### 2. Configure Gitea SSH Key + +Generate SSH key for FluxCD: +```bash +ssh-keygen -t ed25519 -N "" -f flux-gitea-key +``` + +Add the public key to your Gitea repository as a deploy key. + +### 3. Update Repository Configuration + +Edit `clusters/k3s-dgx/flux-system/gotk-sync.yaml` to match your Gitea URL: +```yaml +url: ssh://git@your-gitea-instance.com/edge-gitops/edge-gitops.git +``` + +### 4. Deploy the Stack + +Commit and push the changes: +```bash +git add . +git commit -m "Initial GitOps setup for KServe on k3s" +git push origin main +``` + +FluxCD will automatically sync the changes to your cluster. + +## Components + +### GPU Support +- NVIDIA GPU Operator (v23.9.1) +- NVIDIA Device Plugin +- DCGM Exporter for monitoring +- GPU Node Feature Discovery + +### KServe +- KServe Core (v0.12.0) +- GPU-enabled Serving Runtime +- Istio Gateway for networking +- Model Storage (PVC) + +### Example Model +- Huihui-granite-4.1-30b-abliterated (Hugging Face) +- GPU-accelerated inference +- REST API endpoint + +## Usage + +### Deploy a New Model + +1. Create a new InferenceService in `clusters/k3s-dgx/apps/`: +```yaml +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: your-model + namespace: kserve +spec: + predictor: + model: + modelFormat: + name: huggingface + storageUri: "hf://your-org/your-model" + resources: + limits: + nvidia.com/gpu: "1" +``` + +2. 
Commit and push changes + +### Test the Model + +```bash +# Get the service URL +kubectl get inferenceservice huihui-granite -n kserve + +# Test inference +curl -X POST http://your-service-url/v1/models/huihui-granite:predict \ + -H "Content-Type: application/json" \ + -d '{"inputs": [{"name": "text", "shape": [1], "datatype": "BYTES", "data": ["Hello world"]}]}' +``` + +## Monitoring + +Check FluxCD status: +```bash +flux get all --all-namespaces +``` + +Check GPU status: +```bash +kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}' +``` + +Check KServe services: +```bash +kubectl get inferenceservices -n kserve +``` + +## Troubleshooting + +### GPU Not Available +```bash +kubectl describe node | grep -A 5 nvidia.com/gpu +``` + +### KServe Pods Not Starting +```bash +kubectl logs -n kserve deployment/kserve-controller-manager +kubectl get pods -n kserve +``` + +### FluxCD Sync Issues +```bash +flux reconcile kustomization flux-system --with-source +flux logs +``` + +## Customization + +### GPU Resources +Edit `clusters/k3s-dgx/apps/huihui-granite-inference.yaml` to adjust GPU allocation. + +### Storage +Modify `clusters/k3s-dgx/kserve/model-storage-pvc.yaml` for different storage requirements. + +### Networking +Update `clusters/k3s-dgx/kserve/istio-gateway.yaml` for custom ingress configuration. \ No newline at end of file diff --git a/SETUP.md b/SETUP.md new file mode 100644 index 0000000..f8038a0 --- /dev/null +++ b/SETUP.md @@ -0,0 +1,297 @@ +# Edge GitOps - Complete Setup Guide + +## Quick Start + +1. **Configure the setup:** + ```bash + ./configure.sh + ``` + +2. **Bootstrap FluxCD:** + ```bash + make bootstrap + ``` + +3. 
**Monitor the deployment:** + ```bash + make status + ``` + +## Directory Structure + +``` +edge-gitops/ +├── bootstrap.sh # FluxCD bootstrap script +├── configure.sh # Configuration wizard +├── Makefile # Convenient commands +├── README.md # Main documentation +├── .gitignore # Git ignore rules +├── .env # Environment variables (not committed) +│ +├── clusters/ +│ └── k3s-dgx/ # Cluster-specific configuration +│ ├── kustomization.yaml +│ │ +│ ├── flux-system/ # FluxCD installation +│ │ ├── kustomization.yaml +│ │ ├── gotk-components.yaml +│ │ └── gotk-sync.yaml +│ │ +│ ├── gpu-support/ # NVIDIA GPU Operator +│ │ ├── kustomization.yaml +│ │ ├── gpu-operator-namespace.yaml +│ │ ├── gpu-operator-helmrelease.yaml +│ │ └── gpu-node-labels.yaml +│ │ +│ ├── kserve/ # KServe installation +│ │ ├── kustomization.yaml +│ │ ├── kserve-namespace.yaml +│ │ ├── kserve-crds.yaml +│ │ ├── kserve-controller.yaml +│ │ ├── istio-gateway.yaml +│ │ ├── gpu-serving-runtime.yaml +│ │ ├── model-storage-pvc.yaml +│ │ └── storage-config.yaml +│ │ +│ └── apps/ # ML model deployments +│ ├── kustomization.yaml +│ └── huihui-granite-inference.yaml +│ +├── apps/ # Reusable application manifests +└── infrastructure/ # Base infrastructure components +``` + +## Component Details + +### FluxCD +- **Version:** Latest stable +- **Components:** source-controller, kustomize-controller, helm-controller, notification-controller +- **Sync Interval:** 1 minute for GitRepository, 10 minutes for Kustomization +- **Repository:** Gitea (configurable) + +### NVIDIA GPU Operator +- **Version:** v23.9.1 +- **Driver:** 535.129.03 +- **Components:** + - NVIDIA Driver + - Device Plugin + - DCGM Exporter + - MIG Manager + - Node Feature Discovery + +### KServe +- **Version:** v0.12.0 +- **Components:** + - KServe Controller + - Custom Resource Definitions + - GPU Serving Runtime + - Istio Integration + - Model Storage (50Gi PVC) + +### Example Model +- **Name:** Huihui-granite-4.1-30b-abliterated +- **Source:** 
Hugging Face +- **Resources:** + - CPU: 2-4 cores + - Memory: 8-16Gi + - GPU: 1 NVIDIA GPU + +## Common Tasks + +### Add a New Model + +1. Create a new InferenceService: + ```bash + cat > clusters/k3s-dgx/apps/your-model.yaml << EOF + apiVersion: serving.kserve.io/v1beta1 + kind: InferenceService + metadata: + name: your-model + namespace: kserve + spec: + predictor: + model: + modelFormat: + name: huggingface + storageUri: "hf://your-org/your-model" + resources: + limits: + nvidia.com/gpu: "1" + EOF + ``` + +2. Update the kustomization: + ```bash + echo " - your-model.yaml" >> clusters/k3s-dgx/apps/kustomization.yaml + ``` + +3. Commit and push: + ```bash + git add clusters/k3s-dgx/apps/ + git commit -m "Add new model deployment" + git push + ``` + +### Update GPU Resources + +Edit the resource limits in your InferenceService: +```yaml +resources: + limits: + cpu: "8" + memory: "32Gi" + nvidia.com/gpu: "2" + requests: + cpu: "4" + memory: "16Gi" + nvidia.com/gpu: "2" +``` + +### Monitor Model Performance + +```bash +# Get model endpoint +kubectl get inferenceservice your-model -n kserve + +# View logs +kubectl logs -n kserve -l serving.kserve.io/inferenceservice=your-model -f + +# Check GPU usage +kubectl exec -n kserve -- nvidia-smi +``` + +## Troubleshooting + +### FluxCD Not Syncing + +```bash +# Check FluxCD status +flux check + +# View logs +flux logs + +# Force sync +flux reconcile kustomization flux-system --with-source +``` + +### GPU Not Available + +```bash +# Check GPU nodes +kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}' + +# Check GPU operator +kubectl get pods -n gpu-operator + +# View GPU operator logs +kubectl logs -n gpu-operator deployment/gpu-operator +``` + +### KServe Issues + +```bash +# Check KServe pods +kubectl get pods -n kserve + +# Check KServe controller +kubectl logs -n kserve deployment/kserve-controller-manager + +# Describe InferenceService +kubectl describe inferenceservice your-model -n kserve +``` 
+ +## Security Considerations + +1. **Secrets Management:** + - Never commit secrets to git + - Use Kubernetes secrets for sensitive data + - Consider using Sealed Secrets or External Secrets Operator + +2. **Network Policies:** + - Review and restrict network access + - Use Istio for service mesh security + +3. **RBAC:** + - Review FluxCD service account permissions + - Implement principle of least privilege + +## Performance Optimization + +### GPU Optimization +- Use appropriate GPU resource requests/limits +- Monitor GPU utilization with DCGM Exporter +- Consider MIG (Multi-Instance GPU) for better isolation + +### Storage Optimization +- Use fast storage for model cache +- Consider using ReadWriteMany for multi-pod access +- Implement model caching strategies + +### Network Optimization +- Use Istio for efficient load balancing +- Configure appropriate timeouts for large models +- Consider using gRPC for internal communication + +## Scaling + +### Horizontal Scaling +```yaml +# Add to InferenceService +spec: + predictor: + replicas: 3 +``` + +### Vertical Scaling +```yaml +# Update resource limits +resources: + limits: + nvidia.com/gpu: "2" +``` + +## Monitoring + +### Metrics Collection +- DCGM Exporter for GPU metrics +- Prometheus for cluster metrics +- KServe metrics for inference performance + +### Logging +- Structured logging for all components +- Centralized logging with Loki/ELK +- Log retention policies + +### Alerting +- GPU utilization alerts +- Model health alerts +- Resource exhaustion alerts + +## Backup and Recovery + +### GitOps Backup +- All configuration is in git +- Easy rollback with git revert +- Branch-based testing + +### Data Backup +- Model storage backup +- Configuration backup +- Disaster recovery plan + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Make your changes +4. Test thoroughly +5. 
Submit a pull request + +## Support + +For issues and questions: +- Check the troubleshooting section +- Review component documentation +- Check FluxCD, KServe, and GPU Operator docs +- Open an issue in the repository \ No newline at end of file diff --git a/bootstrap.sh b/bootstrap.sh new file mode 100755 index 0000000..176932a --- /dev/null +++ b/bootstrap.sh @@ -0,0 +1,96 @@ +#!/bin/bash + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Configuration +GITEA_URL="${GITEA_URL:-ssh://git@gitea.example.com/edge-gitops/edge-gitops.git}" +GITEA_BRANCH="${GITEA_BRANCH:-main}" +CLUSTER_NAME="${CLUSTER_NAME:-k3s-dgx}" +NAMESPACE="${NAMESPACE:-flux-system}" + +echo -e "${GREEN}=== Edge GitOps Bootstrap Script ===${NC}" +echo "" + +# Check prerequisites +echo -e "${YELLOW}Checking prerequisites...${NC}" + +if ! command -v kubectl &> /dev/null; then + echo -e "${RED}kubectl is not installed${NC}" + exit 1 +fi + +if ! command -v flux &> /dev/null; then + echo -e "${RED}flux is not installed${NC}" + echo "Install from: https://fluxcd.io/flux/installation/" + exit 1 +fi + +if ! kubectl cluster-info &> /dev/null; then + echo -e "${RED}Cannot connect to kubernetes cluster${NC}" + exit 1 +fi + +echo -e "${GREEN}✓ Prerequisites met${NC}" +echo "" + +# Check if Flux is already installed +if kubectl get namespace flux-system &> /dev/null; then + echo -e "${YELLOW}FluxCD is already installed${NC}" + read -p "Do you want to reinstall? (y/n) " -n 1 -r + echo + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo "Exiting..." 
+ exit 0 + fi + echo -e "${YELLOW}Uninstalling existing FluxCD...${NC}" + flux uninstall --namespace=flux-system --silent || true +fi + +# Bootstrap FluxCD +echo -e "${YELLOW}Bootstrapping FluxCD...${NC}" +flux bootstrap git \ + --url="$GITEA_URL" \ + --branch="$GITEA_BRANCH" \ + --path="clusters/$CLUSTER_NAME" \ + --namespace="$NAMESPACE" \ + --components=source-controller,kustomize-controller,helm-controller,notification-controller \ + --timeout=10m + +echo -e "${GREEN}✓ FluxCD bootstrapped${NC}" +echo "" + +# Wait for FluxCD to be ready +echo -e "${YELLOW}Waiting for FluxCD components to be ready...${NC}" +kubectl wait --for=condition=available --timeout=300s \ + -n "$NAMESPACE" \ + deployment/source-controller \ + deployment/kustomize-controller \ + deployment/helm-controller \ + deployment/notification-controller + +echo -e "${GREEN}✓ FluxCD components ready${NC}" +echo "" + +# Verify installation +echo -e "${YELLOW}Verifying installation...${NC}" +flux check + +echo "" +echo -e "${GREEN}=== Bootstrap Complete ===${NC}" +echo "" +echo "Next steps:" +echo "1. Update the Gitea URL in clusters/k3s-dgx/flux-system/gotk-sync.yaml" +echo "2. Commit and push the changes to your repository" +echo "3. 
Monitor the sync: flux get all --all-namespaces" +echo "" +echo "Useful commands:" +echo " flux get all --all-namespaces # Show all Flux resources" +echo " flux logs # Show Flux logs" +echo " flux reconcile kustomization flux-system --with-source # Force sync" +echo "" \ No newline at end of file diff --git a/clusters/k3s-dgx/apps/huihui-granite-inference.yaml b/clusters/k3s-dgx/apps/huihui-granite-inference.yaml new file mode 100644 index 0000000..bad3c46 --- /dev/null +++ b/clusters/k3s-dgx/apps/huihui-granite-inference.yaml @@ -0,0 +1,22 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + name: huihui-granite + namespace: kserve +spec: + predictor: + model: + modelFormat: + name: huggingface + args: + - --model_name=huihui-granite + storageUri: "hf://huihui-ai/Huihui-granite-4.1-30b-abliterated" + resources: + limits: + cpu: "4" + memory: 16Gi + nvidia.com/gpu: "1" + requests: + cpu: "2" + memory: 8Gi + nvidia.com/gpu: "1" \ No newline at end of file diff --git a/clusters/k3s-dgx/apps/kustomization.yaml b/clusters/k3s-dgx/apps/kustomization.yaml new file mode 100644 index 0000000..345341d --- /dev/null +++ b/clusters/k3s-dgx/apps/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: kserve +resources: + - huihui-granite-inference.yaml \ No newline at end of file diff --git a/clusters/k3s-dgx/flux-system/gotk-components.yaml b/clusters/k3s-dgx/flux-system/gotk-components.yaml new file mode 100644 index 0000000..bb5facd --- /dev/null +++ b/clusters/k3s-dgx/flux-system/gotk-components.yaml @@ -0,0 +1,50 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: flux-system +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: helm-controller + namespace: flux-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cluster-reconciler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-admin +subjects: + 
- kind: ServiceAccount + name: helm-controller + namespace: flux-system +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: bitnami + namespace: flux-system +spec: + interval: 30m + url: https://charts.bitnami.com/bitnami +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: kserve + namespace: flux-system +spec: + interval: 30m + url: https://kserve.github.io/kserve +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: nvidia + namespace: flux-system +spec: + interval: 30m + url: https://nvidia.github.io/k8s-device-plugin \ No newline at end of file diff --git a/clusters/k3s-dgx/flux-system/gotk-sync.yaml b/clusters/k3s-dgx/flux-system/gotk-sync.yaml new file mode 100644 index 0000000..5132246 --- /dev/null +++ b/clusters/k3s-dgx/flux-system/gotk-sync.yaml @@ -0,0 +1,34 @@ +apiVersion: source.toolkit.fluxcd.io/v1 +kind: GitRepository +metadata: + name: flux-system + namespace: flux-system +spec: + interval: 1m0s + ref: + branch: main + secretRef: + name: flux-system + url: ssh://git@gitea.example.com/edge-gitops/edge-gitops.git +--- +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: flux-system + namespace: flux-system +spec: + interval: 10m0s + path: ./clusters/k3s-dgx + prune: true + sourceRef: + kind: GitRepository + name: flux-system + healthChecks: + - apiVersion: apps/v1 + kind: Deployment + name: kustomize-controller + namespace: flux-system + - apiVersion: apps/v1 + kind: Deployment + name: helm-controller + namespace: flux-system \ No newline at end of file diff --git a/clusters/k3s-dgx/flux-system/kustomization.yaml b/clusters/k3s-dgx/flux-system/kustomization.yaml new file mode 100644 index 0000000..ca97d07 --- /dev/null +++ b/clusters/k3s-dgx/flux-system/kustomization.yaml @@ -0,0 +1,13 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - gotk-components.yaml + - gotk-sync.yaml +patches: + 
- patch: | + - op: add + path: /spec/template/spec/containers/0/args/- + value: --concurrency=20 + target: + kind: Deployment + name: "(kustomize-controller|helm-controller|notification-controller|source-controller)" \ No newline at end of file diff --git a/clusters/k3s-dgx/gpu-support/gpu-node-labels.yaml b/clusters/k3s-dgx/gpu-support/gpu-node-labels.yaml new file mode 100644 index 0000000..6551695 --- /dev/null +++ b/clusters/k3s-dgx/gpu-support/gpu-node-labels.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: gpu-node-labels + namespace: gpu-operator +data: + labels.yaml: | + - key: accelerator + value: nvidia-tesla + - key: nvidia.com/gpu.present + value: "true" + - key: topology.kubernetes.io/zone + value: "dgx-spark" \ No newline at end of file diff --git a/clusters/k3s-dgx/gpu-support/gpu-operator-helmrelease.yaml b/clusters/k3s-dgx/gpu-support/gpu-operator-helmrelease.yaml new file mode 100644 index 0000000..4178071 --- /dev/null +++ b/clusters/k3s-dgx/gpu-support/gpu-operator-helmrelease.yaml @@ -0,0 +1,70 @@ +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: nvidia + namespace: gpu-operator +spec: + interval: 10m + url: https://nvidia.github.io/gpu-operator +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: gpu-operator + namespace: gpu-operator +spec: + interval: 10m + chart: + spec: + chart: gpu-operator + version: "v23.9.1" + sourceRef: + kind: HelmRepository + name: nvidia + namespace: gpu-operator + values: + driver: + enabled: true + image: "nvcr.io/nvidia/driver" + version: "535.129.03" + operator: + defaultRuntime: containerd + toolkit: + enabled: true + image: "nvcr.io/nvidia/k8s/container-toolkit" + version: "v1.14.3-ubuntu20.04" + env: + - name: CONTAINERD_CONFIG + value: /var/lib/rancher/k3s/agent/etc/containerd/config.toml + - name: CONTAINERD_SOCKET + value: /run/k3s/containerd/containerd.sock + - name: CONTAINERD_RUNTIME_CLASS + value: nvidia + 
devicePlugin: + enabled: true + image: "nvcr.io/nvidia/k8s-device-plugin" + version: "v0.14.0" + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + dcgm: + enabled: true + image: "nvcr.io/nvidia/dcgm-exporter" + version: "3.3.3-3.1.0-ubuntu22.04" + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 500Mi + migManager: + enabled: true + gfd: + enabled: true + node-feature-discovery: + enabled: true \ No newline at end of file diff --git a/clusters/k3s-dgx/gpu-support/gpu-operator-namespace.yaml b/clusters/k3s-dgx/gpu-support/gpu-operator-namespace.yaml new file mode 100644 index 0000000..96a4870 --- /dev/null +++ b/clusters/k3s-dgx/gpu-support/gpu-operator-namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: gpu-operator + labels: + openshift.io/cluster-monitoring: "true" \ No newline at end of file diff --git a/clusters/k3s-dgx/gpu-support/kustomization.yaml b/clusters/k3s-dgx/gpu-support/kustomization.yaml new file mode 100644 index 0000000..03363b3 --- /dev/null +++ b/clusters/k3s-dgx/gpu-support/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: gpu-operator +resources: + - gpu-operator-namespace.yaml + - gpu-operator-helmrelease.yaml + - gpu-node-labels.yaml \ No newline at end of file diff --git a/clusters/k3s-dgx/gpu-support/nvidia-device-plugin.yaml b/clusters/k3s-dgx/gpu-support/nvidia-device-plugin.yaml new file mode 100644 index 0000000..49d19f5 --- /dev/null +++ b/clusters/k3s-dgx/gpu-support/nvidia-device-plugin.yaml @@ -0,0 +1,39 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: gpu-operator +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin-daemonset + namespace: gpu-operator +spec: + selector: + matchLabels: + name: nvidia-device-plugin-ds + template: + metadata: + labels: + name: nvidia-device-plugin-ds + spec: + tolerations: + - key: nvidia.com/gpu + 
operator: Exists + effect: NoSchedule + containers: + - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.0 + name: nvidia-device-plugin-ctr + args: ["--fail-on-init-error=false"] + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: MIG_STRATEGY + value: "single" + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins \ No newline at end of file diff --git a/clusters/k3s-dgx/gpu-support/runtime-class.yaml b/clusters/k3s-dgx/gpu-support/runtime-class.yaml new file mode 100644 index 0000000..adc5838 --- /dev/null +++ b/clusters/k3s-dgx/gpu-support/runtime-class.yaml @@ -0,0 +1,16 @@ +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: nvidia +handler: nvidia +overhead: + podFixed: + memory: "1Gi" + cpu: "500m" +scheduling: + nodeSelector: + nvidia.com/gpu.present: "true" + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule \ No newline at end of file diff --git a/clusters/k3s-dgx/kserve/gpu-serving-runtime.yaml b/clusters/k3s-dgx/kserve/gpu-serving-runtime.yaml new file mode 100644 index 0000000..84a49d9 --- /dev/null +++ b/clusters/k3s-dgx/kserve/gpu-serving-runtime.yaml @@ -0,0 +1,42 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: ServingRuntime +metadata: + name: kserve-gpu-runtime + namespace: kserve +spec: + supportedModelFormats: + - name: tensorflow + version: "2" + autoSelect: true + - name: pytorch + version: "2" + autoSelect: true + - name: sklearn + version: "1" + autoSelect: true + - name: xgboost + version: "1" + autoSelect: true + protocol: v1 + protocolVersions: + - v1 + - v2 + containers: + - name: kserve-container + image: kserve/sklearnserver-gpu:latest + resources: + requests: + cpu: "1" + memory: "2Gi" + nvidia.com/gpu: "1" + limits: + cpu: "4" + memory: "8Gi" + nvidia.com/gpu: "1" + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: NVIDIA_DRIVER_CAPABILITIES + value: 
"compute,utility" + multiModel: false + disabled: false \ No newline at end of file diff --git a/clusters/k3s-dgx/kserve/istio-gateway.yaml b/clusters/k3s-dgx/kserve/istio-gateway.yaml new file mode 100644 index 0000000..3bf4b70 --- /dev/null +++ b/clusters/k3s-dgx/kserve/istio-gateway.yaml @@ -0,0 +1,35 @@ +apiVersion: networking.istio.io/v1beta1 +kind: Gateway +metadata: + name: kserve-gateway + namespace: kserve +spec: + selector: + istio: ingressgateway + servers: + - port: + number: 80 + name: http + protocol: HTTP + hosts: + - "*" +--- +apiVersion: networking.istio.io/v1beta1 +kind: VirtualService +metadata: + name: kserve-vs + namespace: kserve +spec: + hosts: + - "*" + gateways: + - kserve-gateway + http: + - match: + - uri: + prefix: /v1/models/ + route: + - destination: + host: kserve-default + port: + number: 80 \ No newline at end of file diff --git a/clusters/k3s-dgx/kserve/kserve-controller.yaml b/clusters/k3s-dgx/kserve/kserve-controller.yaml new file mode 100644 index 0000000..9c4c0ac --- /dev/null +++ b/clusters/k3s-dgx/kserve/kserve-controller.yaml @@ -0,0 +1,43 @@ +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: kserve + namespace: kserve +spec: + interval: 10m + url: https://kserve.github.io/kserve +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: kserve + namespace: kserve +spec: + interval: 10m + chart: + spec: + chart: kserve + version: "v0.12.0" + sourceRef: + kind: HelmRepository + name: kserve + namespace: kserve + values: + controller: + resources: + requests: + cpu: 500m + memory: 512Mi + limits: + cpu: 2 + memory: 2Gi + config: + ingress: + className: istio + storage: + initialCapacity: 10Gi + storageClassName: local-path + knative: + enabled: true + istio: + enabled: true diff --git a/clusters/k3s-dgx/kserve/kserve-namespace.yaml b/clusters/k3s-dgx/kserve/kserve-namespace.yaml new file mode 100644 index 0000000..f102934 --- /dev/null +++ 
b/clusters/k3s-dgx/kserve/kserve-namespace.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: kserve + labels: + istio-injection: enabled + serving.kserve.io/serving-runtime: "true" \ No newline at end of file diff --git a/clusters/k3s-dgx/kserve/kustomization.yaml b/clusters/k3s-dgx/kserve/kustomization.yaml new file mode 100644 index 0000000..19447b8 --- /dev/null +++ b/clusters/k3s-dgx/kserve/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: kserve +resources: + - kserve-namespace.yaml + - kserve-controller.yaml + - istio-gateway.yaml + # - gpu-serving-runtime.yaml + # - model-storage-pvc.yaml + # - storage-config.yaml diff --git a/clusters/k3s-dgx/kserve/model-storage-pvc.yaml b/clusters/k3s-dgx/kserve/model-storage-pvc.yaml new file mode 100644 index 0000000..a7e81a8 --- /dev/null +++ b/clusters/k3s-dgx/kserve/model-storage-pvc.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: model-storage-pvc + namespace: kserve +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: local-path \ No newline at end of file diff --git a/clusters/k3s-dgx/kserve/storage-config.yaml b/clusters/k3s-dgx/kserve/storage-config.yaml new file mode 100644 index 0000000..7cf2122 --- /dev/null +++ b/clusters/k3s-dgx/kserve/storage-config.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: storage-config + namespace: kserve +data: + storageConfig.yaml: | + defaultStorageUri: "pvc://model-storage" + storageSpec: + - name: model-storage + type: pvc + pvcSpec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: local-path + modelCacheSize: 10Gi + modelCacheMemory: 2Gi \ No newline at end of file diff --git a/clusters/k3s-dgx/kustomization.yaml b/clusters/k3s-dgx/kustomization.yaml new file mode 100644 index 0000000..5a7ac53 --- /dev/null +++ 
b/clusters/k3s-dgx/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - flux-system + - gpu-support + - kserve + - apps \ No newline at end of file diff --git a/configure.sh b/configure.sh new file mode 100755 index 0000000..642802a --- /dev/null +++ b/configure.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# Configuration script for Edge GitOps + +set -e + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +echo -e "${BLUE}=== Edge GitOps Configuration ===${NC}" +echo "" + +# Function to update configuration files +update_config() { + local file=$1 + local key=$2 + local value=$3 + + if [ -f "$file" ]; then + sed -i "s|$key|$value|g" "$file" + echo -e "${GREEN}✓ Updated $file${NC}" + else + echo -e "${RED}✗ File not found: $file${NC}" + fi +} + +# Get Gitea URL +echo -e "${YELLOW}Enter your Gitea repository URL:${NC}" +read -p "URL (default: ssh://git@gitea.example.com/edge-gitops/edge-gitops.git): " GITEA_URL +GITEA_URL=${GITEA_URL:-ssh://git@gitea.example.com/edge-gitops/edge-gitops.git} + +# Get branch name +echo -e "${YELLOW}Enter your branch name:${NC}" +read -p "Branch (default: main): " GITEA_BRANCH +GITEA_BRANCH=${GITEA_BRANCH:-main} + +# Get cluster name +echo -e "${YELLOW}Enter your cluster name:${NC}" +read -p "Cluster name (default: k3s-dgx): " CLUSTER_NAME +CLUSTER_NAME=${CLUSTER_NAME:-k3s-dgx} + +echo "" +echo -e "${BLUE}Configuration Summary:${NC}" +echo " Gitea URL: $GITEA_URL" +echo " Branch: $GITEA_BRANCH" +echo " Cluster: $CLUSTER_NAME" +echo "" + +read -p "Apply these settings? (y/n) " -n 1 -r +echo +if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo "Aborting..." 
+ exit 0 +fi + +# Update configuration files +echo -e "${YELLOW}Updating configuration files...${NC}" + +update_config "clusters/k3s-dgx/flux-system/gotk-sync.yaml" \ + "ssh://git@gitea.example.com/edge-gitops/edge-gitops.git" \ + "$GITEA_URL" + +update_config "clusters/k3s-dgx/flux-system/gotk-sync.yaml" \ + "branch: main" \ + "branch: $GITEA_BRANCH" + +# Create environment file +cat > .env << EOF +GITEA_URL=$GITEA_URL +GITEA_BRANCH=$GITEA_BRANCH +CLUSTER_NAME=$CLUSTER_NAME +EOF + +echo -e "${GREEN}✓ Created .env file${NC}" + +echo "" +echo -e "${GREEN}=== Configuration Complete ===${NC}" +echo "" +echo "Next steps:" +echo "1. Review the changes: git diff" +echo "2. Commit the changes: git add . && git commit -m 'Configure GitOps settings'" +echo "3. Push to repository: git push origin $GITEA_BRANCH" +echo "4. Run bootstrap: ./bootstrap.sh" +echo "" \ No newline at end of file