init commit
This commit is contained in:
61
Makefile
Normal file
61
Makefile
Normal file
@@ -0,0 +1,61 @@
|
||||
# Convenience wrappers around flux/kubectl for the edge-gitops repo.
# None of these targets produce files, so every one must be declared phony.
# Fix: the original .PHONY line omitted apply, diff, restart, gpu-status,
# kserve-status, model-logs and secrets.
.PHONY: help bootstrap sync status logs clean test apply diff restart gpu-status kserve-status model-logs secrets

help: ## Show this help message
	@echo 'Usage: make [target]'
	@echo ''
	@echo 'Available targets:'
	@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "  %-20s %s\n", $$1, $$2}' $(MAKEFILE_LIST)

bootstrap: ## Bootstrap FluxCD on the cluster
	@./bootstrap.sh

# NOTE(review): `flux reconcile helmrelease --all` may not be accepted by all
# flux CLI versions (reconcile usually takes a resource name) — confirm
# against the installed CLI.
sync: ## Force sync all FluxCD resources
	@flux reconcile kustomization flux-system --with-source
	@flux reconcile helmrelease --all

status: ## Show status of all FluxCD resources
	@flux get all --all-namespaces

logs: ## Show FluxCD logs
	@flux logs --all-namespaces

clean: ## Remove FluxCD from cluster
	@flux uninstall --namespace=flux-system --silent

test: ## Test the cluster connectivity
	@echo "Testing cluster connectivity..."
	@kubectl cluster-info
	@echo ""
	@echo "Testing GPU availability..."
	@kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'
	@echo ""
	@echo "Testing FluxCD..."
	@flux check

apply: ## Apply all manifests directly (for testing)
	@kubectl apply -k clusters/k3s-dgx

diff: ## Show diff between cluster and git
	@flux diff kustomization flux-system --with-source

restart: ## Restart all FluxCD controllers
	@kubectl rollout restart deployment/source-controller -n flux-system
	@kubectl rollout restart deployment/kustomize-controller -n flux-system
	@kubectl rollout restart deployment/helm-controller -n flux-system
	@kubectl rollout restart deployment/notification-controller -n flux-system

gpu-status: ## Show GPU status
	@kubectl describe nodes | grep -A 5 "nvidia.com/gpu"

kserve-status: ## Show KServe status
	@kubectl get inferenceservices -n kserve
	@kubectl get pods -n kserve

model-logs: ## Show model inference logs
	@kubectl logs -n kserve -l serving.kserve.io/inferenceservice=huihui-granite --tail=100 -f

secrets: ## Generate example secrets (DO NOT COMMIT)
	@echo "Creating example secrets directory..."
	@mkdir -p secrets
	@echo "# Add your secrets here" > secrets/README.md
	@echo "# DO NOT commit actual secrets to git" >> secrets/README.md
|
||||
165
README.md
Normal file
165
README.md
Normal file
@@ -0,0 +1,165 @@
|
||||
# Edge GitOps - KServe on k3s with GPU
|
||||
|
||||
GitOps setup for deploying ML models using KServe on a k3s cluster with GPU support (DGX Spark).
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- k3s cluster with GPU support
|
||||
- kubectl configured to access the cluster
|
||||
- Gitea instance for GitOps repository
|
||||
- FluxCD CLI installed
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
edge-gitops/
|
||||
├── clusters/
|
||||
│ └── k3s-dgx/
|
||||
│ ├── flux-system/ # FluxCD installation
|
||||
│ ├── gpu-support/ # NVIDIA GPU Operator
|
||||
│ ├── kserve/ # KServe installation
|
||||
│ └── apps/ # ML model deployments
|
||||
├── apps/ # Reusable app manifests
|
||||
└── infrastructure/ # Base infrastructure
|
||||
```
|
||||
|
||||
## Setup Instructions
|
||||
|
||||
### 1. Bootstrap FluxCD
|
||||
|
||||
```bash
|
||||
flux bootstrap git \
|
||||
--url=ssh://git@gitea.example.com/edge-gitops/edge-gitops.git \
|
||||
--branch=main \
|
||||
--path=clusters/k3s-dgx \
|
||||
--components=source-controller,kustomize-controller,helm-controller,notification-controller
|
||||
```
|
||||
|
||||
### 2. Configure Gitea SSH Key
|
||||
|
||||
Generate SSH key for FluxCD:
|
||||
```bash
|
||||
ssh-keygen -t ed25519 -N "" -f flux-gitea-key
|
||||
```
|
||||
|
||||
Add the public key to your Gitea repository as a deploy key.
|
||||
|
||||
### 3. Update Repository Configuration
|
||||
|
||||
Edit `clusters/k3s-dgx/flux-system/gotk-sync.yaml` to match your Gitea URL:
|
||||
```yaml
|
||||
url: ssh://git@your-gitea-instance.com/edge-gitops/edge-gitops.git
|
||||
```
|
||||
|
||||
### 4. Deploy the Stack
|
||||
|
||||
Commit and push the changes:
|
||||
```bash
|
||||
git add .
|
||||
git commit -m "Initial GitOps setup for KServe on k3s"
|
||||
git push origin main
|
||||
```
|
||||
|
||||
FluxCD will automatically sync the changes to your cluster.
|
||||
|
||||
## Components
|
||||
|
||||
### GPU Support
|
||||
- NVIDIA GPU Operator (v23.9.1)
|
||||
- NVIDIA Device Plugin
|
||||
- DCGM Exporter for monitoring
|
||||
- GPU Node Feature Discovery
|
||||
|
||||
### KServe
|
||||
- KServe Core (v0.12.0)
|
||||
- GPU-enabled Serving Runtime
|
||||
- Istio Gateway for networking
|
||||
- Model Storage (PVC)
|
||||
|
||||
### Example Model
|
||||
- Huihui-granite-4.1-30b-abliterated (Hugging Face)
|
||||
- GPU-accelerated inference
|
||||
- REST API endpoint
|
||||
|
||||
## Usage
|
||||
|
||||
### Deploy a New Model
|
||||
|
||||
1. Create a new InferenceService in `clusters/k3s-dgx/apps/`:
|
||||
```yaml
|
||||
apiVersion: serving.kserve.io/v1beta1
|
||||
kind: InferenceService
|
||||
metadata:
|
||||
name: your-model
|
||||
namespace: kserve
|
||||
spec:
|
||||
predictor:
|
||||
model:
|
||||
modelFormat:
|
||||
name: huggingface
|
||||
storageUri: "hf://your-org/your-model"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "1"
|
||||
```
|
||||
|
||||
2. Commit and push changes
|
||||
|
||||
### Test the Model
|
||||
|
||||
```bash
|
||||
# Get the service URL
|
||||
kubectl get inferenceservice huihui-granite -n kserve
|
||||
|
||||
# Test inference
|
||||
curl -X POST http://your-service-url/v1/models/huihui-granite:predict \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"inputs": [{"name": "text", "shape": [1], "datatype": "BYTES", "data": ["Hello world"]}]}'
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
Check FluxCD status:
|
||||
```bash
|
||||
flux get all --all-namespaces
|
||||
```
|
||||
|
||||
Check GPU status:
|
||||
```bash
|
||||
kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'
|
||||
```
|
||||
|
||||
Check KServe services:
|
||||
```bash
|
||||
kubectl get inferenceservices -n kserve
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### GPU Not Available
|
||||
```bash
|
||||
kubectl describe node | grep -A 5 nvidia.com/gpu
|
||||
```
|
||||
|
||||
### KServe Pods Not Starting
|
||||
```bash
|
||||
kubectl logs -n kserve deployment/kserve-controller-manager
|
||||
kubectl get pods -n kserve
|
||||
```
|
||||
|
||||
### FluxCD Sync Issues
|
||||
```bash
|
||||
flux reconcile kustomization flux-system --with-source
|
||||
flux logs
|
||||
```
|
||||
|
||||
## Customization
|
||||
|
||||
### GPU Resources
|
||||
Edit `clusters/k3s-dgx/apps/huihui-granite-inference.yaml` to adjust GPU allocation.
|
||||
|
||||
### Storage
|
||||
Modify `clusters/k3s-dgx/kserve/model-storage-pvc.yaml` for different storage requirements.
|
||||
|
||||
### Networking
|
||||
Update `clusters/k3s-dgx/kserve/istio-gateway.yaml` for custom ingress configuration.
|
||||
297
SETUP.md
Normal file
297
SETUP.md
Normal file
@@ -0,0 +1,297 @@
|
||||
# Edge GitOps - Complete Setup Guide
|
||||
|
||||
## Quick Start
|
||||
|
||||
1. **Configure the setup:**
|
||||
```bash
|
||||
./configure.sh
|
||||
```
|
||||
|
||||
2. **Bootstrap FluxCD:**
|
||||
```bash
|
||||
make bootstrap
|
||||
```
|
||||
|
||||
3. **Monitor the deployment:**
|
||||
```bash
|
||||
make status
|
||||
```
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
edge-gitops/
|
||||
├── bootstrap.sh # FluxCD bootstrap script
|
||||
├── configure.sh # Configuration wizard
|
||||
├── Makefile # Convenient commands
|
||||
├── README.md # Main documentation
|
||||
├── .gitignore # Git ignore rules
|
||||
├── .env # Environment variables (not committed)
|
||||
│
|
||||
├── clusters/
|
||||
│ └── k3s-dgx/ # Cluster-specific configuration
|
||||
│ ├── kustomization.yaml
|
||||
│ │
|
||||
│ ├── flux-system/ # FluxCD installation
|
||||
│ │ ├── kustomization.yaml
|
||||
│ │ ├── gotk-components.yaml
|
||||
│ │ └── gotk-sync.yaml
|
||||
│ │
|
||||
│ ├── gpu-support/ # NVIDIA GPU Operator
|
||||
│ │ ├── kustomization.yaml
|
||||
│ │ ├── gpu-operator-namespace.yaml
|
||||
│ │ ├── gpu-operator-helmrelease.yaml
|
||||
│ │ └── gpu-node-labels.yaml
|
||||
│ │
|
||||
│ ├── kserve/ # KServe installation
|
||||
│ │ ├── kustomization.yaml
|
||||
│ │ ├── kserve-namespace.yaml
|
||||
│ │ ├── kserve-crds.yaml
|
||||
│ │ ├── kserve-controller.yaml
|
||||
│ │ ├── istio-gateway.yaml
|
||||
│ │ ├── gpu-serving-runtime.yaml
|
||||
│ │ ├── model-storage-pvc.yaml
|
||||
│ │ └── storage-config.yaml
|
||||
│ │
|
||||
│ └── apps/ # ML model deployments
|
||||
│ ├── kustomization.yaml
|
||||
│ └── huihui-granite-inference.yaml
|
||||
│
|
||||
├── apps/ # Reusable application manifests
|
||||
└── infrastructure/ # Base infrastructure components
|
||||
```
|
||||
|
||||
## Component Details
|
||||
|
||||
### FluxCD
|
||||
- **Version:** Latest stable
|
||||
- **Components:** source-controller, kustomize-controller, helm-controller, notification-controller
|
||||
- **Sync Interval:** 1 minute for GitRepository, 10 minutes for Kustomization
|
||||
- **Repository:** Gitea (configurable)
|
||||
|
||||
### NVIDIA GPU Operator
|
||||
- **Version:** v23.9.1
|
||||
- **Driver:** 535.129.03
|
||||
- **Components:**
|
||||
- NVIDIA Driver
|
||||
- Device Plugin
|
||||
- DCGM Exporter
|
||||
- MIG Manager
|
||||
- Node Feature Discovery
|
||||
|
||||
### KServe
|
||||
- **Version:** v0.12.0
|
||||
- **Components:**
|
||||
- KServe Controller
|
||||
- Custom Resource Definitions
|
||||
- GPU Serving Runtime
|
||||
- Istio Integration
|
||||
- Model Storage (50Gi PVC)
|
||||
|
||||
### Example Model
|
||||
- **Name:** Huihui-granite-4.1-30b-abliterated
|
||||
- **Source:** Hugging Face
|
||||
- **Resources:**
|
||||
- CPU: 2-4 cores
|
||||
- Memory: 8-16Gi
|
||||
- GPU: 1 NVIDIA GPU
|
||||
|
||||
## Common Tasks
|
||||
|
||||
### Add a New Model
|
||||
|
||||
1. Create a new InferenceService:
|
||||
```bash
|
||||
cat > clusters/k3s-dgx/apps/your-model.yaml << EOF
|
||||
apiVersion: serving.kserve.io/v1beta1
|
||||
kind: InferenceService
|
||||
metadata:
|
||||
name: your-model
|
||||
namespace: kserve
|
||||
spec:
|
||||
predictor:
|
||||
model:
|
||||
modelFormat:
|
||||
name: huggingface
|
||||
storageUri: "hf://your-org/your-model"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "1"
|
||||
EOF
|
||||
```
|
||||
|
||||
2. Update the kustomization:
|
||||
```bash
|
||||
echo " - your-model.yaml" >> clusters/k3s-dgx/apps/kustomization.yaml
|
||||
```
|
||||
|
||||
3. Commit and push:
|
||||
```bash
|
||||
git add clusters/k3s-dgx/apps/
|
||||
git commit -m "Add new model deployment"
|
||||
git push
|
||||
```
|
||||
|
||||
### Update GPU Resources
|
||||
|
||||
Edit the resource limits in your InferenceService:
|
||||
```yaml
|
||||
resources:
|
||||
limits:
|
||||
cpu: "8"
|
||||
memory: "32Gi"
|
||||
nvidia.com/gpu: "2"
|
||||
requests:
|
||||
cpu: "4"
|
||||
memory: "16Gi"
|
||||
nvidia.com/gpu: "2"
|
||||
```
|
||||
|
||||
### Monitor Model Performance
|
||||
|
||||
```bash
|
||||
# Get model endpoint
|
||||
kubectl get inferenceservice your-model -n kserve
|
||||
|
||||
# View logs
|
||||
kubectl logs -n kserve -l serving.kserve.io/inferenceservice=your-model -f
|
||||
|
||||
# Check GPU usage
|
||||
kubectl exec -n kserve <pod-name> -- nvidia-smi
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### FluxCD Not Syncing
|
||||
|
||||
```bash
|
||||
# Check FluxCD status
|
||||
flux check
|
||||
|
||||
# View logs
|
||||
flux logs
|
||||
|
||||
# Force sync
|
||||
flux reconcile kustomization flux-system --with-source
|
||||
```
|
||||
|
||||
### GPU Not Available
|
||||
|
||||
```bash
|
||||
# Check GPU nodes
|
||||
kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'
|
||||
|
||||
# Check GPU operator
|
||||
kubectl get pods -n gpu-operator
|
||||
|
||||
# View GPU operator logs
|
||||
kubectl logs -n gpu-operator deployment/gpu-operator
|
||||
```
|
||||
|
||||
### KServe Issues
|
||||
|
||||
```bash
|
||||
# Check KServe pods
|
||||
kubectl get pods -n kserve
|
||||
|
||||
# Check KServe controller
|
||||
kubectl logs -n kserve deployment/kserve-controller-manager
|
||||
|
||||
# Describe InferenceService
|
||||
kubectl describe inferenceservice your-model -n kserve
|
||||
```
|
||||
|
||||
## Security Considerations
|
||||
|
||||
1. **Secrets Management:**
|
||||
- Never commit secrets to git
|
||||
- Use Kubernetes secrets for sensitive data
|
||||
- Consider using Sealed Secrets or External Secrets Operator
|
||||
|
||||
2. **Network Policies:**
|
||||
- Review and restrict network access
|
||||
- Use Istio for service mesh security
|
||||
|
||||
3. **RBAC:**
|
||||
- Review FluxCD service account permissions
|
||||
- Implement principle of least privilege
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
### GPU Optimization
|
||||
- Use appropriate GPU resource requests/limits
|
||||
- Monitor GPU utilization with DCGM Exporter
|
||||
- Consider MIG (Multi-Instance GPU) for better isolation
|
||||
|
||||
### Storage Optimization
|
||||
- Use fast storage for model cache
|
||||
- Consider using ReadWriteMany for multi-pod access
|
||||
- Implement model caching strategies
|
||||
|
||||
### Network Optimization
|
||||
- Use Istio for efficient load balancing
|
||||
- Configure appropriate timeouts for large models
|
||||
- Consider using gRPC for internal communication
|
||||
|
||||
## Scaling
|
||||
|
||||
### Horizontal Scaling
|
||||
```yaml
|
||||
# Add to InferenceService
|
||||
spec:
|
||||
predictor:
|
||||
replicas: 3
|
||||
```
|
||||
|
||||
### Vertical Scaling
|
||||
```yaml
|
||||
# Update resource limits
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "2"
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
### Metrics Collection
|
||||
- DCGM Exporter for GPU metrics
|
||||
- Prometheus for cluster metrics
|
||||
- KServe metrics for inference performance
|
||||
|
||||
### Logging
|
||||
- Structured logging for all components
|
||||
- Centralized logging with Loki/ELK
|
||||
- Log retention policies
|
||||
|
||||
### Alerting
|
||||
- GPU utilization alerts
|
||||
- Model health alerts
|
||||
- Resource exhaustion alerts
|
||||
|
||||
## Backup and Recovery
|
||||
|
||||
### GitOps Backup
|
||||
- All configuration is in git
|
||||
- Easy rollback with git revert
|
||||
- Branch-based testing
|
||||
|
||||
### Data Backup
|
||||
- Model storage backup
|
||||
- Configuration backup
|
||||
- Disaster recovery plan
|
||||
|
||||
## Contributing
|
||||
|
||||
1. Fork the repository
|
||||
2. Create a feature branch
|
||||
3. Make your changes
|
||||
4. Test thoroughly
|
||||
5. Submit a pull request
|
||||
|
||||
## Support
|
||||
|
||||
For issues and questions:
|
||||
- Check the troubleshooting section
|
||||
- Review component documentation
|
||||
- Check FluxCD, KServe, and GPU Operator docs
|
||||
- Open an issue in the repository
|
||||
96
bootstrap.sh
Executable file
96
bootstrap.sh
Executable file
@@ -0,0 +1,96 @@
|
||||
#!/bin/bash
#
# Bootstrap FluxCD onto the current kubectl context and point it at the
# GitOps repository.
#
# Environment variables (all optional, with defaults below):
#   GITEA_URL     - git URL of the GitOps repo
#   GITEA_BRANCH  - branch to sync from
#   CLUSTER_NAME  - directory under clusters/ to reconcile
#   NAMESPACE     - namespace to install Flux into
#
# Exits non-zero if kubectl/flux are missing or the cluster is unreachable.

# Strict mode: fail fast on errors, unset variables, and pipeline failures.
set -euo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Configuration (env-overridable with sane defaults)
GITEA_URL="${GITEA_URL:-ssh://git@gitea.example.com/edge-gitops/edge-gitops.git}"
GITEA_BRANCH="${GITEA_BRANCH:-main}"
CLUSTER_NAME="${CLUSTER_NAME:-k3s-dgx}"
NAMESPACE="${NAMESPACE:-flux-system}"

echo -e "${GREEN}=== Edge GitOps Bootstrap Script ===${NC}"
echo ""

# Check prerequisites
echo -e "${YELLOW}Checking prerequisites...${NC}"

if ! command -v kubectl &> /dev/null; then
    echo -e "${RED}kubectl is not installed${NC}"
    exit 1
fi

if ! command -v flux &> /dev/null; then
    echo -e "${RED}flux is not installed${NC}"
    echo "Install from: https://fluxcd.io/flux/installation/"
    exit 1
fi

if ! kubectl cluster-info &> /dev/null; then
    echo -e "${RED}Cannot connect to kubernetes cluster${NC}"
    exit 1
fi

echo -e "${GREEN}✓ Prerequisites met${NC}"
echo ""

# If Flux is already present, offer a clean reinstall instead of layering
# a second bootstrap on top of it.
if kubectl get namespace flux-system &> /dev/null; then
    echo -e "${YELLOW}FluxCD is already installed${NC}"
    read -p "Do you want to reinstall? (y/n) " -n 1 -r
    echo
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
        echo "Exiting..."
        exit 0
    fi
    echo -e "${YELLOW}Uninstalling existing FluxCD...${NC}"
    # Best-effort: a partially-removed install should not abort the script.
    flux uninstall --namespace=flux-system --silent || true
fi

# Bootstrap FluxCD
echo -e "${YELLOW}Bootstrapping FluxCD...${NC}"
flux bootstrap git \
    --url="$GITEA_URL" \
    --branch="$GITEA_BRANCH" \
    --path="clusters/$CLUSTER_NAME" \
    --namespace="$NAMESPACE" \
    --components=source-controller,kustomize-controller,helm-controller,notification-controller \
    --timeout=10m

echo -e "${GREEN}✓ FluxCD bootstrapped${NC}"
echo ""

# Wait for all four controllers to report Ready before declaring success.
echo -e "${YELLOW}Waiting for FluxCD components to be ready...${NC}"
kubectl wait --for=condition=ready --timeout=300s \
    -n "$NAMESPACE" \
    deployment/source-controller \
    deployment/kustomize-controller \
    deployment/helm-controller \
    deployment/notification-controller

echo -e "${GREEN}✓ FluxCD components ready${NC}"
echo ""

# Verify installation
echo -e "${YELLOW}Verifying installation...${NC}"
flux check

echo ""
echo -e "${GREEN}=== Bootstrap Complete ===${NC}"
echo ""
echo "Next steps:"
echo "1. Update the Gitea URL in clusters/k3s-dgx/flux-system/gotk-sync.yaml"
echo "2. Commit and push the changes to your repository"
echo "3. Monitor the sync: flux get all --all-namespaces"
echo ""
echo "Useful commands:"
echo "  flux get all --all-namespaces  # Show all Flux resources"
echo "  flux logs                       # Show Flux logs"
echo "  flux reconcile kustomization flux-system --with-source  # Force sync"
echo ""
|
||||
22
clusters/k3s-dgx/apps/huihui-granite-inference.yaml
Normal file
22
clusters/k3s-dgx/apps/huihui-granite-inference.yaml
Normal file
@@ -0,0 +1,22 @@
|
||||
# KServe InferenceService serving the Huihui-granite model from Hugging Face
# on a single NVIDIA GPU in the kserve namespace.
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: huihui-granite
  namespace: kserve
spec:
  predictor:
    model:
      modelFormat:
        name: huggingface   # handled by the huggingface serving runtime
      args:
        - --model_name=huihui-granite   # model name exposed on the predict endpoint
      # NOTE(review): hf:// storage URIs require a KServe version with
      # Hugging Face storage support — confirm against the installed release.
      storageUri: "hf://huihui-ai/Huihui-granite-4.1-30b-abliterated"
      resources:
        # NOTE(review): 8-16Gi memory looks tight for a 30B-parameter model —
        # verify against the actual model footprint before relying on this.
        limits:
          cpu: "4"
          memory: 16Gi
          nvidia.com/gpu: "1"
        requests:
          cpu: "2"
          memory: 8Gi
          nvidia.com/gpu: "1"   # GPU request matches limit, as required for extended resources
|
||||
5
clusters/k3s-dgx/apps/kustomization.yaml
Normal file
5
clusters/k3s-dgx/apps/kustomization.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
namespace: kserve
|
||||
resources:
|
||||
- huihui-granite-inference.yaml
|
||||
50
clusters/k3s-dgx/flux-system/gotk-components.yaml
Normal file
50
clusters/k3s-dgx/flux-system/gotk-components.yaml
Normal file
@@ -0,0 +1,50 @@
|
||||
# FluxCD supporting objects: the flux-system namespace, the helm-controller
# service account with its RBAC binding, and the Helm repositories referenced
# by HelmReleases elsewhere in this repository.
apiVersion: v1
kind: Namespace
metadata:
  name: flux-system
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: helm-controller
  namespace: flux-system
---
# NOTE(review): this grants cluster-admin to the helm-controller service
# account — far broader than least privilege. Consider a scoped ClusterRole
# limited to the resources Helm releases actually manage.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: cluster-reconciler
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
  - kind: ServiceAccount
    name: helm-controller
    namespace: flux-system
---
# Helm chart sources, polled every 30 minutes.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: bitnami
  namespace: flux-system
spec:
  interval: 30m
  url: https://charts.bitnami.com/bitnami
---
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: kserve
  namespace: flux-system
spec:
  interval: 30m
  url: https://kserve.github.io/kserve
---
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: nvidia
  namespace: flux-system
spec:
  interval: 30m
  url: https://nvidia.github.io/k8s-device-plugin
|
||||
34
clusters/k3s-dgx/flux-system/gotk-sync.yaml
Normal file
34
clusters/k3s-dgx/flux-system/gotk-sync.yaml
Normal file
@@ -0,0 +1,34 @@
|
||||
# FluxCD sync configuration: the GitRepository pointing at the GitOps repo,
# and the root Kustomization that applies ./clusters/k3s-dgx from it.
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: flux-system
  namespace: flux-system
spec:
  interval: 1m0s        # how often to poll the repo for new commits
  ref:
    branch: main
  secretRef:
    name: flux-system   # SSH deploy-key secret (created at bootstrap time)
  url: ssh://git@gitea.example.com/edge-gitops/edge-gitops.git
---
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: flux-system
  namespace: flux-system
spec:
  interval: 10m0s       # full re-reconcile cadence
  path: ./clusters/k3s-dgx
  prune: true           # delete cluster objects that are removed from git
  sourceRef:
    kind: GitRepository
    name: flux-system
  # Reconciliation is reported healthy only once these controller
  # deployments are ready.
  healthChecks:
    - apiVersion: apps/v1
      kind: Deployment
      name: kustomize-controller
      namespace: flux-system
    - apiVersion: apps/v1
      kind: Deployment
      name: helm-controller
      namespace: flux-system
|
||||
13
clusters/k3s-dgx/flux-system/kustomization.yaml
Normal file
13
clusters/k3s-dgx/flux-system/kustomization.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- gotk-components.yaml
|
||||
- gotk-sync.yaml
|
||||
patches:
|
||||
- patch: |
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/args/-
|
||||
value: --concurrency=20
|
||||
target:
|
||||
kind: Deployment
|
||||
name: "(kustomize-controller|helm-controller|notification-controller|source-controller)"
|
||||
13
clusters/k3s-dgx/gpu-support/gpu-node-labels.yaml
Normal file
13
clusters/k3s-dgx/gpu-support/gpu-node-labels.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: gpu-node-labels
|
||||
namespace: gpu-operator
|
||||
data:
|
||||
labels.yaml: |
|
||||
- key: accelerator
|
||||
value: nvidia-tesla
|
||||
- key: nvidia.com/gpu.present
|
||||
value: "true"
|
||||
- key: topology.kubernetes.io/zone
|
||||
value: "dgx-spark"
|
||||
70
clusters/k3s-dgx/gpu-support/gpu-operator-helmrelease.yaml
Normal file
70
clusters/k3s-dgx/gpu-support/gpu-operator-helmrelease.yaml
Normal file
@@ -0,0 +1,70 @@
|
||||
# NVIDIA GPU Operator: Helm repository plus HelmRelease, installed into the
# gpu-operator namespace with k3s-specific containerd paths.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: nvidia
  namespace: gpu-operator
spec:
  interval: 10m
  url: https://nvidia.github.io/gpu-operator
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: gpu-operator
  namespace: gpu-operator
spec:
  interval: 10m
  chart:
    spec:
      chart: gpu-operator
      version: "v23.9.1"
      sourceRef:
        kind: HelmRepository
        name: nvidia
        namespace: gpu-operator
  values:
    driver:
      enabled: true
      image: "nvcr.io/nvidia/driver"
      version: "535.129.03"
    operator:
      # NOTE(review): the chart's defaultRuntime typically takes values like
      # "containerd"/"crio"/"docker" — confirm "nvidia-container-runtime" is
      # accepted by this chart version.
      defaultRuntime: nvidia-container-runtime
    toolkit:
      enabled: true
      # NOTE(review): this is the device-plugin image, not the NVIDIA
      # container toolkit image — looks like a copy/paste slip; verify
      # against the chart's default toolkit image.
      image: "nvcr.io/nvidia/k8s-device-plugin"
      version: "v0.14.0"
      env:
        # k3s keeps containerd config and socket in non-default locations.
        - name: CONTAINERD_CONFIG
          value: /var/lib/rancher/k3s/agent/etc/containerd/config.toml
        - name: CONTAINERD_SOCKET
          value: /run/k3s/containerd/containerd.sock
        - name: CONTAINERD_RUNTIME_CLASS
          value: nvidia
    devicePlugin:
      enabled: true
      image: "nvcr.io/nvidia/k8s-device-plugin"
      version: "v0.14.0"
      resources:
        requests:
          cpu: 100m
          memory: 100Mi
        limits:
          cpu: 500m
          memory: 500Mi
    dcgm:
      enabled: true
      image: "nvcr.io/nvidia/dcgm-exporter"
      version: "3.3.3-3.1.0-ubuntu22.04"
      resources:
        requests:
          cpu: 100m
          memory: 100Mi
        limits:
          cpu: 500m
          memory: 500Mi
    migManager:
      enabled: true
    gfd:
      enabled: true
    node-feature-discovery:
      enabled: true
|
||||
6
clusters/k3s-dgx/gpu-support/gpu-operator-namespace.yaml
Normal file
6
clusters/k3s-dgx/gpu-support/gpu-operator-namespace.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: gpu-operator
|
||||
labels:
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
7
clusters/k3s-dgx/gpu-support/kustomization.yaml
Normal file
7
clusters/k3s-dgx/gpu-support/kustomization.yaml
Normal file
@@ -0,0 +1,7 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
namespace: gpu-operator
|
||||
resources:
|
||||
- gpu-operator-namespace.yaml
|
||||
- gpu-operator-helmrelease.yaml
|
||||
- gpu-node-labels.yaml
|
||||
39
clusters/k3s-dgx/gpu-support/nvidia-device-plugin.yaml
Normal file
39
clusters/k3s-dgx/gpu-support/nvidia-device-plugin.yaml
Normal file
@@ -0,0 +1,39 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: gpu-operator
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: nvidia-device-plugin-daemonset
|
||||
namespace: gpu-operator
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
name: nvidia-device-plugin-ds
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
name: nvidia-device-plugin-ds
|
||||
spec:
|
||||
tolerations:
|
||||
- key: nvidia.com/gpu
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
containers:
|
||||
- image: nvcr.io/nvidia/k8s-device-plugin:v0.14.0
|
||||
name: nvidia-device-plugin-ctr
|
||||
args: ["--fail-on-init-error=false"]
|
||||
env:
|
||||
- name: NVIDIA_VISIBLE_DEVICES
|
||||
value: "all"
|
||||
- name: MIG_STRATEGY
|
||||
value: "single"
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/device-plugins
|
||||
16
clusters/k3s-dgx/gpu-support/runtime-class.yaml
Normal file
16
clusters/k3s-dgx/gpu-support/runtime-class.yaml
Normal file
@@ -0,0 +1,16 @@
|
||||
apiVersion: node.k8s.io/v1
|
||||
kind: RuntimeClass
|
||||
metadata:
|
||||
name: nvidia
|
||||
handler: nvidia
|
||||
overhead:
|
||||
podFixed:
|
||||
memory: "1Gi"
|
||||
cpu: "500m"
|
||||
scheduling:
|
||||
nodeSelector:
|
||||
nvidia.com/gpu.present: "true"
|
||||
tolerations:
|
||||
- key: nvidia.com/gpu
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
42
clusters/k3s-dgx/kserve/gpu-serving-runtime.yaml
Normal file
42
clusters/k3s-dgx/kserve/gpu-serving-runtime.yaml
Normal file
@@ -0,0 +1,42 @@
|
||||
apiVersion: serving.kserve.io/v1beta1
|
||||
kind: ServingRuntime
|
||||
metadata:
|
||||
name: kserve-gpu-runtime
|
||||
namespace: kserve
|
||||
spec:
|
||||
supportedModelFormats:
|
||||
- name: tensorflow
|
||||
version: "2"
|
||||
autoSelect: true
|
||||
- name: pytorch
|
||||
version: "2"
|
||||
autoSelect: true
|
||||
- name: sklearn
|
||||
version: "1"
|
||||
autoSelect: true
|
||||
- name: xgboost
|
||||
version: "1"
|
||||
autoSelect: true
|
||||
protocol: v1
|
||||
protocolVersions:
|
||||
- v1
|
||||
- v2
|
||||
containers:
|
||||
- name: kserve-container
|
||||
image: kserve/sklearnserver-gpu:latest
|
||||
resources:
|
||||
requests:
|
||||
cpu: "1"
|
||||
memory: "2Gi"
|
||||
nvidia.com/gpu: "1"
|
||||
limits:
|
||||
cpu: "4"
|
||||
memory: "8Gi"
|
||||
nvidia.com/gpu: "1"
|
||||
env:
|
||||
- name: NVIDIA_VISIBLE_DEVICES
|
||||
value: "all"
|
||||
- name: NVIDIA_DRIVER_CAPABILITIES
|
||||
value: "compute,utility"
|
||||
multiModel: false
|
||||
disabled: false
|
||||
35
clusters/k3s-dgx/kserve/istio-gateway.yaml
Normal file
35
clusters/k3s-dgx/kserve/istio-gateway.yaml
Normal file
@@ -0,0 +1,35 @@
|
||||
apiVersion: networking.istio.io/v1beta1
|
||||
kind: Gateway
|
||||
metadata:
|
||||
name: kserve-gateway
|
||||
namespace: kserve
|
||||
spec:
|
||||
selector:
|
||||
istio: ingressgateway
|
||||
servers:
|
||||
- port:
|
||||
number: 80
|
||||
name: http
|
||||
protocol: HTTP
|
||||
hosts:
|
||||
- "*"
|
||||
---
|
||||
apiVersion: networking.istio.io/v1beta1
|
||||
kind: VirtualService
|
||||
metadata:
|
||||
name: kserve-vs
|
||||
namespace: kserve
|
||||
spec:
|
||||
hosts:
|
||||
- "*"
|
||||
gateways:
|
||||
- kserve-gateway
|
||||
http:
|
||||
- match:
|
||||
- uri:
|
||||
prefix: /v1/models/
|
||||
route:
|
||||
- destination:
|
||||
host: kserve-default
|
||||
port:
|
||||
number: 80
|
||||
43
clusters/k3s-dgx/kserve/kserve-controller.yaml
Normal file
43
clusters/k3s-dgx/kserve/kserve-controller.yaml
Normal file
@@ -0,0 +1,43 @@
|
||||
apiVersion: source.toolkit.fluxcd.io/v1
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: kserve
|
||||
namespace: kserve
|
||||
spec:
|
||||
interval: 10m
|
||||
url: https://kserve.github.io/kserve
|
||||
---
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: kserve
|
||||
namespace: kserve
|
||||
spec:
|
||||
interval: 10m
|
||||
chart:
|
||||
spec:
|
||||
chart: kserve
|
||||
version: "v0.12.0"
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: kserve
|
||||
namespace: kserve
|
||||
values:
|
||||
controller:
|
||||
resources:
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
cpu: 2
|
||||
memory: 2Gi
|
||||
config:
|
||||
ingress:
|
||||
className: istio
|
||||
storage:
|
||||
initialCapacity: 10Gi
|
||||
storageClassName: local-path
|
||||
knative:
|
||||
enabled: true
|
||||
istio:
|
||||
enabled: true
|
||||
7
clusters/k3s-dgx/kserve/kserve-namespace.yaml
Normal file
7
clusters/k3s-dgx/kserve/kserve-namespace.yaml
Normal file
@@ -0,0 +1,7 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: kserve
|
||||
labels:
|
||||
istio-injection: enabled
|
||||
serving.kserve.io/serving-runtime: "true"
|
||||
10
clusters/k3s-dgx/kserve/kustomization.yaml
Normal file
10
clusters/k3s-dgx/kserve/kustomization.yaml
Normal file
@@ -0,0 +1,10 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
namespace: kserve
|
||||
resources:
|
||||
- kserve-namespace.yaml
|
||||
- kserve-controller.yaml
|
||||
- istio-gateway.yaml
|
||||
# - gpu-serving-runtime.yaml
|
||||
# - model-storage-pvc.yaml
|
||||
# - storage-config.yaml
|
||||
12
clusters/k3s-dgx/kserve/model-storage-pvc.yaml
Normal file
12
clusters/k3s-dgx/kserve/model-storage-pvc.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: model-storage-pvc
|
||||
namespace: kserve
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 50Gi
|
||||
storageClassName: local-path
|
||||
20
clusters/k3s-dgx/kserve/storage-config.yaml
Normal file
20
clusters/k3s-dgx/kserve/storage-config.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: storage-config
|
||||
namespace: kserve
|
||||
data:
|
||||
storageConfig.yaml: |
|
||||
defaultStorageUri: "pvc://model-storage"
|
||||
storageSpec:
|
||||
- name: model-storage
|
||||
type: pvc
|
||||
pvcSpec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 50Gi
|
||||
storageClassName: local-path
|
||||
modelCacheSize: 10Gi
|
||||
modelCacheMemory: 2Gi
|
||||
7
clusters/k3s-dgx/kustomization.yaml
Normal file
7
clusters/k3s-dgx/kustomization.yaml
Normal file
@@ -0,0 +1,7 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- flux-system
|
||||
- gpu-support
|
||||
- kserve
|
||||
- apps
|
||||
88
configure.sh
Executable file
88
configure.sh
Executable file
@@ -0,0 +1,88 @@
|
||||
#!/bin/bash
#
# Configuration script for Edge GitOps.
#
# Interactively collects the Gitea URL, branch and cluster name, rewrites
# the cluster's gotk-sync.yaml accordingly, and writes a .env file that
# bootstrap.sh reads.

# Strict mode: fail on errors, unset variables, and pipeline failures.
set -euo pipefail

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

echo -e "${BLUE}=== Edge GitOps Configuration ===${NC}"
echo ""

# update_config FILE PATTERN REPLACEMENT
# In-place substitution of PATTERN with REPLACEMENT in FILE; warns (without
# aborting) when FILE does not exist.
# NOTE(review): uses GNU `sed -i`; BSD/macOS sed requires `-i ''`. Values
# containing `|`, `&` or backslashes would break the sed expression.
update_config() {
    local file=$1
    local key=$2
    local value=$3

    if [ -f "$file" ]; then
        sed -i "s|$key|$value|g" "$file"
        echo -e "${GREEN}✓ Updated $file${NC}"
    else
        echo -e "${RED}✗ File not found: $file${NC}"
    fi
}

# Get Gitea URL
echo -e "${YELLOW}Enter your Gitea repository URL:${NC}"
read -p "URL (default: ssh://git@gitea.example.com/edge-gitops/edge-gitops.git): " GITEA_URL
GITEA_URL=${GITEA_URL:-ssh://git@gitea.example.com/edge-gitops/edge-gitops.git}

# Get branch name
echo -e "${YELLOW}Enter your branch name:${NC}"
read -p "Branch (default: main): " GITEA_BRANCH
GITEA_BRANCH=${GITEA_BRANCH:-main}

# Get cluster name
echo -e "${YELLOW}Enter your cluster name:${NC}"
read -p "Cluster name (default: k3s-dgx): " CLUSTER_NAME
CLUSTER_NAME=${CLUSTER_NAME:-k3s-dgx}

echo ""
echo -e "${BLUE}Configuration Summary:${NC}"
echo "  Gitea URL: $GITEA_URL"
echo "  Branch: $GITEA_BRANCH"
echo "  Cluster: $CLUSTER_NAME"
echo ""

read -p "Apply these settings? (y/n) " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    echo "Aborting..."
    exit 0
fi

# Update configuration files
echo -e "${YELLOW}Updating configuration files...${NC}"

# Fix: the original always edited clusters/k3s-dgx/... even when the user
# entered a different cluster name; honor $CLUSTER_NAME here. The default
# (k3s-dgx) preserves the previous behavior.
sync_file="clusters/$CLUSTER_NAME/flux-system/gotk-sync.yaml"

update_config "$sync_file" \
    "ssh://git@gitea.example.com/edge-gitops/edge-gitops.git" \
    "$GITEA_URL"

update_config "$sync_file" \
    "branch: main" \
    "branch: $GITEA_BRANCH"

# Create environment file consumed by bootstrap.sh
cat > .env << EOF
GITEA_URL=$GITEA_URL
GITEA_BRANCH=$GITEA_BRANCH
CLUSTER_NAME=$CLUSTER_NAME
EOF

echo -e "${GREEN}✓ Created .env file${NC}"

echo ""
echo -e "${GREEN}=== Configuration Complete ===${NC}"
echo ""
echo "Next steps:"
echo "1. Review the changes: git diff"
echo "2. Commit the changes: git add . && git commit -m 'Configure GitOps settings'"
echo "3. Push to repository: git push origin $GITEA_BRANCH"
echo "4. Run bootstrap: ./bootstrap.sh"
echo ""
|
||||
Reference in New Issue
Block a user