init commit

commit 06f52750ac (2026-05-05 11:15:49 -05:00)
24 changed files with 1158 additions and 0 deletions

Makefile (new file, 61 lines)

@@ -0,0 +1,61 @@
.PHONY: help bootstrap sync status logs clean test apply diff restart gpu-status kserve-status model-logs secrets

help: ## Show this help message
	@echo 'Usage: make [target]'
	@echo ''
	@echo 'Available targets:'
	@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "  %-20s %s\n", $$1, $$2}' $(MAKEFILE_LIST)

bootstrap: ## Bootstrap FluxCD on the cluster
	@./bootstrap.sh

sync: ## Force sync all FluxCD resources
	@flux reconcile kustomization flux-system --with-source
	@flux reconcile helmrelease gpu-operator -n gpu-operator
	@flux reconcile helmrelease kserve -n kserve

status: ## Show status of all FluxCD resources
	@flux get all --all-namespaces

logs: ## Show FluxCD logs
	@flux logs --all-namespaces

clean: ## Remove FluxCD from cluster
	@flux uninstall --namespace=flux-system --silent

test: ## Test the cluster connectivity
	@echo "Testing cluster connectivity..."
	@kubectl cluster-info
	@echo ""
	@echo "Testing GPU availability..."
	@kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'
	@echo ""
	@echo "Testing FluxCD..."
	@flux check

apply: ## Apply all manifests directly (for testing)
	@kubectl apply -k clusters/k3s-dgx

diff: ## Show diff between cluster and git
	@flux diff kustomization flux-system --path ./clusters/k3s-dgx

restart: ## Restart all FluxCD controllers
	@kubectl rollout restart deployment/source-controller -n flux-system
	@kubectl rollout restart deployment/kustomize-controller -n flux-system
	@kubectl rollout restart deployment/helm-controller -n flux-system
	@kubectl rollout restart deployment/notification-controller -n flux-system

gpu-status: ## Show GPU status
	@kubectl describe nodes | grep -A 5 "nvidia.com/gpu"

kserve-status: ## Show KServe status
	@kubectl get inferenceservices -n kserve
	@kubectl get pods -n kserve

model-logs: ## Show model inference logs
	@kubectl logs -n kserve -l serving.kserve.io/inferenceservice=huihui-granite --tail=100 -f

secrets: ## Generate example secrets (DO NOT COMMIT)
	@echo "Creating example secrets directory..."
	@mkdir -p secrets
	@echo "# Add your secrets here" > secrets/README.md
	@echo "# DO NOT commit actual secrets to git" >> secrets/README.md

README.md (new file, 165 lines)

@@ -0,0 +1,165 @@
# Edge GitOps - KServe on k3s with GPU
GitOps setup for deploying ML models using KServe on a k3s cluster with GPU support (DGX Spark).
## Prerequisites
- k3s cluster with GPU support
- kubectl configured to access the cluster
- Gitea instance for GitOps repository
- FluxCD CLI installed
## Architecture
```
edge-gitops/
├── clusters/
│   └── k3s-dgx/
│       ├── flux-system/     # FluxCD installation
│       ├── gpu-support/     # NVIDIA GPU Operator
│       ├── kserve/          # KServe installation
│       └── apps/            # ML model deployments
├── apps/                    # Reusable app manifests
└── infrastructure/          # Base infrastructure
```
## Setup Instructions
### 1. Bootstrap FluxCD
```bash
flux bootstrap git \
  --url=ssh://git@gitea.example.com/edge-gitops/edge-gitops.git \
  --branch=main \
  --path=clusters/k3s-dgx \
  --components=source-controller,kustomize-controller,helm-controller,notification-controller
```
### 2. Configure Gitea SSH Key
Generate SSH key for FluxCD:
```bash
ssh-keygen -t ed25519 -N "" -f flux-gitea-key
```
Add the public key to your Gitea repository as a deploy key.
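With the deploy key in place on the Gitea side, the private half can be handed to Flux as the Git credential it clones with. A sketch, assuming the example URL from this guide and the key file generated above; `flux create secret git` can also generate the key pair itself:

```bash
# Sketch: store the pre-generated deploy key as the SSH secret Flux uses to clone.
# The URL is this guide's placeholder; substitute your Gitea host.
flux create secret git flux-system \
  --url=ssh://git@gitea.example.com/edge-gitops/edge-gitops.git \
  --private-key-file=./flux-gitea-key \
  --namespace=flux-system
```

This matches the `secretRef: flux-system` referenced by the GitRepository in `gotk-sync.yaml`.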
### 3. Update Repository Configuration
Edit `clusters/k3s-dgx/flux-system/gotk-sync.yaml` to match your Gitea URL:
```yaml
url: ssh://git@your-gitea-instance.com/edge-gitops/edge-gitops.git
```
### 4. Deploy the Stack
Commit and push the changes:
```bash
git add .
git commit -m "Initial GitOps setup for KServe on k3s"
git push origin main
```
FluxCD will automatically sync the changes to your cluster.
## Components
### GPU Support
- NVIDIA GPU Operator (v23.9.1)
- NVIDIA Device Plugin
- DCGM Exporter for monitoring
- GPU Node Feature Discovery
### KServe
- KServe Core (v0.12.0)
- GPU-enabled Serving Runtime
- Istio Gateway for networking
- Model Storage (PVC)
### Example Model
- Huihui-granite-4.1-30b-abliterated (Hugging Face)
- GPU-accelerated inference
- REST API endpoint
## Usage
### Deploy a New Model
1. Create a new InferenceService in `clusters/k3s-dgx/apps/`:
```yaml
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: your-model
  namespace: kserve
spec:
  predictor:
    model:
      modelFormat:
        name: huggingface
      storageUri: "hf://your-org/your-model"
      resources:
        limits:
          nvidia.com/gpu: "1"
```
2. Commit and push changes
### Test the Model
```bash
# Get the service URL
kubectl get inferenceservice huihui-granite -n kserve
# Test inference
curl -X POST http://your-service-url/v1/models/huihui-granite:predict \
  -H "Content-Type: application/json" \
  -d '{"inputs": [{"name": "text", "shape": [1], "datatype": "BYTES", "data": ["Hello world"]}]}'
```
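When scripting tests, the request body above is easier to manage through a small helper than inline quoting. A sketch; `predict_payload` is a hypothetical name, and the endpoint follows the curl example above:

```bash
# Sketch: build the KServe v1 predict payload for a single text input.
predict_payload() {
  printf '{"inputs": [{"name": "text", "shape": [1], "datatype": "BYTES", "data": ["%s"]}]}' "$1"
}

# e.g. curl -X POST "$SERVICE_URL/v1/models/huihui-granite:predict" \
#        -H "Content-Type: application/json" -d "$(predict_payload 'Hello world')"
predict_payload "Hello world"
```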
## Monitoring
Check FluxCD status:
```bash
flux get all --all-namespaces
```
Check GPU status:
```bash
kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'
```
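The jsonpath query prints one allocatable count per GPU node, separated by spaces; summing them gives total cluster GPU capacity. A helper sketch; `total_gpus` is a hypothetical name:

```bash
# Sketch: sum per-node GPU counts as printed by the jsonpath query above.
total_gpus() {
  local total=0 n
  for n in "$@"; do total=$((total + n)); done
  echo "$total"
}

# e.g. total_gpus $(kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}')
total_gpus 1 1
```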
Check KServe services:
```bash
kubectl get inferenceservices -n kserve
```
## Troubleshooting
### GPU Not Available
```bash
kubectl describe node | grep -A 5 nvidia.com/gpu
```
### KServe Pods Not Starting
```bash
kubectl logs -n kserve deployment/kserve-controller-manager
kubectl get pods -n kserve
```
### FluxCD Sync Issues
```bash
flux reconcile kustomization flux-system --with-source
flux logs
```
## Customization
### GPU Resources
Edit `clusters/k3s-dgx/apps/huihui-granite-inference.yaml` to adjust GPU allocation.
### Storage
Modify `clusters/k3s-dgx/kserve/model-storage-pvc.yaml` for different storage requirements.
### Networking
Update `clusters/k3s-dgx/kserve/istio-gateway.yaml` for custom ingress configuration.

SETUP.md (new file, 297 lines)

@@ -0,0 +1,297 @@
# Edge GitOps - Complete Setup Guide
## Quick Start
1. **Configure the setup:**
```bash
./configure.sh
```
2. **Bootstrap FluxCD:**
```bash
make bootstrap
```
3. **Monitor the deployment:**
```bash
make status
```
## Directory Structure
```
edge-gitops/
├── bootstrap.sh                 # FluxCD bootstrap script
├── configure.sh                 # Configuration wizard
├── Makefile                     # Convenient commands
├── README.md                    # Main documentation
├── .gitignore                   # Git ignore rules
├── .env                         # Environment variables (not committed)
├── clusters/
│   └── k3s-dgx/                 # Cluster-specific configuration
│       ├── kustomization.yaml
│       │
│       ├── flux-system/         # FluxCD installation
│       │   ├── kustomization.yaml
│       │   ├── gotk-components.yaml
│       │   └── gotk-sync.yaml
│       │
│       ├── gpu-support/         # NVIDIA GPU Operator
│       │   ├── kustomization.yaml
│       │   ├── gpu-operator-namespace.yaml
│       │   ├── gpu-operator-helmrelease.yaml
│       │   └── gpu-node-labels.yaml
│       │
│       ├── kserve/              # KServe installation
│       │   ├── kustomization.yaml
│       │   ├── kserve-namespace.yaml
│       │   ├── kserve-crds.yaml
│       │   ├── kserve-controller.yaml
│       │   ├── istio-gateway.yaml
│       │   ├── gpu-serving-runtime.yaml
│       │   ├── model-storage-pvc.yaml
│       │   └── storage-config.yaml
│       │
│       └── apps/                # ML model deployments
│           ├── kustomization.yaml
│           └── huihui-granite-inference.yaml
├── apps/                        # Reusable application manifests
└── infrastructure/              # Base infrastructure components
```
## Component Details
### FluxCD
- **Version:** Latest stable
- **Components:** source-controller, kustomize-controller, helm-controller, notification-controller
- **Sync Interval:** 1 minute for GitRepository, 10 minutes for Kustomization
- **Repository:** Gitea (configurable)
### NVIDIA GPU Operator
- **Version:** v23.9.1
- **Driver:** 535.129.03
- **Components:**
- NVIDIA Driver
- Device Plugin
- DCGM Exporter
- MIG Manager
- Node Feature Discovery
### KServe
- **Version:** v0.12.0
- **Components:**
- KServe Controller
- Custom Resource Definitions
- GPU Serving Runtime
- Istio Integration
- Model Storage (50Gi PVC)
### Example Model
- **Name:** Huihui-granite-4.1-30b-abliterated
- **Source:** Hugging Face
- **Resources:**
- CPU: 2-4 cores
- Memory: 8-16Gi
- GPU: 1 NVIDIA GPU
## Common Tasks
### Add a New Model
1. Create a new InferenceService:
```bash
cat > clusters/k3s-dgx/apps/your-model.yaml << EOF
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: your-model
  namespace: kserve
spec:
  predictor:
    model:
      modelFormat:
        name: huggingface
      storageUri: "hf://your-org/your-model"
      resources:
        limits:
          nvidia.com/gpu: "1"
EOF
```
2. Update the kustomization:
```bash
echo "  - your-model.yaml" >> clusters/k3s-dgx/apps/kustomization.yaml
```
3. Commit and push:
```bash
git add clusters/k3s-dgx/apps/
git commit -m "Add new model deployment"
git push
```
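After step 2, `clusters/k3s-dgx/apps/kustomization.yaml` should list both models; this is the expected result, matching the file shipped in this commit:

```yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kserve
resources:
  - huihui-granite-inference.yaml
  - your-model.yaml
```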
### Update GPU Resources
Edit the resource limits in your InferenceService:
```yaml
resources:
  limits:
    cpu: "8"
    memory: "32Gi"
    nvidia.com/gpu: "2"
  requests:
    cpu: "4"
    memory: "16Gi"
    nvidia.com/gpu: "2"
```
### Monitor Model Performance
```bash
# Get model endpoint
kubectl get inferenceservice your-model -n kserve
# View logs
kubectl logs -n kserve -l serving.kserve.io/inferenceservice=your-model -f
# Check GPU usage
kubectl exec -n kserve <pod-name> -- nvidia-smi
```
## Troubleshooting
### FluxCD Not Syncing
```bash
# Check FluxCD status
flux check
# View logs
flux logs
# Force sync
flux reconcile kustomization flux-system --with-source
```
### GPU Not Available
```bash
# Check GPU nodes
kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'
# Check GPU operator
kubectl get pods -n gpu-operator
# View GPU operator logs
kubectl logs -n gpu-operator deployment/gpu-operator
```
### KServe Issues
```bash
# Check KServe pods
kubectl get pods -n kserve
# Check KServe controller
kubectl logs -n kserve deployment/kserve-controller-manager
# Describe InferenceService
kubectl describe inferenceservice your-model -n kserve
```
## Security Considerations
1. **Secrets Management:**
- Never commit secrets to git
- Use Kubernetes secrets for sensitive data
- Consider using Sealed Secrets or External Secrets Operator
2. **Network Policies:**
- Review and restrict network access
- Use Istio for service mesh security
3. **RBAC:**
- Review FluxCD service account permissions
- Implement principle of least privilege
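As a concrete example, credentials such as a model-registry token can be created directly in the cluster rather than committed. A sketch; the secret and key names (`hf-token`, `HF_TOKEN`) are illustrative, and the `--dry-run=client -o yaml` form lets you inspect the generated manifest without applying it:

```bash
# Sketch: create a secret out-of-band so nothing sensitive touches the repository.
kubectl create secret generic hf-token \
  --namespace kserve \
  --from-literal=HF_TOKEN=changeme \
  --dry-run=client -o yaml
```

Drop `--dry-run=client -o yaml` to apply it for real, or pipe the output into a sealing tool such as Sealed Secrets.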
## Performance Optimization
### GPU Optimization
- Use appropriate GPU resource requests/limits
- Monitor GPU utilization with DCGM Exporter
- Consider MIG (Multi-Instance GPU) for better isolation
### Storage Optimization
- Use fast storage for model cache
- Consider using ReadWriteMany for multi-pod access
- Implement model caching strategies
### Network Optimization
- Use Istio for efficient load balancing
- Configure appropriate timeouts for large models
- Consider using gRPC for internal communication
## Scaling
### Horizontal Scaling
```yaml
# Add to the InferenceService predictor (KServe scales between these bounds)
spec:
  predictor:
    minReplicas: 1
    maxReplicas: 3
```
### Vertical Scaling
```yaml
# Update resource limits
resources:
  limits:
    nvidia.com/gpu: "2"
```
## Monitoring
### Metrics Collection
- DCGM Exporter for GPU metrics
- Prometheus for cluster metrics
- KServe metrics for inference performance
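DCGM Exporter serves Prometheus text metrics on port 9400; after a `kubectl port-forward` you can scrape `/metrics` and pull out a sample's value. A helper sketch; `metric_value` is a hypothetical name, and `DCGM_FI_DEV_GPU_UTIL` is the exporter's GPU-utilization metric:

```bash
# Sketch: extract the numeric value from a Prometheus sample line.
metric_value() {
  printf '%s\n' "$1" | awk '{print $NF}'
}

# e.g. kubectl port-forward -n gpu-operator svc/dcgm-exporter 9400 &
#      curl -s localhost:9400/metrics | grep '^DCGM_FI_DEV_GPU_UTIL'
metric_value 'DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-abc"} 83'
```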
### Logging
- Structured logging for all components
- Centralized logging with Loki/ELK
- Log retention policies
### Alerting
- GPU utilization alerts
- Model health alerts
- Resource exhaustion alerts
## Backup and Recovery
### GitOps Backup
- All configuration is in git
- Easy rollback with git revert
- Branch-based testing
### Data Backup
- Model storage backup
- Configuration backup
- Disaster recovery plan
## Contributing
1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Test thoroughly
5. Submit a pull request
## Support
For issues and questions:
- Check the troubleshooting section
- Review component documentation
- Check FluxCD, KServe, and GPU Operator docs
- Open an issue in the repository

bootstrap.sh (new executable file, 96 lines)

@@ -0,0 +1,96 @@
#!/bin/bash
set -e

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Configuration
GITEA_URL="${GITEA_URL:-ssh://git@gitea.example.com/edge-gitops/edge-gitops.git}"
GITEA_BRANCH="${GITEA_BRANCH:-main}"
CLUSTER_NAME="${CLUSTER_NAME:-k3s-dgx}"
NAMESPACE="${NAMESPACE:-flux-system}"

echo -e "${GREEN}=== Edge GitOps Bootstrap Script ===${NC}"
echo ""

# Check prerequisites
echo -e "${YELLOW}Checking prerequisites...${NC}"

if ! command -v kubectl &> /dev/null; then
    echo -e "${RED}kubectl is not installed${NC}"
    exit 1
fi

if ! command -v flux &> /dev/null; then
    echo -e "${RED}flux is not installed${NC}"
    echo "Install from: https://fluxcd.io/flux/installation/"
    exit 1
fi

if ! kubectl cluster-info &> /dev/null; then
    echo -e "${RED}Cannot connect to kubernetes cluster${NC}"
    exit 1
fi

echo -e "${GREEN}✓ Prerequisites met${NC}"
echo ""

# Check if Flux is already installed
if kubectl get namespace flux-system &> /dev/null; then
    echo -e "${YELLOW}FluxCD is already installed${NC}"
    read -p "Do you want to reinstall? (y/n) " -n 1 -r
    echo
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
        echo "Exiting..."
        exit 0
    fi
    echo -e "${YELLOW}Uninstalling existing FluxCD...${NC}"
    flux uninstall --namespace=flux-system --silent || true
fi

# Bootstrap FluxCD
echo -e "${YELLOW}Bootstrapping FluxCD...${NC}"
flux bootstrap git \
    --url="$GITEA_URL" \
    --branch="$GITEA_BRANCH" \
    --path="clusters/$CLUSTER_NAME" \
    --namespace="$NAMESPACE" \
    --components=source-controller,kustomize-controller,helm-controller,notification-controller \
    --timeout=10m

echo -e "${GREEN}✓ FluxCD bootstrapped${NC}"
echo ""

# Wait for FluxCD to be ready (Deployments report the Available condition)
echo -e "${YELLOW}Waiting for FluxCD components to be ready...${NC}"
kubectl wait --for=condition=available --timeout=300s \
    -n "$NAMESPACE" \
    deployment/source-controller \
    deployment/kustomize-controller \
    deployment/helm-controller \
    deployment/notification-controller

echo -e "${GREEN}✓ FluxCD components ready${NC}"
echo ""

# Verify installation
echo -e "${YELLOW}Verifying installation...${NC}"
flux check
echo ""

echo -e "${GREEN}=== Bootstrap Complete ===${NC}"
echo ""
echo "Next steps:"
echo "1. Update the Gitea URL in clusters/k3s-dgx/flux-system/gotk-sync.yaml"
echo "2. Commit and push the changes to your repository"
echo "3. Monitor the sync: flux get all --all-namespaces"
echo ""
echo "Useful commands:"
echo "  flux get all --all-namespaces  # Show all Flux resources"
echo "  flux logs                      # Show Flux logs"
echo "  flux reconcile kustomization flux-system --with-source  # Force sync"
echo ""

clusters/k3s-dgx/apps/huihui-granite-inference.yaml (new file)

@@ -0,0 +1,22 @@
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: huihui-granite
  namespace: kserve
spec:
  predictor:
    model:
      modelFormat:
        name: huggingface
      args:
        - --model_name=huihui-granite
      storageUri: "hf://huihui-ai/Huihui-granite-4.1-30b-abliterated"
      resources:
        limits:
          cpu: "4"
          memory: 16Gi
          nvidia.com/gpu: "1"
        requests:
          cpu: "2"
          memory: 8Gi
          nvidia.com/gpu: "1"

clusters/k3s-dgx/apps/kustomization.yaml (new file)

@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kserve
resources:
  - huihui-granite-inference.yaml

clusters/k3s-dgx/flux-system/gotk-components.yaml (new file)

@@ -0,0 +1,50 @@
apiVersion: v1
kind: Namespace
metadata:
  name: flux-system
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: helm-controller
  namespace: flux-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: cluster-reconciler
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
  - kind: ServiceAccount
    name: helm-controller
    namespace: flux-system
---
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: bitnami
  namespace: flux-system
spec:
  interval: 30m
  url: https://charts.bitnami.com/bitnami
---
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: kserve
  namespace: flux-system
spec:
  interval: 30m
  url: https://kserve.github.io/kserve
---
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: nvidia
  namespace: flux-system
spec:
  interval: 30m
  url: https://nvidia.github.io/k8s-device-plugin

clusters/k3s-dgx/flux-system/gotk-sync.yaml (new file)

@@ -0,0 +1,34 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: flux-system
  namespace: flux-system
spec:
  interval: 1m0s
  ref:
    branch: main
  secretRef:
    name: flux-system
  url: ssh://git@gitea.example.com/edge-gitops/edge-gitops.git
---
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: flux-system
  namespace: flux-system
spec:
  interval: 10m0s
  path: ./clusters/k3s-dgx
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
  healthChecks:
    - apiVersion: apps/v1
      kind: Deployment
      name: kustomize-controller
      namespace: flux-system
    - apiVersion: apps/v1
      kind: Deployment
      name: helm-controller
      namespace: flux-system

clusters/k3s-dgx/flux-system/kustomization.yaml (new file)

@@ -0,0 +1,13 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - gotk-components.yaml
  - gotk-sync.yaml
patches:
  - patch: |
      - op: add
        path: /spec/template/spec/containers/0/args/-
        value: --concurrency=20
    target:
      kind: Deployment
      name: "(kustomize-controller|helm-controller|notification-controller|source-controller)"

clusters/k3s-dgx/gpu-support/gpu-node-labels.yaml (new file)

@@ -0,0 +1,13 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: gpu-node-labels
  namespace: gpu-operator
data:
  labels.yaml: |
    - key: accelerator
      value: nvidia-tesla
    - key: nvidia.com/gpu.present
      value: "true"
    - key: topology.kubernetes.io/zone
      value: "dgx-spark"

clusters/k3s-dgx/gpu-support/gpu-operator-helmrelease.yaml (new file)

@@ -0,0 +1,70 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: nvidia
  namespace: gpu-operator
spec:
  interval: 10m
  url: https://nvidia.github.io/gpu-operator
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: gpu-operator
  namespace: gpu-operator
spec:
  interval: 10m
  chart:
    spec:
      chart: gpu-operator
      version: "v23.9.1"
      sourceRef:
        kind: HelmRepository
        name: nvidia
        namespace: gpu-operator
  values:
    driver:
      enabled: true
      image: "nvcr.io/nvidia/driver"
      version: "535.129.03"
    operator:
      defaultRuntime: containerd  # k3s uses containerd (see the paths below)
    toolkit:
      enabled: true
      image: "nvcr.io/nvidia/k8s-device-plugin"
      version: "v0.14.0"
      env:
        - name: CONTAINERD_CONFIG
          value: /var/lib/rancher/k3s/agent/etc/containerd/config.toml
        - name: CONTAINERD_SOCKET
          value: /run/k3s/containerd/containerd.sock
        - name: CONTAINERD_RUNTIME_CLASS
          value: nvidia
    devicePlugin:
      enabled: true
      image: "nvcr.io/nvidia/k8s-device-plugin"
      version: "v0.14.0"
      resources:
        requests:
          cpu: 100m
          memory: 100Mi
        limits:
          cpu: 500m
          memory: 500Mi
    dcgm:
      enabled: true
      image: "nvcr.io/nvidia/dcgm-exporter"
      version: "3.3.3-3.1.0-ubuntu22.04"
      resources:
        requests:
          cpu: 100m
          memory: 100Mi
        limits:
          cpu: 500m
          memory: 500Mi
    migManager:
      enabled: true
    gfd:
      enabled: true
    node-feature-discovery:
      enabled: true

clusters/k3s-dgx/gpu-support/gpu-operator-namespace.yaml (new file)

@@ -0,0 +1,6 @@
apiVersion: v1
kind: Namespace
metadata:
  name: gpu-operator
  labels:
    openshift.io/cluster-monitoring: "true"

clusters/k3s-dgx/gpu-support/kustomization.yaml (new file)

@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: gpu-operator
resources:
  - gpu-operator-namespace.yaml
  - gpu-operator-helmrelease.yaml
  - gpu-node-labels.yaml

(filename not shown in this view)

@@ -0,0 +1,39 @@
apiVersion: v1
kind: Namespace
metadata:
  name: gpu-operator
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: gpu-operator
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.0
          name: nvidia-device-plugin-ctr
          args: ["--fail-on-init-error=false"]
          env:
            - name: NVIDIA_VISIBLE_DEVICES
              value: "all"
            - name: MIG_STRATEGY
              value: "single"
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins

(filename not shown in this view)

@@ -0,0 +1,16 @@
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
overhead:
  podFixed:
    memory: "1Gi"
    cpu: "500m"
scheduling:
  nodeSelector:
    nvidia.com/gpu.present: "true"
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule

clusters/k3s-dgx/kserve/gpu-serving-runtime.yaml (new file)

@@ -0,0 +1,42 @@
apiVersion: serving.kserve.io/v1beta1
kind: ServingRuntime
metadata:
  name: kserve-gpu-runtime
  namespace: kserve
spec:
  supportedModelFormats:
    - name: tensorflow
      version: "2"
      autoSelect: true
    - name: pytorch
      version: "2"
      autoSelect: true
    - name: sklearn
      version: "1"
      autoSelect: true
    - name: xgboost
      version: "1"
      autoSelect: true
  protocolVersions:
    - v1
    - v2
  containers:
    - name: kserve-container
      image: kserve/sklearnserver-gpu:latest
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "4"
          memory: "8Gi"
          nvidia.com/gpu: "1"
      env:
        - name: NVIDIA_VISIBLE_DEVICES
          value: "all"
        - name: NVIDIA_DRIVER_CAPABILITIES
          value: "compute,utility"
  multiModel: false
  disabled: false

clusters/k3s-dgx/kserve/istio-gateway.yaml (new file)

@@ -0,0 +1,35 @@
apiVersion: networking.istio.io/v1beta1
kind: Gateway
metadata:
  name: kserve-gateway
  namespace: kserve
spec:
  selector:
    istio: ingressgateway
  servers:
    - port:
        number: 80
        name: http
        protocol: HTTP
      hosts:
        - "*"
---
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
  name: kserve-vs
  namespace: kserve
spec:
  hosts:
    - "*"
  gateways:
    - kserve-gateway
  http:
    - match:
        - uri:
            prefix: /v1/models/
      route:
        - destination:
            host: kserve-default
            port:
              number: 80

clusters/k3s-dgx/kserve/kserve-controller.yaml (new file)

@@ -0,0 +1,43 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: kserve
  namespace: kserve
spec:
  interval: 10m
  url: https://kserve.github.io/kserve
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kserve
  namespace: kserve
spec:
  interval: 10m
  chart:
    spec:
      chart: kserve
      version: "v0.12.0"
      sourceRef:
        kind: HelmRepository
        name: kserve
        namespace: kserve
  values:
    controller:
      resources:
        requests:
          cpu: 500m
          memory: 512Mi
        limits:
          cpu: 2
          memory: 2Gi
    config:
      ingress:
        className: istio
      storage:
        initialCapacity: 10Gi
        storageClassName: local-path
    knative:
      enabled: true
    istio:
      enabled: true

clusters/k3s-dgx/kserve/kserve-namespace.yaml (new file)

@@ -0,0 +1,7 @@
apiVersion: v1
kind: Namespace
metadata:
  name: kserve
  labels:
    istio-injection: enabled
    serving.kserve.io/serving-runtime: "true"

clusters/k3s-dgx/kserve/kustomization.yaml (new file)

@@ -0,0 +1,10 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kserve
resources:
  - kserve-namespace.yaml
  - kserve-controller.yaml
  - istio-gateway.yaml
  # - gpu-serving-runtime.yaml
  # - model-storage-pvc.yaml
  # - storage-config.yaml

clusters/k3s-dgx/kserve/model-storage-pvc.yaml (new file)

@@ -0,0 +1,12 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-storage-pvc
  namespace: kserve
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 50Gi
  storageClassName: local-path

clusters/k3s-dgx/kserve/storage-config.yaml (new file)

@@ -0,0 +1,20 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: storage-config
  namespace: kserve
data:
  storageConfig.yaml: |
    defaultStorageUri: "pvc://model-storage"
    storageSpec:
      - name: model-storage
        type: pvc
        pvcSpec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 50Gi
          storageClassName: local-path
    modelCacheSize: 10Gi
    modelCacheMemory: 2Gi

clusters/k3s-dgx/kustomization.yaml (new file)

@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - flux-system
  - gpu-support
  - kserve
  - apps
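A plain kustomize resource list does not guarantee ordering between these layers. If ordering matters (GPU operator before KServe before the models), each directory can instead be reconciled by its own Flux Kustomization object chained with dependsOn. A sketch, not part of this commit:

```yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: kserve
  namespace: flux-system
spec:
  interval: 10m
  path: ./clusters/k3s-dgx/kserve
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
  dependsOn:
    - name: gpu-support
```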

configure.sh (new executable file, 88 lines)

@@ -0,0 +1,88 @@
#!/bin/bash
# Configuration script for Edge GitOps
set -e

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

echo -e "${BLUE}=== Edge GitOps Configuration ===${NC}"
echo ""

# Function to update configuration files
update_config() {
    local file=$1
    local key=$2
    local value=$3
    if [ -f "$file" ]; then
        sed -i "s|$key|$value|g" "$file"
        echo -e "${GREEN}✓ Updated $file${NC}"
    else
        echo -e "${RED}✗ File not found: $file${NC}"
    fi
}

# Get Gitea URL
echo -e "${YELLOW}Enter your Gitea repository URL:${NC}"
read -p "URL (default: ssh://git@gitea.example.com/edge-gitops/edge-gitops.git): " GITEA_URL
GITEA_URL=${GITEA_URL:-ssh://git@gitea.example.com/edge-gitops/edge-gitops.git}

# Get branch name
echo -e "${YELLOW}Enter your branch name:${NC}"
read -p "Branch (default: main): " GITEA_BRANCH
GITEA_BRANCH=${GITEA_BRANCH:-main}

# Get cluster name
echo -e "${YELLOW}Enter your cluster name:${NC}"
read -p "Cluster name (default: k3s-dgx): " CLUSTER_NAME
CLUSTER_NAME=${CLUSTER_NAME:-k3s-dgx}

echo ""
echo -e "${BLUE}Configuration Summary:${NC}"
echo "  Gitea URL: $GITEA_URL"
echo "  Branch:    $GITEA_BRANCH"
echo "  Cluster:   $CLUSTER_NAME"
echo ""

read -p "Apply these settings? (y/n) " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    echo "Aborting..."
    exit 0
fi

# Update configuration files
echo -e "${YELLOW}Updating configuration files...${NC}"
update_config "clusters/k3s-dgx/flux-system/gotk-sync.yaml" \
    "ssh://git@gitea.example.com/edge-gitops/edge-gitops.git" \
    "$GITEA_URL"
update_config "clusters/k3s-dgx/flux-system/gotk-sync.yaml" \
    "branch: main" \
    "branch: $GITEA_BRANCH"

# Create environment file
cat > .env << EOF
GITEA_URL=$GITEA_URL
GITEA_BRANCH=$GITEA_BRANCH
CLUSTER_NAME=$CLUSTER_NAME
EOF
echo -e "${GREEN}✓ Created .env file${NC}"
echo ""

echo -e "${GREEN}=== Configuration Complete ===${NC}"
echo ""
echo "Next steps:"
echo "1. Review the changes: git diff"
echo "2. Commit the changes: git add . && git commit -m 'Configure GitOps settings'"
echo "3. Push to repository: git push origin $GITEA_BRANCH"
echo "4. Run bootstrap: ./bootstrap.sh"
echo ""