init commit
This commit is contained in:
61
Makefile
Normal file
61
Makefile
Normal file
@@ -0,0 +1,61 @@
|
||||
# Convenience wrappers around flux/kubectl for the edge-gitops repo.
# None of these targets produce files, so every one must be declared phony.
# Fix: the original .PHONY line omitted apply, diff, restart, gpu-status,
# kserve-status, model-logs and secrets.
.PHONY: help bootstrap sync status logs clean test apply diff restart gpu-status kserve-status model-logs secrets

help: ## Show this help message
	@echo 'Usage: make [target]'
	@echo ''
	@echo 'Available targets:'
	@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf "  %-20s %s\n", $$1, $$2}' $(MAKEFILE_LIST)

bootstrap: ## Bootstrap FluxCD on the cluster
	@./bootstrap.sh

# NOTE(review): `flux reconcile helmrelease --all` may not be accepted by all
# flux CLI versions (reconcile usually takes a resource name) — confirm
# against the installed CLI.
sync: ## Force sync all FluxCD resources
	@flux reconcile kustomization flux-system --with-source
	@flux reconcile helmrelease --all

status: ## Show status of all FluxCD resources
	@flux get all --all-namespaces

logs: ## Show FluxCD logs
	@flux logs --all-namespaces

clean: ## Remove FluxCD from cluster
	@flux uninstall --namespace=flux-system --silent

test: ## Test the cluster connectivity
	@echo "Testing cluster connectivity..."
	@kubectl cluster-info
	@echo ""
	@echo "Testing GPU availability..."
	@kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'
	@echo ""
	@echo "Testing FluxCD..."
	@flux check

apply: ## Apply all manifests directly (for testing)
	@kubectl apply -k clusters/k3s-dgx

diff: ## Show diff between cluster and git
	@flux diff kustomization flux-system --with-source

restart: ## Restart all FluxCD controllers
	@kubectl rollout restart deployment/source-controller -n flux-system
	@kubectl rollout restart deployment/kustomize-controller -n flux-system
	@kubectl rollout restart deployment/helm-controller -n flux-system
	@kubectl rollout restart deployment/notification-controller -n flux-system

gpu-status: ## Show GPU status
	@kubectl describe nodes | grep -A 5 "nvidia.com/gpu"

kserve-status: ## Show KServe status
	@kubectl get inferenceservices -n kserve
	@kubectl get pods -n kserve

model-logs: ## Show model inference logs
	@kubectl logs -n kserve -l serving.kserve.io/inferenceservice=huihui-granite --tail=100 -f

secrets: ## Generate example secrets (DO NOT COMMIT)
	@echo "Creating example secrets directory..."
	@mkdir -p secrets
	@echo "# Add your secrets here" > secrets/README.md
	@echo "# DO NOT commit actual secrets to git" >> secrets/README.md
|
||||
165
README.md
Normal file
165
README.md
Normal file
@@ -0,0 +1,165 @@
|
||||
# Edge GitOps - KServe on k3s with GPU
|
||||
|
||||
GitOps setup for deploying ML models using KServe on a k3s cluster with GPU support (DGX Spark).
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- k3s cluster with GPU support
|
||||
- kubectl configured to access the cluster
|
||||
- Gitea instance for GitOps repository
|
||||
- FluxCD CLI installed
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
edge-gitops/
|
||||
├── clusters/
|
||||
│ └── k3s-dgx/
|
||||
│ ├── flux-system/ # FluxCD installation
|
||||
│ ├── gpu-support/ # NVIDIA GPU Operator
|
||||
│ ├── kserve/ # KServe installation
|
||||
│ └── apps/ # ML model deployments
|
||||
├── apps/ # Reusable app manifests
|
||||
└── infrastructure/ # Base infrastructure
|
||||
```
|
||||
|
||||
## Setup Instructions
|
||||
|
||||
### 1. Bootstrap FluxCD
|
||||
|
||||
```bash
|
||||
flux bootstrap git \
|
||||
--url=ssh://git@gitea.example.com/edge-gitops/edge-gitops.git \
|
||||
--branch=main \
|
||||
--path=clusters/k3s-dgx \
|
||||
--components=source-controller,kustomize-controller,helm-controller,notification-controller
|
||||
```
|
||||
|
||||
### 2. Configure Gitea SSH Key
|
||||
|
||||
Generate SSH key for FluxCD:
|
||||
```bash
|
||||
ssh-keygen -t ed25519 -N "" -f flux-gitea-key
|
||||
```
|
||||
|
||||
Add the public key to your Gitea repository as a deploy key.
|
||||
|
||||
### 3. Update Repository Configuration
|
||||
|
||||
Edit `clusters/k3s-dgx/flux-system/gotk-sync.yaml` to match your Gitea URL:
|
||||
```yaml
|
||||
url: ssh://git@your-gitea-instance.com/edge-gitops/edge-gitops.git
|
||||
```
|
||||
|
||||
### 4. Deploy the Stack
|
||||
|
||||
Commit and push the changes:
|
||||
```bash
|
||||
git add .
|
||||
git commit -m "Initial GitOps setup for KServe on k3s"
|
||||
git push origin main
|
||||
```
|
||||
|
||||
FluxCD will automatically sync the changes to your cluster.
|
||||
|
||||
## Components
|
||||
|
||||
### GPU Support
|
||||
- NVIDIA GPU Operator (v23.9.1)
|
||||
- NVIDIA Device Plugin
|
||||
- DCGM Exporter for monitoring
|
||||
- GPU Node Feature Discovery
|
||||
|
||||
### KServe
|
||||
- KServe Core (v0.12.0)
|
||||
- GPU-enabled Serving Runtime
|
||||
- Istio Gateway for networking
|
||||
- Model Storage (PVC)
|
||||
|
||||
### Example Model
|
||||
- Huihui-granite-4.1-30b-abliterated (Hugging Face)
|
||||
- GPU-accelerated inference
|
||||
- REST API endpoint
|
||||
|
||||
## Usage
|
||||
|
||||
### Deploy a New Model
|
||||
|
||||
1. Create a new InferenceService in `clusters/k3s-dgx/apps/`:
|
||||
```yaml
|
||||
apiVersion: serving.kserve.io/v1beta1
|
||||
kind: InferenceService
|
||||
metadata:
|
||||
name: your-model
|
||||
namespace: kserve
|
||||
spec:
|
||||
predictor:
|
||||
model:
|
||||
modelFormat:
|
||||
name: huggingface
|
||||
storageUri: "hf://your-org/your-model"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "1"
|
||||
```
|
||||
|
||||
2. Commit and push changes
|
||||
|
||||
### Test the Model
|
||||
|
||||
```bash
|
||||
# Get the service URL
|
||||
kubectl get inferenceservice huihui-granite -n kserve
|
||||
|
||||
# Test inference
|
||||
curl -X POST http://your-service-url/v1/models/huihui-granite:predict \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"inputs": [{"name": "text", "shape": [1], "datatype": "BYTES", "data": ["Hello world"]}]}'
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
Check FluxCD status:
|
||||
```bash
|
||||
flux get all --all-namespaces
|
||||
```
|
||||
|
||||
Check GPU status:
|
||||
```bash
|
||||
kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'
|
||||
```
|
||||
|
||||
Check KServe services:
|
||||
```bash
|
||||
kubectl get inferenceservices -n kserve
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### GPU Not Available
|
||||
```bash
|
||||
kubectl describe node | grep -A 5 nvidia.com/gpu
|
||||
```
|
||||
|
||||
### KServe Pods Not Starting
|
||||
```bash
|
||||
kubectl logs -n kserve deployment/kserve-controller-manager
|
||||
kubectl get pods -n kserve
|
||||
```
|
||||
|
||||
### FluxCD Sync Issues
|
||||
```bash
|
||||
flux reconcile kustomization flux-system --with-source
|
||||
flux logs
|
||||
```
|
||||
|
||||
## Customization
|
||||
|
||||
### GPU Resources
|
||||
Edit `clusters/k3s-dgx/apps/huihui-granite-inference.yaml` to adjust GPU allocation.
|
||||
|
||||
### Storage
|
||||
Modify `clusters/k3s-dgx/kserve/model-storage-pvc.yaml` for different storage requirements.
|
||||
|
||||
### Networking
|
||||
Update `clusters/k3s-dgx/kserve/istio-gateway.yaml` for custom ingress configuration.
|
||||
297
SETUP.md
Normal file
297
SETUP.md
Normal file
@@ -0,0 +1,297 @@
|
||||
# Edge GitOps - Complete Setup Guide
|
||||
|
||||
## Quick Start
|
||||
|
||||
1. **Configure the setup:**
|
||||
```bash
|
||||
./configure.sh
|
||||
```
|
||||
|
||||
2. **Bootstrap FluxCD:**
|
||||
```bash
|
||||
make bootstrap
|
||||
```
|
||||
|
||||
3. **Monitor the deployment:**
|
||||
```bash
|
||||
make status
|
||||
```
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
edge-gitops/
|
||||
├── bootstrap.sh # FluxCD bootstrap script
|
||||
├── configure.sh # Configuration wizard
|
||||
├── Makefile # Convenient commands
|
||||
├── README.md # Main documentation
|
||||
├── .gitignore # Git ignore rules
|
||||
├── .env # Environment variables (not committed)
|
||||
│
|
||||
├── clusters/
|
||||
│ └── k3s-dgx/ # Cluster-specific configuration
|
||||
│ ├── kustomization.yaml
|
||||
│ │
|
||||
│ ├── flux-system/ # FluxCD installation
|
||||
│ │ ├── kustomization.yaml
|
||||
│ │ ├── gotk-components.yaml
|
||||
│ │ └── gotk-sync.yaml
|
||||
│ │
|
||||
│ ├── gpu-support/ # NVIDIA GPU Operator
|
||||
│ │ ├── kustomization.yaml
|
||||
│ │ ├── gpu-operator-namespace.yaml
|
||||
│ │ ├── gpu-operator-helmrelease.yaml
|
||||
│ │ └── gpu-node-labels.yaml
|
||||
│ │
|
||||
│ ├── kserve/ # KServe installation
|
||||
│ │ ├── kustomization.yaml
|
||||
│ │ ├── kserve-namespace.yaml
|
||||
│ │ ├── kserve-crds.yaml
|
||||
│ │ ├── kserve-controller.yaml
|
||||
│ │ ├── istio-gateway.yaml
|
||||
│ │ ├── gpu-serving-runtime.yaml
|
||||
│ │ ├── model-storage-pvc.yaml
|
||||
│ │ └── storage-config.yaml
|
||||
│ │
|
||||
│ └── apps/ # ML model deployments
|
||||
│ ├── kustomization.yaml
|
||||
│ └── huihui-granite-inference.yaml
|
||||
│
|
||||
├── apps/ # Reusable application manifests
|
||||
└── infrastructure/ # Base infrastructure components
|
||||
```
|
||||
|
||||
## Component Details
|
||||
|
||||
### FluxCD
|
||||
- **Version:** Latest stable
|
||||
- **Components:** source-controller, kustomize-controller, helm-controller, notification-controller
|
||||
- **Sync Interval:** 1 minute for GitRepository, 10 minutes for Kustomization
|
||||
- **Repository:** Gitea (configurable)
|
||||
|
||||
### NVIDIA GPU Operator
|
||||
- **Version:** v23.9.1
|
||||
- **Driver:** 535.129.03
|
||||
- **Components:**
|
||||
- NVIDIA Driver
|
||||
- Device Plugin
|
||||
- DCGM Exporter
|
||||
- MIG Manager
|
||||
- Node Feature Discovery
|
||||
|
||||
### KServe
|
||||
- **Version:** v0.12.0
|
||||
- **Components:**
|
||||
- KServe Controller
|
||||
- Custom Resource Definitions
|
||||
- GPU Serving Runtime
|
||||
- Istio Integration
|
||||
- Model Storage (50Gi PVC)
|
||||
|
||||
### Example Model
|
||||
- **Name:** Huihui-granite-4.1-30b-abliterated
|
||||
- **Source:** Hugging Face
|
||||
- **Resources:**
|
||||
- CPU: 2-4 cores
|
||||
- Memory: 8-16Gi
|
||||
- GPU: 1 NVIDIA GPU
|
||||
|
||||
## Common Tasks
|
||||
|
||||
### Add a New Model
|
||||
|
||||
1. Create a new InferenceService:
|
||||
```bash
|
||||
cat > clusters/k3s-dgx/apps/your-model.yaml << EOF
|
||||
apiVersion: serving.kserve.io/v1beta1
|
||||
kind: InferenceService
|
||||
metadata:
|
||||
name: your-model
|
||||
namespace: kserve
|
||||
spec:
|
||||
predictor:
|
||||
model:
|
||||
modelFormat:
|
||||
name: huggingface
|
||||
storageUri: "hf://your-org/your-model"
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "1"
|
||||
EOF
|
||||
```
|
||||
|
||||
2. Update the kustomization:
|
||||
```bash
|
||||
echo " - your-model.yaml" >> clusters/k3s-dgx/apps/kustomization.yaml
|
||||
```
|
||||
|
||||
3. Commit and push:
|
||||
```bash
|
||||
git add clusters/k3s-dgx/apps/
|
||||
git commit -m "Add new model deployment"
|
||||
git push
|
||||
```
|
||||
|
||||
### Update GPU Resources
|
||||
|
||||
Edit the resource limits in your InferenceService:
|
||||
```yaml
|
||||
resources:
|
||||
limits:
|
||||
cpu: "8"
|
||||
memory: "32Gi"
|
||||
nvidia.com/gpu: "2"
|
||||
requests:
|
||||
cpu: "4"
|
||||
memory: "16Gi"
|
||||
nvidia.com/gpu: "2"
|
||||
```
|
||||
|
||||
### Monitor Model Performance
|
||||
|
||||
```bash
|
||||
# Get model endpoint
|
||||
kubectl get inferenceservice your-model -n kserve
|
||||
|
||||
# View logs
|
||||
kubectl logs -n kserve -l serving.kserve.io/inferenceservice=your-model -f
|
||||
|
||||
# Check GPU usage
|
||||
kubectl exec -n kserve <pod-name> -- nvidia-smi
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### FluxCD Not Syncing
|
||||
|
||||
```bash
|
||||
# Check FluxCD status
|
||||
flux check
|
||||
|
||||
# View logs
|
||||
flux logs
|
||||
|
||||
# Force sync
|
||||
flux reconcile kustomization flux-system --with-source
|
||||
```
|
||||
|
||||
### GPU Not Available
|
||||
|
||||
```bash
|
||||
# Check GPU nodes
|
||||
kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'
|
||||
|
||||
# Check GPU operator
|
||||
kubectl get pods -n gpu-operator
|
||||
|
||||
# View GPU operator logs
|
||||
kubectl logs -n gpu-operator deployment/gpu-operator
|
||||
```
|
||||
|
||||
### KServe Issues
|
||||
|
||||
```bash
|
||||
# Check KServe pods
|
||||
kubectl get pods -n kserve
|
||||
|
||||
# Check KServe controller
|
||||
kubectl logs -n kserve deployment/kserve-controller-manager
|
||||
|
||||
# Describe InferenceService
|
||||
kubectl describe inferenceservice your-model -n kserve
|
||||
```
|
||||
|
||||
## Security Considerations
|
||||
|
||||
1. **Secrets Management:**
|
||||
- Never commit secrets to git
|
||||
- Use Kubernetes secrets for sensitive data
|
||||
- Consider using Sealed Secrets or External Secrets Operator
|
||||
|
||||
2. **Network Policies:**
|
||||
- Review and restrict network access
|
||||
- Use Istio for service mesh security
|
||||
|
||||
3. **RBAC:**
|
||||
- Review FluxCD service account permissions
|
||||
- Implement principle of least privilege
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
### GPU Optimization
|
||||
- Use appropriate GPU resource requests/limits
|
||||
- Monitor GPU utilization with DCGM Exporter
|
||||
- Consider MIG (Multi-Instance GPU) for better isolation
|
||||
|
||||
### Storage Optimization
|
||||
- Use fast storage for model cache
|
||||
- Consider using ReadWriteMany for multi-pod access
|
||||
- Implement model caching strategies
|
||||
|
||||
### Network Optimization
|
||||
- Use Istio for efficient load balancing
|
||||
- Configure appropriate timeouts for large models
|
||||
- Consider using gRPC for internal communication
|
||||
|
||||
## Scaling
|
||||
|
||||
### Horizontal Scaling
|
||||
```yaml
|
||||
# Add to InferenceService
|
||||
spec:
|
||||
predictor:
|
||||
replicas: 3
|
||||
```
|
||||
|
||||
### Vertical Scaling
|
||||
```yaml
|
||||
# Update resource limits
|
||||
resources:
|
||||
limits:
|
||||
nvidia.com/gpu: "2"
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
### Metrics Collection
|
||||
- DCGM Exporter for GPU metrics
|
||||
- Prometheus for cluster metrics
|
||||
- KServe metrics for inference performance
|
||||
|
||||
### Logging
|
||||
- Structured logging for all components
|
||||
- Centralized logging with Loki/ELK
|
||||
- Log retention policies
|
||||
|
||||
### Alerting
|
||||
- GPU utilization alerts
|
||||
- Model health alerts
|
||||
- Resource exhaustion alerts
|
||||
|
||||
## Backup and Recovery
|
||||
|
||||
### GitOps Backup
|
||||
- All configuration is in git
|
||||
- Easy rollback with git revert
|
||||
- Branch-based testing
|
||||
|
||||
### Data Backup
|
||||
- Model storage backup
|
||||
- Configuration backup
|
||||
- Disaster recovery plan
|
||||
|
||||
## Contributing
|
||||
|
||||
1. Fork the repository
|
||||
2. Create a feature branch
|
||||
3. Make your changes
|
||||
4. Test thoroughly
|
||||
5. Submit a pull request
|
||||
|
||||
## Support
|
||||
|
||||
For issues and questions:
|
||||
- Check the troubleshooting section
|
||||
- Review component documentation
|
||||
- Check FluxCD, KServe, and GPU Operator docs
|
||||
- Open an issue in the repository
|
||||
96
bootstrap.sh
Executable file
96
bootstrap.sh
Executable file
@@ -0,0 +1,96 @@
|
||||
#!/bin/bash
#
# Bootstrap FluxCD onto the current kubectl context and point it at the
# GitOps repository.
#
# Environment variables (all optional, with defaults below):
#   GITEA_URL     - git URL of the GitOps repo
#   GITEA_BRANCH  - branch to sync from
#   CLUSTER_NAME  - directory under clusters/ to reconcile
#   NAMESPACE     - namespace to install Flux into
#
# Exits non-zero if kubectl/flux are missing or the cluster is unreachable.

# Strict mode: fail fast on errors, unset variables, and pipeline failures.
set -euo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Configuration (env-overridable with sane defaults)
GITEA_URL="${GITEA_URL:-ssh://git@gitea.example.com/edge-gitops/edge-gitops.git}"
GITEA_BRANCH="${GITEA_BRANCH:-main}"
CLUSTER_NAME="${CLUSTER_NAME:-k3s-dgx}"
NAMESPACE="${NAMESPACE:-flux-system}"

echo -e "${GREEN}=== Edge GitOps Bootstrap Script ===${NC}"
echo ""

# Check prerequisites
echo -e "${YELLOW}Checking prerequisites...${NC}"

if ! command -v kubectl &> /dev/null; then
    echo -e "${RED}kubectl is not installed${NC}"
    exit 1
fi

if ! command -v flux &> /dev/null; then
    echo -e "${RED}flux is not installed${NC}"
    echo "Install from: https://fluxcd.io/flux/installation/"
    exit 1
fi

if ! kubectl cluster-info &> /dev/null; then
    echo -e "${RED}Cannot connect to kubernetes cluster${NC}"
    exit 1
fi

echo -e "${GREEN}✓ Prerequisites met${NC}"
echo ""

# If Flux is already present, offer a clean reinstall instead of layering
# a second bootstrap on top of it.
if kubectl get namespace flux-system &> /dev/null; then
    echo -e "${YELLOW}FluxCD is already installed${NC}"
    read -p "Do you want to reinstall? (y/n) " -n 1 -r
    echo
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
        echo "Exiting..."
        exit 0
    fi
    echo -e "${YELLOW}Uninstalling existing FluxCD...${NC}"
    # Best-effort: a partially-removed install should not abort the script.
    flux uninstall --namespace=flux-system --silent || true
fi

# Bootstrap FluxCD
echo -e "${YELLOW}Bootstrapping FluxCD...${NC}"
flux bootstrap git \
    --url="$GITEA_URL" \
    --branch="$GITEA_BRANCH" \
    --path="clusters/$CLUSTER_NAME" \
    --namespace="$NAMESPACE" \
    --components=source-controller,kustomize-controller,helm-controller,notification-controller \
    --timeout=10m

echo -e "${GREEN}✓ FluxCD bootstrapped${NC}"
echo ""

# Wait for all four controllers to report Ready before declaring success.
echo -e "${YELLOW}Waiting for FluxCD components to be ready...${NC}"
kubectl wait --for=condition=ready --timeout=300s \
    -n "$NAMESPACE" \
    deployment/source-controller \
    deployment/kustomize-controller \
    deployment/helm-controller \
    deployment/notification-controller

echo -e "${GREEN}✓ FluxCD components ready${NC}"
echo ""

# Verify installation
echo -e "${YELLOW}Verifying installation...${NC}"
flux check

echo ""
echo -e "${GREEN}=== Bootstrap Complete ===${NC}"
echo ""
echo "Next steps:"
echo "1. Update the Gitea URL in clusters/k3s-dgx/flux-system/gotk-sync.yaml"
echo "2. Commit and push the changes to your repository"
echo "3. Monitor the sync: flux get all --all-namespaces"
echo ""
echo "Useful commands:"
echo "  flux get all --all-namespaces  # Show all Flux resources"
echo "  flux logs                       # Show Flux logs"
echo "  flux reconcile kustomization flux-system --with-source  # Force sync"
echo ""
|
||||
22
clusters/k3s-dgx/apps/huihui-granite-inference.yaml
Normal file
22
clusters/k3s-dgx/apps/huihui-granite-inference.yaml
Normal file
@@ -0,0 +1,22 @@
|
||||
# KServe InferenceService serving the Huihui-granite model from Hugging Face
# on a single NVIDIA GPU in the kserve namespace.
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: huihui-granite
  namespace: kserve
spec:
  predictor:
    model:
      modelFormat:
        name: huggingface   # handled by the huggingface serving runtime
      args:
        - --model_name=huihui-granite   # model name exposed on the predict endpoint
      # NOTE(review): hf:// storage URIs require a KServe version with
      # Hugging Face storage support — confirm against the installed release.
      storageUri: "hf://huihui-ai/Huihui-granite-4.1-30b-abliterated"
      resources:
        # NOTE(review): 8-16Gi memory looks tight for a 30B-parameter model —
        # verify against the actual model footprint before relying on this.
        limits:
          cpu: "4"
          memory: 16Gi
          nvidia.com/gpu: "1"
        requests:
          cpu: "2"
          memory: 8Gi
          nvidia.com/gpu: "1"   # GPU request matches limit, as required for extended resources
|
||||
5
clusters/k3s-dgx/apps/kustomization.yaml
Normal file
5
clusters/k3s-dgx/apps/kustomization.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
namespace: kserve
|
||||
resources:
|
||||
- huihui-granite-inference.yaml
|
||||
50
clusters/k3s-dgx/flux-system/gotk-components.yaml
Normal file
50
clusters/k3s-dgx/flux-system/gotk-components.yaml
Normal file
@@ -0,0 +1,50 @@
|
||||
# FluxCD supporting objects: the flux-system namespace, the helm-controller
# service account with its RBAC binding, and the Helm repositories referenced
# by HelmReleases elsewhere in this repository.
apiVersion: v1
kind: Namespace
metadata:
  name: flux-system
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: helm-controller
  namespace: flux-system
---
# NOTE(review): this grants cluster-admin to the helm-controller service
# account — far broader than least privilege. Consider a scoped ClusterRole
# limited to the resources Helm releases actually manage.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: cluster-reconciler
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
  - kind: ServiceAccount
    name: helm-controller
    namespace: flux-system
---
# Helm chart sources, polled every 30 minutes.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: bitnami
  namespace: flux-system
spec:
  interval: 30m
  url: https://charts.bitnami.com/bitnami
---
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: kserve
  namespace: flux-system
spec:
  interval: 30m
  url: https://kserve.github.io/kserve
---
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: nvidia
  namespace: flux-system
spec:
  interval: 30m
  url: https://nvidia.github.io/k8s-device-plugin
|
||||
34
clusters/k3s-dgx/flux-system/gotk-sync.yaml
Normal file
34
clusters/k3s-dgx/flux-system/gotk-sync.yaml
Normal file
@@ -0,0 +1,34 @@
|
||||
# FluxCD sync configuration: the GitRepository pointing at the GitOps repo,
# and the root Kustomization that applies ./clusters/k3s-dgx from it.
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: flux-system
  namespace: flux-system
spec:
  interval: 1m0s        # how often to poll the repo for new commits
  ref:
    branch: main
  secretRef:
    name: flux-system   # SSH deploy-key secret (created at bootstrap time)
  url: ssh://git@gitea.example.com/edge-gitops/edge-gitops.git
---
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: flux-system
  namespace: flux-system
spec:
  interval: 10m0s       # full re-reconcile cadence
  path: ./clusters/k3s-dgx
  prune: true           # delete cluster objects that are removed from git
  sourceRef:
    kind: GitRepository
    name: flux-system
  # Reconciliation is reported healthy only once these controller
  # deployments are ready.
  healthChecks:
    - apiVersion: apps/v1
      kind: Deployment
      name: kustomize-controller
      namespace: flux-system
    - apiVersion: apps/v1
      kind: Deployment
      name: helm-controller
      namespace: flux-system
|
||||
13
clusters/k3s-dgx/flux-system/kustomization.yaml
Normal file
13
clusters/k3s-dgx/flux-system/kustomization.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- gotk-components.yaml
|
||||
- gotk-sync.yaml
|
||||
patches:
|
||||
- patch: |
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/args/-
|
||||
value: --concurrency=20
|
||||
target:
|
||||
kind: Deployment
|
||||
name: "(kustomize-controller|helm-controller|notification-controller|source-controller)"
|
||||
13
clusters/k3s-dgx/gpu-support/gpu-node-labels.yaml
Normal file
13
clusters/k3s-dgx/gpu-support/gpu-node-labels.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: gpu-node-labels
|
||||
namespace: gpu-operator
|
||||
data:
|
||||
labels.yaml: |
|
||||
- key: accelerator
|
||||
value: nvidia-tesla
|
||||
- key: nvidia.com/gpu.present
|
||||
value: "true"
|
||||
- key: topology.kubernetes.io/zone
|
||||
value: "dgx-spark"
|
||||
70
clusters/k3s-dgx/gpu-support/gpu-operator-helmrelease.yaml
Normal file
70
clusters/k3s-dgx/gpu-support/gpu-operator-helmrelease.yaml
Normal file
@@ -0,0 +1,70 @@
|
||||
# NVIDIA GPU Operator: Helm repository plus HelmRelease, installed into the
# gpu-operator namespace with k3s-specific containerd paths.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: nvidia
  namespace: gpu-operator
spec:
  interval: 10m
  url: https://nvidia.github.io/gpu-operator
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: gpu-operator
  namespace: gpu-operator
spec:
  interval: 10m
  chart:
    spec:
      chart: gpu-operator
      version: "v23.9.1"
      sourceRef:
        kind: HelmRepository
        name: nvidia
        namespace: gpu-operator
  values:
    driver:
      enabled: true
      image: "nvcr.io/nvidia/driver"
      version: "535.129.03"
    operator:
      # NOTE(review): the chart's defaultRuntime typically takes values like
      # "containerd"/"crio"/"docker" — confirm "nvidia-container-runtime" is
      # accepted by this chart version.
      defaultRuntime: nvidia-container-runtime
    toolkit:
      enabled: true
      # NOTE(review): this is the device-plugin image, not the NVIDIA
      # container toolkit image — looks like a copy/paste slip; verify
      # against the chart's default toolkit image.
      image: "nvcr.io/nvidia/k8s-device-plugin"
      version: "v0.14.0"
      env:
        # k3s keeps containerd config and socket in non-default locations.
        - name: CONTAINERD_CONFIG
          value: /var/lib/rancher/k3s/agent/etc/containerd/config.toml
        - name: CONTAINERD_SOCKET
          value: /run/k3s/containerd/containerd.sock
        - name: CONTAINERD_RUNTIME_CLASS
          value: nvidia
    devicePlugin:
      enabled: true
      image: "nvcr.io/nvidia/k8s-device-plugin"
      version: "v0.14.0"
      resources:
        requests:
          cpu: 100m
          memory: 100Mi
        limits:
          cpu: 500m
          memory: 500Mi
    dcgm:
      enabled: true
      image: "nvcr.io/nvidia/dcgm-exporter"
      version: "3.3.3-3.1.0-ubuntu22.04"
      resources:
        requests:
          cpu: 100m
          memory: 100Mi
        limits:
          cpu: 500m
          memory: 500Mi
    migManager:
      enabled: true
    gfd:
      enabled: true
    node-feature-discovery:
      enabled: true
|
||||
6
clusters/k3s-dgx/gpu-support/gpu-operator-namespace.yaml
Normal file
6
clusters/k3s-dgx/gpu-support/gpu-operator-namespace.yaml
Normal file
@@ -0,0 +1,6 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: gpu-operator
|
||||
labels:
|
||||
openshift.io/cluster-monitoring: "true"
|
||||
7
clusters/k3s-dgx/gpu-support/kustomization.yaml
Normal file
7
clusters/k3s-dgx/gpu-support/kustomization.yaml
Normal file
@@ -0,0 +1,7 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
namespace: gpu-operator
|
||||
resources:
|
||||
- gpu-operator-namespace.yaml
|
||||
- gpu-operator-helmrelease.yaml
|
||||
- gpu-node-labels.yaml
|
||||
39
clusters/k3s-dgx/gpu-support/nvidia-device-plugin.yaml
Normal file
39
clusters/k3s-dgx/gpu-support/nvidia-device-plugin.yaml
Normal file
@@ -0,0 +1,39 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: gpu-operator
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: nvidia-device-plugin-daemonset
|
||||
namespace: gpu-operator
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
name: nvidia-device-plugin-ds
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
name: nvidia-device-plugin-ds
|
||||
spec:
|
||||
tolerations:
|
||||
- key: nvidia.com/gpu
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
containers:
|
||||
- image: nvcr.io/nvidia/k8s-device-plugin:v0.14.0
|
||||
name: nvidia-device-plugin-ctr
|
||||
args: ["--fail-on-init-error=false"]
|
||||
env:
|
||||
- name: NVIDIA_VISIBLE_DEVICES
|
||||
value: "all"
|
||||
- name: MIG_STRATEGY
|
||||
value: "single"
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/device-plugins
|
||||
16
clusters/k3s-dgx/gpu-support/runtime-class.yaml
Normal file
16
clusters/k3s-dgx/gpu-support/runtime-class.yaml
Normal file
@@ -0,0 +1,16 @@
|
||||
apiVersion: node.k8s.io/v1
|
||||
kind: RuntimeClass
|
||||
metadata:
|
||||
name: nvidia
|
||||
handler: nvidia
|
||||
overhead:
|
||||
podFixed:
|
||||
memory: "1Gi"
|
||||
cpu: "500m"
|
||||
scheduling:
|
||||
nodeSelector:
|
||||
nvidia.com/gpu.present: "true"
|
||||
tolerations:
|
||||
- key: nvidia.com/gpu
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
42
clusters/k3s-dgx/kserve/gpu-serving-runtime.yaml
Normal file
42
clusters/k3s-dgx/kserve/gpu-serving-runtime.yaml
Normal file
@@ -0,0 +1,42 @@
|
||||
apiVersion: serving.kserve.io/v1beta1
|
||||
kind: ServingRuntime
|
||||
metadata:
|
||||
name: kserve-gpu-runtime
|
||||
namespace: kserve
|
||||
spec:
|
||||
supportedModelFormats:
|
||||
- name: tensorflow
|
||||
version: "2"
|
||||
autoSelect: true
|
||||
- name: pytorch
|
||||
version: "2"
|
||||
autoSelect: true
|
||||
- name: sklearn
|
||||
version: "1"
|
||||
autoSelect: true
|
||||
- name: xgboost
|
||||
version: "1"
|
||||
autoSelect: true
|
||||
protocol: v1
|
||||
protocolVersions:
|
||||
- v1
|
||||
- v2
|
||||
containers:
|
||||
- name: kserve-container
|
||||
image: kserve/sklearnserver-gpu:latest
|
||||
resources:
|
||||
requests:
|
||||
cpu: "1"
|
||||
memory: "2Gi"
|
||||
nvidia.com/gpu: "1"
|
||||
limits:
|
||||
cpu: "4"
|
||||
memory: "8Gi"
|
||||
nvidia.com/gpu: "1"
|
||||
env:
|
||||
- name: NVIDIA_VISIBLE_DEVICES
|
||||
value: "all"
|
||||
- name: NVIDIA_DRIVER_CAPABILITIES
|
||||
value: "compute,utility"
|
||||
multiModel: false
|
||||
disabled: false
|
||||
35
clusters/k3s-dgx/kserve/istio-gateway.yaml
Normal file
35
clusters/k3s-dgx/kserve/istio-gateway.yaml
Normal file
@@ -0,0 +1,35 @@
|
||||
apiVersion: networking.istio.io/v1beta1
|
||||
kind: Gateway
|
||||
metadata:
|
||||
name: kserve-gateway
|
||||
namespace: kserve
|
||||
spec:
|
||||
selector:
|
||||
istio: ingressgateway
|
||||
servers:
|
||||
- port:
|
||||
number: 80
|
||||
name: http
|
||||
protocol: HTTP
|
||||
hosts:
|
||||
- "*"
|
||||
---
|
||||
apiVersion: networking.istio.io/v1beta1
|
||||
kind: VirtualService
|
||||
metadata:
|
||||
name: kserve-vs
|
||||
namespace: kserve
|
||||
spec:
|
||||
hosts:
|
||||
- "*"
|
||||
gateways:
|
||||
- kserve-gateway
|
||||
http:
|
||||
- match:
|
||||
- uri:
|
||||
prefix: /v1/models/
|
||||
route:
|
||||
- destination:
|
||||
host: kserve-default
|
||||
port:
|
||||
number: 80
|
||||
43
clusters/k3s-dgx/kserve/kserve-controller.yaml
Normal file
43
clusters/k3s-dgx/kserve/kserve-controller.yaml
Normal file
@@ -0,0 +1,43 @@
|
||||
apiVersion: source.toolkit.fluxcd.io/v1
|
||||
kind: HelmRepository
|
||||
metadata:
|
||||
name: kserve
|
||||
namespace: kserve
|
||||
spec:
|
||||
interval: 10m
|
||||
url: https://kserve.github.io/kserve
|
||||
---
|
||||
apiVersion: helm.toolkit.fluxcd.io/v2
|
||||
kind: HelmRelease
|
||||
metadata:
|
||||
name: kserve
|
||||
namespace: kserve
|
||||
spec:
|
||||
interval: 10m
|
||||
chart:
|
||||
spec:
|
||||
chart: kserve
|
||||
version: "v0.12.0"
|
||||
sourceRef:
|
||||
kind: HelmRepository
|
||||
name: kserve
|
||||
namespace: kserve
|
||||
values:
|
||||
controller:
|
||||
resources:
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
cpu: 2
|
||||
memory: 2Gi
|
||||
config:
|
||||
ingress:
|
||||
className: istio
|
||||
storage:
|
||||
initialCapacity: 10Gi
|
||||
storageClassName: local-path
|
||||
knative:
|
||||
enabled: true
|
||||
istio:
|
||||
enabled: true
|
||||
7
clusters/k3s-dgx/kserve/kserve-namespace.yaml
Normal file
7
clusters/k3s-dgx/kserve/kserve-namespace.yaml
Normal file
@@ -0,0 +1,7 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: kserve
|
||||
labels:
|
||||
istio-injection: enabled
|
||||
serving.kserve.io/serving-runtime: "true"
|
||||
10
clusters/k3s-dgx/kserve/kustomization.yaml
Normal file
10
clusters/k3s-dgx/kserve/kustomization.yaml
Normal file
@@ -0,0 +1,10 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
namespace: kserve
|
||||
resources:
|
||||
- kserve-namespace.yaml
|
||||
- kserve-controller.yaml
|
||||
- istio-gateway.yaml
|
||||
# - gpu-serving-runtime.yaml
|
||||
# - model-storage-pvc.yaml
|
||||
# - storage-config.yaml
|
||||
12
clusters/k3s-dgx/kserve/model-storage-pvc.yaml
Normal file
12
clusters/k3s-dgx/kserve/model-storage-pvc.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: model-storage-pvc
|
||||
namespace: kserve
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 50Gi
|
||||
storageClassName: local-path
|
||||
20
clusters/k3s-dgx/kserve/storage-config.yaml
Normal file
20
clusters/k3s-dgx/kserve/storage-config.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: storage-config
|
||||
namespace: kserve
|
||||
data:
|
||||
storageConfig.yaml: |
|
||||
defaultStorageUri: "pvc://model-storage"
|
||||
storageSpec:
|
||||
- name: model-storage
|
||||
type: pvc
|
||||
pvcSpec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 50Gi
|
||||
storageClassName: local-path
|
||||
modelCacheSize: 10Gi
|
||||
modelCacheMemory: 2Gi
|
||||
7
clusters/k3s-dgx/kustomization.yaml
Normal file
7
clusters/k3s-dgx/kustomization.yaml
Normal file
@@ -0,0 +1,7 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- flux-system
|
||||
- gpu-support
|
||||
- kserve
|
||||
- apps
|
||||
88
configure.sh
Executable file
88
configure.sh
Executable file
@@ -0,0 +1,88 @@
|
||||
#!/bin/bash
#
# Configuration script for Edge GitOps.
#
# Interactively collects the Gitea URL, branch and cluster name, rewrites
# the cluster's gotk-sync.yaml accordingly, and writes a .env file that
# bootstrap.sh reads.

# Strict mode: fail on errors, unset variables, and pipeline failures.
set -euo pipefail

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

echo -e "${BLUE}=== Edge GitOps Configuration ===${NC}"
echo ""

# update_config FILE PATTERN REPLACEMENT
# In-place substitution of PATTERN with REPLACEMENT in FILE; warns (without
# aborting) when FILE does not exist.
# NOTE(review): uses GNU `sed -i`; BSD/macOS sed requires `-i ''`. Values
# containing `|`, `&` or backslashes would break the sed expression.
update_config() {
    local file=$1
    local key=$2
    local value=$3

    if [ -f "$file" ]; then
        sed -i "s|$key|$value|g" "$file"
        echo -e "${GREEN}✓ Updated $file${NC}"
    else
        echo -e "${RED}✗ File not found: $file${NC}"
    fi
}

# Get Gitea URL
echo -e "${YELLOW}Enter your Gitea repository URL:${NC}"
read -p "URL (default: ssh://git@gitea.example.com/edge-gitops/edge-gitops.git): " GITEA_URL
GITEA_URL=${GITEA_URL:-ssh://git@gitea.example.com/edge-gitops/edge-gitops.git}

# Get branch name
echo -e "${YELLOW}Enter your branch name:${NC}"
read -p "Branch (default: main): " GITEA_BRANCH
GITEA_BRANCH=${GITEA_BRANCH:-main}

# Get cluster name
echo -e "${YELLOW}Enter your cluster name:${NC}"
read -p "Cluster name (default: k3s-dgx): " CLUSTER_NAME
CLUSTER_NAME=${CLUSTER_NAME:-k3s-dgx}

echo ""
echo -e "${BLUE}Configuration Summary:${NC}"
echo "  Gitea URL: $GITEA_URL"
echo "  Branch: $GITEA_BRANCH"
echo "  Cluster: $CLUSTER_NAME"
echo ""

read -p "Apply these settings? (y/n) " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    echo "Aborting..."
    exit 0
fi

# Update configuration files
echo -e "${YELLOW}Updating configuration files...${NC}"

# Fix: the original always edited clusters/k3s-dgx/... even when the user
# entered a different cluster name; honor $CLUSTER_NAME here. The default
# (k3s-dgx) preserves the previous behavior.
sync_file="clusters/$CLUSTER_NAME/flux-system/gotk-sync.yaml"

update_config "$sync_file" \
    "ssh://git@gitea.example.com/edge-gitops/edge-gitops.git" \
    "$GITEA_URL"

update_config "$sync_file" \
    "branch: main" \
    "branch: $GITEA_BRANCH"

# Create environment file consumed by bootstrap.sh
cat > .env << EOF
GITEA_URL=$GITEA_URL
GITEA_BRANCH=$GITEA_BRANCH
CLUSTER_NAME=$CLUSTER_NAME
EOF

echo -e "${GREEN}✓ Created .env file${NC}"

echo ""
echo -e "${GREEN}=== Configuration Complete ===${NC}"
echo ""
echo "Next steps:"
echo "1. Review the changes: git diff"
echo "2. Commit the changes: git add . && git commit -m 'Configure GitOps settings'"
echo "3. Push to repository: git push origin $GITEA_BRANCH"
echo "4. Run bootstrap: ./bootstrap.sh"
echo ""
|
||||
Reference in New Issue
Block a user