init commit
This commit is contained in:
61
Makefile
Normal file
61
Makefile
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
# Convenience targets for managing the FluxCD-based GitOps deployment
# (KServe on k3s with GPU support). Run `make help` to list targets.

# Every target here is a command, not a file product -- declare ALL of them
# phony so a same-named file in the repo can never shadow a target.
# (Fix: the original .PHONY listed only 7 of the 14 targets.)
.PHONY: help bootstrap sync status logs clean test apply diff restart \
        gpu-status kserve-status model-logs secrets

help: ## Show this help message
	@echo 'Usage: make [target]'
	@echo ''
	@echo 'Available targets:'
	@awk 'BEGIN {FS = ":.*?## "} /^[a-zA-Z_-]+:.*?## / {printf " %-20s %s\n", $$1, $$2}' $(MAKEFILE_LIST)

bootstrap: ## Bootstrap FluxCD on the cluster
	@./bootstrap.sh

sync: ## Force sync all FluxCD resources
	@flux reconcile kustomization flux-system --with-source
	@flux reconcile helmrelease --all
	# NOTE(review): `flux reconcile helmrelease` normally requires a name and
	# may not accept --all on all flux versions -- verify against your CLI.

status: ## Show status of all FluxCD resources
	@flux get all --all-namespaces

logs: ## Show FluxCD logs
	@flux logs --all-namespaces

clean: ## Remove FluxCD from cluster
	@flux uninstall --namespace=flux-system --silent

test: ## Test the cluster connectivity
	@echo "Testing cluster connectivity..."
	@kubectl cluster-info
	@echo ""
	@echo "Testing GPU availability..."
	@kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'
	@echo ""
	@echo "Testing FluxCD..."
	@flux check

apply: ## Apply all manifests directly (for testing)
	@kubectl apply -k clusters/k3s-dgx

diff: ## Show diff between cluster and git
	@flux diff kustomization flux-system --with-source

restart: ## Restart all FluxCD controllers
	@kubectl rollout restart deployment/source-controller -n flux-system
	@kubectl rollout restart deployment/kustomize-controller -n flux-system
	@kubectl rollout restart deployment/helm-controller -n flux-system
	@kubectl rollout restart deployment/notification-controller -n flux-system

gpu-status: ## Show GPU status
	@kubectl describe nodes | grep -A 5 "nvidia.com/gpu"

kserve-status: ## Show KServe status
	@kubectl get inferenceservices -n kserve
	@kubectl get pods -n kserve

model-logs: ## Show model inference logs
	@kubectl logs -n kserve -l serving.kserve.io/inferenceservice=huihui-granite --tail=100 -f

secrets: ## Generate example secrets (DO NOT COMMIT)
	@echo "Creating example secrets directory..."
	@mkdir -p secrets
	@echo "# Add your secrets here" > secrets/README.md
	@echo "# DO NOT commit actual secrets to git" >> secrets/README.md
|
||||||
165
README.md
Normal file
165
README.md
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
# Edge GitOps - KServe on k3s with GPU
|
||||||
|
|
||||||
|
GitOps setup for deploying ML models using KServe on a k3s cluster with GPU support (DGX Spark).
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- k3s cluster with GPU support
|
||||||
|
- kubectl configured to access the cluster
|
||||||
|
- Gitea instance for GitOps repository
|
||||||
|
- FluxCD CLI installed
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
edge-gitops/
|
||||||
|
├── clusters/
|
||||||
|
│ └── k3s-dgx/
|
||||||
|
│ ├── flux-system/ # FluxCD installation
|
||||||
|
│ ├── gpu-support/ # NVIDIA GPU Operator
|
||||||
|
│ ├── kserve/ # KServe installation
|
||||||
|
│ └── apps/ # ML model deployments
|
||||||
|
├── apps/ # Reusable app manifests
|
||||||
|
└── infrastructure/ # Base infrastructure
|
||||||
|
```
|
||||||
|
|
||||||
|
## Setup Instructions
|
||||||
|
|
||||||
|
### 1. Bootstrap FluxCD
|
||||||
|
|
||||||
|
```bash
|
||||||
|
flux bootstrap git \
|
||||||
|
--url=ssh://git@gitea.example.com/edge-gitops/edge-gitops.git \
|
||||||
|
--branch=main \
|
||||||
|
--path=clusters/k3s-dgx \
|
||||||
|
--components=source-controller,kustomize-controller,helm-controller,notification-controller
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Configure Gitea SSH Key
|
||||||
|
|
||||||
|
Generate SSH key for FluxCD:
|
||||||
|
```bash
|
||||||
|
ssh-keygen -t ed25519 -N "" -f flux-gitea-key
|
||||||
|
```
|
||||||
|
|
||||||
|
Add the public key to your Gitea repository as a deploy key.
|
||||||
|
|
||||||
|
### 3. Update Repository Configuration
|
||||||
|
|
||||||
|
Edit `clusters/k3s-dgx/flux-system/gotk-sync.yaml` to match your Gitea URL:
|
||||||
|
```yaml
|
||||||
|
url: ssh://git@your-gitea-instance.com/edge-gitops/edge-gitops.git
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Deploy the Stack
|
||||||
|
|
||||||
|
Commit and push the changes:
|
||||||
|
```bash
|
||||||
|
git add .
|
||||||
|
git commit -m "Initial GitOps setup for KServe on k3s"
|
||||||
|
git push origin main
|
||||||
|
```
|
||||||
|
|
||||||
|
FluxCD will automatically sync the changes to your cluster.
|
||||||
|
|
||||||
|
## Components
|
||||||
|
|
||||||
|
### GPU Support
|
||||||
|
- NVIDIA GPU Operator (v23.9.1)
|
||||||
|
- NVIDIA Device Plugin
|
||||||
|
- DCGM Exporter for monitoring
|
||||||
|
- GPU Node Feature Discovery
|
||||||
|
|
||||||
|
### KServe
|
||||||
|
- KServe Core (v0.12.0)
|
||||||
|
- GPU-enabled Serving Runtime
|
||||||
|
- Istio Gateway for networking
|
||||||
|
- Model Storage (PVC)
|
||||||
|
|
||||||
|
### Example Model
|
||||||
|
- Huihui-granite-4.1-30b-abliterated (Hugging Face)
|
||||||
|
- GPU-accelerated inference
|
||||||
|
- REST API endpoint
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Deploy a New Model
|
||||||
|
|
||||||
|
1. Create a new InferenceService in `clusters/k3s-dgx/apps/`:
|
||||||
|
```yaml
|
||||||
|
apiVersion: serving.kserve.io/v1beta1
|
||||||
|
kind: InferenceService
|
||||||
|
metadata:
|
||||||
|
name: your-model
|
||||||
|
namespace: kserve
|
||||||
|
spec:
|
||||||
|
predictor:
|
||||||
|
model:
|
||||||
|
modelFormat:
|
||||||
|
name: huggingface
|
||||||
|
storageUri: "hf://your-org/your-model"
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: "1"
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Commit and push changes
|
||||||
|
|
||||||
|
### Test the Model
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Get the service URL
|
||||||
|
kubectl get inferenceservice huihui-granite -n kserve
|
||||||
|
|
||||||
|
# Test inference
|
||||||
|
curl -X POST http://your-service-url/v1/models/huihui-granite:predict \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"inputs": [{"name": "text", "shape": [1], "datatype": "BYTES", "data": ["Hello world"]}]}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Monitoring
|
||||||
|
|
||||||
|
Check FluxCD status:
|
||||||
|
```bash
|
||||||
|
flux get all --all-namespaces
|
||||||
|
```
|
||||||
|
|
||||||
|
Check GPU status:
|
||||||
|
```bash
|
||||||
|
kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Check KServe services:
|
||||||
|
```bash
|
||||||
|
kubectl get inferenceservices -n kserve
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### GPU Not Available
|
||||||
|
```bash
|
||||||
|
kubectl describe node | grep -A 5 nvidia.com/gpu
|
||||||
|
```
|
||||||
|
|
||||||
|
### KServe Pods Not Starting
|
||||||
|
```bash
|
||||||
|
kubectl logs -n kserve deployment/kserve-controller-manager
|
||||||
|
kubectl get pods -n kserve
|
||||||
|
```
|
||||||
|
|
||||||
|
### FluxCD Sync Issues
|
||||||
|
```bash
|
||||||
|
flux reconcile kustomization flux-system --with-source
|
||||||
|
flux logs
|
||||||
|
```
|
||||||
|
|
||||||
|
## Customization
|
||||||
|
|
||||||
|
### GPU Resources
|
||||||
|
Edit `clusters/k3s-dgx/apps/huihui-granite-inference.yaml` to adjust GPU allocation.
|
||||||
|
|
||||||
|
### Storage
|
||||||
|
Modify `clusters/k3s-dgx/kserve/model-storage-pvc.yaml` for different storage requirements.
|
||||||
|
|
||||||
|
### Networking
|
||||||
|
Update `clusters/k3s-dgx/kserve/istio-gateway.yaml` for custom ingress configuration.
|
||||||
297
SETUP.md
Normal file
297
SETUP.md
Normal file
@@ -0,0 +1,297 @@
|
|||||||
|
# Edge GitOps - Complete Setup Guide
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
1. **Configure the setup:**
|
||||||
|
```bash
|
||||||
|
./configure.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Bootstrap FluxCD:**
|
||||||
|
```bash
|
||||||
|
make bootstrap
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Monitor the deployment:**
|
||||||
|
```bash
|
||||||
|
make status
|
||||||
|
```
|
||||||
|
|
||||||
|
## Directory Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
edge-gitops/
|
||||||
|
├── bootstrap.sh # FluxCD bootstrap script
|
||||||
|
├── configure.sh # Configuration wizard
|
||||||
|
├── Makefile # Convenient commands
|
||||||
|
├── README.md # Main documentation
|
||||||
|
├── .gitignore # Git ignore rules
|
||||||
|
├── .env # Environment variables (not committed)
|
||||||
|
│
|
||||||
|
├── clusters/
|
||||||
|
│ └── k3s-dgx/ # Cluster-specific configuration
|
||||||
|
│ ├── kustomization.yaml
|
||||||
|
│ │
|
||||||
|
│ ├── flux-system/ # FluxCD installation
|
||||||
|
│ │ ├── kustomization.yaml
|
||||||
|
│ │ ├── gotk-components.yaml
|
||||||
|
│ │ └── gotk-sync.yaml
|
||||||
|
│ │
|
||||||
|
│ ├── gpu-support/ # NVIDIA GPU Operator
|
||||||
|
│ │ ├── kustomization.yaml
|
||||||
|
│ │ ├── gpu-operator-namespace.yaml
|
||||||
|
│ │ ├── gpu-operator-helmrelease.yaml
|
||||||
|
│ │ └── gpu-node-labels.yaml
|
||||||
|
│ │
|
||||||
|
│ ├── kserve/ # KServe installation
|
||||||
|
│ │ ├── kustomization.yaml
|
||||||
|
│ │ ├── kserve-namespace.yaml
|
||||||
|
│ │ ├── kserve-crds.yaml
|
||||||
|
│ │ ├── kserve-controller.yaml
|
||||||
|
│ │ ├── istio-gateway.yaml
|
||||||
|
│ │ ├── gpu-serving-runtime.yaml
|
||||||
|
│ │ ├── model-storage-pvc.yaml
|
||||||
|
│ │ └── storage-config.yaml
|
||||||
|
│ │
|
||||||
|
│ └── apps/ # ML model deployments
|
||||||
|
│ ├── kustomization.yaml
|
||||||
|
│ └── huihui-granite-inference.yaml
|
||||||
|
│
|
||||||
|
├── apps/ # Reusable application manifests
|
||||||
|
└── infrastructure/ # Base infrastructure components
|
||||||
|
```
|
||||||
|
|
||||||
|
## Component Details
|
||||||
|
|
||||||
|
### FluxCD
|
||||||
|
- **Version:** Latest stable
|
||||||
|
- **Components:** source-controller, kustomize-controller, helm-controller, notification-controller
|
||||||
|
- **Sync Interval:** 1 minute for GitRepository, 10 minutes for Kustomization
|
||||||
|
- **Repository:** Gitea (configurable)
|
||||||
|
|
||||||
|
### NVIDIA GPU Operator
|
||||||
|
- **Version:** v23.9.1
|
||||||
|
- **Driver:** 535.129.03
|
||||||
|
- **Components:**
|
||||||
|
- NVIDIA Driver
|
||||||
|
- Device Plugin
|
||||||
|
- DCGM Exporter
|
||||||
|
- MIG Manager
|
||||||
|
- Node Feature Discovery
|
||||||
|
|
||||||
|
### KServe
|
||||||
|
- **Version:** v0.12.0
|
||||||
|
- **Components:**
|
||||||
|
- KServe Controller
|
||||||
|
- Custom Resource Definitions
|
||||||
|
- GPU Serving Runtime
|
||||||
|
- Istio Integration
|
||||||
|
- Model Storage (50Gi PVC)
|
||||||
|
|
||||||
|
### Example Model
|
||||||
|
- **Name:** Huihui-granite-4.1-30b-abliterated
|
||||||
|
- **Source:** Hugging Face
|
||||||
|
- **Resources:**
|
||||||
|
- CPU: 2-4 cores
|
||||||
|
- Memory: 8-16Gi
|
||||||
|
- GPU: 1 NVIDIA GPU
|
||||||
|
|
||||||
|
## Common Tasks
|
||||||
|
|
||||||
|
### Add a New Model
|
||||||
|
|
||||||
|
1. Create a new InferenceService:
|
||||||
|
```bash
|
||||||
|
cat > clusters/k3s-dgx/apps/your-model.yaml << EOF
|
||||||
|
apiVersion: serving.kserve.io/v1beta1
|
||||||
|
kind: InferenceService
|
||||||
|
metadata:
|
||||||
|
name: your-model
|
||||||
|
namespace: kserve
|
||||||
|
spec:
|
||||||
|
predictor:
|
||||||
|
model:
|
||||||
|
modelFormat:
|
||||||
|
name: huggingface
|
||||||
|
storageUri: "hf://your-org/your-model"
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: "1"
|
||||||
|
EOF
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Update the kustomization:
|
||||||
|
```bash
|
||||||
|
echo " - your-model.yaml" >> clusters/k3s-dgx/apps/kustomization.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Commit and push:
|
||||||
|
```bash
|
||||||
|
git add clusters/k3s-dgx/apps/
|
||||||
|
git commit -m "Add new model deployment"
|
||||||
|
git push
|
||||||
|
```
|
||||||
|
|
||||||
|
### Update GPU Resources
|
||||||
|
|
||||||
|
Edit the resource limits in your InferenceService:
|
||||||
|
```yaml
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: "8"
|
||||||
|
memory: "32Gi"
|
||||||
|
nvidia.com/gpu: "2"
|
||||||
|
requests:
|
||||||
|
cpu: "4"
|
||||||
|
memory: "16Gi"
|
||||||
|
nvidia.com/gpu: "2"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Monitor Model Performance
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Get model endpoint
|
||||||
|
kubectl get inferenceservice your-model -n kserve
|
||||||
|
|
||||||
|
# View logs
|
||||||
|
kubectl logs -n kserve -l serving.kserve.io/inferenceservice=your-model -f
|
||||||
|
|
||||||
|
# Check GPU usage
|
||||||
|
kubectl exec -n kserve <pod-name> -- nvidia-smi
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### FluxCD Not Syncing
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check FluxCD status
|
||||||
|
flux check
|
||||||
|
|
||||||
|
# View logs
|
||||||
|
flux logs
|
||||||
|
|
||||||
|
# Force sync
|
||||||
|
flux reconcile kustomization flux-system --with-source
|
||||||
|
```
|
||||||
|
|
||||||
|
### GPU Not Available
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check GPU nodes
|
||||||
|
kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'
|
||||||
|
|
||||||
|
# Check GPU operator
|
||||||
|
kubectl get pods -n gpu-operator
|
||||||
|
|
||||||
|
# View GPU operator logs
|
||||||
|
kubectl logs -n gpu-operator deployment/gpu-operator
|
||||||
|
```
|
||||||
|
|
||||||
|
### KServe Issues
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check KServe pods
|
||||||
|
kubectl get pods -n kserve
|
||||||
|
|
||||||
|
# Check KServe controller
|
||||||
|
kubectl logs -n kserve deployment/kserve-controller-manager
|
||||||
|
|
||||||
|
# Describe InferenceService
|
||||||
|
kubectl describe inferenceservice your-model -n kserve
|
||||||
|
```
|
||||||
|
|
||||||
|
## Security Considerations
|
||||||
|
|
||||||
|
1. **Secrets Management:**
|
||||||
|
- Never commit secrets to git
|
||||||
|
- Use Kubernetes secrets for sensitive data
|
||||||
|
- Consider using Sealed Secrets or External Secrets Operator
|
||||||
|
|
||||||
|
2. **Network Policies:**
|
||||||
|
- Review and restrict network access
|
||||||
|
- Use Istio for service mesh security
|
||||||
|
|
||||||
|
3. **RBAC:**
|
||||||
|
- Review FluxCD service account permissions
|
||||||
|
- Implement principle of least privilege
|
||||||
|
|
||||||
|
## Performance Optimization
|
||||||
|
|
||||||
|
### GPU Optimization
|
||||||
|
- Use appropriate GPU resource requests/limits
|
||||||
|
- Monitor GPU utilization with DCGM Exporter
|
||||||
|
- Consider MIG (Multi-Instance GPU) for better isolation
|
||||||
|
|
||||||
|
### Storage Optimization
|
||||||
|
- Use fast storage for model cache
|
||||||
|
- Consider using ReadWriteMany for multi-pod access
|
||||||
|
- Implement model caching strategies
|
||||||
|
|
||||||
|
### Network Optimization
|
||||||
|
- Use Istio for efficient load balancing
|
||||||
|
- Configure appropriate timeouts for large models
|
||||||
|
- Consider using gRPC for internal communication
|
||||||
|
|
||||||
|
## Scaling
|
||||||
|
|
||||||
|
### Horizontal Scaling
|
||||||
|
```yaml
|
||||||
|
# Add to InferenceService
|
||||||
|
spec:
|
||||||
|
predictor:
|
||||||
|
replicas: 3
|
||||||
|
```
|
||||||
|
|
||||||
|
### Vertical Scaling
|
||||||
|
```yaml
|
||||||
|
# Update resource limits
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
nvidia.com/gpu: "2"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Monitoring
|
||||||
|
|
||||||
|
### Metrics Collection
|
||||||
|
- DCGM Exporter for GPU metrics
|
||||||
|
- Prometheus for cluster metrics
|
||||||
|
- KServe metrics for inference performance
|
||||||
|
|
||||||
|
### Logging
|
||||||
|
- Structured logging for all components
|
||||||
|
- Centralized logging with Loki/ELK
|
||||||
|
- Log retention policies
|
||||||
|
|
||||||
|
### Alerting
|
||||||
|
- GPU utilization alerts
|
||||||
|
- Model health alerts
|
||||||
|
- Resource exhaustion alerts
|
||||||
|
|
||||||
|
## Backup and Recovery
|
||||||
|
|
||||||
|
### GitOps Backup
|
||||||
|
- All configuration is in git
|
||||||
|
- Easy rollback with git revert
|
||||||
|
- Branch-based testing
|
||||||
|
|
||||||
|
### Data Backup
|
||||||
|
- Model storage backup
|
||||||
|
- Configuration backup
|
||||||
|
- Disaster recovery plan
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
1. Fork the repository
|
||||||
|
2. Create a feature branch
|
||||||
|
3. Make your changes
|
||||||
|
4. Test thoroughly
|
||||||
|
5. Submit a pull request
|
||||||
|
|
||||||
|
## Support
|
||||||
|
|
||||||
|
For issues and questions:
|
||||||
|
- Check the troubleshooting section
|
||||||
|
- Review component documentation
|
||||||
|
- Check FluxCD, KServe, and GPU Operator docs
|
||||||
|
- Open an issue in the repository
|
||||||
96
bootstrap.sh
Executable file
96
bootstrap.sh
Executable file
@@ -0,0 +1,96 @@
|
|||||||
|
#!/bin/bash
#
# Bootstrap FluxCD onto a k3s cluster and verify the installation.
#
# Environment overrides:
#   GITEA_URL      Git repository URL FluxCD syncs from
#   GITEA_BRANCH   Branch to track (default: main)
#   CLUSTER_NAME   Cluster path under clusters/ (default: k3s-dgx)
#   NAMESPACE      Namespace for Flux components (default: flux-system)

# Strict mode: abort on command failure, unset variables, and pipeline errors.
# (Fix: original used only `set -e`.)
set -euo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Configuration (all overridable via environment)
GITEA_URL="${GITEA_URL:-ssh://git@gitea.example.com/edge-gitops/edge-gitops.git}"
GITEA_BRANCH="${GITEA_BRANCH:-main}"
CLUSTER_NAME="${CLUSTER_NAME:-k3s-dgx}"
NAMESPACE="${NAMESPACE:-flux-system}"

echo -e "${GREEN}=== Edge GitOps Bootstrap Script ===${NC}"
echo ""

# Check prerequisites: kubectl, flux CLI, and a reachable cluster.
echo -e "${YELLOW}Checking prerequisites...${NC}"

if ! command -v kubectl &> /dev/null; then
  echo -e "${RED}kubectl is not installed${NC}"
  exit 1
fi

if ! command -v flux &> /dev/null; then
  echo -e "${RED}flux is not installed${NC}"
  echo "Install from: https://fluxcd.io/flux/installation/"
  exit 1
fi

if ! kubectl cluster-info &> /dev/null; then
  echo -e "${RED}Cannot connect to kubernetes cluster${NC}"
  exit 1
fi

echo -e "${GREEN}✓ Prerequisites met${NC}"
echo ""

# If Flux is already installed, ask before tearing it down and reinstalling.
if kubectl get namespace flux-system &> /dev/null; then
  echo -e "${YELLOW}FluxCD is already installed${NC}"
  read -p "Do you want to reinstall? (y/n) " -n 1 -r
  echo
  if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    echo "Exiting..."
    exit 0
  fi
  echo -e "${YELLOW}Uninstalling existing FluxCD...${NC}"
  # Best-effort: don't abort the whole bootstrap if uninstall fails.
  flux uninstall --namespace=flux-system --silent || true
fi

# Bootstrap FluxCD against the Git repository.
echo -e "${YELLOW}Bootstrapping FluxCD...${NC}"
flux bootstrap git \
  --url="$GITEA_URL" \
  --branch="$GITEA_BRANCH" \
  --path="clusters/$CLUSTER_NAME" \
  --namespace="$NAMESPACE" \
  --components=source-controller,kustomize-controller,helm-controller,notification-controller \
  --timeout=10m

echo -e "${GREEN}✓ FluxCD bootstrapped${NC}"
echo ""

# Wait for all controller Deployments to become available.
# (Fix: Deployments expose the 'Available' condition, not 'ready';
#  `--for=condition=ready` would time out against a Deployment.)
echo -e "${YELLOW}Waiting for FluxCD components to be ready...${NC}"
kubectl wait --for=condition=Available --timeout=300s \
  -n "$NAMESPACE" \
  deployment/source-controller \
  deployment/kustomize-controller \
  deployment/helm-controller \
  deployment/notification-controller

echo -e "${GREEN}✓ FluxCD components ready${NC}"
echo ""

# Final sanity check of the Flux installation.
echo -e "${YELLOW}Verifying installation...${NC}"
flux check

echo ""
echo -e "${GREEN}=== Bootstrap Complete ===${NC}"
echo ""
echo "Next steps:"
echo "1. Update the Gitea URL in clusters/k3s-dgx/flux-system/gotk-sync.yaml"
echo "2. Commit and push the changes to your repository"
echo "3. Monitor the sync: flux get all --all-namespaces"
echo ""
echo "Useful commands:"
echo "  flux get all --all-namespaces    # Show all Flux resources"
echo "  flux logs                        # Show Flux logs"
echo "  flux reconcile kustomization flux-system --with-source  # Force sync"
echo ""
|
||||||
22
clusters/k3s-dgx/apps/huihui-granite-inference.yaml
Normal file
22
clusters/k3s-dgx/apps/huihui-granite-inference.yaml
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# KServe InferenceService serving the Huihui granite model from Hugging Face
# on a single NVIDIA GPU.
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: huihui-granite
  namespace: kserve
spec:
  predictor:
    model:
      modelFormat:
        name: huggingface
      args:
        - --model_name=huihui-granite
      storageUri: "hf://huihui-ai/Huihui-granite-4.1-30b-abliterated"
      resources:
        limits:
          cpu: "4"
          memory: 16Gi
          nvidia.com/gpu: "1"
        requests:
          cpu: "2"
          memory: 8Gi
          nvidia.com/gpu: "1"
|
||||||
5
clusters/k3s-dgx/apps/kustomization.yaml
Normal file
5
clusters/k3s-dgx/apps/kustomization.yaml
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
# Kustomization for ML model deployments; everything lands in the kserve namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kserve
resources:
  - huihui-granite-inference.yaml
|
||||||
50
clusters/k3s-dgx/flux-system/gotk-components.yaml
Normal file
50
clusters/k3s-dgx/flux-system/gotk-components.yaml
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
# Flux system components: namespace, the helm-controller service account and
# its cluster role binding, and the HelmRepository sources used by releases.
apiVersion: v1
kind: Namespace
metadata:
  name: flux-system
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: helm-controller
  namespace: flux-system
---
# NOTE(review): this grants helm-controller full cluster-admin; consider a
# narrower ClusterRole (principle of least privilege).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: cluster-reconciler
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
  - kind: ServiceAccount
    name: helm-controller
    namespace: flux-system
---
# NOTE(review): HelmRepository under source.toolkit.fluxcd.io/v1 requires a
# recent Flux release; older controllers only serve v1beta2 -- verify.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: bitnami
  namespace: flux-system
spec:
  interval: 30m
  url: https://charts.bitnami.com/bitnami
---
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: kserve
  namespace: flux-system
spec:
  interval: 30m
  url: https://kserve.github.io/kserve
---
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: nvidia
  namespace: flux-system
spec:
  interval: 30m
  url: https://nvidia.github.io/k8s-device-plugin
|
||||||
34
clusters/k3s-dgx/flux-system/gotk-sync.yaml
Normal file
34
clusters/k3s-dgx/flux-system/gotk-sync.yaml
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
# Flux sync objects: the GitRepository source and the root Kustomization that
# reconciles clusters/k3s-dgx from it.
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: flux-system
  namespace: flux-system
spec:
  interval: 1m0s
  ref:
    branch: main
  secretRef:
    name: flux-system
  url: ssh://git@gitea.example.com/edge-gitops/edge-gitops.git
---
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: flux-system
  namespace: flux-system
spec:
  interval: 10m0s
  path: ./clusters/k3s-dgx
  # prune: delete cluster objects that are removed from git
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
  # Reconciliation is only marked healthy once these controllers are up.
  healthChecks:
    - apiVersion: apps/v1
      kind: Deployment
      name: kustomize-controller
      namespace: flux-system
    - apiVersion: apps/v1
      kind: Deployment
      name: helm-controller
      namespace: flux-system
|
||||||
13
clusters/k3s-dgx/flux-system/kustomization.yaml
Normal file
13
clusters/k3s-dgx/flux-system/kustomization.yaml
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
# Flux system kustomization: installs the components and sync objects, and
# patches every controller Deployment to raise reconcile concurrency.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - gotk-components.yaml
  - gotk-sync.yaml
patches:
  - patch: |
      - op: add
        path: /spec/template/spec/containers/0/args/-
        value: --concurrency=20
    target:
      kind: Deployment
      name: "(kustomize-controller|helm-controller|notification-controller|source-controller)"
|
||||||
13
clusters/k3s-dgx/gpu-support/gpu-node-labels.yaml
Normal file
13
clusters/k3s-dgx/gpu-support/gpu-node-labels.yaml
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
# ConfigMap holding the label set applied to GPU nodes.
# NOTE(review): this ConfigMap is data only -- confirm something actually
# consumes labels.yaml and applies these labels to nodes.
apiVersion: v1
kind: ConfigMap
metadata:
  name: gpu-node-labels
  namespace: gpu-operator
data:
  labels.yaml: |
    - key: accelerator
      value: nvidia-tesla
    - key: nvidia.com/gpu.present
      value: "true"
    - key: topology.kubernetes.io/zone
      value: "dgx-spark"
|
||||||
70
clusters/k3s-dgx/gpu-support/gpu-operator-helmrelease.yaml
Normal file
70
clusters/k3s-dgx/gpu-support/gpu-operator-helmrelease.yaml
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
# NVIDIA GPU Operator: Helm repository source plus the HelmRelease that
# installs the operator (driver, toolkit, device plugin, DCGM, MIG, GFD, NFD).
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: nvidia
  namespace: gpu-operator
spec:
  interval: 10m
  url: https://nvidia.github.io/gpu-operator
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: gpu-operator
  namespace: gpu-operator
spec:
  interval: 10m
  chart:
    spec:
      chart: gpu-operator
      version: "v23.9.1"
      sourceRef:
        kind: HelmRepository
        name: nvidia
        namespace: gpu-operator
  values:
    driver:
      enabled: true
      image: "nvcr.io/nvidia/driver"
      version: "535.129.03"
    operator:
      # NOTE(review): gpu-operator's defaultRuntime normally takes
      # docker/containerd/crio -- verify "nvidia-container-runtime" is valid.
      defaultRuntime: nvidia-container-runtime
    toolkit:
      enabled: true
      # NOTE(review): this is the device-plugin image, not the container
      # toolkit image (nvcr.io/nvidia/k8s/container-toolkit) -- verify.
      image: "nvcr.io/nvidia/k8s-device-plugin"
      version: "v0.14.0"
      # k3s-specific containerd paths (k3s does not use the default
      # /etc/containerd locations).
      env:
        - name: CONTAINERD_CONFIG
          value: /var/lib/rancher/k3s/agent/etc/containerd/config.toml
        - name: CONTAINERD_SOCKET
          value: /run/k3s/containerd/containerd.sock
        - name: CONTAINERD_RUNTIME_CLASS
          value: nvidia
    devicePlugin:
      enabled: true
      image: "nvcr.io/nvidia/k8s-device-plugin"
      version: "v0.14.0"
      resources:
        requests:
          cpu: 100m
          memory: 100Mi
        limits:
          cpu: 500m
          memory: 500Mi
    dcgm:
      enabled: true
      image: "nvcr.io/nvidia/dcgm-exporter"
      version: "3.3.3-3.1.0-ubuntu22.04"
      resources:
        requests:
          cpu: 100m
          memory: 100Mi
        limits:
          cpu: 500m
          memory: 500Mi
    migManager:
      enabled: true
    gfd:
      enabled: true
    node-feature-discovery:
      enabled: true
|
||||||
6
clusters/k3s-dgx/gpu-support/gpu-operator-namespace.yaml
Normal file
6
clusters/k3s-dgx/gpu-support/gpu-operator-namespace.yaml
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
# Namespace for the NVIDIA GPU Operator and its operands.
apiVersion: v1
kind: Namespace
metadata:
  name: gpu-operator
  labels:
    # NOTE(review): this label is an OpenShift monitoring opt-in; it has no
    # effect on plain k3s -- confirm it is intentional.
    openshift.io/cluster-monitoring: "true"
|
||||||
7
clusters/k3s-dgx/gpu-support/kustomization.yaml
Normal file
7
clusters/k3s-dgx/gpu-support/kustomization.yaml
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
# Kustomization for GPU support manifests in the gpu-operator namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: gpu-operator
resources:
  - gpu-operator-namespace.yaml
  - gpu-operator-helmrelease.yaml
  - gpu-node-labels.yaml
|
||||||
39
clusters/k3s-dgx/gpu-support/nvidia-device-plugin.yaml
Normal file
39
clusters/k3s-dgx/gpu-support/nvidia-device-plugin.yaml
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
# Standalone NVIDIA device-plugin DaemonSet.
# NOTE(review): the GPU Operator HelmRelease also deploys a device plugin --
# confirm both are not enabled at the same time.
apiVersion: v1
kind: Namespace
metadata:
  name: gpu-operator
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: gpu-operator
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      # Allow scheduling on GPU nodes tainted with nvidia.com/gpu.
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.0
          name: nvidia-device-plugin-ctr
          # Keep the pod alive on nodes without GPUs instead of crash-looping.
          args: ["--fail-on-init-error=false"]
          env:
            - name: NVIDIA_VISIBLE_DEVICES
              value: "all"
            - name: MIG_STRATEGY
              value: "single"
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
|
||||||
16
clusters/k3s-dgx/gpu-support/runtime-class.yaml
Normal file
16
clusters/k3s-dgx/gpu-support/runtime-class.yaml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# RuntimeClass selecting the NVIDIA container runtime for GPU pods, with a
# fixed per-pod resource overhead and GPU-node scheduling constraints.
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
overhead:
  podFixed:
    memory: "1Gi"
    cpu: "500m"
scheduling:
  nodeSelector:
    nvidia.com/gpu.present: "true"
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
|
||||||
42
clusters/k3s-dgx/kserve/gpu-serving-runtime.yaml
Normal file
42
clusters/k3s-dgx/kserve/gpu-serving-runtime.yaml
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
# KServe ServingRuntime exposing a GPU-backed server for common model formats.
# Fix: ServingRuntime is served under serving.kserve.io/v1alpha1 in KServe
# (v1beta1 only carries InferenceService); the original apiVersion would be
# rejected by the API server.
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
  name: kserve-gpu-runtime
  namespace: kserve
spec:
  supportedModelFormats:
    - name: tensorflow
      version: "2"
      autoSelect: true
    - name: pytorch
      version: "2"
      autoSelect: true
    - name: sklearn
      version: "1"
      autoSelect: true
    - name: xgboost
      version: "1"
      autoSelect: true
  # NOTE(review): the extraction lost indentation; the original had a bare
  # "protocol: v1" line whose placement is ambiguous (ServingRuntimeSpec has
  # no top-level 'protocol' field) -- verify against the original file.
  protocolVersions:
    - v1
    - v2
  containers:
    - name: kserve-container
      image: kserve/sklearnserver-gpu:latest
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
          nvidia.com/gpu: "1"
        limits:
          cpu: "4"
          memory: "8Gi"
          nvidia.com/gpu: "1"
      env:
        - name: NVIDIA_VISIBLE_DEVICES
          value: "all"
        - name: NVIDIA_DRIVER_CAPABILITIES
          value: "compute,utility"
  multiModel: false
  disabled: false
|
||||||
35
clusters/k3s-dgx/kserve/istio-gateway.yaml
Normal file
35
clusters/k3s-dgx/kserve/istio-gateway.yaml
Normal file
@@ -0,0 +1,35 @@
# Istio ingress for KServe: a Gateway exposing plain HTTP on port 80
# and a VirtualService routing the KServe v1 prediction API.
apiVersion: networking.istio.io/v1beta1
kind: Gateway
metadata:
  name: kserve-gateway
  namespace: kserve
spec:
  # Bind to the default Istio ingress gateway deployment.
  selector:
    istio: ingressgateway
  servers:
  - port:
      number: 80
      name: http
      protocol: HTTP
    # Accept any host header. NOTE(review): no TLS server is defined;
    # traffic is unencrypted — confirm this is intentional for this edge setup.
    hosts:
    - "*"
---
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
  name: kserve-vs
  namespace: kserve
spec:
  hosts:
  - "*"
  gateways:
  - kserve-gateway
  http:
  # Route KServe v1 REST prediction calls (/v1/models/...) to the backend.
  - match:
    - uri:
        prefix: /v1/models/
    route:
    - destination:
        # NOTE(review): assumes a Service named `kserve-default` exists in
        # this namespace — not defined in this repo chunk; verify.
        host: kserve-default
        port:
          number: 80
43
clusters/k3s-dgx/kserve/kserve-controller.yaml
Normal file
43
clusters/k3s-dgx/kserve/kserve-controller.yaml
Normal file
@@ -0,0 +1,43 @@
# FluxCD source + release for the KServe controller Helm chart.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: kserve
  namespace: kserve
spec:
  interval: 10m
  # NOTE(review): upstream KServe primarily publishes charts as OCI
  # artifacts (oci://ghcr.io/kserve/charts); confirm this HTTP repo URL
  # actually serves a Helm index.
  url: https://kserve.github.io/kserve
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kserve
  namespace: kserve
spec:
  interval: 10m
  chart:
    spec:
      chart: kserve
      version: "v0.12.0"
      sourceRef:
        kind: HelmRepository
        name: kserve
        namespace: kserve
  values:
    controller:
      resources:
        requests:
          cpu: 500m
          memory: 512Mi
        limits:
          cpu: 2
          memory: 2Gi
    config:
      ingress:
        className: istio
      storage:
        initialCapacity: 10Gi
        # k3s built-in local-path provisioner.
        storageClassName: local-path
    # NOTE(review): `knative`/`istio` toggles are assumed to exist in this
    # chart's values schema — verify against the chart version in use.
    knative:
      enabled: true
    istio:
      enabled: true
7
clusters/k3s-dgx/kserve/kserve-namespace.yaml
Normal file
7
clusters/k3s-dgx/kserve/kserve-namespace.yaml
Normal file
@@ -0,0 +1,7 @@
# Namespace for KServe, with Istio sidecar injection enabled so serving
# pods participate in the mesh.
apiVersion: v1
kind: Namespace
metadata:
  name: kserve
  labels:
    istio-injection: enabled
    serving.kserve.io/serving-runtime: "true"
10
clusters/k3s-dgx/kserve/kustomization.yaml
Normal file
10
clusters/k3s-dgx/kserve/kustomization.yaml
Normal file
@@ -0,0 +1,10 @@
# Kustomization for the KServe stack. Storage/runtime manifests are kept
# commented out until the controller is up (they require KServe CRDs /
# the PVC provisioner to exist first).
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kserve
resources:
- kserve-namespace.yaml
- kserve-controller.yaml
- istio-gateway.yaml
# - gpu-serving-runtime.yaml
# - model-storage-pvc.yaml
# - storage-config.yaml
12
clusters/k3s-dgx/kserve/model-storage-pvc.yaml
Normal file
12
clusters/k3s-dgx/kserve/model-storage-pvc.yaml
Normal file
@@ -0,0 +1,12 @@
# Persistent storage for downloaded/serialized models served by KServe.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-storage-pvc
  namespace: kserve
spec:
  # local-path volumes are node-local, so single-writer access is the
  # only mode the provisioner supports anyway.
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 50Gi
  # k3s built-in local-path provisioner.
  storageClassName: local-path
20
clusters/k3s-dgx/kserve/storage-config.yaml
Normal file
20
clusters/k3s-dgx/kserve/storage-config.yaml
Normal file
@@ -0,0 +1,20 @@
# Model-storage configuration consumed by the serving stack.
# NOTE(review): KServe's canonical `storage-config` is a Secret holding
# per-backend credentials; this ConfigMap schema looks custom — confirm
# what actually reads it before enabling in kustomization.yaml.
apiVersion: v1
kind: ConfigMap
metadata:
  name: storage-config
  namespace: kserve
data:
  storageConfig.yaml: |
    # Fixed: original said "pvc://model-storage", but the PVC declared in
    # model-storage-pvc.yaml is named "model-storage-pvc".
    defaultStorageUri: "pvc://model-storage-pvc"
    storageSpec:
    - name: model-storage
      type: pvc
      pvcSpec:
        accessModes:
        - ReadWriteOnce
        resources:
          requests:
            storage: 50Gi
        storageClassName: local-path
    modelCacheSize: 10Gi
    modelCacheMemory: 2Gi
7
clusters/k3s-dgx/kustomization.yaml
Normal file
7
clusters/k3s-dgx/kustomization.yaml
Normal file
@@ -0,0 +1,7 @@
# Root kustomization for the k3s-dgx cluster: aggregates FluxCD itself,
# GPU node support, the KServe stack, and workload apps.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- flux-system
- gpu-support
- kserve
- apps
88
configure.sh
Executable file
88
configure.sh
Executable file
@@ -0,0 +1,88 @@
#!/bin/bash
#
# Configuration script for Edge GitOps.
#
# Interactively collects the Gitea repository URL, branch, and cluster
# name; rewrites the FluxCD sync manifest accordingly; and records the
# answers in a local .env file for later tooling (e.g. bootstrap.sh).

# Strict mode: exit on error, error on unset variables, fail pipelines
# if any stage fails (original only had `set -e`).
set -euo pipefail

# ANSI terminal colors.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'

echo -e "${BLUE}=== Edge GitOps Configuration ===${NC}"
echo ""

# update_config FILE KEY VALUE
# Replace every occurrence of KEY with VALUE in FILE, in place.
# NOTE(review): KEY and VALUE are spliced into a sed expression with '|'
# as the delimiter, so values containing '|' or sed metacharacters will
# corrupt the substitution — acceptable for URLs/branch names, but worth
# hardening if inputs become less trusted.
update_config() {
  local file=$1
  local key=$2
  local value=$3

  if [ -f "$file" ]; then
    # GNU sed in-place syntax; BSD/macOS sed would need `sed -i ''`.
    sed -i "s|$key|$value|g" "$file"
    echo -e "${GREEN}✓ Updated $file${NC}"
  else
    echo -e "${RED}✗ File not found: $file${NC}"
  fi
}

# Get Gitea URL (-r added so backslashes in input aren't mangled).
echo -e "${YELLOW}Enter your Gitea repository URL:${NC}"
read -r -p "URL (default: ssh://git@gitea.example.com/edge-gitops/edge-gitops.git): " GITEA_URL
GITEA_URL=${GITEA_URL:-ssh://git@gitea.example.com/edge-gitops/edge-gitops.git}

# Get branch name
echo -e "${YELLOW}Enter your branch name:${NC}"
read -r -p "Branch (default: main): " GITEA_BRANCH
GITEA_BRANCH=${GITEA_BRANCH:-main}

# Get cluster name
echo -e "${YELLOW}Enter your cluster name:${NC}"
read -r -p "Cluster name (default: k3s-dgx): " CLUSTER_NAME
CLUSTER_NAME=${CLUSTER_NAME:-k3s-dgx}

echo ""
echo -e "${BLUE}Configuration Summary:${NC}"
echo " Gitea URL: $GITEA_URL"
echo " Branch: $GITEA_BRANCH"
echo " Cluster: $CLUSTER_NAME"
echo ""

# Single-keypress confirmation before touching any files.
read -r -p "Apply these settings? (y/n) " -n 1
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "Aborting..."
  exit 0
fi

# Update configuration files
echo -e "${YELLOW}Updating configuration files...${NC}"

update_config "clusters/k3s-dgx/flux-system/gotk-sync.yaml" \
  "ssh://git@gitea.example.com/edge-gitops/edge-gitops.git" \
  "$GITEA_URL"

update_config "clusters/k3s-dgx/flux-system/gotk-sync.yaml" \
  "branch: main" \
  "branch: $GITEA_BRANCH"

# Persist the answers for other scripts. NOTE(review): CLUSTER_NAME is
# collected and saved here but nothing in this script uses it — verify
# bootstrap.sh consumes it.
cat > .env << EOF
GITEA_URL=$GITEA_URL
GITEA_BRANCH=$GITEA_BRANCH
CLUSTER_NAME=$CLUSTER_NAME
EOF

echo -e "${GREEN}✓ Created .env file${NC}"

echo ""
echo -e "${GREEN}=== Configuration Complete ===${NC}"
echo ""
echo "Next steps:"
echo "1. Review the changes: git diff"
echo "2. Commit the changes: git add . && git commit -m 'Configure GitOps settings'"
echo "3. Push to repository: git push origin $GITEA_BRANCH"
echo "4. Run bootstrap: ./bootstrap.sh"
echo ""
Reference in New Issue
Block a user