Refactor launch-cluster script to include cluster running checks and streamline start process for head and worker nodes

This commit is contained in:
Eugene Rakhmatulin
2025-12-18 14:50:26 -08:00
parent f7a15bfaf5
commit 6c04ebfca1

View File

@@ -310,6 +310,34 @@ if [[ "$DAEMON_MODE" == "false" ]] || [[ "$ACTION" == "exec" ]]; then
trap cleanup EXIT INT TERM HUP
fi
# Succeeds when name $1 appears as an exact, whole-line entry in listing $2.
# Arguments: $1 - container name, $2 - newline-separated container names
_container_name_listed() {
    # grep -F -x compares the name literally against whole lines, so a '.' in
    # the name cannot act as a regex wildcard. The -n guard keeps an empty
    # listing (which a here-string turns into one blank line) from matching.
    [[ -n "$2" ]] && grep -Fxq -- "$1" <<<"$2"
}

# Check if cluster is already running
#
# Looks for a container named $CONTAINER_NAME in the `docker ps` listing of
# the local (head) node and of every node in WORKER_NODES (queried over ssh).
# Globals (read): CONTAINER_NAME, HEAD_IP, WORKER_NODES
# Outputs: one warning per node on which the container is listed (stdout);
#          a warning on stderr when a node's container list cannot be read.
# Exits:   terminates the script with status 1 if the container is listed on
#          any node; otherwise returns 0.
check_cluster_running() {
    local running=false
    local worker names

    # Check Head
    # The listing is captured first and matched afterwards instead of piping
    # into `grep -q`: with `pipefail` enabled, grep exiting early could make
    # the pipeline report failure and hide a container that is listed.
    if names=$(docker ps --format '{{.Names}}'); then
        if _container_name_listed "$CONTAINER_NAME" "$names"; then
            echo "Warning: Container '$CONTAINER_NAME' is already running on head node ($HEAD_IP)."
            running=true
        fi
    else
        echo "Warning: Could not list containers on head node ($HEAD_IP)." >&2
    fi

    # Check Workers
    # The ${arr[@]+...} form expands to nothing for an empty or unset array,
    # so this loop is safe under `set -u` on older Bash releases as well.
    for worker in ${WORKER_NODES[@]+"${WORKER_NODES[@]}"}; do
        # Only the fixed listing command is sent to the remote shell; the name
        # is compared locally so it never needs quoting for a second shell.
        # -n keeps ssh from consuming the script's standard input.
        if names=$(ssh -n "$worker" "docker ps --format '{{.Names}}'"); then
            if _container_name_listed "$CONTAINER_NAME" "$names"; then
                echo "Warning: Container '$CONTAINER_NAME' is already running on worker node ($worker)."
                running=true
            fi
        else
            echo "Warning: Could not list containers on worker node ($worker)." >&2
        fi
    done

    if [[ "$running" == "true" ]]; then
        echo "Cluster containers are already running. Please stop them first or use a different name."
        exit 1
    fi
}
# Start Cluster Function
start_cluster() {
check_cluster_running
# Start Head Node
echo "Starting Head Node on $HEAD_IP..."
docker run -d --privileged --gpus all --rm \
@@ -341,6 +369,9 @@ for worker in "${WORKER_NODES[@]}"; do
--head-ip $HEAD_IP"
done
wait_for_cluster
}
# Wait for Cluster Readiness
wait_for_cluster() {
echo "Waiting for cluster to be ready..."
@@ -365,11 +396,11 @@ wait_for_cluster() {
}
if [[ "$ACTION" == "exec" ]]; then
wait_for_cluster
echo "Executing command: $COMMAND_TO_RUN"
eval "$COMMAND_TO_RUN"
start_cluster
echo "Executing command on head node: $COMMAND_TO_RUN"
docker exec -it "$CONTAINER_NAME" bash -i -c "$COMMAND_TO_RUN"
elif [[ "$ACTION" == "start" ]]; then
wait_for_cluster
start_cluster
if [[ "$DAEMON_MODE" == "true" ]]; then
echo "Cluster started in background (Daemon mode)."
else