Refactor launch-cluster script to include cluster running checks and streamline start process for head and worker nodes

This commit is contained in:
Eugene Rakhmatulin
2025-12-18 14:50:26 -08:00
parent f7a15bfaf5
commit 6c04ebfca1

View File

@@ -310,6 +310,34 @@ if [[ "$DAEMON_MODE" == "false" ]] || [[ "$ACTION" == "exec" ]]; then
trap cleanup EXIT INT TERM HUP trap cleanup EXIT INT TERM HUP
fi fi
# Check if cluster is already running
#
# Looks for a running container named exactly $CONTAINER_NAME on the head node
# (local `docker ps`) and on every host listed in WORKER_NODES (`docker ps`
# executed through ssh).
#
# Globals:   CONTAINER_NAME, HEAD_IP, WORKER_NODES (all read-only here)
# Arguments: none
# Outputs:   one warning line per node where the container was found, plus a
#            final hint, all on stdout
# Returns:   0 when the container was not found anywhere; otherwise the whole
#            script is terminated with `exit 1`
check_cluster_running() {
  local running=false
  local worker remote_check

  # Check Head
  # -x -F: compare whole lines literally. Container names may contain '.',
  # which a regex would treat as a wildcard and match unrelated containers.
  if docker ps --format '{{.Names}}' | grep -qxF -- "$CONTAINER_NAME"; then
    echo "Warning: Container '$CONTAINER_NAME' is already running on head node ($HEAD_IP)."
    running=true
  fi

  # Check Workers
  # The command string is re-parsed by the remote shell, so the name is
  # shell-quoted (%q) before being embedded in it.
  printf -v remote_check "docker ps --format '{{.Names}}' | grep -qxF -- %q" "$CONTAINER_NAME"
  for worker in "${WORKER_NODES[@]}"; do
    # NOTE: a failing ssh (e.g. unreachable host) also yields a non-zero
    # status and is therefore treated as "not running" on that worker.
    if ssh "$worker" "$remote_check"; then
      echo "Warning: Container '$CONTAINER_NAME' is already running on worker node ($worker)."
      running=true
    fi
  done

  if [[ "$running" == "true" ]]; then
    echo "Cluster containers are already running. Please stop them first or use a different name."
    exit 1
  fi
}
# Start Cluster Function
start_cluster() {
check_cluster_running
# Start Head Node # Start Head Node
echo "Starting Head Node on $HEAD_IP..." echo "Starting Head Node on $HEAD_IP..."
docker run -d --privileged --gpus all --rm \ docker run -d --privileged --gpus all --rm \
@@ -341,6 +369,9 @@ for worker in "${WORKER_NODES[@]}"; do
--head-ip $HEAD_IP" --head-ip $HEAD_IP"
done done
wait_for_cluster
}
# Wait for Cluster Readiness # Wait for Cluster Readiness
wait_for_cluster() { wait_for_cluster() {
echo "Waiting for cluster to be ready..." echo "Waiting for cluster to be ready..."
@@ -365,11 +396,11 @@ wait_for_cluster() {
} }
if [[ "$ACTION" == "exec" ]]; then if [[ "$ACTION" == "exec" ]]; then
wait_for_cluster start_cluster
echo "Executing command: $COMMAND_TO_RUN" echo "Executing command on head node: $COMMAND_TO_RUN"
eval "$COMMAND_TO_RUN" docker exec -it "$CONTAINER_NAME" bash -i -c "$COMMAND_TO_RUN"
elif [[ "$ACTION" == "start" ]]; then elif [[ "$ACTION" == "start" ]]; then
wait_for_cluster start_cluster
if [[ "$DAEMON_MODE" == "true" ]]; then if [[ "$DAEMON_MODE" == "true" ]]; then
echo "Cluster started in background (Daemon mode)." echo "Cluster started in background (Daemon mode)."
else else