Refactor launch-cluster script to include cluster running checks and streamline start process for head and worker nodes
This commit is contained in:
@@ -310,6 +310,34 @@ if [[ "$DAEMON_MODE" == "false" ]] || [[ "$ACTION" == "exec" ]]; then
|
||||
trap cleanup EXIT INT TERM HUP
|
||||
fi
|
||||
|
||||
# Check if cluster is already running
|
||||
check_cluster_running() {
|
||||
local running=false
|
||||
|
||||
# Check Head
|
||||
if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then
|
||||
echo "Warning: Container '$CONTAINER_NAME' is already running on head node ($HEAD_IP)."
|
||||
running=true
|
||||
fi
|
||||
|
||||
# Check Workers
|
||||
for worker in "${WORKER_NODES[@]}"; do
|
||||
if ssh "$worker" "docker ps --format '{{.Names}}' | grep -q '^${CONTAINER_NAME}$'"; then
|
||||
echo "Warning: Container '$CONTAINER_NAME' is already running on worker node ($worker)."
|
||||
running=true
|
||||
fi
|
||||
done
|
||||
|
||||
if [[ "$running" == "true" ]]; then
|
||||
echo "Cluster containers are already running. Please stop them first or use a different name."
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Start Cluster Function
|
||||
start_cluster() {
|
||||
check_cluster_running
|
||||
|
||||
# Start Head Node
|
||||
echo "Starting Head Node on $HEAD_IP..."
|
||||
docker run -d --privileged --gpus all --rm \
|
||||
@@ -341,6 +369,9 @@ for worker in "${WORKER_NODES[@]}"; do
|
||||
--head-ip $HEAD_IP"
|
||||
done
|
||||
|
||||
wait_for_cluster
|
||||
}
|
||||
|
||||
# Wait for Cluster Readiness
|
||||
wait_for_cluster() {
|
||||
echo "Waiting for cluster to be ready..."
|
||||
@@ -365,11 +396,11 @@ wait_for_cluster() {
|
||||
}
|
||||
|
||||
if [[ "$ACTION" == "exec" ]]; then
|
||||
wait_for_cluster
|
||||
echo "Executing command: $COMMAND_TO_RUN"
|
||||
eval "$COMMAND_TO_RUN"
|
||||
start_cluster
|
||||
echo "Executing command on head node: $COMMAND_TO_RUN"
|
||||
docker exec -it "$CONTAINER_NAME" bash -i -c "$COMMAND_TO_RUN"
|
||||
elif [[ "$ACTION" == "start" ]]; then
|
||||
wait_for_cluster
|
||||
start_cluster
|
||||
if [[ "$DAEMON_MODE" == "true" ]]; then
|
||||
echo "Cluster started in background (Daemon mode)."
|
||||
else
|
||||
|
||||
Reference in New Issue
Block a user