Refactor launch-cluster script to include cluster running checks and streamline start process for head and worker nodes

This commit is contained in:
Eugene Rakhmatulin
2025-12-18 14:50:26 -08:00
parent f7a15bfaf5
commit 6c04ebfca1

View File

@@ -310,9 +310,37 @@ if [[ "$DAEMON_MODE" == "false" ]] || [[ "$ACTION" == "exec" ]]; then
trap cleanup EXIT INT TERM HUP trap cleanup EXIT INT TERM HUP
fi fi
# Start Head Node # Check if cluster is already running
# check_cluster_running: refuse to launch when a container named
# $CONTAINER_NAME already shows up in `docker ps` on this machine or on any
# host listed in WORKER_NODES.
# Globals read: CONTAINER_NAME, WORKER_NODES, HEAD_IP (used in message text only).
# Output: one warning line per node with a match, then a summary line (stdout).
# Exit: calls `exit 1` (not `return`) when any match was found; otherwise
# falls through with no output.
# NOTE(review): in this side-by-side diff view, the next two lines each begin
# with a removed old-side line (the `echo "Starting Head Node..."` and the
# `docker run ...` text) ahead of the new code. The function itself starts at
# `check_cluster_running() {` -- confirm against the real script before copying.
echo "Starting Head Node on $HEAD_IP..." check_cluster_running() {
docker run -d --privileged --gpus all --rm \ local running=false
# `running` is function-local and acts as an accumulator: every node is
# checked and reported before the function decides whether to exit.
# Check Head
# docker is invoked directly here (no ssh); the warning text labels this
# machine as the head node. The name is matched with an anchored grep pattern
# (^...$), so only a whole-line match counts.
# NOTE(review): CONTAINER_NAME is interpolated into the pattern as a regex, so
# characters such as '.' are not matched literally -- confirm names are plain.
if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then
echo "Warning: Container '$CONTAINER_NAME' is already running on head node ($HEAD_IP)."
running=true
fi
# Check Workers
# The remote command is a double-quoted string, so ${CONTAINER_NAME} is
# expanded locally before ssh sends it; the single-quoted pieces reach the
# remote shell as literals.
# NOTE(review): any non-zero status from ssh (presumably including a failed
# connection) is treated the same as "no matching container" -- confirm this
# is the intended behavior for unreachable workers.
for worker in "${WORKER_NODES[@]}"; do
if ssh "$worker" "docker ps --format '{{.Names}}' | grep -q '^${CONTAINER_NAME}$'"; then
echo "Warning: Container '$CONTAINER_NAME' is already running on worker node ($worker)."
running=true
fi
done
# Decide only after all nodes have been reported. `exit` terminates the
# calling shell, so nothing after the call site runs when a match exists.
if [[ "$running" == "true" ]]; then
echo "Cluster containers are already running. Please stop them first or use a different name."
exit 1
fi
}
# Start Cluster Function
start_cluster() {
check_cluster_running
# Start Head Node
echo "Starting Head Node on $HEAD_IP..."
docker run -d --privileged --gpus all --rm \
--ipc=host --network host \ --ipc=host --network host \
--name "$CONTAINER_NAME" \ --name "$CONTAINER_NAME" \
-e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 \ -e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 \
@@ -324,8 +352,8 @@ docker run -d --privileged --gpus all --rm \
--eth-if "$ETH_IF" \ --eth-if "$ETH_IF" \
--ib-if "$IB_IF" --ib-if "$IB_IF"
# Start Worker Nodes # Start Worker Nodes
for worker in "${WORKER_NODES[@]}"; do for worker in "${WORKER_NODES[@]}"; do
echo "Starting Worker Node on $worker..." echo "Starting Worker Node on $worker..."
ssh "$worker" "docker run -d --privileged --gpus all --rm \ ssh "$worker" "docker run -d --privileged --gpus all --rm \
--ipc=host --network host \ --ipc=host --network host \
@@ -339,7 +367,10 @@ for worker in "${WORKER_NODES[@]}"; do
--eth-if $ETH_IF \ --eth-if $ETH_IF \
--ib-if $IB_IF \ --ib-if $IB_IF \
--head-ip $HEAD_IP" --head-ip $HEAD_IP"
done done
wait_for_cluster
}
# Wait for Cluster Readiness # Wait for Cluster Readiness
wait_for_cluster() { wait_for_cluster() {
@@ -365,11 +396,11 @@ wait_for_cluster() {
} }
if [[ "$ACTION" == "exec" ]]; then if [[ "$ACTION" == "exec" ]]; then
wait_for_cluster start_cluster
echo "Executing command: $COMMAND_TO_RUN" echo "Executing command on head node: $COMMAND_TO_RUN"
eval "$COMMAND_TO_RUN" docker exec -it "$CONTAINER_NAME" bash -i -c "$COMMAND_TO_RUN"
elif [[ "$ACTION" == "start" ]]; then elif [[ "$ACTION" == "start" ]]; then
wait_for_cluster start_cluster
if [[ "$DAEMON_MODE" == "true" ]]; then if [[ "$DAEMON_MODE" == "true" ]]; then
echo "Cluster started in background (Daemon mode)." echo "Cluster started in background (Daemon mode)."
else else