Refactor launch-cluster script to include cluster running checks and streamline start process for head and worker nodes
This commit is contained in:
@@ -310,36 +310,67 @@ if [[ "$DAEMON_MODE" == "false" ]] || [[ "$ACTION" == "exec" ]]; then
|
|||||||
trap cleanup EXIT INT TERM HUP
|
trap cleanup EXIT INT TERM HUP
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Start Head Node
|
# Check if cluster is already running
|
||||||
echo "Starting Head Node on $HEAD_IP..."
|
check_cluster_running() {
|
||||||
docker run -d --privileged --gpus all --rm \
|
local running=false
|
||||||
--ipc=host --network host \
|
|
||||||
--name "$CONTAINER_NAME" \
|
# Check Head
|
||||||
-e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 \
|
if docker ps --format '{{.Names}}' | grep -q "^${CONTAINER_NAME}$"; then
|
||||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
echo "Warning: Container '$CONTAINER_NAME' is already running on head node ($HEAD_IP)."
|
||||||
"$IMAGE_NAME" \
|
running=true
|
||||||
./run-cluster-node.sh \
|
fi
|
||||||
--role head \
|
|
||||||
--host-ip "$HEAD_IP" \
|
# Check Workers
|
||||||
--eth-if "$ETH_IF" \
|
for worker in "${WORKER_NODES[@]}"; do
|
||||||
--ib-if "$IB_IF"
|
if ssh "$worker" "docker ps --format '{{.Names}}' | grep -q '^${CONTAINER_NAME}$'"; then
|
||||||
|
echo "Warning: Container '$CONTAINER_NAME' is already running on worker node ($worker)."
|
||||||
|
running=true
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [[ "$running" == "true" ]]; then
|
||||||
|
echo "Cluster containers are already running. Please stop them first or use a different name."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
# Start Worker Nodes
|
# Start Cluster Function
|
||||||
for worker in "${WORKER_NODES[@]}"; do
|
start_cluster() {
|
||||||
echo "Starting Worker Node on $worker..."
|
check_cluster_running
|
||||||
ssh "$worker" "docker run -d --privileged --gpus all --rm \
|
|
||||||
|
# Start Head Node
|
||||||
|
echo "Starting Head Node on $HEAD_IP..."
|
||||||
|
docker run -d --privileged --gpus all --rm \
|
||||||
--ipc=host --network host \
|
--ipc=host --network host \
|
||||||
--name $CONTAINER_NAME \
|
--name "$CONTAINER_NAME" \
|
||||||
-e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 \
|
-e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 \
|
||||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||||
$IMAGE_NAME \
|
"$IMAGE_NAME" \
|
||||||
./run-cluster-node.sh \
|
./run-cluster-node.sh \
|
||||||
--role node \
|
--role head \
|
||||||
--host-ip $worker \
|
--host-ip "$HEAD_IP" \
|
||||||
--eth-if $ETH_IF \
|
--eth-if "$ETH_IF" \
|
||||||
--ib-if $IB_IF \
|
--ib-if "$IB_IF"
|
||||||
--head-ip $HEAD_IP"
|
|
||||||
done
|
# Start Worker Nodes
|
||||||
|
for worker in "${WORKER_NODES[@]}"; do
|
||||||
|
echo "Starting Worker Node on $worker..."
|
||||||
|
ssh "$worker" "docker run -d --privileged --gpus all --rm \
|
||||||
|
--ipc=host --network host \
|
||||||
|
--name $CONTAINER_NAME \
|
||||||
|
-e NCCL_DEBUG=INFO -e NCCL_IGNORE_CPU_AFFINITY=1 \
|
||||||
|
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||||
|
$IMAGE_NAME \
|
||||||
|
./run-cluster-node.sh \
|
||||||
|
--role node \
|
||||||
|
--host-ip $worker \
|
||||||
|
--eth-if $ETH_IF \
|
||||||
|
--ib-if $IB_IF \
|
||||||
|
--head-ip $HEAD_IP"
|
||||||
|
done
|
||||||
|
|
||||||
|
wait_for_cluster
|
||||||
|
}
|
||||||
|
|
||||||
# Wait for Cluster Readiness
|
# Wait for Cluster Readiness
|
||||||
wait_for_cluster() {
|
wait_for_cluster() {
|
||||||
@@ -365,11 +396,11 @@ wait_for_cluster() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if [[ "$ACTION" == "exec" ]]; then
|
if [[ "$ACTION" == "exec" ]]; then
|
||||||
wait_for_cluster
|
start_cluster
|
||||||
echo "Executing command: $COMMAND_TO_RUN"
|
echo "Executing command on head node: $COMMAND_TO_RUN"
|
||||||
eval "$COMMAND_TO_RUN"
|
docker exec -it "$CONTAINER_NAME" bash -i -c "$COMMAND_TO_RUN"
|
||||||
elif [[ "$ACTION" == "start" ]]; then
|
elif [[ "$ACTION" == "start" ]]; then
|
||||||
wait_for_cluster
|
start_cluster
|
||||||
if [[ "$DAEMON_MODE" == "true" ]]; then
|
if [[ "$DAEMON_MODE" == "true" ]]; then
|
||||||
echo "Cluster started in background (Daemon mode)."
|
echo "Cluster started in background (Daemon mode)."
|
||||||
else
|
else
|
||||||
|
|||||||
Reference in New Issue
Block a user