From f7a15bfaf5158069294a0eaa9ab5622521e94a61 Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Thu, 18 Dec 2025 14:22:48 -0800
Subject: [PATCH] Enhance launch-cluster script with improved SSH connectivity checks for worker nodes

---
Review note: this patch was restored from a whitespace-collapsed copy, so
indentation and blank context lines are a best-effort reconstruction (apply
with --ignore-whitespace if the context does not match). The SSH check was
revised to report failures on stderr, keep ssh off the script's stdin (-n)
and show ssh's own error text (LogLevel=ERROR instead of 2>/dev/null). The
post-image blob id on the index line predates that revision.

 launch-cluster.sh | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/launch-cluster.sh b/launch-cluster.sh
index 29de635..55cb870 100755
--- a/launch-cluster.sh
+++ b/launch-cluster.sh
@@ -77,7 +77,7 @@ fi
 if [[ -z "$ETH_IF" || -z "$IB_IF" ]]; then
     echo "Auto-detecting interfaces..."
 
-    # Get all Up interfaces: "mlx5_0 port 1 ==> enp1s0f0np0 (Up)"
+    # Get all Up interfaces: "rocep1s0f1 port 1 ==> enp1s0f1np1 (Up)"
     # We capture: IB_DEV, NET_DEV
     mapfile -t IB_NET_PAIRS < <(ibdev2netdev | awk '/Up\)/ {print $1 " " $5}')
 
@@ -229,6 +229,28 @@ echo "Worker Nodes: ${WORKER_NODES[*]}"
 echo "Container Name: $CONTAINER_NAME"
 echo "Action: $ACTION"
 
+# Check SSH connectivity to worker nodes before any action that needs them.
+if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ]]; then
+    if (( ${#WORKER_NODES[@]} > 0 )); then
+        echo "Checking SSH connectivity to worker nodes..."
+        for worker in "${WORKER_NODES[@]}"; do
+            # BatchMode makes ssh fail instead of prompting for a password.
+            # -n keeps ssh from consuming the script's stdin.
+            # LogLevel=ERROR hides the "Permanently added" notice but still
+            # shows the real failure reason (auth, timeout, DNS).
+            # NOTE(review): StrictHostKeyChecking=no skips host key
+            # verification; consider accept-new if the OpenSSH version allows.
+            if ! ssh -n -o BatchMode=yes -o ConnectTimeout=5 \
+                -o StrictHostKeyChecking=no -o LogLevel=ERROR "$worker" true; then
+                echo "Error: Passwordless SSH to $worker failed." >&2
+                echo " Please ensure SSH keys are configured and the host is reachable." >&2
+                exit 1
+            fi
+            echo " SSH to $worker: OK"
+        done
+    fi
+fi
+
 if [[ "$CHECK_CONFIG" == "true" ]]; then
     echo "Configuration Check Complete."
     echo " Image Name: $IMAGE_NAME"