Enhance launch-cluster script with improved SSH connectivity checks for worker nodes

This commit is contained in:
Eugene Rakhmatulin
2025-12-18 14:22:48 -08:00
parent 25b1d8eb4f
commit f7a15bfaf5

View File

@@ -77,7 +77,7 @@ fi
if [[ -z "$ETH_IF" || -z "$IB_IF" ]]; then if [[ -z "$ETH_IF" || -z "$IB_IF" ]]; then
echo "Auto-detecting interfaces..." echo "Auto-detecting interfaces..."
# Get all Up interfaces: "mlx5_0 port 1 ==> enp1s0f0np0 (Up)" # Get all Up interfaces: "rocep1s0f1 port 1 ==> enp1s0f1np1 (Up)"
# We capture: IB_DEV, NET_DEV # We capture: IB_DEV, NET_DEV
mapfile -t IB_NET_PAIRS < <(ibdev2netdev | awk '/Up\)/ {print $1 " " $5}') mapfile -t IB_NET_PAIRS < <(ibdev2netdev | awk '/Up\)/ {print $1 " " $5}')
@@ -229,6 +229,21 @@ echo "Worker Nodes: ${WORKER_NODES[*]}"
echo "Container Name: $CONTAINER_NAME" echo "Container Name: $CONTAINER_NAME"
echo "Action: $ACTION" echo "Action: $ACTION"
#######################################
# Verify non-interactive (key-based) SSH access to each given host.
# Probing stops at the first host that fails.
# Arguments: $@ - hosts to probe; with no arguments nothing is done
# Outputs:   progress lines to stdout; failure details to stderr
# Returns:   0 if every host accepted the connection, 1 otherwise
#######################################
check_worker_ssh() {
  local worker ssh_err
  (( $# > 0 )) || return 0

  echo "Checking SSH connectivity to worker nodes..."
  for worker in "$@"; do
    # -n keeps ssh from reading (and swallowing) this script's stdin.
    # BatchMode=yes makes ssh fail instead of prompting for a password.
    # ssh's stderr is captured rather than discarded so the actual reason
    # for a failure can be reported; on success it is dropped.
    # NOTE(review): StrictHostKeyChecking=no disables host key verification;
    # consider "accept-new" if the deployed OpenSSH version supports it.
    if ! ssh_err=$(ssh -n -o BatchMode=yes -o ConnectTimeout=5 \
      -o StrictHostKeyChecking=no "$worker" true 2>&1); then
      {
        echo "Error: Passwordless SSH to $worker failed."
        echo " Please ensure SSH keys are configured and the host is reachable."
        if [[ -n "$ssh_err" ]]; then
          printf ' ssh: %s\n' "$ssh_err"
        fi
      } >&2
      return 1
    fi
    echo " SSH to $worker: OK"
  done
}

# Check SSH connectivity to worker nodes before any action that needs them
# (start, exec, or a configuration check); abort the script on failure.
if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ]]; then
  if (( ${#WORKER_NODES[@]} > 0 )); then
    check_worker_ssh "${WORKER_NODES[@]}" || exit 1
  fi
fi
if [[ "$CHECK_CONFIG" == "true" ]]; then if [[ "$CHECK_CONFIG" == "true" ]]; then
echo "Configuration Check Complete." echo "Configuration Check Complete."
echo " Image Name: $IMAGE_NAME" echo " Image Name: $IMAGE_NAME"