diff --git a/autodiscover.sh b/autodiscover.sh
new file mode 100644
index 0000000..43e622f
--- /dev/null
+++ b/autodiscover.sh
@@ -0,0 +1,218 @@
+#!/bin/bash
+#
+# autodiscover.sh - shared auto-detection helpers. Source this file; do not
+# execute it.
+#
+# Globals read (any may be preset by the caller to skip detection):
+#   ETH_IF, IB_IF, LOCAL_IP, NODES_ARG
+# Globals written:
+#   ETH_IF, IB_IF, CIDR, LOCAL_IP, NODES_ARG, PEER_NODES
+# Optional tuning:
+#   AUTODISCOVER_MAX_JOBS - maximum concurrent port probes (default: 256)
+#
+# Progress messages go to stdout and errors to stderr. Every function returns
+# non-zero on failure instead of exiting, so the caller decides how to react.
+
+# Sort dotted-quad IPv4 addresses read from stdin numerically, octet by octet
+_sort_ipv4() {
+    sort -t . -k 1,1n -k 2,2n -k 3,3n -k 4,4n
+}
+
+# Function to detect IB and Ethernet interfaces
+detect_interfaces() {
+    local pair ib_dev net_dev iface
+
+    # If both interfaces are already set, nothing to do
+    if [[ -n "$ETH_IF" && -n "$IB_IF" ]]; then
+        return 0
+    fi
+
+    # Check for required tools
+    if ! command -v ibdev2netdev &> /dev/null; then
+        echo "Error: ibdev2netdev not found. Cannot auto-detect interfaces." >&2
+        return 1
+    fi
+
+    echo "Auto-detecting interfaces..."
+
+    # Get all Up interfaces: "rocep1s0f1 port 1 ==> enp1s0f1np1 (Up)"
+    # We capture: IB_DEV, NET_DEV
+    mapfile -t IB_NET_PAIRS < <(ibdev2netdev | awk '/Up\)/ {print $1 " " $5}')
+
+    if [[ ${#IB_NET_PAIRS[@]} -eq 0 ]]; then
+        echo "Error: No active IB interfaces found." >&2
+        return 1
+    fi
+
+    DETECTED_IB_IFS=()
+    CANDIDATE_ETH_IFS=()
+
+    for pair in "${IB_NET_PAIRS[@]}"; do
+        # Split "IB_DEV NET_DEV" with the read builtin; IFS is pinned so a
+        # caller-modified IFS cannot break the split
+        IFS=' ' read -r ib_dev net_dev <<< "$pair"
+
+        DETECTED_IB_IFS+=("$ib_dev")
+
+        # Check if interface has an IP address
+        if ip addr show "$net_dev" | grep -q "inet "; then
+            CANDIDATE_ETH_IFS+=("$net_dev")
+        fi
+    done
+
+    # Set IB_IF if not provided
+    if [[ -z "$IB_IF" ]]; then
+        IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}")
+        echo " Detected IB_IF: $IB_IF"
+    fi
+
+    # Set ETH_IF if not provided
+    if [[ -z "$ETH_IF" ]]; then
+        if [[ ${#CANDIDATE_ETH_IFS[@]} -eq 0 ]]; then
+            echo "Error: No active IB-associated interfaces have IP addresses." >&2
+            return 1
+        fi
+
+        # Selection logic: Prefer interface without capital 'P'
+        SELECTED_ETH=""
+        for iface in "${CANDIDATE_ETH_IFS[@]}"; do
+            if [[ "$iface" != *"P"* ]]; then
+                SELECTED_ETH="$iface"
+                break
+            fi
+        done
+
+        # Fallback: Use the first one if all have 'P' or none found yet
+        if [[ -z "$SELECTED_ETH" ]]; then
+            SELECTED_ETH="${CANDIDATE_ETH_IFS[0]}"
+        fi
+
+        ETH_IF="$SELECTED_ETH"
+        echo " Detected ETH_IF: $ETH_IF"
+    fi
+}
+
+# Set CIDR to the first IPv4 address/prefix on ETH_IF, detecting ETH_IF first
+# when the caller did not provide it
+_detect_cidr() {
+    # Ensure interface is detected if not provided
+    if [[ -z "$ETH_IF" ]]; then
+        detect_interfaces || return 1
+    fi
+
+    # Get CIDR of the selected ETH_IF
+    CIDR=$(ip -o -f inet addr show "$ETH_IF" | awk '{print $4}' | head -n 1)
+
+    if [[ -z "$CIDR" ]]; then
+        echo "Error: Could not determine IP/CIDR for interface $ETH_IF" >&2
+        return 1
+    fi
+}
+
+# Function to detect local IP
+detect_local_ip() {
+    if [[ -n "$LOCAL_IP" ]]; then
+        return 0
+    fi
+
+    _detect_cidr || return 1
+
+    LOCAL_IP=${CIDR%/*}
+    echo " Detected Local IP: $LOCAL_IP ($CIDR)"
+}
+
+# Function to detect cluster nodes
+detect_nodes() {
+    local node ip max_jobs
+    local -i running=0
+
+    detect_local_ip || return 1
+
+    # If nodes are already set, populate PEER_NODES and return
+    if [[ -n "$NODES_ARG" ]]; then
+        PEER_NODES=()
+        IFS=',' read -ra ALL_NODES <<< "$NODES_ARG"
+        for node in "${ALL_NODES[@]}"; do
+            node=$(echo "$node" | xargs)
+            # Skip empty entries (stray commas) as well as ourselves
+            if [[ -n "$node" && "$node" != "$LOCAL_IP" ]]; then
+                PEER_NODES+=("$node")
+            fi
+        done
+        return 0
+    fi
+
+    echo "Auto-detecting nodes..."
+
+    if ! command -v nc &> /dev/null; then
+        echo "Error: nc (netcat) not found. Please install netcat." >&2
+        return 1
+    fi
+
+    if ! command -v python3 &> /dev/null; then
+        echo "Error: python3 not found. Please install python3." >&2
+        return 1
+    fi
+
+    # A caller-supplied LOCAL_IP makes detect_local_ip return before CIDR is
+    # computed, but the scan below needs it
+    if [[ -z "$CIDR" ]]; then
+        _detect_cidr || return 1
+    fi
+
+    echo " Scanning for SSH peers on $CIDR..."
+
+    # Generate list of IPs using python
+    if ! ALL_IPS=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$CIDR"); then
+        echo "Error: Could not enumerate hosts in $CIDR" >&2
+        return 1
+    fi
+
+    # Cap concurrent probes so a large subnet cannot exhaust the process table
+    max_jobs=${AUTODISCOVER_MAX_JOBS:-256}
+    if [[ ! "$max_jobs" =~ ^[1-9][0-9]*$ ]]; then
+        max_jobs=256
+    fi
+
+    # Scan in parallel. Probes run inside a process substitution, so hits are
+    # collected through a pipe (no temp file to leak) and the waits only see
+    # the probes, never background jobs started by the caller.
+    mapfile -t PEER_NODES < <(
+        {
+            for ip in $ALL_IPS; do
+                # Skip own IP (the configured one and the one on ETH_IF)
+                if [[ "$ip" == "$LOCAL_IP" || "$ip" == "${CIDR%/*}" ]]; then
+                    continue
+                fi
+
+                (
+                    # Check port 22 with 1 second timeout
+                    if nc -z -w 1 "$ip" 22 &> /dev/null; then
+                        echo "$ip"
+                    fi
+                ) &
+
+                # Drain the current batch once the cap is reached
+                if (( ++running >= max_jobs )); then
+                    wait
+                    running=0
+                fi
+            done
+
+            # Wait for all background scans to complete
+            wait
+        } | _sort_ipv4
+    )
+
+    DETECTED_IPS=("$LOCAL_IP")
+    for ip in "${PEER_NODES[@]}"; do
+        DETECTED_IPS+=("$ip")
+        echo " Found peer: $ip"
+    done
+
+    # Sort IPs numerically; mapfile leaves the caller's IFS untouched
+    mapfile -t SORTED_IPS < <(printf '%s\n' "${DETECTED_IPS[@]}" | _sort_ipv4)
+
+    NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}")
+    echo " Cluster Nodes: $NODES_ARG"
+}
diff --git a/build-and-copy.sh b/build-and-copy.sh
index ae56d58..c970b65 100755
--- a/build-and-copy.sh
+++ b/build-and-copy.sh
@@ -82,18 +82,32 @@ while [[ "$#" -gt 0 ]]; do
         --vllm-ref) VLLM_REF="$2"; shift ;;
         -c|--copy-to|--copy-to-host|--copy-to-hosts)
             shift
-            if [ "$#" -eq 0 ]; then
-                echo "Error: --copy-to requires at least one host"
-                exit 1
-            fi
-            EXISTING_HOSTS=${#COPY_HOSTS[@]}
+            # Consume arguments until the next flag or end of args
             while [[ "$#" -gt 0 && "$1" != -* ]]; do
                 add_copy_hosts "$1"
                 shift
             done
-            if [ "${#COPY_HOSTS[@]}" -eq "$EXISTING_HOSTS" ]; then
-                echo "Error: --copy-to requires at least one host"
-                exit 1
+
+            # If no hosts specified, use autodiscovery
+            if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
+                echo "No hosts specified. Using autodiscovery..."
+                source "$(dirname "$0")/autodiscover.sh"
+
+                if ! detect_nodes; then
+                    echo "Error: Autodiscovery failed."
+                    exit 1
+                fi
+
+                # Use PEER_NODES directly
+                if [ ${#PEER_NODES[@]} -gt 0 ]; then
+                    COPY_HOSTS=("${PEER_NODES[@]}")
+                fi
+
+                if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
+                    echo "Error: Autodiscovery found no other nodes."
+                    exit 1
+                fi
+                echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
             fi
             continue
             ;;
diff --git a/launch-cluster.sh b/launch-cluster.sh
index 38aba90..796718f 100755
--- a/launch-cluster.sh
+++ b/launch-cluster.sh
@@ -98,140 +98,12 @@ fi
 
 # --- Auto-Detection Logic ---
 
-# Check for required tools if auto-detection is needed
-if [[ -z "$ETH_IF" || -z "$IB_IF" || -z "$NODES_ARG" ]]; then
-    if ! command -v ibdev2netdev &> /dev/null; then
-        echo "Error: ibdev2netdev not found. Cannot auto-detect interfaces."
-        exit 1
-    fi
-fi
+# Source autodiscover module
+source "$(dirname "$0")/autodiscover.sh"
 
-# 1. Detect Interfaces (ETH_IF and IB_IF)
-if [[ -z "$ETH_IF" || -z "$IB_IF" ]]; then
-    echo "Auto-detecting interfaces..."
-
-    # Get all Up interfaces: "rocep1s0f1 port 1 ==> enp1s0f1np1 (Up)"
-    # We capture: IB_DEV, NET_DEV
-    mapfile -t IB_NET_PAIRS < <(ibdev2netdev | awk '/Up\)/ {print $1 " " $5}')
-
-    if [ ${#IB_NET_PAIRS[@]} -eq 0 ]; then
-        echo "Error: No active IB interfaces found."
-        exit 1
-    fi
-
-    DETECTED_IB_IFS=()
-    CANDIDATE_ETH_IFS=()
-
-    for pair in "${IB_NET_PAIRS[@]}"; do
-        ib_dev=$(echo "$pair" | awk '{print $1}')
-        net_dev=$(echo "$pair" | awk '{print $2}')
-
-        DETECTED_IB_IFS+=("$ib_dev")
-
-        # Check if interface has an IP address
-        if ip addr show "$net_dev" | grep -q "inet "; then
-            CANDIDATE_ETH_IFS+=("$net_dev")
-        fi
-    done
-
-    # Set IB_IF if not provided
-    if [[ -z "$IB_IF" ]]; then
-        IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}")
-        echo " Detected IB_IF: $IB_IF"
-    fi
-
-    # Set ETH_IF if not provided
-    if [[ -z "$ETH_IF" ]]; then
-        if [ ${#CANDIDATE_ETH_IFS[@]} -eq 0 ]; then
-            echo "Error: No active IB-associated interfaces have IP addresses."
-            exit 1
-        fi
-
-        # Selection logic: Prefer interface without capital 'P'
-        SELECTED_ETH=""
-        for iface in "${CANDIDATE_ETH_IFS[@]}"; do
-            if [[ "$iface" != *"P"* ]]; then
-                SELECTED_ETH="$iface"
-                break
-            fi
-        done
-
-        # Fallback: Use the first one if all have 'P' or none found yet
-        if [[ -z "$SELECTED_ETH" ]]; then
-            SELECTED_ETH="${CANDIDATE_ETH_IFS[0]}"
-        fi
-
-        ETH_IF="$SELECTED_ETH"
-        echo " Detected ETH_IF: $ETH_IF"
-    fi
-fi
-
-# 2. Detect Nodes if not provided
-if [[ -z "$NODES_ARG" ]]; then
-    echo "Auto-detecting nodes..."
-
-    if ! command -v nc &> /dev/null; then
-        echo "Error: nc (netcat) not found. Please install netcat."
-        exit 1
-    fi
-
-    if ! command -v python3 &> /dev/null; then
-        echo "Error: python3 not found. Please install python3."
-        exit 1
-    fi
-
-    # Get CIDR of the selected ETH_IF
-    CIDR=$(ip -o -f inet addr show "$ETH_IF" | awk '{print $4}' | head -n 1)
-
-    if [[ -z "$CIDR" ]]; then
-        echo "Error: Could not determine IP/CIDR for interface $ETH_IF"
-        exit 1
-    fi
-
-    LOCAL_IP=${CIDR%/*}
-    echo " Detected Local IP: $LOCAL_IP ($CIDR)"
-
-    DETECTED_IPS=("$LOCAL_IP")
-
-    echo " Scanning for SSH peers on $CIDR..."
-
-    # Generate list of IPs using python
-    ALL_IPS=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$CIDR")
-
-    TEMP_IPS_FILE=$(mktemp)
-
-    # Scan in parallel
-    for ip in $ALL_IPS; do
-        # Skip own IP
-        if [[ "$ip" == "$LOCAL_IP" ]]; then continue; fi
-
-        (
-            # Check port 22 with 1 second timeout
-            if nc -z -w 1 "$ip" 22 &>/dev/null; then
-                echo "$ip" >> "$TEMP_IPS_FILE"
-            fi
-        ) &
-    done
-
-    # Wait for all background scans to complete
-    wait
-
-    # Read found IPs
-    if [[ -f "$TEMP_IPS_FILE" ]]; then
-        while read -r ip; do
-            DETECTED_IPS+=("$ip")
-            echo " Found peer: $ip"
-        done < "$TEMP_IPS_FILE"
-        rm -f "$TEMP_IPS_FILE"
-    fi
-
-    # Sort IPs
-    IFS=$'\n' SORTED_IPS=($(sort <<<"${DETECTED_IPS[*]}"))
-    unset IFS
-
-    NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}")
-    echo " Cluster Nodes: $NODES_ARG"
-fi
+# Perform auto-detection
+detect_interfaces || exit 1
+detect_nodes || exit 1
 
 if [[ -z "$NODES_ARG" ]]; then
     echo "Error: Nodes argument (-n) is mandatory or could not be auto-detected."
@@ -242,33 +114,26 @@ fi
 IFS=',' read -r -a ALL_NODES <<< "$NODES_ARG"
 
 # Detect Head IP (Local IP)
-HEAD_IP=""
-LOCAL_IPS=$(hostname -I)
+detect_local_ip || exit 1
+HEAD_IP="$LOCAL_IP"
+
+# Verify HEAD_IP is in ALL_NODES
+FOUND_HEAD=false
 for ip in "${ALL_NODES[@]}"; do
-    # Trim whitespace
     ip=$(echo "$ip" | xargs)
-    if [[ " $LOCAL_IPS " =~ " $ip " ]]; then
-        HEAD_IP="$ip"
+    if [[ "$ip" == "$HEAD_IP" ]]; then
+        FOUND_HEAD=true
         break
     fi
 done
 
-if [[ -z "$HEAD_IP" ]]; then
-    echo "Error: Could not determine Head IP. This script must be run on one of the nodes specified in -n."
+if [ "$FOUND_HEAD" = false ]; then
+    echo "Error: Local IP ($HEAD_IP) is not in the list of nodes ($NODES_ARG)."
     exit 1
 fi
 
-# Identify Worker Nodes
-WORKER_NODES=()
-for ip in "${ALL_NODES[@]}"; do
-    ip=$(echo "$ip" | xargs)
-    if [[ "$ip" != "$HEAD_IP" ]]; then
-        WORKER_NODES+=("$ip")
-    fi
-done
-
 echo "Head Node: $HEAD_IP"
-echo "Worker Nodes: ${WORKER_NODES[*]}"
+echo "Worker Nodes: ${PEER_NODES[*]}"
 echo "Container Name: $CONTAINER_NAME"
 echo "Action: $ACTION"
 