Enhance launch-cluster script with auto-detection for interfaces and nodes
This commit is contained in:
@@ -3,24 +3,27 @@
|
||||
# Default Configuration
|
||||
IMAGE_NAME="vllm-node"
|
||||
DEFAULT_CONTAINER_NAME="vllm_node"
|
||||
ETH_IF="enp1s0f1np1"
|
||||
IB_IF="rocep1s0f1,roceP2p1s0f1"
|
||||
# ETH_IF and IB_IF will be auto-detected if not provided
|
||||
ETH_IF=""
|
||||
IB_IF=""
|
||||
|
||||
# Initialize variables
|
||||
NODES_ARG=""
|
||||
CONTAINER_NAME="$DEFAULT_CONTAINER_NAME"
|
||||
COMMAND_TO_RUN=""
|
||||
DAEMON_MODE="false"
|
||||
CHECK_CONFIG="false"
|
||||
ACTION="start"
|
||||
|
||||
# Function to print usage
|
||||
usage() {
|
||||
echo "Usage: $0 -n <node_ips> [-t <image_name>] [--name <container_name>] [--eth-if <if_name>] [--ib-if <if_name>] [-d] [action] [command]"
|
||||
echo " -n, --nodes Comma-separated list of node IPs (Mandatory)"
|
||||
echo "Usage: $0 [-n <node_ips>] [-t <image_name>] [--name <container_name>] [--eth-if <if_name>] [--ib-if <if_name>] [--check-config] [-d] [action] [command]"
|
||||
echo " -n, --nodes Comma-separated list of node IPs (Optional, auto-detected if omitted)"
|
||||
echo " -t Docker image name (Optional, default: $IMAGE_NAME)"
|
||||
echo " --name Container name (Optional, default: $DEFAULT_CONTAINER_NAME)"
|
||||
echo " --eth-if Ethernet interface (Optional, default: $ETH_IF)"
|
||||
echo " --ib-if InfiniBand interface (Optional, default: $IB_IF)"
|
||||
echo " --eth-if Ethernet interface (Optional, auto-detected)"
|
||||
echo " --ib-if InfiniBand interface (Optional, auto-detected)"
|
||||
echo " --check-config Check configuration and auto-detection without launching"
|
||||
echo " -d Daemon mode (only for 'start' action)"
|
||||
echo " action start | stop | status | exec (Default: start)"
|
||||
echo " command Command to run (only for 'exec' action)"
|
||||
@@ -35,6 +38,7 @@ while [[ "$#" -gt 0 ]]; do
|
||||
--name) CONTAINER_NAME="$2"; shift ;;
|
||||
--eth-if) ETH_IF="$2"; shift ;;
|
||||
--ib-if) IB_IF="$2"; shift ;;
|
||||
--check-config) CHECK_CONFIG="true" ;;
|
||||
-d) DAEMON_MODE="true" ;;
|
||||
-h|--help) usage ;;
|
||||
start|stop|status)
|
||||
@@ -59,8 +63,135 @@ while [[ "$#" -gt 0 ]]; do
|
||||
shift
|
||||
done
|
||||
|
||||
# --- Auto-Detection Logic ---
|
||||
|
||||
# Check for required tools if auto-detection is needed
|
||||
if [[ -z "$ETH_IF" || -z "$IB_IF" || -z "$NODES_ARG" ]]; then
|
||||
if ! command -v ibdev2netdev &> /dev/null; then
|
||||
echo "Error: ibdev2netdev not found. Cannot auto-detect interfaces."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# 1. Detect Interfaces (ETH_IF and IB_IF)
|
||||
if [[ -z "$ETH_IF" || -z "$IB_IF" ]]; then
|
||||
echo "Auto-detecting interfaces..."
|
||||
|
||||
# Get all Up interfaces: "mlx5_0 port 1 ==> enp1s0f0np0 (Up)"
|
||||
# We capture: IB_DEV, NET_DEV
|
||||
mapfile -t IB_NET_PAIRS < <(ibdev2netdev | awk '/Up\)/ {print $1 " " $5}')
|
||||
|
||||
if [ ${#IB_NET_PAIRS[@]} -eq 0 ]; then
|
||||
echo "Error: No active IB interfaces found."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
DETECTED_IB_IFS=()
|
||||
CANDIDATE_ETH_IFS=()
|
||||
|
||||
for pair in "${IB_NET_PAIRS[@]}"; do
|
||||
ib_dev=$(echo "$pair" | awk '{print $1}')
|
||||
net_dev=$(echo "$pair" | awk '{print $2}')
|
||||
|
||||
DETECTED_IB_IFS+=("$ib_dev")
|
||||
|
||||
# Check if interface has an IP address
|
||||
if ip addr show "$net_dev" | grep -q "inet "; then
|
||||
CANDIDATE_ETH_IFS+=("$net_dev")
|
||||
fi
|
||||
done
|
||||
|
||||
# Set IB_IF if not provided
|
||||
if [[ -z "$IB_IF" ]]; then
|
||||
IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}")
|
||||
echo " Detected IB_IF: $IB_IF"
|
||||
fi
|
||||
|
||||
# Set ETH_IF if not provided
|
||||
if [[ -z "$ETH_IF" ]]; then
|
||||
if [ ${#CANDIDATE_ETH_IFS[@]} -eq 0 ]; then
|
||||
echo "Error: No active IB-associated interfaces have IP addresses."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Selection logic: Prefer interface without capital 'P'
|
||||
SELECTED_ETH=""
|
||||
for iface in "${CANDIDATE_ETH_IFS[@]}"; do
|
||||
if [[ "$iface" != *"P"* ]]; then
|
||||
SELECTED_ETH="$iface"
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
# Fallback: Use the first one if all have 'P' or none found yet
|
||||
if [[ -z "$SELECTED_ETH" ]]; then
|
||||
SELECTED_ETH="${CANDIDATE_ETH_IFS[0]}"
|
||||
fi
|
||||
|
||||
ETH_IF="$SELECTED_ETH"
|
||||
echo " Detected ETH_IF: $ETH_IF"
|
||||
fi
|
||||
fi
|
||||
|
||||
# 2. Detect Nodes if not provided
|
||||
if [[ -z "$NODES_ARG" ]]; then
|
||||
echo "Error: Nodes argument (-n) is mandatory."
|
||||
echo "Auto-detecting nodes..."
|
||||
|
||||
if ! command -v avahi-browse &> /dev/null; then
|
||||
echo "Error: avahi-browse not found. Please install avahi-utils."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Get local IP of the selected ETH_IF
|
||||
LOCAL_IP=$(ip -4 addr show "$ETH_IF" | grep -oP '(?<=inet\s)\d+(\.\d+){3}' | head -n 1)
|
||||
if [[ -z "$LOCAL_IP" ]]; then
|
||||
echo "Error: Could not determine IP for interface $ETH_IF"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo " Detected Local IP: $LOCAL_IP"
|
||||
|
||||
DETECTED_IPS=("$LOCAL_IP")
|
||||
|
||||
# Scan for other nodes
|
||||
echo " Scanning for peers via avahi..."
|
||||
# Run avahi-browse, filter for _ssh._tcp, and look for our interface
|
||||
# Note: avahi-browse output format varies, we use -p (parsable)
|
||||
# Format: =;interface;IPv4;name;type;domain;hostname;ip;port;txt
|
||||
|
||||
# We only care about services on our selected ETH_IF or related interfaces?
|
||||
# The reference script scans ALL interfaces found by ibdev2netdev.
|
||||
# Let's stick to the reference logic: scan on all IB-associated interfaces.
|
||||
|
||||
TEMP_FILE=$(mktemp)
|
||||
trap 'rm -f "$TEMP_FILE"' EXIT
|
||||
|
||||
avahi_output=$(avahi-browse -p -r -f -t _ssh._tcp 2>/dev/null)
|
||||
|
||||
# Filter by the selected management interface (ETH_IF)
|
||||
echo "$avahi_output" | grep ";$ETH_IF;" > "$TEMP_FILE"
|
||||
|
||||
# Extract IPs
|
||||
while IFS=';' read -r prefix iface protocol name type domain hostname ip port txt; do
|
||||
if [[ "$ip" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
|
||||
# Avoid duplicates
|
||||
if [[ ! " ${DETECTED_IPS[@]} " =~ " ${ip} " ]]; then
|
||||
DETECTED_IPS+=("$ip")
|
||||
echo " Found peer: $ip ($hostname)"
|
||||
fi
|
||||
fi
|
||||
done < <(grep "^=" "$TEMP_FILE" | grep "IPv4")
|
||||
|
||||
# Sort IPs
|
||||
IFS=$'\n' SORTED_IPS=($(sort <<<"${DETECTED_IPS[*]}"))
|
||||
unset IFS
|
||||
|
||||
NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}")
|
||||
echo " Cluster Nodes: $NODES_ARG"
|
||||
fi
|
||||
|
||||
if [[ -z "$NODES_ARG" ]]; then
|
||||
echo "Error: Nodes argument (-n) is mandatory or could not be auto-detected."
|
||||
usage
|
||||
fi
|
||||
|
||||
@@ -98,8 +229,19 @@ echo "Worker Nodes: ${WORKER_NODES[*]}"
|
||||
echo "Container Name: $CONTAINER_NAME"
|
||||
echo "Action: $ACTION"
|
||||
|
||||
if [[ "$CHECK_CONFIG" == "true" ]]; then
|
||||
echo "Configuration Check Complete."
|
||||
echo " Image Name: $IMAGE_NAME"
|
||||
echo " ETH Interface: $ETH_IF"
|
||||
echo " IB Interface: $IB_IF"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Cleanup Function
|
||||
cleanup() {
|
||||
# Remove traps to prevent nested cleanup
|
||||
trap - EXIT INT TERM HUP
|
||||
|
||||
echo ""
|
||||
echo "Stopping cluster..."
|
||||
|
||||
|
||||
Reference in New Issue
Block a user