Autodiscovery refactoring with mesh support

This commit is contained in:
Eugene Rakhmatulin
2026-03-26 15:47:41 -07:00
parent 83a74bccec
commit a78e221de3
5 changed files with 401 additions and 270 deletions

View File

@@ -1,5 +1,7 @@
#!/bin/bash #!/bin/bash
SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
# Load .env file if exists (for shared configuration) # Load .env file if exists (for shared configuration)
# This is called early so that DOTENV_* variables are available to all functions # This is called early so that DOTENV_* variables are available to all functions
load_env_if_exists() { load_env_if_exists() {
@@ -8,8 +10,7 @@ load_env_if_exists() {
# If CONFIG_FILE is not set, check default location # If CONFIG_FILE is not set, check default location
if [[ -z "$env_file" ]]; then if [[ -z "$env_file" ]]; then
local script_dir="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" env_file="$SCRIPT_DIR/.env"
env_file="$script_dir/.env"
config_explicit="false" config_explicit="false"
fi fi
@@ -47,6 +48,9 @@ load_env_if_exists() {
# Load .env file # Load .env file
load_env_if_exists load_env_if_exists
# Mesh mode flag (set by detect_interfaces)
MESH_MODE="false"
# Function to detect IB and Ethernet interfaces # Function to detect IB and Ethernet interfaces
detect_interfaces() { detect_interfaces() {
# If both interfaces are already set, nothing to do # If both interfaces are already set, nothing to do
@@ -72,49 +76,115 @@ detect_interfaces() {
fi fi
DETECTED_IB_IFS=() DETECTED_IB_IFS=()
CANDIDATE_ETH_IFS=() ALL_NET_IFS=()
for pair in "${IB_NET_PAIRS[@]}"; do for pair in "${IB_NET_PAIRS[@]}"; do
ib_dev=$(echo "$pair" | awk '{print $1}') ib_dev=$(echo "$pair" | awk '{print $1}')
net_dev=$(echo "$pair" | awk '{print $2}') net_dev=$(echo "$pair" | awk '{print $2}')
DETECTED_IB_IFS+=("$ib_dev") DETECTED_IB_IFS+=("$ib_dev")
ALL_NET_IFS+=("$net_dev")
done
# Check if interface has an IP address local num_up="${#IB_NET_PAIRS[@]}"
if ip addr show "$net_dev" | grep -q "inet "; then
CANDIDATE_ETH_IFS+=("$net_dev") # --- Sanity checks ---
# 1. enp* (no capital P) interfaces MUST have an IP
for net_dev in "${ALL_NET_IFS[@]}"; do
if [[ "$net_dev" =~ ^enp[^P] ]] || [[ "$net_dev" == enp* && "$net_dev" != *P* ]]; then
if ! ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
echo "Error: Interface $net_dev (enp*, no capital P) is Up but has no IP address assigned."
return 1
fi
fi fi
done done
# 2. No two interfaces with IPs should share the same subnet
declare -A SEEN_SUBNETS
for net_dev in "${ALL_NET_IFS[@]}"; do
local cidr
cidr=$(ip -o -f inet addr show "$net_dev" 2>/dev/null | awk '{print $4}' | head -n1)
[[ -z "$cidr" ]] && continue
# Compute network address using python3
local net_addr
net_addr=$(python3 -c "import ipaddress; print(str(ipaddress.ip_network('$cidr', strict=False)))" 2>/dev/null)
if [[ -n "${SEEN_SUBNETS[$net_addr]}" ]]; then
echo "Error: Interfaces $net_dev and ${SEEN_SUBNETS[$net_addr]} share the same subnet ($net_addr). Check network configuration."
return 1
fi
SEEN_SUBNETS["$net_addr"]="$net_dev"
done
# --- Mode selection ---
if [[ "$num_up" -eq 2 ]]; then
# Non-mesh configuration
MESH_MODE="false"
echo " Non-mesh mode: 2 CX7 interfaces active."
# Set IB_IF if not provided # Set IB_IF if not provided
if [[ -z "$IB_IF" ]]; then if [[ -z "$IB_IF" ]]; then
IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}") IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}")
echo " Detected IB_IF: $IB_IF" echo " Detected IB_IF: $IB_IF"
fi fi
# Set ETH_IF if not provided # Set ETH_IF if not provided: prefer interface without capital 'P'
if [[ -z "$ETH_IF" ]]; then if [[ -z "$ETH_IF" ]]; then
if [ ${#CANDIDATE_ETH_IFS[@]} -eq 0 ]; then local selected_eth=""
echo "Error: No active IB-associated interfaces have IP addresses." for net_dev in "${ALL_NET_IFS[@]}"; do
return 1 if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
if [[ "$net_dev" != *P* ]]; then
selected_eth="$net_dev"
break
fi fi
fi
# Selection logic: Prefer interface without capital 'P' done
SELECTED_ETH="" # Fallback: first interface with an IP
for iface in "${CANDIDATE_ETH_IFS[@]}"; do if [[ -z "$selected_eth" ]]; then
if [[ "$iface" != *"P"* ]]; then for net_dev in "${ALL_NET_IFS[@]}"; do
SELECTED_ETH="$iface" if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
selected_eth="$net_dev"
break break
fi fi
done done
fi
# Fallback: Use the first one if all have 'P' or none found yet if [[ -z "$selected_eth" ]]; then
if [[ -z "$SELECTED_ETH" ]]; then echo "Error: No active IB-associated interfaces have IP addresses."
SELECTED_ETH="${CANDIDATE_ETH_IFS[0]}" return 1
fi
ETH_IF="$selected_eth"
echo " Detected ETH_IF: $ETH_IF"
fi fi
ETH_IF="$SELECTED_ETH" elif [[ "$num_up" -eq 4 ]]; then
# Mesh configuration
MESH_MODE="true"
echo " Mesh mode: all 4 CX7 interfaces active."
# Set IB_IF to all four RoCE interfaces (hardcoded for mesh)
if [[ -z "$IB_IF" ]]; then
IB_IF="rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1"
echo " Detected IB_IF: $IB_IF"
fi
# Set ETH_IF: check enP7s7 first, then wlP9s9
if [[ -z "$ETH_IF" ]]; then
if ip addr show enP7s7 2>/dev/null | grep -q "inet "; then
ETH_IF="enP7s7"
echo " Detected ETH_IF: $ETH_IF" echo " Detected ETH_IF: $ETH_IF"
elif ip addr show wlP9s9 2>/dev/null | grep -q "inet "; then
ETH_IF="wlP9s9"
echo " Detected ETH_IF: $ETH_IF"
echo " Warning: using wireless interface (wlP9s9) for cluster coordination. Performance may be limited."
else
echo "Error: Mesh mode requires enP7s7 or wlP9s9 to be up with an IP address for cluster coordination."
return 1
fi
fi
else
echo "Error: Unexpected number of active CX7 interfaces ($num_up). Expected 2 (non-mesh) or 4 (mesh)."
return 1
fi fi
} }
@@ -141,6 +211,41 @@ detect_local_ip() {
echo " Detected Local IP: $LOCAL_IP ($CIDR)" echo " Detected Local IP: $LOCAL_IP ($CIDR)"
} }
# Scan a subnet for GB10-capable peers via SSH
# Usage: _scan_subnet_for_gb10 <cidr> <local_ip_to_exclude> <output_file>
#
# Arguments:
#   $1 - subnet in CIDR notation; its host addresses are enumerated with
#        python3's ipaddress module (strict=False, so a host address with a
#        prefix length is accepted)
#   $2 - address to skip while scanning
#   $3 - file that receives one matching IP per line (appended, unsorted)
# Returns:
#   0 once every probe has finished, whether or not any peer matched
#   1 if python3 or nc is unavailable, or the subnet cannot be enumerated
_scan_subnet_for_gb10() {
  local cidr="$1"
  local exclude_ip="$2"
  local out_file="$3"

  if ! command -v python3 &> /dev/null; then
    echo "Error: python3 not found."
    return 1
  fi
  if ! command -v nc &> /dev/null; then
    echo "Error: nc (netcat) not found."
    return 1
  fi

  # Fail loudly on an unparsable CIDR instead of reporting "no peers found".
  local all_ips
  if ! all_ips=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$cidr"); then
    echo "Error: could not enumerate hosts for subnet '$cidr'."
    return 1
  fi

  local -a ip_list=()
  mapfile -t ip_list <<< "$all_ips"

  local ip
  local -a probe_pids=()
  for ip in "${ip_list[@]}"; do
    [[ -z "$ip" || "$ip" == "$exclude_ip" ]] && continue
    (
      # Cheap TCP check first so SSH is only attempted where port 22 answers.
      if nc -z -w 1 "$ip" 22 &>/dev/null; then
        # Check if remote is a GB10 system
        # NOTE(review): StrictHostKeyChecking=no accepts unknown host keys
        # without verification; acceptable only on a trusted network.
        # -n keeps the backgrounded ssh from reading the caller's stdin.
        if ssh -n -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o BatchMode=yes "$ip" \
          "nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null" \
          2>/dev/null | grep -q "NVIDIA GB10"; then
          echo "$ip" >> "$out_file"
        fi
      fi
    ) &
    probe_pids+=("$!")
  done

  # Wait only for the probes started here; a bare `wait` would also block on
  # any unrelated background job owned by the script that sourced this file.
  if (( ${#probe_pids[@]} > 0 )); then
    wait "${probe_pids[@]}"
  fi
  return 0
}
# Function to detect cluster nodes # Function to detect cluster nodes
detect_nodes() { detect_nodes() {
detect_local_ip || return 1 detect_local_ip || return 1
@@ -158,71 +263,164 @@ detect_nodes() {
return 0 return 0
fi fi
# Try to use COPY_HOSTS from .env # Try to use CLUSTER_NODES from .env
if [[ -n "$DOTENV_COPY_HOSTS" ]]; then if [[ -n "$DOTENV_CLUSTER_NODES" ]]; then
echo " Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS" echo " Using CLUSTER_NODES from .env: $DOTENV_CLUSTER_NODES"
PEER_NODES=() PEER_NODES=()
IFS=',' read -ra ALL_NODES <<< "$DOTENV_COPY_HOSTS" IFS=',' read -ra ALL_NODES <<< "$DOTENV_CLUSTER_NODES"
for node in "${ALL_NODES[@]}"; do for node in "${ALL_NODES[@]}"; do
node=$(echo "$node" | xargs) node=$(echo "$node" | xargs)
PEER_NODES+=("$node") [[ "$node" != "$LOCAL_IP" ]] && PEER_NODES+=("$node")
done done
NODES_ARG="$DOTENV_COPY_HOSTS" NODES_ARG="$DOTENV_CLUSTER_NODES"
return 0 return 0
fi fi
echo "Auto-detecting nodes..." echo "Auto-detecting nodes on $CIDR (checking for NVIDIA GB10)..."
if ! command -v nc &> /dev/null; then local temp_file
echo "Error: nc (netcat) not found. Please install netcat." temp_file=$(mktemp)
return 1
fi
if ! command -v python3 &> /dev/null; then _scan_subnet_for_gb10 "$CIDR" "$LOCAL_IP" "$temp_file"
echo "Error: python3 not found. Please install python3."
return 1
fi
DETECTED_IPS=("$LOCAL_IP")
PEER_NODES=() PEER_NODES=()
local detected_ips=("$LOCAL_IP")
echo " Scanning for SSH peers on $CIDR..." if [[ -f "$temp_file" ]]; then
# Generate list of IPs using python
ALL_IPS=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$CIDR")
TEMP_IPS_FILE=$(mktemp)
# Scan in parallel
for ip in $ALL_IPS; do
# Skip own IP
if [[ "$ip" == "$LOCAL_IP" ]]; then continue; fi
(
# Check port 22 with 1 second timeout
if nc -z -w 1 "$ip" 22 &>/dev/null; then
echo "$ip" >> "$TEMP_IPS_FILE"
fi
) &
done
# Wait for all background scans to complete
wait
# Read found IPs
if [[ -f "$TEMP_IPS_FILE" ]]; then
while read -r ip; do while read -r ip; do
DETECTED_IPS+=("$ip")
PEER_NODES+=("$ip") PEER_NODES+=("$ip")
echo " Found peer: $ip" detected_ips+=("$ip")
done < "$TEMP_IPS_FILE" echo " Found GB10 peer: $ip"
rm -f "$TEMP_IPS_FILE" done < <(sort "$temp_file")
rm -f "$temp_file"
fi fi
# Sort IPs # Sort and set NODES_ARG
IFS=$'\n' SORTED_IPS=($(sort <<<"${DETECTED_IPS[*]}")) IFS=$'\n' SORTED_IPS=($(sort <<<"${detected_ips[*]}"))
unset IFS unset IFS
NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}") NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}")
echo " Cluster Nodes: $NODES_ARG" echo " Cluster Nodes: $NODES_ARG"
} }
# Function to detect COPY_HOSTS for build/model distribution
# In non-mesh mode: COPY_PEER_NODES = PEER_NODES (same network)
# In mesh mode: scan enp* interfaces (direct IB-attached) for GB10 peers
#
# Globals:
#   MESH_MODE        (read)    - "false" selects the non-mesh path
#   PEER_NODES       (read)    - copied verbatim in non-mesh mode
#   COPY_PEER_NODES  (written) - resulting list of copy targets
# Returns:
#   0 on success (the list may be empty)
#   1 if the temporary file cannot be created or a subnet scan fails
detect_copy_hosts() {
  if [[ "$MESH_MODE" == "false" ]]; then
    COPY_PEER_NODES=("${PEER_NODES[@]}")
    return 0
  fi

  # Mesh mode: scan enp1s0f0np0 and enp1s0f1np1 subnets
  echo "Auto-detecting COPY_HOSTS on direct IB interfaces (mesh mode)..."
  local temp_file
  if ! temp_file=$(mktemp); then
    echo "Error: could not create temporary file."
    return 1
  fi

  local iface cidr local_iface_ip
  for iface in enp1s0f0np0 enp1s0f1np1; do
    cidr=$(ip -o -f inet addr show "$iface" 2>/dev/null | awk '{print $4}' | head -n1)
    # Interface absent or without an IPv4 address: nothing to scan.
    [[ -z "$cidr" ]] && continue
    local_iface_ip="${cidr%/*}"
    echo " Scanning $iface ($cidr)..."
    # Propagate scanner failures so callers do not mistake them for
    # "no hosts found".
    if ! _scan_subnet_for_gb10 "$cidr" "$local_iface_ip" "$temp_file"; then
      rm -f "$temp_file"
      return 1
    fi
  done

  # Deduplicate and collect results
  COPY_PEER_NODES=()
  local peer_ip
  while IFS= read -r peer_ip; do
    [[ -z "$peer_ip" ]] && continue
    COPY_PEER_NODES+=("$peer_ip")
    echo " Found GB10 copy host: $peer_ip"
  done < <(sort -u "$temp_file")
  rm -f "$temp_file"
  return 0
}
# Save discovered configuration to .env
# Skips if .env already exists unless FORCE_DISCOVER=true
save_config() {
local env_file="${CONFIG_FILE:-$SCRIPT_DIR/.env}"
# Skip if .env exists and not forced
if [[ -f "$env_file" && "${FORCE_DISCOVER:-false}" != "true" ]]; then
return 0
fi
echo ""
local save_prompt="Save discovered configuration to $env_file?"
if [[ -f "$env_file" ]]; then
save_prompt="Overwrite existing configuration in $env_file?"
fi
read -r -p "$save_prompt [Y/n]: " response
response="${response,,}"
if [[ "$response" =~ ^(n|no)$ ]]; then
return 0
fi
# Build list of all cluster nodes (local + peers)
local all_cluster_nodes=()
if [[ -n "$LOCAL_IP" ]]; then
all_cluster_nodes+=("$LOCAL_IP")
fi
for node in "${PEER_NODES[@]}"; do
all_cluster_nodes+=("$node")
done
# Per-node confirmation for CLUSTER_NODES
echo ""
echo "Select nodes for CLUSTER_NODES:"
local selected_cluster=()
for node in "${all_cluster_nodes[@]}"; do
local label="$node"
[[ "$node" == "$LOCAL_IP" ]] && label="$node (this machine)"
read -r -p " Include $label? [Y/n]: " r
r="${r,,}"
if [[ ! "$r" =~ ^(n|no)$ ]]; then
selected_cluster+=("$node")
fi
done
if [[ "${#selected_cluster[@]}" -eq 0 ]]; then
echo "No nodes selected. Aborting save."
return 1
fi
# Per-node confirmation for COPY_HOSTS
echo ""
echo "Select nodes for COPY_HOSTS (build/model distribution):"
local selected_copy=()
for node in "${COPY_PEER_NODES[@]}"; do
read -r -p " Include $node in COPY_HOSTS? [Y/n]: " r
r="${r,,}"
if [[ ! "$r" =~ ^(n|no)$ ]]; then
selected_copy+=("$node")
fi
done
# Write .env
{
echo "# Auto-generated by autodiscover.sh"
echo "CLUSTER_NODES=$(IFS=,; echo "${selected_cluster[*]}")"
if [[ "${#selected_copy[@]}" -gt 0 ]]; then
echo "COPY_HOSTS=$(IFS=,; echo "${selected_copy[*]}")"
fi
echo "LOCAL_IP=$LOCAL_IP"
echo "ETH_IF=$ETH_IF"
echo "IB_IF=$IB_IF"
} > "$env_file"
echo ""
echo "Saved to $env_file"
}
# Convenience wrapper around the whole discovery sequence: interfaces, local
# IP, cluster nodes, copy hosts, then the interactive save.
# Returns 1 as soon as any detection step fails (later steps are not run);
# otherwise returns whatever save_config returns.
run_autodiscover() {
  if ! { detect_interfaces && detect_local_ip && detect_nodes && detect_copy_hosts; }; then
    return 1
  fi
  save_config
}

View File

@@ -282,6 +282,7 @@ usage() {
echo " --network <network> : Docker network to use during build" echo " --network <network> : Docker network to use during build"
echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory" echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory"
echo " --config : Path to .env configuration file (default: .env in script directory)" echo " --config : Path to .env configuration file (default: .env in script directory)"
echo " --setup : Force autodiscovery and save configuration (even if .env exists)"
echo " -h, --help : Show this help message" echo " -h, --help : Show this help message"
exit 1 exit 1
} }
@@ -334,6 +335,7 @@ while [[ "$#" -gt 0 ]]; do
fi fi
;; ;;
--config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;; --config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
--setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
-h|--help) usage ;; -h|--help) usage ;;
*) echo "Unknown parameter passed: $1"; usage ;; *) echo "Unknown parameter passed: $1"; usage ;;
esac esac
@@ -343,6 +345,18 @@ done
# Source autodiscover.sh to load .env file # Source autodiscover.sh to load .env file
source "$(dirname "$0")/autodiscover.sh" source "$(dirname "$0")/autodiscover.sh"
# If --setup: force full autodiscovery and save configuration
if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
echo "Running full autodiscovery (--setup)..."
detect_interfaces || exit 1
detect_local_ip || exit 1
detect_nodes || exit 1
detect_copy_hosts || exit 1
save_config || exit 1
# Reload .env so DOTENV_* variables reflect saved config
load_env_if_exists
fi
# Handle COPY_HOSTS from .env or autodiscovery if not specified via arguments # Handle COPY_HOSTS from .env or autodiscovery if not specified via arguments
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
if [[ -n "$DOTENV_COPY_HOSTS" ]]; then if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
@@ -351,14 +365,13 @@ if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
COPY_HOSTS=("${HOSTS_FROM_ENV[@]}") COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
else else
echo "No hosts specified. Using autodiscovery..." echo "No hosts specified. Using autodiscovery..."
detect_nodes detect_interfaces || { echo "Error: Interface detection failed."; exit 1; }
if [ $? -ne 0 ]; then detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; }
echo "Error: Autodiscovery failed." detect_nodes || { echo "Error: Node detection failed."; exit 1; }
exit 1 detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; }
fi
if [ ${#PEER_NODES[@]} -gt 0 ]; then if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then
COPY_HOSTS=("${PEER_NODES[@]}") COPY_HOSTS=("${COPY_PEER_NODES[@]}")
fi fi
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then

View File

@@ -7,6 +7,8 @@ HUB_PATH="${HF_HOME:-$HOME/.cache/huggingface}/hub"
COPY_HOSTS=() COPY_HOSTS=()
SSH_USER="$USER" SSH_USER="$USER"
PARALLEL_COPY=false PARALLEL_COPY=false
CONFIG_FILE=""
CONFIG_FILE_SET=false
# Help function # Help function
usage() { usage() {
@@ -16,6 +18,7 @@ usage() {
echo " --copy-to-host : Alias for --copy-to (backwards compatibility)." echo " --copy-to-host : Alias for --copy-to (backwards compatibility)."
echo " --copy-parallel : Copy to all hosts in parallel instead of serially." echo " --copy-parallel : Copy to all hosts in parallel instead of serially."
echo " -u, --user <user> : Username for ssh commands (default: \$USER)" echo " -u, --user <user> : Username for ssh commands (default: \$USER)"
echo " --config <file> : Path to .env configuration file (default: .env in script directory)"
echo " -h, --help : Show this help message" echo " -h, --help : Show this help message"
exit 1 exit 1
} }
@@ -53,42 +56,22 @@ copy_model_to_host() {
} }
# Argument parsing # Argument parsing
COPY_TO_FLAG=false
while [[ "$#" -gt 0 ]]; do while [[ "$#" -gt 0 ]]; do
case $1 in case $1 in
-c|--copy-to|--copy-to-host|--copy-to-hosts) -c|--copy-to|--copy-to-host|--copy-to-hosts)
COPY_TO_FLAG=true
shift shift
# Consume arguments until the next flag or end of args # Consume arguments until the next flag or end of args
while [[ "$#" -gt 0 && "$1" != -* ]]; do while [[ "$#" -gt 0 && "$1" != -* ]]; do
add_copy_hosts "$1" add_copy_hosts "$1"
shift shift
done done
# If no hosts specified, use autodiscovery
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "No hosts specified. Using autodiscovery..."
source "$(dirname "$0")/autodiscover.sh"
detect_nodes
if [ $? -ne 0 ]; then
echo "Error: Autodiscovery failed."
exit 1
fi
# Use PEER_NODES directly
if [ ${#PEER_NODES[@]} -gt 0 ]; then
COPY_HOSTS=("${PEER_NODES[@]}")
fi
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "Error: Autodiscovery found no other nodes."
exit 1
fi
echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
fi
continue continue
;; ;;
--copy-parallel) PARALLEL_COPY=true ;; --copy-parallel) PARALLEL_COPY=true ;;
-u|--user) SSH_USER="$2"; shift ;; -u|--user) SSH_USER="$2"; shift ;;
--config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
-h|--help) usage ;; -h|--help) usage ;;
*) *)
# If positional argument is provided # If positional argument is provided
@@ -103,12 +86,47 @@ while [[ "$#" -gt 0 ]]; do
shift shift
done done
# Export config so autodiscover.sh picks it up
export CONFIG_FILE CONFIG_FILE_SET
# Source autodiscover.sh to load .env (for DOTENV_COPY_HOSTS) and make detection functions available
source "$(dirname "$0")/autodiscover.sh"
# Validate model name is provided # Validate model name is provided
if [ -z "${MODEL_NAME:-}" ]; then if [ -z "${MODEL_NAME:-}" ]; then
echo "Error: Model name is required." echo "Error: Model name is required."
usage usage
fi fi
# Resolve COPY_HOSTS if --copy-to was given without hosts, or use .env
if [ "$COPY_TO_FLAG" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
# --copy-to was specified but no hosts given: use .env or autodiscover
if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
else
echo "No hosts specified. Using autodiscovery..."
detect_interfaces || { echo "Error: Interface detection failed."; exit 1; }
detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; }
detect_nodes || { echo "Error: Node detection failed."; exit 1; }
detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; }
if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then
COPY_HOSTS=("${COPY_PEER_NODES[@]}")
fi
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "Error: Autodiscovery found no other nodes."
exit 1
fi
echo "Autodiscovered copy hosts: ${COPY_HOSTS[*]}"
fi
elif [ "$COPY_TO_FLAG" = false ] && [ "${#COPY_HOSTS[@]}" -eq 0 ] && [[ -n "$DOTENV_COPY_HOSTS" ]]; then
# No --copy-to flag but .env has COPY_HOSTS — don't auto-copy; user must request it explicitly
: # intentional no-op; user didn't ask for copy
fi
# Check if uvx is installed # Check if uvx is installed
if ! command -v uvx &> /dev/null; then if ! command -v uvx &> /dev/null; then
echo "Error: 'uvx' command not found." echo "Error: 'uvx' command not found."

View File

@@ -68,7 +68,8 @@ usage() {
echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)" echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)"
echo " --pids-limit Process limit (default: 4096, only with --non-privileged)" echo " --pids-limit Process limit (default: 4096, only with --non-privileged)"
echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)" echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)"
echo " --config Path to .env configuration file (default: .env in script directory)" echo " --config Path to .env configuration file (default: .env in script directory)
--setup Force autodiscovery and save configuration (even if .env exists)"
echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script." echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script."
echo " command Command to run (only for 'exec' action). Not compatible with --launch-script." echo " command Command to run (only for 'exec' action). Not compatible with --launch-script."
echo "" echo ""
@@ -131,6 +132,7 @@ while [[ "$#" -gt 0 ]]; do
-d) DAEMON_MODE="true" ;; -d) DAEMON_MODE="true" ;;
-h|--help) usage ;; -h|--help) usage ;;
--config) CONFIG_FILE="$2"; shift ;; --config) CONFIG_FILE="$2"; shift ;;
--setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
start|stop|status) start|stop|status)
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script." echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
@@ -411,6 +413,21 @@ done
# Source autodiscover module # Source autodiscover module
source "$(dirname "$0")/autodiscover.sh" source "$(dirname "$0")/autodiscover.sh"
if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
# --setup: force full autodiscovery and save configuration
echo "Running full autodiscovery (--setup)..."
detect_interfaces || exit 1
detect_local_ip || exit 1
detect_nodes || exit 1
detect_copy_hosts || exit 1
save_config || exit 1
# Reload .env so DOTENV_* variables reflect saved config
load_env_if_exists
[[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]] && NODES_ARG="$DOTENV_CLUSTER_NODES"
[[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]] && ETH_IF="$DOTENV_ETH_IF"
[[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]] && IB_IF="$DOTENV_IB_IF"
fi
if [[ "$SOLO_MODE" == "true" ]]; then if [[ "$SOLO_MODE" == "true" ]]; then
# Solo mode: skip node detection, just get local IP # Solo mode: skip node detection, just get local IP
# Use LOCAL_IP from .env if set, otherwise default to 127.0.0.1 # Use LOCAL_IP from .env if set, otherwise default to 127.0.0.1

View File

@@ -547,7 +547,6 @@ def load_env_file() -> dict[str, str]:
Reads the .env file created by --discover for persistent cluster configuration. Reads the .env file created by --discover for persistent cluster configuration.
EXTENSIBILITY: EXTENSIBILITY:
- To add new persistent settings: Just add them to save_env_file()
- To support multiple .env files: Add a --env-file CLI argument - To support multiple .env files: Add a --env-file CLI argument
- To add validation: Check for required keys after loading - To add validation: Check for required keys after loading
@@ -573,57 +572,16 @@ def load_env_file() -> dict[str, str]:
return env return env
def save_env_file(env: dict[str, str]) -> None:
"""
Save environment variables to .env file.
Persists cluster configuration discovered by autodiscover.sh.
Values are properly quoted if they contain spaces or commas.
EXTENSIBILITY:
- To add new persistent settings: Just add them to the env dict before calling
- To add timestamps/metadata: Add comment lines to the output
- To support append mode: Read existing, merge, then write
Args:
env: Dictionary of key=value pairs to save
"""
lines = ["# Auto-generated by run-recipe.py --discover", ""]
for key, value in sorted(env.items()):
# Quote values with spaces
if " " in value or "," in value:
lines.append(f'{key}="{value}"')
else:
lines.append(f"{key}={value}")
lines.append("")
with open(ENV_FILE, "w") as f:
f.write("\n".join(lines))
print(f"Saved to {ENV_FILE}")
def run_autodiscover() -> dict[str, str] | None: def run_autodiscover() -> dict[str, str] | None:
""" """
Run autodiscover.sh and return discovered configuration. Run autodiscover.sh interactively and return discovered configuration.
Executes the autodiscover.sh script to detect cluster topology, Executes the autodiscover.sh script to detect cluster topology,
then presents an interactive node selection menu. including interactive per-node confirmation and .env saving.
After autodiscover.sh completes, reads configuration from .env file.
EXTENSIBILITY:
- To add new discovery methods: Extend autodiscover.sh or add Python detection here
- To add GPU detection: Add nvidia-smi parsing to discovered env
- To skip interactive selection: Add a --non-interactive flag
- To add node health checks: Ping/SSH test each discovered node
DISCOVERED VARIABLES:
CLUSTER_NODES: Comma-separated list of node IPs (user-selected)
LOCAL_IP: This machine's IP address
ETH_IF: Ethernet interface name (e.g., 'eth0')
IB_IF: InfiniBand interface name (e.g., 'ibp12s0') if available
Returns: Returns:
Dictionary with discovered configuration, or None if discovery failed Dictionary with discovered configuration from .env, or None if discovery failed
""" """
if not AUTODISCOVER_SCRIPT.exists(): if not AUTODISCOVER_SCRIPT.exists():
print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}") print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}")
@@ -632,86 +590,29 @@ def run_autodiscover() -> dict[str, str] | None:
print("Running autodiscover...") print("Running autodiscover...")
print() print()
# Run autodiscover in a subshell and capture the variables # Build env for the subprocess so CONFIG_FILE is passed through
# We source the script and print the variables we care about env_vars = os.environ.copy()
env_vars["CONFIG_FILE"] = str(ENV_FILE)
env_vars["CONFIG_FILE_SET"] = "true"
# Run autodiscover interactively so its prompts are shown to the user
script = f""" script = f"""
source '{AUTODISCOVER_SCRIPT}' source '{AUTODISCOVER_SCRIPT}'
detect_interfaces run_autodiscover
detect_local_ip
detect_nodes
echo "CLUSTER_NODES=$NODES_ARG"
echo "LOCAL_IP=$LOCAL_IP"
echo "ETH_IF=$ETH_IF"
echo "IB_IF=$IB_IF"
""" """
result = subprocess.run(["bash", "-c", script], capture_output=True, text=True) result = subprocess.run(["bash", "-c", script], env=env_vars)
if result.returncode != 0: if result.returncode != 0:
print("Autodiscover output:")
print(result.stdout)
if result.stderr:
print(result.stderr)
print("Error: Autodiscover failed") print("Error: Autodiscover failed")
return None return None
# Print the autodiscover output (excluding the final variable lines) # Read configuration from the .env file that autodiscover.sh wrote
output_lines = result.stdout.strip().split("\n") env = load_env_file()
env = {} if not env.get("CLUSTER_NODES"):
for line in output_lines: print("Autodiscover completed but no CLUSTER_NODES found in .env")
if "=" in line and any(
line.startswith(k)
for k in ["CLUSTER_NODES=", "LOCAL_IP=", "ETH_IF=", "IB_IF="]
):
key, _, value = line.partition("=")
env[key] = value
else:
print(line)
print()
# Interactive node selection
if env.get("CLUSTER_NODES"):
all_nodes = [n.strip() for n in env["CLUSTER_NODES"].split(",") if n.strip()]
local_ip = env.get("LOCAL_IP", "")
if len(all_nodes) > 1:
print("Select which nodes to include in the cluster:")
print()
selected_nodes = []
for node in all_nodes:
is_local = node == local_ip
label = f"{node} (this machine)" if is_local else node
# Default to yes for all nodes
while True:
response = input(f" Include {label}? [Y/n]: ").strip().lower()
if response in ("", "y", "yes"):
selected_nodes.append(node)
break
elif response in ("n", "no"):
break
else:
print(" Please enter 'y' or 'n'")
print()
if not selected_nodes:
print("No nodes selected. Aborting.")
return None return None
if len(selected_nodes) == 1:
print(f"Only one node selected: {selected_nodes[0]}")
print("This will run in solo mode (single node).")
else:
print(
f"Selected {len(selected_nodes)} nodes: {', '.join(selected_nodes)}"
)
env["CLUSTER_NODES"] = ",".join(selected_nodes)
print()
return env return env
@@ -990,8 +891,6 @@ Examples:
print(f" {key}={value}") print(f" {key}={value}")
print() print()
save_env_file(env)
if not args.recipe: if not args.recipe:
return 0 return 0
@@ -1058,20 +957,6 @@ Examples:
nodes = parse_nodes(discovered_env["CLUSTER_NODES"]) nodes = parse_nodes(discovered_env["CLUSTER_NODES"])
nodes_from_env = True nodes_from_env = True
if nodes:
# Ask if user wants to save to .env
print()
response = (
input(
"Save this configuration to .env for future use? [Y/n]: "
)
.strip()
.lower()
)
if response in ("", "y", "yes"):
save_env_file(discovered_env)
print()
# Resolve network interfaces: CLI > .env > auto-detect by launch-cluster.sh # Resolve network interfaces: CLI > .env > auto-detect by launch-cluster.sh
eth_if = args.eth_if or None eth_if = args.eth_if or None
ib_if = args.ib_if or None ib_if = args.ib_if or None