Autodiscovery refactoring with mesh support

This commit is contained in:
Eugene Rakhmatulin
2026-03-26 15:47:41 -07:00
parent 83a74bccec
commit a78e221de3
5 changed files with 401 additions and 270 deletions

View File

@@ -1,43 +1,44 @@
#!/bin/bash #!/bin/bash
SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
# Load .env file if exists (for shared configuration) # Load .env file if exists (for shared configuration)
# This is called early so that DOTENV_* variables are available to all functions # This is called early so that DOTENV_* variables are available to all functions
load_env_if_exists() { load_env_if_exists() {
local env_file="${CONFIG_FILE:-}" local env_file="${CONFIG_FILE:-}"
local config_explicit="${CONFIG_FILE_SET:-false}" local config_explicit="${CONFIG_FILE_SET:-false}"
# If CONFIG_FILE is not set, check default location # If CONFIG_FILE is not set, check default location
if [[ -z "$env_file" ]]; then if [[ -z "$env_file" ]]; then
local script_dir="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" env_file="$SCRIPT_DIR/.env"
env_file="$script_dir/.env"
config_explicit="false" config_explicit="false"
fi fi
# Validate config file exists if explicitly specified # Validate config file exists if explicitly specified
if [[ "$config_explicit" == "true" ]] && [[ ! -f "$env_file" ]]; then if [[ "$config_explicit" == "true" ]] && [[ ! -f "$env_file" ]]; then
echo "Error: Config file not found: $env_file" echo "Error: Config file not found: $env_file"
exit 1 exit 1
fi fi
if [[ -f "$env_file" ]]; then if [[ -f "$env_file" ]]; then
# Load .env variables with DOTENV_ prefix # Load .env variables with DOTENV_ prefix
while IFS='=' read -r key value || [[ -n "$key" ]]; do while IFS='=' read -r key value || [[ -n "$key" ]]; do
# Skip comments and empty lines # Skip comments and empty lines
[[ "$key" =~ ^[[:space:]]*# ]] && continue [[ "$key" =~ ^[[:space:]]*# ]] && continue
[[ -z "$key" ]] && continue [[ -z "$key" ]] && continue
# Remove leading/trailing whitespace from key # Remove leading/trailing whitespace from key
key=$(echo "$key" | xargs) key=$(echo "$key" | xargs)
# Skip if key is empty after trimming # Skip if key is empty after trimming
[[ -z "$key" ]] && continue [[ -z "$key" ]] && continue
# Remove quotes from value # Remove quotes from value
value="${value%\"}" value="${value%\"}"
value="${value#\"}" value="${value#\"}"
value="${value%\'}" value="${value%\'}"
value="${value#\'}" value="${value#\'}"
# Export with DOTENV_ prefix # Export with DOTENV_ prefix
export "DOTENV_$key=$value" export "DOTENV_$key=$value"
done < "$env_file" done < "$env_file"
@@ -47,6 +48,9 @@ load_env_if_exists() {
# Load .env file # Load .env file
load_env_if_exists load_env_if_exists
# Mesh mode flag (set by detect_interfaces)
MESH_MODE="false"
# Function to detect IB and Ethernet interfaces # Function to detect IB and Ethernet interfaces
detect_interfaces() { detect_interfaces() {
# If both interfaces are already set, nothing to do # If both interfaces are already set, nothing to do
@@ -61,60 +65,126 @@ detect_interfaces() {
fi fi
echo "Auto-detecting interfaces..." echo "Auto-detecting interfaces..."
# Get all Up interfaces: "rocep1s0f1 port 1 ==> enp1s0f1np1 (Up)" # Get all Up interfaces: "rocep1s0f1 port 1 ==> enp1s0f1np1 (Up)"
# We capture: IB_DEV, NET_DEV # We capture: IB_DEV, NET_DEV
mapfile -t IB_NET_PAIRS < <(ibdev2netdev | awk '/Up\)/ {print $1 " " $5}') mapfile -t IB_NET_PAIRS < <(ibdev2netdev | awk '/Up\)/ {print $1 " " $5}')
if [ ${#IB_NET_PAIRS[@]} -eq 0 ]; then if [ ${#IB_NET_PAIRS[@]} -eq 0 ]; then
echo "Error: No active IB interfaces found." echo "Error: No active IB interfaces found."
return 1 return 1
fi fi
DETECTED_IB_IFS=() DETECTED_IB_IFS=()
CANDIDATE_ETH_IFS=() ALL_NET_IFS=()
for pair in "${IB_NET_PAIRS[@]}"; do for pair in "${IB_NET_PAIRS[@]}"; do
ib_dev=$(echo "$pair" | awk '{print $1}') ib_dev=$(echo "$pair" | awk '{print $1}')
net_dev=$(echo "$pair" | awk '{print $2}') net_dev=$(echo "$pair" | awk '{print $2}')
DETECTED_IB_IFS+=("$ib_dev") DETECTED_IB_IFS+=("$ib_dev")
ALL_NET_IFS+=("$net_dev")
# Check if interface has an IP address done
if ip addr show "$net_dev" | grep -q "inet "; then
CANDIDATE_ETH_IFS+=("$net_dev") local num_up="${#IB_NET_PAIRS[@]}"
# --- Sanity checks ---
# 1. enp* (no capital P) interfaces MUST have an IP
for net_dev in "${ALL_NET_IFS[@]}"; do
if [[ "$net_dev" =~ ^enp[^P] ]] || [[ "$net_dev" == enp* && "$net_dev" != *P* ]]; then
if ! ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
echo "Error: Interface $net_dev (enp*, no capital P) is Up but has no IP address assigned."
return 1
fi
fi fi
done done
# Set IB_IF if not provided # 2. No two interfaces with IPs should share the same subnet
if [[ -z "$IB_IF" ]]; then declare -A SEEN_SUBNETS
IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}") for net_dev in "${ALL_NET_IFS[@]}"; do
echo " Detected IB_IF: $IB_IF" local cidr
fi cidr=$(ip -o -f inet addr show "$net_dev" 2>/dev/null | awk '{print $4}' | head -n1)
[[ -z "$cidr" ]] && continue
# Set ETH_IF if not provided # Compute network address using python3
if [[ -z "$ETH_IF" ]]; then local net_addr
if [ ${#CANDIDATE_ETH_IFS[@]} -eq 0 ]; then net_addr=$(python3 -c "import ipaddress; print(str(ipaddress.ip_network('$cidr', strict=False)))" 2>/dev/null)
echo "Error: No active IB-associated interfaces have IP addresses." if [[ -n "${SEEN_SUBNETS[$net_addr]}" ]]; then
echo "Error: Interfaces $net_dev and ${SEEN_SUBNETS[$net_addr]} share the same subnet ($net_addr). Check network configuration."
return 1 return 1
fi fi
SEEN_SUBNETS["$net_addr"]="$net_dev"
# Selection logic: Prefer interface without capital 'P' done
SELECTED_ETH=""
for iface in "${CANDIDATE_ETH_IFS[@]}"; do # --- Mode selection ---
if [[ "$iface" != *"P"* ]]; then
SELECTED_ETH="$iface" if [[ "$num_up" -eq 2 ]]; then
break # Non-mesh configuration
fi MESH_MODE="false"
done echo " Non-mesh mode: 2 CX7 interfaces active."
# Fallback: Use the first one if all have 'P' or none found yet # Set IB_IF if not provided
if [[ -z "$SELECTED_ETH" ]]; then if [[ -z "$IB_IF" ]]; then
SELECTED_ETH="${CANDIDATE_ETH_IFS[0]}" IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}")
echo " Detected IB_IF: $IB_IF"
fi fi
ETH_IF="$SELECTED_ETH" # Set ETH_IF if not provided: prefer interface without capital 'P'
echo " Detected ETH_IF: $ETH_IF" if [[ -z "$ETH_IF" ]]; then
local selected_eth=""
for net_dev in "${ALL_NET_IFS[@]}"; do
if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
if [[ "$net_dev" != *P* ]]; then
selected_eth="$net_dev"
break
fi
fi
done
# Fallback: first interface with an IP
if [[ -z "$selected_eth" ]]; then
for net_dev in "${ALL_NET_IFS[@]}"; do
if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
selected_eth="$net_dev"
break
fi
done
fi
if [[ -z "$selected_eth" ]]; then
echo "Error: No active IB-associated interfaces have IP addresses."
return 1
fi
ETH_IF="$selected_eth"
echo " Detected ETH_IF: $ETH_IF"
fi
elif [[ "$num_up" -eq 4 ]]; then
# Mesh configuration
MESH_MODE="true"
echo " Mesh mode: all 4 CX7 interfaces active."
# Set IB_IF to all four RoCE interfaces (hardcoded for mesh)
if [[ -z "$IB_IF" ]]; then
IB_IF="rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1"
echo " Detected IB_IF: $IB_IF"
fi
# Set ETH_IF: check enP7s7 first, then wlP9s9
if [[ -z "$ETH_IF" ]]; then
if ip addr show enP7s7 2>/dev/null | grep -q "inet "; then
ETH_IF="enP7s7"
echo " Detected ETH_IF: $ETH_IF"
elif ip addr show wlP9s9 2>/dev/null | grep -q "inet "; then
ETH_IF="wlP9s9"
echo " Detected ETH_IF: $ETH_IF"
echo " Warning: using wireless interface (wlP9s9) for cluster coordination. Performance may be limited."
else
echo "Error: Mesh mode requires enP7s7 or wlP9s9 to be up with an IP address for cluster coordination."
return 1
fi
fi
else
echo "Error: Unexpected number of active CX7 interfaces ($num_up). Expected 2 (non-mesh) or 4 (mesh)."
return 1
fi fi
} }
@@ -131,16 +201,51 @@ detect_local_ip() {
# Get CIDR of the selected ETH_IF # Get CIDR of the selected ETH_IF
CIDR=$(ip -o -f inet addr show "$ETH_IF" | awk '{print $4}' | head -n 1) CIDR=$(ip -o -f inet addr show "$ETH_IF" | awk '{print $4}' | head -n 1)
if [[ -z "$CIDR" ]]; then if [[ -z "$CIDR" ]]; then
echo "Error: Could not determine IP/CIDR for interface $ETH_IF" echo "Error: Could not determine IP/CIDR for interface $ETH_IF"
return 1 return 1
fi fi
LOCAL_IP=${CIDR%/*} LOCAL_IP=${CIDR%/*}
echo " Detected Local IP: $LOCAL_IP ($CIDR)" echo " Detected Local IP: $LOCAL_IP ($CIDR)"
} }
# Scan a subnet for GB10-capable peers via SSH
# Usage: _scan_subnet_for_gb10 <cidr> <local_ip_to_exclude> <output_file>
#
# Arguments:
#   $1 - subnet in CIDR notation (host bits may be set; parsed with strict=False)
#   $2 - IP address to skip (normally this machine's own address)
#   $3 - file to which one line per matching peer IP is appended
# Returns:
#   0 once every probe has finished (even when no peer was found)
#   1 if python3 or nc is unavailable, or the subnet cannot be enumerated
_scan_subnet_for_gb10() {
  local cidr="$1"
  local exclude_ip="$2"
  local out_file="$3"

  if ! command -v python3 &> /dev/null; then
    echo "Error: python3 not found."
    return 1
  fi
  if ! command -v nc &> /dev/null; then
    echo "Error: nc (netcat) not found."
    return 1
  fi

  # Expand the CIDR into its host addresses. A failure here (e.g. a malformed
  # CIDR) must abort the scan instead of silently probing nothing.
  local all_ips
  if ! all_ips=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$cidr"); then
    echo "Error: Could not enumerate hosts for subnet '$cidr'."
    return 1
  fi

  local ip
  local -a probe_pids=()
  # Word-splitting of $all_ips is intentional: one address per word.
  for ip in $all_ips; do
    [[ "$ip" == "$exclude_ip" ]] && continue
    (
      # Cheap TCP probe first so SSH is only attempted where port 22 answers.
      if nc -z -w 1 "$ip" 22 &>/dev/null; then
        # Check if remote is a GB10 system
        if ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o BatchMode=yes "$ip" \
          "nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null" \
          2>/dev/null | grep -q "NVIDIA GB10"; then
          echo "$ip" >> "$out_file"
        fi
      fi
    ) &
    probe_pids+=("$!")
  done

  # Wait only for the probes started here, not for unrelated background jobs
  # the caller may have running.
  if (( ${#probe_pids[@]} > 0 )); then
    wait "${probe_pids[@]}"
  fi
  return 0
}
# Function to detect cluster nodes # Function to detect cluster nodes
detect_nodes() { detect_nodes() {
detect_local_ip || return 1 detect_local_ip || return 1
@@ -157,72 +262,165 @@ detect_nodes() {
done done
return 0 return 0
fi fi
# Try to use COPY_HOSTS from .env # Try to use CLUSTER_NODES from .env
if [[ -n "$DOTENV_COPY_HOSTS" ]]; then if [[ -n "$DOTENV_CLUSTER_NODES" ]]; then
echo " Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS" echo " Using CLUSTER_NODES from .env: $DOTENV_CLUSTER_NODES"
PEER_NODES=() PEER_NODES=()
IFS=',' read -ra ALL_NODES <<< "$DOTENV_COPY_HOSTS" IFS=',' read -ra ALL_NODES <<< "$DOTENV_CLUSTER_NODES"
for node in "${ALL_NODES[@]}"; do for node in "${ALL_NODES[@]}"; do
node=$(echo "$node" | xargs) node=$(echo "$node" | xargs)
PEER_NODES+=("$node") [[ "$node" != "$LOCAL_IP" ]] && PEER_NODES+=("$node")
done done
NODES_ARG="$DOTENV_COPY_HOSTS" NODES_ARG="$DOTENV_CLUSTER_NODES"
return 0 return 0
fi fi
echo "Auto-detecting nodes..." echo "Auto-detecting nodes on $CIDR (checking for NVIDIA GB10)..."
if ! command -v nc &> /dev/null; then local temp_file
echo "Error: nc (netcat) not found. Please install netcat." temp_file=$(mktemp)
return 1
fi _scan_subnet_for_gb10 "$CIDR" "$LOCAL_IP" "$temp_file"
if ! command -v python3 &> /dev/null; then PEER_NODES=()
echo "Error: python3 not found. Please install python3." local detected_ips=("$LOCAL_IP")
return 1 if [[ -f "$temp_file" ]]; then
while read -r ip; do
PEER_NODES+=("$ip")
detected_ips+=("$ip")
echo " Found GB10 peer: $ip"
done < <(sort "$temp_file")
rm -f "$temp_file"
fi fi
DETECTED_IPS=("$LOCAL_IP") # Sort and set NODES_ARG
PEER_NODES=() IFS=$'\n' SORTED_IPS=($(sort <<<"${detected_ips[*]}"))
echo " Scanning for SSH peers on $CIDR..."
# Generate list of IPs using python
ALL_IPS=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$CIDR")
TEMP_IPS_FILE=$(mktemp)
# Scan in parallel
for ip in $ALL_IPS; do
# Skip own IP
if [[ "$ip" == "$LOCAL_IP" ]]; then continue; fi
(
# Check port 22 with 1 second timeout
if nc -z -w 1 "$ip" 22 &>/dev/null; then
echo "$ip" >> "$TEMP_IPS_FILE"
fi
) &
done
# Wait for all background scans to complete
wait
# Read found IPs
if [[ -f "$TEMP_IPS_FILE" ]]; then
while read -r ip; do
DETECTED_IPS+=("$ip")
PEER_NODES+=("$ip")
echo " Found peer: $ip"
done < "$TEMP_IPS_FILE"
rm -f "$TEMP_IPS_FILE"
fi
# Sort IPs
IFS=$'\n' SORTED_IPS=($(sort <<<"${DETECTED_IPS[*]}"))
unset IFS unset IFS
NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}") NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}")
echo " Cluster Nodes: $NODES_ARG" echo " Cluster Nodes: $NODES_ARG"
} }
# Function to detect COPY_HOSTS for build/model distribution
# In non-mesh mode: COPY_PEER_NODES = PEER_NODES (same network)
# In mesh mode: scan enp* interfaces (direct IB-attached) for GB10 peers
#
# Globals:
#   MESH_MODE (read), PEER_NODES (read), COPY_PEER_NODES (written)
# Returns:
#   0 on success (COPY_PEER_NODES may be empty if nothing was found)
#   1 if the temporary file cannot be created or a subnet scan fails
detect_copy_hosts() {
  if [[ "$MESH_MODE" == "false" ]]; then
    COPY_PEER_NODES=("${PEER_NODES[@]}")
    return 0
  fi

  # Mesh mode: scan enp1s0f0np0 and enp1s0f1np1 subnets
  echo "Auto-detecting COPY_HOSTS on direct IB interfaces (mesh mode)..."

  local temp_file
  if ! temp_file=$(mktemp); then
    echo "Error: Could not create temporary file."
    return 1
  fi

  local iface cidr local_iface_ip
  for iface in enp1s0f0np0 enp1s0f1np1; do
    cidr=$(ip -o -f inet addr show "$iface" 2>/dev/null | awk '{print $4}' | head -n1)
    # Interfaces without an IPv4 address are simply not scanned.
    [[ -z "$cidr" ]] && continue
    local_iface_ip="${cidr%/*}"
    echo " Scanning $iface ($cidr)..."
    # A failed scan (missing tools, unusable CIDR) must not be reported as
    # "no hosts found"; propagate it to the caller.
    if ! _scan_subnet_for_gb10 "$cidr" "$local_iface_ip" "$temp_file"; then
      rm -f "$temp_file"
      return 1
    fi
  done

  # Deduplicate and collect results (sort -u yields each peer once, in order)
  COPY_PEER_NODES=()
  local ip
  while read -r ip; do
    [[ -z "$ip" ]] && continue
    COPY_PEER_NODES+=("$ip")
    echo " Found GB10 copy host: $ip"
  done < <(sort -u "$temp_file")
  rm -f "$temp_file"
  return 0
}
# Save discovered configuration to .env
# Skips if .env already exists unless FORCE_DISCOVER=true
save_config() {
local env_file="${CONFIG_FILE:-$SCRIPT_DIR/.env}"
# Skip if .env exists and not forced
if [[ -f "$env_file" && "${FORCE_DISCOVER:-false}" != "true" ]]; then
return 0
fi
echo ""
local save_prompt="Save discovered configuration to $env_file?"
if [[ -f "$env_file" ]]; then
save_prompt="Overwrite existing configuration in $env_file?"
fi
read -r -p "$save_prompt [Y/n]: " response
response="${response,,}"
if [[ "$response" =~ ^(n|no)$ ]]; then
return 0
fi
# Build list of all cluster nodes (local + peers)
local all_cluster_nodes=()
if [[ -n "$LOCAL_IP" ]]; then
all_cluster_nodes+=("$LOCAL_IP")
fi
for node in "${PEER_NODES[@]}"; do
all_cluster_nodes+=("$node")
done
# Per-node confirmation for CLUSTER_NODES
echo ""
echo "Select nodes for CLUSTER_NODES:"
local selected_cluster=()
for node in "${all_cluster_nodes[@]}"; do
local label="$node"
[[ "$node" == "$LOCAL_IP" ]] && label="$node (this machine)"
read -r -p " Include $label? [Y/n]: " r
r="${r,,}"
if [[ ! "$r" =~ ^(n|no)$ ]]; then
selected_cluster+=("$node")
fi
done
if [[ "${#selected_cluster[@]}" -eq 0 ]]; then
echo "No nodes selected. Aborting save."
return 1
fi
# Per-node confirmation for COPY_HOSTS
echo ""
echo "Select nodes for COPY_HOSTS (build/model distribution):"
local selected_copy=()
for node in "${COPY_PEER_NODES[@]}"; do
read -r -p " Include $node in COPY_HOSTS? [Y/n]: " r
r="${r,,}"
if [[ ! "$r" =~ ^(n|no)$ ]]; then
selected_copy+=("$node")
fi
done
# Write .env
{
echo "# Auto-generated by autodiscover.sh"
echo "CLUSTER_NODES=$(IFS=,; echo "${selected_cluster[*]}")"
if [[ "${#selected_copy[@]}" -gt 0 ]]; then
echo "COPY_HOSTS=$(IFS=,; echo "${selected_copy[*]}")"
fi
echo "LOCAL_IP=$LOCAL_IP"
echo "ETH_IF=$ETH_IF"
echo "IB_IF=$IB_IF"
} > "$env_file"
echo ""
echo "Saved to $env_file"
}
# Convenience wrapper for the whole autodiscovery pipeline.
# Runs each detection stage in order and stops with status 1 at the first
# stage that fails; otherwise the return status is that of save_config.
run_autodiscover() {
  local stage
  for stage in detect_interfaces detect_local_ip detect_nodes detect_copy_hosts; do
    "$stage" || return 1
  done
  save_config
}

View File

@@ -282,6 +282,7 @@ usage() {
echo " --network <network> : Docker network to use during build" echo " --network <network> : Docker network to use during build"
echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory" echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory"
echo " --config : Path to .env configuration file (default: .env in script directory)" echo " --config : Path to .env configuration file (default: .env in script directory)"
echo " --setup : Force autodiscovery and save configuration (even if .env exists)"
echo " -h, --help : Show this help message" echo " -h, --help : Show this help message"
exit 1 exit 1
} }
@@ -334,6 +335,7 @@ while [[ "$#" -gt 0 ]]; do
fi fi
;; ;;
--config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;; --config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
--setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
-h|--help) usage ;; -h|--help) usage ;;
*) echo "Unknown parameter passed: $1"; usage ;; *) echo "Unknown parameter passed: $1"; usage ;;
esac esac
@@ -343,6 +345,18 @@ done
# Source autodiscover.sh to load .env file # Source autodiscover.sh to load .env file
source "$(dirname "$0")/autodiscover.sh" source "$(dirname "$0")/autodiscover.sh"
# If --setup: force full autodiscovery and save configuration
# FORCE_DISCOVER is set to "true" by the --setup option in the argument loop.
# Each stage aborts the script with status 1 on failure, so later stages
# never run on partial results.
if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
echo "Running full autodiscovery (--setup)..."
detect_interfaces || exit 1
detect_local_ip || exit 1
detect_nodes || exit 1
detect_copy_hosts || exit 1
# save_config also returns non-zero when the user deselects every node,
# which ends the script here as well.
save_config || exit 1
# Reload .env so DOTENV_* variables reflect saved config
load_env_if_exists
fi
# Handle COPY_HOSTS from .env or autodiscovery if not specified via arguments # Handle COPY_HOSTS from .env or autodiscovery if not specified via arguments
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
if [[ -n "$DOTENV_COPY_HOSTS" ]]; then if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
@@ -351,19 +365,18 @@ if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
COPY_HOSTS=("${HOSTS_FROM_ENV[@]}") COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
else else
echo "No hosts specified. Using autodiscovery..." echo "No hosts specified. Using autodiscovery..."
detect_nodes detect_interfaces || { echo "Error: Interface detection failed."; exit 1; }
if [ $? -ne 0 ]; then detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; }
echo "Error: Autodiscovery failed." detect_nodes || { echo "Error: Node detection failed."; exit 1; }
exit 1 detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; }
fi
if [ ${#PEER_NODES[@]} -gt 0 ]; then if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then
COPY_HOSTS=("${PEER_NODES[@]}") COPY_HOSTS=("${COPY_PEER_NODES[@]}")
fi fi
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "Error: Autodiscovery found no other nodes." echo "Error: Autodiscovery found no other nodes."
exit 1 exit 1
fi fi
echo "Autodiscovered hosts: ${COPY_HOSTS[*]}" echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
fi fi

View File

@@ -7,6 +7,8 @@ HUB_PATH="${HF_HOME:-$HOME/.cache/huggingface}/hub"
COPY_HOSTS=() COPY_HOSTS=()
SSH_USER="$USER" SSH_USER="$USER"
PARALLEL_COPY=false PARALLEL_COPY=false
CONFIG_FILE=""
CONFIG_FILE_SET=false
# Help function # Help function
usage() { usage() {
@@ -16,6 +18,7 @@ usage() {
echo " --copy-to-host : Alias for --copy-to (backwards compatibility)." echo " --copy-to-host : Alias for --copy-to (backwards compatibility)."
echo " --copy-parallel : Copy to all hosts in parallel instead of serially." echo " --copy-parallel : Copy to all hosts in parallel instead of serially."
echo " -u, --user <user> : Username for ssh commands (default: \$USER)" echo " -u, --user <user> : Username for ssh commands (default: \$USER)"
echo " --config <file> : Path to .env configuration file (default: .env in script directory)"
echo " -h, --help : Show this help message" echo " -h, --help : Show this help message"
exit 1 exit 1
} }
@@ -37,11 +40,11 @@ copy_model_to_host() {
local host="$1" local host="$1"
local model_name="$2" local model_name="$2"
local model_dir="$3" local model_dir="$3"
echo "Copying model '$model_name' to ${SSH_USER}@${host}..." echo "Copying model '$model_name' to ${SSH_USER}@${host}..."
local host_copy_start host_copy_end host_copy_time local host_copy_start host_copy_end host_copy_time
host_copy_start=$(date +%s) host_copy_start=$(date +%s)
if rsync -av --mkpath --progress "$model_dir" "${SSH_USER}@${host}:$HUB_PATH/"; then if rsync -av --mkpath --progress "$model_dir" "${SSH_USER}@${host}:$HUB_PATH/"; then
host_copy_end=$(date +%s) host_copy_end=$(date +%s)
host_copy_time=$((host_copy_end - host_copy_start)) host_copy_time=$((host_copy_end - host_copy_start))
@@ -53,44 +56,24 @@ copy_model_to_host() {
} }
# Argument parsing # Argument parsing
COPY_TO_FLAG=false
while [[ "$#" -gt 0 ]]; do while [[ "$#" -gt 0 ]]; do
case $1 in case $1 in
-c|--copy-to|--copy-to-host|--copy-to-hosts) -c|--copy-to|--copy-to-host|--copy-to-hosts)
COPY_TO_FLAG=true
shift shift
# Consume arguments until the next flag or end of args # Consume arguments until the next flag or end of args
while [[ "$#" -gt 0 && "$1" != -* ]]; do while [[ "$#" -gt 0 && "$1" != -* ]]; do
add_copy_hosts "$1" add_copy_hosts "$1"
shift shift
done done
# If no hosts specified, use autodiscovery
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "No hosts specified. Using autodiscovery..."
source "$(dirname "$0")/autodiscover.sh"
detect_nodes
if [ $? -ne 0 ]; then
echo "Error: Autodiscovery failed."
exit 1
fi
# Use PEER_NODES directly
if [ ${#PEER_NODES[@]} -gt 0 ]; then
COPY_HOSTS=("${PEER_NODES[@]}")
fi
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "Error: Autodiscovery found no other nodes."
exit 1
fi
echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
fi
continue continue
;; ;;
--copy-parallel) PARALLEL_COPY=true ;; --copy-parallel) PARALLEL_COPY=true ;;
-u|--user) SSH_USER="$2"; shift ;; -u|--user) SSH_USER="$2"; shift ;;
--config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
-h|--help) usage ;; -h|--help) usage ;;
*) *)
# If positional argument is provided # If positional argument is provided
if [ -z "${MODEL_NAME:-}" ]; then if [ -z "${MODEL_NAME:-}" ]; then
MODEL_NAME="$1" MODEL_NAME="$1"
@@ -103,12 +86,47 @@ while [[ "$#" -gt 0 ]]; do
shift shift
done done
# Export config so autodiscover.sh picks it up
export CONFIG_FILE CONFIG_FILE_SET
# Source autodiscover.sh to load .env (for DOTENV_COPY_HOSTS) and make detection functions available
source "$(dirname "$0")/autodiscover.sh"
# Validate model name is provided # Validate model name is provided
if [ -z "${MODEL_NAME:-}" ]; then if [ -z "${MODEL_NAME:-}" ]; then
echo "Error: Model name is required." echo "Error: Model name is required."
usage usage
fi fi
# Resolve COPY_HOSTS if --copy-to was given without hosts, or use .env
# Precedence when --copy-to has no explicit hosts: COPY_HOSTS from .env
# (DOTENV_COPY_HOSTS) first, then autodiscovery. Without --copy-to nothing
# is copied, even if .env lists hosts.
if [ "$COPY_TO_FLAG" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
# --copy-to was specified but no hosts given: use .env or autodiscover
if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
# Split the comma-separated list; entries are used as-is (no trimming here).
IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
else
echo "No hosts specified. Using autodiscovery..."
# Run the discovery stages in order; any failure ends the script.
detect_interfaces || { echo "Error: Interface detection failed."; exit 1; }
detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; }
detect_nodes || { echo "Error: Node detection failed."; exit 1; }
detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; }
# COPY_PEER_NODES is filled by detect_copy_hosts.
if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then
COPY_HOSTS=("${COPY_PEER_NODES[@]}")
fi
# Discovery succeeded but produced no peers: nothing to copy to.
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
echo "Error: Autodiscovery found no other nodes."
exit 1
fi
echo "Autodiscovered copy hosts: ${COPY_HOSTS[*]}"
fi
elif [ "$COPY_TO_FLAG" = false ] && [ "${#COPY_HOSTS[@]}" -eq 0 ] && [[ -n "$DOTENV_COPY_HOSTS" ]]; then
# No --copy-to flag but .env has COPY_HOSTS — don't auto-copy; user must request it explicitly
: # intentional no-op; user didn't ask for copy
fi
# Check if uvx is installed # Check if uvx is installed
if ! command -v uvx &> /dev/null; then if ! command -v uvx &> /dev/null; then
echo "Error: 'uvx' command not found." echo "Error: 'uvx' command not found."
@@ -231,4 +249,4 @@ if [ "$COPY_TIME" -gt 0 ]; then
fi fi
echo "Total: $(printf '%02d:%02d:%02d' $((TOTAL_TIME/3600)) $((TOTAL_TIME%3600/60)) $((TOTAL_TIME%60)))" echo "Total: $(printf '%02d:%02d:%02d' $((TOTAL_TIME/3600)) $((TOTAL_TIME%3600/60)) $((TOTAL_TIME%60)))"
echo "=========================================" echo "========================================="
echo "Done downloading $MODEL_NAME." echo "Done downloading $MODEL_NAME."

View File

@@ -68,7 +68,8 @@ usage() {
echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)" echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)"
echo " --pids-limit Process limit (default: 4096, only with --non-privileged)" echo " --pids-limit Process limit (default: 4096, only with --non-privileged)"
echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)" echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)"
echo " --config Path to .env configuration file (default: .env in script directory)" echo " --config Path to .env configuration file (default: .env in script directory)
--setup Force autodiscovery and save configuration (even if .env exists)"
echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script." echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script."
echo " command Command to run (only for 'exec' action). Not compatible with --launch-script." echo " command Command to run (only for 'exec' action). Not compatible with --launch-script."
echo "" echo ""
@@ -131,6 +132,7 @@ while [[ "$#" -gt 0 ]]; do
-d) DAEMON_MODE="true" ;; -d) DAEMON_MODE="true" ;;
-h|--help) usage ;; -h|--help) usage ;;
--config) CONFIG_FILE="$2"; shift ;; --config) CONFIG_FILE="$2"; shift ;;
--setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
start|stop|status) start|stop|status)
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script." echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
@@ -411,6 +413,21 @@ done
# Source autodiscover module # Source autodiscover module
source "$(dirname "$0")/autodiscover.sh" source "$(dirname "$0")/autodiscover.sh"
# FORCE_DISCOVER is set to "true" by the --setup option in the argument loop.
if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
# --setup: force full autodiscovery and save configuration
echo "Running full autodiscovery (--setup)..."
# Each stage exits the script with status 1 on failure.
detect_interfaces || exit 1
detect_local_ip || exit 1
detect_nodes || exit 1
detect_copy_hosts || exit 1
save_config || exit 1
# Reload .env so DOTENV_* variables reflect saved config
load_env_if_exists
# Fill in only the values that are still empty from the reloaded .env;
# anything already set (CLI or discovery) is left untouched.
# NOTE(review): detect_nodes appears to set NODES_ARG itself, so after a
# successful discovery this fallback may never apply and per-node choices
# made in save_config would not affect NODES_ARG for this run — confirm
# whether that is intended.
[[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]] && NODES_ARG="$DOTENV_CLUSTER_NODES"
[[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]] && ETH_IF="$DOTENV_ETH_IF"
[[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]] && IB_IF="$DOTENV_IB_IF"
fi
if [[ "$SOLO_MODE" == "true" ]]; then if [[ "$SOLO_MODE" == "true" ]]; then
# Solo mode: skip node detection, just get local IP # Solo mode: skip node detection, just get local IP
# Use LOCAL_IP from .env if set, otherwise default to 127.0.0.1 # Use LOCAL_IP from .env if set, otherwise default to 127.0.0.1

View File

@@ -547,7 +547,6 @@ def load_env_file() -> dict[str, str]:
Reads the .env file created by --discover for persistent cluster configuration. Reads the .env file created by --discover for persistent cluster configuration.
EXTENSIBILITY: EXTENSIBILITY:
- To add new persistent settings: Just add them to save_env_file()
- To support multiple .env files: Add a --env-file CLI argument - To support multiple .env files: Add a --env-file CLI argument
- To add validation: Check for required keys after loading - To add validation: Check for required keys after loading
@@ -573,57 +572,16 @@ def load_env_file() -> dict[str, str]:
return env return env
def save_env_file(env: dict[str, str]) -> None:
"""
Save environment variables to .env file.
Persists cluster configuration discovered by autodiscover.sh.
Values are properly quoted if they contain spaces or commas.
EXTENSIBILITY:
- To add new persistent settings: Just add them to the env dict before calling
- To add timestamps/metadata: Add comment lines to the output
- To support append mode: Read existing, merge, then write
Args:
env: Dictionary of key=value pairs to save
"""
lines = ["# Auto-generated by run-recipe.py --discover", ""]
for key, value in sorted(env.items()):
# Quote values with spaces
if " " in value or "," in value:
lines.append(f'{key}="{value}"')
else:
lines.append(f"{key}={value}")
lines.append("")
with open(ENV_FILE, "w") as f:
f.write("\n".join(lines))
print(f"Saved to {ENV_FILE}")
def run_autodiscover() -> dict[str, str] | None: def run_autodiscover() -> dict[str, str] | None:
""" """
Run autodiscover.sh and return discovered configuration. Run autodiscover.sh interactively and return discovered configuration.
Executes the autodiscover.sh script to detect cluster topology, Executes the autodiscover.sh script to detect cluster topology,
then presents an interactive node selection menu. including interactive per-node confirmation and .env saving.
After autodiscover.sh completes, reads configuration from .env file.
EXTENSIBILITY:
- To add new discovery methods: Extend autodiscover.sh or add Python detection here
- To add GPU detection: Add nvidia-smi parsing to discovered env
- To skip interactive selection: Add a --non-interactive flag
- To add node health checks: Ping/SSH test each discovered node
DISCOVERED VARIABLES:
CLUSTER_NODES: Comma-separated list of node IPs (user-selected)
LOCAL_IP: This machine's IP address
ETH_IF: Ethernet interface name (e.g., 'eth0')
IB_IF: InfiniBand interface name (e.g., 'ibp12s0') if available
Returns: Returns:
Dictionary with discovered configuration, or None if discovery failed Dictionary with discovered configuration from .env, or None if discovery failed
""" """
if not AUTODISCOVER_SCRIPT.exists(): if not AUTODISCOVER_SCRIPT.exists():
print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}") print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}")
@@ -632,85 +590,28 @@ def run_autodiscover() -> dict[str, str] | None:
print("Running autodiscover...") print("Running autodiscover...")
print() print()
# Run autodiscover in a subshell and capture the variables # Build env for the subprocess so CONFIG_FILE is passed through
# We source the script and print the variables we care about env_vars = os.environ.copy()
env_vars["CONFIG_FILE"] = str(ENV_FILE)
env_vars["CONFIG_FILE_SET"] = "true"
# Run autodiscover interactively so its prompts are shown to the user
script = f""" script = f"""
source '{AUTODISCOVER_SCRIPT}' source '{AUTODISCOVER_SCRIPT}'
detect_interfaces run_autodiscover
detect_local_ip
detect_nodes
echo "CLUSTER_NODES=$NODES_ARG"
echo "LOCAL_IP=$LOCAL_IP"
echo "ETH_IF=$ETH_IF"
echo "IB_IF=$IB_IF"
""" """
result = subprocess.run(["bash", "-c", script], capture_output=True, text=True) result = subprocess.run(["bash", "-c", script], env=env_vars)
if result.returncode != 0: if result.returncode != 0:
print("Autodiscover output:")
print(result.stdout)
if result.stderr:
print(result.stderr)
print("Error: Autodiscover failed") print("Error: Autodiscover failed")
return None return None
# Print the autodiscover output (excluding the final variable lines) # Read configuration from the .env file that autodiscover.sh wrote
output_lines = result.stdout.strip().split("\n") env = load_env_file()
env = {} if not env.get("CLUSTER_NODES"):
for line in output_lines: print("Autodiscover completed but no CLUSTER_NODES found in .env")
if "=" in line and any( return None
line.startswith(k)
for k in ["CLUSTER_NODES=", "LOCAL_IP=", "ETH_IF=", "IB_IF="]
):
key, _, value = line.partition("=")
env[key] = value
else:
print(line)
print()
# Interactive node selection
if env.get("CLUSTER_NODES"):
all_nodes = [n.strip() for n in env["CLUSTER_NODES"].split(",") if n.strip()]
local_ip = env.get("LOCAL_IP", "")
if len(all_nodes) > 1:
print("Select which nodes to include in the cluster:")
print()
selected_nodes = []
for node in all_nodes:
is_local = node == local_ip
label = f"{node} (this machine)" if is_local else node
# Default to yes for all nodes
while True:
response = input(f" Include {label}? [Y/n]: ").strip().lower()
if response in ("", "y", "yes"):
selected_nodes.append(node)
break
elif response in ("n", "no"):
break
else:
print(" Please enter 'y' or 'n'")
print()
if not selected_nodes:
print("No nodes selected. Aborting.")
return None
if len(selected_nodes) == 1:
print(f"Only one node selected: {selected_nodes[0]}")
print("This will run in solo mode (single node).")
else:
print(
f"Selected {len(selected_nodes)} nodes: {', '.join(selected_nodes)}"
)
env["CLUSTER_NODES"] = ",".join(selected_nodes)
print()
return env return env
@@ -990,8 +891,6 @@ Examples:
print(f" {key}={value}") print(f" {key}={value}")
print() print()
save_env_file(env)
if not args.recipe: if not args.recipe:
return 0 return 0
@@ -1058,20 +957,6 @@ Examples:
nodes = parse_nodes(discovered_env["CLUSTER_NODES"]) nodes = parse_nodes(discovered_env["CLUSTER_NODES"])
nodes_from_env = True nodes_from_env = True
if nodes:
# Ask if user wants to save to .env
print()
response = (
input(
"Save this configuration to .env for future use? [Y/n]: "
)
.strip()
.lower()
)
if response in ("", "y", "yes"):
save_env_file(discovered_env)
print()
# Resolve network interfaces: CLI > .env > auto-detect by launch-cluster.sh # Resolve network interfaces: CLI > .env > auto-detect by launch-cluster.sh
eth_if = args.eth_if or None eth_if = args.eth_if or None
ib_if = args.ib_if or None ib_if = args.ib_if or None