From a78e221de3827a2eacae641023c5b6feab8698d5 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 26 Mar 2026 15:47:41 -0700 Subject: [PATCH] Autodiscovery refactoring with mesh support --- autodiscover.sh | 400 ++++++++++++++++++++++++++++++++++------------ build-and-copy.sh | 31 ++-- hf-download.sh | 72 +++++---- launch-cluster.sh | 19 ++- run-recipe.py | 149 ++--------------- 5 files changed, 401 insertions(+), 270 deletions(-) diff --git a/autodiscover.sh b/autodiscover.sh index 11d771d..a68c9d3 100644 --- a/autodiscover.sh +++ b/autodiscover.sh @@ -1,43 +1,44 @@ #!/bin/bash +SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" + # Load .env file if exists (for shared configuration) # This is called early so that DOTENV_* variables are available to all functions load_env_if_exists() { local env_file="${CONFIG_FILE:-}" local config_explicit="${CONFIG_FILE_SET:-false}" - + # If CONFIG_FILE is not set, check default location if [[ -z "$env_file" ]]; then - local script_dir="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" - env_file="$script_dir/.env" + env_file="$SCRIPT_DIR/.env" config_explicit="false" fi - + # Validate config file exists if explicitly specified if [[ "$config_explicit" == "true" ]] && [[ ! -f "$env_file" ]]; then echo "Error: Config file not found: $env_file" exit 1 fi - + if [[ -f "$env_file" ]]; then # Load .env variables with DOTENV_ prefix while IFS='=' read -r key value || [[ -n "$key" ]]; do # Skip comments and empty lines [[ "$key" =~ ^[[:space:]]*# ]] && continue [[ -z "$key" ]] && continue - + # Remove leading/trailing whitespace from key key=$(echo "$key" | xargs) - + # Skip if key is empty after trimming [[ -z "$key" ]] && continue - + # Remove quotes from value value="${value%\"}" value="${value#\"}" value="${value%\'}" value="${value#\'}" - + # Export with DOTENV_ prefix export "DOTENV_$key=$value" done < "$env_file" @@ -47,6 +48,9 @@ load_env_if_exists() { # Load .env file load_env_if_exists +# Mesh mode flag (set by detect_interfaces) +MESH_MODE="false" + # Function to detect IB and Ethernet interfaces detect_interfaces() { # If both interfaces are already set, nothing to do @@ -61,60 +65,126 @@ detect_interfaces() { fi echo "Auto-detecting interfaces..." - + # Get all Up interfaces: "rocep1s0f1 port 1 ==> enp1s0f1np1 (Up)" # We capture: IB_DEV, NET_DEV mapfile -t IB_NET_PAIRS < <(ibdev2netdev | awk '/Up\)/ {print $1 " " $5}') - + if [ ${#IB_NET_PAIRS[@]} -eq 0 ]; then echo "Error: No active IB interfaces found." return 1 fi DETECTED_IB_IFS=() - CANDIDATE_ETH_IFS=() + ALL_NET_IFS=() for pair in "${IB_NET_PAIRS[@]}"; do ib_dev=$(echo "$pair" | awk '{print $1}') net_dev=$(echo "$pair" | awk '{print $2}') - DETECTED_IB_IFS+=("$ib_dev") - - # Check if interface has an IP address - if ip addr show "$net_dev" | grep -q "inet "; then - CANDIDATE_ETH_IFS+=("$net_dev") + ALL_NET_IFS+=("$net_dev") + done + + local num_up="${#IB_NET_PAIRS[@]}" + + # --- Sanity checks --- + + # 1. enp* (no capital P) interfaces MUST have an IP + for net_dev in "${ALL_NET_IFS[@]}"; do + if [[ "$net_dev" =~ ^enp[^P] ]] || [[ "$net_dev" == enp* && "$net_dev" != *P* ]]; then + if ! ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then + echo "Error: Interface $net_dev (enp*, no capital P) is Up but has no IP address assigned." + return 1 + fi fi done - # Set IB_IF if not provided - if [[ -z "$IB_IF" ]]; then - IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}") - echo " Detected IB_IF: $IB_IF" - fi - - # Set ETH_IF if not provided - if [[ -z "$ETH_IF" ]]; then - if [ ${#CANDIDATE_ETH_IFS[@]} -eq 0 ]; then - echo "Error: No active IB-associated interfaces have IP addresses." + # 2. No two interfaces with IPs should share the same subnet + declare -A SEEN_SUBNETS + for net_dev in "${ALL_NET_IFS[@]}"; do + local cidr + cidr=$(ip -o -f inet addr show "$net_dev" 2>/dev/null | awk '{print $4}' | head -n1) + [[ -z "$cidr" ]] && continue + # Compute network address using python3 + local net_addr + net_addr=$(python3 -c "import ipaddress; print(str(ipaddress.ip_network('$cidr', strict=False)))" 2>/dev/null) + if [[ -n "${SEEN_SUBNETS[$net_addr]}" ]]; then + echo "Error: Interfaces $net_dev and ${SEEN_SUBNETS[$net_addr]} share the same subnet ($net_addr). Check network configuration." return 1 fi - - # Selection logic: Prefer interface without capital 'P' - SELECTED_ETH="" - for iface in "${CANDIDATE_ETH_IFS[@]}"; do - if [[ "$iface" != *"P"* ]]; then - SELECTED_ETH="$iface" - break - fi - done - - # Fallback: Use the first one if all have 'P' or none found yet - if [[ -z "$SELECTED_ETH" ]]; then - SELECTED_ETH="${CANDIDATE_ETH_IFS[0]}" + SEEN_SUBNETS["$net_addr"]="$net_dev" + done + + # --- Mode selection --- + + if [[ "$num_up" -eq 2 ]]; then + # Non-mesh configuration + MESH_MODE="false" + echo " Non-mesh mode: 2 CX7 interfaces active." + + # Set IB_IF if not provided + if [[ -z "$IB_IF" ]]; then + IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}") + echo " Detected IB_IF: $IB_IF" fi - - ETH_IF="$SELECTED_ETH" - echo " Detected ETH_IF: $ETH_IF" + + # Set ETH_IF if not provided: prefer interface without capital 'P' + if [[ -z "$ETH_IF" ]]; then + local selected_eth="" + for net_dev in "${ALL_NET_IFS[@]}"; do + if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then + if [[ "$net_dev" != *P* ]]; then + selected_eth="$net_dev" + break + fi + fi + done + # Fallback: first interface with an IP + if [[ -z "$selected_eth" ]]; then + for net_dev in "${ALL_NET_IFS[@]}"; do + if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then + selected_eth="$net_dev" + break + fi + done + fi + if [[ -z "$selected_eth" ]]; then + echo "Error: No active IB-associated interfaces have IP addresses." + return 1 + fi + ETH_IF="$selected_eth" + echo " Detected ETH_IF: $ETH_IF" + fi + + elif [[ "$num_up" -eq 4 ]]; then + # Mesh configuration + MESH_MODE="true" + echo " Mesh mode: all 4 CX7 interfaces active." + + # Set IB_IF to all four RoCE interfaces (hardcoded for mesh) + if [[ -z "$IB_IF" ]]; then + IB_IF="rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1" + echo " Detected IB_IF: $IB_IF" + fi + + # Set ETH_IF: check enP7s7 first, then wlP9s9 + if [[ -z "$ETH_IF" ]]; then + if ip addr show enP7s7 2>/dev/null | grep -q "inet "; then + ETH_IF="enP7s7" + echo " Detected ETH_IF: $ETH_IF" + elif ip addr show wlP9s9 2>/dev/null | grep -q "inet "; then + ETH_IF="wlP9s9" + echo " Detected ETH_IF: $ETH_IF" + echo " Warning: using wireless interface (wlP9s9) for cluster coordination. Performance may be limited." + else + echo "Error: Mesh mode requires enP7s7 or wlP9s9 to be up with an IP address for cluster coordination." + return 1 + fi + fi + + else + echo "Error: Unexpected number of active CX7 interfaces ($num_up). Expected 2 (non-mesh) or 4 (mesh)." + return 1 fi } @@ -131,16 +201,51 @@ detect_local_ip() { # Get CIDR of the selected ETH_IF CIDR=$(ip -o -f inet addr show "$ETH_IF" | awk '{print $4}' | head -n 1) - + if [[ -z "$CIDR" ]]; then echo "Error: Could not determine IP/CIDR for interface $ETH_IF" return 1 fi - + LOCAL_IP=${CIDR%/*} echo " Detected Local IP: $LOCAL_IP ($CIDR)" } +# Scan a subnet for GB10-capable peers via SSH +# Usage: _scan_subnet_for_gb10 +_scan_subnet_for_gb10() { + local cidr="$1" + local exclude_ip="$2" + local out_file="$3" + + if ! command -v python3 &> /dev/null; then + echo "Error: python3 not found." + return 1 + fi + if ! command -v nc &> /dev/null; then + echo "Error: nc (netcat) not found." + return 1 + fi + + local all_ips + all_ips=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$cidr") + + for ip in $all_ips; do + [[ "$ip" == "$exclude_ip" ]] && continue + ( + if nc -z -w 1 "$ip" 22 &>/dev/null; then + # Check if remote is a GB10 system + if ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o BatchMode=yes "$ip" \ + "nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null" \ + 2>/dev/null | grep -q "NVIDIA GB10"; then + echo "$ip" >> "$out_file" + fi + fi + ) & + done + wait +} + # Function to detect cluster nodes detect_nodes() { detect_local_ip || return 1 @@ -157,72 +262,165 @@ detect_nodes() { done return 0 fi - - # Try to use COPY_HOSTS from .env - if [[ -n "$DOTENV_COPY_HOSTS" ]]; then - echo " Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS" + + # Try to use CLUSTER_NODES from .env + if [[ -n "$DOTENV_CLUSTER_NODES" ]]; then + echo " Using CLUSTER_NODES from .env: $DOTENV_CLUSTER_NODES" PEER_NODES=() - IFS=',' read -ra ALL_NODES <<< "$DOTENV_COPY_HOSTS" + IFS=',' read -ra ALL_NODES <<< "$DOTENV_CLUSTER_NODES" for node in "${ALL_NODES[@]}"; do node=$(echo "$node" | xargs) - PEER_NODES+=("$node") + [[ "$node" != "$LOCAL_IP" ]] && PEER_NODES+=("$node") done - NODES_ARG="$DOTENV_COPY_HOSTS" + NODES_ARG="$DOTENV_CLUSTER_NODES" return 0 fi - echo "Auto-detecting nodes..." - - if ! command -v nc &> /dev/null; then - echo "Error: nc (netcat) not found. Please install netcat." - return 1 - fi - - if ! command -v python3 &> /dev/null; then - echo "Error: python3 not found. Please install python3." - return 1 + echo "Auto-detecting nodes on $CIDR (checking for NVIDIA GB10)..." + + local temp_file + temp_file=$(mktemp) + + _scan_subnet_for_gb10 "$CIDR" "$LOCAL_IP" "$temp_file" + + PEER_NODES=() + local detected_ips=("$LOCAL_IP") + if [[ -f "$temp_file" ]]; then + while read -r ip; do + PEER_NODES+=("$ip") + detected_ips+=("$ip") + echo " Found GB10 peer: $ip" + done < <(sort "$temp_file") + rm -f "$temp_file" fi - DETECTED_IPS=("$LOCAL_IP") - PEER_NODES=() - - echo " Scanning for SSH peers on $CIDR..." - - # Generate list of IPs using python - ALL_IPS=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$CIDR") - - TEMP_IPS_FILE=$(mktemp) - - # Scan in parallel - for ip in $ALL_IPS; do - # Skip own IP - if [[ "$ip" == "$LOCAL_IP" ]]; then continue; fi - - ( - # Check port 22 with 1 second timeout - if nc -z -w 1 "$ip" 22 &>/dev/null; then - echo "$ip" >> "$TEMP_IPS_FILE" - fi - ) & - done - - # Wait for all background scans to complete - wait - - # Read found IPs - if [[ -f "$TEMP_IPS_FILE" ]]; then - while read -r ip; do - DETECTED_IPS+=("$ip") - PEER_NODES+=("$ip") - echo " Found peer: $ip" - done < "$TEMP_IPS_FILE" - rm -f "$TEMP_IPS_FILE" - fi - - # Sort IPs - IFS=$'\n' SORTED_IPS=($(sort <<<"${DETECTED_IPS[*]}")) + # Sort and set NODES_ARG + IFS=$'\n' SORTED_IPS=($(sort <<<"${detected_ips[*]}")) unset IFS - NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}") echo " Cluster Nodes: $NODES_ARG" } + +# Function to detect COPY_HOSTS for build/model distribution +# In non-mesh mode: COPY_PEER_NODES = PEER_NODES (same network) +# In mesh mode: scan enp* interfaces (direct IB-attached) for GB10 peers +detect_copy_hosts() { + if [[ "$MESH_MODE" == "false" ]]; then + COPY_PEER_NODES=("${PEER_NODES[@]}") + return 0 + fi + + # Mesh mode: scan enp1s0f0np0 and enp1s0f1np1 subnets + echo "Auto-detecting COPY_HOSTS on direct IB interfaces (mesh mode)..." + + local temp_file + temp_file=$(mktemp) + + for iface in enp1s0f0np0 enp1s0f1np1; do + local cidr + cidr=$(ip -o -f inet addr show "$iface" 2>/dev/null | awk '{print $4}' | head -n1) + [[ -z "$cidr" ]] && continue + local local_iface_ip="${cidr%/*}" + echo " Scanning $iface ($cidr)..." + _scan_subnet_for_gb10 "$cidr" "$local_iface_ip" "$temp_file" + done + + # Deduplicate and collect results + COPY_PEER_NODES=() + declare -A _SEEN_COPY + if [[ -f "$temp_file" ]]; then + while read -r ip; do + if [[ -z "${_SEEN_COPY[$ip]}" ]]; then + _SEEN_COPY["$ip"]=1 + COPY_PEER_NODES+=("$ip") + echo " Found GB10 copy host: $ip" + fi + done < <(sort "$temp_file") + rm -f "$temp_file" + fi +} + +# Save discovered configuration to .env +# Skips if .env already exists unless FORCE_DISCOVER=true +save_config() { + local env_file="${CONFIG_FILE:-$SCRIPT_DIR/.env}" + + # Skip if .env exists and not forced + if [[ -f "$env_file" && "${FORCE_DISCOVER:-false}" != "true" ]]; then + return 0 + fi + + echo "" + local save_prompt="Save discovered configuration to $env_file?" + if [[ -f "$env_file" ]]; then + save_prompt="Overwrite existing configuration in $env_file?" + fi + read -r -p "$save_prompt [Y/n]: " response + response="${response,,}" + if [[ "$response" =~ ^(n|no)$ ]]; then + return 0 + fi + + # Build list of all cluster nodes (local + peers) + local all_cluster_nodes=() + if [[ -n "$LOCAL_IP" ]]; then + all_cluster_nodes+=("$LOCAL_IP") + fi + for node in "${PEER_NODES[@]}"; do + all_cluster_nodes+=("$node") + done + + # Per-node confirmation for CLUSTER_NODES + echo "" + echo "Select nodes for CLUSTER_NODES:" + local selected_cluster=() + for node in "${all_cluster_nodes[@]}"; do + local label="$node" + [[ "$node" == "$LOCAL_IP" ]] && label="$node (this machine)" + read -r -p " Include $label? [Y/n]: " r + r="${r,,}" + if [[ ! "$r" =~ ^(n|no)$ ]]; then + selected_cluster+=("$node") + fi + done + + if [[ "${#selected_cluster[@]}" -eq 0 ]]; then + echo "No nodes selected. Aborting save." + return 1 + fi + + # Per-node confirmation for COPY_HOSTS + echo "" + echo "Select nodes for COPY_HOSTS (build/model distribution):" + local selected_copy=() + for node in "${COPY_PEER_NODES[@]}"; do + read -r -p " Include $node in COPY_HOSTS? [Y/n]: " r + r="${r,,}" + if [[ ! "$r" =~ ^(n|no)$ ]]; then + selected_copy+=("$node") + fi + done + + # Write .env + { + echo "# Auto-generated by autodiscover.sh" + echo "CLUSTER_NODES=$(IFS=,; echo "${selected_cluster[*]}")" + if [[ "${#selected_copy[@]}" -gt 0 ]]; then + echo "COPY_HOSTS=$(IFS=,; echo "${selected_copy[*]}")" + fi + echo "LOCAL_IP=$LOCAL_IP" + echo "ETH_IF=$ETH_IF" + echo "IB_IF=$IB_IF" + } > "$env_file" + echo "" + echo "Saved to $env_file" +} + +# Convenience function: run full autodiscovery pipeline +run_autodiscover() { + detect_interfaces || return 1 + detect_local_ip || return 1 + detect_nodes || return 1 + detect_copy_hosts || return 1 + save_config +} diff --git a/build-and-copy.sh b/build-and-copy.sh index dec93b4..90d6a27 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -282,6 +282,7 @@ usage() { echo " --network : Docker network to use during build" echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory" echo " --config : Path to .env configuration file (default: .env in script directory)" + echo " --setup : Force autodiscovery and save configuration (even if .env exists)" echo " -h, --help : Show this help message" exit 1 } @@ -334,6 +335,7 @@ while [[ "$#" -gt 0 ]]; do fi ;; --config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;; + --setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;; -h|--help) usage ;; *) echo "Unknown parameter passed: $1"; usage ;; esac @@ -343,6 +345,18 @@ done # Source autodiscover.sh to load .env file source "$(dirname "$0")/autodiscover.sh" +# If --setup: force full autodiscovery and save configuration +if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then + echo "Running full autodiscovery (--setup)..." + detect_interfaces || exit 1 + detect_local_ip || exit 1 + detect_nodes || exit 1 + detect_copy_hosts || exit 1 + save_config || exit 1 + # Reload .env so DOTENV_* variables reflect saved config + load_env_if_exists +fi + # Handle COPY_HOSTS from .env or autodiscovery if not specified via arguments if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then if [[ -n "$DOTENV_COPY_HOSTS" ]]; then @@ -351,19 +365,18 @@ if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then COPY_HOSTS=("${HOSTS_FROM_ENV[@]}") else echo "No hosts specified. Using autodiscovery..." - detect_nodes - if [ $? -ne 0 ]; then - echo "Error: Autodiscovery failed." - exit 1 - fi + detect_interfaces || { echo "Error: Interface detection failed."; exit 1; } + detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; } + detect_nodes || { echo "Error: Node detection failed."; exit 1; } + detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; } - if [ ${#PEER_NODES[@]} -gt 0 ]; then - COPY_HOSTS=("${PEER_NODES[@]}") + if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then + COPY_HOSTS=("${COPY_PEER_NODES[@]}") fi if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then - echo "Error: Autodiscovery found no other nodes." - exit 1 + echo "Error: Autodiscovery found no other nodes." + exit 1 fi echo "Autodiscovered hosts: ${COPY_HOSTS[*]}" fi diff --git a/hf-download.sh b/hf-download.sh index c4c8d9f..3b5bdd6 100755 --- a/hf-download.sh +++ b/hf-download.sh @@ -7,6 +7,8 @@ HUB_PATH="${HF_HOME:-$HOME/.cache/huggingface}/hub" COPY_HOSTS=() SSH_USER="$USER" PARALLEL_COPY=false +CONFIG_FILE="" +CONFIG_FILE_SET=false # Help function usage() { @@ -16,6 +18,7 @@ usage() { echo " --copy-to-host : Alias for --copy-to (backwards compatibility)." echo " --copy-parallel : Copy to all hosts in parallel instead of serially." echo " -u, --user : Username for ssh commands (default: \$USER)" + echo " --config : Path to .env configuration file (default: .env in script directory)" echo " -h, --help : Show this help message" exit 1 } @@ -37,11 +40,11 @@ copy_model_to_host() { local host="$1" local model_name="$2" local model_dir="$3" - + echo "Copying model '$model_name' to ${SSH_USER}@${host}..." local host_copy_start host_copy_end host_copy_time host_copy_start=$(date +%s) - + if rsync -av --mkpath --progress "$model_dir" "${SSH_USER}@${host}:$HUB_PATH/"; then host_copy_end=$(date +%s) host_copy_time=$((host_copy_end - host_copy_start)) @@ -53,44 +56,24 @@ copy_model_to_host() { } # Argument parsing +COPY_TO_FLAG=false while [[ "$#" -gt 0 ]]; do case $1 in -c|--copy-to|--copy-to-host|--copy-to-hosts) + COPY_TO_FLAG=true shift # Consume arguments until the next flag or end of args while [[ "$#" -gt 0 && "$1" != -* ]]; do add_copy_hosts "$1" shift done - - # If no hosts specified, use autodiscovery - if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then - echo "No hosts specified. Using autodiscovery..." - source "$(dirname "$0")/autodiscover.sh" - - detect_nodes - if [ $? -ne 0 ]; then - echo "Error: Autodiscovery failed." - exit 1 - fi - - # Use PEER_NODES directly - if [ ${#PEER_NODES[@]} -gt 0 ]; then - COPY_HOSTS=("${PEER_NODES[@]}") - fi - - if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then - echo "Error: Autodiscovery found no other nodes." - exit 1 - fi - echo "Autodiscovered hosts: ${COPY_HOSTS[*]}" - fi continue ;; --copy-parallel) PARALLEL_COPY=true ;; -u|--user) SSH_USER="$2"; shift ;; + --config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;; -h|--help) usage ;; - *) + *) # If positional argument is provided if [ -z "${MODEL_NAME:-}" ]; then MODEL_NAME="$1" @@ -103,12 +86,47 @@ while [[ "$#" -gt 0 ]]; do shift done +# Export config so autodiscover.sh picks it up +export CONFIG_FILE CONFIG_FILE_SET + +# Source autodiscover.sh to load .env (for DOTENV_COPY_HOSTS) and make detection functions available +source "$(dirname "$0")/autodiscover.sh" + # Validate model name is provided if [ -z "${MODEL_NAME:-}" ]; then echo "Error: Model name is required." usage fi +# Resolve COPY_HOSTS if --copy-to was given without hosts, or use .env +if [ "$COPY_TO_FLAG" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then + # --copy-to was specified but no hosts given: use .env or autodiscover + if [[ -n "$DOTENV_COPY_HOSTS" ]]; then + echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS" + IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS" + COPY_HOSTS=("${HOSTS_FROM_ENV[@]}") + else + echo "No hosts specified. Using autodiscovery..." + detect_interfaces || { echo "Error: Interface detection failed."; exit 1; } + detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; } + detect_nodes || { echo "Error: Node detection failed."; exit 1; } + detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; } + + if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then + COPY_HOSTS=("${COPY_PEER_NODES[@]}") + fi + + if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then + echo "Error: Autodiscovery found no other nodes." + exit 1 + fi + echo "Autodiscovered copy hosts: ${COPY_HOSTS[*]}" + fi +elif [ "$COPY_TO_FLAG" = false ] && [ "${#COPY_HOSTS[@]}" -eq 0 ] && [[ -n "$DOTENV_COPY_HOSTS" ]]; then + # No --copy-to flag but .env has COPY_HOSTS — don't auto-copy; user must request it explicitly + : # intentional no-op; user didn't ask for copy +fi + # Check if uvx is installed if ! command -v uvx &> /dev/null; then echo "Error: 'uvx' command not found." @@ -231,4 +249,4 @@ if [ "$COPY_TIME" -gt 0 ]; then fi echo "Total: $(printf '%02d:%02d:%02d' $((TOTAL_TIME/3600)) $((TOTAL_TIME%3600/60)) $((TOTAL_TIME%60)))" echo "=========================================" -echo "Done downloading $MODEL_NAME." \ No newline at end of file +echo "Done downloading $MODEL_NAME." diff --git a/launch-cluster.sh b/launch-cluster.sh index 4f3bcf6..1b267f9 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -68,7 +68,8 @@ usage() { echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)" echo " --pids-limit Process limit (default: 4096, only with --non-privileged)" echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)" - echo " --config Path to .env configuration file (default: .env in script directory)" + echo " --config Path to .env configuration file (default: .env in script directory) + --setup Force autodiscovery and save configuration (even if .env exists)" echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script." echo " command Command to run (only for 'exec' action). Not compatible with --launch-script." echo "" @@ -131,6 +132,7 @@ while [[ "$#" -gt 0 ]]; do -d) DAEMON_MODE="true" ;; -h|--help) usage ;; --config) CONFIG_FILE="$2"; shift ;; + --setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;; start|stop|status) if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script." @@ -411,6 +413,21 @@ done # Source autodiscover module source "$(dirname "$0")/autodiscover.sh" +if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then + # --setup: force full autodiscovery and save configuration + echo "Running full autodiscovery (--setup)..." + detect_interfaces || exit 1 + detect_local_ip || exit 1 + detect_nodes || exit 1 + detect_copy_hosts || exit 1 + save_config || exit 1 + # Reload .env so DOTENV_* variables reflect saved config + load_env_if_exists + [[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]] && NODES_ARG="$DOTENV_CLUSTER_NODES" + [[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]] && ETH_IF="$DOTENV_ETH_IF" + [[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]] && IB_IF="$DOTENV_IB_IF" +fi + if [[ "$SOLO_MODE" == "true" ]]; then # Solo mode: skip node detection, just get local IP # Use LOCAL_IP from .env if set, otherwise default to 127.0.0.1 diff --git a/run-recipe.py b/run-recipe.py index b33b33b..842f99b 100755 --- a/run-recipe.py +++ b/run-recipe.py @@ -547,7 +547,6 @@ def load_env_file() -> dict[str, str]: Reads the .env file created by --discover for persistent cluster configuration. EXTENSIBILITY: - - To add new persistent settings: Just add them to save_env_file() - To support multiple .env files: Add a --env-file CLI argument - To add validation: Check for required keys after loading @@ -573,57 +572,16 @@ def load_env_file() -> dict[str, str]: return env -def save_env_file(env: dict[str, str]) -> None: - """ - Save environment variables to .env file. - - Persists cluster configuration discovered by autodiscover.sh. - Values are properly quoted if they contain spaces or commas. - - EXTENSIBILITY: - - To add new persistent settings: Just add them to the env dict before calling - - To add timestamps/metadata: Add comment lines to the output - - To support append mode: Read existing, merge, then write - - Args: - env: Dictionary of key=value pairs to save - """ - lines = ["# Auto-generated by run-recipe.py --discover", ""] - for key, value in sorted(env.items()): - # Quote values with spaces - if " " in value or "," in value: - lines.append(f'{key}="{value}"') - else: - lines.append(f"{key}={value}") - lines.append("") - - with open(ENV_FILE, "w") as f: - f.write("\n".join(lines)) - - print(f"Saved to {ENV_FILE}") - - def run_autodiscover() -> dict[str, str] | None: """ - Run autodiscover.sh and return discovered configuration. + Run autodiscover.sh interactively and return discovered configuration. Executes the autodiscover.sh script to detect cluster topology, - then presents an interactive node selection menu. - - EXTENSIBILITY: - - To add new discovery methods: Extend autodiscover.sh or add Python detection here - - To add GPU detection: Add nvidia-smi parsing to discovered env - - To skip interactive selection: Add a --non-interactive flag - - To add node health checks: Ping/SSH test each discovered node - - DISCOVERED VARIABLES: - CLUSTER_NODES: Comma-separated list of node IPs (user-selected) - LOCAL_IP: This machine's IP address - ETH_IF: Ethernet interface name (e.g., 'eth0') - IB_IF: InfiniBand interface name (e.g., 'ibp12s0') if available + including interactive per-node confirmation and .env saving. + After autodiscover.sh completes, reads configuration from .env file. Returns: - Dictionary with discovered configuration, or None if discovery failed + Dictionary with discovered configuration from .env, or None if discovery failed """ if not AUTODISCOVER_SCRIPT.exists(): print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}") @@ -632,85 +590,28 @@ def run_autodiscover() -> dict[str, str] | None: print("Running autodiscover...") print() - # Run autodiscover in a subshell and capture the variables - # We source the script and print the variables we care about + # Build env for the subprocess so CONFIG_FILE is passed through + env_vars = os.environ.copy() + env_vars["CONFIG_FILE"] = str(ENV_FILE) + env_vars["CONFIG_FILE_SET"] = "true" + + # Run autodiscover interactively so its prompts are shown to the user script = f""" source '{AUTODISCOVER_SCRIPT}' - detect_interfaces - detect_local_ip - detect_nodes - echo "CLUSTER_NODES=$NODES_ARG" - echo "LOCAL_IP=$LOCAL_IP" - echo "ETH_IF=$ETH_IF" - echo "IB_IF=$IB_IF" + run_autodiscover """ - result = subprocess.run(["bash", "-c", script], capture_output=True, text=True) + result = subprocess.run(["bash", "-c", script], env=env_vars) if result.returncode != 0: - print("Autodiscover output:") - print(result.stdout) - if result.stderr: - print(result.stderr) print("Error: Autodiscover failed") return None - # Print the autodiscover output (excluding the final variable lines) - output_lines = result.stdout.strip().split("\n") - env = {} - for line in output_lines: - if "=" in line and any( - line.startswith(k) - for k in ["CLUSTER_NODES=", "LOCAL_IP=", "ETH_IF=", "IB_IF="] - ): - key, _, value = line.partition("=") - env[key] = value - else: - print(line) - - print() - - # Interactive node selection - if env.get("CLUSTER_NODES"): - all_nodes = [n.strip() for n in env["CLUSTER_NODES"].split(",") if n.strip()] - local_ip = env.get("LOCAL_IP", "") - - if len(all_nodes) > 1: - print("Select which nodes to include in the cluster:") - print() - - selected_nodes = [] - for node in all_nodes: - is_local = node == local_ip - label = f"{node} (this machine)" if is_local else node - - # Default to yes for all nodes - while True: - response = input(f" Include {label}? [Y/n]: ").strip().lower() - if response in ("", "y", "yes"): - selected_nodes.append(node) - break - elif response in ("n", "no"): - break - else: - print(" Please enter 'y' or 'n'") - - print() - - if not selected_nodes: - print("No nodes selected. Aborting.") - return None - - if len(selected_nodes) == 1: - print(f"Only one node selected: {selected_nodes[0]}") - print("This will run in solo mode (single node).") - else: - print( - f"Selected {len(selected_nodes)} nodes: {', '.join(selected_nodes)}" - ) - - env["CLUSTER_NODES"] = ",".join(selected_nodes) - print() + # Read configuration from the .env file that autodiscover.sh wrote + env = load_env_file() + if not env.get("CLUSTER_NODES"): + print("Autodiscover completed but no CLUSTER_NODES found in .env") + return None return env @@ -990,8 +891,6 @@ Examples: print(f" {key}={value}") print() - save_env_file(env) - if not args.recipe: return 0 @@ -1058,20 +957,6 @@ Examples: nodes = parse_nodes(discovered_env["CLUSTER_NODES"]) nodes_from_env = True - if nodes: - # Ask if user wants to save to .env - print() - response = ( - input( - "Save this configuration to .env for future use? [Y/n]: " - ) - .strip() - .lower() - ) - if response in ("", "y", "yes"): - save_env_file(discovered_env) - print() - # Resolve network interfaces: CLI > .env > auto-detect by launch-cluster.sh eth_if = args.eth_if or None ib_if = args.ib_if or None