Autodiscovery refactoring with mesh support
This commit is contained in:
358
autodiscover.sh
358
autodiscover.sh
@@ -1,5 +1,7 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
|
||||||
|
|
||||||
# Load .env file if exists (for shared configuration)
|
# Load .env file if exists (for shared configuration)
|
||||||
# This is called early so that DOTENV_* variables are available to all functions
|
# This is called early so that DOTENV_* variables are available to all functions
|
||||||
load_env_if_exists() {
|
load_env_if_exists() {
|
||||||
@@ -8,8 +10,7 @@ load_env_if_exists() {
|
|||||||
|
|
||||||
# If CONFIG_FILE is not set, check default location
|
# If CONFIG_FILE is not set, check default location
|
||||||
if [[ -z "$env_file" ]]; then
|
if [[ -z "$env_file" ]]; then
|
||||||
local script_dir="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
|
env_file="$SCRIPT_DIR/.env"
|
||||||
env_file="$script_dir/.env"
|
|
||||||
config_explicit="false"
|
config_explicit="false"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -47,6 +48,9 @@ load_env_if_exists() {
|
|||||||
# Load .env file
|
# Load .env file
|
||||||
load_env_if_exists
|
load_env_if_exists
|
||||||
|
|
||||||
|
# Mesh mode flag (set by detect_interfaces)
|
||||||
|
MESH_MODE="false"
|
||||||
|
|
||||||
# Function to detect IB and Ethernet interfaces
|
# Function to detect IB and Ethernet interfaces
|
||||||
detect_interfaces() {
|
detect_interfaces() {
|
||||||
# If both interfaces are already set, nothing to do
|
# If both interfaces are already set, nothing to do
|
||||||
@@ -72,49 +76,115 @@ detect_interfaces() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
DETECTED_IB_IFS=()
|
DETECTED_IB_IFS=()
|
||||||
CANDIDATE_ETH_IFS=()
|
ALL_NET_IFS=()
|
||||||
|
|
||||||
for pair in "${IB_NET_PAIRS[@]}"; do
|
for pair in "${IB_NET_PAIRS[@]}"; do
|
||||||
ib_dev=$(echo "$pair" | awk '{print $1}')
|
ib_dev=$(echo "$pair" | awk '{print $1}')
|
||||||
net_dev=$(echo "$pair" | awk '{print $2}')
|
net_dev=$(echo "$pair" | awk '{print $2}')
|
||||||
|
|
||||||
DETECTED_IB_IFS+=("$ib_dev")
|
DETECTED_IB_IFS+=("$ib_dev")
|
||||||
|
ALL_NET_IFS+=("$net_dev")
|
||||||
|
done
|
||||||
|
|
||||||
# Check if interface has an IP address
|
local num_up="${#IB_NET_PAIRS[@]}"
|
||||||
if ip addr show "$net_dev" | grep -q "inet "; then
|
|
||||||
CANDIDATE_ETH_IFS+=("$net_dev")
|
# --- Sanity checks ---
|
||||||
|
|
||||||
|
# 1. enp* (no capital P) interfaces MUST have an IP
|
||||||
|
for net_dev in "${ALL_NET_IFS[@]}"; do
|
||||||
|
if [[ "$net_dev" =~ ^enp[^P] ]] || [[ "$net_dev" == enp* && "$net_dev" != *P* ]]; then
|
||||||
|
if ! ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
|
||||||
|
echo "Error: Interface $net_dev (enp*, no capital P) is Up but has no IP address assigned."
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
# Set IB_IF if not provided
|
# 2. No two interfaces with IPs should share the same subnet
|
||||||
if [[ -z "$IB_IF" ]]; then
|
declare -A SEEN_SUBNETS
|
||||||
IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}")
|
for net_dev in "${ALL_NET_IFS[@]}"; do
|
||||||
echo " Detected IB_IF: $IB_IF"
|
local cidr
|
||||||
fi
|
cidr=$(ip -o -f inet addr show "$net_dev" 2>/dev/null | awk '{print $4}' | head -n1)
|
||||||
|
[[ -z "$cidr" ]] && continue
|
||||||
# Set ETH_IF if not provided
|
# Compute network address using python3
|
||||||
if [[ -z "$ETH_IF" ]]; then
|
local net_addr
|
||||||
if [ ${#CANDIDATE_ETH_IFS[@]} -eq 0 ]; then
|
net_addr=$(python3 -c "import ipaddress; print(str(ipaddress.ip_network('$cidr', strict=False)))" 2>/dev/null)
|
||||||
echo "Error: No active IB-associated interfaces have IP addresses."
|
if [[ -n "${SEEN_SUBNETS[$net_addr]}" ]]; then
|
||||||
|
echo "Error: Interfaces $net_dev and ${SEEN_SUBNETS[$net_addr]} share the same subnet ($net_addr). Check network configuration."
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
|
SEEN_SUBNETS["$net_addr"]="$net_dev"
|
||||||
|
done
|
||||||
|
|
||||||
# Selection logic: Prefer interface without capital 'P'
|
# --- Mode selection ---
|
||||||
SELECTED_ETH=""
|
|
||||||
for iface in "${CANDIDATE_ETH_IFS[@]}"; do
|
|
||||||
if [[ "$iface" != *"P"* ]]; then
|
|
||||||
SELECTED_ETH="$iface"
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
# Fallback: Use the first one if all have 'P' or none found yet
|
if [[ "$num_up" -eq 2 ]]; then
|
||||||
if [[ -z "$SELECTED_ETH" ]]; then
|
# Non-mesh configuration
|
||||||
SELECTED_ETH="${CANDIDATE_ETH_IFS[0]}"
|
MESH_MODE="false"
|
||||||
|
echo " Non-mesh mode: 2 CX7 interfaces active."
|
||||||
|
|
||||||
|
# Set IB_IF if not provided
|
||||||
|
if [[ -z "$IB_IF" ]]; then
|
||||||
|
IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}")
|
||||||
|
echo " Detected IB_IF: $IB_IF"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
ETH_IF="$SELECTED_ETH"
|
# Set ETH_IF if not provided: prefer interface without capital 'P'
|
||||||
echo " Detected ETH_IF: $ETH_IF"
|
if [[ -z "$ETH_IF" ]]; then
|
||||||
|
local selected_eth=""
|
||||||
|
for net_dev in "${ALL_NET_IFS[@]}"; do
|
||||||
|
if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
|
||||||
|
if [[ "$net_dev" != *P* ]]; then
|
||||||
|
selected_eth="$net_dev"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
# Fallback: first interface with an IP
|
||||||
|
if [[ -z "$selected_eth" ]]; then
|
||||||
|
for net_dev in "${ALL_NET_IFS[@]}"; do
|
||||||
|
if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then
|
||||||
|
selected_eth="$net_dev"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
if [[ -z "$selected_eth" ]]; then
|
||||||
|
echo "Error: No active IB-associated interfaces have IP addresses."
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
ETH_IF="$selected_eth"
|
||||||
|
echo " Detected ETH_IF: $ETH_IF"
|
||||||
|
fi
|
||||||
|
|
||||||
|
elif [[ "$num_up" -eq 4 ]]; then
|
||||||
|
# Mesh configuration
|
||||||
|
MESH_MODE="true"
|
||||||
|
echo " Mesh mode: all 4 CX7 interfaces active."
|
||||||
|
|
||||||
|
# Set IB_IF to all four RoCE interfaces (hardcoded for mesh)
|
||||||
|
if [[ -z "$IB_IF" ]]; then
|
||||||
|
IB_IF="rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1"
|
||||||
|
echo " Detected IB_IF: $IB_IF"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Set ETH_IF: check enP7s7 first, then wlP9s9
|
||||||
|
if [[ -z "$ETH_IF" ]]; then
|
||||||
|
if ip addr show enP7s7 2>/dev/null | grep -q "inet "; then
|
||||||
|
ETH_IF="enP7s7"
|
||||||
|
echo " Detected ETH_IF: $ETH_IF"
|
||||||
|
elif ip addr show wlP9s9 2>/dev/null | grep -q "inet "; then
|
||||||
|
ETH_IF="wlP9s9"
|
||||||
|
echo " Detected ETH_IF: $ETH_IF"
|
||||||
|
echo " Warning: using wireless interface (wlP9s9) for cluster coordination. Performance may be limited."
|
||||||
|
else
|
||||||
|
echo "Error: Mesh mode requires enP7s7 or wlP9s9 to be up with an IP address for cluster coordination."
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
else
|
||||||
|
echo "Error: Unexpected number of active CX7 interfaces ($num_up). Expected 2 (non-mesh) or 4 (mesh)."
|
||||||
|
return 1
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -141,6 +211,41 @@ detect_local_ip() {
|
|||||||
echo " Detected Local IP: $LOCAL_IP ($CIDR)"
|
echo " Detected Local IP: $LOCAL_IP ($CIDR)"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Scan a subnet for GB10-capable peers via SSH
# Usage: _scan_subnet_for_gb10 <cidr> <local_ip_to_exclude> <output_file>
#
# Arguments:
#   $1 - subnet in CIDR notation; host bits are tolerated (strict=False)
#   $2 - IP address to skip (normally this machine's address on that subnet)
#   $3 - file that receives one matching peer IP per line (appended, unsorted)
# Returns:
#   0 once every probe has finished (even when no peer was found)
#   1 if python3 or nc is missing, or the CIDR cannot be expanded into hosts
_scan_subnet_for_gb10() {
    local cidr="$1"
    local exclude_ip="$2"
    local out_file="$3"

    if ! command -v python3 &> /dev/null; then
        echo "Error: python3 not found."
        return 1
    fi
    if ! command -v nc &> /dev/null; then
        echo "Error: nc (netcat) not found."
        return 1
    fi

    # Expand the CIDR into its host addresses. A failure here (e.g. malformed
    # CIDR) must be reported: otherwise the scan silently covers zero hosts
    # and the caller cannot tell "no peers" from "scan never ran".
    local all_ips
    if ! all_ips=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$cidr"); then
        echo "Error: could not enumerate hosts for subnet '$cidr'." >&2
        return 1
    fi

    local -a ip_list=()
    mapfile -t ip_list <<< "$all_ips"

    local ip
    for ip in "${ip_list[@]}"; do
        [[ -z "$ip" ]] && continue
        [[ "$ip" == "$exclude_ip" ]] && continue
        # One background probe per host; stdin is detached so ssh can never
        # consume input that belongs to the caller.
        (
            if nc -z -w 1 "$ip" 22 &>/dev/null; then
                # Check if remote is a GB10 system
                # NOTE(review): StrictHostKeyChecking=no accepts unknown host
                # keys without verification; acceptable only on a trusted network.
                if ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o BatchMode=yes "$ip" \
                    "nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null" \
                    2>/dev/null | grep -q "NVIDIA GB10"; then
                    echo "$ip" >> "$out_file"
                fi
            fi
        ) < /dev/null &
    done

    # Barrier: block until every background probe has exited.
    wait
}
|
||||||
|
|
||||||
# Function to detect cluster nodes
|
# Function to detect cluster nodes
|
||||||
detect_nodes() {
|
detect_nodes() {
|
||||||
detect_local_ip || return 1
|
detect_local_ip || return 1
|
||||||
@@ -158,71 +263,164 @@ detect_nodes() {
|
|||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Try to use COPY_HOSTS from .env
|
# Try to use CLUSTER_NODES from .env
|
||||||
if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
|
if [[ -n "$DOTENV_CLUSTER_NODES" ]]; then
|
||||||
echo " Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
|
echo " Using CLUSTER_NODES from .env: $DOTENV_CLUSTER_NODES"
|
||||||
PEER_NODES=()
|
PEER_NODES=()
|
||||||
IFS=',' read -ra ALL_NODES <<< "$DOTENV_COPY_HOSTS"
|
IFS=',' read -ra ALL_NODES <<< "$DOTENV_CLUSTER_NODES"
|
||||||
for node in "${ALL_NODES[@]}"; do
|
for node in "${ALL_NODES[@]}"; do
|
||||||
node=$(echo "$node" | xargs)
|
node=$(echo "$node" | xargs)
|
||||||
PEER_NODES+=("$node")
|
[[ "$node" != "$LOCAL_IP" ]] && PEER_NODES+=("$node")
|
||||||
done
|
done
|
||||||
NODES_ARG="$DOTENV_COPY_HOSTS"
|
NODES_ARG="$DOTENV_CLUSTER_NODES"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Auto-detecting nodes..."
|
echo "Auto-detecting nodes on $CIDR (checking for NVIDIA GB10)..."
|
||||||
|
|
||||||
if ! command -v nc &> /dev/null; then
|
local temp_file
|
||||||
echo "Error: nc (netcat) not found. Please install netcat."
|
temp_file=$(mktemp)
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if ! command -v python3 &> /dev/null; then
|
_scan_subnet_for_gb10 "$CIDR" "$LOCAL_IP" "$temp_file"
|
||||||
echo "Error: python3 not found. Please install python3."
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
DETECTED_IPS=("$LOCAL_IP")
|
|
||||||
PEER_NODES=()
|
PEER_NODES=()
|
||||||
|
local detected_ips=("$LOCAL_IP")
|
||||||
echo " Scanning for SSH peers on $CIDR..."
|
if [[ -f "$temp_file" ]]; then
|
||||||
|
|
||||||
# Generate list of IPs using python
|
|
||||||
ALL_IPS=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$CIDR")
|
|
||||||
|
|
||||||
TEMP_IPS_FILE=$(mktemp)
|
|
||||||
|
|
||||||
# Scan in parallel
|
|
||||||
for ip in $ALL_IPS; do
|
|
||||||
# Skip own IP
|
|
||||||
if [[ "$ip" == "$LOCAL_IP" ]]; then continue; fi
|
|
||||||
|
|
||||||
(
|
|
||||||
# Check port 22 with 1 second timeout
|
|
||||||
if nc -z -w 1 "$ip" 22 &>/dev/null; then
|
|
||||||
echo "$ip" >> "$TEMP_IPS_FILE"
|
|
||||||
fi
|
|
||||||
) &
|
|
||||||
done
|
|
||||||
|
|
||||||
# Wait for all background scans to complete
|
|
||||||
wait
|
|
||||||
|
|
||||||
# Read found IPs
|
|
||||||
if [[ -f "$TEMP_IPS_FILE" ]]; then
|
|
||||||
while read -r ip; do
|
while read -r ip; do
|
||||||
DETECTED_IPS+=("$ip")
|
PEER_NODES+=("$ip")
|
||||||
PEER_NODES+=("$ip")
|
detected_ips+=("$ip")
|
||||||
echo " Found peer: $ip"
|
echo " Found GB10 peer: $ip"
|
||||||
done < "$TEMP_IPS_FILE"
|
done < <(sort "$temp_file")
|
||||||
rm -f "$TEMP_IPS_FILE"
|
rm -f "$temp_file"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Sort IPs
|
# Sort and set NODES_ARG
|
||||||
IFS=$'\n' SORTED_IPS=($(sort <<<"${DETECTED_IPS[*]}"))
|
IFS=$'\n' SORTED_IPS=($(sort <<<"${detected_ips[*]}"))
|
||||||
unset IFS
|
unset IFS
|
||||||
|
|
||||||
NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}")
|
NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}")
|
||||||
echo " Cluster Nodes: $NODES_ARG"
|
echo " Cluster Nodes: $NODES_ARG"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Function to detect COPY_HOSTS for build/model distribution
# In non-mesh mode: COPY_PEER_NODES = PEER_NODES (same network)
# In mesh mode: scan enp* interfaces (direct IB-attached) for GB10 peers
#
# Globals:
#   MESH_MODE        (read)    "false" selects the non-mesh shortcut
#   PEER_NODES       (read)    copied verbatim in non-mesh mode
#   COPY_PEER_NODES  (written) resulting list of copy targets
# Returns:
#   0 on success (the list may be empty)
#   1 if the temp file cannot be created or a subnet scan fails
detect_copy_hosts() {
    if [[ "$MESH_MODE" == "false" ]]; then
        COPY_PEER_NODES=("${PEER_NODES[@]}")
        return 0
    fi

    # Mesh mode: scan enp1s0f0np0 and enp1s0f1np1 subnets
    echo "Auto-detecting COPY_HOSTS on direct IB interfaces (mesh mode)..."

    local temp_file
    if ! temp_file=$(mktemp); then
        echo "Error: could not create temporary file for copy-host scan." >&2
        return 1
    fi

    local iface cidr local_iface_ip
    for iface in enp1s0f0np0 enp1s0f1np1; do
        # First IPv4 address/prefix on the interface; empty when it has none.
        cidr=$(ip -o -f inet addr show "$iface" 2>/dev/null | awk '{print $4}' | head -n1)
        [[ -z "$cidr" ]] && continue
        local_iface_ip="${cidr%/*}"
        echo " Scanning $iface ($cidr)..."
        # Propagate scan failures (missing tools, bad CIDR) so callers using
        # `detect_copy_hosts || exit 1` actually see them.
        if ! _scan_subnet_for_gb10 "$cidr" "$local_iface_ip" "$temp_file"; then
            rm -f "$temp_file"
            return 1
        fi
    done

    # Deduplicate and collect results (a peer may answer on both subnets)
    COPY_PEER_NODES=()
    local ip
    while IFS= read -r ip; do
        [[ -z "$ip" ]] && continue
        COPY_PEER_NODES+=("$ip")
        echo " Found GB10 copy host: $ip"
    done < <(sort -u "$temp_file")

    rm -f "$temp_file"
    return 0
}
|
||||||
|
|
||||||
|
# Save discovered configuration to .env
|
||||||
|
# Skips if .env already exists unless FORCE_DISCOVER=true
|
||||||
|
save_config() {
|
||||||
|
local env_file="${CONFIG_FILE:-$SCRIPT_DIR/.env}"
|
||||||
|
|
||||||
|
# Skip if .env exists and not forced
|
||||||
|
if [[ -f "$env_file" && "${FORCE_DISCOVER:-false}" != "true" ]]; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
local save_prompt="Save discovered configuration to $env_file?"
|
||||||
|
if [[ -f "$env_file" ]]; then
|
||||||
|
save_prompt="Overwrite existing configuration in $env_file?"
|
||||||
|
fi
|
||||||
|
read -r -p "$save_prompt [Y/n]: " response
|
||||||
|
response="${response,,}"
|
||||||
|
if [[ "$response" =~ ^(n|no)$ ]]; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Build list of all cluster nodes (local + peers)
|
||||||
|
local all_cluster_nodes=()
|
||||||
|
if [[ -n "$LOCAL_IP" ]]; then
|
||||||
|
all_cluster_nodes+=("$LOCAL_IP")
|
||||||
|
fi
|
||||||
|
for node in "${PEER_NODES[@]}"; do
|
||||||
|
all_cluster_nodes+=("$node")
|
||||||
|
done
|
||||||
|
|
||||||
|
# Per-node confirmation for CLUSTER_NODES
|
||||||
|
echo ""
|
||||||
|
echo "Select nodes for CLUSTER_NODES:"
|
||||||
|
local selected_cluster=()
|
||||||
|
for node in "${all_cluster_nodes[@]}"; do
|
||||||
|
local label="$node"
|
||||||
|
[[ "$node" == "$LOCAL_IP" ]] && label="$node (this machine)"
|
||||||
|
read -r -p " Include $label? [Y/n]: " r
|
||||||
|
r="${r,,}"
|
||||||
|
if [[ ! "$r" =~ ^(n|no)$ ]]; then
|
||||||
|
selected_cluster+=("$node")
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
if [[ "${#selected_cluster[@]}" -eq 0 ]]; then
|
||||||
|
echo "No nodes selected. Aborting save."
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Per-node confirmation for COPY_HOSTS
|
||||||
|
echo ""
|
||||||
|
echo "Select nodes for COPY_HOSTS (build/model distribution):"
|
||||||
|
local selected_copy=()
|
||||||
|
for node in "${COPY_PEER_NODES[@]}"; do
|
||||||
|
read -r -p " Include $node in COPY_HOSTS? [Y/n]: " r
|
||||||
|
r="${r,,}"
|
||||||
|
if [[ ! "$r" =~ ^(n|no)$ ]]; then
|
||||||
|
selected_copy+=("$node")
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# Write .env
|
||||||
|
{
|
||||||
|
echo "# Auto-generated by autodiscover.sh"
|
||||||
|
echo "CLUSTER_NODES=$(IFS=,; echo "${selected_cluster[*]}")"
|
||||||
|
if [[ "${#selected_copy[@]}" -gt 0 ]]; then
|
||||||
|
echo "COPY_HOSTS=$(IFS=,; echo "${selected_copy[*]}")"
|
||||||
|
fi
|
||||||
|
echo "LOCAL_IP=$LOCAL_IP"
|
||||||
|
echo "ETH_IF=$ETH_IF"
|
||||||
|
echo "IB_IF=$IB_IF"
|
||||||
|
} > "$env_file"
|
||||||
|
echo ""
|
||||||
|
echo "Saved to $env_file"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Convenience function: run full autodiscovery pipeline
#
# Runs the detection steps in order and stops at the first one that fails,
# then hands over to save_config.
# Returns:
#   1 if any detection step fails; otherwise the exit status of save_config
run_autodiscover() {
    local step
    for step in detect_interfaces detect_local_ip detect_nodes detect_copy_hosts; do
        "$step" || return 1
    done
    save_config
}
|
||||||
|
|||||||
@@ -282,6 +282,7 @@ usage() {
|
|||||||
echo " --network <network> : Docker network to use during build"
|
echo " --network <network> : Docker network to use during build"
|
||||||
echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory"
|
echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory"
|
||||||
echo " --config : Path to .env configuration file (default: .env in script directory)"
|
echo " --config : Path to .env configuration file (default: .env in script directory)"
|
||||||
|
echo " --setup : Force autodiscovery and save configuration (even if .env exists)"
|
||||||
echo " -h, --help : Show this help message"
|
echo " -h, --help : Show this help message"
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
@@ -334,6 +335,7 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
fi
|
fi
|
||||||
;;
|
;;
|
||||||
--config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
|
--config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
|
||||||
|
--setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
|
||||||
-h|--help) usage ;;
|
-h|--help) usage ;;
|
||||||
*) echo "Unknown parameter passed: $1"; usage ;;
|
*) echo "Unknown parameter passed: $1"; usage ;;
|
||||||
esac
|
esac
|
||||||
@@ -343,6 +345,18 @@ done
|
|||||||
# Source autodiscover.sh to load .env file
|
# Source autodiscover.sh to load .env file
|
||||||
source "$(dirname "$0")/autodiscover.sh"
|
source "$(dirname "$0")/autodiscover.sh"
|
||||||
|
|
||||||
|
# If --setup: force full autodiscovery and save configuration
# Each step aborts the script on failure; nothing is saved unless every
# detection step succeeded.
if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
    echo "Running full autodiscovery (--setup)..."
    detect_interfaces || exit 1
    detect_local_ip || exit 1
    detect_nodes || exit 1
    detect_copy_hosts || exit 1
    save_config || exit 1
    # Reload .env so DOTENV_* variables reflect saved config
    load_env_if_exists
fi
|
||||||
|
|
||||||
# Handle COPY_HOSTS from .env or autodiscovery if not specified via arguments
|
# Handle COPY_HOSTS from .env or autodiscovery if not specified via arguments
|
||||||
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
|
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
|
||||||
if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
|
if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
|
||||||
@@ -351,19 +365,18 @@ if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
|
|||||||
COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
|
COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
|
||||||
else
|
else
|
||||||
echo "No hosts specified. Using autodiscovery..."
|
echo "No hosts specified. Using autodiscovery..."
|
||||||
detect_nodes
|
detect_interfaces || { echo "Error: Interface detection failed."; exit 1; }
|
||||||
if [ $? -ne 0 ]; then
|
detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; }
|
||||||
echo "Error: Autodiscovery failed."
|
detect_nodes || { echo "Error: Node detection failed."; exit 1; }
|
||||||
exit 1
|
detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; }
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ${#PEER_NODES[@]} -gt 0 ]; then
|
if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then
|
||||||
COPY_HOSTS=("${PEER_NODES[@]}")
|
COPY_HOSTS=("${COPY_PEER_NODES[@]}")
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
|
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
|
||||||
echo "Error: Autodiscovery found no other nodes."
|
echo "Error: Autodiscovery found no other nodes."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
|
echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -7,6 +7,8 @@ HUB_PATH="${HF_HOME:-$HOME/.cache/huggingface}/hub"
|
|||||||
COPY_HOSTS=()
|
COPY_HOSTS=()
|
||||||
SSH_USER="$USER"
|
SSH_USER="$USER"
|
||||||
PARALLEL_COPY=false
|
PARALLEL_COPY=false
|
||||||
|
CONFIG_FILE=""
|
||||||
|
CONFIG_FILE_SET=false
|
||||||
|
|
||||||
# Help function
|
# Help function
|
||||||
usage() {
|
usage() {
|
||||||
@@ -16,6 +18,7 @@ usage() {
|
|||||||
echo " --copy-to-host : Alias for --copy-to (backwards compatibility)."
|
echo " --copy-to-host : Alias for --copy-to (backwards compatibility)."
|
||||||
echo " --copy-parallel : Copy to all hosts in parallel instead of serially."
|
echo " --copy-parallel : Copy to all hosts in parallel instead of serially."
|
||||||
echo " -u, --user <user> : Username for ssh commands (default: \$USER)"
|
echo " -u, --user <user> : Username for ssh commands (default: \$USER)"
|
||||||
|
echo " --config <file> : Path to .env configuration file (default: .env in script directory)"
|
||||||
echo " -h, --help : Show this help message"
|
echo " -h, --help : Show this help message"
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
@@ -53,42 +56,22 @@ copy_model_to_host() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Argument parsing
|
# Argument parsing
|
||||||
|
COPY_TO_FLAG=false
|
||||||
while [[ "$#" -gt 0 ]]; do
|
while [[ "$#" -gt 0 ]]; do
|
||||||
case $1 in
|
case $1 in
|
||||||
-c|--copy-to|--copy-to-host|--copy-to-hosts)
|
-c|--copy-to|--copy-to-host|--copy-to-hosts)
|
||||||
|
COPY_TO_FLAG=true
|
||||||
shift
|
shift
|
||||||
# Consume arguments until the next flag or end of args
|
# Consume arguments until the next flag or end of args
|
||||||
while [[ "$#" -gt 0 && "$1" != -* ]]; do
|
while [[ "$#" -gt 0 && "$1" != -* ]]; do
|
||||||
add_copy_hosts "$1"
|
add_copy_hosts "$1"
|
||||||
shift
|
shift
|
||||||
done
|
done
|
||||||
|
|
||||||
# If no hosts specified, use autodiscovery
|
|
||||||
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
|
|
||||||
echo "No hosts specified. Using autodiscovery..."
|
|
||||||
source "$(dirname "$0")/autodiscover.sh"
|
|
||||||
|
|
||||||
detect_nodes
|
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
echo "Error: Autodiscovery failed."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Use PEER_NODES directly
|
|
||||||
if [ ${#PEER_NODES[@]} -gt 0 ]; then
|
|
||||||
COPY_HOSTS=("${PEER_NODES[@]}")
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
|
|
||||||
echo "Error: Autodiscovery found no other nodes."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
|
|
||||||
fi
|
|
||||||
continue
|
continue
|
||||||
;;
|
;;
|
||||||
--copy-parallel) PARALLEL_COPY=true ;;
|
--copy-parallel) PARALLEL_COPY=true ;;
|
||||||
-u|--user) SSH_USER="$2"; shift ;;
|
-u|--user) SSH_USER="$2"; shift ;;
|
||||||
|
--config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
|
||||||
-h|--help) usage ;;
|
-h|--help) usage ;;
|
||||||
*)
|
*)
|
||||||
# If positional argument is provided
|
# If positional argument is provided
|
||||||
@@ -103,12 +86,47 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
shift
|
shift
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# Export config so autodiscover.sh picks it up
|
||||||
|
export CONFIG_FILE CONFIG_FILE_SET
|
||||||
|
|
||||||
|
# Source autodiscover.sh to load .env (for DOTENV_COPY_HOSTS) and make detection functions available
|
||||||
|
source "$(dirname "$0")/autodiscover.sh"
|
||||||
|
|
||||||
# Validate model name is provided
|
# Validate model name is provided
|
||||||
if [ -z "${MODEL_NAME:-}" ]; then
|
if [ -z "${MODEL_NAME:-}" ]; then
|
||||||
echo "Error: Model name is required."
|
echo "Error: Model name is required."
|
||||||
usage
|
usage
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Resolve COPY_HOSTS if --copy-to was given without hosts, or use .env
# Precedence: hosts named on the command line, then COPY_HOSTS from .env,
# then autodiscovery. When --copy-to was NOT given, COPY_HOSTS from .env is
# deliberately ignored: the user must request a copy explicitly.
if [ "$COPY_TO_FLAG" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
    # --copy-to was specified but no hosts given: use .env or autodiscover
    if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
        echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
        IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
        # Trim surrounding whitespace and drop empty entries so a value such
        # as "a, b" does not produce the host name " b".
        for _env_host in "${HOSTS_FROM_ENV[@]}"; do
            _env_host="${_env_host#"${_env_host%%[![:space:]]*}"}"
            _env_host="${_env_host%"${_env_host##*[![:space:]]}"}"
            if [[ -n "$_env_host" ]]; then
                COPY_HOSTS+=("$_env_host")
            fi
        done
        unset _env_host
    else
        echo "No hosts specified. Using autodiscovery..."
        detect_interfaces || { echo "Error: Interface detection failed."; exit 1; }
        detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; }
        detect_nodes || { echo "Error: Node detection failed."; exit 1; }
        detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; }

        if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then
            COPY_HOSTS=("${COPY_PEER_NODES[@]}")
        fi

        if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
            echo "Error: Autodiscovery found no other nodes."
            exit 1
        fi
        echo "Autodiscovered copy hosts: ${COPY_HOSTS[*]}"
    fi
fi
|
||||||
|
|
||||||
# Check if uvx is installed
|
# Check if uvx is installed
|
||||||
if ! command -v uvx &> /dev/null; then
|
if ! command -v uvx &> /dev/null; then
|
||||||
echo "Error: 'uvx' command not found."
|
echo "Error: 'uvx' command not found."
|
||||||
|
|||||||
@@ -68,7 +68,8 @@ usage() {
|
|||||||
echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)"
|
echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)"
|
||||||
echo " --pids-limit Process limit (default: 4096, only with --non-privileged)"
|
echo " --pids-limit Process limit (default: 4096, only with --non-privileged)"
|
||||||
echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)"
|
echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)"
|
||||||
echo " --config Path to .env configuration file (default: .env in script directory)"
|
echo " --config Path to .env configuration file (default: .env in script directory)
|
||||||
|
--setup Force autodiscovery and save configuration (even if .env exists)"
|
||||||
echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script."
|
echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script."
|
||||||
echo " command Command to run (only for 'exec' action). Not compatible with --launch-script."
|
echo " command Command to run (only for 'exec' action). Not compatible with --launch-script."
|
||||||
echo ""
|
echo ""
|
||||||
@@ -131,6 +132,7 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
-d) DAEMON_MODE="true" ;;
|
-d) DAEMON_MODE="true" ;;
|
||||||
-h|--help) usage ;;
|
-h|--help) usage ;;
|
||||||
--config) CONFIG_FILE="$2"; shift ;;
|
--config) CONFIG_FILE="$2"; shift ;;
|
||||||
|
--setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;;
|
||||||
start|stop|status)
|
start|stop|status)
|
||||||
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
|
||||||
echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
|
echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script."
|
||||||
@@ -411,6 +413,21 @@ done
|
|||||||
# Source autodiscover module
|
# Source autodiscover module
|
||||||
source "$(dirname "$0")/autodiscover.sh"
|
source "$(dirname "$0")/autodiscover.sh"
|
||||||
|
|
||||||
|
if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then
    # --setup: force full autodiscovery and save configuration
    echo "Running full autodiscovery (--setup)..."
    detect_interfaces || exit 1
    detect_local_ip || exit 1
    detect_nodes || exit 1
    detect_copy_hosts || exit 1
    save_config || exit 1
    # Reload .env so DOTENV_* variables reflect saved config
    load_env_if_exists
    # Fill in only the values that are still empty; anything already set is
    # left untouched.
    # NOTE(review): if detect_nodes has already populated NODES_ARG, nodes the
    # user deselected in save_config would still be in it — confirm intended.
    if [[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]]; then
        NODES_ARG="$DOTENV_CLUSTER_NODES"
    fi
    if [[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]]; then
        ETH_IF="$DOTENV_ETH_IF"
    fi
    if [[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]]; then
        IB_IF="$DOTENV_IB_IF"
    fi
fi
|
||||||
|
|
||||||
if [[ "$SOLO_MODE" == "true" ]]; then
|
if [[ "$SOLO_MODE" == "true" ]]; then
|
||||||
# Solo mode: skip node detection, just get local IP
|
# Solo mode: skip node detection, just get local IP
|
||||||
# Use LOCAL_IP from .env if set, otherwise default to 127.0.0.1
|
# Use LOCAL_IP from .env if set, otherwise default to 127.0.0.1
|
||||||
|
|||||||
149
run-recipe.py
149
run-recipe.py
@@ -547,7 +547,6 @@ def load_env_file() -> dict[str, str]:
|
|||||||
Reads the .env file created by --discover for persistent cluster configuration.
|
Reads the .env file created by --discover for persistent cluster configuration.
|
||||||
|
|
||||||
EXTENSIBILITY:
|
EXTENSIBILITY:
|
||||||
- To add new persistent settings: Just add them to save_env_file()
|
|
||||||
- To support multiple .env files: Add a --env-file CLI argument
|
- To support multiple .env files: Add a --env-file CLI argument
|
||||||
- To add validation: Check for required keys after loading
|
- To add validation: Check for required keys after loading
|
||||||
|
|
||||||
@@ -573,57 +572,16 @@ def load_env_file() -> dict[str, str]:
|
|||||||
return env
|
return env
|
||||||
|
|
||||||
|
|
||||||
def save_env_file(env: dict[str, str]) -> None:
|
|
||||||
"""
|
|
||||||
Save environment variables to .env file.
|
|
||||||
|
|
||||||
Persists cluster configuration discovered by autodiscover.sh.
|
|
||||||
Values are properly quoted if they contain spaces or commas.
|
|
||||||
|
|
||||||
EXTENSIBILITY:
|
|
||||||
- To add new persistent settings: Just add them to the env dict before calling
|
|
||||||
- To add timestamps/metadata: Add comment lines to the output
|
|
||||||
- To support append mode: Read existing, merge, then write
|
|
||||||
|
|
||||||
Args:
|
|
||||||
env: Dictionary of key=value pairs to save
|
|
||||||
"""
|
|
||||||
lines = ["# Auto-generated by run-recipe.py --discover", ""]
|
|
||||||
for key, value in sorted(env.items()):
|
|
||||||
# Quote values with spaces
|
|
||||||
if " " in value or "," in value:
|
|
||||||
lines.append(f'{key}="{value}"')
|
|
||||||
else:
|
|
||||||
lines.append(f"{key}={value}")
|
|
||||||
lines.append("")
|
|
||||||
|
|
||||||
with open(ENV_FILE, "w") as f:
|
|
||||||
f.write("\n".join(lines))
|
|
||||||
|
|
||||||
print(f"Saved to {ENV_FILE}")
|
|
||||||
|
|
||||||
|
|
||||||
def run_autodiscover() -> dict[str, str] | None:
|
def run_autodiscover() -> dict[str, str] | None:
|
||||||
"""
|
"""
|
||||||
Run autodiscover.sh and return discovered configuration.
|
Run autodiscover.sh interactively and return discovered configuration.
|
||||||
|
|
||||||
Executes the autodiscover.sh script to detect cluster topology,
|
Executes the autodiscover.sh script to detect cluster topology,
|
||||||
then presents an interactive node selection menu.
|
including interactive per-node confirmation and .env saving.
|
||||||
|
After autodiscover.sh completes, reads configuration from .env file.
|
||||||
EXTENSIBILITY:
|
|
||||||
- To add new discovery methods: Extend autodiscover.sh or add Python detection here
|
|
||||||
- To add GPU detection: Add nvidia-smi parsing to discovered env
|
|
||||||
- To skip interactive selection: Add a --non-interactive flag
|
|
||||||
- To add node health checks: Ping/SSH test each discovered node
|
|
||||||
|
|
||||||
DISCOVERED VARIABLES:
|
|
||||||
CLUSTER_NODES: Comma-separated list of node IPs (user-selected)
|
|
||||||
LOCAL_IP: This machine's IP address
|
|
||||||
ETH_IF: Ethernet interface name (e.g., 'eth0')
|
|
||||||
IB_IF: InfiniBand interface name (e.g., 'ibp12s0') if available
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary with discovered configuration, or None if discovery failed
|
Dictionary with discovered configuration from .env, or None if discovery failed
|
||||||
"""
|
"""
|
||||||
if not AUTODISCOVER_SCRIPT.exists():
|
if not AUTODISCOVER_SCRIPT.exists():
|
||||||
print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}")
|
print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}")
|
||||||
@@ -632,85 +590,28 @@ def run_autodiscover() -> dict[str, str] | None:
|
|||||||
print("Running autodiscover...")
|
print("Running autodiscover...")
|
||||||
print()
|
print()
|
||||||
|
|
||||||
# Run autodiscover in a subshell and capture the variables
|
# Build env for the subprocess so CONFIG_FILE is passed through
|
||||||
# We source the script and print the variables we care about
|
env_vars = os.environ.copy()
|
||||||
|
env_vars["CONFIG_FILE"] = str(ENV_FILE)
|
||||||
|
env_vars["CONFIG_FILE_SET"] = "true"
|
||||||
|
|
||||||
|
# Run autodiscover interactively so its prompts are shown to the user
|
||||||
script = f"""
|
script = f"""
|
||||||
source '{AUTODISCOVER_SCRIPT}'
|
source '{AUTODISCOVER_SCRIPT}'
|
||||||
detect_interfaces
|
run_autodiscover
|
||||||
detect_local_ip
|
|
||||||
detect_nodes
|
|
||||||
echo "CLUSTER_NODES=$NODES_ARG"
|
|
||||||
echo "LOCAL_IP=$LOCAL_IP"
|
|
||||||
echo "ETH_IF=$ETH_IF"
|
|
||||||
echo "IB_IF=$IB_IF"
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
result = subprocess.run(["bash", "-c", script], capture_output=True, text=True)
|
result = subprocess.run(["bash", "-c", script], env=env_vars)
|
||||||
|
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
print("Autodiscover output:")
|
|
||||||
print(result.stdout)
|
|
||||||
if result.stderr:
|
|
||||||
print(result.stderr)
|
|
||||||
print("Error: Autodiscover failed")
|
print("Error: Autodiscover failed")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Print the autodiscover output (excluding the final variable lines)
|
# Read configuration from the .env file that autodiscover.sh wrote
|
||||||
output_lines = result.stdout.strip().split("\n")
|
env = load_env_file()
|
||||||
env = {}
|
if not env.get("CLUSTER_NODES"):
|
||||||
for line in output_lines:
|
print("Autodiscover completed but no CLUSTER_NODES found in .env")
|
||||||
if "=" in line and any(
|
return None
|
||||||
line.startswith(k)
|
|
||||||
for k in ["CLUSTER_NODES=", "LOCAL_IP=", "ETH_IF=", "IB_IF="]
|
|
||||||
):
|
|
||||||
key, _, value = line.partition("=")
|
|
||||||
env[key] = value
|
|
||||||
else:
|
|
||||||
print(line)
|
|
||||||
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Interactive node selection
|
|
||||||
if env.get("CLUSTER_NODES"):
|
|
||||||
all_nodes = [n.strip() for n in env["CLUSTER_NODES"].split(",") if n.strip()]
|
|
||||||
local_ip = env.get("LOCAL_IP", "")
|
|
||||||
|
|
||||||
if len(all_nodes) > 1:
|
|
||||||
print("Select which nodes to include in the cluster:")
|
|
||||||
print()
|
|
||||||
|
|
||||||
selected_nodes = []
|
|
||||||
for node in all_nodes:
|
|
||||||
is_local = node == local_ip
|
|
||||||
label = f"{node} (this machine)" if is_local else node
|
|
||||||
|
|
||||||
# Default to yes for all nodes
|
|
||||||
while True:
|
|
||||||
response = input(f" Include {label}? [Y/n]: ").strip().lower()
|
|
||||||
if response in ("", "y", "yes"):
|
|
||||||
selected_nodes.append(node)
|
|
||||||
break
|
|
||||||
elif response in ("n", "no"):
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
print(" Please enter 'y' or 'n'")
|
|
||||||
|
|
||||||
print()
|
|
||||||
|
|
||||||
if not selected_nodes:
|
|
||||||
print("No nodes selected. Aborting.")
|
|
||||||
return None
|
|
||||||
|
|
||||||
if len(selected_nodes) == 1:
|
|
||||||
print(f"Only one node selected: {selected_nodes[0]}")
|
|
||||||
print("This will run in solo mode (single node).")
|
|
||||||
else:
|
|
||||||
print(
|
|
||||||
f"Selected {len(selected_nodes)} nodes: {', '.join(selected_nodes)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
env["CLUSTER_NODES"] = ",".join(selected_nodes)
|
|
||||||
print()
|
|
||||||
|
|
||||||
return env
|
return env
|
||||||
|
|
||||||
@@ -990,8 +891,6 @@ Examples:
|
|||||||
print(f" {key}={value}")
|
print(f" {key}={value}")
|
||||||
print()
|
print()
|
||||||
|
|
||||||
save_env_file(env)
|
|
||||||
|
|
||||||
if not args.recipe:
|
if not args.recipe:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
@@ -1058,20 +957,6 @@ Examples:
|
|||||||
nodes = parse_nodes(discovered_env["CLUSTER_NODES"])
|
nodes = parse_nodes(discovered_env["CLUSTER_NODES"])
|
||||||
nodes_from_env = True
|
nodes_from_env = True
|
||||||
|
|
||||||
if nodes:
|
|
||||||
# Ask if user wants to save to .env
|
|
||||||
print()
|
|
||||||
response = (
|
|
||||||
input(
|
|
||||||
"Save this configuration to .env for future use? [Y/n]: "
|
|
||||||
)
|
|
||||||
.strip()
|
|
||||||
.lower()
|
|
||||||
)
|
|
||||||
if response in ("", "y", "yes"):
|
|
||||||
save_env_file(discovered_env)
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Resolve network interfaces: CLI > .env > auto-detect by launch-cluster.sh
|
# Resolve network interfaces: CLI > .env > auto-detect by launch-cluster.sh
|
||||||
eth_if = args.eth_if or None
|
eth_if = args.eth_if or None
|
||||||
ib_if = args.ib_if or None
|
ib_if = args.ib_if or None
|
||||||
|
|||||||
Reference in New Issue
Block a user