Autodiscovery refactoring with mesh support

This commit is contained in:
Eugene Rakhmatulin
2026-03-26 15:47:41 -07:00
parent 83a74bccec
commit a78e221de3
5 changed files with 401 additions and 270 deletions

View File

@@ -547,7 +547,6 @@ def load_env_file() -> dict[str, str]:
Reads the .env file created by --discover for persistent cluster configuration.
EXTENSIBILITY:
- To add new persistent settings: Just add them to save_env_file()
- To support multiple .env files: Add a --env-file CLI argument
- To add validation: Check for required keys after loading
@@ -573,57 +572,16 @@ def load_env_file() -> dict[str, str]:
return env
def save_env_file(env: dict[str, str]) -> None:
    """
    Save environment variables to .env file.

    Persists cluster configuration discovered by autodiscover.sh.
    Overwrites ENV_FILE with a generated-file header followed by one
    KEY=value line per entry, sorted by key. Values containing a space or
    a comma are wrapped in double quotes; all other values are written bare.

    The file is written as UTF-8 explicitly, so the output does not depend
    on the platform's locale encoding.

    NOTE(review): a value that itself contains a double quote is written
    without escaping. Confirm whether load_env_file() needs such values
    before changing the quoting rule here.

    EXTENSIBILITY:
    - To add new persistent settings: Just add them to the env dict before calling
    - To add timestamps/metadata: Add comment lines to the output
    - To support append mode: Read existing, merge, then write

    Args:
        env: Dictionary of key=value pairs to save
    """
    lines = ["# Auto-generated by run-recipe.py --discover", ""]
    for key, value in sorted(env.items()):
        # Quote values with spaces or commas so they survive as one token.
        needs_quotes = " " in value or "," in value
        lines.append(f'{key}="{value}"' if needs_quotes else f"{key}={value}")
    # Trailing empty entry makes the joined text end with a newline.
    lines.append("")

    with open(ENV_FILE, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    print(f"Saved to {ENV_FILE}")
def run_autodiscover() -> dict[str, str] | None:
"""
Run autodiscover.sh and return discovered configuration.
Run autodiscover.sh interactively and return discovered configuration.
Executes the autodiscover.sh script to detect cluster topology,
then presents an interactive node selection menu.
EXTENSIBILITY:
- To add new discovery methods: Extend autodiscover.sh or add Python detection here
- To add GPU detection: Add nvidia-smi parsing to discovered env
- To skip interactive selection: Add a --non-interactive flag
- To add node health checks: Ping/SSH test each discovered node
DISCOVERED VARIABLES:
CLUSTER_NODES: Comma-separated list of node IPs (user-selected)
LOCAL_IP: This machine's IP address
ETH_IF: Ethernet interface name (e.g., 'eth0')
IB_IF: InfiniBand interface name (e.g., 'ibp12s0') if available
including interactive per-node confirmation and .env saving.
After autodiscover.sh completes, reads configuration from .env file.
Returns:
Dictionary with discovered configuration, or None if discovery failed
Dictionary with discovered configuration from .env, or None if discovery failed
"""
if not AUTODISCOVER_SCRIPT.exists():
print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}")
@@ -632,85 +590,28 @@ def run_autodiscover() -> dict[str, str] | None:
print("Running autodiscover...")
print()
# Run autodiscover in a subshell and capture the variables
# We source the script and print the variables we care about
# Build env for the subprocess so CONFIG_FILE is passed through
env_vars = os.environ.copy()
env_vars["CONFIG_FILE"] = str(ENV_FILE)
env_vars["CONFIG_FILE_SET"] = "true"
# Run autodiscover interactively so its prompts are shown to the user
script = f"""
source '{AUTODISCOVER_SCRIPT}'
detect_interfaces
detect_local_ip
detect_nodes
echo "CLUSTER_NODES=$NODES_ARG"
echo "LOCAL_IP=$LOCAL_IP"
echo "ETH_IF=$ETH_IF"
echo "IB_IF=$IB_IF"
run_autodiscover
"""
result = subprocess.run(["bash", "-c", script], capture_output=True, text=True)
result = subprocess.run(["bash", "-c", script], env=env_vars)
if result.returncode != 0:
print("Autodiscover output:")
print(result.stdout)
if result.stderr:
print(result.stderr)
print("Error: Autodiscover failed")
return None
# Print the autodiscover output (excluding the final variable lines)
output_lines = result.stdout.strip().split("\n")
env = {}
for line in output_lines:
if "=" in line and any(
line.startswith(k)
for k in ["CLUSTER_NODES=", "LOCAL_IP=", "ETH_IF=", "IB_IF="]
):
key, _, value = line.partition("=")
env[key] = value
else:
print(line)
print()
# Interactive node selection
if env.get("CLUSTER_NODES"):
all_nodes = [n.strip() for n in env["CLUSTER_NODES"].split(",") if n.strip()]
local_ip = env.get("LOCAL_IP", "")
if len(all_nodes) > 1:
print("Select which nodes to include in the cluster:")
print()
selected_nodes = []
for node in all_nodes:
is_local = node == local_ip
label = f"{node} (this machine)" if is_local else node
# Default to yes for all nodes
while True:
response = input(f" Include {label}? [Y/n]: ").strip().lower()
if response in ("", "y", "yes"):
selected_nodes.append(node)
break
elif response in ("n", "no"):
break
else:
print(" Please enter 'y' or 'n'")
print()
if not selected_nodes:
print("No nodes selected. Aborting.")
return None
if len(selected_nodes) == 1:
print(f"Only one node selected: {selected_nodes[0]}")
print("This will run in solo mode (single node).")
else:
print(
f"Selected {len(selected_nodes)} nodes: {', '.join(selected_nodes)}"
)
env["CLUSTER_NODES"] = ",".join(selected_nodes)
print()
# Read configuration from the .env file that autodiscover.sh wrote
env = load_env_file()
if not env.get("CLUSTER_NODES"):
print("Autodiscover completed but no CLUSTER_NODES found in .env")
return None
return env
@@ -990,8 +891,6 @@ Examples:
print(f" {key}={value}")
print()
save_env_file(env)
if not args.recipe:
return 0
@@ -1058,20 +957,6 @@ Examples:
nodes = parse_nodes(discovered_env["CLUSTER_NODES"])
nodes_from_env = True
if nodes:
# Ask if user wants to save to .env
print()
response = (
input(
"Save this configuration to .env for future use? [Y/n]: "
)
.strip()
.lower()
)
if response in ("", "y", "yes"):
save_env_file(discovered_env)
print()
# Resolve network interfaces: CLI > .env > auto-detect by launch-cluster.sh
eth_if = args.eth_if or None
ib_if = args.ib_if or None