Autodiscovery refactoring with mesh support
This commit is contained in:
149
run-recipe.py
149
run-recipe.py
@@ -547,7 +547,6 @@ def load_env_file() -> dict[str, str]:
|
||||
Reads the .env file created by --discover for persistent cluster configuration.
|
||||
|
||||
EXTENSIBILITY:
|
||||
- To add new persistent settings: Just add them to save_env_file()
|
||||
- To support multiple .env files: Add a --env-file CLI argument
|
||||
- To add validation: Check for required keys after loading
|
||||
|
||||
@@ -573,57 +572,16 @@ def load_env_file() -> dict[str, str]:
|
||||
return env
|
||||
|
||||
|
||||
def save_env_file(env: dict[str, str]) -> None:
    """
    Write the given settings to the .env file as KEY=value lines.

    The file at ENV_FILE (a module-level path defined elsewhere in this
    file) is overwritten, not appended to. Output starts with an
    "auto-generated" comment line and a blank line, followed by one entry
    per key in sorted key order, and ends with a trailing newline.

    A value is wrapped in double quotes when it contains a space or a
    comma; every other value is written bare. Embedded double quotes are
    NOT escaped.

    Args:
        env: Mapping of variable name to string value to persist.
    """

    def _format_entry(name: str, val: str) -> str:
        # Only spaces and commas trigger quoting; nothing else is escaped.
        needs_quotes = " " in val or "," in val
        return f'{name}="{val}"' if needs_quotes else f"{name}={val}"

    header = ["# Auto-generated by run-recipe.py --discover", ""]
    entries = [_format_entry(name, env[name]) for name in sorted(env)]

    # The final empty element makes the joined text end with a newline.
    content = "\n".join([*header, *entries, ""])

    with open(ENV_FILE, "w") as f:
        f.write(content)

    print(f"Saved to {ENV_FILE}")
|
||||
|
||||
|
||||
def run_autodiscover() -> dict[str, str] | None:
|
||||
"""
|
||||
Run autodiscover.sh and return discovered configuration.
|
||||
Run autodiscover.sh interactively and return discovered configuration.
|
||||
|
||||
Executes the autodiscover.sh script to detect cluster topology,
|
||||
then presents an interactive node selection menu.
|
||||
|
||||
EXTENSIBILITY:
|
||||
- To add new discovery methods: Extend autodiscover.sh or add Python detection here
|
||||
- To add GPU detection: Add nvidia-smi parsing to discovered env
|
||||
- To skip interactive selection: Add a --non-interactive flag
|
||||
- To add node health checks: Ping/SSH test each discovered node
|
||||
|
||||
DISCOVERED VARIABLES:
|
||||
CLUSTER_NODES: Comma-separated list of node IPs (user-selected)
|
||||
LOCAL_IP: This machine's IP address
|
||||
ETH_IF: Ethernet interface name (e.g., 'eth0')
|
||||
IB_IF: InfiniBand interface name (e.g., 'ibp12s0') if available
|
||||
including interactive per-node confirmation and .env saving.
|
||||
After autodiscover.sh completes, reads configuration from .env file.
|
||||
|
||||
Returns:
|
||||
Dictionary with discovered configuration, or None if discovery failed
|
||||
Dictionary with discovered configuration from .env, or None if discovery failed
|
||||
"""
|
||||
if not AUTODISCOVER_SCRIPT.exists():
|
||||
print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}")
|
||||
@@ -632,85 +590,28 @@ def run_autodiscover() -> dict[str, str] | None:
|
||||
print("Running autodiscover...")
|
||||
print()
|
||||
|
||||
# Run autodiscover in a subshell and capture the variables
|
||||
# We source the script and print the variables we care about
|
||||
# Build env for the subprocess so CONFIG_FILE is passed through
|
||||
env_vars = os.environ.copy()
|
||||
env_vars["CONFIG_FILE"] = str(ENV_FILE)
|
||||
env_vars["CONFIG_FILE_SET"] = "true"
|
||||
|
||||
# Run autodiscover interactively so its prompts are shown to the user
|
||||
script = f"""
|
||||
source '{AUTODISCOVER_SCRIPT}'
|
||||
detect_interfaces
|
||||
detect_local_ip
|
||||
detect_nodes
|
||||
echo "CLUSTER_NODES=$NODES_ARG"
|
||||
echo "LOCAL_IP=$LOCAL_IP"
|
||||
echo "ETH_IF=$ETH_IF"
|
||||
echo "IB_IF=$IB_IF"
|
||||
run_autodiscover
|
||||
"""
|
||||
|
||||
result = subprocess.run(["bash", "-c", script], capture_output=True, text=True)
|
||||
result = subprocess.run(["bash", "-c", script], env=env_vars)
|
||||
|
||||
if result.returncode != 0:
|
||||
print("Autodiscover output:")
|
||||
print(result.stdout)
|
||||
if result.stderr:
|
||||
print(result.stderr)
|
||||
print("Error: Autodiscover failed")
|
||||
return None
|
||||
|
||||
# Print the autodiscover output (excluding the final variable lines)
|
||||
output_lines = result.stdout.strip().split("\n")
|
||||
env = {}
|
||||
for line in output_lines:
|
||||
if "=" in line and any(
|
||||
line.startswith(k)
|
||||
for k in ["CLUSTER_NODES=", "LOCAL_IP=", "ETH_IF=", "IB_IF="]
|
||||
):
|
||||
key, _, value = line.partition("=")
|
||||
env[key] = value
|
||||
else:
|
||||
print(line)
|
||||
|
||||
print()
|
||||
|
||||
# Interactive node selection
|
||||
if env.get("CLUSTER_NODES"):
|
||||
all_nodes = [n.strip() for n in env["CLUSTER_NODES"].split(",") if n.strip()]
|
||||
local_ip = env.get("LOCAL_IP", "")
|
||||
|
||||
if len(all_nodes) > 1:
|
||||
print("Select which nodes to include in the cluster:")
|
||||
print()
|
||||
|
||||
selected_nodes = []
|
||||
for node in all_nodes:
|
||||
is_local = node == local_ip
|
||||
label = f"{node} (this machine)" if is_local else node
|
||||
|
||||
# Default to yes for all nodes
|
||||
while True:
|
||||
response = input(f" Include {label}? [Y/n]: ").strip().lower()
|
||||
if response in ("", "y", "yes"):
|
||||
selected_nodes.append(node)
|
||||
break
|
||||
elif response in ("n", "no"):
|
||||
break
|
||||
else:
|
||||
print(" Please enter 'y' or 'n'")
|
||||
|
||||
print()
|
||||
|
||||
if not selected_nodes:
|
||||
print("No nodes selected. Aborting.")
|
||||
return None
|
||||
|
||||
if len(selected_nodes) == 1:
|
||||
print(f"Only one node selected: {selected_nodes[0]}")
|
||||
print("This will run in solo mode (single node).")
|
||||
else:
|
||||
print(
|
||||
f"Selected {len(selected_nodes)} nodes: {', '.join(selected_nodes)}"
|
||||
)
|
||||
|
||||
env["CLUSTER_NODES"] = ",".join(selected_nodes)
|
||||
print()
|
||||
# Read configuration from the .env file that autodiscover.sh wrote
|
||||
env = load_env_file()
|
||||
if not env.get("CLUSTER_NODES"):
|
||||
print("Autodiscover completed but no CLUSTER_NODES found in .env")
|
||||
return None
|
||||
|
||||
return env
|
||||
|
||||
@@ -990,8 +891,6 @@ Examples:
|
||||
print(f" {key}={value}")
|
||||
print()
|
||||
|
||||
save_env_file(env)
|
||||
|
||||
if not args.recipe:
|
||||
return 0
|
||||
|
||||
@@ -1058,20 +957,6 @@ Examples:
|
||||
nodes = parse_nodes(discovered_env["CLUSTER_NODES"])
|
||||
nodes_from_env = True
|
||||
|
||||
if nodes:
|
||||
# Ask if user wants to save to .env
|
||||
print()
|
||||
response = (
|
||||
input(
|
||||
"Save this configuration to .env for future use? [Y/n]: "
|
||||
)
|
||||
.strip()
|
||||
.lower()
|
||||
)
|
||||
if response in ("", "y", "yes"):
|
||||
save_env_file(discovered_env)
|
||||
print()
|
||||
|
||||
# Resolve network interfaces: CLI > .env > auto-detect by launch-cluster.sh
|
||||
eth_if = args.eth_if or None
|
||||
ib_if = args.ib_if or None
|
||||
|
||||
Reference in New Issue
Block a user