Added extra passthrough variables to run-recipe
This commit is contained in:
125
run-recipe.py
125
run-recipe.py
@@ -825,7 +825,17 @@ Examples:
|
|||||||
dest="no_ray",
|
dest="no_ray",
|
||||||
help="No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)"
|
help="No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)"
|
||||||
)
|
)
|
||||||
|
launch_group.add_argument("--name", dest="container_name", help="Override container name (default: vllm_node)")
|
||||||
|
launch_group.add_argument("--eth-if", dest="eth_if", help="Ethernet interface (overrides .env and auto-detection)")
|
||||||
|
launch_group.add_argument("--ib-if", dest="ib_if", help="InfiniBand interface (overrides .env and auto-detection)")
|
||||||
|
launch_group.add_argument("-j", dest="build_jobs", type=int, metavar="N", help="Number of parallel build jobs inside container")
|
||||||
|
launch_group.add_argument("--no-cache-dirs", action="store_true", dest="no_cache_dirs", help="Do not mount ~/.cache/vllm, ~/.cache/flashinfer, ~/.triton")
|
||||||
|
launch_group.add_argument("--non-privileged", action="store_true", dest="non_privileged", help="Run in non-privileged mode (removes --privileged and --ipc=host)")
|
||||||
|
launch_group.add_argument("--mem-limit-gb", type=int, dest="mem_limit_gb", help="Memory limit in GB (only with --non-privileged)")
|
||||||
|
launch_group.add_argument("--mem-swap-limit-gb", type=int, dest="mem_swap_limit_gb", help="Memory+swap limit in GB (only with --non-privileged)")
|
||||||
|
launch_group.add_argument("--pids-limit", type=int, dest="pids_limit", help="Process limit (only with --non-privileged, default: 4096)")
|
||||||
|
launch_group.add_argument("--shm-size-gb", type=int, dest="shm_size_gb", help="Shared memory size in GB (only with --non-privileged, default: 64)")
|
||||||
|
|
||||||
# Cluster discovery options
|
# Cluster discovery options
|
||||||
discover_group = parser.add_argument_group("Cluster discovery")
|
discover_group = parser.add_argument_group("Cluster discovery")
|
||||||
discover_group.add_argument(
|
discover_group.add_argument(
|
||||||
@@ -900,39 +910,51 @@ Examples:
|
|||||||
build_args = recipe.get("build_args", [])
|
build_args = recipe.get("build_args", [])
|
||||||
|
|
||||||
# Parse nodes - check command line first, then .env file, then autodiscover
|
# Parse nodes - check command line first, then .env file, then autodiscover
|
||||||
nodes = parse_nodes(args.nodes)
|
nodes = parse_nodes(args.nodes) if not args.solo else None
|
||||||
nodes_from_env = False
|
nodes_from_env = False
|
||||||
|
eth_if = None
|
||||||
|
ib_if = None
|
||||||
|
|
||||||
if not nodes and not args.solo:
|
if not args.solo:
|
||||||
# Try to load from .env file
|
# Try to load from .env file
|
||||||
env = load_env_file()
|
env = load_env_file()
|
||||||
if env.get("CLUSTER_NODES"):
|
if not nodes:
|
||||||
nodes = parse_nodes(env["CLUSTER_NODES"])
|
if env.get("CLUSTER_NODES"):
|
||||||
nodes_from_env = True
|
nodes = parse_nodes(env["CLUSTER_NODES"])
|
||||||
if nodes:
|
|
||||||
print(f"Using cluster nodes from .env: {', '.join(nodes)}")
|
|
||||||
print()
|
|
||||||
else:
|
|
||||||
# No nodes specified and no .env - run autodiscover
|
|
||||||
print("No cluster nodes configured. Running autodiscover...")
|
|
||||||
print()
|
|
||||||
|
|
||||||
discovered_env = run_autodiscover()
|
|
||||||
if discovered_env and discovered_env.get("CLUSTER_NODES"):
|
|
||||||
nodes = parse_nodes(discovered_env["CLUSTER_NODES"])
|
|
||||||
nodes_from_env = True
|
nodes_from_env = True
|
||||||
|
|
||||||
if nodes:
|
if nodes:
|
||||||
# Ask if user wants to save to .env
|
print(f"Using cluster nodes from .env: {', '.join(nodes)}")
|
||||||
print()
|
|
||||||
response = input("Save this configuration to .env for future use? [Y/n]: ").strip().lower()
|
|
||||||
if response in ("", "y", "yes"):
|
|
||||||
save_env_file(discovered_env)
|
|
||||||
print()
|
print()
|
||||||
|
else:
|
||||||
|
# No nodes specified and no .env - run autodiscover
|
||||||
|
print("No cluster nodes configured. Running autodiscover...")
|
||||||
|
print()
|
||||||
|
|
||||||
|
discovered_env = run_autodiscover()
|
||||||
|
if discovered_env and discovered_env.get("CLUSTER_NODES"):
|
||||||
|
nodes = parse_nodes(discovered_env["CLUSTER_NODES"])
|
||||||
|
nodes_from_env = True
|
||||||
|
|
||||||
|
if nodes:
|
||||||
|
# Ask if user wants to save to .env
|
||||||
|
print()
|
||||||
|
response = input("Save this configuration to .env for future use? [Y/n]: ").strip().lower()
|
||||||
|
if response in ("", "y", "yes"):
|
||||||
|
save_env_file(discovered_env)
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Resolve network interfaces: CLI > .env > auto-detect by launch-cluster.sh
|
||||||
|
eth_if = args.eth_if or None
|
||||||
|
ib_if = args.ib_if or None
|
||||||
|
if not eth_if or not ib_if:
|
||||||
|
if not eth_if and env.get("ETH_IF"):
|
||||||
|
eth_if = env["ETH_IF"]
|
||||||
|
if not ib_if and env.get("IB_IF"):
|
||||||
|
ib_if = env["IB_IF"]
|
||||||
|
|
||||||
worker_nodes = get_worker_nodes(nodes) if nodes else []
|
worker_nodes = get_worker_nodes(nodes) if nodes else []
|
||||||
is_cluster = len(nodes) > 1
|
is_cluster = len(nodes) > 1
|
||||||
|
|
||||||
# Check if recipe requires cluster mode
|
# Check if recipe requires cluster mode
|
||||||
cluster_only = recipe.get("cluster_only", False)
|
cluster_only = recipe.get("cluster_only", False)
|
||||||
solo_only = recipe.get("solo_only", False)
|
solo_only = recipe.get("solo_only", False)
|
||||||
@@ -981,8 +1003,16 @@ Examples:
|
|||||||
if worker_nodes:
|
if worker_nodes:
|
||||||
print(f" Workers: {', '.join(worker_nodes)}")
|
print(f" Workers: {', '.join(worker_nodes)}")
|
||||||
print(f"Solo mode: {is_solo}")
|
print(f"Solo mode: {is_solo}")
|
||||||
|
if eth_if:
|
||||||
|
print(f"Ethernet interface: {eth_if}{' (from .env)' if not args.eth_if else ''}")
|
||||||
|
if ib_if:
|
||||||
|
print(f"InfiniBand interface: {ib_if}{' (from .env)' if not args.ib_if else ''}")
|
||||||
|
if args.container_name:
|
||||||
|
print(f"Container name: {args.container_name}")
|
||||||
|
if args.non_privileged:
|
||||||
|
print("Non-privileged mode: Yes")
|
||||||
print()
|
print()
|
||||||
|
|
||||||
# --- Build Phase ---
|
# --- Build Phase ---
|
||||||
if args.build_only or args.setup or args.force_build:
|
if args.build_only or args.setup or args.force_build:
|
||||||
if args.dry_run:
|
if args.dry_run:
|
||||||
@@ -1133,6 +1163,26 @@ Examples:
|
|||||||
cmd_parts.extend(["--nccl-debug", args.nccl_debug])
|
cmd_parts.extend(["--nccl-debug", args.nccl_debug])
|
||||||
for env_var in args.env_vars:
|
for env_var in args.env_vars:
|
||||||
cmd_parts.extend(["-e", env_var])
|
cmd_parts.extend(["-e", env_var])
|
||||||
|
if args.container_name:
|
||||||
|
cmd_parts.extend(["--name", args.container_name])
|
||||||
|
if eth_if:
|
||||||
|
cmd_parts.extend(["--eth-if", eth_if])
|
||||||
|
if ib_if:
|
||||||
|
cmd_parts.extend(["--ib-if", ib_if])
|
||||||
|
if args.build_jobs:
|
||||||
|
cmd_parts.extend(["-j", str(args.build_jobs)])
|
||||||
|
if args.no_cache_dirs:
|
||||||
|
cmd_parts.append("--no-cache-dirs")
|
||||||
|
if args.non_privileged:
|
||||||
|
cmd_parts.append("--non-privileged")
|
||||||
|
if args.mem_limit_gb:
|
||||||
|
cmd_parts.extend(["--mem-limit-gb", str(args.mem_limit_gb)])
|
||||||
|
if args.mem_swap_limit_gb:
|
||||||
|
cmd_parts.extend(["--mem-swap-limit-gb", str(args.mem_swap_limit_gb)])
|
||||||
|
if args.pids_limit:
|
||||||
|
cmd_parts.extend(["--pids-limit", str(args.pids_limit)])
|
||||||
|
if args.shm_size_gb:
|
||||||
|
cmd_parts.extend(["--shm-size-gb", str(args.shm_size_gb)])
|
||||||
cmd_parts.extend(["\\", "\n --launch-script", "/tmp/tmpXXXXXX.sh"])
|
cmd_parts.extend(["\\", "\n --launch-script", "/tmp/tmpXXXXXX.sh"])
|
||||||
print(" ".join(cmd_parts))
|
print(" ".join(cmd_parts))
|
||||||
print()
|
print()
|
||||||
@@ -1176,10 +1226,31 @@ Examples:
|
|||||||
|
|
||||||
if args.nccl_debug:
|
if args.nccl_debug:
|
||||||
cmd.extend(["--nccl-debug", args.nccl_debug])
|
cmd.extend(["--nccl-debug", args.nccl_debug])
|
||||||
|
|
||||||
for env_var in args.env_vars:
|
for env_var in args.env_vars:
|
||||||
cmd.extend(["-e", env_var])
|
cmd.extend(["-e", env_var])
|
||||||
|
|
||||||
|
if args.container_name:
|
||||||
|
cmd.extend(["--name", args.container_name])
|
||||||
|
if eth_if:
|
||||||
|
cmd.extend(["--eth-if", eth_if])
|
||||||
|
if ib_if:
|
||||||
|
cmd.extend(["--ib-if", ib_if])
|
||||||
|
if args.build_jobs:
|
||||||
|
cmd.extend(["-j", str(args.build_jobs)])
|
||||||
|
if args.no_cache_dirs:
|
||||||
|
cmd.append("--no-cache-dirs")
|
||||||
|
if args.non_privileged:
|
||||||
|
cmd.append("--non-privileged")
|
||||||
|
if args.mem_limit_gb:
|
||||||
|
cmd.extend(["--mem-limit-gb", str(args.mem_limit_gb)])
|
||||||
|
if args.mem_swap_limit_gb:
|
||||||
|
cmd.extend(["--mem-swap-limit-gb", str(args.mem_swap_limit_gb)])
|
||||||
|
if args.pids_limit:
|
||||||
|
cmd.extend(["--pids-limit", str(args.pids_limit)])
|
||||||
|
if args.shm_size_gb:
|
||||||
|
cmd.extend(["--shm-size-gb", str(args.shm_size_gb)])
|
||||||
|
|
||||||
# Add launch script
|
# Add launch script
|
||||||
cmd.extend(["--launch-script", temp_script])
|
cmd.extend(["--launch-script", temp_script])
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user