diff --git a/launch-cluster.sh b/launch-cluster.sh index 31b8bc3..1f13a2f 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -16,6 +16,7 @@ fi ETH_IF="" IB_IF="" NCCL_DEBUG_VAL="" +MASTER_PORT="29501" # Initialize variables NODES_ARG="" @@ -57,6 +58,7 @@ usage() { echo " --launch-script Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted." echo " --check-config Check configuration and auto-detection without launching" echo " --solo Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster" + echo " --master-port Port for cluster coordination: Ray head port or PyTorch distributed master port (default: 29501)" echo " --no-ray No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)" echo " --no-cache-dirs Do not mount default cache directories (~/.cache/vllm, ~/.cache/flashinfer, ~/.triton)" echo " -d Daemon mode (only for 'start' action)" @@ -94,6 +96,7 @@ while [[ "$#" -gt 0 ]]; do NCCL_DEBUG_VAL="INFO" fi ;; + --master-port|--head-port) MASTER_PORT="$2"; shift ;; --check-config) CHECK_CONFIG="true" ;; --solo) SOLO_MODE="true" ;; --no-ray) NO_RAY_MODE="true" ;; @@ -554,7 +557,7 @@ apply_mod_to_container() { # Prints the path of the temp file (caller must delete it). make_node_script() { local script_path="$1"; local nnodes="$2"; local node_rank="$3"; local master_addr="$4" - local extra="--nnodes $nnodes --node-rank $node_rank --master-addr $master_addr" + local extra="--nnodes $nnodes --node-rank $node_rank --master-addr $master_addr --master-port $MASTER_PORT" [[ "$node_rank" -gt 0 ]] && extra="$extra --headless" local tmp; tmp=$(mktemp /tmp/vllm_node_script_XXXXXX.sh) @@ -613,7 +616,7 @@ start_ray_head() { local container="$1" echo "Starting Ray HEAD node on $HEAD_IP..." docker exec -d "$container" bash -c \ - "ray start --block --head --port 6379 --object-store-memory 1073741824 --num-cpus 2 \ + "ray start --block --head --port $MASTER_PORT --object-store-memory 1073741824 --num-cpus 2 \ --node-ip-address $HEAD_IP --include-dashboard=false --disable-usage-stats \ >> /proc/1/fd/1 2>&1" } @@ -625,7 +628,7 @@ start_ray_worker() { ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker_ip" \ "docker exec -d $container bash -c \ 'ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats \ - --address=$HEAD_IP:6379 --node-ip-address $worker_ip >> /proc/1/fd/1 2>&1'" + --address=$HEAD_IP:$MASTER_PORT --node-ip-address $worker_ip >> /proc/1/fd/1 2>&1'" } # Start Cluster Function @@ -765,7 +768,7 @@ exec_no_ray_cluster() { else local clean clean=$(echo "$base_cmd" | sed 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//') - worker_cmd="$clean --nnodes $total_nodes --node-rank $rank --master-addr $HEAD_IP --headless" + worker_cmd="$clean --nnodes $total_nodes --node-rank $rank --master-addr $HEAD_IP --master-port $MASTER_PORT --headless" fi echo "Launching worker (rank $rank) on $worker..." ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker" \ @@ -780,7 +783,7 @@ exec_no_ray_cluster() { else local clean clean=$(echo "$base_cmd" | sed 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//') - head_cmd="$clean --nnodes $total_nodes --node-rank 0 --master-addr $HEAD_IP" + head_cmd="$clean --nnodes $total_nodes --node-rank 0 --master-addr $HEAD_IP --master-port $MASTER_PORT" fi echo "Executing command on head node (rank 0): $head_cmd" diff --git a/run-recipe.py b/run-recipe.py index f130dda..ba4563b 100755 --- a/run-recipe.py +++ b/run-recipe.py @@ -827,6 +827,7 @@ Examples: dest="no_ray", help="No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)" ) + launch_group.add_argument("--master-port", "--head-port", type=int, dest="master_port", help="Port for cluster coordination (Ray head port or PyTorch distributed master port, default: 29501)") launch_group.add_argument("--name", dest="container_name", help="Override container name (default: vllm_node)") launch_group.add_argument("--eth-if", dest="eth_if", help="Ethernet interface (overrides .env and auto-detection)") launch_group.add_argument("--ib-if", dest="ib_if", help="InfiniBand interface (overrides .env and auto-detection)") @@ -1165,6 +1166,8 @@ Examples: cmd_parts.extend(["--nccl-debug", args.nccl_debug]) for env_var in args.env_vars: cmd_parts.extend(["-e", env_var]) + if args.master_port: + cmd_parts.extend(["--master-port", str(args.master_port)]) if args.container_name: cmd_parts.extend(["--name", args.container_name]) if eth_if: @@ -1232,6 +1235,8 @@ Examples: for env_var in args.env_vars: cmd.extend(["-e", env_var]) + if args.master_port: + cmd.extend(["--master-port", str(args.master_port)]) if args.container_name: cmd.extend(["--name", args.container_name]) if eth_if: