Added master-port argument

This commit is contained in:
Eugene Rakhmatulin
2026-03-18 16:57:55 -07:00
parent 7b752c31c5
commit 7e4150feed
2 changed files with 13 additions and 5 deletions

View File

@@ -16,6 +16,7 @@ fi
ETH_IF=""
IB_IF=""
NCCL_DEBUG_VAL=""
MASTER_PORT="29501"
# Initialize variables
NODES_ARG=""
@@ -57,6 +58,7 @@ usage() {
echo " --launch-script Path to bash script to execute in the container (from examples/ directory or absolute path). If launch script is specified, action should be omitted."
echo " --check-config Check configuration and auto-detection without launching"
echo " --solo Solo mode: skip autodetection, launch only on current node, do not launch Ray cluster"
echo " --master-port Port for cluster coordination: Ray head port or PyTorch distributed master port (default: 29501)"
echo " --no-ray No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)"
echo " --no-cache-dirs Do not mount default cache directories (~/.cache/vllm, ~/.cache/flashinfer, ~/.triton)"
echo " -d Daemon mode (only for 'start' action)"
@@ -94,6 +96,7 @@ while [[ "$#" -gt 0 ]]; do
NCCL_DEBUG_VAL="INFO"
fi
;;
--master-port|--head-port) MASTER_PORT="$2"; shift ;;
--check-config) CHECK_CONFIG="true" ;;
--solo) SOLO_MODE="true" ;;
--no-ray) NO_RAY_MODE="true" ;;
@@ -554,7 +557,7 @@ apply_mod_to_container() {
# Prints the path of the temp file (caller must delete it).
make_node_script() {
local script_path="$1"; local nnodes="$2"; local node_rank="$3"; local master_addr="$4"
local extra="--nnodes $nnodes --node-rank $node_rank --master-addr $master_addr"
local extra="--nnodes $nnodes --node-rank $node_rank --master-addr $master_addr --master-port $MASTER_PORT"
[[ "$node_rank" -gt 0 ]] && extra="$extra --headless"
local tmp; tmp=$(mktemp /tmp/vllm_node_script_XXXXXX.sh)
@@ -613,7 +616,7 @@ start_ray_head() {
local container="$1"
echo "Starting Ray HEAD node on $HEAD_IP..."
docker exec -d "$container" bash -c \
"ray start --block --head --port 6379 --object-store-memory 1073741824 --num-cpus 2 \
"ray start --block --head --port $MASTER_PORT --object-store-memory 1073741824 --num-cpus 2 \
--node-ip-address $HEAD_IP --include-dashboard=false --disable-usage-stats \
>> /proc/1/fd/1 2>&1"
}
@@ -625,7 +628,7 @@ start_ray_worker() {
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker_ip" \
"docker exec -d $container bash -c \
'ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats \
--address=$HEAD_IP:6379 --node-ip-address $worker_ip >> /proc/1/fd/1 2>&1'"
--address=$HEAD_IP:$MASTER_PORT --node-ip-address $worker_ip >> /proc/1/fd/1 2>&1'"
}
# Start Cluster Function
@@ -765,7 +768,7 @@ exec_no_ray_cluster() {
else
local clean
clean=$(echo "$base_cmd" | sed 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//')
worker_cmd="$clean --nnodes $total_nodes --node-rank $rank --master-addr $HEAD_IP --headless"
worker_cmd="$clean --nnodes $total_nodes --node-rank $rank --master-addr $HEAD_IP --master-port $MASTER_PORT --headless"
fi
echo "Launching worker (rank $rank) on $worker..."
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker" \
@@ -780,7 +783,7 @@ exec_no_ray_cluster() {
else
local clean
clean=$(echo "$base_cmd" | sed 's/--distributed-executor-backend[[:space:]]*[^[:space:]]*//')
head_cmd="$clean --nnodes $total_nodes --node-rank 0 --master-addr $HEAD_IP"
head_cmd="$clean --nnodes $total_nodes --node-rank 0 --master-addr $HEAD_IP --master-port $MASTER_PORT"
fi
echo "Executing command on head node (rank 0): $head_cmd"