This commit is contained in:
Eugene Rakhmatulin
2026-03-13 12:32:43 -07:00
parent 4bcbbaa25a
commit caa83d9e5b

View File

@@ -549,52 +549,39 @@ apply_mod_to_container() {
fi fi
} }
# Copy Launch Script to Container Function # Build a patched copy of the launch script on the host for a specific node.
copy_launch_script_to_container() { # Strips --distributed-executor-backend and appends multi-node args.
local container="$1" # Prints the path of the temp file (caller must delete it).
local script_path="$2" make_node_script() {
local script_path="$1"; local nnodes="$2"; local node_rank="$3"; local master_addr="$4"
echo "Copying launch script to head node..."
# Copy script into container as /workspace/exec-script.sh
docker cp "$script_path" "$container:/workspace/exec-script.sh"
# Make executable
docker exec "$container" chmod +x /workspace/exec-script.sh
echo " Launch script copied to head node"
}
# Copy Launch Script to Worker via SSH + docker cp
copy_launch_script_to_worker() {
local worker_ip="$1"; local container="$2"; local script_path="$3"
echo "Copying launch script to worker $worker_ip..."
local remote_tmp="/tmp/vllm_script_$(date +%s)_$RANDOM.sh"
scp -o BatchMode=yes -o StrictHostKeyChecking=no "$script_path" "$worker_ip:$remote_tmp"
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker_ip" \
"docker cp $remote_tmp $container:/workspace/exec-script.sh && \
docker exec $container chmod +x /workspace/exec-script.sh && \
rm -f $remote_tmp"
}
# Patch /workspace/exec-script.sh in container: inject --nnodes/--node-rank/--master-addr/--headless
patch_script_in_container() {
local is_local="$1"; local node_ip="$2"; local container="$3"
local nnodes="$4"; local node_rank="$5"; local master_addr="$6"
local extra="--nnodes $nnodes --node-rank $node_rank --master-addr $master_addr" local extra="--nnodes $nnodes --node-rank $node_rank --master-addr $master_addr"
[[ "$node_rank" -gt 0 ]] && extra="$extra --headless" [[ "$node_rank" -gt 0 ]] && extra="$extra --headless"
local patch="sed -i '/--distributed-executor-backend/d' /workspace/exec-script.sh && \ local tmp; tmp=$(mktemp /tmp/vllm_node_script_XXXXXX.sh)
echo '$extra' >> /workspace/exec-script.sh && \ grep -v -- '--distributed-executor-backend' "$script_path" > "$tmp"
chmod +x /workspace/exec-script.sh" echo "$extra" >> "$tmp"
chmod +x "$tmp"
echo "$tmp"
}
if [[ "$is_local" == "true" ]]; then # Copy a script file into a local container as /workspace/exec-script.sh
docker exec "$container" bash -c "$patch" copy_script_to_container() {
else local container="$1"; local script_path="$2"; local label="${3:-node}"
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$node_ip" \ echo "Copying launch script to $label..."
"docker exec $container bash -c \"$patch\"" docker cp "$script_path" "$container:/workspace/exec-script.sh" || { echo "Error: docker cp to $label failed"; exit 1; }
fi docker exec "$container" chmod +x /workspace/exec-script.sh
}
# Copy a script file to a remote container via scp + docker cp
copy_script_to_worker() {
local worker_ip="$1"; local container="$2"; local script_path="$3"
echo "Copying launch script to worker $worker_ip..."
local remote_tmp="/tmp/vllm_script_$(date +%s)_$RANDOM.sh"
scp -o BatchMode=yes -o StrictHostKeyChecking=no "$script_path" "$worker_ip:$remote_tmp" || { echo "Error: scp to $worker_ip failed"; exit 1; }
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$worker_ip" \
"docker cp $remote_tmp $container:/workspace/exec-script.sh && \
docker exec $container chmod +x /workspace/exec-script.sh && \
rm -f $remote_tmp" || { echo "Error: docker cp to worker $worker_ip failed"; exit 1; }
} }
# Build -e KEY=VALUE flags for a given node IP (used in docker run and docker exec) # Build -e KEY=VALUE flags for a given node IP (used in docker run and docker exec)
@@ -696,15 +683,21 @@ start_cluster() {
# Copy (and patch for no-ray) launch script # Copy (and patch for no-ray) launch script
if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then
local total_nodes=$(( 1 + ${#PEER_NODES[@]} )) local total_nodes=$(( 1 + ${#PEER_NODES[@]} ))
copy_launch_script_to_container "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH"
if [[ "$NO_RAY_MODE" == "true" ]]; then if [[ "$NO_RAY_MODE" == "true" ]]; then
patch_script_in_container "true" "$HEAD_IP" "$CONTAINER_NAME" "$total_nodes" "0" "$HEAD_IP" # Build per-node patched scripts on the host, then copy
local head_script; head_script=$(make_node_script "$LAUNCH_SCRIPT_PATH" "$total_nodes" "0" "$HEAD_IP")
copy_script_to_container "$CONTAINER_NAME" "$head_script" "head node ($HEAD_IP)"
rm -f "$head_script"
local rank=1 local rank=1
for worker in "${PEER_NODES[@]}"; do for worker in "${PEER_NODES[@]}"; do
copy_launch_script_to_worker "$worker" "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH" local worker_script; worker_script=$(make_node_script "$LAUNCH_SCRIPT_PATH" "$total_nodes" "$rank" "$HEAD_IP")
patch_script_in_container "false" "$worker" "$CONTAINER_NAME" "$total_nodes" "$rank" "$HEAD_IP" copy_script_to_worker "$worker" "$CONTAINER_NAME" "$worker_script"
rm -f "$worker_script"
(( rank++ )) (( rank++ ))
done done
else
copy_script_to_container "$CONTAINER_NAME" "$LAUNCH_SCRIPT_PATH" "head node"
fi fi
fi fi