From e67abd5e6ef36df11cd57a36ac3bcf2ddcfdbc48 Mon Sep 17 00:00:00 2001 From: Eric Lewis Date: Thu, 18 Dec 2025 00:30:27 -0500 Subject: [PATCH 1/2] Add multi-host copy support to build-and-copy.sh Updated build-and-copy.sh to support copying Docker images to multiple hosts using the new -c/--copy-to flag, which accepts space- or comma-separated host lists. The old --copy-to-host flag is retained as an alias for backward compatibility, and -h is now used for help. The README was updated to document these changes and provide new usage examples. --- README.md | 36 +++++++++++++++-------- build-and-copy.sh | 75 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 87 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index cfa201e..114b75d 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,12 @@ The Dockerfile builds from the main branch of VLLM, so depending on when you run ## CHANGELOG +### 2025-12-18 + +Updated `build-and-copy.sh` to support copying to multiple hosts. +- Added `-c, --copy-to` (accepts space- or comma-separated host lists) and kept `--copy-to-host` as a backward-compatible alias. +- Short `-h` is now used for help. + ### 2025-12-15 Updated `build-and-copy.sh` flags: @@ -28,7 +34,7 @@ Triton is now being built from the source, alongside with its companion triton_k Added new flags to `build-and-copy.sh`: - `--triton-sha `: Specify Triton commit SHA (defaults to v3.5.1 currently) -- `--no-build`: Skip building and only copy existing image (requires `--copy-to-host`) +- `--no-build`: Skip building and only copy existing image (requires `--copy-to`) ### 2025-12-11 update @@ -65,7 +71,7 @@ Using a provided build script is recommended, but if you want to build using `do ### Using the Build Script (Recommended) -The `build-and-copy.sh` script automates the build process and optionally copies the image to another node. This is the recommended method for building and deploying to multiple Spark nodes. 
+The `build-and-copy.sh` script automates the build process and optionally copies the image to one or more nodes. This is the recommended method for building and deploying to multiple Spark nodes. **Basic usage (build only):** @@ -79,18 +85,24 @@ The `build-and-copy.sh` script automates the build process and optionally copies ./build-and-copy.sh --tag my-vllm-node ``` -**Build and copy to another Spark node:** +**Build and copy to Spark node(s):** -Using the same username as currently logged-in user: +Using the same username as currently logged-in user (single host): ```bash -./build-and-copy.sh --copy-to-host 192.168.177.12 +./build-and-copy.sh --copy-to 192.168.177.12 +``` + +Copy to multiple hosts (space- or comma-separated after the flag): + +```bash +./build-and-copy.sh --copy-to 192.168.177.12 192.168.177.13 ``` Using a different username: ```bash -./build-and-copy.sh --copy-to-host 192.168.177.12 --user your_username +./build-and-copy.sh --copy-to 192.168.177.12 --user your_username ``` **Force rebuild vLLM source only:** @@ -108,7 +120,7 @@ Using a different username: **Combined example (rebuild vLLM and copy to another node):** ```bash -./build-and-copy.sh --rebuild-vllm --copy-to-host 192.168.177.12 +./build-and-copy.sh --rebuild-vllm --copy-to 192.168.177.12 ``` **Build with specific Triton commit:** @@ -120,7 +132,7 @@ Using a different username: **Copy existing image without rebuilding:** ```bash -./build-and-copy.sh --no-build --copy-to-host 192.168.177.12 +./build-and-copy.sh --no-build --copy-to 192.168.177.12 ``` **Available options:** @@ -132,10 +144,11 @@ Using a different username: | `--rebuild-vllm` | Force rebuild vLLM source only (sets CACHEBUST_VLLM) | | `--triton-ref ` | Triton commit SHA, branch or tag (default: 'v3.5.1') | | `--vllm-ref ` | vLLM commit SHA, branch or tag (default: 'main') | -| `-h, --copy-to-host ` | Host address to copy the image to after building | +| `-c, --copy-to ` | Host(s) to copy the image to after building 
(space- or comma-separated list after the flag). | +| `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). | | `-u, --user ` | Username for SSH connection (default: current user) | -| `--no-build` | Skip building, only copy existing image (requires `--copy-to-host`) | -| `--help` | Show help message | +| `--no-build` | Skip building, only copy existing image (requires `--copy-to`) | +| `-h, --help` | Show help message | **IMPORTANT**: When copying to another node, make sure you use the Spark IP assigned to its ConnectX 7 interface (enp1s0f1np1), and not the 10G interface (enP7s7)! @@ -330,4 +343,3 @@ Modify `--num-prompts` to benchmark concurrent requests - the command above will ### Hardware Architecture **Note:** The Dockerfile defaults to `TORCH_CUDA_ARCH_LIST=12.1a` (NVIDIA GB10). If you are using different hardware, update the `ENV` variable in the Dockerfile before building. - diff --git a/build-and-copy.sh b/build-and-copy.sh index 19a2c11..ddc1b5f 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -8,11 +8,33 @@ START_TIME=$(date +%s) IMAGE_TAG="vllm-node" REBUILD_DEPS=false REBUILD_VLLM=false -COPY_HOST="" +COPY_HOSTS=() SSH_USER="$USER" NO_BUILD=false TRITON_REF="v3.5.1" VLLM_REF="main" +TMP_IMAGE="" + +cleanup() { + if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then + rm -f "$TMP_IMAGE" + fi +} + +trap cleanup EXIT + +add_copy_hosts() { + local token part PARTS + for token in "$@"; do + IFS=',' read -ra PARTS <<< "$token" + for part in "${PARTS[@]}"; do + part="${part//[[:space:]]/}" + if [ -n "$part" ]; then + COPY_HOSTS+=("$part") + fi + done + done +} # Help function usage() { @@ -22,10 +44,11 @@ usage() { echo " --rebuild-vllm : Set cache bust for vllm" echo " --triton-ref : Triton commit SHA, branch or tag (default: 'v3.5.1')" echo " --vllm-ref : vLLM commit SHA, branch or tag (default: 'main')" - echo " -h, --copy-to-host : Host address to copy the image to (if not set, don't copy)" + echo " -c, --copy-to : Host(s) to copy the 
image to. Accepts comma or space-delimited lists after the flag." + echo " --copy-to-host : Alias for --copy-to (backwards compatibility)." echo " -u, --user : Username for ssh command (default: \$USER)" - echo " --no-build : Skip building, only copy image (requires --copy-to-host)" - echo " --help : Show this help message" + echo " --no-build : Skip building, only copy image (requires --copy-to)" + echo " -h, --help : Show this help message" exit 1 } @@ -37,18 +60,34 @@ while [[ "$#" -gt 0 ]]; do --rebuild-vllm) REBUILD_VLLM=true ;; --triton-ref) TRITON_REF="$2"; shift ;; --vllm-ref) VLLM_REF="$2"; shift ;; - -h|--copy-to-host) COPY_HOST="$2"; shift ;; + -c|--copy-to|--copy-to-host|--copy-to-hosts) + shift + if [ "$#" -eq 0 ]; then + echo "Error: --copy-to requires at least one host" + exit 1 + fi + EXISTING_HOSTS=${#COPY_HOSTS[@]} + while [[ "$#" -gt 0 && "$1" != -* ]]; do + add_copy_hosts "$1" + shift + done + if [ "${#COPY_HOSTS[@]}" -eq "$EXISTING_HOSTS" ]; then + echo "Error: --copy-to requires at least one host" + exit 1 + fi + continue + ;; -u|--user) SSH_USER="$2"; shift ;; --no-build) NO_BUILD=true ;; - --help) usage ;; + -h|--help) usage ;; *) echo "Unknown parameter passed: $1"; usage ;; esac shift done # Validate --no-build usage -if [ "$NO_BUILD" = true ] && [ -z "$COPY_HOST" ]; then - echo "Error: --no-build requires --copy-to-host to be specified" +if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then + echo "Error: --no-build requires --copy-to to be specified" exit 1 fi @@ -89,11 +128,23 @@ fi # Copy to host if requested COPY_TIME=0 -if [ -n "$COPY_HOST" ]; then - echo "Copying image '$IMAGE_TAG' to ${SSH_USER}@${COPY_HOST}..." 
+if [ "${#COPY_HOSTS[@]}" -gt 0 ]; then + echo "Copying image '$IMAGE_TAG' to ${#COPY_HOSTS[@]} host(s): ${COPY_HOSTS[*]}" COPY_START=$(date +%s) - # Using the pipe method from README.md - docker save "$IMAGE_TAG" | ssh "${SSH_USER}@${COPY_HOST}" "docker load" + + TMP_IMAGE=$(mktemp -t vllm_image.XXXXXX) + echo "Saving image locally to $TMP_IMAGE..." + docker save -o "$TMP_IMAGE" "$IMAGE_TAG" + + for host in "${COPY_HOSTS[@]}"; do + echo "Loading image into ${SSH_USER}@${host}..." + HOST_COPY_START=$(date +%s) + cat "$TMP_IMAGE" | ssh "${SSH_USER}@${host}" "docker load" + HOST_COPY_END=$(date +%s) + HOST_COPY_TIME=$((HOST_COPY_END - HOST_COPY_START)) + printf "Copy to %s completed in %02d:%02d:%02d\n" "$host" $((HOST_COPY_TIME/3600)) $((HOST_COPY_TIME%3600/60)) $((HOST_COPY_TIME%60)) + done + COPY_END=$(date +%s) COPY_TIME=$((COPY_END - COPY_START)) echo "Copy complete." From 11355677f6ec4ca2799fbda0d7d6b833bcfbf1ad Mon Sep 17 00:00:00 2001 From: Eric Lewis Date: Thu, 18 Dec 2025 01:24:48 -0500 Subject: [PATCH 2/2] Add parallel copy option to build-and-copy.sh Introduced the --copy-parallel flag to enable concurrent copying of Docker images to multiple hosts. Updated the README with usage instructions and details about the new option. Refactored the script to support both serial and parallel copy modes for improved efficiency. --- README.md | 10 +++++++++- build-and-copy.sh | 50 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 114b75d..8040546 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,9 @@ The Dockerfile builds from the main branch of VLLM, so depending on when you run ### 2025-12-18 -Updated `build-and-copy.sh` to support copying to multiple hosts. +Updated `build-and-copy.sh` to support copying to multiple hosts. - Added `-c, --copy-to` (accepts space- or comma-separated host lists) and kept `--copy-to-host` as a backward-compatible alias. 
+- Added `--copy-parallel` to copy to all hosts concurrently. - Short `-h` is now used for help. ### 2025-12-15 @@ -99,6 +100,12 @@ Copy to multiple hosts (space- or comma-separated after the flag): ./build-and-copy.sh --copy-to 192.168.177.12 192.168.177.13 ``` +Copy to multiple hosts in parallel: + +```bash +./build-and-copy.sh --copy-to 192.168.177.12 192.168.177.13 --copy-parallel +``` + Using a different username: ```bash @@ -146,6 +153,7 @@ Using a different username: | `--vllm-ref ` | vLLM commit SHA, branch or tag (default: 'main') | | `-c, --copy-to ` | Host(s) to copy the image to after building (space- or comma-separated list after the flag). | | `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). | +| `--copy-parallel` | Copy to all specified hosts concurrently. | | `-u, --user ` | Username for SSH connection (default: current user) | | `--no-build` | Skip building, only copy existing image (requires `--copy-to`) | | `-h, --help` | Show help message | diff --git a/build-and-copy.sh b/build-and-copy.sh index ddc1b5f..8bf46f6 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -14,6 +14,7 @@ NO_BUILD=false TRITON_REF="v3.5.1" VLLM_REF="main" TMP_IMAGE="" +PARALLEL_COPY=false cleanup() { if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then @@ -36,6 +37,21 @@ add_copy_hosts() { done } +copy_to_host() { + local host="$1" + echo "Loading image into ${SSH_USER}@${host}..." + local host_copy_start host_copy_end host_copy_time + host_copy_start=$(date +%s) + if ssh "${SSH_USER}@${host}" "docker load" < "$TMP_IMAGE"; then + host_copy_end=$(date +%s) + host_copy_time=$((host_copy_end - host_copy_start)) + printf "Copy to %s completed in %02d:%02d:%02d\n" "$host" $((host_copy_time/3600)) $((host_copy_time%3600/60)) $((host_copy_time%60)) + else + echo "Copy to $host failed." 
+ return 1 + fi +} + # Help function usage() { echo "Usage: $0 [OPTIONS]" @@ -46,6 +62,7 @@ usage() { echo " --vllm-ref : vLLM commit SHA, branch or tag (default: 'main')" echo " -c, --copy-to : Host(s) to copy the image to. Accepts comma or space-delimited lists after the flag." echo " --copy-to-host : Alias for --copy-to (backwards compatibility)." + echo " --copy-parallel : Copy to all hosts in parallel instead of serially." echo " -u, --user : Username for ssh command (default: \$USER)" echo " --no-build : Skip building, only copy image (requires --copy-to)" echo " -h, --help : Show this help message" @@ -78,6 +95,7 @@ while [[ "$#" -gt 0 ]]; do continue ;; -u|--user) SSH_USER="$2"; shift ;; + --copy-parallel) PARALLEL_COPY=true ;; --no-build) NO_BUILD=true ;; -h|--help) usage ;; *) echo "Unknown parameter passed: $1"; usage ;; @@ -130,20 +148,36 @@ fi COPY_TIME=0 if [ "${#COPY_HOSTS[@]}" -gt 0 ]; then echo "Copying image '$IMAGE_TAG' to ${#COPY_HOSTS[@]} host(s): ${COPY_HOSTS[*]}" + if [ "$PARALLEL_COPY" = true ]; then + echo "Parallel copy enabled." + fi COPY_START=$(date +%s) TMP_IMAGE=$(mktemp -t vllm_image.XXXXXX) echo "Saving image locally to $TMP_IMAGE..." docker save -o "$TMP_IMAGE" "$IMAGE_TAG" - for host in "${COPY_HOSTS[@]}"; do - echo "Loading image into ${SSH_USER}@${host}..." - HOST_COPY_START=$(date +%s) - cat "$TMP_IMAGE" | ssh "${SSH_USER}@${host}" "docker load" - HOST_COPY_END=$(date +%s) - HOST_COPY_TIME=$((HOST_COPY_END - HOST_COPY_START)) - printf "Copy to %s completed in %02d:%02d:%02d\n" "$host" $((HOST_COPY_TIME/3600)) $((HOST_COPY_TIME%3600/60)) $((HOST_COPY_TIME%60)) - done + if [ "$PARALLEL_COPY" = true ]; then + PIDS=() + for host in "${COPY_HOSTS[@]}"; do + copy_to_host "$host" & + PIDS+=("$!") + done + COPY_FAILURE=0 + for pid in "${PIDS[@]}"; do + if ! wait "$pid"; then + COPY_FAILURE=1 + fi + done + if [ "$COPY_FAILURE" -ne 0 ]; then + echo "One or more copies failed." 
+ exit 1 + fi + else + for host in "${COPY_HOSTS[@]}"; do + copy_to_host "$host" || exit 1 + done + fi COPY_END=$(date +%s) COPY_TIME=$((COPY_END - COPY_START))