From e67abd5e6ef36df11cd57a36ac3bcf2ddcfdbc48 Mon Sep 17 00:00:00 2001 From: Eric Lewis Date: Thu, 18 Dec 2025 00:30:27 -0500 Subject: [PATCH] Add multi-host copy support to build-and-copy.sh Updated build-and-copy.sh to support copying Docker images to multiple hosts using the new -c/--copy-to flag, which accepts space- or comma-separated host lists. The old --copy-to-host flag is retained as an alias for backward compatibility, and -h is now used for help. The README was updated to document these changes and provide new usage examples. --- README.md | 36 +++++++++++++++-------- build-and-copy.sh | 75 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 87 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index cfa201e..114b75d 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,12 @@ The Dockerfile builds from the main branch of VLLM, so depending on when you run ## CHANGELOG +### 2025-12-18 + +Updated `build-and-copy.sh` to support copying to multiple hosts. +- Added `-c, --copy-to` (accepts space- or comma-separated host lists) and kept `--copy-to-host` as a backward-compatible alias. +- Short `-h` is now used for help. + ### 2025-12-15 Updated `build-and-copy.sh` flags: @@ -28,7 +34,7 @@ Triton is now being built from the source, alongside with its companion triton_k Added new flags to `build-and-copy.sh`: - `--triton-sha `: Specify Triton commit SHA (defaults to v3.5.1 currently) -- `--no-build`: Skip building and only copy existing image (requires `--copy-to-host`) +- `--no-build`: Skip building and only copy existing image (requires `--copy-to`) ### 2025-12-11 update @@ -65,7 +71,7 @@ Using a provided build script is recommended, but if you want to build using `do ### Using the Build Script (Recommended) -The `build-and-copy.sh` script automates the build process and optionally copies the image to another node. This is the recommended method for building and deploying to multiple Spark nodes. 
+The `build-and-copy.sh` script automates the build process and optionally copies the image to one or more nodes. This is the recommended method for building and deploying to multiple Spark nodes. **Basic usage (build only):** @@ -79,18 +85,24 @@ The `build-and-copy.sh` script automates the build process and optionally copies ./build-and-copy.sh --tag my-vllm-node ``` -**Build and copy to another Spark node:** +**Build and copy to Spark node(s):** -Using the same username as currently logged-in user: +Using the same username as currently logged-in user (single host): ```bash -./build-and-copy.sh --copy-to-host 192.168.177.12 +./build-and-copy.sh --copy-to 192.168.177.12 +``` + +Copy to multiple hosts (space- or comma-separated after the flag): + +```bash +./build-and-copy.sh --copy-to 192.168.177.12 192.168.177.13 ``` Using a different username: ```bash -./build-and-copy.sh --copy-to-host 192.168.177.12 --user your_username +./build-and-copy.sh --copy-to 192.168.177.12 --user your_username ``` **Force rebuild vLLM source only:** @@ -108,7 +120,7 @@ Using a different username: **Combined example (rebuild vLLM and copy to another node):** ```bash -./build-and-copy.sh --rebuild-vllm --copy-to-host 192.168.177.12 +./build-and-copy.sh --rebuild-vllm --copy-to 192.168.177.12 ``` **Build with specific Triton commit:** @@ -120,7 +132,7 @@ Using a different username: **Copy existing image without rebuilding:** ```bash -./build-and-copy.sh --no-build --copy-to-host 192.168.177.12 +./build-and-copy.sh --no-build --copy-to 192.168.177.12 ``` **Available options:** @@ -132,10 +144,11 @@ Using a different username: | `--rebuild-vllm` | Force rebuild vLLM source only (sets CACHEBUST_VLLM) | | `--triton-ref ` | Triton commit SHA, branch or tag (default: 'v3.5.1') | | `--vllm-ref ` | vLLM commit SHA, branch or tag (default: 'main') | -| `-h, --copy-to-host ` | Host address to copy the image to after building | +| `-c, --copy-to ` | Host(s) to copy the image to after building 
(space- or comma-separated list after the flag). | +| `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). | | `-u, --user ` | Username for SSH connection (default: current user) | -| `--no-build` | Skip building, only copy existing image (requires `--copy-to-host`) | -| `--help` | Show help message | +| `--no-build` | Skip building, only copy existing image (requires `--copy-to`) | +| `-h, --help` | Show help message | **IMPORTANT**: When copying to another node, make sure you use the Spark IP assigned to its ConnectX 7 interface (enp1s0f1np1), and not the 10G interface (enP7s7)! @@ -330,4 +343,3 @@ Modify `--num-prompts` to benchmark concurrent requests - the command above will ### Hardware Architecture **Note:** The Dockerfile defaults to `TORCH_CUDA_ARCH_LIST=12.1a` (NVIDIA GB10). If you are using different hardware, update the `ENV` variable in the Dockerfile before building. - diff --git a/build-and-copy.sh b/build-and-copy.sh index 19a2c11..ddc1b5f 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -8,11 +8,33 @@ START_TIME=$(date +%s) IMAGE_TAG="vllm-node" REBUILD_DEPS=false REBUILD_VLLM=false -COPY_HOST="" +COPY_HOSTS=() SSH_USER="$USER" NO_BUILD=false TRITON_REF="v3.5.1" VLLM_REF="main" +TMP_IMAGE="" +# Remove the temporary image tarball on exit, if one was created. +cleanup() { + if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then + rm -f "$TMP_IMAGE" + fi +} + +trap cleanup EXIT +# Append every non-empty host from the arguments (each may be a comma-separated list) to COPY_HOSTS. +add_copy_hosts() { + local token part PARTS + for token in "$@"; do + IFS=',' read -ra PARTS <<< "$token" + for part in "${PARTS[@]}"; do + part="${part//[[:space:]]/}" + if [ -n "$part" ]; then + COPY_HOSTS+=("$part") + fi + done + done +} # Help function usage() { @@ -22,10 +44,11 @@ usage() { echo " --rebuild-vllm : Set cache bust for vllm" echo " --triton-ref : Triton commit SHA, branch or tag (default: 'v3.5.1')" echo " --vllm-ref : vLLM commit SHA, branch or tag (default: 'main')" - echo " -h, --copy-to-host : Host address to copy the image to (if not set, don't copy)" + echo " -c, --copy-to : Host(s) to copy the 
image to. Accepts comma or space-delimited lists after the flag." + echo " --copy-to-host : Alias for --copy-to (backwards compatibility)." echo " -u, --user : Username for ssh command (default: \$USER)" - echo " --no-build : Skip building, only copy image (requires --copy-to-host)" - echo " --help : Show this help message" + echo " --no-build : Skip building, only copy image (requires --copy-to)" + echo " -h, --help : Show this help message" exit 1 } @@ -37,18 +60,34 @@ while [[ "$#" -gt 0 ]]; do --rebuild-vllm) REBUILD_VLLM=true ;; --triton-ref) TRITON_REF="$2"; shift ;; --vllm-ref) VLLM_REF="$2"; shift ;; - -h|--copy-to-host) COPY_HOST="$2"; shift ;; + -c|--copy-to|--copy-to-host|--copy-to-hosts) + shift # drop the flag itself; the arguments up to the next option are hosts + if [ "$#" -eq 0 ]; then + echo "Error: --copy-to requires at least one host" + exit 1 + fi + EXISTING_HOSTS=${#COPY_HOSTS[@]} # remember the count to detect a flag that added no hosts + while [[ "$#" -gt 0 && "$1" != -* ]]; do # stop at the first argument starting with '-' + add_copy_hosts "$1" + shift + done + if [ "${#COPY_HOSTS[@]}" -eq "$EXISTING_HOSTS" ]; then + echo "Error: --copy-to requires at least one host" + exit 1 + fi + continue # host arguments were already shifted off here; skip the loop's trailing shift + ;; -u|--user) SSH_USER="$2"; shift ;; --no-build) NO_BUILD=true ;; - --help) usage ;; + -h|--help) usage ;; *) echo "Unknown parameter passed: $1"; usage ;; esac shift done # Validate --no-build usage -if [ "$NO_BUILD" = true ] && [ -z "$COPY_HOST" ]; then - echo "Error: --no-build requires --copy-to-host to be specified" +if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then + echo "Error: --no-build requires --copy-to to be specified" exit 1 fi @@ -89,11 +128,23 @@ fi # Copy to host if requested COPY_TIME=0 -if [ -n "$COPY_HOST" ]; then - echo "Copying image '$IMAGE_TAG' to ${SSH_USER}@${COPY_HOST}..." 
+if [ "${#COPY_HOSTS[@]}" -gt 0 ]; then + echo "Copying image '$IMAGE_TAG' to ${#COPY_HOSTS[@]} host(s): ${COPY_HOSTS[*]}" COPY_START=$(date +%s) - # Using the pipe method from README.md - docker save "$IMAGE_TAG" | ssh "${SSH_USER}@${COPY_HOST}" "docker load" + # Save the image to a temporary tarball once so every host loads the same archive. + TMP_IMAGE=$(mktemp -t vllm_image.XXXXXX) + echo "Saving image locally to $TMP_IMAGE..." + docker save -o "$TMP_IMAGE" "$IMAGE_TAG" + # Feed the tarball to "docker load" on each host over ssh and report the per-host duration. + for host in "${COPY_HOSTS[@]}"; do + echo "Loading image into ${SSH_USER}@${host}..." + HOST_COPY_START=$(date +%s) + ssh "${SSH_USER}@${host}" "docker load" < "$TMP_IMAGE" + HOST_COPY_END=$(date +%s) + HOST_COPY_TIME=$((HOST_COPY_END - HOST_COPY_START)) + printf "Copy to %s completed in %02d:%02d:%02d\n" "$host" $((HOST_COPY_TIME/3600)) $((HOST_COPY_TIME%3600/60)) $((HOST_COPY_TIME%60)) + done + COPY_END=$(date +%s) COPY_TIME=$((COPY_END - COPY_START)) echo "Copy complete."