Add parallel copy option to build-and-copy.sh

Introduced the --copy-parallel flag to enable concurrent copying of Docker images to multiple hosts. Updated the README with usage instructions and details about the new option. Refactored the script to support both serial and parallel copy modes for improved efficiency.
This commit is contained in:
Eric Lewis
2025-12-18 01:24:48 -05:00
parent e67abd5e6e
commit 11355677f6
2 changed files with 51 additions and 9 deletions

View File

@@ -14,8 +14,9 @@ The Dockerfile builds from the main branch of VLLM, so depending on when you run
### 2025-12-18
Updated `build-and-copy.sh` to support copying to multiple hosts.
Updated `build-and-copy.sh` to support copying to multiple hosts.
- Added `-c, --copy-to` (accepts space- or comma-separated host lists) and kept `--copy-to-host` as a backward-compatible alias.
- Added `--copy-parallel` to copy to all hosts concurrently.
- Short `-h` is now used for help.
### 2025-12-15
@@ -99,6 +100,12 @@ Copy to multiple hosts (space- or comma-separated after the flag):
./build-and-copy.sh --copy-to 192.168.177.12 192.168.177.13
```
Copy to multiple hosts in parallel:
```bash
./build-and-copy.sh --copy-to 192.168.177.12 192.168.177.13 --copy-parallel
```
Using a different username:
```bash
@@ -146,6 +153,7 @@ Using a different username:
| `--vllm-ref <ref>` | vLLM commit SHA, branch or tag (default: 'main') |
| `-c, --copy-to <host[,host...] or host host...>` | Host(s) to copy the image to after building (space- or comma-separated list after the flag). |
| `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). |
| `--copy-parallel` | Copy to all specified hosts concurrently (default: hosts are copied to one at a time). |
| `-u, --user <user>` | Username for SSH connection (default: current user) |
| `--no-build` | Skip building, only copy existing image (requires `--copy-to`) |
| `-h, --help` | Show help message |

View File

@@ -14,6 +14,7 @@ NO_BUILD=false
TRITON_REF="v3.5.1"
VLLM_REF="main"
TMP_IMAGE=""
PARALLEL_COPY=false
cleanup() {
if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
@@ -36,6 +37,21 @@ add_copy_hosts() {
done
}
#######################################
# Stream the local image archive to one remote host and load it into Docker
# there, reporting how long the transfer took.
# Globals:
#   SSH_USER  (read) - user name used for the SSH connection
#   TMP_IMAGE (read) - path of the local image archive sent to the host
# Arguments:
#   $1 - host to copy the image to
# Outputs:
#   Progress and elapsed time on stdout; the failure message on stderr.
# Returns:
#   0 if the remote "docker load" succeeded, 1 otherwise (including when
#   the archive cannot be opened).
#######################################
copy_to_host() {
    local host="$1"
    local host_copy_start host_copy_end host_copy_time

    echo "Loading image into ${SSH_USER}@${host}..."
    host_copy_start=$(date +%s)

    # Feed the archive via redirection instead of `cat ... | ssh`: a pipeline
    # reports only ssh's status unless pipefail is set, so an unreadable
    # archive could be reported as a successful copy. With a redirect, a file
    # that cannot be opened fails the command before ssh is even started.
    if ssh "${SSH_USER}@${host}" "docker load" < "$TMP_IMAGE"; then
        host_copy_end=$(date +%s)
        host_copy_time=$((host_copy_end - host_copy_start))
        printf "Copy to %s completed in %02d:%02d:%02d\n" \
            "$host" \
            $((host_copy_time / 3600)) \
            $((host_copy_time % 3600 / 60)) \
            $((host_copy_time % 60))
    else
        echo "Copy to $host failed." >&2
        return 1
    fi
}
# Help function
usage() {
echo "Usage: $0 [OPTIONS]"
@@ -46,6 +62,7 @@ usage() {
echo " --vllm-ref <ref> : vLLM commit SHA, branch or tag (default: 'main')"
echo " -c, --copy-to <hosts> : Host(s) to copy the image to. Accepts comma or space-delimited lists after the flag."
echo " --copy-to-host : Alias for --copy-to (backwards compatibility)."
echo " --copy-parallel : Copy to all hosts in parallel instead of serially."
echo " -u, --user <user> : Username for ssh command (default: \$USER)"
echo " --no-build : Skip building, only copy image (requires --copy-to)"
echo " -h, --help : Show this help message"
@@ -78,6 +95,7 @@ while [[ "$#" -gt 0 ]]; do
continue
;;
-u|--user) SSH_USER="$2"; shift ;;
--copy-parallel) PARALLEL_COPY=true ;;
--no-build) NO_BUILD=true ;;
-h|--help) usage ;;
*) echo "Unknown parameter passed: $1"; usage ;;
@@ -130,20 +148,36 @@ fi
COPY_TIME=0
if [ "${#COPY_HOSTS[@]}" -gt 0 ]; then
echo "Copying image '$IMAGE_TAG' to ${#COPY_HOSTS[@]} host(s): ${COPY_HOSTS[*]}"
if [ "$PARALLEL_COPY" = true ]; then
echo "Parallel copy enabled."
fi
COPY_START=$(date +%s)
TMP_IMAGE=$(mktemp -t vllm_image.XXXXXX)
echo "Saving image locally to $TMP_IMAGE..."
docker save -o "$TMP_IMAGE" "$IMAGE_TAG"
for host in "${COPY_HOSTS[@]}"; do
echo "Loading image into ${SSH_USER}@${host}..."
HOST_COPY_START=$(date +%s)
cat "$TMP_IMAGE" | ssh "${SSH_USER}@${host}" "docker load"
HOST_COPY_END=$(date +%s)
HOST_COPY_TIME=$((HOST_COPY_END - HOST_COPY_START))
printf "Copy to %s completed in %02d:%02d:%02d\n" "$host" $((HOST_COPY_TIME/3600)) $((HOST_COPY_TIME%3600/60)) $((HOST_COPY_TIME%60))
done
# Copy the image to every host in COPY_HOSTS, either concurrently (one
# background job per host) or one host at a time. In both modes a failed
# copy ends the script with exit status 1.
if [ "$PARALLEL_COPY" = true ]; then
    PIDS=()
    for host in "${COPY_HOSTS[@]}"; do
        copy_to_host "$host" &
        PIDS+=("$!")
    done
    # Wait for every job rather than stopping at the first failure, so no
    # background copy is still running when the script exits.
    COPY_FAILURE=0
    for pid in "${PIDS[@]}"; do
        if ! wait "$pid"; then
            COPY_FAILURE=1
        fi
    done
    if [ "$COPY_FAILURE" -ne 0 ]; then
        echo "One or more copies failed." >&2
        exit 1
    fi
else
    for host in "${COPY_HOSTS[@]}"; do
        # Check the status explicitly so a failed copy is never followed by
        # the success/timing summary, matching the parallel branch.
        # copy_to_host has already printed which host failed.
        if ! copy_to_host "$host"; then
            exit 1
        fi
    done
fi
COPY_END=$(date +%s)
COPY_TIME=$((COPY_END - COPY_START))