Files
spark-vllm-docker/build-and-copy.sh
Eric Lewis 11355677f6 Add parallel copy option to build-and-copy.sh
Introduced the --copy-parallel flag to enable concurrent copying of Docker images to multiple hosts. Updated the README with usage instructions and details about the new option. Refactored the script to support both serial and parallel copy modes for improved efficiency.
2025-12-18 01:24:48 -05:00

207 lines
6.2 KiB
Bash
Executable File

#!/bin/bash
# Abort on the first failing command. With pipefail a pipeline also counts as
# failed when any of its stages fails, not just the last one, so a failure
# while streaming the image to a remote host is not masked.
set -eo pipefail
# Start total time tracking
START_TIME=$(date +%s)
# Default values (each can be overridden from the command line)
IMAGE_TAG="vllm-node"   # tag given to the built image / image to copy
REBUILD_DEPS=false      # true: pass a fresh CACHEBUST_DEPS build arg
REBUILD_VLLM=false      # true: pass a fresh CACHEBUST_VLLM build arg
COPY_HOSTS=()           # hosts that should receive the image
SSH_USER="$USER"        # user name used for the ssh connections
NO_BUILD=false          # true: skip the docker build step
TRITON_REF="v3.5.1"     # passed to the build as TRITON_REF
VLLM_REF="main"         # passed to the build as VLLM_REF
TMP_IMAGE=""            # path of the saved image archive, once created
PARALLEL_COPY=false     # true: copy to all hosts concurrently
# Delete the temporary image archive named by TMP_IMAGE.
# Does nothing when TMP_IMAGE is empty or does not name a regular file.
cleanup() {
  [[ -n "$TMP_IMAGE" && -f "$TMP_IMAGE" ]] || return 0
  rm -f "$TMP_IMAGE"
}
# Run the cleanup on every exit path of the script.
trap cleanup EXIT
# add_copy_hosts TOKEN...
# Append host names to the global COPY_HOSTS array.
# Each TOKEN is split on commas; all whitespace is removed from every piece
# and pieces that end up empty are dropped.
# Globals: COPY_HOSTS (appended to)
add_copy_hosts() {
  local token part
  # Scratch array for the split pieces; local so it does not leak into the
  # script's global namespace.
  local -a parts
  for token in "$@"; do
    IFS=',' read -ra parts <<< "$token"
    for part in "${parts[@]}"; do
      part="${part//[[:space:]]/}"
      if [[ -n "$part" ]]; then
        COPY_HOSTS+=("$part")
      fi
    done
  done
}
# copy_to_host HOST
# Stream the saved image archive (TMP_IMAGE) into `docker load` on HOST over
# ssh, connecting as SSH_USER, and report how long the transfer took.
# Globals: SSH_USER, TMP_IMAGE (read)
# Returns: 0 on success, 1 if the archive cannot be read or the remote
#          command fails.
copy_to_host() {
  local host="$1"
  echo "Loading image into ${SSH_USER}@${host}..."
  local host_copy_start host_copy_end host_copy_time
  host_copy_start=$(date +%s)
  # Feed the archive through a redirection rather than `cat |`: an unreadable
  # or missing archive then fails this command instead of being hidden behind
  # the exit status of ssh.
  if ssh "${SSH_USER}@${host}" "docker load" < "$TMP_IMAGE"; then
    host_copy_end=$(date +%s)
    host_copy_time=$((host_copy_end - host_copy_start))
    printf "Copy to %s completed in %02d:%02d:%02d\n" "$host" \
      $((host_copy_time / 3600)) \
      $((host_copy_time % 3600 / 60)) \
      $((host_copy_time % 60))
  else
    echo "Copy to $host failed." >&2
    return 1
  fi
}
# Help function
# Prints the option summary to stdout and terminates the script with
# exit status 1.
usage() {
  printf '%s\n' \
    "Usage: $0 [OPTIONS]" \
    " -t, --tag <tag> : Image tag (default: 'vllm-node')" \
    " --rebuild-deps : Set cache bust for dependencies" \
    " --rebuild-vllm : Set cache bust for vllm" \
    " --triton-ref <ref> : Triton commit SHA, branch or tag (default: 'v3.5.1')" \
    " --vllm-ref <ref> : vLLM commit SHA, branch or tag (default: 'main')" \
    " -c, --copy-to <hosts> : Host(s) to copy the image to. Accepts comma or space-delimited lists after the flag." \
    " --copy-to-host : Alias for --copy-to (backwards compatibility)." \
    " --copy-parallel : Copy to all hosts in parallel instead of serially." \
    " -u, --user <user> : Username for ssh command (default: \$USER)" \
    " --no-build : Skip building, only copy image (requires --copy-to)" \
    " -h, --help : Show this help message"
  exit 1
}
# Argument parsing
#
# parse_args ARG...
# Walk the command-line arguments and record them in the script-level
# settings: IMAGE_TAG, REBUILD_DEPS, REBUILD_VLLM, TRITON_REF, VLLM_REF,
# COPY_HOSTS (via add_copy_hosts), SSH_USER, PARALLEL_COPY and NO_BUILD.
# Prints an error and exits 1 when a value-taking option has no value or when
# --copy-to is not followed by at least one host. -h/--help and unknown
# options are handed to usage.
parse_args() {
  local existing_hosts
  while [[ "$#" -gt 0 ]]; do
    case "$1" in
      -t|--tag|--triton-ref|--vllm-ref|-u|--user)
        # A value-taking flag given as the very last argument would otherwise
        # make the trailing `shift` fail, which under `set -e` ends the script
        # without any message.
        if [[ "$#" -lt 2 ]]; then
          echo "Error: $1 requires a value"
          exit 1
        fi
        case "$1" in
          -t|--tag) IMAGE_TAG="$2" ;;
          --triton-ref) TRITON_REF="$2" ;;
          --vllm-ref) VLLM_REF="$2" ;;
          -u|--user) SSH_USER="$2" ;;
        esac
        shift
        ;;
      --rebuild-deps) REBUILD_DEPS=true ;;
      --rebuild-vllm) REBUILD_VLLM=true ;;
      -c|--copy-to|--copy-to-host|--copy-to-hosts)
        shift
        if [[ "$#" -eq 0 ]]; then
          echo "Error: --copy-to requires at least one host"
          exit 1
        fi
        existing_hosts=${#COPY_HOSTS[@]}
        # Every following argument up to the next option is a host token.
        while [[ "$#" -gt 0 && "$1" != -* ]]; do
          add_copy_hosts "$1"
          shift
        done
        if [[ "${#COPY_HOSTS[@]}" -eq "$existing_hosts" ]]; then
          echo "Error: --copy-to requires at least one host"
          exit 1
        fi
        # $1 already points at the next option; skip the shift below.
        continue
        ;;
      --copy-parallel) PARALLEL_COPY=true ;;
      --no-build) NO_BUILD=true ;;
      -h|--help) usage ;;
      *) echo "Unknown parameter passed: $1"; usage ;;
    esac
    shift
  done
}
parse_args "$@"
# Validate --no-build usage: with the build skipped, the only work left is
# copying, so at least one copy host must have been given.
if [[ "$NO_BUILD" == true ]] && (( ${#COPY_HOSTS[@]} == 0 )); then
  echo "Error: --no-build requires --copy-to to be specified"
  exit 1
fi
# Build image (unless --no-build is set)
# BUILD_TIME stays 0 when the build is skipped.
BUILD_TIME=0
if [ "$NO_BUILD" != false ]; then
  echo "Skipping build (--no-build specified)"
else
  # Assemble the docker invocation as an array so every argument stays intact.
  CMD=(docker build -t "$IMAGE_TAG")
  # A build arg that changes on every run invalidates the cached layers
  # that depend on it.
  if [ "$REBUILD_DEPS" = true ]; then
    echo "Setting CACHEBUST_DEPS..."
    CMD+=(--build-arg "CACHEBUST_DEPS=$(date +%s)")
  fi
  if [ "$REBUILD_VLLM" = true ]; then
    echo "Setting CACHEBUST_VLLM..."
    CMD+=(--build-arg "CACHEBUST_VLLM=$(date +%s)")
  fi
  # Source refs for Triton and vLLM, followed by the build context.
  CMD+=(
    --build-arg "TRITON_REF=$TRITON_REF"
    --build-arg "VLLM_REF=$VLLM_REF"
    .
  )
  # Run the build and record how long it took.
  echo "Building image with command: ${CMD[*]}"
  BUILD_START=$(date +%s)
  "${CMD[@]}"
  BUILD_END=$(date +%s)
  BUILD_TIME=$((BUILD_END - BUILD_START))
fi
# Copy to host if requested
# COPY_TIME stays 0 when no host was given.
COPY_TIME=0
if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
  echo "No host specified, skipping copy."
else
  echo "Copying image '$IMAGE_TAG' to ${#COPY_HOSTS[@]} host(s): ${COPY_HOSTS[*]}"
  [ "$PARALLEL_COPY" != true ] || echo "Parallel copy enabled."
  COPY_START=$(date +%s)
  # Save the image once; every host is fed from the same archive.
  TMP_IMAGE=$(mktemp -t vllm_image.XXXXXX)
  echo "Saving image locally to $TMP_IMAGE..."
  docker save -o "$TMP_IMAGE" "$IMAGE_TAG"
  case "$PARALLEL_COPY" in
    true)
      # Start one background transfer per host, remembering each job's PID.
      PIDS=()
      for host in "${COPY_HOSTS[@]}"; do
        copy_to_host "$host" &
        PIDS+=($!)
      done
      # Wait for every job, so all transfers finish before any failure is
      # reported.
      COPY_FAILURE=0
      for pid in "${PIDS[@]}"; do
        wait "$pid" || COPY_FAILURE=1
      done
      if [ "$COPY_FAILURE" -ne 0 ]; then
        echo "One or more copies failed."
        exit 1
      fi
      ;;
    *)
      # Serial mode: one host after another.
      for host in "${COPY_HOSTS[@]}"; do
        copy_to_host "$host"
      done
      ;;
  esac
  COPY_END=$(date +%s)
  COPY_TIME=$((COPY_END - COPY_START))
  echo "Copy complete."
fi
# Calculate total time
END_TIME=$(date +%s)
TOTAL_TIME=$((END_TIME - START_TIME))

# Print a duration given in seconds as HH:MM:SS (no trailing newline).
format_hms() {
  local secs="$1"
  printf '%02d:%02d:%02d' $((secs / 3600)) $((secs % 3600 / 60)) $((secs % 60))
}

# Display timing statistics
RULE="========================================="
echo ""
echo "$RULE"
echo " TIMING STATISTICS"
echo "$RULE"
# The build and copy rows are printed only when that phase took at least
# one second.
if [ "$BUILD_TIME" -gt 0 ]; then
  echo "Docker Build: $(format_hms "$BUILD_TIME")"
fi
if [ "$COPY_TIME" -gt 0 ]; then
  echo "Image Copy: $(format_hms "$COPY_TIME")"
fi
echo "Total Time: $(format_hms "$TOTAL_TIME")"
echo "$RULE"
echo "Done."