Introduced the --copy-parallel flag to enable concurrent copying of Docker images to multiple hosts. Updated the README with usage instructions and details about the new option. Refactored the script to support both serial and parallel copy modes for improved efficiency.
207 lines
6.2 KiB
Bash
Executable File
207 lines
6.2 KiB
Bash
Executable File
#!/bin/bash
# Builds a Docker image and optionally copies it to remote hosts over SSH.

# Abort on the first failing command, and let a failure in any stage of a
# pipeline fail the whole pipeline.
set -e
set -o pipefail

# Start total time tracking
START_TIME=$(date +%s)

# Default values (each can be overridden by a command-line option)
IMAGE_TAG="vllm-node"      # tag given to the built image
REBUILD_DEPS=false         # true: pass a fresh CACHEBUST_DEPS build arg
REBUILD_VLLM=false         # true: pass a fresh CACHEBUST_VLLM build arg
COPY_HOSTS=()              # hosts that should receive the image
# $USER can be unset (cron, minimal containers); fall back to the effective
# user name so the ssh target never degrades to "@host".
SSH_USER="${USER:-$(id -un 2>/dev/null || true)}"
NO_BUILD=false             # true: skip the build and only copy
TRITON_REF="v3.5.1"        # TRITON_REF build arg
VLLM_REF="main"            # VLLM_REF build arg
TMP_IMAGE=""               # path of the saved image archive, once created
PARALLEL_COPY=false        # true: copy to all hosts concurrently
# Remove the temporary image archive, if one was created.
# Globals: TMP_IMAGE (read) - path of the archive; empty when none exists.
# Only regular files are removed; anything else is left untouched.
cleanup() {
  if [[ -n "$TMP_IMAGE" && -f "$TMP_IMAGE" ]]; then
    # "--" keeps a path that starts with "-" from being parsed as an option.
    rm -f -- "$TMP_IMAGE"
  fi
}

# Run cleanup on every exit path of the script.
trap cleanup EXIT
# Append host names to the global COPY_HOSTS array.
# Arguments: one or more tokens; each token is a single host or a
#            comma-separated list of hosts.
# All whitespace inside an entry is removed and empty entries are skipped.
add_copy_hosts() {
  local token part
  local -a parts   # scratch array; kept local so it does not leak to callers
  for token in "$@"; do
    IFS=',' read -ra parts <<< "$token"
    for part in "${parts[@]}"; do
      part="${part//[[:space:]]/}"
      if [ -n "$part" ]; then
        COPY_HOSTS+=("$part")
      fi
    done
  done
}
# Send the saved image archive to one host and load it into Docker there.
# Globals:   SSH_USER (read) - remote login name
#            TMP_IMAGE (read) - path of the local image archive
# Arguments: $1 - host to copy to
# Outputs:   progress and elapsed time on stdout, failure notice on stderr
# Returns:   0 on success, 1 if the archive cannot be read or ssh/docker fails
copy_to_host() {
  local host="$1"
  local target="${SSH_USER}@${host}"
  local started elapsed

  echo "Loading image into ${target}..."
  started=$(date +%s)

  # Redirect the archive into ssh instead of piping it from cat: an
  # unreadable archive then fails this command rather than silently
  # feeding an empty stream to "docker load".
  if ! ssh "$target" "docker load" < "$TMP_IMAGE"; then
    echo "Copy to $host failed." >&2
    return 1
  fi

  elapsed=$(( $(date +%s) - started ))
  printf "Copy to %s completed in %02d:%02d:%02d\n" "$host" \
    $((elapsed / 3600)) $((elapsed % 3600 / 60)) $((elapsed % 60))
}
# Help function
# Prints the option summary to stdout, then exits the script with status 1.
usage() {
  # Unquoted here-document: $0 expands to the script name, while \$USER is
  # escaped so the literal text "$USER" appears in the help output.
  cat <<EOF
Usage: $0 [OPTIONS]
 -t, --tag <tag> : Image tag (default: 'vllm-node')
 --rebuild-deps : Set cache bust for dependencies
 --rebuild-vllm : Set cache bust for vllm
 --triton-ref <ref> : Triton commit SHA, branch or tag (default: 'v3.5.1')
 --vllm-ref <ref> : vLLM commit SHA, branch or tag (default: 'main')
 -c, --copy-to <hosts> : Host(s) to copy the image to. Accepts comma or space-delimited lists after the flag.
 --copy-to-host : Alias for --copy-to (backwards compatibility).
 --copy-parallel : Copy to all hosts in parallel instead of serially.
 -u, --user <user> : Username for ssh command (default: \$USER)
 --no-build : Skip building, only copy image (requires --copy-to)
 -h, --help : Show this help message
EOF
  exit 1
}
# Argument parsing

# Exit with an error unless the option in $1 is followed by a value.
# Without this check a value-taking option given as the last argument made
# the trailing "shift" fail, which ends the script silently under "set -e".
# Arguments: the remaining command-line words, starting with the option.
require_option_value() {
  if [ "$#" -lt 2 ]; then
    echo "Error: $1 requires a value" >&2
    exit 1
  fi
}

# Parse the command line into the global option variables and validate the
# resulting combination.
# Globals written: IMAGE_TAG, REBUILD_DEPS, REBUILD_VLLM, TRITON_REF,
#                  VLLM_REF, SSH_USER, PARALLEL_COPY, NO_BUILD, and
#                  COPY_HOSTS (through add_copy_hosts).
# Arguments: the script's command-line arguments.
# Exits with status 1 on a missing value, a host-less --copy-to, an unknown
# option (after printing usage), or --no-build without any copy host.
parse_args() {
  local hosts_before

  while [ "$#" -gt 0 ]; do
    case "$1" in
      -t|--tag)
        require_option_value "$@"
        IMAGE_TAG="$2"
        shift
        ;;
      --rebuild-deps)
        REBUILD_DEPS=true
        ;;
      --rebuild-vllm)
        REBUILD_VLLM=true
        ;;
      --triton-ref)
        require_option_value "$@"
        TRITON_REF="$2"
        shift
        ;;
      --vllm-ref)
        require_option_value "$@"
        VLLM_REF="$2"
        shift
        ;;
      -c|--copy-to|--copy-to-host|--copy-to-hosts)
        shift
        if [ "$#" -eq 0 ]; then
          echo "Error: --copy-to requires at least one host" >&2
          exit 1
        fi
        hosts_before=${#COPY_HOSTS[@]}
        # Consume every following word up to the next option as host input.
        while [ "$#" -gt 0 ] && [[ "$1" != -* ]]; do
          add_copy_hosts "$1"
          shift
        done
        if [ "${#COPY_HOSTS[@]}" -eq "$hosts_before" ]; then
          echo "Error: --copy-to requires at least one host" >&2
          exit 1
        fi
        # The host words are already shifted away; skip the shift below.
        continue
        ;;
      -u|--user)
        require_option_value "$@"
        SSH_USER="$2"
        shift
        ;;
      --copy-parallel)
        PARALLEL_COPY=true
        ;;
      --no-build)
        NO_BUILD=true
        ;;
      -h|--help)
        usage
        ;;
      *)
        echo "Unknown parameter passed: $1" >&2
        usage
        ;;
    esac
    shift
  done

  # Validate --no-build usage
  if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
    echo "Error: --no-build requires --copy-to to be specified" >&2
    exit 1
  fi
}

parse_args "$@"
# Build image (unless --no-build is set)

# Assemble the "docker build" command in the global array CMD and run it.
# Globals read:    IMAGE_TAG, REBUILD_DEPS, REBUILD_VLLM, TRITON_REF, VLLM_REF
# Globals written: CMD
# Returns: the exit status of the docker command.
build_image() {
  CMD=("docker" "build" "-t" "$IMAGE_TAG")

  # The cache-bust build args receive the current epoch time as their value.
  if [ "$REBUILD_DEPS" = true ]; then
    echo "Setting CACHEBUST_DEPS..."
    CMD+=("--build-arg" "CACHEBUST_DEPS=$(date +%s)")
  fi

  if [ "$REBUILD_VLLM" = true ]; then
    echo "Setting CACHEBUST_VLLM..."
    CMD+=("--build-arg" "CACHEBUST_VLLM=$(date +%s)")
  fi

  # Source refs, then the build context (current directory).
  CMD+=("--build-arg" "TRITON_REF=$TRITON_REF")
  CMD+=("--build-arg" "VLLM_REF=$VLLM_REF")
  CMD+=(".")

  echo "Building image with command: ${CMD[*]}"
  "${CMD[@]}"
}

BUILD_TIME=0
if [ "$NO_BUILD" = false ]; then
  BUILD_START=$(date +%s)
  # A bare "exit" re-uses the failing build's own exit status.
  build_image || exit
  BUILD_END=$(date +%s)
  BUILD_TIME=$((BUILD_END - BUILD_START))
else
  echo "Skipping build (--no-build specified)"
fi
# Copy to host if requested

# Save the image to a temporary archive and send it to every copy host.
# Globals read:    IMAGE_TAG, COPY_HOSTS, PARALLEL_COPY
# Globals written: TMP_IMAGE (archive path; removal is left to the caller)
# Returns: 0 when every host received the image, 1 otherwise.
copy_image_to_hosts() {
  local host pid failed=0
  local -a pids=()

  if ! TMP_IMAGE=$(mktemp -t vllm_image.XXXXXX); then
    echo "Error: could not create a temporary file for the image" >&2
    return 1
  fi

  echo "Saving image locally to $TMP_IMAGE..."
  if ! docker save -o "$TMP_IMAGE" "$IMAGE_TAG"; then
    echo "Error: failed to save image '$IMAGE_TAG' to $TMP_IMAGE" >&2
    return 1
  fi

  if [ "$PARALLEL_COPY" = true ]; then
    # Start one background copy per host, then wait for each one
    # individually so that every failure is observed.
    for host in "${COPY_HOSTS[@]}"; do
      copy_to_host "$host" &
      pids+=("$!")
    done
    for pid in "${pids[@]}"; do
      if ! wait "$pid"; then
        failed=1
      fi
    done
    if [ "$failed" -ne 0 ]; then
      echo "One or more copies failed." >&2
      return 1
    fi
  else
    # Serial mode stops at the first host that fails.
    for host in "${COPY_HOSTS[@]}"; do
      copy_to_host "$host" || return 1
    done
  fi
}

COPY_TIME=0
if [ "${#COPY_HOSTS[@]}" -gt 0 ]; then
  echo "Copying image '$IMAGE_TAG' to ${#COPY_HOSTS[@]} host(s): ${COPY_HOSTS[*]}"
  if [ "$PARALLEL_COPY" = true ]; then
    echo "Parallel copy enabled."
  fi
  COPY_START=$(date +%s)

  copy_image_to_hosts || exit 1

  COPY_END=$(date +%s)
  COPY_TIME=$((COPY_END - COPY_START))
  echo "Copy complete."
else
  echo "No host specified, skipping copy."
fi
# Render a duration given in whole seconds as HH:MM:SS.
# Arguments: $1 - number of seconds (non-negative integer)
# Outputs:   the formatted duration on stdout, without a trailing newline
format_duration() {
  local seconds=$1
  printf '%02d:%02d:%02d' \
    $((seconds / 3600)) $((seconds % 3600 / 60)) $((seconds % 60))
}

# Calculate total time
END_TIME=$(date +%s)
TOTAL_TIME=$((END_TIME - START_TIME))

# Display timing statistics
echo ""
echo "========================================="
echo " TIMING STATISTICS"
echo "========================================="
# A phase is listed only when it took at least one full second; the :-0
# default keeps the comparison valid if the phase variable was never set.
if [ "${BUILD_TIME:-0}" -gt 0 ]; then
  echo "Docker Build: $(format_duration "$BUILD_TIME")"
fi
if [ "${COPY_TIME:-0}" -gt 0 ]; then
  echo "Image Copy: $(format_duration "$COPY_TIME")"
fi
echo "Total Time: $(format_duration "$TOTAL_TIME")"
echo "========================================="
echo "Done."