diff --git a/README.md b/README.md index 06cfe09..f4fa77c 100644 --- a/README.md +++ b/README.md @@ -25,12 +25,18 @@ The Dockerfile builds from the main branch of VLLM, so depending on when you run ## CHANGELOG +### 2025-12-19 + +Updated `build-and-copy.sh` to support copying to multiple hosts (thanks @eric-humane for the contribution). +- Added `-c, --copy-to` (accepts space- or comma-separated host lists) and kept `--copy-to-host` as a backward-compatible alias. +- Added `--copy-parallel` to copy to all hosts concurrently. +- **BREAKING CHANGE**: Short `-h` argument is now used for help. Use `-c` for copy. + ### 2025-12-18 -Added `launch-cluster.sh` convenience script for basic cluster management - see details below. - -Added `-j` / `--build-jobs` argument to `build-and-copy.sh` to control build parallelism. -Added `--nccl-debug` option to specify NCCL debug level. Default is none to decrease verbosity. +- Added `launch-cluster.sh` convenience script for basic cluster management - see details below. +- Added `-j` / `--build-jobs` argument to `build-and-copy.sh` to control build parallelism. +- Added `--nccl-debug` option to specify NCCL debug level. Default is none to decrease verbosity. ### 2025-12-15 @@ -48,7 +54,7 @@ Triton is now being built from the source, alongside with its companion triton_k Added new flags to `build-and-copy.sh`: - `--triton-sha `: Specify Triton commit SHA (defaults to v3.5.1 currently) -- `--no-build`: Skip building and only copy existing image (requires `--copy-to-host`) +- `--no-build`: Skip building and only copy existing image (requires `--copy-to`) ### 2025-12-11 update @@ -86,7 +92,7 @@ Using a provided build script is recommended, but if you want to build using `do ### Using the Build Script (Recommended) -The `build-and-copy.sh` script automates the build process and optionally copies the image to another node. This is the recommended method for building and deploying to multiple Spark nodes. 
+The `build-and-copy.sh` script automates the build process and optionally copies the image to one or more nodes. This is the recommended method for building and deploying to multiple Spark nodes. **Basic usage (build only):** @@ -100,18 +106,30 @@ The `build-and-copy.sh` script automates the build process and optionally copies ./build-and-copy.sh --tag my-vllm-node ``` -**Build and copy to another Spark node:** +**Build and copy to Spark node(s):** -Using the same username as currently logged-in user: +Using the same username as currently logged-in user (single host): ```bash -./build-and-copy.sh --copy-to-host 192.168.177.12 +./build-and-copy.sh --copy-to 192.168.177.12 +``` + +Copy to multiple hosts (space- or comma-separated after the flag): + +```bash +./build-and-copy.sh --copy-to 192.168.177.12 192.168.177.13 +``` + +Copy to multiple hosts in parallel: + +```bash +./build-and-copy.sh --copy-to 192.168.177.12 192.168.177.13 --copy-parallel ``` Using a different username: ```bash -./build-and-copy.sh --copy-to-host 192.168.177.12 --user your_username +./build-and-copy.sh --copy-to 192.168.177.12 --user your_username ``` **Force rebuild vLLM source only:** @@ -129,7 +147,7 @@ Using a different username: **Combined example (rebuild vLLM and copy to another node):** ```bash -./build-and-copy.sh --rebuild-vllm --copy-to-host 192.168.177.12 +./build-and-copy.sh --rebuild-vllm --copy-to 192.168.177.12 ``` **Build with specific Triton commit:** @@ -141,7 +159,7 @@ Using a different username: **Copy existing image without rebuilding:** ```bash -./build-and-copy.sh --no-build --copy-to-host 192.168.177.12 +./build-and-copy.sh --no-build --copy-to 192.168.177.12 ``` **Available options:** @@ -153,11 +171,13 @@ Using a different username: | `--rebuild-vllm` | Force rebuild vLLM source only (sets CACHEBUST_VLLM) | | `--triton-ref ` | Triton commit SHA, branch or tag (default: 'v3.5.1') | | `--vllm-ref ` | vLLM commit SHA, branch or tag (default: 'main') | +| `-c, 
--copy-to ` | Host(s) to copy the image to after building (space- or comma-separated list after the flag). | +| `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). | +| `--copy-parallel` | Copy to all specified hosts concurrently. | | `-j, --build-jobs ` | Number of parallel build jobs (default: Dockerfile default) | -| `-h, --copy-to-host ` | Host address to copy the image to after building | | `-u, --user ` | Username for SSH connection (default: current user) | -| `--no-build` | Skip building, only copy existing image (requires `--copy-to-host`) | -| `--help` | Show help message | +| `--no-build` | Skip building, only copy existing image (requires `--copy-to`) | +| `-h, --help` | Show help message | **IMPORTANT**: When copying to another node, make sure you use the Spark IP assigned to its ConnectX 7 interface (enp1s0f1np1), and not the 10G interface (enP7s7)! @@ -430,4 +450,3 @@ Modify `--num-prompts` to benchmark concurrent requests - the command above will ### Hardware Architecture **Note:** The Dockerfile defaults to `TORCH_CUDA_ARCH_LIST=12.1a` (NVIDIA GB10). If you are using different hardware, update the `ENV` variable in the Dockerfile before building. 
- diff --git a/build-and-copy.sh b/build-and-copy.sh index 01bfd7e..ae56d58 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -8,11 +8,53 @@ START_TIME=$(date +%s) IMAGE_TAG="vllm-node" REBUILD_DEPS=false REBUILD_VLLM=false -COPY_HOST="" +COPY_HOSTS=() SSH_USER="$USER" NO_BUILD=false TRITON_REF="v3.5.1" VLLM_REF="main" +TMP_IMAGE="" +PARALLEL_COPY=false + +# Remove the temporary image tarball, if one was created; installed below as the EXIT trap. +cleanup() { + if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then + echo "Cleaning up temporary image $TMP_IMAGE" + rm -f "$TMP_IMAGE" + fi +} + +trap cleanup EXIT + +# Split each argument on commas, strip whitespace, and append the non-empty hosts to COPY_HOSTS. +add_copy_hosts() { + local token part PARTS + for token in "$@"; do + IFS=',' read -ra PARTS <<< "$token" + for part in "${PARTS[@]}"; do + part="${part//[[:space:]]/}" + if [ -n "$part" ]; then + COPY_HOSTS+=("$part") + fi + done + done +} + +# Stream the saved image ($TMP_IMAGE) into "docker load" on one host via ssh; returns 1 on failure. +copy_to_host() { + local host="$1" + echo "Loading image into ${SSH_USER}@${host}..." + local host_copy_start host_copy_end host_copy_time + host_copy_start=$(date +%s) + if ssh "${SSH_USER}@${host}" "docker load" < "$TMP_IMAGE"; then + host_copy_end=$(date +%s) + host_copy_time=$((host_copy_end - host_copy_start)) + printf "Copy to %s completed in %02d:%02d:%02d\n" "$host" $((host_copy_time/3600)) $((host_copy_time%3600/60)) $((host_copy_time%60)) + else + echo "Copy to $host failed." + return 1 + fi +} BUILD_JOBS="16" # Help function @@ -23,11 +65,13 @@ usage() { echo " --rebuild-vllm : Set cache bust for vllm" echo " --triton-ref : Triton commit SHA, branch or tag (default: 'v3.5.1')" echo " --vllm-ref : vLLM commit SHA, branch or tag (default: 'main')" + echo " -c, --copy-to : Host(s) to copy the image to. Accepts comma or space-delimited lists after the flag." + echo " --copy-to-host : Alias for --copy-to (backwards compatibility)." + echo " --copy-parallel : Copy to all hosts in parallel instead of serially." 
echo " -j, --build-jobs : Number of concurrent build jobs (default: \${BUILD_JOBS})" - echo " -h, --copy-to-host : Host address to copy the image to (if not set, don't copy)" echo " -u, --user : Username for ssh command (default: \$USER)" - echo " --no-build : Skip building, only copy image (requires --copy-to-host)" - echo " --help : Show this help message" + echo " --no-build : Skip building, only copy image (requires --copy-to)" + echo " -h, --help : Show this help message" exit 1 } @@ -39,19 +83,36 @@ while [[ "$#" -gt 0 ]]; do --rebuild-vllm) REBUILD_VLLM=true ;; --triton-ref) TRITON_REF="$2"; shift ;; --vllm-ref) VLLM_REF="$2"; shift ;; + -c|--copy-to|--copy-to-host|--copy-to-hosts) + shift + if [ "$#" -eq 0 ]; then + echo "Error: --copy-to requires at least one host" + exit 1 + fi + EXISTING_HOSTS=${#COPY_HOSTS[@]} + while [[ "$#" -gt 0 && "$1" != -* ]]; do + add_copy_hosts "$1" + shift + done + if [ "${#COPY_HOSTS[@]}" -eq "$EXISTING_HOSTS" ]; then + echo "Error: --copy-to requires at least one host" + exit 1 + fi + continue + ;; -j|--build-jobs) BUILD_JOBS="$2"; shift ;; - -h|--copy-to-host) COPY_HOST="$2"; shift ;; -u|--user) SSH_USER="$2"; shift ;; + --copy-parallel) PARALLEL_COPY=true ;; --no-build) NO_BUILD=true ;; - --help) usage ;; + -h|--help) usage ;; *) echo "Unknown parameter passed: $1"; usage ;; esac shift done # Validate --no-build usage -if [ "$NO_BUILD" = true ] && [ -z "$COPY_HOST" ]; then - echo "Error: --no-build requires --copy-to-host to be specified" +if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then + echo "Error: --no-build requires --copy-to to be specified" exit 1 fi @@ -95,11 +156,40 @@ fi # Copy to host if requested COPY_TIME=0 -if [ -n "$COPY_HOST" ]; then - echo "Copying image '$IMAGE_TAG' to ${SSH_USER}@${COPY_HOST}..." +if [ "${#COPY_HOSTS[@]}" -gt 0 ]; then + echo "Copying image '$IMAGE_TAG' to ${#COPY_HOSTS[@]} host(s): ${COPY_HOSTS[*]}" + if [ "$PARALLEL_COPY" = true ]; then + echo "Parallel copy enabled." 
+ fi COPY_START=$(date +%s) - # Using the pipe method from README.md - docker save "$IMAGE_TAG" | ssh "${SSH_USER}@${COPY_HOST}" "docker load" + + # Save the image to a temporary file once so the same tarball can be sent to every host; abort if that fails. + TMP_IMAGE=$(mktemp -t vllm_image.XXXXXX) || exit 1 + echo "Saving image locally to $TMP_IMAGE..." + docker save -o "$TMP_IMAGE" "$IMAGE_TAG" || exit 1 + + if [ "$PARALLEL_COPY" = true ]; then + PIDS=() + for host in "${COPY_HOSTS[@]}"; do + copy_to_host "$host" & + PIDS+=("$!") + done + COPY_FAILURE=0 + for pid in "${PIDS[@]}"; do + if ! wait "$pid"; then + COPY_FAILURE=1 + fi + done + if [ "$COPY_FAILURE" -ne 0 ]; then + echo "One or more copies failed." + exit 1 + fi + else + for host in "${COPY_HOSTS[@]}"; do + copy_to_host "$host" || exit 1 + done + fi + COPY_END=$(date +%s) COPY_TIME=$((COPY_END - COPY_START)) echo "Copy complete."