From 270446be27992e9c45eef165cde1f4ea49f8940d Mon Sep 17 00:00:00 2001 From: eugr Date: Fri, 5 Dec 2025 11:28:43 -0800 Subject: [PATCH] Add build-and-copy script for automated image building and deployment --- README.md | 65 +++++++++++++++++++++++++++++++++++++++++++-- build-and-copy.sh | 67 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+), 2 deletions(-) create mode 100755 build-and-copy.sh diff --git a/README.md b/README.md index a0159a2..76e594e 100644 --- a/README.md +++ b/README.md @@ -48,9 +48,70 @@ docker build \ -t vllm-node . ``` -### Copying the container to another Spark node +### Option D: Using the Build Script (Recommended) -To avoid extra network overhead, you can copy the image directly to your second Spark node via ConnectX 7 interface by using the following command: +The `build-and-copy.sh` script automates the build process and optionally copies the image to another node. This is the recommended method for building and deploying to multiple Spark nodes. 
+
+**Basic usage (build only):**
+
+```bash
+./build-and-copy.sh
+```
+
+**Build with a custom tag:**
+
+```bash
+./build-and-copy.sh --tag my-vllm-node
+```
+
+**Build and copy to another Spark node:**
+
+Using the same username as the currently logged-in user:
+
+```bash
+./build-and-copy.sh --copy-to-host 192.168.177.12
+```
+
+Using a different username:
+
+```bash
+./build-and-copy.sh --copy-to-host 192.168.177.12 --user your_username
+```
+
+**Force rebuild vLLM source only:**
+
+```bash
+./build-and-copy.sh --rebuild-vllm
+```
+
+**Force rebuild all dependencies:**
+
+```bash
+./build-and-copy.sh --rebuild-deps
+```
+
+**Combined example (rebuild vLLM and copy to another node):**
+
+```bash
+./build-and-copy.sh --rebuild-vllm --copy-to-host 192.168.177.12
+```
+
+**Available options:**
+
+| Flag | Description |
+| :--- | :--- |
+| `-t, --tag <tag>` | Image tag (default: 'vllm-node') |
+| `--rebuild-deps` | Force rebuild all dependencies (sets CACHEBUST_DEPS) |
+| `--rebuild-vllm` | Force rebuild vLLM source only (sets CACHEBUST_VLLM) |
+| `-h, --copy-to-host <host>` | Host address to copy the image to after building |
+| `-u, --user <user>` | Username for SSH connection (default: current user) |
+| `--help` | Show help message |
+
+**IMPORTANT**: When copying to another node, make sure you use the Spark IP assigned to its ConnectX 7 interface (enp1s0f1np1), and not the 10G interface (enP7s7)!
+
+### Copying the container to another Spark node (Manual Method)
+
+Alternatively, you can manually copy the image directly to your second Spark node via ConnectX 7 interface by using the following command:
 
 ```bash
 docker save vllm-node | ssh your_username@another_spark_hostname_or_ip "docker load"
diff --git a/build-and-copy.sh b/build-and-copy.sh
new file mode 100755
index 0000000..7dbbf17
--- /dev/null
+++ b/build-and-copy.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+#
+# build-and-copy.sh - build the vLLM Docker image and optionally copy it to
+# another host by piping `docker save` into `docker load` over SSH.
+#
+# The build context is ".", so run this from the directory with the Dockerfile.
+
+# -e aborts on the first failing command and -u rejects unset variables.
+# pipefail makes `docker save | ssh ...` fail when `docker save` fails;
+# without it only the exit status of ssh would be checked.
+set -euo pipefail
+
+# Default values
+IMAGE_TAG="vllm-node"
+REBUILD_DEPS=false
+REBUILD_VLLM=false
+COPY_HOST=""
+# USER may be unset (cron, minimal containers); fall back to `id -un`.
+SSH_USER="${USER:-$(id -un)}"
+
+# Print the option summary and exit.
+# Arguments: $1 - exit status (default: 1)
+usage() {
+    echo "Usage: $0 [OPTIONS]"
+    echo "  -t, --tag <tag>            : Image tag (default: 'vllm-node')"
+    echo "  --rebuild-deps             : Set cache bust for dependencies"
+    echo "  --rebuild-vllm             : Set cache bust for vllm"
+    echo "  -h, --copy-to-host <host>  : Host address to copy the image to (if not set, don't copy)"
+    echo "  -u, --user <user>          : Username for ssh command (default: \$USER)"
+    echo "  --help                     : Show this help message"
+    exit "${1:-1}"
+}
+
+# Abort with a message when an option that takes a value was given none.
+# Otherwise the extra `shift` in the parser fails under `set -e` and the
+# script exits without any explanation.
+# Arguments: $1 - option name, $2 - remaining argument count, $3 - the value
+require_value() {
+    if [[ "$2" -lt 2 || -z "$3" ]]; then
+        echo "Error: option '$1' requires a value." >&2
+        usage 1 >&2
+    fi
+}
+
+# Argument parsing
+while [[ "$#" -gt 0 ]]; do
+    case "$1" in
+        -t|--tag)
+            require_value "$1" "$#" "${2:-}"
+            IMAGE_TAG="$2"
+            shift
+            ;;
+        --rebuild-deps)
+            REBUILD_DEPS=true
+            ;;
+        --rebuild-vllm)
+            REBUILD_VLLM=true
+            ;;
+        -h|--copy-to-host)
+            require_value "$1" "$#" "${2:-}"
+            COPY_HOST="$2"
+            shift
+            ;;
+        -u|--user)
+            require_value "$1" "$#" "${2:-}"
+            SSH_USER="$2"
+            shift
+            ;;
+        --help)
+            # Asking for help is not an error, so exit with status 0.
+            usage 0
+            ;;
+        *)
+            echo "Unknown parameter passed: $1" >&2
+            usage 1 >&2
+            ;;
+    esac
+    shift
+done
+
+# Construct build command as an array so every argument stays one word
+CMD=("docker" "build" "-t" "$IMAGE_TAG")
+
+if [ "$REBUILD_DEPS" = true ]; then
+    echo "Setting CACHEBUST_DEPS..."
+    CMD+=("--build-arg" "CACHEBUST_DEPS=$(date +%s)")
+fi
+
+if [ "$REBUILD_VLLM" = true ]; then
+    echo "Setting CACHEBUST_VLLM..."
+    CMD+=("--build-arg" "CACHEBUST_VLLM=$(date +%s)")
+fi
+
+# Add build context
+CMD+=(".")
+
+# Execute build
+echo "Building image with command: ${CMD[*]}"
+"${CMD[@]}"
+
+# Copy to host if requested
+if [ -n "$COPY_HOST" ]; then
+    echo "Copying image '$IMAGE_TAG' to ${SSH_USER}@${COPY_HOST}..."
+    # Using the pipe method from README.md; pipefail catches a failed save
+    docker save "$IMAGE_TAG" | ssh "${SSH_USER}@${COPY_HOST}" "docker load"
+    echo "Copy complete."
+else
+    echo "No host specified, skipping copy."
+fi
+
+echo "Done."