From 990a7b3837338cff52d37c123bced61f6747bdba Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Mon, 23 Mar 2026 15:43:18 -0700 Subject: [PATCH 01/48] Use mesh-optimized NCCL --- Dockerfile | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index ef0d766..75c16e6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,7 +39,7 @@ RUN apt update && \ libcudnn9-cuda-13 libcudnn9-dev-cuda-13 \ python3-dev python3-pip git wget \ libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \ - ccache \ + ccache devscripts debhelper fakeroot \ && rm -rf /var/lib/apt/lists/* \ && pip install uv @@ -59,14 +59,19 @@ ENV CCACHE_COMPRESS=1 ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache ENV CMAKE_CUDA_COMPILER_LAUNCHER=ccache -# Setup Workspace -WORKDIR $VLLM_BASE_DIR - # 2. Set Environment Variables ARG TORCH_CUDA_ARCH_LIST="12.1a" ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas +# Setup Workspace +WORKDIR $VLLM_BASE_DIR + +# Build NCCL with mesh support (TODO: only do it if arch is 12.1) - artifacts will be in /workspace/nccl/build/pkg/deb +RUN git clone -b dgxspark-3node-ring https://github.com/zyang-dev/nccl.git && \ + cd nccl && make -j ${BUILD_JOBS} src.build NVCC_GENCODE="-gencode=arch=compute_121,code=sm_121" && \ + make pkg.debian.build && apt install -y --no-install-recommends ./build/pkg/deb/*.deb + # ========================================================= # STAGE 2: FlashInfer Builder # ========================================================= @@ -234,13 +239,16 @@ ENV UV_SYSTEM_PYTHON=1 ENV UV_BREAK_SYSTEM_PACKAGES=1 ENV UV_LINK_MODE=copy +# Mount additional packages from base builder image # Install runtime dependencies -RUN apt update && \ +RUN --mount=type=bind,from=base,source=/workspace/vllm/nccl/build/pkg/deb,target=/workspace/nccl-pkg \ + apt update && \ apt install -y --no-install-recommends \ python3 python3-pip python3-dev vim curl git wget \ libcudnn9-cuda-13 \ libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \ libxcb1 \ + && cd /workspace/nccl-pkg && apt install -y --no-install-recommends --allow-downgrades ./*.deb \ && rm -rf /var/lib/apt/lists/* \ && pip install uv From f8c2653fd34ec18d4645bbebe70fe867dcf3d875 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Mon, 23 Mar 2026 23:20:59 -0700 Subject: [PATCH 02/48] Quick fix for NCCL dependency --- Dockerfile | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 75c16e6..a9a2485 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,7 +38,7 @@ RUN apt update && \ curl vim cmake build-essential ninja-build \ libcudnn9-cuda-13 libcudnn9-dev-cuda-13 \ python3-dev python3-pip git wget \ - libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \ + libibverbs1 libibverbs-dev rdma-core \ ccache devscripts debhelper fakeroot \ && rm -rf /var/lib/apt/lists/* \ && pip install uv @@ -70,7 +70,7 @@ WORKDIR $VLLM_BASE_DIR # Build NCCL with mesh support (TODO: only do it if arch is 12.1) - artifacts will be in /workspace/nccl/build/pkg/deb RUN git clone -b dgxspark-3node-ring https://github.com/zyang-dev/nccl.git && \ cd nccl && make -j ${BUILD_JOBS} src.build NVCC_GENCODE="-gencode=arch=compute_121,code=sm_121" && \ - make pkg.debian.build && apt install -y --no-install-recommends ./build/pkg/deb/*.deb + make pkg.debian.build && apt install -y --no-install-recommends --allow-downgrades ./build/pkg/deb/*.deb # ========================================================= # STAGE 2: 
FlashInfer Builder @@ -246,7 +246,7 @@ RUN --mount=type=bind,from=base,source=/workspace/vllm/nccl/build/pkg/deb,target apt install -y --no-install-recommends \ python3 python3-pip python3-dev vim curl git wget \ libcudnn9-cuda-13 \ - libnccl-dev libnccl2 libibverbs1 libibverbs-dev rdma-core \ + libibverbs1 libibverbs-dev rdma-core \ libxcb1 \ && cd /workspace/nccl-pkg && apt install -y --no-install-recommends --allow-downgrades ./*.deb \ && rm -rf /var/lib/apt/lists/* \ @@ -290,4 +290,8 @@ ENV PATH=$VLLM_BASE_DIR:$PATH # Final extra deps RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ - uv pip install ray[default] fastsafetensors \ No newline at end of file + uv pip install ray[default] fastsafetensors + +# Fix NCCL +RUN rm /usr/local/lib/python3.12/dist-packages/nvidia/nccl/lib/libnccl.so.2 && \ + ln -s /usr/lib/aarch64-linux-gnu/libnccl.so.2 /usr/local/lib/python3.12/dist-packages/nvidia/nccl/lib/libnccl.so.2 \ No newline at end of file From efacbd69f2e1892b00e4788f094893a17b03e350 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Wed, 25 Mar 2026 12:43:12 -0700 Subject: [PATCH 03/48] Updated Nemotron3-Super recipe --- recipes/nemotron-3-super-nvfp4.yaml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/recipes/nemotron-3-super-nvfp4.yaml b/recipes/nemotron-3-super-nvfp4.yaml index ec790c2..8e2449d 100644 --- a/recipes/nemotron-3-super-nvfp4.yaml +++ b/recipes/nemotron-3-super-nvfp4.yaml @@ -9,8 +9,12 @@ container: vllm-node cluster_only: false solo_only: false -mods: - - mods/nemotron-super +# mods: +# - mods/nemotron-super + +env: + VLLM_FLASHINFER_ALLREDUCE_BACKEND: trtllm + VLLM_ALLOW_LONG_MAX_MODEL_LEN: 1 container: vllm-node defaults: @@ -35,7 +39,8 @@ command: | --enable-auto-tool-choice \ --load-format fastsafetensors \ --tool-call-parser qwen3_coder \ - --reasoning-parser-plugin super_v3_reasoning_parser.py \ - --reasoning-parser super_v3 \ + --reasoning-parser nemotron_v3 \ + --mamba_ssm_cache_dtype float32 \ --tensor-parallel-size {tensor_parallel} \ + --attention-backend TRITON_ATTN \ --distributed-executor-backend ray \ No newline at end of file From ad2cd3373f7d0ef115fc526453494c07756b613a Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Wed, 25 Mar 2026 14:18:00 -0700 Subject: [PATCH 04/48] .env configuration support for launch-cluster.sh --- .env.example | 29 +++ launch-cluster.sh | 138 +++++++++++ run-recipe.py | 612 +++++++++++++++++++++++++++++----------------- 3 files changed, 548 insertions(+), 231 deletions(-) create mode 100644 .env.example diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..2a3acbb --- /dev/null +++ b/.env.example @@ -0,0 +1,29 @@ +# Example .env configuration file for spark-vllm-docker +# Copy this file to .env and customize for your environment + +# Cluster configuration +# CLUSTER_NODES: Comma-separated list of node IPs (first node is the head node) +CLUSTER_NODES="192.168.1.1,192.168.1.2,192.168.1.3" + +# ETH_IF: Ethernet interface name (optional, auto-detected if not specified) +ETH_IF="eth0" + +# IB_IF: InfiniBand interface name (optional, auto-detected if not specified) +IB_IF="ib0" + +# MASTER_PORT: Port for cluster coordination (default: 29501) +MASTER_PORT="29501" + +# CONTAINER_NAME: Container name (default: vllm_node) +CONTAINER_NAME="vllm_node" + +# Container environment variables +# Any variable starting with CONTAINER_ will be converted to -e flags +# Example: CONTAINER_NCCL_DEBUG=INFO becomes -e NCCL_DEBUG=INFO +CONTAINER_NCCL_DEBUG="INFO" 
+CONTAINER_HF_TOKEN="your_huggingface_token_here" +CONTAINER_NCCL_IGNORE_CPU_AFFINITY="1" + +# Additional container environment variables +# CONTAINER_MAX_JOBS="16" +# CONTAINER_CUDA_VISIBLE_DEVICES="0,1" diff --git a/launch-cluster.sh b/launch-cluster.sh index f11ab11..1f6bfb9 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -30,6 +30,7 @@ MOD_PATHS=() MOD_TYPES=() LAUNCH_SCRIPT_PATH="" SCRIPT_DIR="$(dirname "$(realpath "$0")")" +CONFIG_FILE="" # Will be set to default after argument parsing ACTIONS_ARG="" SOLO_MODE="false" @@ -67,9 +68,27 @@ usage() { echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)" echo " --pids-limit Process limit (default: 4096, only with --non-privileged)" echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)" + echo " --config Path to .env configuration file (default: .env in script directory)" echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script." echo " command Command to run (only for 'exec' action). Not compatible with --launch-script." echo "" + echo "Supported .env file variables:" + echo " CLUSTER_NODES Comma-separated list of node IPs" + echo " ETH_IF Ethernet interface name" + echo " IB_IF InfiniBand interface name" + echo " MASTER_PORT Port for cluster coordination (default: 29501)" + echo " CONTAINER_NAME Container name (default: vllm_node)" + echo " CONTAINER_* Any variable starting with CONTAINER_ becomes -e flag" + echo " Example: CONTAINER_NCCL_DEBUG=INFO -> -e NCCL_DEBUG=INFO" + echo "" + echo "Example .env file:" + echo " CLUSTER_NODES=192.168.1.1,192.168.1.2" + echo " ETH_IF=eth0" + echo " IB_IF=ib0" + echo " MASTER_PORT=29501" + echo " CONTAINER_NCCL_DEBUG=INFO" + echo " CONTAINER_HF_TOKEN=abc123" + echo "" echo "Launch Script Usage:" echo " $0 --launch-script examples/my-script.sh # Script copied to container and executed" echo " $0 --launch-script /path/to/script.sh # Uses absolute path to script" @@ -108,6 +127,7 @@ while [[ "$#" -gt 0 ]]; do --shm-size-gb) SHM_SIZE_GB="$2"; shift ;; -d) DAEMON_MODE="true" ;; -h|--help) usage ;; + --config) CONFIG_FILE="$2"; shift ;; start|stop|status) if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script." @@ -133,6 +153,108 @@ while [[ "$#" -gt 0 ]]; do shift done +# Set .env file path (use default if not specified) +if [[ -z "$CONFIG_FILE" ]]; then + CONFIG_FILE="$SCRIPT_DIR/.env" +fi + +# Load .env file if exists +if [[ -f "$CONFIG_FILE" ]]; then + echo "Loading configuration from .env file..." + + # Validate .env file syntax + if ! python3 -c " +import sys +import re + +env_file = '$CONFIG_FILE' +seen_keys = set() + +with open(env_file, 'r') as f: + for line_num, line in enumerate(f, 1): + line = line.strip() + # Skip empty lines and comments + if not line or line.startswith('#'): + continue + + # Check for key=value format + if '=' not in line: + print(f'Error: Invalid syntax at line {line_num}: missing \"=\"') + sys.exit(1) + + key = line.split('=', 1)[0].strip() + + # Validate key format (alphanumeric + underscore) + if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', key): + print(f'Error: Invalid key format at line {line_num}: {key}') + sys.exit(1) + + # Check for duplicates + if key in seen_keys: + print(f'Error: Duplicate key at line {line_num}: {key}') + sys.exit(1) + + seen_keys.add(key) + +sys.exit(0) +" 2>/dev/null; then + echo "Error: Invalid .env file syntax. 
Aborting." + exit 1 + fi + + # Load .env variables with DOTENV_ prefix + while IFS='=' read -r key value || [[ -n "$key" ]]; do + # Skip comments and empty lines + [[ "$key" =~ ^[[:space:]]*# ]] && continue + [[ -z "$key" ]] && continue + + # Remove leading/trailing whitespace from key + key=$(echo "$key" | xargs) + + # Skip if key is empty after trimming + [[ -z "$key" ]] && continue + + # Remove quotes and whitespace from value using Python for proper shlex handling + value=$(python3 -c " +import shlex +import sys +value = '''$value''' +# Strip whitespace +value = value.strip() +# Remove surrounding quotes if present +if (value.startswith('\"') and value.endswith('\"')) or (value.startswith(\"'\" ) and value.endswith(\"'\")): + value = value[1:-1] +print(value) +") + + # Export with DOTENV_ prefix + export "DOTENV_$key=$value" + done < "$CONFIG_FILE" + + echo "Loaded .env variables: $(compgen -v DOTENV_ | tr '\n' ' ')" +fi + +# Apply .env configuration (CLI args take precedence) +if [[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]]; then + NODES_ARG="$DOTENV_CLUSTER_NODES" +fi + +if [[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]]; then + ETH_IF="$DOTENV_ETH_IF" +fi + +if [[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]]; then + IB_IF="$DOTENV_IB_IF" +fi + +if [[ -z "$MASTER_PORT" || "$MASTER_PORT" == "29501" ]] && [[ -n "$DOTENV_MASTER_PORT" ]]; then + MASTER_PORT="$DOTENV_MASTER_PORT" +fi + +if [[ -z "$CONTAINER_NAME" || "$CONTAINER_NAME" == "vllm_node" ]] && [[ -n "$DOTENV_CONTAINER_NAME" ]]; then + CONTAINER_NAME="$DOTENV_CONTAINER_NAME" +fi + # Validate non-privileged mode flags if [[ "$NON_PRIVILEGED_MODE" == "true" ]]; then # Set default swap limit if not specified @@ -163,6 +285,22 @@ if [[ -n "$NCCL_DEBUG_VAL" ]]; then esac fi +# Add container environment variables from .env (CONTAINER_* pattern) +for env_var in $(compgen -v DOTENV_CONTAINER_); do + # Get the value + value="${!env_var}" + + # Extract the actual env var name (remove DOTENV_CONTAINER_ prefix) + actual_var="${env_var#DOTENV_CONTAINER_}" + + # Properly escape the value for shell using Python + escaped_value=$(python3 -c "import shlex; print(shlex.quote('$value'))") + + # Add to docker args + DOCKER_ARGS="$DOCKER_ARGS -e $actual_var=$escaped_value" + echo "Adding container env: $actual_var" +done + # Add build job parallelization environment variables if BUILD_JOBS is set if [[ -n "$BUILD_JOBS" ]]; then DOCKER_ARGS="$DOCKER_ARGS -e MAX_JOBS=$BUILD_JOBS" diff --git a/run-recipe.py b/run-recipe.py index ba4563b..c2aa072 100755 --- a/run-recipe.py +++ b/run-recipe.py @@ -105,21 +105,21 @@ LAUNCH_SCRIPT = SCRIPT_DIR / "launch-cluster.sh" BUILD_SCRIPT = SCRIPT_DIR / "build-and-copy.sh" DOWNLOAD_SCRIPT = SCRIPT_DIR / "hf-download.sh" AUTODISCOVER_SCRIPT = SCRIPT_DIR / "autodiscover.sh" -ENV_FILE = SCRIPT_DIR / ".env" +ENV_FILE = None # Will be set from CLI argument or default def load_recipe(recipe_path: Path) -> dict[str, Any]: """ Load and validate a recipe YAML file. - + This function handles recipe resolution from multiple locations and validates required fields. Recipes are the core configuration format for deployments. - + EXTENSIBILITY: - To add new required fields: Add to the 'required' list below - To add new optional fields with defaults: Add to the setdefault() calls at the end - Recipe search order: exact path -> recipes/ dir -> with .yaml -> with .yml - + RECIPE SCHEMA: name (str, required): Human-readable name for the recipe recipe_version (str, required): Schema version for compatibility checking. 
@@ -135,13 +135,13 @@ def load_recipe(recipe_path: Path) -> dict[str, Any]: build_args (list[str], optional): Extra args for build-and-copy.sh (e.g., ['-f', 'Dockerfile.mxfp4']) cluster_only (bool, optional): If True, recipe cannot run in solo mode solo_only (bool, optional): If True, recipe cannot run in cluster mode - + Args: recipe_path: Path object pointing to YAML file or just recipe name - + Returns: Validated recipe dictionary with all fields populated (defaults applied) - + Raises: SystemExit: If recipe not found or validation fails """ @@ -161,17 +161,17 @@ def load_recipe(recipe_path: Path) -> dict[str, Any]: print(f"Error: Recipe not found: {recipe_path}") print(f"Searched in: {recipe_path}, {RECIPES_DIR}") sys.exit(1) - + with open(recipe_path) as f: recipe = yaml.safe_load(f) - + # Validate required fields required = ["name", "recipe_version", "container", "command"] for field in required: if field not in recipe: print(f"Error: Recipe missing required field: {field}") sys.exit(1) - + # Set defaults for optional fields recipe.setdefault("description", "") recipe.setdefault("model", None) @@ -180,26 +180,28 @@ def load_recipe(recipe_path: Path) -> dict[str, Any]: recipe.setdefault("env", {}) recipe.setdefault("cluster_only", False) recipe.setdefault("solo_only", False) - + # Validate recipe version compatibility # EXTENSIBILITY: When adding new schema versions, update SUPPORTED_VERSIONS # and add migration/compatibility logic below SUPPORTED_VERSIONS = ["1"] recipe_ver = str(recipe["recipe_version"]) if recipe_ver not in SUPPORTED_VERSIONS: - print(f"Warning: Recipe uses schema version '{recipe_ver}', but this run-recipe.py supports: {SUPPORTED_VERSIONS}") + print( + f"Warning: Recipe uses schema version '{recipe_ver}', but this run-recipe.py supports: {SUPPORTED_VERSIONS}" + ) print("Some features may not work correctly. Consider updating run-recipe.py.") - + return recipe def list_recipes() -> None: """ List all available recipes with their metadata. - + Scans the recipes/ directory for YAML files and displays key information. Used by the --list CLI option. - + EXTENSIBILITY: - To show additional fields: Add them to the print statements in the loop - To support different output formats (e.g., JSON): Add a format parameter @@ -208,12 +210,12 @@ def list_recipes() -> None: if not RECIPES_DIR.exists(): print("No recipes directory found.") return - + recipes = sorted(RECIPES_DIR.glob("*.yaml")) if not recipes: print("No recipes found in recipes/ directory.") return - + print("Available recipes:\n") for recipe_path in recipes: try: @@ -227,7 +229,7 @@ def list_recipes() -> None: mods = recipe.get("mods", []) cluster_only = recipe.get("cluster_only", False) solo_only = recipe.get("solo_only", False) - + print(f" {recipe_path.name}") print(f" Name: {name}") if desc: @@ -252,77 +254,85 @@ def list_recipes() -> None: def check_image_exists(image: str, host: str | None = None) -> bool: """ Check if a Docker image exists locally or on a remote host. - + Used to avoid redundant builds and to verify cluster nodes have the image. - + EXTENSIBILITY: - To support other container runtimes (podman): Modify the docker command - To add image version/digest checking: Parse 'docker image inspect' JSON output - For custom SSH options: Modify the ssh command array - + Args: image: Docker image tag to check (e.g., 'vllm-node-mxfp4') host: Optional remote hostname/IP. If None, checks locally. 
- + Returns: True if image exists, False otherwise """ if host: result = subprocess.run( - ["ssh", "-o", "BatchMode=yes", "-o", "StrictHostKeyChecking=no", - host, f"docker image inspect '{image}'"], - capture_output=True + [ + "ssh", + "-o", + "BatchMode=yes", + "-o", + "StrictHostKeyChecking=no", + host, + f"docker image inspect '{image}'", + ], + capture_output=True, ) else: result = subprocess.run( - ["docker", "image", "inspect", image], - capture_output=True + ["docker", "image", "inspect", image], capture_output=True ) return result.returncode == 0 -def build_image(image: str, copy_to: list[str] | None = None, build_args: list[str] | None = None) -> bool: +def build_image( + image: str, copy_to: list[str] | None = None, build_args: list[str] | None = None +) -> bool: """ Build the container image using build-and-copy.sh. - + Delegates to the build-and-copy.sh script which handles multi-stage builds, cache optimization, and distribution to worker nodes. - + EXTENSIBILITY: - To add new build options: Add them to build_args in the recipe's build_args field - To support different Dockerfiles: Use build_args = ['-f', 'Dockerfile.custom'] - To add build-time secrets: Modify cmd array to include --secret flags - To add progress callbacks: Capture subprocess output line-by-line - + BUILD_ARGS EXAMPLES: ['-f', 'Dockerfile.mxfp4'] - Use alternate Dockerfile ['--no-cache'] - Force full rebuild ['--build-arg', 'VAR=value'] - Pass build-time variables - + Args: image: Target image tag copy_to: List of worker hostnames to copy image to after build build_args: Extra arguments passed to build-and-copy.sh - + Returns: True if build (and copy) succeeded, False otherwise """ if not BUILD_SCRIPT.exists(): print(f"Error: Build script not found: {BUILD_SCRIPT}") return False - + cmd = [str(BUILD_SCRIPT), "-t", image] if build_args: cmd.extend(build_args) if copy_to: cmd.extend(["--copy-to", ",".join(copy_to)]) - + print(f"Building image '{image}'...") if build_args: print(f"Build args: {' '.join(build_args)}") if copy_to: print(f"Will copy to: {', '.join(copy_to)}") - + result = subprocess.run(cmd) return result.returncode == 0 @@ -330,35 +340,35 @@ def build_image(image: str, copy_to: list[str] | None = None, build_args: list[s def download_model(model: str, copy_to: list[str] | None = None) -> bool: """ Download model from HuggingFace using hf-download.sh. - + Delegates to hf-download.sh which handles HF authentication, caching, and rsync to worker nodes. - + EXTENSIBILITY: - To support other model sources: Create a new download script and switch based on model URL - To add download progress: Capture subprocess output - To support private models: hf-download.sh uses HF_TOKEN env var - To add model verification: Check sha256 of downloaded files - + Args: model: HuggingFace model ID (e.g., 'Salyut1/GLM-4.7-NVFP4') copy_to: List of worker hostnames to copy model cache to - + Returns: True if download (and copy) succeeded, False otherwise """ if not DOWNLOAD_SCRIPT.exists(): print(f"Error: Download script not found: {DOWNLOAD_SCRIPT}") return False - + cmd = [str(DOWNLOAD_SCRIPT), model] if copy_to: cmd.extend(["--copy-to", ",".join(copy_to)]) - + print(f"Downloading model '{model}'...") if copy_to: print(f"Will copy to: {', '.join(copy_to)}") - + result = subprocess.run(cmd) return result.returncode == 0 @@ -366,17 +376,17 @@ def download_model(model: str, copy_to: list[str] | None = None) -> bool: def check_model_exists(model: str) -> bool: """ Check if a model exists in the HuggingFace cache. 
- + Checks the standard HF cache location for completed downloads. - + EXTENSIBILITY: - To support custom cache locations: Add HF_HOME env var support - To verify model integrity: Check for complete snapshot with config.json - To support other model sources: Add URL/path prefix detection - + Args: model: HuggingFace model ID (e.g., 'org/model-name') - + Returns: True if model appears to be fully downloaded, False otherwise """ @@ -384,7 +394,7 @@ def check_model_exists(model: str) -> bool: # e.g., "Salyut1/GLM-4.7-NVFP4" -> "models--Salyut1--GLM-4.7-NVFP4" cache_name = f"models--{model.replace('/', '--')}" cache_path = Path.home() / ".cache" / "huggingface" / "hub" / cache_name - + if cache_path.exists(): # Check for snapshots directory which indicates complete download snapshots = cache_path / "snapshots" @@ -393,19 +403,25 @@ def check_model_exists(model: str) -> bool: return False -def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is_solo: bool = False, extra_args: list[str] | None = None, no_ray: bool = False) -> str: +def generate_launch_script( + recipe: dict[str, Any], + overrides: dict[str, Any], + is_solo: bool = False, + extra_args: list[str] | None = None, + no_ray: bool = False, +) -> str: """ Generate a bash launch script from the recipe. - + Creates a self-contained bash script that runs inside the container. Handles template substitution, environment variables, and solo mode adjustments. - + EXTENSIBILITY: - To add new template variables: Add them to recipe['defaults'] or CLI overrides - To add pre/post hooks: Add 'pre_command'/'post_command' fields to recipe schema - To add conditional logic: Use Jinja2 templating instead of str.format() - To support GPU selection: Add CUDA_VISIBLE_DEVICES to env handling - + TEMPLATE VARIABLES (use {variable_name} in recipe command): port: API server port (default from recipe) host: API server bind address @@ -413,42 +429,42 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is gpu_memory_utilization: Fraction of GPU memory to use max_model_len: Maximum sequence length (custom variables can be added via recipe defaults) - + SOLO MODE BEHAVIOR: - Removes '--distributed-executor-backend ray' lines - Typically sets tensor_parallel=1 (handled by caller) - + EXTRA ARGS: - Appended verbatim to the end of the vLLM command - Allows passing any vLLM argument not covered by template variables - vLLM uses "last wins" semantics for duplicate arguments - + Args: recipe: Loaded recipe dictionary overrides: CLI-provided parameter overrides (take precedence over defaults) is_solo: If True, strip distributed executor configuration extra_args: Additional arguments to append to vLLM command (after --) - + Returns: Complete bash script content as string - + Raises: SystemExit: If required template variables are missing """ # Merge defaults with overrides params = {**recipe.get("defaults", {}), **overrides} - + # Build the script lines = ["#!/bin/bash", f"# Generated from recipe: {recipe['name']}", ""] - + # Add environment variables env_vars = recipe.get("env", {}) if env_vars: lines.append("# Environment variables") for key, value in env_vars.items(): - lines.append(f"export {key}=\"{value}\"") + lines.append(f'export {key}="{value}"') lines.append("") - + # Format the command with parameters command = recipe["command"] try: @@ -457,49 +473,47 @@ def generate_launch_script(recipe: dict[str, Any], overrides: dict[str, Any], is print(f"Error: Missing parameter in recipe command: {e}") print(f"Available 
parameters: {list(params.keys())}") sys.exit(1) - + # In solo or no-ray mode, remove --distributed-executor-backend # (not needed for solo; no-ray uses PyTorch distributed instead) if is_solo or no_ray: import re + # Remove just the flag and its value, not the whole line - command = re.sub(r'--distributed-executor-backend\s+\S+', '', command) + command = re.sub(r"--distributed-executor-backend\s+\S+", "", command) # Remove lines that are now empty or just a backslash continuation - lines_list = command.split('\n') - filtered_lines = [ - line for line in lines_list - if line.strip() not in ('', '\\') - ] - command = '\n'.join(filtered_lines) + lines_list = command.split("\n") + filtered_lines = [line for line in lines_list if line.strip() not in ("", "\\")] + command = "\n".join(filtered_lines) # Remove trailing backslash if present command = command.rstrip() - if command.endswith('\\'): - command = command.rstrip('\\\n').rstrip() - + if command.endswith("\\"): + command = command.rstrip("\\\n").rstrip() + # Append extra args if provided (after --) if extra_args: # Join extra args and append to command - extra_args_str = ' '.join(shlex.quote(a) for a in extra_args) - command = command + ' ' + extra_args_str - + extra_args_str = " ".join(shlex.quote(a) for a in extra_args) + command = command + " " + extra_args_str + lines.append("# Run the model") lines.append(command.strip()) lines.append("") - + return "\n".join(lines) def parse_nodes(nodes_arg: str | None) -> list[str]: """ Parse comma-separated node list. - + Simple utility to split node specifications. The first node is always treated as the head node for cluster deployments. - + Args: nodes_arg: Comma-separated string like '192.168.1.1,192.168.1.2' - + Returns: List of stripped node identifiers, empty list if input is None/empty """ @@ -511,13 +525,13 @@ def parse_nodes(nodes_arg: str | None) -> list[str]: def get_worker_nodes(nodes: list[str]) -> list[str]: """ Get worker nodes (all nodes except the first/head node). - + In a Ray cluster, the first node runs the head process. Workers are all subsequent nodes that join the cluster. - + Args: nodes: Full list of nodes (head first, then workers) - + Returns: List of worker nodes (excluding head), empty if single node """ @@ -529,20 +543,20 @@ def get_worker_nodes(nodes: list[str]) -> list[str]: def load_env_file() -> dict[str, str]: """ Load environment variables from .env file. - + Reads the .env file created by --discover for persistent cluster configuration. - + EXTENSIBILITY: - To add new persistent settings: Just add them to save_env_file() - To support multiple .env files: Add a --env-file CLI argument - To add validation: Check for required keys after loading - + SUPPORTED KEYS (set by --discover): CLUSTER_NODES: Comma-separated list of node IPs LOCAL_IP: This machine's IP address ETH_IF: Ethernet interface name IB_IF: InfiniBand interface name (if available) - + Returns: Dictionary of key=value pairs from .env file """ @@ -562,15 +576,15 @@ def load_env_file() -> dict[str, str]: def save_env_file(env: dict[str, str]) -> None: """ Save environment variables to .env file. - + Persists cluster configuration discovered by autodiscover.sh. Values are properly quoted if they contain spaces or commas. 
- + EXTENSIBILITY: - To add new persistent settings: Just add them to the env dict before calling - To add timestamps/metadata: Add comment lines to the output - To support append mode: Read existing, merge, then write - + Args: env: Dictionary of key=value pairs to save """ @@ -582,42 +596,42 @@ def save_env_file(env: dict[str, str]) -> None: else: lines.append(f"{key}={value}") lines.append("") - + with open(ENV_FILE, "w") as f: f.write("\n".join(lines)) - + print(f"Saved to {ENV_FILE}") def run_autodiscover() -> dict[str, str] | None: """ Run autodiscover.sh and return discovered configuration. - + Executes the autodiscover.sh script to detect cluster topology, then presents an interactive node selection menu. - + EXTENSIBILITY: - To add new discovery methods: Extend autodiscover.sh or add Python detection here - To add GPU detection: Add nvidia-smi parsing to discovered env - To skip interactive selection: Add a --non-interactive flag - To add node health checks: Ping/SSH test each discovered node - + DISCOVERED VARIABLES: CLUSTER_NODES: Comma-separated list of node IPs (user-selected) LOCAL_IP: This machine's IP address ETH_IF: Ethernet interface name (e.g., 'eth0') IB_IF: InfiniBand interface name (e.g., 'ibp12s0') if available - + Returns: Dictionary with discovered configuration, or None if discovery failed """ if not AUTODISCOVER_SCRIPT.exists(): print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}") return None - + print("Running autodiscover...") print() - + # Run autodiscover in a subshell and capture the variables # We source the script and print the variables we care about script = f""" @@ -630,13 +644,9 @@ def run_autodiscover() -> dict[str, str] | None: echo "ETH_IF=$ETH_IF" echo "IB_IF=$IB_IF" """ - - result = subprocess.run( - ["bash", "-c", script], - capture_output=True, - text=True - ) - + + result = subprocess.run(["bash", "-c", script], capture_output=True, text=True) + if result.returncode != 0: print("Autodiscover output:") print(result.stdout) @@ -644,33 +654,36 @@ def run_autodiscover() -> dict[str, str] | None: print(result.stderr) print("Error: Autodiscover failed") return None - + # Print the autodiscover output (excluding the final variable lines) output_lines = result.stdout.strip().split("\n") env = {} for line in output_lines: - if "=" in line and any(line.startswith(k) for k in ["CLUSTER_NODES=", "LOCAL_IP=", "ETH_IF=", "IB_IF="]): + if "=" in line and any( + line.startswith(k) + for k in ["CLUSTER_NODES=", "LOCAL_IP=", "ETH_IF=", "IB_IF="] + ): key, _, value = line.partition("=") env[key] = value else: print(line) - + print() - + # Interactive node selection if env.get("CLUSTER_NODES"): all_nodes = [n.strip() for n in env["CLUSTER_NODES"].split(",") if n.strip()] local_ip = env.get("LOCAL_IP", "") - + if len(all_nodes) > 1: print("Select which nodes to include in the cluster:") print() - + selected_nodes = [] for node in all_nodes: is_local = node == local_ip label = f"{node} (this machine)" if is_local else node - + # Default to yes for all nodes while True: response = input(f" Include {label}? [Y/n]: ").strip().lower() @@ -681,47 +694,49 @@ def run_autodiscover() -> dict[str, str] | None: break else: print(" Please enter 'y' or 'n'") - + print() - + if not selected_nodes: print("No nodes selected. 
Aborting.") return None - + if len(selected_nodes) == 1: print(f"Only one node selected: {selected_nodes[0]}") print("This will run in solo mode (single node).") else: - print(f"Selected {len(selected_nodes)} nodes: {', '.join(selected_nodes)}") - + print( + f"Selected {len(selected_nodes)} nodes: {', '.join(selected_nodes)}" + ) + env["CLUSTER_NODES"] = ",".join(selected_nodes) print() - + return env def main(): """ Main entry point for the recipe runner. - + Orchestrates the full deployment pipeline: 1. Parse CLI arguments and load recipe 2. Resolve cluster nodes (CLI -> .env -> autodiscover) 3. Build phase: Build container if missing, copy to workers - 4. Download phase: Download model if missing, copy to workers + 4. Download phase: Download model if missing, copy to workers 5. Run phase: Generate launch script and execute via launch-cluster.sh - + EXTENSIBILITY: - To add new CLI options: Add to the appropriate argument group - To add new phases: Insert between existing phases with similar pattern - To add pre/post hooks: Add hook execution before/after subprocess calls - To add logging: Replace print() with logging module calls - To add config file support: Load defaults from ~/.config/vllm-recipes.yaml - + EXIT CODES: 0: Success 1: Error (recipe not found, build failed, validation error, etc.) - + Returns: Exit code for sys.exit() """ @@ -757,124 +772,229 @@ Examples: # Show current .env configuration %(prog)s --show-env - """ + """, ) - + parser.add_argument( "recipe", nargs="?", - help="Path to recipe YAML file (or just the name without .yaml)" + help="Path to recipe YAML file (or just the name without .yaml)", ) parser.add_argument( - "--list", "-l", - action="store_true", - help="List available recipes" + "--list", "-l", action="store_true", help="List available recipes" ) - + # Setup options setup_group = parser.add_argument_group("Setup options") setup_group.add_argument( "--setup", action="store_true", - help="Full setup: build container (if missing) + download model (if missing) + run" + help="Full setup: build container (if missing) + download model (if missing) + run", ) setup_group.add_argument( "--build-only", action="store_true", - help="Only build/copy the container image, don't run" + help="Only build/copy the container image, don't run", ) setup_group.add_argument( "--download-only", action="store_true", - help="Only download/copy the model, don't run" + help="Only download/copy the model, don't run", ) setup_group.add_argument( - "--force-build", - action="store_true", - help="Force rebuild even if image exists" + "--force-build", action="store_true", help="Force rebuild even if image exists" ) setup_group.add_argument( "--force-download", action="store_true", - help="Force re-download even if model exists" + help="Force re-download even if model exists", ) - + parser.add_argument( "--dry-run", action="store_true", - help="Show what would be executed without running" + help="Show what would be executed without running", ) - + # Override options override_group = parser.add_argument_group("Recipe overrides") override_group.add_argument("--port", type=int, help="Override port") override_group.add_argument("--host", help="Override host") - override_group.add_argument("--tensor-parallel", "--tp", type=int, dest="tensor_parallel", help="Override tensor parallelism") - override_group.add_argument("--gpu-memory-utilization", "--gpu-mem", type=float, dest="gpu_memory_utilization", help="Override GPU memory utilization") - override_group.add_argument("--max-model-len", 
type=int, dest="max_model_len", help="Override max model length") - + override_group.add_argument( + "--tensor-parallel", + "--tp", + type=int, + dest="tensor_parallel", + help="Override tensor parallelism", + ) + override_group.add_argument( + "--gpu-memory-utilization", + "--gpu-mem", + type=float, + dest="gpu_memory_utilization", + help="Override GPU memory utilization", + ) + override_group.add_argument( + "--max-model-len", + type=int, + dest="max_model_len", + help="Override max model length", + ) + # Launch options (passed to launch-cluster.sh) - launch_group = parser.add_argument_group("Launch options (passed to launch-cluster.sh)") - launch_group.add_argument("--solo", action="store_true", help="Run in solo mode (single node, no Ray)") - launch_group.add_argument("-n", "--nodes", help="Comma-separated list of node IPs (first is head node)") - launch_group.add_argument("-d", "--daemon", action="store_true", help="Run in daemon mode") - launch_group.add_argument("-t", "--container", dest="container_override", help="Override container image from recipe") - launch_group.add_argument("--nccl-debug", choices=["VERSION", "WARN", "INFO", "TRACE"], help="NCCL debug level") - launch_group.add_argument("-e", "--env", action="append", dest="env_vars", default=[], metavar="VAR=VALUE", help="Environment variable to pass to container (e.g. -e HF_TOKEN=xxx). Can be used multiple times.") + launch_group = parser.add_argument_group( + "Launch options (passed to launch-cluster.sh)" + ) + launch_group.add_argument( + "--solo", action="store_true", help="Run in solo mode (single node, no Ray)" + ) + launch_group.add_argument( + "-n", "--nodes", help="Comma-separated list of node IPs (first is head node)" + ) + launch_group.add_argument( + "-d", "--daemon", action="store_true", help="Run in daemon mode" + ) + launch_group.add_argument( + "-t", + "--container", + dest="container_override", + help="Override container image from recipe", + ) + launch_group.add_argument( + "--nccl-debug", + choices=["VERSION", "WARN", "INFO", "TRACE"], + help="NCCL debug level", + ) + launch_group.add_argument( + "-e", + "--env", + action="append", + dest="env_vars", + default=[], + metavar="VAR=VALUE", + help="Environment variable to pass to container (e.g. -e HF_TOKEN=xxx). 
Can be used multiple times.", + ) launch_group.add_argument( "--no-ray", action="store_true", dest="no_ray", - help="No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)" + help="No-Ray mode: run multi-node vLLM without Ray (uses PyTorch distributed backend)", + ) + launch_group.add_argument( + "--master-port", + "--head-port", + type=int, + dest="master_port", + help="Port for cluster coordination (Ray head port or PyTorch distributed master port, default: 29501)", + ) + launch_group.add_argument( + "--name", + dest="container_name", + help="Override container name (default: vllm_node)", + ) + launch_group.add_argument( + "--eth-if", + dest="eth_if", + help="Ethernet interface (overrides .env and auto-detection)", + ) + launch_group.add_argument( + "--ib-if", + dest="ib_if", + help="InfiniBand interface (overrides .env and auto-detection)", + ) + launch_group.add_argument( + "-j", + dest="build_jobs", + type=int, + metavar="N", + help="Number of parallel build jobs inside container", + ) + launch_group.add_argument( + "--no-cache-dirs", + action="store_true", + dest="no_cache_dirs", + help="Do not mount ~/.cache/vllm, ~/.cache/flashinfer, ~/.triton", + ) + launch_group.add_argument( + "--non-privileged", + action="store_true", + dest="non_privileged", + help="Run in non-privileged mode (removes --privileged and --ipc=host)", + ) + launch_group.add_argument( + "--mem-limit-gb", + type=int, + dest="mem_limit_gb", + help="Memory limit in GB (only with --non-privileged)", + ) + launch_group.add_argument( + "--mem-swap-limit-gb", + type=int, + dest="mem_swap_limit_gb", + help="Memory+swap limit in GB (only with --non-privileged)", + ) + launch_group.add_argument( + "--pids-limit", + type=int, + dest="pids_limit", + help="Process limit (only with --non-privileged, default: 4096)", + ) + launch_group.add_argument( + "--shm-size-gb", + type=int, + dest="shm_size_gb", + help="Shared memory size in GB (only with --non-privileged, default: 64)", + ) + + # Config file option + parser.add_argument( + "--config", + dest="config_file", + metavar="FILE", + help="Path to .env configuration file (default: .env in script directory)", ) - launch_group.add_argument("--master-port", "--head-port", type=int, dest="master_port", help="Port for cluster coordination (Ray head port or PyTorch distributed master port, default: 29501)") - launch_group.add_argument("--name", dest="container_name", help="Override container name (default: vllm_node)") - launch_group.add_argument("--eth-if", dest="eth_if", help="Ethernet interface (overrides .env and auto-detection)") - launch_group.add_argument("--ib-if", dest="ib_if", help="InfiniBand interface (overrides .env and auto-detection)") - launch_group.add_argument("-j", dest="build_jobs", type=int, metavar="N", help="Number of parallel build jobs inside container") - launch_group.add_argument("--no-cache-dirs", action="store_true", dest="no_cache_dirs", help="Do not mount ~/.cache/vllm, ~/.cache/flashinfer, ~/.triton") - launch_group.add_argument("--non-privileged", action="store_true", dest="non_privileged", help="Run in non-privileged mode (removes --privileged and --ipc=host)") - launch_group.add_argument("--mem-limit-gb", type=int, dest="mem_limit_gb", help="Memory limit in GB (only with --non-privileged)") - launch_group.add_argument("--mem-swap-limit-gb", type=int, dest="mem_swap_limit_gb", help="Memory+swap limit in GB (only with --non-privileged)") - launch_group.add_argument("--pids-limit", type=int, dest="pids_limit", help="Process limit 
(only with --non-privileged, default: 4096)") - launch_group.add_argument("--shm-size-gb", type=int, dest="shm_size_gb", help="Shared memory size in GB (only with --non-privileged, default: 64)") # Cluster discovery options discover_group = parser.add_argument_group("Cluster discovery") discover_group.add_argument( "--discover", action="store_true", - help="Auto-detect cluster nodes and save to .env file" + help="Auto-detect cluster nodes and save to .env file", ) discover_group.add_argument( - "--show-env", - action="store_true", - help="Show current .env configuration" + "--show-env", action="store_true", help="Show current .env configuration" ) - + # Use parse_known_args to allow extra vLLM arguments after -- args, extra_args = parser.parse_known_args() - + + # Set .env file path (use default if not specified) + global ENV_FILE + if args.config_file: + ENV_FILE = Path(args.config_file).resolve() + else: + ENV_FILE = SCRIPT_DIR / ".env" + # Filter out the -- separator if present - if extra_args and extra_args[0] == '--': + if extra_args and extra_args[0] == "--": extra_args = extra_args[1:] - + # Handle --discover (can be run with or without a recipe) if args.discover: env = run_autodiscover() if env is None: return 1 - + print("Discovered configuration:") for key, value in sorted(env.items()): print(f" {key}={value}") print() - + save_env_file(env) - + if not args.recipe: return 0 - + # Handle --show-env if args.show_env: env = load_env_file() @@ -885,39 +1005,39 @@ Examples: else: print(f"No .env file found at {ENV_FILE}") print("Run with --discover to auto-detect cluster nodes.") - + if not args.recipe: return 0 print() - + if args.list: list_recipes() return 0 - + if not args.recipe: parser.print_help() return 1 - + # Load recipe recipe_path = Path(args.recipe) recipe = load_recipe(recipe_path) - + print(f"Recipe: {recipe['name']}") if recipe.get("description"): print(f" {recipe['description']}") print() - + # Determine container image container = args.container_override or recipe["container"] model = recipe.get("model") build_args = recipe.get("build_args", []) - + # Parse nodes - check command line first, then .env file, then autodiscover nodes = parse_nodes(args.nodes) if not args.solo else [] nodes_from_env = False eth_if = None ib_if = None - + if not args.solo: # Try to load from .env file env = load_env_file() @@ -932,16 +1052,22 @@ Examples: # No nodes specified and no .env - run autodiscover print("No cluster nodes configured. Running autodiscover...") print() - + discovered_env = run_autodiscover() if discovered_env and discovered_env.get("CLUSTER_NODES"): nodes = parse_nodes(discovered_env["CLUSTER_NODES"]) nodes_from_env = True - + if nodes: # Ask if user wants to save to .env print() - response = input("Save this configuration to .env for future use? [Y/n]: ").strip().lower() + response = ( + input( + "Save this configuration to .env for future use? [Y/n]: " + ) + .strip() + .lower() + ) if response in ("", "y", "yes"): save_env_file(discovered_env) print() @@ -954,7 +1080,7 @@ Examples: eth_if = env["ETH_IF"] if not ib_if and env.get("IB_IF"): ib_if = env["IB_IF"] - + worker_nodes = get_worker_nodes(nodes) if nodes else [] is_cluster = len(nodes) > 1 @@ -962,9 +1088,11 @@ Examples: cluster_only = recipe.get("cluster_only", False) solo_only = recipe.get("solo_only", False) is_solo = args.solo or not is_cluster - - if getattr(args, 'no_ray', False) and is_solo: - print("Error: --no-ray is incompatible with --solo. 
Solo mode already runs without Ray.") + + if getattr(args, "no_ray", False) and is_solo: + print( + "Error: --no-ray is incompatible with --solo. Solo mode already runs without Ray." + ) return 1 if cluster_only and is_solo: @@ -972,7 +1100,9 @@ Examples: print(f"This model is too large to run on a single node.") print() print("Options:") - print(f" 1. Specify nodes directly: {sys.argv[0]} {args.recipe} -n node1,node2") + print( + f" 1. Specify nodes directly: {sys.argv[0]} {args.recipe} -n node1,node2" + ) print(f" 2. Auto-discover and save: {sys.argv[0]} --discover") print(f" Then run: {sys.argv[0]} {args.recipe}") return 1 @@ -984,10 +1114,10 @@ Examples: print(f" 1. Run solo: {sys.argv[0]} {args.recipe} --solo") print(f" 2. Remove nodes from .env: {sys.argv[0]} --show-env") return 1 - + # Determine copy targets for cluster deployments copy_targets = worker_nodes if is_cluster else None - + if args.dry_run: print("=== Dry Run ===") print(f"Container: {container}") @@ -1007,9 +1137,13 @@ Examples: print(f" Workers: {', '.join(worker_nodes)}") print(f"Solo mode: {is_solo}") if eth_if: - print(f"Ethernet interface: {eth_if}{' (from .env)' if not args.eth_if else ''}") + print( + f"Ethernet interface: {eth_if}{' (from .env)' if not args.eth_if else ''}" + ) if ib_if: - print(f"InfiniBand interface: {ib_if}{' (from .env)' if not args.ib_if else ''}") + print( + f"InfiniBand interface: {ib_if}{' (from .env)' if not args.ib_if else ''}" + ) if args.container_name: print(f"Container name: {args.container_name}") if args.non_privileged: @@ -1031,7 +1165,7 @@ Examples: print() else: image_exists = check_image_exists(container) - + if args.force_build or not image_exists: print("=== Building Container ===") if not build_image(container, copy_targets, build_args): @@ -1053,11 +1187,11 @@ Examples: print("Error: Failed to build/copy container") return 1 print() - + if args.build_only: print("Build complete." if not args.dry_run else "") return 0 - + # --- Download Phase --- if model and (args.download_only or args.setup or args.force_download): if args.dry_run: @@ -1071,7 +1205,7 @@ Examples: print() else: model_exists = check_model_exists(model) - + if args.force_download or not model_exists: print("=== Downloading Model ===") if not download_model(model, copy_targets): @@ -1081,15 +1215,15 @@ Examples: else: print(f"Model '{model}' already exists in cache.") print() - + if args.download_only: print("Download complete." if not args.dry_run else "") return 0 - + # --- Run Phase --- if args.build_only or args.download_only: return 0 - + # Check if image exists (if not using --setup) if not args.dry_run and not args.setup and not check_image_exists(container): print(f"Container image '{container}' not found locally.") @@ -1099,48 +1233,64 @@ Examples: print(f" 2. Build manually: ./build-and-copy.sh -t {container}") print() response = input("Build now? 
[y/N] ").strip().lower() - if response == 'y': + if response == "y": if not build_image(container, copy_targets, build_args): print("Error: Failed to build image") return 1 else: print("Aborting.") return 1 - + # Build overrides from CLI args overrides = {} - for key in ["port", "host", "tensor_parallel", "gpu_memory_utilization", "max_model_len"]: + for key in [ + "port", + "host", + "tensor_parallel", + "gpu_memory_utilization", + "max_model_len", + ]: value = getattr(args, key, None) if value is not None: overrides[key] = value - + # In solo mode, default tensor_parallel to 1 (unless user explicitly set --tp) if is_solo and "tensor_parallel" not in overrides: overrides["tensor_parallel"] = 1 - + # Check for duplicate arguments (warn if extra_args duplicate CLI overrides) if extra_args: # Map vLLM flags to our override keys flag_to_override = { - '--port': 'port', - '--host': 'host', - '--tensor-parallel-size': 'tensor_parallel', - '-tp': 'tensor_parallel', - '--gpu-memory-utilization': 'gpu_memory_utilization', - '--max-model-len': 'max_model_len', + "--port": "port", + "--host": "host", + "--tensor-parallel-size": "tensor_parallel", + "-tp": "tensor_parallel", + "--gpu-memory-utilization": "gpu_memory_utilization", + "--max-model-len": "max_model_len", } for i, arg in enumerate(extra_args): # Check both exact flag and =value syntax - flag = arg.split('=')[0] if '=' in arg else arg + flag = arg.split("=")[0] if "=" in arg else arg if flag in flag_to_override: override_key = flag_to_override[flag] if override_key in overrides: - print(f"Warning: '{arg}' in extra args duplicates --{override_key.replace('_', '-')} override") - print(f" vLLM uses last value; extra args appear after template substitution") - + print( + f"Warning: '{arg}' in extra args duplicates --{override_key.replace('_', '-')} override" + ) + print( + f" vLLM uses last value; extra args appear after template substitution" + ) + # Generate launch script - script_content = generate_launch_script(recipe, overrides, is_solo=is_solo, extra_args=extra_args, no_ray=getattr(args, 'no_ray', False)) - + script_content = generate_launch_script( + recipe, + overrides, + is_solo=is_solo, + extra_args=extra_args, + no_ray=getattr(args, "no_ray", False), + ) + if args.dry_run: print("=== Generated Launch Script ===") print(script_content) @@ -1158,7 +1308,7 @@ Examples: cmd_parts.append("--solo") if args.daemon: cmd_parts.append("-d") - if getattr(args, 'no_ray', False): + if getattr(args, "no_ray", False): cmd_parts.append("--no-ray") if nodes: cmd_parts.extend(["-n", ",".join(nodes)]) @@ -1193,42 +1343,42 @@ Examples: print() print("3. 
The launch script runs inside the container") return 0 - + # Write temporary launch script - with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f: + with tempfile.NamedTemporaryFile(mode="w", suffix=".sh", delete=False) as f: f.write(script_content) temp_script = f.name - + try: os.chmod(temp_script, 0o755) - + # Build launch-cluster.sh command cmd = [str(LAUNCH_SCRIPT), "-t", container] - + # Add mods for mod in recipe.get("mods", []): mod_path = SCRIPT_DIR / mod if not mod_path.exists(): print(f"Warning: Mod path not found: {mod_path}") cmd.extend(["--apply-mod", str(mod_path)]) - + # Add launch options if args.solo: cmd.append("--solo") elif not is_cluster: # Auto-enable solo mode if no cluster nodes specified cmd.append("--solo") - + if args.daemon: cmd.append("-d") - if getattr(args, 'no_ray', False): + if getattr(args, "no_ray", False): cmd.append("--no-ray") # Pass nodes to launch-cluster.sh (from command line, .env, or autodiscover) if nodes: cmd.extend(["-n", ",".join(nodes)]) - + if args.nccl_debug: cmd.extend(["--nccl-debug", args.nccl_debug]) @@ -1260,7 +1410,7 @@ Examples: # Add launch script cmd.extend(["--launch-script", temp_script]) - + print(f"=== Launching ===") print(f"Container: {container}") if recipe.get("mods"): @@ -1270,11 +1420,11 @@ Examples: else: print("Mode: Solo") print() - + # Execute result = subprocess.run(cmd) return result.returncode - + finally: # Cleanup temp script try: From 07fac71dac2a60eb87429a416ed2c54f92084526 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Wed, 25 Mar 2026 14:42:01 -0700 Subject: [PATCH 05/48] Fixed bug with CONTAINER_NAME variable --- .env.example | 3 ++- launch-cluster.sh | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.env.example b/.env.example index 2a3acbb..6f3f05e 100644 --- a/.env.example +++ b/.env.example @@ -15,10 +15,11 @@ IB_IF="ib0" MASTER_PORT="29501" # CONTAINER_NAME: Container name (default: vllm_node) +# Note: This is a configuration variable, NOT passed as env var to container CONTAINER_NAME="vllm_node" # Container environment variables -# Any variable starting with CONTAINER_ will be converted to -e flags +# Any variable starting with CONTAINER_ (except CONTAINER_NAME) will be converted to -e flags # Example: CONTAINER_NCCL_DEBUG=INFO becomes -e NCCL_DEBUG=INFO CONTAINER_NCCL_DEBUG="INFO" CONTAINER_HF_TOKEN="your_huggingface_token_here" diff --git a/launch-cluster.sh b/launch-cluster.sh index 1f6bfb9..40500b5 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -78,14 +78,15 @@ usage() { echo " IB_IF InfiniBand interface name" echo " MASTER_PORT Port for cluster coordination (default: 29501)" echo " CONTAINER_NAME Container name (default: vllm_node)" - echo " CONTAINER_* Any variable starting with CONTAINER_ becomes -e flag" - echo " Example: CONTAINER_NCCL_DEBUG=INFO -> -e NCCL_DEBUG=INFO" + echo " CONTAINER_* Any variable starting with CONTAINER_ (except CONTAINER_NAME)" + echo " becomes -e flag. 
Example: CONTAINER_NCCL_DEBUG=INFO -> -e NCCL_DEBUG=INFO" echo "" echo "Example .env file:" echo " CLUSTER_NODES=192.168.1.1,192.168.1.2" echo " ETH_IF=eth0" echo " IB_IF=ib0" echo " MASTER_PORT=29501" + echo " CONTAINER_NAME=vllm_node" echo " CONTAINER_NCCL_DEBUG=INFO" echo " CONTAINER_HF_TOKEN=abc123" echo "" @@ -286,7 +287,11 @@ if [[ -n "$NCCL_DEBUG_VAL" ]]; then fi # Add container environment variables from .env (CONTAINER_* pattern) +# Excludes CONTAINER_NAME which is a configuration variable, not an env var for env_var in $(compgen -v DOTENV_CONTAINER_); do + # Skip CONTAINER_NAME as it's a configuration variable + [[ "$env_var" == "DOTENV_CONTAINER_NAME" ]] && continue + # Get the value value="${!env_var}" From 1755dfd114ad620680fbc8971c9930171be504de Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Wed, 25 Mar 2026 15:16:06 -0700 Subject: [PATCH 06/48] Added LOCAL_IP support --- .env.example | 10 +++++++--- launch-cluster.sh | 11 ++++++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/.env.example b/.env.example index 6f3f05e..bc6f2dc 100644 --- a/.env.example +++ b/.env.example @@ -3,13 +3,17 @@ # Cluster configuration # CLUSTER_NODES: Comma-separated list of node IPs (first node is the head node) -CLUSTER_NODES="192.168.1.1,192.168.1.2,192.168.1.3" +CLUSTER_NODES="192.168.177.11,192.168.177.12" # ETH_IF: Ethernet interface name (optional, auto-detected if not specified) -ETH_IF="eth0" +ETH_IF="enp1s0f1np1" # IB_IF: InfiniBand interface name (optional, auto-detected if not specified) -IB_IF="ib0" +IB_IF="rocep1s0f1,roceP2p1s0f1" + +# LOCAL_IP: Local IP address (optional, auto-detected if not specified) +# Useful for solo mode or overriding auto-detection +LOCAL_IP="192.168.177.11" # MASTER_PORT: Port for cluster coordination (default: 29501) MASTER_PORT="29501" diff --git a/launch-cluster.sh b/launch-cluster.sh index 40500b5..24b6a42 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -78,6 +78,7 @@ usage() { echo " IB_IF InfiniBand interface name" echo " MASTER_PORT Port for cluster coordination (default: 29501)" echo " CONTAINER_NAME Container name (default: vllm_node)" + echo " LOCAL_IP Local IP address (for solo mode or override auto-detection)" echo " CONTAINER_* Any variable starting with CONTAINER_ (except CONTAINER_NAME)" echo " becomes -e flag. Example: CONTAINER_NCCL_DEBUG=INFO -> -e NCCL_DEBUG=INFO" echo "" @@ -87,6 +88,7 @@ usage() { echo " IB_IF=ib0" echo " MASTER_PORT=29501" echo " CONTAINER_NAME=vllm_node" + echo " LOCAL_IP=192.168.1.1" echo " CONTAINER_NCCL_DEBUG=INFO" echo " CONTAINER_HF_TOKEN=abc123" echo "" @@ -256,6 +258,10 @@ if [[ -z "$CONTAINER_NAME" || "$CONTAINER_NAME" == "vllm_node" ]] && [[ -n "$DOT CONTAINER_NAME="$DOTENV_CONTAINER_NAME" fi +if [[ -n "$DOTENV_LOCAL_IP" ]]; then + export LOCAL_IP="$DOTENV_LOCAL_IP" +fi + # Validate non-privileged mode flags if [[ "$NON_PRIVILEGED_MODE" == "true" ]]; then # Set default swap limit if not specified @@ -408,7 +414,10 @@ if [[ "$SOLO_MODE" == "true" ]]; then exit 1 fi # Solo mode: skip node detection, just get local IP - LOCAL_IP="127.0.0.1" + # Use LOCAL_IP from .env if set, otherwise default to 127.0.0.1 + if [[ -z "$LOCAL_IP" ]]; then + LOCAL_IP="127.0.0.1" + fi NODES_ARG="$LOCAL_IP" PEER_NODES=() echo "Solo mode enabled. Skipping node detection." 
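
A minimal sketch of the LOCAL_IP override this patch enables (values are illustrative, taken from .env.example; the default .env location next to launch-cluster.sh is assumed):

    # .env
    CLUSTER_NODES="192.168.177.11,192.168.177.12"
    LOCAL_IP="192.168.177.11"

    # Solo mode now skips node detection and binds to LOCAL_IP
    # from .env instead of falling back to 127.0.0.1
    ./launch-cluster.sh --solo start
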
From b8930b05a10191f8ce7961eb5f4c68d1e58e3d1f Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Wed, 25 Mar 2026 15:24:59 -0700 Subject: [PATCH 07/48] Added `--cleanup` option --- launch-cluster.sh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/launch-cluster.sh b/launch-cluster.sh index 24b6a42..636a9af 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -43,6 +43,7 @@ MEM_LIMIT_GB="110" MEM_SWAP_LIMIT_GB="" PIDS_LIMIT="4096" SHM_SIZE_GB="64" +CLEANUP_MODE="false" # Function to print usage usage() { @@ -69,6 +70,7 @@ usage() { echo " --pids-limit Process limit (default: 4096, only with --non-privileged)" echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)" echo " --config Path to .env configuration file (default: .env in script directory)" + echo " --cleanup Remove all *.whl and *.-commit files in wheels directory" echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script." echo " command Command to run (only for 'exec' action). Not compatible with --launch-script." echo "" @@ -129,6 +131,7 @@ while [[ "$#" -gt 0 ]]; do --pids-limit) PIDS_LIMIT="$2"; shift ;; --shm-size-gb) SHM_SIZE_GB="$2"; shift ;; -d) DAEMON_MODE="true" ;; + --cleanup) CLEANUP_MODE="true" ;; -h|--help) usage ;; --config) CONFIG_FILE="$2"; shift ;; start|stop|status) @@ -535,6 +538,31 @@ if [[ "$ACTION" == "stop" ]]; then exit 0 fi +# Handle 'cleanup' action +if [[ "$CLEANUP_MODE" == "true" ]]; then + WHEELS_DIR="$SCRIPT_DIR/wheels" + echo "Cleaning up wheels directory..." + + # Remove all .whl files + if compgen -G "$WHEELS_DIR/*.whl" > /dev/null 2>&1; then + rm -f "$WHEELS_DIR"/*.whl + echo "Removed *.whl files from $WHEELS_DIR" + else + echo "No *.whl files found in $WHEELS_DIR" + fi + + # Remove all .-commit files + if compgen -G "$WHEELS_DIR/*.-commit" > /dev/null 2>&1; then + rm -f "$WHEELS_DIR"/*.-commit + echo "Removed *.-commit files from $WHEELS_DIR" + else + echo "No *.-commit files found in $WHEELS_DIR" + fi + + echo "Cleanup complete." + exit 0 +fi + # Handle 'status' action if [[ "$ACTION" == "status" ]]; then echo "Checking status..." From 429042b7dc19a5cfb983c9e13f567190637ea52a Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Wed, 25 Mar 2026 15:35:15 -0700 Subject: [PATCH 08/48] Revert "Added `--cleanup` option" This reverts commit b8930b05a10191f8ce7961eb5f4c68d1e58e3d1f. --- launch-cluster.sh | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/launch-cluster.sh b/launch-cluster.sh index 636a9af..24b6a42 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -43,7 +43,6 @@ MEM_LIMIT_GB="110" MEM_SWAP_LIMIT_GB="" PIDS_LIMIT="4096" SHM_SIZE_GB="64" -CLEANUP_MODE="false" # Function to print usage usage() { @@ -70,7 +69,6 @@ usage() { echo " --pids-limit Process limit (default: 4096, only with --non-privileged)" echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)" echo " --config Path to .env configuration file (default: .env in script directory)" - echo " --cleanup Remove all *.whl and *.-commit files in wheels directory" echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script." echo " command Command to run (only for 'exec' action). Not compatible with --launch-script." 
echo "" @@ -131,7 +129,6 @@ while [[ "$#" -gt 0 ]]; do --pids-limit) PIDS_LIMIT="$2"; shift ;; --shm-size-gb) SHM_SIZE_GB="$2"; shift ;; -d) DAEMON_MODE="true" ;; - --cleanup) CLEANUP_MODE="true" ;; -h|--help) usage ;; --config) CONFIG_FILE="$2"; shift ;; start|stop|status) @@ -538,31 +535,6 @@ if [[ "$ACTION" == "stop" ]]; then exit 0 fi -# Handle 'cleanup' action -if [[ "$CLEANUP_MODE" == "true" ]]; then - WHEELS_DIR="$SCRIPT_DIR/wheels" - echo "Cleaning up wheels directory..." - - # Remove all .whl files - if compgen -G "$WHEELS_DIR/*.whl" > /dev/null 2>&1; then - rm -f "$WHEELS_DIR"/*.whl - echo "Removed *.whl files from $WHEELS_DIR" - else - echo "No *.whl files found in $WHEELS_DIR" - fi - - # Remove all .-commit files - if compgen -G "$WHEELS_DIR/*.-commit" > /dev/null 2>&1; then - rm -f "$WHEELS_DIR"/*.-commit - echo "Removed *.-commit files from $WHEELS_DIR" - else - echo "No *.-commit files found in $WHEELS_DIR" - fi - - echo "Cleanup complete." - exit 0 -fi - # Handle 'status' action if [[ "$ACTION" == "status" ]]; then echo "Checking status..." From 4a0feea6c3af431d522be97a8adc9f4dcad4f90f Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Wed, 25 Mar 2026 15:35:32 -0700 Subject: [PATCH 09/48] Added `--cleanup` option to build script --- build-and-copy.sh | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/build-and-copy.sh b/build-and-copy.sh index d019ea0..a304e7c 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -27,6 +27,7 @@ FLASHINFER_RELEASE_TAG="prebuilt-flashinfer-current" VLLM_RELEASE_TAG="prebuilt-vllm-current" # Space-separated list of GPU architectures for which prebuilt wheels are available PREBUILT_WHEELS_SUPPORTED_ARCHS="12.1a" +CLEANUP_MODE="false" cleanup() { if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then @@ -278,6 +279,7 @@ usage() { echo " --full-log : Enable full build logging (--progress=plain)" echo " --no-build : Skip building, only copy image (requires --copy-to)" echo " --network : Docker network to use during build" + echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory" echo " -h, --help : Show this help message" exit 1 } @@ -339,6 +341,7 @@ while [[ "$#" -gt 0 ]]; do ;; --full-log) FULL_LOG=true ;; --no-build) NO_BUILD=true ;; + --cleanup) CLEANUP_MODE=true ;; --network) if [ -n "$2" ] && [[ "$2" != -* ]]; then NETWORK_ARG="$2" @@ -372,6 +375,30 @@ if [ "$NO_BUILD" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then exit 1 fi +# Handle cleanup mode +if [[ "$CLEANUP_MODE" == "true" ]]; then + WHEELS_DIR="./wheels" + echo "Cleaning up wheels directory..." + + # Remove all .whl files + if compgen -G "$WHEELS_DIR/*.whl" > /dev/null 2>&1; then + rm -f "$WHEELS_DIR"/*.whl + echo "Removed *.whl files from $WHEELS_DIR" + else + echo "No *.whl files found in $WHEELS_DIR" + fi + + # Remove all .-commit files + if compgen -G "$WHEELS_DIR/*.-commit" > /dev/null 2>&1; then + rm -f "$WHEELS_DIR"/*.-commit + echo "Removed *.-commit files from $WHEELS_DIR" + else + echo "No *.-commit files found in $WHEELS_DIR" + fi + + echo "Cleanup complete." 
+fi + # Ensure wheels directory exists mkdir -p ./wheels From 2f5ff0211eb9eea4cf961881b6e840cb4a49a711 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Wed, 25 Mar 2026 15:39:23 -0700 Subject: [PATCH 10/48] Cleanup in build script --- build-and-copy.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build-and-copy.sh b/build-and-copy.sh index a304e7c..628b9c0 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -390,10 +390,10 @@ if [[ "$CLEANUP_MODE" == "true" ]]; then # Remove all .-commit files if compgen -G "$WHEELS_DIR/*.-commit" > /dev/null 2>&1; then - rm -f "$WHEELS_DIR"/*.-commit - echo "Removed *.-commit files from $WHEELS_DIR" + rm -f "$WHEELS_DIR"/.*-commit + echo "Removed .*-commit files from $WHEELS_DIR" else - echo "No *.-commit files found in $WHEELS_DIR" + echo "No .*-commit files found in $WHEELS_DIR" fi echo "Cleanup complete." From 73fec1bdf87652dd9e2ea7b283b2515ab2bf1b4c Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Wed, 25 Mar 2026 15:40:09 -0700 Subject: [PATCH 11/48] bugfix --- build-and-copy.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-and-copy.sh b/build-and-copy.sh index 628b9c0..ed7b9c0 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -389,7 +389,7 @@ if [[ "$CLEANUP_MODE" == "true" ]]; then fi # Remove all .-commit files - if compgen -G "$WHEELS_DIR/*.-commit" > /dev/null 2>&1; then + if compgen -G "$WHEELS_DIR/.*-commit" > /dev/null 2>&1; then rm -f "$WHEELS_DIR"/.*-commit echo "Removed .*-commit files from $WHEELS_DIR" else From 8b7c02aa252dc6e3215ea2ee46712248146d07bb Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Wed, 25 Mar 2026 22:47:02 -0700 Subject: [PATCH 12/48] add .env support to build-and-copy.sh --- .env.example | 4 +++ autodiscover.sh | 52 ++++++++++++++++++++++++++++++++++++++ build-and-copy.sh | 64 ++++++++++++++++++++++++++++++++++++----------- 3 files changed, 105 insertions(+), 15 deletions(-) diff --git a/.env.example b/.env.example index bc6f2dc..d1cea6c 100644 --- a/.env.example +++ b/.env.example @@ -29,6 +29,10 @@ CONTAINER_NCCL_DEBUG="INFO" CONTAINER_HF_TOKEN="your_huggingface_token_here" CONTAINER_NCCL_IGNORE_CPU_AFFINITY="1" +# COPY_HOSTS: Comma-separated list of hosts for build-and-copy.sh (optional) +# Used by build-and-copy.sh to distribute images across cluster +COPY_HOSTS="192.168.177.12" + # Additional container environment variables # CONTAINER_MAX_JOBS="16" # CONTAINER_CUDA_VISIBLE_DEVICES="0,1" diff --git a/autodiscover.sh b/autodiscover.sh index 43e622f..54ee4e0 100644 --- a/autodiscover.sh +++ b/autodiscover.sh @@ -1,5 +1,44 @@ #!/bin/bash +# Load .env file if exists (for shared configuration) +# This is called early so that DOTENV_* variables are available to all functions +load_env_if_exists() { + local env_file="${CONFIG_FILE:-}" + + # If CONFIG_FILE is not set, check default location + if [[ -z "$env_file" ]]; then + local script_dir="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" + env_file="$script_dir/.env" + fi + + if [[ -f "$env_file" ]]; then + # Load .env variables with DOTENV_ prefix + while IFS='=' read -r key value || [[ -n "$key" ]]; do + # Skip comments and empty lines + [[ "$key" =~ ^[[:space:]]*# ]] && continue + [[ -z "$key" ]] && continue + + # Remove leading/trailing whitespace from key + key=$(echo "$key" | xargs) + + # Skip if key is empty after trimming + [[ -z "$key" ]] && continue + + # Remove quotes from value + value="${value%\"}" + value="${value#\"}" + value="${value%\'}" + value="${value#\'}" + + # 
Export with DOTENV_ prefix + export "DOTENV_$key=$value" + done < "$env_file" + fi +} + +# Load .env file +load_env_if_exists + # Function to detect IB and Ethernet interfaces detect_interfaces() { # If both interfaces are already set, nothing to do @@ -110,6 +149,19 @@ detect_nodes() { done return 0 fi + + # Try to use COPY_HOSTS from .env + if [[ -n "$DOTENV_COPY_HOSTS" ]]; then + echo " Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS" + PEER_NODES=() + IFS=',' read -ra ALL_NODES <<< "$DOTENV_COPY_HOSTS" + for node in "${ALL_NODES[@]}"; do + node=$(echo "$node" | xargs) + PEER_NODES+=("$node") + done + NODES_ARG="$DOTENV_COPY_HOSTS" + return 0 + fi echo "Auto-detecting nodes..." diff --git a/build-and-copy.sh b/build-and-copy.sh index 628b9c0..1aa3628 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -28,6 +28,7 @@ VLLM_RELEASE_TAG="prebuilt-vllm-current" # Space-separated list of GPU architectures for which prebuilt wheels are available PREBUILT_WHEELS_SUPPORTED_ARCHS="12.1a" CLEANUP_MODE="false" +CONFIG_FILE="" cleanup() { if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then @@ -280,11 +281,32 @@ usage() { echo " --no-build : Skip building, only copy image (requires --copy-to)" echo " --network : Docker network to use during build" echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory" + echo " --config : Path to .env configuration file (default: .env in script directory)" echo " -h, --help : Show this help message" exit 1 } -# Argument parsing +# Set default CONFIG_FILE +SCRIPT_DIR="$(dirname "$(realpath "$0")")" +export CONFIG_FILE="$SCRIPT_DIR/.env" + +# Parse --config argument first +i=1 +while [[ $i -le $# ]]; do + arg="${!i}" + if [[ "$arg" == "--config" ]]; then + next_i=$((i+1)) + CONFIG_FILE="${!next_i}" + export CONFIG_FILE + break + fi + i=$((i+1)) +done + +# Source autodiscover.sh to load .env file +source "$(dirname "$0")/autodiscover.sh" + +# Now parse all arguments normally while [[ "$#" -gt 0 ]]; do case $1 in -t|--tag) IMAGE_TAG="$2"; shift ;; @@ -300,24 +322,31 @@ while [[ "$#" -gt 0 ]]; do done if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then - echo "No hosts specified. Using autodiscovery..." - source "$(dirname "$0")/autodiscover.sh" + # Try to use COPY_HOSTS from .env first + if [[ -n "$DOTENV_COPY_HOSTS" ]]; then + echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS" + IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS" + COPY_HOSTS=("${HOSTS_FROM_ENV[@]}") + else + echo "No hosts specified. Using autodiscovery..." + source "$(dirname "$0")/autodiscover.sh" - detect_nodes - if [ $? -ne 0 ]; then - echo "Error: Autodiscovery failed." - exit 1 - fi + detect_nodes + if [ $? -ne 0 ]; then + echo "Error: Autodiscovery failed." + exit 1 + fi - if [ ${#PEER_NODES[@]} -gt 0 ]; then - COPY_HOSTS=("${PEER_NODES[@]}") - fi + if [ ${#PEER_NODES[@]} -gt 0 ]; then + COPY_HOSTS=("${PEER_NODES[@]}") + fi - if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then - echo "Error: Autodiscovery found no other nodes." - exit 1 + if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then + echo "Error: Autodiscovery found no other nodes." 
+ exit 1 + fi + echo "Autodiscovered hosts: ${COPY_HOSTS[*]}" fi - echo "Autodiscovered hosts: ${COPY_HOSTS[*]}" fi continue ;; @@ -351,12 +380,17 @@ while [[ "$#" -gt 0 ]]; do exit 1 fi ;; + --config) CONFIG_FILE="$2"; shift ;; -h|--help) usage ;; *) echo "Unknown parameter passed: $1"; usage ;; esac shift done +# Set CONFIG_FILE and source autodiscover.sh to load .env +export CONFIG_FILE +source "$(dirname "$0")/autodiscover.sh" + # Validate flag combinations if [ -n "$VLLM_PRS" ]; then if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi From c2fe579cccb11ecd3b2096404267cce66b896153 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Wed, 25 Mar 2026 23:16:56 -0700 Subject: [PATCH 13/48] Enhance .env file handling and validation in scripts --- autodiscover.sh | 8 +++++ build-and-copy.sh | 82 +++++++++++++++++------------------------------ launch-cluster.sh | 5 ++- 3 files changed, 42 insertions(+), 53 deletions(-) diff --git a/autodiscover.sh b/autodiscover.sh index 54ee4e0..11d771d 100644 --- a/autodiscover.sh +++ b/autodiscover.sh @@ -4,11 +4,19 @@ # This is called early so that DOTENV_* variables are available to all functions load_env_if_exists() { local env_file="${CONFIG_FILE:-}" + local config_explicit="${CONFIG_FILE_SET:-false}" # If CONFIG_FILE is not set, check default location if [[ -z "$env_file" ]]; then local script_dir="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" env_file="$script_dir/.env" + config_explicit="false" + fi + + # Validate config file exists if explicitly specified + if [[ "$config_explicit" == "true" ]] && [[ ! -f "$env_file" ]]; then + echo "Error: Config file not found: $env_file" + exit 1 fi if [[ -f "$env_file" ]]; then diff --git a/build-and-copy.sh b/build-and-copy.sh index 1aa3628..f841c6f 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -286,27 +286,8 @@ usage() { exit 1 } -# Set default CONFIG_FILE -SCRIPT_DIR="$(dirname "$(realpath "$0")")" -export CONFIG_FILE="$SCRIPT_DIR/.env" - -# Parse --config argument first -i=1 -while [[ $i -le $# ]]; do - arg="${!i}" - if [[ "$arg" == "--config" ]]; then - next_i=$((i+1)) - CONFIG_FILE="${!next_i}" - export CONFIG_FILE - break - fi - i=$((i+1)) -done - -# Source autodiscover.sh to load .env file -source "$(dirname "$0")/autodiscover.sh" - -# Now parse all arguments normally +# Parse all arguments +CONFIG_FILE_SET=false while [[ "$#" -gt 0 ]]; do case $1 in -t|--tag) IMAGE_TAG="$2"; shift ;; @@ -320,34 +301,6 @@ while [[ "$#" -gt 0 ]]; do add_copy_hosts "$1" shift done - - if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then - # Try to use COPY_HOSTS from .env first - if [[ -n "$DOTENV_COPY_HOSTS" ]]; then - echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS" - IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS" - COPY_HOSTS=("${HOSTS_FROM_ENV[@]}") - else - echo "No hosts specified. Using autodiscovery..." - source "$(dirname "$0")/autodiscover.sh" - - detect_nodes - if [ $? -ne 0 ]; then - echo "Error: Autodiscovery failed." - exit 1 - fi - - if [ ${#PEER_NODES[@]} -gt 0 ]; then - COPY_HOSTS=("${PEER_NODES[@]}") - fi - - if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then - echo "Error: Autodiscovery found no other nodes." 
- exit 1 - fi - echo "Autodiscovered hosts: ${COPY_HOSTS[*]}" - fi - fi continue ;; -j|--build-jobs) BUILD_JOBS="$2"; shift ;; @@ -380,17 +333,42 @@ while [[ "$#" -gt 0 ]]; do exit 1 fi ;; - --config) CONFIG_FILE="$2"; shift ;; + --config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;; -h|--help) usage ;; *) echo "Unknown parameter passed: $1"; usage ;; esac shift done -# Set CONFIG_FILE and source autodiscover.sh to load .env -export CONFIG_FILE +# Source autodiscover.sh to load .env file source "$(dirname "$0")/autodiscover.sh" +# Handle COPY_HOSTS from .env or autodiscovery if not specified via arguments +if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then + if [[ -n "$DOTENV_COPY_HOSTS" ]]; then + echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS" + IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS" + COPY_HOSTS=("${HOSTS_FROM_ENV[@]}") + else + echo "No hosts specified. Using autodiscovery..." + detect_nodes + if [ $? -ne 0 ]; then + echo "Error: Autodiscovery failed." + exit 1 + fi + + if [ ${#PEER_NODES[@]} -gt 0 ]; then + COPY_HOSTS=("${PEER_NODES[@]}") + fi + + if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then + echo "Error: Autodiscovery found no other nodes." + exit 1 + fi + echo "Autodiscovered hosts: ${COPY_HOSTS[*]}" + fi +fi + # Validate flag combinations if [ -n "$VLLM_PRS" ]; then if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi diff --git a/launch-cluster.sh b/launch-cluster.sh index 24b6a42..f7dc1cd 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -159,9 +159,12 @@ done # Set .env file path (use default if not specified) if [[ -z "$CONFIG_FILE" ]]; then CONFIG_FILE="$SCRIPT_DIR/.env" + CONFIG_FILE_SET=false +else + CONFIG_FILE_SET=true fi -# Load .env file if exists +# Load .env file if [[ -f "$CONFIG_FILE" ]]; then echo "Loading configuration from .env file..." From c08b34a21873a99c6ae263f253f5bcf98ec2e18d Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Wed, 25 Mar 2026 23:35:52 -0700 Subject: [PATCH 14/48] add --config passthrough to run-recipe --- run-recipe.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/run-recipe.py b/run-recipe.py index c2aa072..b33b33b 100755 --- a/run-recipe.py +++ b/run-recipe.py @@ -1338,6 +1338,8 @@ Examples: cmd_parts.extend(["--pids-limit", str(args.pids_limit)]) if args.shm_size_gb: cmd_parts.extend(["--shm-size-gb", str(args.shm_size_gb)]) + if args.config_file: + cmd_parts.extend(["--config", args.config_file]) cmd_parts.extend(["\\", "\n --launch-script", "/tmp/tmpXXXXXX.sh"]) print(" ".join(cmd_parts)) print() @@ -1408,6 +1410,9 @@ Examples: if args.shm_size_gb: cmd.extend(["--shm-size-gb", str(args.shm_size_gb)]) + if args.config_file: + cmd.extend(["--config", args.config_file]) + # Add launch script cmd.extend(["--launch-script", temp_script]) From 83a74bccec298f33aef43fb95e9962ce2d5f5acd Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 26 Mar 2026 07:45:23 -0700 Subject: [PATCH 15/48] Removed extra solo mode check --- launch-cluster.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/launch-cluster.sh b/launch-cluster.sh index f7dc1cd..4f3bcf6 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -412,10 +412,6 @@ done source "$(dirname "$0")/autodiscover.sh" if [[ "$SOLO_MODE" == "true" ]]; then - if [[ -n "$NODES_ARG" ]]; then - echo "Error: --solo is incompatible with -n/--nodes." 
- exit 1 - fi # Solo mode: skip node detection, just get local IP # Use LOCAL_IP from .env if set, otherwise default to 127.0.0.1 if [[ -z "$LOCAL_IP" ]]; then From 174de6f0a82025c59e0134ba8b325f1104188c51 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 26 Mar 2026 08:58:04 -0700 Subject: [PATCH 16/48] temporary patch for PR38126 --- Dockerfile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Dockerfile b/Dockerfile index 8401e85..41e47c8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -201,6 +201,10 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ # TEMPORARY PATCH for broken vLLM build (unguarded Hopper code) - reverting PR #34758 and #34302 RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34758.diff | patch -p1 -R || echo "Cannot revert PR #34758, skipping" RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34302.diff | patch -p1 -R || echo "Cannot revert PR #34302, skipping" +# TEMPORARY PATCH for broken NVFP4 quants +RUN curl -sSL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38126.diff -o pr38126.diff \ + && (git apply --reverse --check pr38126.diff || git apply pr38126.diff) \ + && rm pr38126.diff # Final Compilation RUN --mount=type=cache,id=ccache,target=/root/.ccache \ From e6ee108cdfaa119b72b498ecc0efe23c3edecc75 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 26 Mar 2026 11:43:44 -0700 Subject: [PATCH 17/48] Temporary patch for NVFP4 --- Dockerfile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 41e47c8..c4c6b46 100644 --- a/Dockerfile +++ b/Dockerfile @@ -202,8 +202,13 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34758.diff | patch -p1 -R || echo "Cannot revert PR #34758, skipping" RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34302.diff | patch -p1 -R || echo "Cannot revert PR #34302, skipping" # TEMPORARY PATCH for broken NVFP4 quants -RUN curl -sSL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38126.diff -o pr38126.diff \ - && (git apply --reverse --check pr38126.diff || git apply pr38126.diff) \ +RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38126.diff -o pr38126.diff \ + && if git apply --reverse --check pr38126.diff 2>/dev/null; then \ + echo "Patch already applied, skipping."; \ + else \ + echo "Applying patch..."; \ + git apply -v pr38126.diff; \ + fi \ && rm pr38126.diff # Final Compilation From a78e221de3827a2eacae641023c5b6feab8698d5 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 26 Mar 2026 15:47:41 -0700 Subject: [PATCH 18/48] Autodiscovery refactoring with mesh support --- autodiscover.sh | 400 ++++++++++++++++++++++++++++++++++------------ build-and-copy.sh | 31 ++-- hf-download.sh | 72 +++++---- launch-cluster.sh | 19 ++- run-recipe.py | 149 ++--------------- 5 files changed, 401 insertions(+), 270 deletions(-) diff --git a/autodiscover.sh b/autodiscover.sh index 11d771d..a68c9d3 100644 --- a/autodiscover.sh +++ b/autodiscover.sh @@ -1,43 +1,44 @@ #!/bin/bash +SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" + # Load .env file if exists (for shared configuration) # This is called early so that DOTENV_* variables are available to all functions load_env_if_exists() { local env_file="${CONFIG_FILE:-}" local config_explicit="${CONFIG_FILE_SET:-false}" - + # If CONFIG_FILE is not 
set, check default location if [[ -z "$env_file" ]]; then - local script_dir="$(dirname "$(realpath "${BASH_SOURCE[0]}")")" - env_file="$script_dir/.env" + env_file="$SCRIPT_DIR/.env" config_explicit="false" fi - + # Validate config file exists if explicitly specified if [[ "$config_explicit" == "true" ]] && [[ ! -f "$env_file" ]]; then echo "Error: Config file not found: $env_file" exit 1 fi - + if [[ -f "$env_file" ]]; then # Load .env variables with DOTENV_ prefix while IFS='=' read -r key value || [[ -n "$key" ]]; do # Skip comments and empty lines [[ "$key" =~ ^[[:space:]]*# ]] && continue [[ -z "$key" ]] && continue - + # Remove leading/trailing whitespace from key key=$(echo "$key" | xargs) - + # Skip if key is empty after trimming [[ -z "$key" ]] && continue - + # Remove quotes from value value="${value%\"}" value="${value#\"}" value="${value%\'}" value="${value#\'}" - + # Export with DOTENV_ prefix export "DOTENV_$key=$value" done < "$env_file" @@ -47,6 +48,9 @@ load_env_if_exists() { # Load .env file load_env_if_exists +# Mesh mode flag (set by detect_interfaces) +MESH_MODE="false" + # Function to detect IB and Ethernet interfaces detect_interfaces() { # If both interfaces are already set, nothing to do @@ -61,60 +65,126 @@ detect_interfaces() { fi echo "Auto-detecting interfaces..." - + # Get all Up interfaces: "rocep1s0f1 port 1 ==> enp1s0f1np1 (Up)" # We capture: IB_DEV, NET_DEV mapfile -t IB_NET_PAIRS < <(ibdev2netdev | awk '/Up\)/ {print $1 " " $5}') - + if [ ${#IB_NET_PAIRS[@]} -eq 0 ]; then echo "Error: No active IB interfaces found." return 1 fi DETECTED_IB_IFS=() - CANDIDATE_ETH_IFS=() + ALL_NET_IFS=() for pair in "${IB_NET_PAIRS[@]}"; do ib_dev=$(echo "$pair" | awk '{print $1}') net_dev=$(echo "$pair" | awk '{print $2}') - DETECTED_IB_IFS+=("$ib_dev") - - # Check if interface has an IP address - if ip addr show "$net_dev" | grep -q "inet "; then - CANDIDATE_ETH_IFS+=("$net_dev") + ALL_NET_IFS+=("$net_dev") + done + + local num_up="${#IB_NET_PAIRS[@]}" + + # --- Sanity checks --- + + # 1. enp* (no capital P) interfaces MUST have an IP + for net_dev in "${ALL_NET_IFS[@]}"; do + if [[ "$net_dev" =~ ^enp[^P] ]] || [[ "$net_dev" == enp* && "$net_dev" != *P* ]]; then + if ! ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then + echo "Error: Interface $net_dev (enp*, no capital P) is Up but has no IP address assigned." + return 1 + fi fi done - # Set IB_IF if not provided - if [[ -z "$IB_IF" ]]; then - IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}") - echo " Detected IB_IF: $IB_IF" - fi - - # Set ETH_IF if not provided - if [[ -z "$ETH_IF" ]]; then - if [ ${#CANDIDATE_ETH_IFS[@]} -eq 0 ]; then - echo "Error: No active IB-associated interfaces have IP addresses." + # 2. No two interfaces with IPs should share the same subnet + declare -A SEEN_SUBNETS + for net_dev in "${ALL_NET_IFS[@]}"; do + local cidr + cidr=$(ip -o -f inet addr show "$net_dev" 2>/dev/null | awk '{print $4}' | head -n1) + [[ -z "$cidr" ]] && continue + # Compute network address using python3 + local net_addr + net_addr=$(python3 -c "import ipaddress; print(str(ipaddress.ip_network('$cidr', strict=False)))" 2>/dev/null) + if [[ -n "${SEEN_SUBNETS[$net_addr]}" ]]; then + echo "Error: Interfaces $net_dev and ${SEEN_SUBNETS[$net_addr]} share the same subnet ($net_addr). Check network configuration." 
return 1 fi - - # Selection logic: Prefer interface without capital 'P' - SELECTED_ETH="" - for iface in "${CANDIDATE_ETH_IFS[@]}"; do - if [[ "$iface" != *"P"* ]]; then - SELECTED_ETH="$iface" - break - fi - done - - # Fallback: Use the first one if all have 'P' or none found yet - if [[ -z "$SELECTED_ETH" ]]; then - SELECTED_ETH="${CANDIDATE_ETH_IFS[0]}" + SEEN_SUBNETS["$net_addr"]="$net_dev" + done + + # --- Mode selection --- + + if [[ "$num_up" -eq 2 ]]; then + # Non-mesh configuration + MESH_MODE="false" + echo " Non-mesh mode: 2 CX7 interfaces active." + + # Set IB_IF if not provided + if [[ -z "$IB_IF" ]]; then + IB_IF=$(IFS=,; echo "${DETECTED_IB_IFS[*]}") + echo " Detected IB_IF: $IB_IF" fi - - ETH_IF="$SELECTED_ETH" - echo " Detected ETH_IF: $ETH_IF" + + # Set ETH_IF if not provided: prefer interface without capital 'P' + if [[ -z "$ETH_IF" ]]; then + local selected_eth="" + for net_dev in "${ALL_NET_IFS[@]}"; do + if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then + if [[ "$net_dev" != *P* ]]; then + selected_eth="$net_dev" + break + fi + fi + done + # Fallback: first interface with an IP + if [[ -z "$selected_eth" ]]; then + for net_dev in "${ALL_NET_IFS[@]}"; do + if ip addr show "$net_dev" 2>/dev/null | grep -q "inet "; then + selected_eth="$net_dev" + break + fi + done + fi + if [[ -z "$selected_eth" ]]; then + echo "Error: No active IB-associated interfaces have IP addresses." + return 1 + fi + ETH_IF="$selected_eth" + echo " Detected ETH_IF: $ETH_IF" + fi + + elif [[ "$num_up" -eq 4 ]]; then + # Mesh configuration + MESH_MODE="true" + echo " Mesh mode: all 4 CX7 interfaces active." + + # Set IB_IF to all four RoCE interfaces (hardcoded for mesh) + if [[ -z "$IB_IF" ]]; then + IB_IF="rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1" + echo " Detected IB_IF: $IB_IF" + fi + + # Set ETH_IF: check enP7s7 first, then wlP9s9 + if [[ -z "$ETH_IF" ]]; then + if ip addr show enP7s7 2>/dev/null | grep -q "inet "; then + ETH_IF="enP7s7" + echo " Detected ETH_IF: $ETH_IF" + elif ip addr show wlP9s9 2>/dev/null | grep -q "inet "; then + ETH_IF="wlP9s9" + echo " Detected ETH_IF: $ETH_IF" + echo " Warning: using wireless interface (wlP9s9) for cluster coordination. Performance may be limited." + else + echo "Error: Mesh mode requires enP7s7 or wlP9s9 to be up with an IP address for cluster coordination." + return 1 + fi + fi + + else + echo "Error: Unexpected number of active CX7 interfaces ($num_up). Expected 2 (non-mesh) or 4 (mesh)." + return 1 fi } @@ -131,16 +201,51 @@ detect_local_ip() { # Get CIDR of the selected ETH_IF CIDR=$(ip -o -f inet addr show "$ETH_IF" | awk '{print $4}' | head -n 1) - + if [[ -z "$CIDR" ]]; then echo "Error: Could not determine IP/CIDR for interface $ETH_IF" return 1 fi - + LOCAL_IP=${CIDR%/*} echo " Detected Local IP: $LOCAL_IP ($CIDR)" } +# Scan a subnet for GB10-capable peers via SSH +# Usage: _scan_subnet_for_gb10 +_scan_subnet_for_gb10() { + local cidr="$1" + local exclude_ip="$2" + local out_file="$3" + + if ! command -v python3 &> /dev/null; then + echo "Error: python3 not found." + return 1 + fi + if ! command -v nc &> /dev/null; then + echo "Error: nc (netcat) not found." 
+ return 1 + fi + + local all_ips + all_ips=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$cidr") + + for ip in $all_ips; do + [[ "$ip" == "$exclude_ip" ]] && continue + ( + if nc -z -w 1 "$ip" 22 &>/dev/null; then + # Check if remote is a GB10 system + if ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o BatchMode=yes "$ip" \ + "nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null" \ + 2>/dev/null | grep -q "NVIDIA GB10"; then + echo "$ip" >> "$out_file" + fi + fi + ) & + done + wait +} + # Function to detect cluster nodes detect_nodes() { detect_local_ip || return 1 @@ -157,72 +262,165 @@ detect_nodes() { done return 0 fi - - # Try to use COPY_HOSTS from .env - if [[ -n "$DOTENV_COPY_HOSTS" ]]; then - echo " Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS" + + # Try to use CLUSTER_NODES from .env + if [[ -n "$DOTENV_CLUSTER_NODES" ]]; then + echo " Using CLUSTER_NODES from .env: $DOTENV_CLUSTER_NODES" PEER_NODES=() - IFS=',' read -ra ALL_NODES <<< "$DOTENV_COPY_HOSTS" + IFS=',' read -ra ALL_NODES <<< "$DOTENV_CLUSTER_NODES" for node in "${ALL_NODES[@]}"; do node=$(echo "$node" | xargs) - PEER_NODES+=("$node") + [[ "$node" != "$LOCAL_IP" ]] && PEER_NODES+=("$node") done - NODES_ARG="$DOTENV_COPY_HOSTS" + NODES_ARG="$DOTENV_CLUSTER_NODES" return 0 fi - echo "Auto-detecting nodes..." - - if ! command -v nc &> /dev/null; then - echo "Error: nc (netcat) not found. Please install netcat." - return 1 - fi - - if ! command -v python3 &> /dev/null; then - echo "Error: python3 not found. Please install python3." - return 1 + echo "Auto-detecting nodes on $CIDR (checking for NVIDIA GB10)..." + + local temp_file + temp_file=$(mktemp) + + _scan_subnet_for_gb10 "$CIDR" "$LOCAL_IP" "$temp_file" + + PEER_NODES=() + local detected_ips=("$LOCAL_IP") + if [[ -f "$temp_file" ]]; then + while read -r ip; do + PEER_NODES+=("$ip") + detected_ips+=("$ip") + echo " Found GB10 peer: $ip" + done < <(sort "$temp_file") + rm -f "$temp_file" fi - DETECTED_IPS=("$LOCAL_IP") - PEER_NODES=() - - echo " Scanning for SSH peers on $CIDR..." - - # Generate list of IPs using python - ALL_IPS=$(python3 -c "import ipaddress, sys; [print(ip) for ip in ipaddress.ip_network(sys.argv[1], strict=False).hosts()]" "$CIDR") - - TEMP_IPS_FILE=$(mktemp) - - # Scan in parallel - for ip in $ALL_IPS; do - # Skip own IP - if [[ "$ip" == "$LOCAL_IP" ]]; then continue; fi - - ( - # Check port 22 with 1 second timeout - if nc -z -w 1 "$ip" 22 &>/dev/null; then - echo "$ip" >> "$TEMP_IPS_FILE" - fi - ) & - done - - # Wait for all background scans to complete - wait - - # Read found IPs - if [[ -f "$TEMP_IPS_FILE" ]]; then - while read -r ip; do - DETECTED_IPS+=("$ip") - PEER_NODES+=("$ip") - echo " Found peer: $ip" - done < "$TEMP_IPS_FILE" - rm -f "$TEMP_IPS_FILE" - fi - - # Sort IPs - IFS=$'\n' SORTED_IPS=($(sort <<<"${DETECTED_IPS[*]}")) + # Sort and set NODES_ARG + IFS=$'\n' SORTED_IPS=($(sort <<<"${detected_ips[*]}")) unset IFS - NODES_ARG=$(IFS=,; echo "${SORTED_IPS[*]}") echo " Cluster Nodes: $NODES_ARG" } + +# Function to detect COPY_HOSTS for build/model distribution +# In non-mesh mode: COPY_PEER_NODES = PEER_NODES (same network) +# In mesh mode: scan enp* interfaces (direct IB-attached) for GB10 peers +detect_copy_hosts() { + if [[ "$MESH_MODE" == "false" ]]; then + COPY_PEER_NODES=("${PEER_NODES[@]}") + return 0 + fi + + # Mesh mode: scan enp1s0f0np0 and enp1s0f1np1 subnets + echo "Auto-detecting COPY_HOSTS on direct IB interfaces (mesh mode)..." 
+ + local temp_file + temp_file=$(mktemp) + + for iface in enp1s0f0np0 enp1s0f1np1; do + local cidr + cidr=$(ip -o -f inet addr show "$iface" 2>/dev/null | awk '{print $4}' | head -n1) + [[ -z "$cidr" ]] && continue + local local_iface_ip="${cidr%/*}" + echo " Scanning $iface ($cidr)..." + _scan_subnet_for_gb10 "$cidr" "$local_iface_ip" "$temp_file" + done + + # Deduplicate and collect results + COPY_PEER_NODES=() + declare -A _SEEN_COPY + if [[ -f "$temp_file" ]]; then + while read -r ip; do + if [[ -z "${_SEEN_COPY[$ip]}" ]]; then + _SEEN_COPY["$ip"]=1 + COPY_PEER_NODES+=("$ip") + echo " Found GB10 copy host: $ip" + fi + done < <(sort "$temp_file") + rm -f "$temp_file" + fi +} + +# Save discovered configuration to .env +# Skips if .env already exists unless FORCE_DISCOVER=true +save_config() { + local env_file="${CONFIG_FILE:-$SCRIPT_DIR/.env}" + + # Skip if .env exists and not forced + if [[ -f "$env_file" && "${FORCE_DISCOVER:-false}" != "true" ]]; then + return 0 + fi + + echo "" + local save_prompt="Save discovered configuration to $env_file?" + if [[ -f "$env_file" ]]; then + save_prompt="Overwrite existing configuration in $env_file?" + fi + read -r -p "$save_prompt [Y/n]: " response + response="${response,,}" + if [[ "$response" =~ ^(n|no)$ ]]; then + return 0 + fi + + # Build list of all cluster nodes (local + peers) + local all_cluster_nodes=() + if [[ -n "$LOCAL_IP" ]]; then + all_cluster_nodes+=("$LOCAL_IP") + fi + for node in "${PEER_NODES[@]}"; do + all_cluster_nodes+=("$node") + done + + # Per-node confirmation for CLUSTER_NODES + echo "" + echo "Select nodes for CLUSTER_NODES:" + local selected_cluster=() + for node in "${all_cluster_nodes[@]}"; do + local label="$node" + [[ "$node" == "$LOCAL_IP" ]] && label="$node (this machine)" + read -r -p " Include $label? [Y/n]: " r + r="${r,,}" + if [[ ! "$r" =~ ^(n|no)$ ]]; then + selected_cluster+=("$node") + fi + done + + if [[ "${#selected_cluster[@]}" -eq 0 ]]; then + echo "No nodes selected. Aborting save." + return 1 + fi + + # Per-node confirmation for COPY_HOSTS + echo "" + echo "Select nodes for COPY_HOSTS (build/model distribution):" + local selected_copy=() + for node in "${COPY_PEER_NODES[@]}"; do + read -r -p " Include $node in COPY_HOSTS? [Y/n]: " r + r="${r,,}" + if [[ ! 
"$r" =~ ^(n|no)$ ]]; then + selected_copy+=("$node") + fi + done + + # Write .env + { + echo "# Auto-generated by autodiscover.sh" + echo "CLUSTER_NODES=$(IFS=,; echo "${selected_cluster[*]}")" + if [[ "${#selected_copy[@]}" -gt 0 ]]; then + echo "COPY_HOSTS=$(IFS=,; echo "${selected_copy[*]}")" + fi + echo "LOCAL_IP=$LOCAL_IP" + echo "ETH_IF=$ETH_IF" + echo "IB_IF=$IB_IF" + } > "$env_file" + echo "" + echo "Saved to $env_file" +} + +# Convenience function: run full autodiscovery pipeline +run_autodiscover() { + detect_interfaces || return 1 + detect_local_ip || return 1 + detect_nodes || return 1 + detect_copy_hosts || return 1 + save_config +} diff --git a/build-and-copy.sh b/build-and-copy.sh index dec93b4..90d6a27 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -282,6 +282,7 @@ usage() { echo " --network : Docker network to use during build" echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory" echo " --config : Path to .env configuration file (default: .env in script directory)" + echo " --setup : Force autodiscovery and save configuration (even if .env exists)" echo " -h, --help : Show this help message" exit 1 } @@ -334,6 +335,7 @@ while [[ "$#" -gt 0 ]]; do fi ;; --config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;; + --setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;; -h|--help) usage ;; *) echo "Unknown parameter passed: $1"; usage ;; esac @@ -343,6 +345,18 @@ done # Source autodiscover.sh to load .env file source "$(dirname "$0")/autodiscover.sh" +# If --setup: force full autodiscovery and save configuration +if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then + echo "Running full autodiscovery (--setup)..." + detect_interfaces || exit 1 + detect_local_ip || exit 1 + detect_nodes || exit 1 + detect_copy_hosts || exit 1 + save_config || exit 1 + # Reload .env so DOTENV_* variables reflect saved config + load_env_if_exists +fi + # Handle COPY_HOSTS from .env or autodiscovery if not specified via arguments if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then if [[ -n "$DOTENV_COPY_HOSTS" ]]; then @@ -351,19 +365,18 @@ if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then COPY_HOSTS=("${HOSTS_FROM_ENV[@]}") else echo "No hosts specified. Using autodiscovery..." - detect_nodes - if [ $? -ne 0 ]; then - echo "Error: Autodiscovery failed." - exit 1 - fi + detect_interfaces || { echo "Error: Interface detection failed."; exit 1; } + detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; } + detect_nodes || { echo "Error: Node detection failed."; exit 1; } + detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; } - if [ ${#PEER_NODES[@]} -gt 0 ]; then - COPY_HOSTS=("${PEER_NODES[@]}") + if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then + COPY_HOSTS=("${COPY_PEER_NODES[@]}") fi if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then - echo "Error: Autodiscovery found no other nodes." - exit 1 + echo "Error: Autodiscovery found no other nodes." + exit 1 fi echo "Autodiscovered hosts: ${COPY_HOSTS[*]}" fi diff --git a/hf-download.sh b/hf-download.sh index c4c8d9f..3b5bdd6 100755 --- a/hf-download.sh +++ b/hf-download.sh @@ -7,6 +7,8 @@ HUB_PATH="${HF_HOME:-$HOME/.cache/huggingface}/hub" COPY_HOSTS=() SSH_USER="$USER" PARALLEL_COPY=false +CONFIG_FILE="" +CONFIG_FILE_SET=false # Help function usage() { @@ -16,6 +18,7 @@ usage() { echo " --copy-to-host : Alias for --copy-to (backwards compatibility)." echo " --copy-parallel : Copy to all hosts in parallel instead of serially." 
echo " -u, --user : Username for ssh commands (default: \$USER)" + echo " --config : Path to .env configuration file (default: .env in script directory)" echo " -h, --help : Show this help message" exit 1 } @@ -37,11 +40,11 @@ copy_model_to_host() { local host="$1" local model_name="$2" local model_dir="$3" - + echo "Copying model '$model_name' to ${SSH_USER}@${host}..." local host_copy_start host_copy_end host_copy_time host_copy_start=$(date +%s) - + if rsync -av --mkpath --progress "$model_dir" "${SSH_USER}@${host}:$HUB_PATH/"; then host_copy_end=$(date +%s) host_copy_time=$((host_copy_end - host_copy_start)) @@ -53,44 +56,24 @@ copy_model_to_host() { } # Argument parsing +COPY_TO_FLAG=false while [[ "$#" -gt 0 ]]; do case $1 in -c|--copy-to|--copy-to-host|--copy-to-hosts) + COPY_TO_FLAG=true shift # Consume arguments until the next flag or end of args while [[ "$#" -gt 0 && "$1" != -* ]]; do add_copy_hosts "$1" shift done - - # If no hosts specified, use autodiscovery - if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then - echo "No hosts specified. Using autodiscovery..." - source "$(dirname "$0")/autodiscover.sh" - - detect_nodes - if [ $? -ne 0 ]; then - echo "Error: Autodiscovery failed." - exit 1 - fi - - # Use PEER_NODES directly - if [ ${#PEER_NODES[@]} -gt 0 ]; then - COPY_HOSTS=("${PEER_NODES[@]}") - fi - - if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then - echo "Error: Autodiscovery found no other nodes." - exit 1 - fi - echo "Autodiscovered hosts: ${COPY_HOSTS[*]}" - fi continue ;; --copy-parallel) PARALLEL_COPY=true ;; -u|--user) SSH_USER="$2"; shift ;; + --config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;; -h|--help) usage ;; - *) + *) # If positional argument is provided if [ -z "${MODEL_NAME:-}" ]; then MODEL_NAME="$1" @@ -103,12 +86,47 @@ while [[ "$#" -gt 0 ]]; do shift done +# Export config so autodiscover.sh picks it up +export CONFIG_FILE CONFIG_FILE_SET + +# Source autodiscover.sh to load .env (for DOTENV_COPY_HOSTS) and make detection functions available +source "$(dirname "$0")/autodiscover.sh" + # Validate model name is provided if [ -z "${MODEL_NAME:-}" ]; then echo "Error: Model name is required." usage fi +# Resolve COPY_HOSTS if --copy-to was given without hosts, or use .env +if [ "$COPY_TO_FLAG" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then + # --copy-to was specified but no hosts given: use .env or autodiscover + if [[ -n "$DOTENV_COPY_HOSTS" ]]; then + echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS" + IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS" + COPY_HOSTS=("${HOSTS_FROM_ENV[@]}") + else + echo "No hosts specified. Using autodiscovery..." + detect_interfaces || { echo "Error: Interface detection failed."; exit 1; } + detect_local_ip || { echo "Error: Local IP detection failed."; exit 1; } + detect_nodes || { echo "Error: Node detection failed."; exit 1; } + detect_copy_hosts || { echo "Error: Copy host detection failed."; exit 1; } + + if [ "${#COPY_PEER_NODES[@]}" -gt 0 ]; then + COPY_HOSTS=("${COPY_PEER_NODES[@]}") + fi + + if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then + echo "Error: Autodiscovery found no other nodes." + exit 1 + fi + echo "Autodiscovered copy hosts: ${COPY_HOSTS[*]}" + fi +elif [ "$COPY_TO_FLAG" = false ] && [ "${#COPY_HOSTS[@]}" -eq 0 ] && [[ -n "$DOTENV_COPY_HOSTS" ]]; then + # No --copy-to flag but .env has COPY_HOSTS — don't auto-copy; user must request it explicitly + : # intentional no-op; user didn't ask for copy +fi + # Check if uvx is installed if ! command -v uvx &> /dev/null; then echo "Error: 'uvx' command not found." 
@@ -231,4 +249,4 @@ if [ "$COPY_TIME" -gt 0 ]; then fi echo "Total: $(printf '%02d:%02d:%02d' $((TOTAL_TIME/3600)) $((TOTAL_TIME%3600/60)) $((TOTAL_TIME%60)))" echo "=========================================" -echo "Done downloading $MODEL_NAME." \ No newline at end of file +echo "Done downloading $MODEL_NAME." diff --git a/launch-cluster.sh b/launch-cluster.sh index 4f3bcf6..1b267f9 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -68,7 +68,8 @@ usage() { echo " --mem-swap-limit-gb Memory+swap limit in GB (default: mem-limit + 10, only with --non-privileged)" echo " --pids-limit Process limit (default: 4096, only with --non-privileged)" echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)" - echo " --config Path to .env configuration file (default: .env in script directory)" + echo " --config Path to .env configuration file (default: .env in script directory) + --setup Force autodiscovery and save configuration (even if .env exists)" echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script." echo " command Command to run (only for 'exec' action). Not compatible with --launch-script." echo "" @@ -131,6 +132,7 @@ while [[ "$#" -gt 0 ]]; do -d) DAEMON_MODE="true" ;; -h|--help) usage ;; --config) CONFIG_FILE="$2"; shift ;; + --setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;; start|stop|status) if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script." @@ -411,6 +413,21 @@ done # Source autodiscover module source "$(dirname "$0")/autodiscover.sh" +if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then + # --setup: force full autodiscovery and save configuration + echo "Running full autodiscovery (--setup)..." + detect_interfaces || exit 1 + detect_local_ip || exit 1 + detect_nodes || exit 1 + detect_copy_hosts || exit 1 + save_config || exit 1 + # Reload .env so DOTENV_* variables reflect saved config + load_env_if_exists + [[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]] && NODES_ARG="$DOTENV_CLUSTER_NODES" + [[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]] && ETH_IF="$DOTENV_ETH_IF" + [[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]] && IB_IF="$DOTENV_IB_IF" +fi + if [[ "$SOLO_MODE" == "true" ]]; then # Solo mode: skip node detection, just get local IP # Use LOCAL_IP from .env if set, otherwise default to 127.0.0.1 diff --git a/run-recipe.py b/run-recipe.py index b33b33b..842f99b 100755 --- a/run-recipe.py +++ b/run-recipe.py @@ -547,7 +547,6 @@ def load_env_file() -> dict[str, str]: Reads the .env file created by --discover for persistent cluster configuration. EXTENSIBILITY: - - To add new persistent settings: Just add them to save_env_file() - To support multiple .env files: Add a --env-file CLI argument - To add validation: Check for required keys after loading @@ -573,57 +572,16 @@ def load_env_file() -> dict[str, str]: return env -def save_env_file(env: dict[str, str]) -> None: - """ - Save environment variables to .env file. - - Persists cluster configuration discovered by autodiscover.sh. - Values are properly quoted if they contain spaces or commas. 
- - EXTENSIBILITY: - - To add new persistent settings: Just add them to the env dict before calling - - To add timestamps/metadata: Add comment lines to the output - - To support append mode: Read existing, merge, then write - - Args: - env: Dictionary of key=value pairs to save - """ - lines = ["# Auto-generated by run-recipe.py --discover", ""] - for key, value in sorted(env.items()): - # Quote values with spaces - if " " in value or "," in value: - lines.append(f'{key}="{value}"') - else: - lines.append(f"{key}={value}") - lines.append("") - - with open(ENV_FILE, "w") as f: - f.write("\n".join(lines)) - - print(f"Saved to {ENV_FILE}") - - def run_autodiscover() -> dict[str, str] | None: """ - Run autodiscover.sh and return discovered configuration. + Run autodiscover.sh interactively and return discovered configuration. Executes the autodiscover.sh script to detect cluster topology, - then presents an interactive node selection menu. - - EXTENSIBILITY: - - To add new discovery methods: Extend autodiscover.sh or add Python detection here - - To add GPU detection: Add nvidia-smi parsing to discovered env - - To skip interactive selection: Add a --non-interactive flag - - To add node health checks: Ping/SSH test each discovered node - - DISCOVERED VARIABLES: - CLUSTER_NODES: Comma-separated list of node IPs (user-selected) - LOCAL_IP: This machine's IP address - ETH_IF: Ethernet interface name (e.g., 'eth0') - IB_IF: InfiniBand interface name (e.g., 'ibp12s0') if available + including interactive per-node confirmation and .env saving. + After autodiscover.sh completes, reads configuration from .env file. Returns: - Dictionary with discovered configuration, or None if discovery failed + Dictionary with discovered configuration from .env, or None if discovery failed """ if not AUTODISCOVER_SCRIPT.exists(): print(f"Error: Autodiscover script not found: {AUTODISCOVER_SCRIPT}") @@ -632,85 +590,28 @@ def run_autodiscover() -> dict[str, str] | None: print("Running autodiscover...") print() - # Run autodiscover in a subshell and capture the variables - # We source the script and print the variables we care about + # Build env for the subprocess so CONFIG_FILE is passed through + env_vars = os.environ.copy() + env_vars["CONFIG_FILE"] = str(ENV_FILE) + env_vars["CONFIG_FILE_SET"] = "true" + + # Run autodiscover interactively so its prompts are shown to the user script = f""" source '{AUTODISCOVER_SCRIPT}' - detect_interfaces - detect_local_ip - detect_nodes - echo "CLUSTER_NODES=$NODES_ARG" - echo "LOCAL_IP=$LOCAL_IP" - echo "ETH_IF=$ETH_IF" - echo "IB_IF=$IB_IF" + run_autodiscover """ - result = subprocess.run(["bash", "-c", script], capture_output=True, text=True) + result = subprocess.run(["bash", "-c", script], env=env_vars) if result.returncode != 0: - print("Autodiscover output:") - print(result.stdout) - if result.stderr: - print(result.stderr) print("Error: Autodiscover failed") return None - # Print the autodiscover output (excluding the final variable lines) - output_lines = result.stdout.strip().split("\n") - env = {} - for line in output_lines: - if "=" in line and any( - line.startswith(k) - for k in ["CLUSTER_NODES=", "LOCAL_IP=", "ETH_IF=", "IB_IF="] - ): - key, _, value = line.partition("=") - env[key] = value - else: - print(line) - - print() - - # Interactive node selection - if env.get("CLUSTER_NODES"): - all_nodes = [n.strip() for n in env["CLUSTER_NODES"].split(",") if n.strip()] - local_ip = env.get("LOCAL_IP", "") - - if len(all_nodes) > 1: - print("Select which nodes to 
include in the cluster:") - print() - - selected_nodes = [] - for node in all_nodes: - is_local = node == local_ip - label = f"{node} (this machine)" if is_local else node - - # Default to yes for all nodes - while True: - response = input(f" Include {label}? [Y/n]: ").strip().lower() - if response in ("", "y", "yes"): - selected_nodes.append(node) - break - elif response in ("n", "no"): - break - else: - print(" Please enter 'y' or 'n'") - - print() - - if not selected_nodes: - print("No nodes selected. Aborting.") - return None - - if len(selected_nodes) == 1: - print(f"Only one node selected: {selected_nodes[0]}") - print("This will run in solo mode (single node).") - else: - print( - f"Selected {len(selected_nodes)} nodes: {', '.join(selected_nodes)}" - ) - - env["CLUSTER_NODES"] = ",".join(selected_nodes) - print() + # Read configuration from the .env file that autodiscover.sh wrote + env = load_env_file() + if not env.get("CLUSTER_NODES"): + print("Autodiscover completed but no CLUSTER_NODES found in .env") + return None return env @@ -990,8 +891,6 @@ Examples: print(f" {key}={value}") print() - save_env_file(env) - if not args.recipe: return 0 @@ -1058,20 +957,6 @@ Examples: nodes = parse_nodes(discovered_env["CLUSTER_NODES"]) nodes_from_env = True - if nodes: - # Ask if user wants to save to .env - print() - response = ( - input( - "Save this configuration to .env for future use? [Y/n]: " - ) - .strip() - .lower() - ) - if response in ("", "y", "yes"): - save_env_file(discovered_env) - print() - # Resolve network interfaces: CLI > .env > auto-detect by launch-cluster.sh eth_if = args.eth_if or None ib_if = args.ib_if or None From f163ca69de2b382de1612427ece550034d9f4e06 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 26 Mar 2026 16:30:05 -0700 Subject: [PATCH 19/48] Autodiscover tweaks --- autodiscover.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/autodiscover.sh b/autodiscover.sh index a68c9d3..8cf21e1 100644 --- a/autodiscover.sh +++ b/autodiscover.sh @@ -182,6 +182,12 @@ detect_interfaces() { fi fi + # Export mesh NCCL settings directly so launch-cluster.sh picks them up + # even if the user declines to save config to .env + export DOTENV_CONTAINER_NCCL_NET_PLUGIN=none + export DOTENV_CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1 + export DOTENV_CONTAINER_NCCL_IB_MERGE_NICS=0 + else echo "Error: Unexpected number of active CX7 interfaces ($num_up). Expected 2 (non-mesh) or 4 (mesh)." return 1 @@ -411,6 +417,12 @@ save_config() { echo "LOCAL_IP=$LOCAL_IP" echo "ETH_IF=$ETH_IF" echo "IB_IF=$IB_IF" + if [[ "$MESH_MODE" == "true" ]]; then + echo "# Mesh mode NCCL settings" + echo "CONTAINER_NCCL_NET_PLUGIN=none" + echo "CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1" + echo "CONTAINER_NCCL_IB_MERGE_NICS=0" + fi } > "$env_file" echo "" echo "Saved to $env_file" From 00c16746e514ab85a557ca8210e689f892ebf095 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 26 Mar 2026 16:45:35 -0700 Subject: [PATCH 20/48] Handle new copy hosts setup in run-recipe.py --- run-recipe.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/run-recipe.py b/run-recipe.py index 842f99b..2c700ea 100755 --- a/run-recipe.py +++ b/run-recipe.py @@ -954,6 +954,7 @@ Examples: discovered_env = run_autodiscover() if discovered_env and discovered_env.get("CLUSTER_NODES"): + env = discovered_env # use freshly loaded env from autodiscover nodes = parse_nodes(discovered_env["CLUSTER_NODES"]) nodes_from_env = True @@ -1000,8 +1001,17 @@ Examples: print(f" 2. 
Remove nodes from .env: {sys.argv[0]} --show-env") return 1 - # Determine copy targets for cluster deployments - copy_targets = worker_nodes if is_cluster else None + # Determine copy targets for build/model distribution. + # Prefer COPY_HOSTS from .env (may differ from CLUSTER_NODES in mesh mode), + # fall back to worker_nodes derived from CLUSTER_NODES. + if is_cluster: + copy_hosts_str = env.get("COPY_HOSTS") + if copy_hosts_str: + copy_targets = [h.strip() for h in copy_hosts_str.split(",") if h.strip()] + else: + copy_targets = worker_nodes + else: + copy_targets = None if args.dry_run: print("=== Dry Run ===") From f872cc17a85fe60ae8d2f327c75f193a10feb6a4 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 26 Mar 2026 16:49:09 -0700 Subject: [PATCH 21/48] Fix for --setup behavior --- run-recipe.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/run-recipe.py b/run-recipe.py index 2c700ea..a9f8104 100755 --- a/run-recipe.py +++ b/run-recipe.py @@ -590,10 +590,12 @@ def run_autodiscover() -> dict[str, str] | None: print("Running autodiscover...") print() - # Build env for the subprocess so CONFIG_FILE is passed through + # Pass CONFIG_FILE so autodiscover.sh knows where to save the config. + # Do NOT set CONFIG_FILE_SET=true — that would cause an error if the file + # doesn't exist yet (it's the file we're about to create). env_vars = os.environ.copy() env_vars["CONFIG_FILE"] = str(ENV_FILE) - env_vars["CONFIG_FILE_SET"] = "true" + env_vars.pop("CONFIG_FILE_SET", None) # Run autodiscover interactively so its prompts are shown to the user script = f""" From ce293b5f0591ea834afed3c2ae213f50cfbd0d6d Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 26 Mar 2026 17:52:47 -0700 Subject: [PATCH 22/48] Additional checks for parallelism and cluster size --- launch-cluster.sh | 49 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/launch-cluster.sh b/launch-cluster.sh index 1b267f9..9ae1088 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -720,6 +720,32 @@ apply_mod_to_container() { fi } +# Parse -tp/-pp/-dp (and long forms) from a text string (command or script content). +# Sets TP_SIZE, PP_SIZE, DP_SIZE, PARALLELISM_FOUND globals. +# Only acts when at least one parallelism flag is present. +parse_parallelism_from_text() { + local text="$1" + TP_SIZE=1; PP_SIZE=1; DP_SIZE=1 + PARALLELISM_FOUND=false + + # Normalize --flag=value to --flag value for uniform word-by-word parsing + local normalized + normalized=$(echo "$text" | sed 's/\(--[a-z-]*\)=/\1 /g') + + local prev="" + for word in $normalized; do + case "$prev" in + -tp|--tensor-parallel-size) + [[ "$word" =~ ^[0-9]+$ ]] && TP_SIZE="$word" && PARALLELISM_FOUND=true ;; + -pp|--pipeline-parallel-size) + [[ "$word" =~ ^[0-9]+$ ]] && PP_SIZE="$word" && PARALLELISM_FOUND=true ;; + -dp|--data-parallel-size) + [[ "$word" =~ ^[0-9]+$ ]] && DP_SIZE="$word" && PARALLELISM_FOUND=true ;; + esac + prev="$word" + done +} + # Build a patched copy of the launch script on the host for a specific node. # Strips --distributed-executor-backend and appends multi-node args. # Prints the path of the temp file (caller must delete it). 
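To make the flag-walk in `parse_parallelism_from_text` concrete, here is a minimal standalone sketch of the same normalize-then-scan pass (the sample serve command is illustrative); the next hunk multiplies the three sizes into the required node count:

```bash
#!/bin/bash
# Demo of the normalize-then-scan pass used by parse_parallelism_from_text
text='vllm serve some-model --tensor-parallel-size=2 -pp 3'

# Normalize --flag=value into --flag value, then walk adjacent word pairs
normalized=$(echo "$text" | sed 's/\(--[a-z-]*\)=/\1 /g')
TP_SIZE=1; PP_SIZE=1; DP_SIZE=1
prev=""
for word in $normalized; do
    case "$prev" in
        -tp|--tensor-parallel-size)   [[ "$word" =~ ^[0-9]+$ ]] && TP_SIZE="$word" ;;
        -pp|--pipeline-parallel-size) [[ "$word" =~ ^[0-9]+$ ]] && PP_SIZE="$word" ;;
        -dp|--data-parallel-size)     [[ "$word" =~ ^[0-9]+$ ]] && DP_SIZE="$word" ;;
    esac
    prev="$word"
done

# tp=2 * pp=3 * dp=1 -> a 6-node cluster would be required
echo "required nodes: $(( TP_SIZE * PP_SIZE * DP_SIZE ))"
```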
@@ -965,6 +991,29 @@ exec_no_ray_cluster() { } if [[ "$ACTION" == "exec" ]]; then + # For --no-ray, trim (or error on) PEER_NODES based on declared parallelism + if [[ "$NO_RAY_MODE" == "true" ]]; then + if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then + cmd_text=$(cat "$LAUNCH_SCRIPT_PATH" 2>/dev/null || true) + else + cmd_text="$COMMAND_TO_RUN" + fi + parse_parallelism_from_text "$cmd_text" + + if [[ "$PARALLELISM_FOUND" == "true" ]]; then + required_nodes=$(( TP_SIZE * PP_SIZE * DP_SIZE )) + total_nodes=$(( 1 + ${#PEER_NODES[@]} )) + + if [[ "$required_nodes" -gt "$total_nodes" ]]; then + echo "Error: Command requires $required_nodes nodes (tp=$TP_SIZE * pp=$PP_SIZE * dp=$DP_SIZE) but only $total_nodes node(s) are configured." + exit 1 + elif [[ "$required_nodes" -lt "$total_nodes" ]]; then + echo "Note: Command requires $required_nodes node(s) (tp=$TP_SIZE * pp=$PP_SIZE * dp=$DP_SIZE); using $required_nodes of $total_nodes configured node(s)." + PEER_NODES=("${PEER_NODES[@]:0:$(( required_nodes - 1 ))}") + fi + fi + fi + start_cluster echo "Executing command: $COMMAND_TO_RUN" From c8ee2a25118fe6136d2720a3e3cc0a9d40f36b74 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 26 Mar 2026 18:15:09 -0700 Subject: [PATCH 23/48] Perform node count check in any mode --- launch-cluster.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/launch-cluster.sh b/launch-cluster.sh index 9ae1088..b20603e 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -991,8 +991,8 @@ exec_no_ray_cluster() { } if [[ "$ACTION" == "exec" ]]; then - # For --no-ray, trim (or error on) PEER_NODES based on declared parallelism - if [[ "$NO_RAY_MODE" == "true" ]]; then + # Trim (or error on) PEER_NODES based on declared parallelism, for any multi-node exec + if [[ "$SOLO_MODE" != "true" && ${#PEER_NODES[@]} -gt 0 ]]; then if [[ "$LAUNCH_SCRIPT_MODE" == "true" ]]; then cmd_text=$(cat "$LAUNCH_SCRIPT_PATH" 2>/dev/null || true) else From cecec7482896cd91d5c0a492904d7e7e73165136 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 26 Mar 2026 18:41:57 -0700 Subject: [PATCH 24/48] Add recipe for Qwen3.5-397B-INT4-Autoround in pipeline-parallel mode --- .../qwen3.5-397b-int4-autoround.yaml | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml diff --git a/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml b/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml new file mode 100644 index 0000000..bff1f23 --- /dev/null +++ b/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml @@ -0,0 +1,61 @@ +# Recipe: Qwen3.5-122B-A10B-INT4-Autoround +# Qwen3.5-122B model in Intel INT4-Autoround quantization +# Important: set memory utilization in GB, not percentage! Requires --no-ray to fit full context on two sparks. 
+# If you experience node shutdown, please limit GPU clocks on the affected node (or both): `sudo nvidia-smi -lgc 200,2150` + +recipe_version: "1" +name: Qwen3.5-397B-INT4-Autoround (PP=3) +description: Recipe for Qwen3.5-397B-INT4-Autoround to run on 3-node mesh in pipeline-parallel mode + +# HuggingFace model to download (optional, for --download-model) +model: Intel/Qwen3.5-397B-A17B-int4-AutoRound + +cluster_only: true + +# Container image to use +container: vllm-node-tf5 + +build_args: + - --tf5 + +# Mod required to fix ROPE syntax error +mods: + - mods/fix-qwen3.5-autoround + - mods/fix-qwen3.5-chat-template + +# Default settings (can be overridden via CLI) +defaults: + port: 8000 + host: 0.0.0.0 + pipeline_parellel: 3 + gpu_memory_utilization: 0.6 + max_model_len: 262144 + max_num_batched_tokens: 4176 + +# Environment variables +env: + PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True" + VLLM_MARLIN_USE_ATOMIC_ADD: 1 + +# The vLLM serve command template +command: | + vllm serve Intel/Qwen3.5-397B-A17B-int4-AutoRound \ + --max-model-len {max_model_len} \ + --max-num-seqs 10 \ + --kv-cache-dtype fp8 \ + --gpu-memory-utilization {gpu_memory_utilization} \ + --port {port} \ + --host {host} \ + --enable-prefix-caching \ + --enable-auto-tool-choice \ + --tool-call-parser qwen3_coder \ + --reasoning-parser qwen3 \ + --max-num-batched-tokens {max_num_batched_tokens} \ + --trust-remote-code \ + --chat-template unsloth.jinja \ + -tp 1 \ + -pp {pipeline_parallel} \ + --enable-expert-parallel \ + --distributed-executor-backend ray + + From 0fa585f909a93280a6423175fa1794b54eacf38f Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Thu, 26 Mar 2026 18:43:17 -0700 Subject: [PATCH 25/48] Fix typo in pipeline_parallel setting in Qwen3.5-397B-INT4-Autoround recipe --- recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml b/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml index bff1f23..238a129 100644 --- a/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml +++ b/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml @@ -27,7 +27,7 @@ mods: defaults: port: 8000 host: 0.0.0.0 - pipeline_parellel: 3 + pipeline_parallel: 3 gpu_memory_utilization: 0.6 max_model_len: 262144 max_num_batched_tokens: 4176 From 47a896d722fa43ec6dcc06e361635b465e6ce298 Mon Sep 17 00:00:00 2001 From: eugr Date: Thu, 26 Mar 2026 22:44:48 -0700 Subject: [PATCH 26/48] Removed expert-parallel from 3x-node Qwen --- recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml b/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml index 238a129..a208268 100644 --- a/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml +++ b/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml @@ -28,9 +28,9 @@ defaults: port: 8000 host: 0.0.0.0 pipeline_parallel: 3 - gpu_memory_utilization: 0.6 + gpu_memory_utilization: 0.7 max_model_len: 262144 - max_num_batched_tokens: 4176 + max_num_batched_tokens: 16384 # Environment variables env: @@ -55,7 +55,6 @@ command: | --chat-template unsloth.jinja \ -tp 1 \ -pp {pipeline_parallel} \ - --enable-expert-parallel \ --distributed-executor-backend ray From 51d69c5c173faaf6ab477dd77770afa3f9d3cd8b Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Fri, 27 Mar 2026 16:15:54 -0700 Subject: [PATCH 27/48] commenting out 
non-applicable PRs --- Dockerfile | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index c4c6b46..5767055 100644 --- a/Dockerfile +++ b/Dockerfile @@ -199,17 +199,17 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ # patch -p1 < fastsafetensors.patch; \ # fi # TEMPORARY PATCH for broken vLLM build (unguarded Hopper code) - reverting PR #34758 and #34302 -RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34758.diff | patch -p1 -R || echo "Cannot revert PR #34758, skipping" -RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34302.diff | patch -p1 -R || echo "Cannot revert PR #34302, skipping" -# TEMPORARY PATCH for broken NVFP4 quants -RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38126.diff -o pr38126.diff \ - && if git apply --reverse --check pr38126.diff 2>/dev/null; then \ - echo "Patch already applied, skipping."; \ - else \ - echo "Applying patch..."; \ - git apply -v pr38126.diff; \ - fi \ - && rm pr38126.diff +# RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34758.diff | patch -p1 -R || echo "Cannot revert PR #34758, skipping" +# RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34302.diff | patch -p1 -R || echo "Cannot revert PR #34302, skipping" +# # TEMPORARY PATCH for broken NVFP4 quants +# RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38126.diff -o pr38126.diff \ +# && if git apply --reverse --check pr38126.diff 2>/dev/null; then \ +# echo "Patch already applied, skipping."; \ +# else \ +# echo "Applying patch..."; \ +# git apply -v pr38126.diff; \ +# fi \ +# && rm pr38126.diff # Final Compilation RUN --mount=type=cache,id=ccache,target=/root/.ccache \ From c1a6cec074831103e26b950b277068b975ed28bd Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Fri, 27 Mar 2026 16:41:09 -0700 Subject: [PATCH 28/48] Updated documentation; default image tags in build script --- README.md | 205 ++++++++++++++++++++++++++++++++++++++++--- build-and-copy.sh | 14 ++- docs/NETWORKING.md | 211 ++++++++++++++++++++++++++++++++++++++++++++- recipes/README.md | 44 ++++++++-- 4 files changed, 451 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index ae8aca4..d10894a 100644 --- a/README.md +++ b/README.md @@ -52,8 +52,8 @@ Build the container. **On DGX Spark cluster:** -Make sure you connect your Sparks together and enable passwordless SSH as described in NVidia's [Connect Two Sparks Playbook](https://build.nvidia.com/spark/connect-two-sparks/stacked-sparks). -You can also check out our new [Networking Guide](docs/NETWORKING.md). +Make sure you connect your Sparks together and enable passwordless SSH as described in our [Networking Guide](docs/NETWORKING.md). You can also check out NVidia's [Connect Two Sparks Playbook](https://build.nvidia.com/spark/connect-two-sparks/stacked-sparks), but using our guide is the best way to get started. +**NEW**: the guide now includes instructions on setting up 3-node Spark mesh! Then run the following command that will build and distribute image across the cluster. @@ -127,8 +127,6 @@ This will run the model on all available cluster nodes. **Also:** You can use any vLLM container that has "bash" as its default entrypoint with the launch script. It was tested with NGC vLLM, but can work with others too. 
To use such a container in the cluster, you need to specify the `--apply-mod use-ngc-vllm` argument to `./launch-cluster.sh`. However, it's recommended to build the container using this repository for best compatibility and most up-to-date features.
 
-## CHANGELOG
-
 **IMPORTANT**
 
 You may want to prune your build cache every once in a while, especially if you've been using these container builds since the beginning.
@@ -149,6 +147,97 @@ Don't do it every time you rebuild, because it will slow down compilation times.
 
 For periodic maintenance, I recommend using a filter: `docker builder prune --filter until=72h`
 
+## CHANGELOG
+
+### 2026-03-27
+
+#### Default image tag in `build-and-copy.sh`
+
+`build-and-copy.sh` now automatically sets a sensible default image tag when `-t` is not specified:
+
+- `--tf5` / `--pre-tf` - tag defaults to `vllm-node-tf5`
+- `--exp-mxfp4` - tag defaults to `vllm-node-mxfp4`
+- in all other cases - tag defaults to `vllm-node` (no change)
+
+An explicit `-t <tag>` always takes precedence.
+
+#### Support for 3-node mesh setups
+
+Added initial support for setups where 3 Sparks are connected in a ring-like mesh without an additional switch.
+See the [Networking Guide](docs/NETWORKING.md) for instructions on how to connect and set up networking in such a cluster.
+
+Autodiscover function in both `launch-cluster.sh` and `run-recipe.sh` now can detect mesh setups and configure parameters accordingly.
+
+You can try running a model on all 3 nodes in pipeline-parallel configuration using the following recipe:
+
+```bash
+./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround --setup # you can drop --setup and --force-build on subsequent calls
+```
+
+Please note that `--tensor-parallel-size 3` or `-tp 3` is not supported by any commonly used model, so the only two viable options to utilize all three nodes for a single model are:
+
+- `--pipeline-parallel 3` will let you run a model that can't fit on dual Sparks, but without additional speed improvements (total throughput may improve though).
+- `--data-parallel 3` (possibly with `--enable-expert-parallel`) will let you run a model that can fit on a single Spark, while allowing for better concurrency.
+
+You can also run models with `--tensor-parallel 2` in a 3-node configuration - in this case only the first two nodes (from autodiscovery/.env or from the CLI parameters) will be utilized.
+
+#### GB10 Verification During Node Discovery
+
+Node discovery now confirms each SSH-reachable peer is a GB10 system before adding it to the cluster:
+only hosts reporting `NVIDIA GB10` are included. This prevents accidentally adding non-Spark machines that happen to be on the same subnet.
+
+#### Separate COPY_HOSTS Discovery
+
+Autodiscover now determines the host list used for image and model distribution separately from `CLUSTER_NODES`:
+
+- **Non-mesh**: `COPY_HOSTS` mirrors `CLUSTER_NODES` (no change in behaviour).
+- **Mesh**: scans the direct IB-attached `enp1s0f0np0` and `enp1s0f1np1` interfaces (not the OOB ETH interface), so large file transfers use the faster direct InfiniBand path.
+
+`COPY_HOSTS` is saved to `.env` and respected by `build-and-copy.sh`, `hf-download.sh`, and `run-recipe.py`.
+
+#### Interactive Configuration Save in `autodiscover.sh`
+
+`autodiscover.sh` now handles `.env` creation with a guided interactive flow, replacing the previous logic in `run-recipe.py`:
+
+- Runs automatically when `.env` is absent.
+- Asks per-node confirmation for both `CLUSTER_NODES` and `COPY_HOSTS`.
+- Skips if `.env` already exists (use `--setup` to force). + +`run-recipe.py` no longer contains its own `.env`-save prompt — it delegates entirely to `autodiscover.sh`. + +#### `--setup` Flag in `launch-cluster.sh` and `build-and-copy.sh` + +Both scripts now accept `--setup` to force a full autodiscovery run and overwrite the existing `.env`: + +```bash +./launch-cluster.sh --setup exec vllm serve ... +./build-and-copy.sh --setup -c +``` + +This is equivalent to the existing `--setup` in `run-recipe.sh`. + +#### `--config` Flag + +`hf-download.sh`, `build-and-copy.sh` and `launch-cluster.sh` now accept `--config ` to load a custom `.env` configuration file. `COPY_HOSTS` from the config is used for model distribution: + +```bash +./hf-download.sh QuantTrio/MiniMax-M2-AWQ --config /path/to/cluster.env -c --copy-parallel +``` + +#### Parallelism-Aware Node Trimming + +`launch-cluster.sh` now parses `-tp` / `--tensor-parallel-size`, `-pp` / `--pipeline-parallel-size`, and `-dp` / `--data-parallel-size` from the exec command or launch script and adjusts the active node count accordingly — for both Ray and no-Ray modes. + +- If **fewer nodes are needed** than configured, only the required nodes get containers started (excess nodes are left idle). +- If **more nodes are needed** than available, an error is raised before anything starts. + +``` +Note: Command requires 2 node(s) (tp=2 * pp=1 * dp=1); using 2 of 3 configured node(s). +Error: Command requires 4 nodes (tp=4 * pp=1 * dp=1) but only 3 node(s) are configured. +``` + +No flags required — the check is automatic whenever parallelism arguments are present in the command. + ### 2026-03-18 #### `--master-port` / `--head-port` Parameter @@ -591,7 +680,8 @@ See (this post on NVIDIA forums)[https://forums.developer.nvidia.com/t/make-glm- To use the mod, first build the container with Transformers 5 support (`--pre-tf`) flag, e.g.: ```bash -./build-and-copy.sh -t vllm-node-tf5 --pre-tf -c +# Image tag defaults to vllm-node-tf5 when --tf5/--pre-tf is used +./build-and-copy.sh --pre-tf -c ``` Then, to run on a single node: @@ -641,7 +731,8 @@ It is currently the fastest way to run GPT-OSS on DGX Spark, achieving 60 t/s on To use this build, first build the container with `--exp-mxfp4` flag. I recommend using a separate label as it is currently not recommended to use this build for models other than gpt-oss: ```bash -./build-and-copy.sh -t vllm-node-mxfp4 --exp-mxfp4 -c +# Image tag defaults to vllm-node-mxfp4 when --exp-mxfp4 is used +./build-and-copy.sh --exp-mxfp4 -c ``` Then, to run on a single Spark: @@ -885,7 +976,7 @@ Using a different username: | Flag | Description | | :--- | :--- | -| `-t, --tag ` | Image tag (default: `vllm-node`) | +| `-t, --tag ` | Image tag (default: `vllm-node`; auto-set to `vllm-node-tf5` with `--tf5`, `vllm-node-mxfp4` with `--exp-mxfp4`) | | `--gpu-arch ` | Target GPU architecture (default: `12.1a`) | | `--rebuild-flashinfer` | Skip prebuilt wheel download; force a fresh local FlashInfer build | | `--rebuild-vllm` | Force rebuild vLLM from source | @@ -900,9 +991,13 @@ Using a different username: | `-u, --user ` | Username for SSH connection (default: current user) | | `--full-log` | Enable full Docker build output (`--progress=plain`) | | `--no-build` | Skip building, only copy existing image (requires `--copy-to`) | +| `--network ` | Docker network to use during build (e.g. `host`). | +| `--cleanup` | Remove all cached `.whl` and `*-commit` files from the `wheels/` directory. 
| +| `--config ` | Path to `.env` configuration file (default: `.env` in script directory) | +| `--setup` | Force autodiscovery and save configuration to `.env` (even if `.env` already exists) | | `-h, --help` | Show help message | -**IMPORTANT**: When copying to another node, make sure you use the Spark IP assigned to its ConnectX 7 interface (enp1s0f1np1), and not the 10G interface (enP7s7)! If you omit the IP address and use `-c` without addresses, it will use autodiscovery to detect a proper IP address. +**IMPORTANT**: When copying to another node manually, use the IP assigned to a ConnectX 7 interface (`enp1s0f*`), not the 10G/wireless interfaces. When using `-c` without addresses, autodiscovery selects the correct interface automatically — in mesh mode it uses the direct IB-attached interfaces (`enp1s0f0np0`, `enp1s0f1np1`) for maximum transfer speed. ### Copying the container to another Spark node (Manual Method) @@ -971,9 +1066,12 @@ Assumptions and limitations: ### Auto-Detection The script attempts to automatically detect: -* **Ethernet Interface:** The interface associated with the active InfiniBand device that has an IP address. -* **InfiniBand Interface:** The active InfiniBand devices. By default both active RoCE interfaces that correspond to active IB port(s) will be utilized. -* **Node Role:** Based on the detected IP address and the list of nodes (defaults to `192.168.177.11` as head and `192.168.177.12` as worker). +* **Ethernet Interface (`ETH_IF`):** Determined by the number of active CX7 interfaces: + - **2 active** (standard): the `enp*` interface (no capital P) that has an IP address. + - **4 active** (mesh topology): `enP7s7` (preferred) or `wlP9s9` (wireless, shown with a warning) — the cluster coordination interface is separate from the CX7 ports in this configuration. +* **InfiniBand Interface (`IB_IF`):** All active RoCE devices. In mesh mode this is always `rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1`. +* **Cluster peers:** Discovered by scanning the `ETH_IF` subnet for hosts with SSH access **and** a GB10 GPU (`nvidia-smi --query-gpu=name` must return `NVIDIA GB10`). +* **Copy hosts (`COPY_HOSTS`):** In standard mode, same as cluster peers. In mesh mode, scanned separately on `enp1s0f0np0` and `enp1s0f1np1` subnets so that image/model transfers use the direct InfiniBand path. ### Manual Overrides @@ -1006,6 +1104,10 @@ You can override the auto-detected values if needed: | `--mem-swap-limit-gb` | Memory+swap limit in GB (default: mem-limit + 10, only with `--non-privileged`). | | `--pids-limit` | Process limit (default: 4096, only with `--non-privileged`). | | `--shm-size-gb` | Shared memory size in GB (default: 64, only with `--non-privileged`). | +| `--config ` | Path to `.env` configuration file (default: `.env` in script directory). | +| `--setup` | Force autodiscovery and save configuration to `.env` (even if `.env` already exists). | +| `start \| stop \| status \| exec` | Action to perform (default: `start`). Not compatible with `--launch-script`. | +| `command` | Command to execute inside the container (only for `exec` action). | ### Non-Privileged Mode @@ -1149,6 +1251,61 @@ You need to make sure you allocate IP addresses to them (no need to allocate IP ## 5\. Configuration Details +### Cluster Configuration (`.env` file) + +The scripts share a `.env` file (default: `.env` in the repo directory) for persistent cluster configuration. 
It is created automatically by autodiscovery — run `--discover` (via `run-recipe.sh`) or `--setup` (via `launch-cluster.sh` / `build-and-copy.sh`) on first use. + +**Supported variables:** + +| Variable | Description | +| :--- | :--- | +| `CLUSTER_NODES` | Comma-separated node IPs used for Ray/vLLM cluster (head node first). | +| `COPY_HOSTS` | Comma-separated node IPs used for image and model distribution. In mesh mode these are the IPs on the direct IB-attached interfaces, which may differ from `CLUSTER_NODES`. | +| `LOCAL_IP` | IP address of the local node. | +| `ETH_IF` | Ethernet interface for cluster coordination (e.g. `enp1s0f1np1` or `enP7s7`). | +| `IB_IF` | Comma-separated RoCE/IB device names (e.g. `rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1`). | +| `CONTAINER_*` | Any variable prefixed with `CONTAINER_` (except `CONTAINER_NAME`) is passed as `-e VAR=VALUE` to the container. Example: `CONTAINER_NCCL_DEBUG=INFO` → `-e NCCL_DEBUG=INFO`. | + +**Mesh-mode NCCL variables** (written automatically when mesh topology is detected): + +``` +CONTAINER_NCCL_NET_PLUGIN=none +CONTAINER_NCCL_IB_SUBNET_AWARE_ROUTING=1 +CONTAINER_NCCL_IB_MERGE_NICS=0 +``` + +**Example `.env` for a standard 2-node cluster:** + +``` +CLUSTER_NODES=192.168.177.11,192.168.177.12 +COPY_HOSTS=192.168.177.12 +LOCAL_IP=192.168.177.11 +ETH_IF=enp1s0f1np1 +IB_IF=rocep1s0f1,roceP2p1s0f1 +``` + +To use a custom config file path, pass `--config /path/to/file.env` to any script. + +### Autodiscovery Workflow + +On first run, if no `.env` is present, the scripts will automatically trigger autodiscovery. You can also run it explicitly: + +```bash +# Via run-recipe.sh +./run-recipe.sh --discover + +# Via launch-cluster.sh or build-and-copy.sh (force re-run even if .env exists) +./launch-cluster.sh --setup exec vllm serve ... +./build-and-copy.sh --setup -c +``` + +Autodiscovery: +1. Detects active CX7 interfaces and determines mesh vs. standard topology. +2. Scans the network for SSH-reachable GB10 peers. +3. In mesh mode, separately discovers `COPY_HOSTS` on direct IB-attached interfaces. +4. Prompts for per-node confirmation for both `CLUSTER_NODES` and `COPY_HOSTS`. +5. Saves the result to `.env`. + ### Environment Persistence The script automatically appends exported variables to `~/.bashrc`. If you need to open a second terminal into the running container for debugging, simply run: @@ -1322,6 +1479,32 @@ The `hf-download.sh` script provides a convenient way to download models from Hu ./hf-download.sh -c --copy-parallel QuantTrio/MiniMax-M2-AWQ ``` +**Use nodes from `.env` (respects `COPY_HOSTS`):** + +```bash +./hf-download.sh -c QuantTrio/MiniMax-M2-AWQ +``` + +When `-c` is given without explicit hosts, the script checks `COPY_HOSTS` in `.env` first, then falls back to autodiscovery. In mesh mode this means transfers go over the direct IB-attached interfaces automatically. + +**Use a custom config file:** + +```bash +./hf-download.sh --config /path/to/cluster.env -c QuantTrio/MiniMax-M2-AWQ +``` + +**Available options:** + +| Flag | Description | +| :--- | :--- | +| `` | HuggingFace model ID (e.g. `QuantTrio/MiniMax-M2-AWQ`). Required. | +| `-c, --copy-to ` | Host(s) to copy the model to after download (space- or comma-separated). Omit hosts to use `COPY_HOSTS` from `.env` or autodiscovery. | +| `--copy-to-host` | Alias for `--copy-to` (backwards compatibility). | +| `--copy-parallel` | Copy to all hosts concurrently instead of serially. | +| `-u, --user ` | SSH username for remote copies (default: current user). 
|
+| `--config <file>` | Path to `.env` configuration file (default: `.env` in script directory). |
+| `-h, --help` | Show help message. |
 
 ### Hardware Architecture
 
 **Note:** This project targets `12.1a` architecture (NVIDIA GB10 / DGX Spark). If you are using different hardware, you can use `--gpu-arch` flag in `./build-and-copy.sh`.
diff --git a/build-and-copy.sh b/build-and-copy.sh
index 90d6a27..804bab9 100755
--- a/build-and-copy.sh
+++ b/build-and-copy.sh
@@ -6,6 +6,7 @@ START_TIME=$(date +%s)
 
 # Default values
 IMAGE_TAG="vllm-node"
+IMAGE_TAG_SET=false
 REBUILD_FLASHINFER=false
 REBUILD_VLLM=false
 COPY_HOSTS=()
@@ -264,7 +265,7 @@ if downloads:
 # Help function
 usage() {
     echo "Usage: $0 [OPTIONS]"
-    echo "  -t, --tag <tag>          : Image tag (default: 'vllm-node')"
+    echo "  -t, --tag <tag>          : Image tag (default: 'vllm-node', 'vllm-node-tf5' with --tf5, 'vllm-node-mxfp4' with --exp-mxfp4)"
     echo "  --gpu-arch <arch>        : GPU architecture (default: '12.1a')"
     echo "  --rebuild-flashinfer     : Force rebuild of FlashInfer wheels (ignore cached wheels)"
     echo "  --rebuild-vllm           : Force rebuild of vLLM wheels (ignore cached wheels)"
@@ -291,7 +292,7 @@ usage()
 CONFIG_FILE_SET=false
 while [[ "$#" -gt 0 ]]; do
     case $1 in
-        -t|--tag) IMAGE_TAG="$2"; shift ;;
+        -t|--tag) IMAGE_TAG="$2"; IMAGE_TAG_SET=true; shift ;;
         --gpu-arch) GPU_ARCH_LIST="$2"; shift ;;
         --rebuild-flashinfer) REBUILD_FLASHINFER=true ;;
         --rebuild-vllm) REBUILD_VLLM=true ;;
@@ -342,6 +343,15 @@ while [[ "$#" -gt 0 ]]; do
     shift
 done
 
+# Apply default IMAGE_TAG based on flags if -t was not specified
+if [ "$IMAGE_TAG_SET" = false ]; then
+    if [ "$PRE_TRANSFORMERS" = true ]; then
+        IMAGE_TAG="vllm-node-tf5"
+    elif [ "$EXP_MXFP4" = true ]; then
+        IMAGE_TAG="vllm-node-mxfp4"
+    fi
+fi
+
 # Source autodiscover.sh to load .env file
 source "$(dirname "$0")/autodiscover.sh"
 
diff --git a/docs/NETWORKING.md b/docs/NETWORKING.md
index 2cd4412..734592d 100644
--- a/docs/NETWORKING.md
+++ b/docs/NETWORKING.md
@@ -42,13 +42,54 @@ However, in order to get full bandwidth in NCCL RDMA mode, we need to utilize **
 
 Also, note that connecting two Sparks using **both** ports won't give you any noticeable advantage in bandwidth, so single connection is sufficient.
 If you connect 3 Sparks by daisy-chaining them, you will only be able to sustain 100G between each pair of Sparks.
 
-## Connecting more than 2 Sparks in the cluster
+## Connecting 3 Sparks in a mesh cluster without a switch
+
+Three Sparks can be connected together in a cluster without using a separate RoCE switch.
+However, all three Sparks need to be on the same wired network using their 10G Ethernet ports (RJ-45, not QSFP). Being on the same wireless network should work too, but it's not recommended and was not tested.
+
+You need to make sure they are connected the following way: port 0 on one Spark should connect to port 1 on another Spark (unlike non-mesh configuration).
+Example diagram:
+
+```mermaid
+block-beta
+  columns 1
+
+  block:Spark3
+    columns 2
+    Title3["Spark 3"]:2
+    s3p0["Port 0
192.168.187.13
192.168.188.13"] s3p1["Port 1
192.168.197.13
192.168.198.13"] + end + + space + + block:Spark2 + columns 2 + Title2["Spark 2"]:2 + s2p0["Port 0
192.168.197.12
192.168.198.12"] s2p1["Port 1
192.168.177.12
192.168.178.13"] + end + + space + + block:Spark1 + columns 2 + Title1["Spark 1"]:2 + s1p0["Port 0
192.168.177.11
192.168.178.11"] s1p1["Port 1
192.168.187.11
192.168.188.11"] + end + + s1p0 <--> s2p1 + s2p0 <--> s3p1 + s3p0 <--> s1p1 +``` + +## Connecting more than 2 Sparks in the cluster using a switch To connect more than 2 Sparks, you will need a proper switch, for example [Microtik CRS812-DDQ](https://mikrotik.com/product/crs812_ddq). Please refer to [this post](https://forums.developer.nvidia.com/t/6x-spark-setup/354399/56) for an example of setting up a 6-8 node Spark cluster. ## Network setup +### For dual Sparks or multiple Sparks using a QSFP switch + Assuming both are connected using rightmost QFSP port (when looking from the back). Create `/etc/netplan/40-cx7.yaml` on `spark`: @@ -115,6 +156,122 @@ MTU setting (testing): sudo ip link set dev enp1s0f1np1 mtu 9000 ``` +### For 3-node mesh + +3-node mesh is configured differently than dual clusters or clusters using a QSFP switch. + +Assuming, your Sparks are connected according to the diagram above: + +Create `/etc/netplan/40-cx7.yaml` on `spark1`: +```yaml +network: + version: 2 + ethernets: + enp1s0f0np0: + dhcp4: no + dhcp6: no # Explicitly disable DHCPv6 + link-local: [] # Restrict link-local addresses to static IPv4 only + mtu: 9000 + addresses: [192.168.177.11/24] + enP2p1s0f0np0: + dhcp4: no + dhcp6: no + link-local: [] + mtu: 9000 + addresses: [192.168.178.11/24] + enp1s0f1np1: + dhcp4: no + dhcp6: no # Explicitly disable DHCPv6 + link-local: [] # Restrict link-local addresses to static IPv4 only + mtu: 9000 + addresses: [192.168.187.11/24] + enP2p1s0f1np1: + dhcp4: no + dhcp6: no + link-local: [] + mtu: 9000 + addresses: [192.168.188.11/24] +``` + +Create `/etc/netplan/40-cx7.yaml` on `spark2`: +```yaml +network: + version: 2 + ethernets: + enp1s0f0np0: + dhcp4: no + dhcp6: no # Explicitly disable DHCPv6 + link-local: [] # Restrict link-local addresses to static IPv4 only + mtu: 9000 + addresses: [192.168.197.12/24] + enP2p1s0f0np0: + dhcp4: no + dhcp6: no + link-local: [] + mtu: 9000 + addresses: [192.168.198.12/24] + enp1s0f1np1: + dhcp4: no + dhcp6: no # Explicitly disable DHCPv6 + link-local: [] # Restrict link-local addresses to static IPv4 only + mtu: 9000 + addresses: [192.168.177.12/24] + enP2p1s0f1np1: + dhcp4: no + dhcp6: no + link-local: [] + mtu: 9000 + addresses: [192.168.178.12/24] +``` + +Create `/etc/netplan/40-cx7.yaml` on `spark3`: +```yaml +network: + version: 2 + ethernets: + enp1s0f0np0: + dhcp4: no + dhcp6: no # Explicitly disable DHCPv6 + link-local: [] # Restrict link-local addresses to static IPv4 only + mtu: 9000 + addresses: [192.168.187.13/24] + enP2p1s0f0np0: + dhcp4: no + dhcp6: no + link-local: [] + mtu: 9000 + addresses: [192.168.188.13/24] + enp1s0f1np1: + dhcp4: no + dhcp6: no # Explicitly disable DHCPv6 + link-local: [] # Restrict link-local addresses to static IPv4 only + mtu: 9000 + addresses: [192.168.197.13/24] + enP2p1s0f1np1: + dhcp4: no + dhcp6: no + link-local: [] + mtu: 9000 + addresses: [192.168.198.13/24] +``` + +Then run (on each Spark): + +```bash +sudo chmod 600 /etc/netplan/40-cx7.yaml +sudo netplan apply +``` + +### Passwordless SSH and benchmarks + +Set up passwordless ssh. 
On the first Spark:
+
+```bash
+wget https://raw.githubusercontent.com/NVIDIA/dgx-spark-playbooks/refs/heads/main/nvidia/connect-two-sparks/assets/discover-sparks
+chmod +x discover-sparks
+./discover-sparks
+```
+
 **Benchmark connection (use perftest package):**
 
 Run the receiver on `spark2` node:
@@ -196,7 +353,9 @@ ib_write_lat 192.168.177.12 -d rocep1s0f1 --report_gbits -R --force-link IB
 ---------------------------------------------------------------------------------------
 ```
 
-## NCCL Setup
+## NCCL Tests
+
+### Dual Sparks or Sparks via QSFP switch
 
 From https://build.nvidia.com/spark/nccl/stacked-sparks
 
@@ -240,3 +399,51 @@ mpirun -np 2 -H 192.168.177.11:1,192.168.177.12:1 \
   $HOME/nccl-tests/build/all_gather_perf -b 16G -e 16G -f 2
 
 ```
+
+### 3-node mesh
+
+```bash
+# Install dependencies and build NCCL
+sudo apt-get update && sudo apt-get install -y libopenmpi-dev
+git clone -b dgxspark-3node-ring https://github.com/zyang-dev/nccl.git ~/nccl/
+cd ~/nccl/
+make -j src.build NVCC_GENCODE="-gencode=arch=compute_121,code=sm_121"
+
+# Set environment variables
+export CUDA_HOME="/usr/local/cuda"
+export MPI_HOME="/usr/lib/aarch64-linux-gnu/openmpi"
+export NCCL_HOME="$HOME/nccl/build/"
+export LD_LIBRARY_PATH="$NCCL_HOME/lib:$CUDA_HOME/lib64/:$MPI_HOME/lib:$LD_LIBRARY_PATH"
+```
+
+Build NCCL Test Suite:
+
+```bash
+# Clone and build NCCL tests
+git clone https://github.com/NVIDIA/nccl-tests.git ~/nccl-tests/
+cd ~/nccl-tests/
+make MPI=1
+```
+
+Test on all three nodes (replace spark1, spark2, spark3 with the actual hostnames or IP addresses on the non-QSFP interface):
+
+```bash
+# Set environment variables
+export CUDA_HOME="/usr/local/cuda"
+export MPI_HOME="/usr/lib/aarch64-linux-gnu/openmpi"
+export NCCL_HOME="$HOME/nccl/build/"
+export LD_LIBRARY_PATH="$NCCL_HOME/lib:$CUDA_HOME/lib64/:$MPI_HOME/lib:$LD_LIBRARY_PATH"
+
+# For 3-node mesh we have to use 10G interface for OOB communication!
+export UCX_NET_DEVICES=enP7s7
+export NCCL_SOCKET_IFNAME=enP7s7
+export OMPI_MCA_btl_tcp_if_include=enP7s7
+export NCCL_IB_HCA=rocep1s0f0,roceP2p1s0f0,rocep1s0f1,roceP2p1s0f1
+export NCCL_IB_DISABLE=0
+
+# Run the all_gather performance test across all three nodes
+mpirun -np 3 -H spark1:1,spark2:1,spark3:1 \
+  --mca plm_rsh_agent "ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" \
+  -x LD_LIBRARY_PATH=$LD_LIBRARY_PATH -x NCCL_IB_MERGE_NICS=0 -x NCCL_NET_PLUGIN=none -x NCCL_IB_SUBNET_AWARE_ROUTING=1 \
+  $HOME/nccl-tests/build/all_gather_perf -b 16G -e 16G -f 3
+```
\ No newline at end of file
diff --git a/recipes/README.md b/recipes/README.md
index 4f795ca..8ab6a76 100644
--- a/recipes/README.md
+++ b/recipes/README.md
@@ -44,12 +44,16 @@ The recipe runner can automatically discover cluster nodes:
 ```
 
 When you run `--discover`, it:
-1. Scans the network for nodes with SSH access
-2. Prompts you to select which nodes to include
-3. Saves the configuration to `.env`
+1. Detects active CX7 interfaces and determines mesh vs. standard topology.
+2. Scans the network for peers that are both SSH-reachable **and** have an NVIDIA GB10 GPU.
+3. In mesh mode, separately discovers `COPY_HOSTS` on the direct IB-attached interfaces.
+4. Prompts for per-node confirmation for `CLUSTER_NODES` and `COPY_HOSTS`.
+5. Saves the full configuration (including mesh NCCL settings if applicable) to `.env`.
 
 Future recipe runs will automatically use nodes from `.env` unless you specify `-n` or `--solo`.
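+
+For example, to override the saved nodes for a single run (IPs below are illustrative):
+
+```bash
+./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround -n 192.168.177.11,192.168.177.12,192.168.177.13
+```
+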
+When distributing the container image or model files, the runner uses `COPY_HOSTS` from `.env` (which may differ from `CLUSTER_NODES` in mesh mode) to ensure transfers go over the fastest available path. + ## Workflow Modes ### Solo Mode (Single Node) @@ -169,6 +173,7 @@ Usage: ./run-recipe.sh [OPTIONS] [RECIPE] Cluster discovery: --discover Auto-detect cluster nodes and save to .env --show-env Show current .env configuration + --config FILE Path to .env configuration file (default: .env in repo directory) Recipe overrides: --port PORT Override port @@ -186,10 +191,25 @@ Setup options: Launch options: --solo Run in solo mode (single node, no Ray) + --no-ray Multi-node without Ray (PyTorch distributed backend) -n, --nodes IPS Comma-separated node IPs (first = head) -d, --daemon Run in daemon mode -t, --container IMAGE Override container from recipe + --name NAME Override container name --nccl-debug LEVEL NCCL debug level (VERSION, WARN, INFO, TRACE) + --master-port PORT Cluster coordination port: Ray head port or PyTorch + distributed master port (default: 29501). + Alias: --head-port + --eth-if IFACE Override Ethernet interface + --ib-if IFACE Override InfiniBand interface + -e VAR=VALUE Pass environment variable to container (repeatable) + -j N Number of parallel build jobs + --no-cache-dirs Do not mount ~/.cache/vllm, ~/.cache/flashinfer, ~/.triton + --non-privileged Run container without --privileged + --mem-limit-gb N Memory limit in GB (only with --non-privileged) + --mem-swap-limit-gb N Memory+swap limit in GB (only with --non-privileged) + --pids-limit N Process limit (only with --non-privileged) + --shm-size-gb N Shared memory size in GB (only with --non-privileged) Extra vLLM arguments: -- ARGS... Pass additional arguments directly to vLLM @@ -261,10 +281,18 @@ command: | ``` ┌─────────────────────────────────────────────────────────┐ +│ autodiscover.sh │ +│ - Interface detection (standard / mesh topology) │ +│ - GB10 peer verification via SSH │ +│ - CLUSTER_NODES and COPY_HOSTS discovery │ +│ - Interactive .env save with per-node confirmation │ +└──────────────────────────┬──────────────────────────────┘ + │ sourced by + ▼ +┌─────────────────────────────────────────────────────────┐ │ run-recipe.sh / run-recipe.py │ │ - Parses YAML recipe │ -│ - Auto-discovers cluster nodes (--discover) │ -│ - Loads nodes from .env │ +│ - Loads / triggers cluster discovery (--discover) │ │ - Handles --setup (build + download + run) │ │ - Generates launch script from template │ │ - Applies CLI overrides │ @@ -274,15 +302,15 @@ command: | ┌──────────────────────┐ ┌───────────────────────────────┐ │ build-and-copy.sh │ │ hf-download.sh │ │ - Docker build │ │ - HuggingFace model download │ -│ - Copy to workers │ │ - Rsync to workers │ +│ - Copy to COPY_HOSTS│ │ - Rsync to COPY_HOSTS │ └──────────────────────┘ └───────────────────────────────┘ - │ + │ │ then calls (for run) ▼ ┌─────────────────────────────────────────────────────────┐ │ launch-cluster.sh │ │ - Cluster orchestration │ -│ - Container lifecycle │ +│ - Container lifecycle (trimmed to required node count) │ │ - Mod application │ │ - Launch script execution │ └─────────────────────────────────────────────────────────┘ From e70c87b4f63cd846af2a7a7eb80d91308f7bb797 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Sat, 28 Mar 2026 08:50:54 -0700 Subject: [PATCH 29/48] Added PR38423 (temp) --- Dockerfile | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5767055..88621eb 
100644 --- a/Dockerfile +++ b/Dockerfile @@ -201,15 +201,15 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ # TEMPORARY PATCH for broken vLLM build (unguarded Hopper code) - reverting PR #34758 and #34302 # RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34758.diff | patch -p1 -R || echo "Cannot revert PR #34758, skipping" # RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34302.diff | patch -p1 -R || echo "Cannot revert PR #34302, skipping" -# # TEMPORARY PATCH for broken NVFP4 quants -# RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38126.diff -o pr38126.diff \ -# && if git apply --reverse --check pr38126.diff 2>/dev/null; then \ -# echo "Patch already applied, skipping."; \ -# else \ -# echo "Applying patch..."; \ -# git apply -v pr38126.diff; \ -# fi \ -# && rm pr38126.diff +# # TEMPORARY PATCH for broken compilation +RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38423.diff -o pr38423.diff \ + && if git apply --reverse --check pr38423.diff 2>/dev/null; then \ + echo "Patch already applied, skipping."; \ + else \ + echo "Applying patch..."; \ + git apply -v pr38423.diff; \ + fi \ + && rm pr38423.diff # Final Compilation RUN --mount=type=cache,id=ccache,target=/root/.ccache \ From d37217bad06bb3c092a4d4c9ab788413b6b9a725 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Sat, 28 Mar 2026 09:22:19 -0700 Subject: [PATCH 30/48] moved PR patch before the requirements patching --- Dockerfile | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 88621eb..ac5f8fb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -182,6 +182,16 @@ RUN if [ -n "$VLLM_PRS" ]; then \ done; \ fi +# TEMPORARY PATCH for broken compilation +RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38423.diff -o pr38423.diff \ + && if git apply --reverse --check pr38423.diff 2>/dev/null; then \ + echo "Patch already applied, skipping."; \ + else \ + echo "Applying patch..."; \ + git apply -v pr38423.diff; \ + fi \ + && rm pr38423.diff + # Prepare build requirements RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ python3 use_existing_torch.py && \ @@ -201,15 +211,6 @@ RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ # TEMPORARY PATCH for broken vLLM build (unguarded Hopper code) - reverting PR #34758 and #34302 # RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34758.diff | patch -p1 -R || echo "Cannot revert PR #34758, skipping" # RUN curl -L https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/34302.diff | patch -p1 -R || echo "Cannot revert PR #34302, skipping" -# # TEMPORARY PATCH for broken compilation -RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38423.diff -o pr38423.diff \ - && if git apply --reverse --check pr38423.diff 2>/dev/null; then \ - echo "Patch already applied, skipping."; \ - else \ - echo "Applying patch..."; \ - git apply -v pr38423.diff; \ - fi \ - && rm pr38423.diff # Final Compilation RUN --mount=type=cache,id=ccache,target=/root/.ccache \ From 47f5f931b5031b97d876815e2fd1a8c7dbcb2480 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Sat, 28 Mar 2026 14:55:31 -0700 Subject: [PATCH 31/48] Allow to specify config file when doing setup --- autodiscover.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autodiscover.sh b/autodiscover.sh 
index 8cf21e1..58b66c5 100644 --- a/autodiscover.sh +++ b/autodiscover.sh @@ -15,7 +15,8 @@ load_env_if_exists() { fi # Validate config file exists if explicitly specified - if [[ "$config_explicit" == "true" ]] && [[ ! -f "$env_file" ]]; then + # Exception: if --setup is also specified, the file will be created by the setup procedure + if [[ "$config_explicit" == "true" ]] && [[ ! -f "$env_file" ]] && [[ "${FORCE_DISCOVER:-false}" != "true" ]]; then echo "Error: Config file not found: $env_file" exit 1 fi From 32674c26199b746932d4ae156e2c0b3a0a242e84 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Sat, 28 Mar 2026 17:49:17 -0700 Subject: [PATCH 32/48] removed temporary patch as it causes more issues. --- Dockerfile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index ac5f8fb..cabd991 100644 --- a/Dockerfile +++ b/Dockerfile @@ -183,14 +183,14 @@ RUN if [ -n "$VLLM_PRS" ]; then \ fi # TEMPORARY PATCH for broken compilation -RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38423.diff -o pr38423.diff \ - && if git apply --reverse --check pr38423.diff 2>/dev/null; then \ - echo "Patch already applied, skipping."; \ - else \ - echo "Applying patch..."; \ - git apply -v pr38423.diff; \ - fi \ - && rm pr38423.diff +# RUN curl -fsL https://patch-diff.githubusercontent.com/raw/vllm-project/vllm/pull/38423.diff -o pr38423.diff \ +# && if git apply --reverse --check pr38423.diff 2>/dev/null; then \ +# echo "Patch already applied, skipping."; \ +# else \ +# echo "Applying patch..."; \ +# git apply -v pr38423.diff; \ +# fi \ +# && rm pr38423.diff # Prepare build requirements RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ From e471ca24365db4fbcc5fad5e5791a94fdcbbc913 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Sat, 28 Mar 2026 18:12:32 -0700 Subject: [PATCH 33/48] Don't copy if `-c` is not specified --- build-and-copy.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/build-and-copy.sh b/build-and-copy.sh index 804bab9..064990e 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -10,6 +10,7 @@ IMAGE_TAG_SET=false REBUILD_FLASHINFER=false REBUILD_VLLM=false COPY_HOSTS=() +COPY_TO_FLAG=false SSH_USER="$USER" NO_BUILD=false VLLM_REF="main" @@ -298,6 +299,7 @@ while [[ "$#" -gt 0 ]]; do --rebuild-vllm) REBUILD_VLLM=true ;; --vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;; -c|--copy-to|--copy-to-host|--copy-to-hosts) + COPY_TO_FLAG=true shift while [[ "$#" -gt 0 && "$1" != -* ]]; do add_copy_hosts "$1" @@ -367,8 +369,8 @@ if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then load_env_if_exists fi -# Handle COPY_HOSTS from .env or autodiscovery if not specified via arguments -if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then +# Handle COPY_HOSTS from .env or autodiscovery only if -c was explicitly specified +if [ "$COPY_TO_FLAG" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then if [[ -n "$DOTENV_COPY_HOSTS" ]]; then echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS" IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS" From a3201f88734de40a64eaf0487400502828e46869 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Sun, 29 Mar 2026 22:40:35 -0700 Subject: [PATCH 34/48] --flashinfer-ref / --apply-flashinfer-pr --- Dockerfile | 10 ++++++++++ README.md | 15 +++++++++++++++ build-and-copy.sh | 48 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 70 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index cabd991..4b22f61 100644 
--- a/Dockerfile +++ b/Dockerfile @@ -110,6 +110,16 @@ RUN --mount=type=cache,id=repo-cache,target=/repo-cache \ WORKDIR /workspace/flashinfer +ARG FLASHINFER_PRS="" + +RUN if [ -n "$FLASHINFER_PRS" ]; then \ + echo "Applying PRs: $FLASHINFER_PRS"; \ + for pr in $FLASHINFER_PRS; do \ + echo "Fetching and applying PR #$pr..."; \ + curl -fL "https://github.com/flashinfer-ai/flashinfer/pull/${pr}.diff" | git apply -v; \ + done; \ + fi + # Apply patch to avoid re-downloading existing cubins COPY flashinfer_cache.patch . RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ diff --git a/README.md b/README.md index d10894a..125f0b6 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,8 @@ We will expand the selection of models we test in the pipeline, but since vLLM i If you want to build the latest from main branch, you can specify `--rebuild-vllm` flag. Or you can target a specific vLLM release by setting `--vllm-ref` parameter. +Similarly, `--rebuild-flashinfer`, `--flashinfer-ref`, and `--apply-flashinfer-pr` control the FlashInfer build in the same way. + ## QUICK START ### Build @@ -149,6 +151,17 @@ For periodic maintenance, I recommend using a filter: `docker builder prune --fi ## CHANGELOG +### 2026-03-29 + +#### Flags to specify Flashinfer ref and apply PRs + +`build-and-copy.sh` gains two new flags that mirror the existing vLLM equivalents: + +- `--flashinfer-ref ` — build FlashInfer from a specific commit SHA, branch, or tag instead of `main`. Forces a local FlashInfer build (skips prebuilt wheel download). +- `--apply-flashinfer-pr ` — fetch and apply a FlashInfer GitHub PR patch before building. Can be specified multiple times. Forces a local FlashInfer build. + +Both flags are incompatible with `--exp-mxfp4`. + ### 2026-03-27 #### Default image tag in `build-and-copy.sh` @@ -981,7 +994,9 @@ Using a different username: | `--rebuild-flashinfer` | Skip prebuilt wheel download; force a fresh local FlashInfer build | | `--rebuild-vllm` | Force rebuild vLLM from source | | `--vllm-ref ` | vLLM commit SHA, branch or tag (default: `main`) | +| `--flashinfer-ref ` | FlashInfer commit SHA, branch or tag (default: `main`) | | `--apply-vllm-pr ` | Apply a vLLM PR patch during build. Can be specified multiple times. | +| `--apply-flashinfer-pr ` | Apply a FlashInfer PR patch during build. Can be specified multiple times. | | `--tf5` | Install transformers v5 (5.0.0 or higher). Aliases: `--pre-tf, --pre-transformers`. | | `--exp-mxfp4` | Build with experimental native MXFP4 support. Alias: `--experimental-mxfp4`. | | `-c, --copy-to ` | Host(s) to copy the image to after building (space- or comma-separated). | diff --git a/build-and-copy.sh b/build-and-copy.sh index 064990e..3526dbb 100755 --- a/build-and-copy.sh +++ b/build-and-copy.sh @@ -14,11 +14,14 @@ COPY_TO_FLAG=false SSH_USER="$USER" NO_BUILD=false VLLM_REF="main" +VLLM_REF_SET=false +FLASHINFER_REF="main" +FLASHINFER_REF_SET=false TMP_IMAGE="" PARALLEL_COPY=false EXP_MXFP4=false -VLLM_REF_SET=false VLLM_PRS="" +FLASHINFER_PRS="" PRE_TRANSFORMERS=false FULL_LOG=false BUILD_JOBS="16" @@ -271,6 +274,7 @@ usage() { echo " --rebuild-flashinfer : Force rebuild of FlashInfer wheels (ignore cached wheels)" echo " --rebuild-vllm : Force rebuild of vLLM wheels (ignore cached wheels)" echo " --vllm-ref : vLLM commit SHA, branch or tag (default: 'main')" + echo " --flashinfer-ref : FlashInfer commit SHA, branch or tag (default: 'main')" echo " -c, --copy-to : Host(s) to copy the image to. Accepts comma or space-delimited lists." 
echo " --copy-to-host : Alias for --copy-to (backwards compatibility)." echo " --copy-parallel : Copy to all hosts in parallel instead of serially." @@ -279,6 +283,7 @@ usage() { echo " --tf5 : Install transformers>=5 (aliases: --pre-tf, --pre-transformers)" echo " --exp-mxfp4, --experimental-mxfp4 : Build with experimental native MXFP4 support" echo " --apply-vllm-pr : Apply a specific PR patch to vLLM source. Can be specified multiple times." + echo " --apply-flashinfer-pr : Apply a specific PR patch to FlashInfer source. Can be specified multiple times." echo " --full-log : Enable full build logging (--progress=plain)" echo " --no-build : Skip building, only copy image (requires --copy-to)" echo " --network : Docker network to use during build" @@ -298,6 +303,7 @@ while [[ "$#" -gt 0 ]]; do --rebuild-flashinfer) REBUILD_FLASHINFER=true ;; --rebuild-vllm) REBUILD_VLLM=true ;; --vllm-ref) VLLM_REF="$2"; VLLM_REF_SET=true; shift ;; + --flashinfer-ref) FLASHINFER_REF="$2"; FLASHINFER_REF_SET=true; shift ;; -c|--copy-to|--copy-to-host|--copy-to-hosts) COPY_TO_FLAG=true shift @@ -325,6 +331,19 @@ while [[ "$#" -gt 0 ]]; do exit 1 fi ;; + --apply-flashinfer-pr) + if [ -n "$2" ] && [[ "$2" != -* ]]; then + if [ -n "$FLASHINFER_PRS" ]; then + FLASHINFER_PRS="$FLASHINFER_PRS $2" + else + FLASHINFER_PRS="$2" + fi + shift + else + echo "Error: --apply-flashinfer-pr requires a PR number." + exit 1 + fi + ;; --full-log) FULL_LOG=true ;; --no-build) NO_BUILD=true ;; --cleanup) CLEANUP_MODE=true ;; @@ -399,8 +418,13 @@ if [ -n "$VLLM_PRS" ]; then if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi fi +if [ -n "$FLASHINFER_PRS" ]; then + if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-flashinfer-pr is incompatible with --exp-mxfp4"; exit 1; fi +fi + if [ "$EXP_MXFP4" = true ]; then if [ "$VLLM_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --vllm-ref"; exit 1; fi + if [ "$FLASHINFER_REF_SET" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --flashinfer-ref"; exit 1; fi if [ "$PRE_TRANSFORMERS" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --tf5"; exit 1; fi if [ "$REBUILD_FLASHINFER" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-flashinfer"; exit 1; fi if [ "$REBUILD_VLLM" = true ]; then echo "Error: --exp-mxfp4 is incompatible with --rebuild-vllm"; exit 1; fi @@ -478,9 +502,21 @@ if [ "$NO_BUILD" = false ]; then # ---------------------------------------------------------- # Phase 1: FlashInfer wheels # ---------------------------------------------------------- + if [ "$FLASHINFER_REF_SET" = true ] || [ -n "$FLASHINFER_PRS" ]; then + REBUILD_FLASHINFER=true + fi + BUILD_FLASHINFER=false if [ "$REBUILD_FLASHINFER" = true ]; then - echo "Rebuilding FlashInfer wheels (--rebuild-flashinfer specified)..." + if [ "$FLASHINFER_REF_SET" = true ] && [ -n "$FLASHINFER_PRS" ]; then + echo "Rebuilding FlashInfer wheels (--flashinfer-ref and --apply-flashinfer-pr specified)..." + elif [ "$FLASHINFER_REF_SET" = true ]; then + echo "Rebuilding FlashInfer wheels (--flashinfer-ref specified)..." + elif [ -n "$FLASHINFER_PRS" ]; then + echo "Rebuilding FlashInfer wheels (--apply-flashinfer-pr specified)..." + else + echo "Rebuilding FlashInfer wheels (--rebuild-flashinfer specified)..." + fi BUILD_FLASHINFER=true elif try_download_wheels "$FLASHINFER_RELEASE_TAG" "flashinfer"; then echo "FlashInfer wheels ready." 
@@ -502,12 +538,18 @@ if [ "$NO_BUILD" = false ]; then FI_CMD=("docker" "build" "--target" "flashinfer-export" "--output" "type=local,dest=./wheels" - "${COMMON_BUILD_FLAGS[@]}") + "${COMMON_BUILD_FLAGS[@]}" + "--build-arg" "FLASHINFER_REF=$FLASHINFER_REF") if [ "$REBUILD_FLASHINFER" = true ]; then FI_CMD+=("--build-arg" "CACHEBUST_FLASHINFER=$(date +%s)") fi + if [ -n "$FLASHINFER_PRS" ]; then + echo "Applying FlashInfer PRs: $FLASHINFER_PRS" + FI_CMD+=("--build-arg" "FLASHINFER_PRS=$FLASHINFER_PRS") + fi + FI_CMD+=(".") echo "FlashInfer build command: ${FI_CMD[*]}" From 45494688d11496b86227182804bec375cc2c0dd7 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Mon, 30 Mar 2026 11:45:40 -0700 Subject: [PATCH 35/48] Updated README, added NVFP4 fix --- Dockerfile | 14 ++++++++++++++ README.md | 23 ++--------------------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4b22f61..66921b0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,8 @@ ENV MAX_JOBS=${BUILD_JOBS} ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS} ENV NINJAFLAGS="-j${BUILD_JOBS}" ENV MAKEFLAGS="-j${BUILD_JOBS}" +ENV DG_JIT_USE_NVRTC=1 +ENV USE_CUDNN=1 # Set non-interactive frontend to prevent apt prompts ENV DEBIAN_FRONTEND=noninteractive @@ -120,6 +122,16 @@ RUN if [ -n "$FLASHINFER_PRS" ]; then \ done; \ fi +# TEMPORARY patch for NVFP4 crash (PR 2913) +RUN curl -fsL https://github.com/flashinfer-ai/flashinfer/pull/38423.diff -o pr2913.diff \ + && if git apply --reverse --check pr2913.diff 2>/dev/null; then \ + echo "PR #2913 already applied, skipping."; \ + else \ + echo "Applying FI PR #2913..."; \ + git apply -v pr2913.diff; \ + fi \ + && rm pr2913.diff + # Apply patch to avoid re-downloading existing cubins COPY flashinfer_cache.patch . RUN --mount=type=cache,id=uv-cache,target=/root/.cache/uv \ @@ -247,6 +259,8 @@ ENV MAX_JOBS=${BUILD_JOBS} ENV CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS} ENV NINJAFLAGS="-j${BUILD_JOBS}" ENV MAKEFLAGS="-j${BUILD_JOBS}" +ENV DG_JIT_USE_NVRTC=1 +ENV USE_CUDNN=1 ENV DEBIAN_FRONTEND=noninteractive ENV PIP_BREAK_SYSTEM_PACKAGES=1 diff --git a/README.md b/README.md index 125f0b6..9d45eb1 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ An initial build speed depends on your Internet connection speed and whether the **On a single node**: -**NEW** - `launch-cluster.sh` now supports solo mode, which is now a recommended way to run the container on a single Spark: +`launch-cluster.sh` supports solo mode, which is now a recommended way to run the container on a single Spark: ```bash ./launch-cluster.sh --solo exec \ @@ -80,23 +80,6 @@ An initial build speed depends on your Internet connection speed and whether the --load-format fastsafetensors ``` -**To launch using regular `docker run`** - -```bash - docker run \ - --privileged \ - --gpus all \ - -it --rm \ - --network host --ipc=host \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - vllm-node \ - bash -c -i "vllm serve \ - QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ \ - --port 8000 --host 0.0.0.0 \ - --gpu-memory-utilization 0.7 \ - --load-format fastsafetensors" -``` - **On a cluster** It's recommended to download the model on one node and distribute across the cluster using ConnectX interconnect prior to launching. This is to avoid re-downloading the model from the Internet on every node in the cluster. 
@@ -151,7 +134,7 @@ For periodic maintenance, I recommend using a filter: `docker builder prune --fi ## CHANGELOG -### 2026-03-29 +### 2026-03-30 #### Flags to specify Flashinfer ref and apply PRs @@ -162,8 +145,6 @@ For periodic maintenance, I recommend using a filter: `docker builder prune --fi Both flags are incompatible with `--exp-mxfp4`. -### 2026-03-27 - #### Default image tag in `build-and-copy.sh` `build-and-copy.sh` now automatically sets a sensible default image tag when `-t` is not specified: From 41c0ce2c9aa551375273c2d12d656fa66ff5f3c4 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Mon, 30 Mar 2026 14:25:42 -0700 Subject: [PATCH 36/48] Fixed FI PR --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 66921b0..75792de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -123,7 +123,7 @@ RUN if [ -n "$FLASHINFER_PRS" ]; then \ fi # TEMPORARY patch for NVFP4 crash (PR 2913) -RUN curl -fsL https://github.com/flashinfer-ai/flashinfer/pull/38423.diff -o pr2913.diff \ +RUN curl -fsL https://github.com/flashinfer-ai/flashinfer/pull/2913.diff -o pr2913.diff \ && if git apply --reverse --check pr2913.diff 2>/dev/null; then \ echo "PR #2913 already applied, skipping."; \ else \ From 7f0be29fcca2dc6508782bab31226b104bfb0ac5 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Tue, 31 Mar 2026 11:59:03 -0700 Subject: [PATCH 37/48] Handle edge case when two sparks have both cables plugged and assigned IPs --- autodiscover.sh | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/autodiscover.sh b/autodiscover.sh index 58b66c5..16c056c 100644 --- a/autodiscover.sh +++ b/autodiscover.sh @@ -332,16 +332,28 @@ detect_copy_hosts() { _scan_subnet_for_gb10 "$cidr" "$local_iface_ip" "$temp_file" done - # Deduplicate and collect results + # Deduplicate and collect results. + # On two-cable setups two IB IPs may belong to the same host; deduplicate by + # querying each host's ETH_IF IP as a canonical identity. 
COPY_PEER_NODES=() - declare -A _SEEN_COPY + declare -A _SEEN_COPY # keyed by IB IP + declare -A _SEEN_HOST # keyed by ETH_IF IP → first IB IP seen for that host if [[ -f "$temp_file" ]]; then while read -r ip; do - if [[ -z "${_SEEN_COPY[$ip]}" ]]; then - _SEEN_COPY["$ip"]=1 - COPY_PEER_NODES+=("$ip") - echo " Found GB10 copy host: $ip" + [[ -n "${_SEEN_COPY[$ip]}" ]] && continue + _SEEN_COPY["$ip"]=1 + # Resolve canonical host identity via ETH_IF IP + local host_ip + host_ip=$(ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o BatchMode=yes "$ip" \ + "ip -o -f inet addr show $ETH_IF 2>/dev/null | awk '{print \$4}' | head -n1 | cut -d/ -f1" \ + 2>/dev/null) + if [[ -n "$host_ip" && -n "${_SEEN_HOST[$host_ip]}" ]]; then + echo " Skipping $ip (same host as ${_SEEN_HOST[$host_ip]}, ETH_IF: $host_ip)" + continue fi + [[ -n "$host_ip" ]] && _SEEN_HOST["$host_ip"]="$ip" + COPY_PEER_NODES+=("$ip") + echo " Found GB10 copy host: $ip" done < <(sort "$temp_file") rm -f "$temp_file" fi From bb177383ffa19e51d98c3d2dfac19fa8c80def3a Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Tue, 31 Mar 2026 12:46:15 -0700 Subject: [PATCH 38/48] Bugfix in autodiscovery dedup --- autodiscover.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autodiscover.sh b/autodiscover.sh index 16c056c..5f04a91 100644 --- a/autodiscover.sh +++ b/autodiscover.sh @@ -346,7 +346,7 @@ detect_copy_hosts() { local host_ip host_ip=$(ssh -o ConnectTimeout=3 -o StrictHostKeyChecking=no -o BatchMode=yes "$ip" \ "ip -o -f inet addr show $ETH_IF 2>/dev/null | awk '{print \$4}' | head -n1 | cut -d/ -f1" \ - 2>/dev/null) + /dev/null) if [[ -n "$host_ip" && -n "${_SEEN_HOST[$host_ip]}" ]]; then echo " Skipping $ip (same host as ${_SEEN_HOST[$host_ip]}, ETH_IF: $host_ip)" continue From 9370b2bb3421e06727bf21375b22c6b86d49d9f8 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Tue, 31 Mar 2026 13:29:56 -0700 Subject: [PATCH 39/48] Don't start the cluster if only --setup/--discover is specified --- docs/NETWORKING.md | 13 +++++-------- launch-cluster.sh | 16 +++++++++++++--- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/docs/NETWORKING.md b/docs/NETWORKING.md index 734592d..8d786a2 100644 --- a/docs/NETWORKING.md +++ b/docs/NETWORKING.md @@ -106,8 +106,9 @@ network: enP2p1s0f1np1: dhcp4: no dhcp6: no - link-local: [ ipv4 ] + link-local: [] mtu: 9000 + addresses: [192.168.178.11/24] ``` Create `/etc/netplan/40-cx7.yaml` on `spark2`: @@ -124,16 +125,12 @@ network: enP2p1s0f1np1: dhcp4: no dhcp6: no - link-local: [ ipv4 ] + link-local: [] mtu: 9000 + addresses: [192.168.178.12/24] ``` -Please note, that only one interface of the "twin" pair needs an IP address, but MTU needs to be set on both. -You can also assign a separate address to another "twin" if you want to utilize the second interface independently, but make sure you assign an IP address from a different subnet. - -For instance, for the example above, if you want to assign an IP to `enP2p1s0f1np1`, you need to use `192.168.177.12` on `spark`. **DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing. - -This will not affect vLLM performance as it will use RDMA over RoCE using both "twins", even if the IP is only set on one. +**DO NOT use the same subnet on both "twins"** - it will confuse autodiscovery and mess up routing. 
Then run on each node: diff --git a/launch-cluster.sh b/launch-cluster.sh index b20603e..af3d8f7 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -24,7 +24,7 @@ CONTAINER_NAME="$DEFAULT_CONTAINER_NAME" COMMAND_TO_RUN="" DAEMON_MODE="false" CHECK_CONFIG="false" -ACTION="start" +ACTION="" CLUSTER_WAS_RUNNING="false" MOD_PATHS=() MOD_TYPES=() @@ -69,7 +69,7 @@ usage() { echo " --pids-limit Process limit (default: 4096, only with --non-privileged)" echo " --shm-size-gb Shared memory size in GB (default: 64, only with --non-privileged)" echo " --config Path to .env configuration file (default: .env in script directory) - --setup Force autodiscovery and save configuration (even if .env exists)" + --setup/--discover Force autodiscovery and save configuration (even if .env exists)" echo " action start | stop | status | exec (Default: start). Not compatible with --launch-script." echo " command Command to run (only for 'exec' action). Not compatible with --launch-script." echo "" @@ -132,7 +132,7 @@ while [[ "$#" -gt 0 ]]; do -d) DAEMON_MODE="true" ;; -h|--help) usage ;; --config) CONFIG_FILE="$2"; shift ;; - --setup) FORCE_DISCOVER=true; export FORCE_DISCOVER ;; + --setup|--discover) FORCE_DISCOVER=true; export FORCE_DISCOVER ;; start|stop|status) if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then echo "Error: Action '$1' is not compatible with --launch-script. Please omit the action or not use --launch-script." @@ -426,6 +426,10 @@ if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then [[ -z "$NODES_ARG" && -n "$DOTENV_CLUSTER_NODES" ]] && NODES_ARG="$DOTENV_CLUSTER_NODES" [[ -z "$ETH_IF" && -n "$DOTENV_ETH_IF" ]] && ETH_IF="$DOTENV_ETH_IF" [[ -z "$IB_IF" && -n "$DOTENV_IB_IF" ]] && IB_IF="$DOTENV_IB_IF" + # If no action was specified, setup was the only intent — exit cleanly + if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" ]]; then + exit 0 + fi fi if [[ "$SOLO_MODE" == "true" ]]; then @@ -505,6 +509,12 @@ if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ] fi fi +if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" ]]; then + echo "Error: No action specified. Use: start | stop | status | exec" + usage + exit 1 +fi + if [[ "$CHECK_CONFIG" == "true" ]]; then echo "Configuration Check Complete." echo " Image Name: $IMAGE_NAME" From 287d3c72e54349f9c85ac58ad7999b85b02045ea Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Tue, 31 Mar 2026 13:34:59 -0700 Subject: [PATCH 40/48] Fix for forced autodiscovery --- launch-cluster.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/launch-cluster.sh b/launch-cluster.sh index af3d8f7..30b1e8b 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -416,6 +416,8 @@ source "$(dirname "$0")/autodiscover.sh" if [[ "${FORCE_DISCOVER:-false}" == "true" ]]; then # --setup: force full autodiscovery and save configuration echo "Running full autodiscovery (--setup)..." 
+ # Clear pre-loaded values so detect functions run fresh instead of short-circuiting + ETH_IF="" IB_IF="" NODES_ARG="" LOCAL_IP="" detect_interfaces || exit 1 detect_local_ip || exit 1 detect_nodes || exit 1 From 48318380f98f4864dfd31c723ac17d080217ab0c Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Tue, 31 Mar 2026 13:41:35 -0700 Subject: [PATCH 41/48] Bugfix --- launch-cluster.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launch-cluster.sh b/launch-cluster.sh index 30b1e8b..ddfa01c 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -511,7 +511,7 @@ if [[ "$ACTION" == "start" || "$ACTION" == "exec" || "$CHECK_CONFIG" == "true" ] fi fi -if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" ]]; then +if [[ -z "$ACTION" && "$LAUNCH_SCRIPT_MODE" != "true" && "$CHECK_CONFIG" != "true" ]]; then echo "Error: No action specified. Use: start | stop | status | exec" usage exit 1 From a467a7a0bd6e08b0fabbedd6344dd8a733ac49f9 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Tue, 31 Mar 2026 13:47:04 -0700 Subject: [PATCH 42/48] Updated README for 3-node --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9d45eb1..7a7506e 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ # vLLM Docker Optimized for DGX Spark (single or multi-node) This repository contains the Docker configuration and startup scripts to run a multi-node vLLM inference cluster using Ray. It supports InfiniBand/RDMA (NCCL) and custom environment configuration for high-performance setups. +Cluster setup supports direct connect between dual Sparks, connecting via QSFP/RoCE switch and 3-node mesh configuration. While it was primarily developed to support multi-node inference, it works just as well on a single node setups. @@ -134,7 +135,7 @@ For periodic maintenance, I recommend using a filter: `docker builder prune --fi ## CHANGELOG -### 2026-03-30 +### 2026-03-31 #### Flags to specify Flashinfer ref and apply PRs From 15a04ada32b56d8672daec2735413f701ef2422e Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Tue, 31 Mar 2026 16:20:23 -0700 Subject: [PATCH 43/48] Bug fixes --- launch-cluster.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launch-cluster.sh b/launch-cluster.sh index ddfa01c..7a5bb09 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -368,7 +368,7 @@ if [[ -n "$LAUNCH_SCRIPT_PATH" ]]; then LAUNCH_SCRIPT_MODE="true" # If launch script is specified, default action to exec unless explicitly set to stop/status - if [[ "$ACTION" == "start" ]]; then + if [[ -z "$ACTION" || "$ACTION" == "start" ]]; then ACTION="exec" fi fi From e89104d91b40d2a03d2701b2f04bd2c01ea147b8 Mon Sep 17 00:00:00 2001 From: Eugene Rakhmatulin Date: Tue, 31 Mar 2026 16:25:05 -0700 Subject: [PATCH 44/48] Always rerun discovery when `--discover` is specified --- run-recipe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/run-recipe.py b/run-recipe.py index a9f8104..a141d82 100755 --- a/run-recipe.py +++ b/run-recipe.py @@ -595,6 +595,7 @@ def run_autodiscover() -> dict[str, str] | None: # doesn't exist yet (it's the file we're about to create). 
     env_vars = os.environ.copy()
     env_vars["CONFIG_FILE"] = str(ENV_FILE)
+    env_vars["FORCE_DISCOVER"] = "true"
     env_vars.pop("CONFIG_FILE_SET", None)

     # Run autodiscover interactively so its prompts are shown to the user

From a889fed25468a2a4cd3de511834cbb891720e9a3 Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Tue, 31 Mar 2026 16:54:19 -0700
Subject: [PATCH 45/48] Updated README

---
 README.md     | 3 ++-
 run-recipe.py | 9 ++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 7a7506e..3aefb51 100644
--- a/README.md
+++ b/README.md
@@ -166,7 +166,8 @@ Autodiscover function in both `launch-cluster.sh` and `run-recipe.sh` now can de
 You can try running a model on all 3 nodes in pipeline-parallel configuration using the following recipe:

 ```bash
-./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround --setup # you can drop --setup and --force-build on subsequent calls
+./run-recipe.sh --discover # force mesh discovery
+./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround --setup --no-ray --force-build # you can drop --setup and --force-build on subsequent calls
 ```

 Please note that `--tensor-parallel-size 3` or `-tp 3` is not supported by any commonly used model, so the only two viable options to utilize all three nodes for a single model are:
diff --git a/run-recipe.py b/run-recipe.py
index a141d82..6a2de79 100755
--- a/run-recipe.py
+++ b/run-recipe.py
@@ -146,8 +146,11 @@ def load_recipe(recipe_path: Path) -> dict[str, Any]:
         SystemExit: If recipe not found or validation fails
     """
     if not recipe_path.exists():
-        # Try recipes directory with various extensions
+        # Try candidates in order: add extension to original path first,
+        # then fall back to flat recipes/ directory (for bare recipe names)
         candidates = [
+            recipe_path.with_suffix(".yaml"),
+            recipe_path.with_suffix(".yml"),
             RECIPES_DIR / recipe_path.name,
             RECIPES_DIR / f"{recipe_path.name}.yaml",
             RECIPES_DIR / f"{recipe_path.name}.yml",
@@ -325,7 +328,7 @@
     if build_args:
         cmd.extend(build_args)
     if copy_to:
-        cmd.extend(["--copy-to", ",".join(copy_to)])
+        cmd.extend(["--copy-to", ",".join(copy_to), "--copy-parallel"])

     print(f"Building image '{image}'...")
     if build_args:
@@ -363,7 +366,7 @@ def download_model(model: str, copy_to: list[str] | None = None) -> bool:
     cmd = [str(DOWNLOAD_SCRIPT), model]
     if copy_to:
-        cmd.extend(["--copy-to", ",".join(copy_to)])
+        cmd.extend(["--copy-to", ",".join(copy_to), "--copy-parallel"])

     print(f"Downloading model '{model}'...")
     if copy_to:

From ead749239d5a685e4c31e71b931c6bd33a4384b1 Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Tue, 31 Mar 2026 16:57:56 -0700
Subject: [PATCH 46/48] Bugfix

---
 run-recipe.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/run-recipe.py b/run-recipe.py
index 6a2de79..1314a6a 100755
--- a/run-recipe.py
+++ b/run-recipe.py
@@ -149,8 +149,8 @@ def load_recipe(recipe_path: Path) -> dict[str, Any]:
         # Try candidates in order: add extension to original path first,
         # then fall back to flat recipes/ directory (for bare recipe names)
         candidates = [
-            recipe_path.with_suffix(".yaml"),
-            recipe_path.with_suffix(".yml"),
+            Path(str(recipe_path) + ".yaml"),
+            Path(str(recipe_path) + ".yml"),
             RECIPES_DIR / recipe_path.name,
             RECIPES_DIR / f"{recipe_path.name}.yaml",
             RECIPES_DIR / f"{recipe_path.name}.yml",

From 044557943c53abf612c5ac8de29ba5b1d1758032 Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Tue, 31 Mar 2026 17:49:17 -0700
Subject: [PATCH 47/48] Bugfixes
---
 recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml b/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
index a208268..fc12279 100644
--- a/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
+++ b/recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
@@ -55,6 +55,7 @@
     --chat-template unsloth.jinja \
     -tp 1 \
     -pp {pipeline_parallel} \
+    --load-format fastsafetensors \
     --distributed-executor-backend ray

From c4860b86a279d2d28af1c6520f769f95b9a729d0 Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Tue, 31 Mar 2026 18:19:22 -0700
Subject: [PATCH 48/48] Updated README with 3-node support

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3aefb51..0cdc99e 100644
--- a/README.md
+++ b/README.md
@@ -167,7 +167,7 @@ You can try running a model on all 3 nodes in pipeline-parallel configuration us
 ```bash
 ./run-recipe.sh --discover # force mesh discovery
-./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround --setup --no-ray --force-build # you can drop --setup and --force-build on subsequent calls
+./run-recipe.sh recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml --setup --no-ray --force-build # you can drop --setup and --force-build on subsequent calls
 ```

 Please note that `--tensor-parallel-size 3` or `-tp 3` is not supported by any commonly used model, so the only two viable options to utilize all three nodes for a single model are:
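
A side note on the recipe-path fix that lands in PATCH 45/46 above: `Path.with_suffix()` replaces everything after the *last* dot of the final path component, so recipe names that embed a model version (like `qwen3.5-...`) get silently mangled, which is why PATCH 46 swaps it for plain string concatenation. A minimal sketch of the failure mode, using the recipe path from this series (illustrative only, not part of the patches):

```python
from pathlib import Path

recipe = Path("recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround")

# with_suffix() treats the ".5-397b-int4-autoround" tail as the suffix
# (everything after the last dot in the name) and replaces it:
print(recipe.with_suffix(".yaml"))
# -> recipes/3x-spark-cluster/qwen3.yaml   (wrong candidate path)

# The PATCH 46 approach appends to the raw string, keeping the dotted name:
print(Path(str(recipe) + ".yaml"))
# -> recipes/3x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
```

With the fixed candidate order, both the bare recipe name and the explicit `.yaml` path used in PATCH 48's README example resolve to the same file.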