diff --git a/.env.example b/.env.example
index bc6f2dc..d1cea6c 100644
--- a/.env.example
+++ b/.env.example
@@ -29,6 +29,10 @@ CONTAINER_NCCL_DEBUG="INFO"
 CONTAINER_HF_TOKEN="your_huggingface_token_here"
 CONTAINER_NCCL_IGNORE_CPU_AFFINITY="1"
 
+# COPY_HOSTS: Comma-separated list of peer hosts (optional)
+# Used by build-and-copy.sh as image copy targets and by detect_nodes in autodiscover.sh
+COPY_HOSTS="192.168.177.12"
+
 # Additional container environment variables
 # CONTAINER_MAX_JOBS="16"
 # CONTAINER_CUDA_VISIBLE_DEVICES="0,1"
diff --git a/autodiscover.sh b/autodiscover.sh
index 43e622f..11d771d 100644
--- a/autodiscover.sh
+++ b/autodiscover.sh
@@ -1,5 +1,62 @@
 #!/bin/bash
 
+# Load .env file if exists (for shared configuration)
+# This is called early so that DOTENV_* variables are available to all functions
+#
+# Every KEY=VALUE line is exported as DOTENV_<KEY>. Comments, blank lines and
+# lines whose key is not a valid shell identifier are ignored.
+# Globals:
+#   CONFIG_FILE      (read) path to the .env file; defaults to .env beside this script
+#   CONFIG_FILE_SET  (read) "true" when the caller asked for CONFIG_FILE explicitly
+# Exits the calling script with status 1 if an explicitly requested file is missing.
+load_env_if_exists() {
+    local env_file="${CONFIG_FILE:-}"
+    local config_explicit="${CONFIG_FILE_SET:-false}"
+    local script_dir line key value
+    # Optional "export " prefix, then a valid shell identifier
+    local key_re='^[[:space:]]*(export[[:space:]]+)?([A-Za-z_][A-Za-z0-9_]*)[[:space:]]*$'
+
+    # If CONFIG_FILE is not set, check default location
+    if [[ -z "$env_file" ]]; then
+        script_dir="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
+        env_file="$script_dir/.env"
+        config_explicit="false"
+    fi
+
+    if [[ ! -f "$env_file" ]]; then
+        # A missing file is only an error when it was requested explicitly
+        if [[ "$config_explicit" == "true" ]]; then
+            echo "Error: Config file not found: $env_file" >&2
+            exit 1
+        fi
+        return 0
+    fi
+
+    # Read whole lines and split on the first '=' here: "IFS='=' read key value"
+    # can drop a trailing '=' from the value (e.g. base64 padding).
+    while IFS= read -r line || [[ -n "$line" ]]; do
+        line="${line%$'\r'}"            # tolerate CRLF line endings
+        [[ "$line" == *=* ]] || continue
+
+        # Rejects comments, blank keys and anything export would choke on
+        [[ "${line%%=*}" =~ $key_re ]] || continue
+        key="${BASH_REMATCH[2]}"
+
+        # Trim surrounding whitespace, then strip one pair of matching quotes
+        value="${line#*=}"
+        value="${value#"${value%%[![:space:]]*}"}"
+        value="${value%"${value##*[![:space:]]}"}"
+        case "$value" in
+            \"*\" | \'*\') value="${value:1:${#value}-2}" ;;
+        esac
+
+        export "DOTENV_${key}=${value}"
+    done < "$env_file"
+}
+
+# Load .env file
+load_env_if_exists
+
 # Function to detect IB and Ethernet interfaces
 detect_interfaces() {
     # If both interfaces are already set, nothing to do
@@ -110,6 +167,26 @@ detect_nodes() {
         done
         return 0
     fi
+
+    # Try to use COPY_HOSTS from .env
+    if [[ -n "${DOTENV_COPY_HOSTS:-}" ]]; then
+        PEER_NODES=()
+        IFS=',' read -ra ALL_NODES <<< "$DOTENV_COPY_HOSTS"
+        for node in "${ALL_NODES[@]}"; do
+            # Trim surrounding whitespace; drop empty entries (e.g. trailing comma)
+            node="${node#"${node%%[![:space:]]*}"}"
+            node="${node%"${node##*[![:space:]]}"}"
+            if [[ -n "$node" ]]; then
+                PEER_NODES+=("$node")
+            fi
+        done
+        # Fall through to auto-detection when the list held no usable host
+        if [[ ${#PEER_NODES[@]} -gt 0 ]]; then
+            echo "  Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
+            NODES_ARG="$(IFS=','; echo "${PEER_NODES[*]}")"
+            return 0
+        fi
+    fi
 
     echo "Auto-detecting nodes..."
 
diff --git a/build-and-copy.sh b/build-and-copy.sh
index ed7b9c0..dec93b4 100755
--- a/build-and-copy.sh
+++ b/build-and-copy.sh
@@ -28,6 +28,7 @@ VLLM_RELEASE_TAG="prebuilt-vllm-current"
 # Space-separated list of GPU architectures for which prebuilt wheels are available
 PREBUILT_WHEELS_SUPPORTED_ARCHS="12.1a"
 CLEANUP_MODE="false"
+CONFIG_FILE=""
 
 cleanup() {
     if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
@@ -280,11 +281,15 @@ usage() {
     echo " --no-build : Skip building, only copy image (requires --copy-to)"
     echo " --network : Docker network to use during build"
     echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory"
+    echo " --config : Path to .env configuration file (default: .env in script directory)"
     echo " -h, --help : Show this help message"
     exit 1
 }
 
-# Argument parsing
+# Parse all arguments
+CONFIG_FILE_SET=false
+# Set to true by the option arm that collects copy hosts; gates the host fallback below
+COPY_REQUESTED=false
 while [[ "$#" -gt 0 ]]; do
     case $1 in
         -t|--tag) IMAGE_TAG="$2"; shift ;;
@@ -298,27 +303,7 @@ while [[ "$#" -gt 0 ]]; do
                 add_copy_hosts "$1"
                 shift
             done
-
-            if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
-                echo "No hosts specified. Using autodiscovery..."
-                source "$(dirname "$0")/autodiscover.sh"
-
-                detect_nodes
-                if [ $? -ne 0 ]; then
-                    echo "Error: Autodiscovery failed."
-                    exit 1
-                fi
-
-                if [ ${#PEER_NODES[@]} -gt 0 ]; then
-                    COPY_HOSTS=("${PEER_NODES[@]}")
-                fi
-
-                if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
-                    echo "Error: Autodiscovery found no other nodes."
-                    exit 1
-                fi
-                echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
-            fi
+            COPY_REQUESTED=true
             continue
             ;;
         -j|--build-jobs) BUILD_JOBS="$2"; shift ;;
@@ -351,12 +336,54 @@ while [[ "$#" -gt 0 ]]; do
                 exit 1
             fi
             ;;
+        --config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
         -h|--help) usage ;;
         *) echo "Unknown parameter passed: $1"; usage ;;
     esac
     shift
 done
 
+# Source autodiscover.sh to load .env file
+source "$(dirname "$0")/autodiscover.sh"
+
+# Resolve copy targets only when copying was requested without explicit hosts
+# (the same condition under which autodiscovery ran before it moved out of the
+# argument loop): prefer COPY_HOSTS from .env, otherwise autodiscover.
+if [ "$COPY_REQUESTED" = true ] && [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
+    if [[ -n "${DOTENV_COPY_HOSTS:-}" ]]; then
+        echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
+        IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
+        for env_host in "${HOSTS_FROM_ENV[@]}"; do
+            # Trim surrounding whitespace; drop empty entries (e.g. trailing comma)
+            env_host="${env_host#"${env_host%%[![:space:]]*}"}"
+            env_host="${env_host%"${env_host##*[![:space:]]}"}"
+            if [[ -n "$env_host" ]]; then
+                COPY_HOSTS+=("$env_host")
+            fi
+        done
+        if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
+            echo "Error: COPY_HOSTS in .env contains no hosts."
+            exit 1
+        fi
+    else
+        echo "No hosts specified. Using autodiscovery..."
+        if ! detect_nodes; then
+            echo "Error: Autodiscovery failed."
+            exit 1
+        fi
+
+        if [ ${#PEER_NODES[@]} -gt 0 ]; then
+            COPY_HOSTS=("${PEER_NODES[@]}")
+        fi
+
+        if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
+            echo "Error: Autodiscovery found no other nodes."
+            exit 1
+        fi
+        echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
+    fi
+fi
+
 # Validate flag combinations
 if [ -n "$VLLM_PRS" ]; then
     if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi
diff --git a/launch-cluster.sh b/launch-cluster.sh
index 24b6a42..f7dc1cd 100755
--- a/launch-cluster.sh
+++ b/launch-cluster.sh
@@ -159,9 +159,12 @@ done
 # Set .env file path (use default if not specified)
 if [[ -z "$CONFIG_FILE" ]]; then
     CONFIG_FILE="$SCRIPT_DIR/.env"
+    CONFIG_FILE_SET=false
+else
+    CONFIG_FILE_SET=true
 fi
 
-# Load .env file if exists
+# Load .env file
 if [[ -f "$CONFIG_FILE" ]]; then
     echo "Loading configuration from .env file..."