From 8b7c02aa252dc6e3215ea2ee46712248146d07bb Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Wed, 25 Mar 2026 22:47:02 -0700
Subject: [PATCH 1/2] add .env support to build-and-copy.sh

---
 .env.example      |  4 +++
 autodiscover.sh   | 52 ++++++++++++++++++++++++++++++++++++++
 build-and-copy.sh | 64 ++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 105 insertions(+), 15 deletions(-)

diff --git a/.env.example b/.env.example
index bc6f2dc..d1cea6c 100644
--- a/.env.example
+++ b/.env.example
@@ -29,6 +29,10 @@ CONTAINER_NCCL_DEBUG="INFO"
 CONTAINER_HF_TOKEN="your_huggingface_token_here"
 CONTAINER_NCCL_IGNORE_CPU_AFFINITY="1"
 
+# COPY_HOSTS: Comma-separated list of hosts for build-and-copy.sh (optional)
+# Used by build-and-copy.sh to distribute images across cluster
+COPY_HOSTS="192.168.177.12"
+
 # Additional container environment variables
 # CONTAINER_MAX_JOBS="16"
 # CONTAINER_CUDA_VISIBLE_DEVICES="0,1"
diff --git a/autodiscover.sh b/autodiscover.sh
index 43e622f..54ee4e0 100644
--- a/autodiscover.sh
+++ b/autodiscover.sh
@@ -1,5 +1,44 @@
 #!/bin/bash
 
+# Load .env file if exists (for shared configuration)
+# This is called early so that DOTENV_* variables are available to all functions
+load_env_if_exists() {
+    local env_file="${CONFIG_FILE:-}"
+
+    # If CONFIG_FILE is not set, check default location
+    if [[ -z "$env_file" ]]; then
+        local script_dir="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
+        env_file="$script_dir/.env"
+    fi
+
+    if [[ -f "$env_file" ]]; then
+        # Load .env variables with DOTENV_ prefix
+        while IFS='=' read -r key value || [[ -n "$key" ]]; do
+            # Skip comments and empty lines
+            [[ "$key" =~ ^[[:space:]]*# ]] && continue
+            [[ -z "$key" ]] && continue
+
+            # Remove leading/trailing whitespace from key
+            key=$(echo "$key" | xargs)
+
+            # Skip if key is empty after trimming
+            [[ -z "$key" ]] && continue
+
+            # Remove quotes from value
+            value="${value%\"}"
+            value="${value#\"}"
+            value="${value%\'}"
+            value="${value#\'}"
+
+            # Export with DOTENV_ prefix
+            export "DOTENV_$key=$value"
+        done < "$env_file"
+    fi
+}
+
+# Load .env file
+load_env_if_exists
+
 # Function to detect IB and Ethernet interfaces
 detect_interfaces() {
     # If both interfaces are already set, nothing to do
@@ -110,6 +149,19 @@ detect_nodes() {
         done
         return 0
     fi
+
+    # Try to use COPY_HOSTS from .env
+    if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
+        echo " Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
+        PEER_NODES=()
+        IFS=',' read -ra ALL_NODES <<< "$DOTENV_COPY_HOSTS"
+        for node in "${ALL_NODES[@]}"; do
+            node=$(echo "$node" | xargs)
+            PEER_NODES+=("$node")
+        done
+        NODES_ARG="$DOTENV_COPY_HOSTS"
+        return 0
+    fi
 
     echo "Auto-detecting nodes..."
 
diff --git a/build-and-copy.sh b/build-and-copy.sh
index 628b9c0..1aa3628 100755
--- a/build-and-copy.sh
+++ b/build-and-copy.sh
@@ -28,6 +28,7 @@ VLLM_RELEASE_TAG="prebuilt-vllm-current"
 # Space-separated list of GPU architectures for which prebuilt wheels are available
 PREBUILT_WHEELS_SUPPORTED_ARCHS="12.1a"
 CLEANUP_MODE="false"
+CONFIG_FILE=""
 
 cleanup() {
     if [ -n "$TMP_IMAGE" ] && [ -f "$TMP_IMAGE" ]; then
@@ -280,11 +281,32 @@ usage() {
     echo " --no-build : Skip building, only copy image (requires --copy-to)"
     echo " --network : Docker network to use during build"
     echo " --cleanup : Remove all *.whl and *.-commit files in wheels directory"
+    echo " --config : Path to .env configuration file (default: .env in script directory)"
     echo " -h, --help : Show this help message"
     exit 1
 }
 
-# Argument parsing
+# Set default CONFIG_FILE
+SCRIPT_DIR="$(dirname "$(realpath "$0")")"
+export CONFIG_FILE="$SCRIPT_DIR/.env"
+
+# Parse --config argument first
+i=1
+while [[ $i -le $# ]]; do
+    arg="${!i}"
+    if [[ "$arg" == "--config" ]]; then
+        next_i=$((i+1))
+        CONFIG_FILE="${!next_i}"
+        export CONFIG_FILE
+        break
+    fi
+    i=$((i+1))
+done
+
+# Source autodiscover.sh to load .env file
+source "$(dirname "$0")/autodiscover.sh"
+
+# Now parse all arguments normally
 while [[ "$#" -gt 0 ]]; do
     case $1 in
         -t|--tag) IMAGE_TAG="$2"; shift ;;
@@ -300,24 +322,31 @@ while [[ "$#" -gt 0 ]]; do
             done
 
             if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
-                echo "No hosts specified. Using autodiscovery..."
-                source "$(dirname "$0")/autodiscover.sh"
+                # Try to use COPY_HOSTS from .env first
+                if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
+                    echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
+                    IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
+                    COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
+                else
+                    echo "No hosts specified. Using autodiscovery..."
+                    source "$(dirname "$0")/autodiscover.sh"
 
-                detect_nodes
-                if [ $? -ne 0 ]; then
-                    echo "Error: Autodiscovery failed."
-                    exit 1
-                fi
+                    detect_nodes
+                    if [ $? -ne 0 ]; then
+                        echo "Error: Autodiscovery failed."
+                        exit 1
+                    fi
 
-                if [ ${#PEER_NODES[@]} -gt 0 ]; then
-                    COPY_HOSTS=("${PEER_NODES[@]}")
-                fi
+                    if [ ${#PEER_NODES[@]} -gt 0 ]; then
+                        COPY_HOSTS=("${PEER_NODES[@]}")
+                    fi
 
-                if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
-                    echo "Error: Autodiscovery found no other nodes."
-                    exit 1
+                    if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
+                        echo "Error: Autodiscovery found no other nodes."
+                        exit 1
+                    fi
+                    echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
                 fi
-                echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
             fi
             continue
             ;;
@@ -351,12 +380,17 @@ while [[ "$#" -gt 0 ]]; do
                 exit 1
             fi
             ;;
+        --config) CONFIG_FILE="$2"; shift ;;
         -h|--help) usage ;;
         *) echo "Unknown parameter passed: $1"; usage ;;
     esac
     shift
 done
 
+# Set CONFIG_FILE and source autodiscover.sh to load .env
+export CONFIG_FILE
+source "$(dirname "$0")/autodiscover.sh"
+
 # Validate flag combinations
 if [ -n "$VLLM_PRS" ]; then
     if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi

From c2fe579cccb11ecd3b2096404267cce66b896153 Mon Sep 17 00:00:00 2001
From: Eugene Rakhmatulin
Date: Wed, 25 Mar 2026 23:16:56 -0700
Subject: [PATCH 2/2] Enhance .env file handling and validation in scripts

---
 autodiscover.sh   |  8 +++++
 build-and-copy.sh | 82 +++++++++++++++++------------------------------
 launch-cluster.sh |  5 ++-
 3 files changed, 42 insertions(+), 53 deletions(-)

diff --git a/autodiscover.sh b/autodiscover.sh
index 54ee4e0..11d771d 100644
--- a/autodiscover.sh
+++ b/autodiscover.sh
@@ -4,11 +4,19 @@
 # This is called early so that DOTENV_* variables are available to all functions
 load_env_if_exists() {
     local env_file="${CONFIG_FILE:-}"
+    local config_explicit="${CONFIG_FILE_SET:-false}"
 
     # If CONFIG_FILE is not set, check default location
     if [[ -z "$env_file" ]]; then
         local script_dir="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
         env_file="$script_dir/.env"
+        config_explicit="false"
+    fi
+
+    # Validate config file exists if explicitly specified
+    if [[ "$config_explicit" == "true" ]] && [[ ! -f "$env_file" ]]; then
+        echo "Error: Config file not found: $env_file"
+        exit 1
     fi
 
     if [[ -f "$env_file" ]]; then
diff --git a/build-and-copy.sh b/build-and-copy.sh
index 1aa3628..f841c6f 100755
--- a/build-and-copy.sh
+++ b/build-and-copy.sh
@@ -286,27 +286,8 @@ usage() {
     exit 1
 }
 
-# Set default CONFIG_FILE
-SCRIPT_DIR="$(dirname "$(realpath "$0")")"
-export CONFIG_FILE="$SCRIPT_DIR/.env"
-
-# Parse --config argument first
-i=1
-while [[ $i -le $# ]]; do
-    arg="${!i}"
-    if [[ "$arg" == "--config" ]]; then
-        next_i=$((i+1))
-        CONFIG_FILE="${!next_i}"
-        export CONFIG_FILE
-        break
-    fi
-    i=$((i+1))
-done
-
-# Source autodiscover.sh to load .env file
-source "$(dirname "$0")/autodiscover.sh"
-
-# Now parse all arguments normally
+# Parse all arguments
+CONFIG_FILE_SET=false
 while [[ "$#" -gt 0 ]]; do
     case $1 in
         -t|--tag) IMAGE_TAG="$2"; shift ;;
@@ -320,34 +301,6 @@ while [[ "$#" -gt 0 ]]; do
                 add_copy_hosts "$1"
                 shift
             done
-
-            if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
-                # Try to use COPY_HOSTS from .env first
-                if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
-                    echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
-                    IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
-                    COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
-                else
-                    echo "No hosts specified. Using autodiscovery..."
-                    source "$(dirname "$0")/autodiscover.sh"
-
-                    detect_nodes
-                    if [ $? -ne 0 ]; then
-                        echo "Error: Autodiscovery failed."
-                        exit 1
-                    fi
-
-                    if [ ${#PEER_NODES[@]} -gt 0 ]; then
-                        COPY_HOSTS=("${PEER_NODES[@]}")
-                    fi
-
-                    if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
-                        echo "Error: Autodiscovery found no other nodes."
-                        exit 1
-                    fi
-                    echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
-                fi
-            fi
             continue
             ;;
         -j|--build-jobs) BUILD_JOBS="$2"; shift ;;
@@ -380,17 +333,42 @@ while [[ "$#" -gt 0 ]]; do
                 exit 1
             fi
             ;;
-        --config) CONFIG_FILE="$2"; shift ;;
+        --config) CONFIG_FILE="$2"; CONFIG_FILE_SET=true; shift ;;
         -h|--help) usage ;;
         *) echo "Unknown parameter passed: $1"; usage ;;
     esac
     shift
 done
 
-# Set CONFIG_FILE and source autodiscover.sh to load .env
-export CONFIG_FILE
+# Source autodiscover.sh to load .env file
 source "$(dirname "$0")/autodiscover.sh"
 
+# Handle COPY_HOSTS from .env or autodiscovery if not specified via arguments
+if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
+    if [[ -n "$DOTENV_COPY_HOSTS" ]]; then
+        echo "Using COPY_HOSTS from .env: $DOTENV_COPY_HOSTS"
+        IFS=',' read -ra HOSTS_FROM_ENV <<< "$DOTENV_COPY_HOSTS"
+        COPY_HOSTS=("${HOSTS_FROM_ENV[@]}")
+    else
+        echo "No hosts specified. Using autodiscovery..."
+        detect_nodes
+        if [ $? -ne 0 ]; then
+            echo "Error: Autodiscovery failed."
+            exit 1
+        fi
+
+        if [ ${#PEER_NODES[@]} -gt 0 ]; then
+            COPY_HOSTS=("${PEER_NODES[@]}")
+        fi
+
+        if [ "${#COPY_HOSTS[@]}" -eq 0 ]; then
+            echo "Error: Autodiscovery found no other nodes."
+            exit 1
+        fi
+        echo "Autodiscovered hosts: ${COPY_HOSTS[*]}"
+    fi
+fi
+
 # Validate flag combinations
 if [ -n "$VLLM_PRS" ]; then
     if [ "$EXP_MXFP4" = true ]; then echo "Error: --apply-vllm-pr is incompatible with --exp-mxfp4"; exit 1; fi
diff --git a/launch-cluster.sh b/launch-cluster.sh
index 24b6a42..f7dc1cd 100755
--- a/launch-cluster.sh
+++ b/launch-cluster.sh
@@ -159,9 +159,12 @@ done
 # Set .env file path (use default if not specified)
 if [[ -z "$CONFIG_FILE" ]]; then
     CONFIG_FILE="$SCRIPT_DIR/.env"
+    CONFIG_FILE_SET=false
+else
+    CONFIG_FILE_SET=true
 fi
 
-# Load .env file if exists
+# Load .env file
 if [[ -f "$CONFIG_FILE" ]]; then
     echo "Loading configuration from .env file..."
 