diff --git a/launch-cluster.sh b/launch-cluster.sh index bc47fc8..8433415 100755 --- a/launch-cluster.sh +++ b/launch-cluster.sh @@ -24,6 +24,8 @@ DAEMON_MODE="false" CHECK_CONFIG="false" ACTION="start" CLUSTER_WAS_RUNNING="false" +MOD_PATH="" +MOD_TYPE="" # Function to print usage usage() { @@ -34,6 +36,7 @@ usage() { echo " --eth-if Ethernet interface (Optional, auto-detected)" echo " --ib-if InfiniBand interface (Optional, auto-detected)" echo " --nccl-debug NCCL debug level (Optional, one of: VERSION, WARN, INFO, TRACE). If no level is provided, defaults to INFO." + echo " --apply-mod Path to directory or zip file containing run.sh to apply before launch" echo " --check-config Check configuration and auto-detection without launching" echo " -d Daemon mode (only for 'start' action)" echo " action start | stop | status | exec (Default: start)" @@ -49,6 +52,7 @@ while [[ "$#" -gt 0 ]]; do --name) CONTAINER_NAME="$2"; shift ;; --eth-if) ETH_IF="$2"; shift ;; --ib-if) IB_IF="$2"; shift ;; + --apply-mod) MOD_PATH="$2"; shift ;; --nccl-debug) if [[ -n "$2" && "$2" =~ ^(VERSION|WARN|INFO|TRACE)$ ]]; then NCCL_DEBUG_VAL="$2" @@ -96,6 +100,41 @@ if [[ -n "$NCCL_DEBUG_VAL" ]]; then esac fi +# Validate MOD_PATH if set +if [[ -n "$MOD_PATH" ]]; then + if [[ ! -e "$MOD_PATH" ]]; then + echo "Error: Mod path '$MOD_PATH' does not exist." + exit 1 + fi + + if [[ -d "$MOD_PATH" ]]; then + if [[ ! -f "$MOD_PATH/run.sh" ]]; then + echo "Error: Mod directory must contain 'run.sh'." + exit 1 + fi + MOD_TYPE="dir" + elif [[ -f "$MOD_PATH" && "$MOD_PATH" == *.zip ]]; then + # Check zip content using unzip if available, else python + if command -v unzip &> /dev/null; then + if ! unzip -l "$MOD_PATH" | grep -q "run.sh"; then + echo "Error: Mod zip file must contain 'run.sh'." + exit 1 + fi + else + # Fallback to python for checking zip content + if ! python3 -c "import zipfile, sys; sys.exit(0 if 'run.sh' in zipfile.ZipFile(sys.argv[1]).namelist() else 1)" "$MOD_PATH"; then + echo "Error: Mod zip file must contain 'run.sh'." + exit 1 + fi + fi + MOD_TYPE="zip" + else + echo "Error: --apply-mod must be a directory or a .zip file." + exit 1 + fi + MOD_PATH=$(realpath "$MOD_PATH") +fi + # --- Auto-Detection Logic --- # Source autodiscover module @@ -249,6 +288,92 @@ check_cluster_running() { fi } +# Apply Mod Function +apply_mod_to_container() { + local node_ip="$1" + local container="$2" + local is_local="$3" # true/false + + local mod_name=$(basename "$MOD_PATH") + if [[ "$MOD_TYPE" == "zip" ]]; then + mod_name="${mod_name%.*}" + fi + + echo "Applying mod '$mod_name' to $node_ip..." + + # 1. Copy mod to node (if remote) + local target_mod_path="" + if [[ "$is_local" == "true" ]]; then + target_mod_path="$MOD_PATH" + else + # SCP to remote + local remote_tmp="/tmp/vllm_mod_pkg_$(date +%s)" + echo " Copying mod package to $node_ip:$remote_tmp..." + + # Create directory first to ensure consistent path structure + ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$node_ip" "mkdir -p $remote_tmp" + + # Copy contents using wildcard to avoid creating a subdirectory + # Note: We use scp -r with wildcard to copy contents into the pre-created dir + if ! scp -r -o BatchMode=yes -o StrictHostKeyChecking=no "$MOD_PATH"/* "$node_ip:$remote_tmp/"; then + echo "Error: Failed to copy mod to $node_ip" + exit 1 + fi + target_mod_path="$remote_tmp" + fi + + # 2. Copy into container + local container_dest="/workspace/mods/$mod_name" + + # Command prefix for remote vs local + local cmd_prefix="" + if [[ "$is_local" == "false" ]]; then + cmd_prefix="ssh -o BatchMode=yes -o StrictHostKeyChecking=no $node_ip" + fi + + # Create workspace in container + $cmd_prefix docker exec "$container" mkdir -p "$container_dest" + + if [[ "$MOD_TYPE" == "zip" ]]; then + local zip_name=$(basename "$MOD_PATH") + echo " Copying zip to container..." + $cmd_prefix docker cp "$target_mod_path" "$container:$container_dest/$zip_name" + + # Unzip in container using python + echo " Extracting zip..." + local py_unzip="import zipfile, sys; zipfile.ZipFile(sys.argv[1], 'r').extractall(sys.argv[2])" + $cmd_prefix docker exec "$container" python3 -c "$py_unzip" "$container_dest/$zip_name" "$container_dest" + else + # Directory + echo " Copying directory content to container..." + if [[ "$is_local" == "true" ]]; then + docker cp "$MOD_PATH/." "$container:$container_dest/" + else + # For remote, we copied contents to $target_mod_path. + # We want to copy contents of $target_mod_path to $container_dest. + $cmd_prefix docker cp "$target_mod_path/." "$container:$container_dest/" + fi + fi + + # 3. Run run.sh + echo " Running patch script on $node_ip..." + echo "CMD: cd $container_dest && chmod +x run.sh && ./run.sh" + if ! $cmd_prefix docker exec "$container" bash -c "cd $container_dest && chmod +x run.sh && ./run.sh"; then + echo "Error: Patch script failed on $node_ip" + # We should probably stop the cluster here or at least fail hard + exit 1 + fi + + # 4. Signal completion + echo " Signaling completion..." + $cmd_prefix docker exec "$container" touch /tmp/mod_done + + # 5. Cleanup remote temp + if [[ "$is_local" == "false" ]]; then + ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$node_ip" "rm -rf $target_mod_path" + fi +} + # Start Cluster Function start_cluster() { check_cluster_running @@ -259,34 +384,45 @@ start_cluster() { # Start Head Node echo "Starting Head Node on $HEAD_IP..." + + local head_cmd_args=() + if [[ -n "$MOD_PATH" ]]; then + head_cmd_args=(bash -c "echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting node...; exec ./run-cluster-node.sh --role head --host-ip $HEAD_IP --eth-if $ETH_IF --ib-if $IB_IF") + else + head_cmd_args=(./run-cluster-node.sh --role head --host-ip "$HEAD_IP" --eth-if "$ETH_IF" --ib-if "$IB_IF") + fi + docker run -d --privileged --gpus all --rm \ --ipc=host --network host \ --name "$CONTAINER_NAME" \ $DOCKER_ARGS \ "$IMAGE_NAME" \ - ./run-cluster-node.sh \ - --role head \ - --host-ip "$HEAD_IP" \ - --eth-if "$ETH_IF" \ - --ib-if "$IB_IF" + "${head_cmd_args[@]}" - # Start Worker Nodes # Start Worker Nodes for worker in "${PEER_NODES[@]}"; do echo "Starting Worker Node on $worker..." - ssh "$worker" "docker run -d --privileged --gpus all --rm \ - --ipc=host --network host \ - --name $CONTAINER_NAME \ - $DOCKER_ARGS \ - $IMAGE_NAME \ - ./run-cluster-node.sh \ - --role node \ - --host-ip $worker \ - --eth-if $ETH_IF \ - --ib-if $IB_IF \ - --head-ip $HEAD_IP" + + local docker_run_cmd="docker run -d --privileged --gpus all --rm --ipc=host --network host --name $CONTAINER_NAME $DOCKER_ARGS $IMAGE_NAME" + + if [[ -n "$MOD_PATH" ]]; then + local inner_script="echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting node...; exec ./run-cluster-node.sh --role node --host-ip $worker --eth-if $ETH_IF --ib-if $IB_IF --head-ip $HEAD_IP" + ssh "$worker" "$docker_run_cmd bash -c \"$inner_script\"" + else + ssh "$worker" "$docker_run_cmd ./run-cluster-node.sh --role node --host-ip $worker --eth-if $ETH_IF --ib-if $IB_IF --head-ip $HEAD_IP" + fi done + # Apply mods if requested + if [[ -n "$MOD_PATH" ]]; then + echo "Applying modifications to cluster nodes..." + apply_mod_to_container "$HEAD_IP" "$CONTAINER_NAME" "true" + + for worker in "${PEER_NODES[@]}"; do + apply_mod_to_container "$worker" "$CONTAINER_NAME" "false" + done + fi + wait_for_cluster } diff --git a/mods/fix-Salyut1-GLM-4.7-NVFP4/glm4_moe.patch b/mods/fix-Salyut1-GLM-4.7-NVFP4/glm4_moe.patch new file mode 100644 index 0000000..5d63693 --- /dev/null +++ b/mods/fix-Salyut1-GLM-4.7-NVFP4/glm4_moe.patch @@ -0,0 +1,18 @@ +--- a/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/glm4_moe.py ++++ b/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/glm4_moe.py +@@ -537,6 +537,7 @@ + if is_pp_missing_parameter(name, self): + continue + ++ if ('k_scale' in name or 'v_scale' in name) and name not in params_dict: continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) +@@ -596,6 +597,7 @@ + if is_pp_missing_parameter(name, self): + continue + ++ if ('k_scale' in name or 'v_scale' in name) and name not in params_dict: continue + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader diff --git a/mods/fix-Salyut1-GLM-4.7-NVFP4/run.sh b/mods/fix-Salyut1-GLM-4.7-NVFP4/run.sh new file mode 100644 index 0000000..5d1c39f --- /dev/null +++ b/mods/fix-Salyut1-GLM-4.7-NVFP4/run.sh @@ -0,0 +1,3 @@ +#!/bin/bash +set -e +patch -p1 -d / < glm4_moe.patch \ No newline at end of file