initial mod implementation
This commit is contained in:
@@ -24,6 +24,8 @@ DAEMON_MODE="false"
|
|||||||
CHECK_CONFIG="false"
|
CHECK_CONFIG="false"
|
||||||
ACTION="start"
|
ACTION="start"
|
||||||
CLUSTER_WAS_RUNNING="false"
|
CLUSTER_WAS_RUNNING="false"
|
||||||
|
MOD_PATH=""
|
||||||
|
MOD_TYPE=""
|
||||||
|
|
||||||
# Function to print usage
|
# Function to print usage
|
||||||
usage() {
|
usage() {
|
||||||
@@ -34,6 +36,7 @@ usage() {
|
|||||||
echo " --eth-if Ethernet interface (Optional, auto-detected)"
|
echo " --eth-if Ethernet interface (Optional, auto-detected)"
|
||||||
echo " --ib-if InfiniBand interface (Optional, auto-detected)"
|
echo " --ib-if InfiniBand interface (Optional, auto-detected)"
|
||||||
echo " --nccl-debug NCCL debug level (Optional, one of: VERSION, WARN, INFO, TRACE). If no level is provided, defaults to INFO."
|
echo " --nccl-debug NCCL debug level (Optional, one of: VERSION, WARN, INFO, TRACE). If no level is provided, defaults to INFO."
|
||||||
|
echo " --apply-mod Path to directory or zip file containing run.sh to apply before launch"
|
||||||
echo " --check-config Check configuration and auto-detection without launching"
|
echo " --check-config Check configuration and auto-detection without launching"
|
||||||
echo " -d Daemon mode (only for 'start' action)"
|
echo " -d Daemon mode (only for 'start' action)"
|
||||||
echo " action start | stop | status | exec (Default: start)"
|
echo " action start | stop | status | exec (Default: start)"
|
||||||
@@ -49,6 +52,7 @@ while [[ "$#" -gt 0 ]]; do
|
|||||||
--name) CONTAINER_NAME="$2"; shift ;;
|
--name) CONTAINER_NAME="$2"; shift ;;
|
||||||
--eth-if) ETH_IF="$2"; shift ;;
|
--eth-if) ETH_IF="$2"; shift ;;
|
||||||
--ib-if) IB_IF="$2"; shift ;;
|
--ib-if) IB_IF="$2"; shift ;;
|
||||||
|
--apply-mod) MOD_PATH="$2"; shift ;;
|
||||||
--nccl-debug)
|
--nccl-debug)
|
||||||
if [[ -n "$2" && "$2" =~ ^(VERSION|WARN|INFO|TRACE)$ ]]; then
|
if [[ -n "$2" && "$2" =~ ^(VERSION|WARN|INFO|TRACE)$ ]]; then
|
||||||
NCCL_DEBUG_VAL="$2"
|
NCCL_DEBUG_VAL="$2"
|
||||||
@@ -96,6 +100,41 @@ if [[ -n "$NCCL_DEBUG_VAL" ]]; then
|
|||||||
esac
|
esac
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Validate MOD_PATH if set
|
||||||
|
if [[ -n "$MOD_PATH" ]]; then
|
||||||
|
if [[ ! -e "$MOD_PATH" ]]; then
|
||||||
|
echo "Error: Mod path '$MOD_PATH' does not exist."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -d "$MOD_PATH" ]]; then
|
||||||
|
if [[ ! -f "$MOD_PATH/run.sh" ]]; then
|
||||||
|
echo "Error: Mod directory must contain 'run.sh'."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
MOD_TYPE="dir"
|
||||||
|
elif [[ -f "$MOD_PATH" && "$MOD_PATH" == *.zip ]]; then
|
||||||
|
# Check zip content using unzip if available, else python
|
||||||
|
if command -v unzip &> /dev/null; then
|
||||||
|
if ! unzip -l "$MOD_PATH" | grep -q "run.sh"; then
|
||||||
|
echo "Error: Mod zip file must contain 'run.sh'."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
# Fallback to python for checking zip content
|
||||||
|
if ! python3 -c "import zipfile, sys; sys.exit(0 if 'run.sh' in zipfile.ZipFile(sys.argv[1]).namelist() else 1)" "$MOD_PATH"; then
|
||||||
|
echo "Error: Mod zip file must contain 'run.sh'."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
MOD_TYPE="zip"
|
||||||
|
else
|
||||||
|
echo "Error: --apply-mod must be a directory or a .zip file."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
MOD_PATH=$(realpath "$MOD_PATH")
|
||||||
|
fi
|
||||||
|
|
||||||
# --- Auto-Detection Logic ---
|
# --- Auto-Detection Logic ---
|
||||||
|
|
||||||
# Source autodiscover module
|
# Source autodiscover module
|
||||||
@@ -249,6 +288,92 @@ check_cluster_running() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Apply Mod Function
|
||||||
|
apply_mod_to_container() {
|
||||||
|
local node_ip="$1"
|
||||||
|
local container="$2"
|
||||||
|
local is_local="$3" # true/false
|
||||||
|
|
||||||
|
local mod_name=$(basename "$MOD_PATH")
|
||||||
|
if [[ "$MOD_TYPE" == "zip" ]]; then
|
||||||
|
mod_name="${mod_name%.*}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Applying mod '$mod_name' to $node_ip..."
|
||||||
|
|
||||||
|
# 1. Copy mod to node (if remote)
|
||||||
|
local target_mod_path=""
|
||||||
|
if [[ "$is_local" == "true" ]]; then
|
||||||
|
target_mod_path="$MOD_PATH"
|
||||||
|
else
|
||||||
|
# SCP to remote
|
||||||
|
local remote_tmp="/tmp/vllm_mod_pkg_$(date +%s)"
|
||||||
|
echo " Copying mod package to $node_ip:$remote_tmp..."
|
||||||
|
|
||||||
|
# Create directory first to ensure consistent path structure
|
||||||
|
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$node_ip" "mkdir -p $remote_tmp"
|
||||||
|
|
||||||
|
# Copy contents using wildcard to avoid creating a subdirectory
|
||||||
|
# Note: We use scp -r with wildcard to copy contents into the pre-created dir
|
||||||
|
if ! scp -r -o BatchMode=yes -o StrictHostKeyChecking=no "$MOD_PATH"/* "$node_ip:$remote_tmp/"; then
|
||||||
|
echo "Error: Failed to copy mod to $node_ip"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
target_mod_path="$remote_tmp"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 2. Copy into container
|
||||||
|
local container_dest="/workspace/mods/$mod_name"
|
||||||
|
|
||||||
|
# Command prefix for remote vs local
|
||||||
|
local cmd_prefix=""
|
||||||
|
if [[ "$is_local" == "false" ]]; then
|
||||||
|
cmd_prefix="ssh -o BatchMode=yes -o StrictHostKeyChecking=no $node_ip"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Create workspace in container
|
||||||
|
$cmd_prefix docker exec "$container" mkdir -p "$container_dest"
|
||||||
|
|
||||||
|
if [[ "$MOD_TYPE" == "zip" ]]; then
|
||||||
|
local zip_name=$(basename "$MOD_PATH")
|
||||||
|
echo " Copying zip to container..."
|
||||||
|
$cmd_prefix docker cp "$target_mod_path" "$container:$container_dest/$zip_name"
|
||||||
|
|
||||||
|
# Unzip in container using python
|
||||||
|
echo " Extracting zip..."
|
||||||
|
local py_unzip="import zipfile, sys; zipfile.ZipFile(sys.argv[1], 'r').extractall(sys.argv[2])"
|
||||||
|
$cmd_prefix docker exec "$container" python3 -c "$py_unzip" "$container_dest/$zip_name" "$container_dest"
|
||||||
|
else
|
||||||
|
# Directory
|
||||||
|
echo " Copying directory content to container..."
|
||||||
|
if [[ "$is_local" == "true" ]]; then
|
||||||
|
docker cp "$MOD_PATH/." "$container:$container_dest/"
|
||||||
|
else
|
||||||
|
# For remote, we copied contents to $target_mod_path.
|
||||||
|
# We want to copy contents of $target_mod_path to $container_dest.
|
||||||
|
$cmd_prefix docker cp "$target_mod_path/." "$container:$container_dest/"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 3. Run run.sh
|
||||||
|
echo " Running patch script on $node_ip..."
|
||||||
|
echo "CMD: cd $container_dest && chmod +x run.sh && ./run.sh"
|
||||||
|
if ! $cmd_prefix docker exec "$container" bash -c "cd $container_dest && chmod +x run.sh && ./run.sh"; then
|
||||||
|
echo "Error: Patch script failed on $node_ip"
|
||||||
|
# We should probably stop the cluster here or at least fail hard
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 4. Signal completion
|
||||||
|
echo " Signaling completion..."
|
||||||
|
$cmd_prefix docker exec "$container" touch /tmp/mod_done
|
||||||
|
|
||||||
|
# 5. Cleanup remote temp
|
||||||
|
if [[ "$is_local" == "false" ]]; then
|
||||||
|
ssh -o BatchMode=yes -o StrictHostKeyChecking=no "$node_ip" "rm -rf $target_mod_path"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
# Start Cluster Function
|
# Start Cluster Function
|
||||||
start_cluster() {
|
start_cluster() {
|
||||||
check_cluster_running
|
check_cluster_running
|
||||||
@@ -259,34 +384,45 @@ start_cluster() {
|
|||||||
|
|
||||||
# Start Head Node
|
# Start Head Node
|
||||||
echo "Starting Head Node on $HEAD_IP..."
|
echo "Starting Head Node on $HEAD_IP..."
|
||||||
|
|
||||||
|
local head_cmd_args=()
|
||||||
|
if [[ -n "$MOD_PATH" ]]; then
|
||||||
|
head_cmd_args=(bash -c "echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting node...; exec ./run-cluster-node.sh --role head --host-ip $HEAD_IP --eth-if $ETH_IF --ib-if $IB_IF")
|
||||||
|
else
|
||||||
|
head_cmd_args=(./run-cluster-node.sh --role head --host-ip "$HEAD_IP" --eth-if "$ETH_IF" --ib-if "$IB_IF")
|
||||||
|
fi
|
||||||
|
|
||||||
docker run -d --privileged --gpus all --rm \
|
docker run -d --privileged --gpus all --rm \
|
||||||
--ipc=host --network host \
|
--ipc=host --network host \
|
||||||
--name "$CONTAINER_NAME" \
|
--name "$CONTAINER_NAME" \
|
||||||
$DOCKER_ARGS \
|
$DOCKER_ARGS \
|
||||||
"$IMAGE_NAME" \
|
"$IMAGE_NAME" \
|
||||||
./run-cluster-node.sh \
|
"${head_cmd_args[@]}"
|
||||||
--role head \
|
|
||||||
--host-ip "$HEAD_IP" \
|
|
||||||
--eth-if "$ETH_IF" \
|
|
||||||
--ib-if "$IB_IF"
|
|
||||||
|
|
||||||
# Start Worker Nodes
|
|
||||||
# Start Worker Nodes
|
# Start Worker Nodes
|
||||||
for worker in "${PEER_NODES[@]}"; do
|
for worker in "${PEER_NODES[@]}"; do
|
||||||
echo "Starting Worker Node on $worker..."
|
echo "Starting Worker Node on $worker..."
|
||||||
ssh "$worker" "docker run -d --privileged --gpus all --rm \
|
|
||||||
--ipc=host --network host \
|
local docker_run_cmd="docker run -d --privileged --gpus all --rm --ipc=host --network host --name $CONTAINER_NAME $DOCKER_ARGS $IMAGE_NAME"
|
||||||
--name $CONTAINER_NAME \
|
|
||||||
$DOCKER_ARGS \
|
if [[ -n "$MOD_PATH" ]]; then
|
||||||
$IMAGE_NAME \
|
local inner_script="echo Waiting for mod application...; while [ ! -f /tmp/mod_done ]; do sleep 1; done; echo Mod applied, starting node...; exec ./run-cluster-node.sh --role node --host-ip $worker --eth-if $ETH_IF --ib-if $IB_IF --head-ip $HEAD_IP"
|
||||||
./run-cluster-node.sh \
|
ssh "$worker" "$docker_run_cmd bash -c \"$inner_script\""
|
||||||
--role node \
|
else
|
||||||
--host-ip $worker \
|
ssh "$worker" "$docker_run_cmd ./run-cluster-node.sh --role node --host-ip $worker --eth-if $ETH_IF --ib-if $IB_IF --head-ip $HEAD_IP"
|
||||||
--eth-if $ETH_IF \
|
fi
|
||||||
--ib-if $IB_IF \
|
|
||||||
--head-ip $HEAD_IP"
|
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# Apply mods if requested
|
||||||
|
if [[ -n "$MOD_PATH" ]]; then
|
||||||
|
echo "Applying modifications to cluster nodes..."
|
||||||
|
apply_mod_to_container "$HEAD_IP" "$CONTAINER_NAME" "true"
|
||||||
|
|
||||||
|
for worker in "${PEER_NODES[@]}"; do
|
||||||
|
apply_mod_to_container "$worker" "$CONTAINER_NAME" "false"
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
wait_for_cluster
|
wait_for_cluster
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
18
mods/fix-Salyut1-GLM-4.7-NVFP4/glm4_moe.patch
Normal file
18
mods/fix-Salyut1-GLM-4.7-NVFP4/glm4_moe.patch
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
--- a/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/glm4_moe.py
|
||||||
|
+++ b/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/glm4_moe.py
|
||||||
|
@@ -537,6 +537,7 @@
|
||||||
|
if is_pp_missing_parameter(name, self):
|
||||||
|
continue
|
||||||
|
|
||||||
|
+ if ('k_scale' in name or 'v_scale' in name) and name not in params_dict: continue
|
||||||
|
param = params_dict[name]
|
||||||
|
weight_loader = param.weight_loader
|
||||||
|
weight_loader(param, loaded_weight, shard_id)
|
||||||
|
@@ -596,6 +597,7 @@
|
||||||
|
if is_pp_missing_parameter(name, self):
|
||||||
|
continue
|
||||||
|
|
||||||
|
+ if ('k_scale' in name or 'v_scale' in name) and name not in params_dict: continue
|
||||||
|
param = params_dict[name]
|
||||||
|
weight_loader = getattr(
|
||||||
|
param, "weight_loader", default_weight_loader
|
||||||
3
mods/fix-Salyut1-GLM-4.7-NVFP4/run.sh
Normal file
3
mods/fix-Salyut1-GLM-4.7-NVFP4/run.sh
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
patch -p1 -d / < glm4_moe.patch
|
||||||
Reference in New Issue
Block a user