Move recipe to 4x-spark-cluster/ and add UMA memory optimizations

- Move qwen3.5-397b-int4-autoround.yaml to recipes/4x-spark-cluster/
  per maintainer request (multi-node recipes in separate directory)
- Add PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to recipe env
- Optimize Ray for GB10 UMA (128GB shared CPU/GPU memory):
  - Disable Ray dashboard (saves ~1.2 GiB per node)
  - Limit Ray object store to 1 GiB (the default is 30% of RAM, about 33 GiB on these nodes)
  - Disable pre-started idle workers (saves ~8 GiB on head node)
  - Set --num-cpus 2 and --disable-usage-stats on all nodes
- Net effect: more than 40 GiB freed across the 4-node cluster for model weights and KV cache
This commit is contained in:
sonusflow
2026-03-11 07:29:45 +00:00
parent 006734910c
commit 3baca14eb1
2 changed files with 10 additions and 5 deletions

View File

@@ -24,6 +24,7 @@ mods:
# Environment variables # Environment variables
env: env:
VLLM_MARLIN_USE_ATOMIC_ADD: 1 VLLM_MARLIN_USE_ATOMIC_ADD: 1
PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
# Default settings (can be overridden via CLI, e.g. --tensor_parallel 2) # Default settings (can be overridden via CLI, e.g. --tensor_parallel 2)
defaults: defaults:

View File

@@ -101,19 +101,23 @@ export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME" export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
export_persist RAY_memory_monitor_refresh_ms "0" export_persist RAY_memory_monitor_refresh_ms "0"
# UMA Memory Optimization (DGX Spark 128GB shared CPU/GPU memory)
# Disable pre-started idle workers (saves ~8 GiB on head node)
export_persist RAY_num_prestart_python_workers "0"
# Limit object store to 1 GiB (default 30% of RAM = 33 GiB, wastes UMA)
export_persist RAY_object_store_memory "1073741824"
# --- Execution --- # --- Execution ---
if [ "${NODE_TYPE}" == "head" ]; then if [ "${NODE_TYPE}" == "head" ]; then
echo "Starting Ray HEAD node..." echo "Starting Ray HEAD node..."
exec ray start --block --head --port 6379 \ exec ray start --block --head --port 6379 --object-store-memory 1073741824 --num-cpus 2 \
--node-ip-address "$VLLM_HOST_IP" \ --node-ip-address "$VLLM_HOST_IP" \
--include-dashboard=True \ --include-dashboard=false \
--dashboard-host "0.0.0.0" \
--dashboard-port 8265 \
--disable-usage-stats --disable-usage-stats
else else
echo "Starting Ray WORKER node connecting to $HEAD_IP..." echo "Starting Ray WORKER node connecting to $HEAD_IP..."
exec ray start --block \ exec ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats \
--address="$HEAD_IP:6379" \ --address="$HEAD_IP:6379" \
--node-ip-address "$VLLM_HOST_IP" --node-ip-address "$VLLM_HOST_IP"
fi fi