From 3baca14eb1364b2cb039fa5339d09a36b577896d Mon Sep 17 00:00:00 2001
From: sonusflow
Date: Wed, 11 Mar 2026 07:29:45 +0000
Subject: [PATCH] Move recipe to 4x-spark-cluster/ and add UMA memory optimizations

- Move qwen3.5-397b-int4-autoround.yaml to recipes/4x-spark-cluster/ per
  maintainer request (multi-node recipes go in a separate directory)
- Add PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to recipe env
- Optimize Ray for GB10 UMA (128 GB shared CPU/GPU memory):
  - Disable Ray dashboard (saves ~1.2 GiB per node)
  - Limit Ray object store to 1 GiB (default is 30% of RAM = ~33 GiB)
  - Disable pre-started idle workers (saves ~8 GiB on head node)
  - Set --num-cpus 2 and --disable-usage-stats on all nodes
- Net effect: ~40+ GiB freed across the 4-node cluster for model/KV cache
---
 .../qwen3.5-397b-int4-autoround.yaml |  1 +
 run-cluster-node.sh                  | 14 +++++++++-----
 2 files changed, 10 insertions(+), 5 deletions(-)
 rename recipes/{ => 4x-spark-cluster}/qwen3.5-397b-int4-autoround.yaml (96%)

diff --git a/recipes/qwen3.5-397b-int4-autoround.yaml b/recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
similarity index 96%
rename from recipes/qwen3.5-397b-int4-autoround.yaml
rename to recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
index 073741a..c1d4d85 100644
--- a/recipes/qwen3.5-397b-int4-autoround.yaml
+++ b/recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
@@ -24,6 +24,7 @@ mods:
 # Environment variables
 env:
   VLLM_MARLIN_USE_ATOMIC_ADD: 1
+  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
 
 # Default settings (can be overridden via CLI, e.g. --tensor_parallel 2)
 defaults:
diff --git a/run-cluster-node.sh b/run-cluster-node.sh
index 2ec9049..796fe06 100755
--- a/run-cluster-node.sh
+++ b/run-cluster-node.sh
@@ -101,19 +101,23 @@
 export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
 export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
 export_persist RAY_memory_monitor_refresh_ms "0"
 
+# UMA memory optimization (DGX Spark: 128 GB shared CPU/GPU memory)
+# Disable pre-started idle workers (saves ~8 GiB on head node)
+export_persist RAY_num_prestart_python_workers "0"
+# Limit object store to 1 GiB (default 30% of RAM = ~33 GiB, which wastes UMA)
+export_persist RAY_object_store_memory "1073741824"
+
 # --- Execution ---
 if [ "${NODE_TYPE}" == "head" ]; then
     echo "Starting Ray HEAD node..."
-    exec ray start --block --head --port 6379 \
+    exec ray start --block --head --port 6379 --object-store-memory 1073741824 --num-cpus 2 \
         --node-ip-address "$VLLM_HOST_IP" \
-        --include-dashboard=True \
-        --dashboard-host "0.0.0.0" \
-        --dashboard-port 8265 \
+        --include-dashboard=false \
         --disable-usage-stats
 else
     echo "Starting Ray WORKER node connecting to $HEAD_IP..."
-    exec ray start --block \
+    exec ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats \
         --address="$HEAD_IP:6379" \
         --node-ip-address "$VLLM_HOST_IP"
 fi
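
Note on the allocator setting: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
targets fragmentation in the CUDA caching allocator rather than any fixed
reservation, which matters on UMA where CPU and GPU draw from the same 128 GB.
Below is a minimal, illustrative probe of the effect, assuming a CUDA-capable
PyTorch install; the tensor sizes are arbitrary and not taken from the recipe.
Run it once with the variable unset and once as shown; with expandable
segments the gap between reserved and allocated memory typically shrinks for
mixed-size workloads.

# Illustrative allocator probe; compare "reserved" with and without the setting.
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python3 - <<'EOF'
import torch

# Mixed-size blocks encourage fragmentation in the caching allocator.
xs = [torch.empty((i + 1) * 16, 1024, 1024, device="cuda") for i in range(8)]
del xs[::2]   # free every other block, leaving holes in the cache
torch.cuda.synchronize()
print("allocated GiB:", torch.cuda.memory_allocated() / 2**30)
print("reserved  GiB:", torch.cuda.memory_reserved() / 2**30)
EOF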
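
Verification sketch (not part of the patch): once all four nodes have joined,
the effective caps can be checked from the head node. This assumes the Ray CLI
is on PATH in the same environment that ran run-cluster-node.sh; the exact
"ray status" output format varies across Ray releases.

# Run on the head node after the workers have joined (illustrative).
ray status       # expect 4 nodes; object store should total ~4 GiB (1 GiB x 4)
env | grep -E '^(RAY_|PYTORCH_CUDA_ALLOC_CONF)'   # confirm the persisted vars
df -h /dev/shm   # Ray's object store is backed by shared memory on each node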