Move recipe to 4x-spark-cluster/ and add UMA memory optimizations

- Move qwen3.5-397b-int4-autoround.yaml to recipes/4x-spark-cluster/ per maintainer request (multi-node recipes in separate directory)
- Add PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to recipe env
- Optimize Ray for GB10 UMA (128GB shared CPU/GPU memory):
  - Disable Ray dashboard (saves ~1.2 GiB per node)
  - Limit Ray object store to 1 GiB (default 30% of RAM = 33 GiB)
  - Disable pre-started idle workers (saves ~8 GiB on head node)
  - Set --num-cpus 2 and --disable-usage-stats on all nodes
- Net effect: ~40+ GiB freed across 4-node cluster for model/KV cache
2026-03-11 07:29:45 +00:00
parent 006734910c
commit 3baca14eb1
2 changed files with 10 additions and 5 deletions
--- a/recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
+++ b/recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
@@ -24,6 +24,7 @@ mods:
 # Environment variables
 env:
  VLLM_MARLIN_USE_ATOMIC_ADD: 1
  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
 # Default settings (can be overridden via CLI, e.g. --tensor_parallel 2)
 defaults:
--- a/run-cluster-node.sh
+++ b/run-cluster-node.sh
@@ -101,19 +101,23 @@ export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
 export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
 export_persist RAY_memory_monitor_refresh_ms "0"
 # UMA Memory Optimization (DGX Spark 128GB shared CPU/GPU memory)
 # Disable pre-started idle workers (saves ~8 GiB on head node)
 export_persist RAY_num_prestart_python_workers "0"
 # Limit object store to 1 GiB (default 30% of RAM = 33 GiB, wastes UMA)
 export_persist RAY_object_store_memory "1073741824"
 # --- Execution ---
 if [ "${NODE_TYPE}" == "head" ]; then
    echo "Starting Ray HEAD node..."
-    exec ray start --block --head --port 6379 \
+    exec ray start --block --head --port 6379 --object-store-memory 1073741824 --num-cpus 2 \
        --node-ip-address "$VLLM_HOST_IP" \
-	--include-dashboard=True \
+	--include-dashboard=false \
        --dashboard-host "0.0.0.0" \
        --dashboard-port 8265 \
        --disable-usage-stats
 else
    echo "Starting Ray WORKER node connecting to $HEAD_IP..."
-    exec ray start --block \
+    exec ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats \
        --address="$HEAD_IP:6379" \
        --node-ip-address "$VLLM_HOST_IP"
 fi