From 3baca14eb1364b2cb039fa5339d09a36b577896d Mon Sep 17 00:00:00 2001
From: sonusflow
Date: Wed, 11 Mar 2026 07:29:45 +0000
Subject: [PATCH] Move recipe to 4x-spark-cluster/ and add UMA memory optimizations

- Move qwen3.5-397b-int4-autoround.yaml to recipes/4x-spark-cluster/ per
  maintainer request (multi-node recipes go in a separate directory)
- Add PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to recipe env
- Optimize Ray for GB10 UMA (128 GB shared CPU/GPU memory):
  - Disable Ray dashboard (saves ~1.2 GiB per node)
  - Limit Ray object store to 1 GiB (default is 30% of RAM = ~33 GiB)
  - Disable pre-started idle workers (saves ~8 GiB on head node)
  - Set --num-cpus 2 and --disable-usage-stats on all nodes
- Net effect: ~40+ GiB freed across the 4-node cluster for model/KV cache
---
 .../qwen3.5-397b-int4-autoround.yaml |  1 +
 run-cluster-node.sh                  | 14 +++++++++-----
 2 files changed, 10 insertions(+), 5 deletions(-)
 rename recipes/{ => 4x-spark-cluster}/qwen3.5-397b-int4-autoround.yaml (96%)

diff --git a/recipes/qwen3.5-397b-int4-autoround.yaml b/recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
similarity index 96%
rename from recipes/qwen3.5-397b-int4-autoround.yaml
rename to recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
index 073741a..c1d4d85 100644
--- a/recipes/qwen3.5-397b-int4-autoround.yaml
+++ b/recipes/4x-spark-cluster/qwen3.5-397b-int4-autoround.yaml
@@ -24,6 +24,7 @@ mods:
 # Environment variables
 env:
   VLLM_MARLIN_USE_ATOMIC_ADD: 1
+  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
 
 # Default settings (can be overridden via CLI, e.g. --tensor_parallel 2)
 defaults:
diff --git a/run-cluster-node.sh b/run-cluster-node.sh
index 2ec9049..796fe06 100755
--- a/run-cluster-node.sh
+++ b/run-cluster-node.sh
@@ -101,19 +101,23 @@
 export_persist GLOO_SOCKET_IFNAME "$ETH_IF_NAME"
 export_persist TP_SOCKET_IFNAME "$ETH_IF_NAME"
 export_persist RAY_memory_monitor_refresh_ms "0"
 
+# UMA memory optimization (DGX Spark: 128 GB shared CPU/GPU memory)
+# Disable pre-started idle workers (saves ~8 GiB on head node)
+export_persist RAY_num_prestart_python_workers "0"
+# Limit object store to 1 GiB (default 30% of RAM = ~33 GiB, which wastes UMA)
+export_persist RAY_object_store_memory "1073741824"
+
 # --- Execution ---
 if [ "${NODE_TYPE}" == "head" ]; then
     echo "Starting Ray HEAD node..."
-    exec ray start --block --head --port 6379 \
+    exec ray start --block --head --port 6379 --object-store-memory 1073741824 --num-cpus 2 \
         --node-ip-address "$VLLM_HOST_IP" \
-        --include-dashboard=True \
-        --dashboard-host "0.0.0.0" \
-        --dashboard-port 8265 \
+        --include-dashboard=false \
         --disable-usage-stats
 else
     echo "Starting Ray WORKER node connecting to $HEAD_IP..."
-    exec ray start --block \
+    exec ray start --block --object-store-memory 1073741824 --num-cpus 2 --disable-usage-stats \
         --address="$HEAD_IP:6379" \
         --node-ip-address "$VLLM_HOST_IP"
 fi
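
Note on the allocator setting: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
targets fragmentation in the CUDA caching allocator rather than any fixed
reservation, which matters on UMA where CPU and GPU draw from the same 128 GB.
Below is a minimal, illustrative probe of the effect, assuming a CUDA-capable
PyTorch install; the tensor sizes are arbitrary and not taken from the recipe.
Run it once with the variable unset and once as shown; with expandable
segments the gap between reserved and allocated memory typically shrinks for
mixed-size workloads.

# Illustrative allocator probe; compare "reserved" with and without the setting.
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python3 - <<'EOF'
import torch

# Mixed-size blocks encourage fragmentation in the caching allocator.
xs = [torch.empty((i + 1) * 16, 1024, 1024, device="cuda") for i in range(8)]
del xs[::2]   # free every other block, leaving holes in the cache
torch.cuda.synchronize()
print("allocated GiB:", torch.cuda.memory_allocated() / 2**30)
print("reserved  GiB:", torch.cuda.memory_reserved() / 2**30)
EOF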
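
Verification sketch (not part of the patch): once all four nodes have joined,
the effective caps can be checked from the head node. This assumes the Ray CLI
is on PATH in the same environment that ran run-cluster-node.sh; the exact
"ray status" output format varies across Ray releases.

# Run on the head node after the workers have joined (illustrative).
ray status       # expect 4 nodes; object store should total ~4 GiB (1 GiB x 4)
env | grep -E '^(RAY_|PYTORCH_CUDA_ALLOC_CONF)'   # confirm the persisted vars
df -h /dev/shm   # Ray's object store is backed by shared memory on each node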