diff --git a/mods/gpu-mem-util-gb/gpu_mem.patch b/mods/gpu-mem-util-gb/gpu_mem.patch
new file mode 100644
index 0000000..aa044b5
--- /dev/null
+++ b/mods/gpu-mem-util-gb/gpu_mem.patch
@@ -0,0 +1,255 @@
+diff --git a/vllm/config/cache.py b/vllm/config/cache.py
+index 3796265ff..b6dcfb54c 100644
+--- a/vllm/config/cache.py
++++ b/vllm/config/cache.py
+@@ -45,6 +45,11 @@ class CacheConfig:
+     not matter if you have another vLLM instance running on the same GPU. For
+     example, if you have two vLLM instances running on the same GPU, you can
+     set the GPU memory utilization to 0.5 for each instance."""
++    gpu_memory_utilization_gb: float | None = Field(default=None, gt=0)
++    """Amount of GPU memory to be used in GiB. This provides fine-grained control
++    over GPU memory usage and is particularly useful on unified memory systems
++    where available memory changes dynamically. If specified, it overrides
++    gpu_memory_utilization. Cannot be used simultaneously with kv_cache_memory_bytes."""
+     cache_dtype: CacheDType = "auto"
+     """Data type for kv cache storage. If "auto", will use model data type.
+     CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports
+@@ -204,6 +209,18 @@ class CacheConfig:
+         object.__setattr__(self, "user_specified_block_size", True)
+         return self
+ 
++    @model_validator(mode="after")
++    def _validate_memory_params(self) -> "CacheConfig":
++        if (
++            self.gpu_memory_utilization_gb is not None
++            and self.kv_cache_memory_bytes is not None
++        ):
++            raise ValueError(
++                "Cannot specify both gpu_memory_utilization_gb and "
++                "kv_cache_memory_bytes. Please use only one of them."
++            )
++        return self
++
+     @field_validator("cache_dtype", mode="after")
+     @classmethod
+     def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
+diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
+index 56bbb7bf5..db5012608 100644
+--- a/vllm/engine/arg_utils.py
++++ b/vllm/engine/arg_utils.py
+@@ -454,6 +454,7 @@ class EngineArgs:
+     offload_prefetch_step: int = PrefetchOffloadConfig.offload_prefetch_step
+     offload_params: set[str] = get_field(PrefetchOffloadConfig, "offload_params")
+     gpu_memory_utilization: float = CacheConfig.gpu_memory_utilization
++    gpu_memory_utilization_gb: float | None = CacheConfig.gpu_memory_utilization_gb
+     kv_cache_memory_bytes: int | None = CacheConfig.kv_cache_memory_bytes
+     max_num_batched_tokens: int | None = None
+     max_num_partial_prefills: int = SchedulerConfig.max_num_partial_prefills
+@@ -954,6 +955,9 @@ class EngineArgs:
+         cache_group.add_argument(
+             "--gpu-memory-utilization", **cache_kwargs["gpu_memory_utilization"]
+         )
++        cache_group.add_argument(
++            "--gpu-memory-utilization-gb", **cache_kwargs["gpu_memory_utilization_gb"]
++        )
+         cache_group.add_argument(
+             "--kv-cache-memory-bytes", **cache_kwargs["kv_cache_memory_bytes"]
+         )
+@@ -1512,6 +1516,7 @@ class EngineArgs:
+         cache_config = CacheConfig(
+             block_size=self.block_size,  # type: ignore[arg-type]
+             gpu_memory_utilization=self.gpu_memory_utilization,
++            gpu_memory_utilization_gb=self.gpu_memory_utilization_gb,
+             kv_cache_memory_bytes=self.kv_cache_memory_bytes,
+             cache_dtype=resolved_cache_dtype,  # type: ignore[arg-type]
+             is_attention_free=model_config.is_attention_free,
+diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
+index 5909b3043..c2607df6a 100644
+--- a/vllm/entrypoints/llm.py
++++ b/vllm/entrypoints/llm.py
+@@ -156,6 +156,11 @@ class LLM:
+             values will increase the KV cache size and thus improve the model's
+             throughput. However, if the value is too high, it may cause out-of-
+             memory (OOM) errors.
++        gpu_memory_utilization_gb: Amount of GPU memory to reserve in GiB.
++            This provides fine-grained control over GPU memory usage and is
++            particularly useful on unified memory systems where available memory
++            changes dynamically. If specified, it overrides gpu_memory_utilization.
++            Cannot be used simultaneously with kv_cache_memory_bytes.
+         kv_cache_memory_bytes: Size of KV Cache per GPU in bytes. By default,
+             this is set to None and vllm can automatically infer the kv cache
+             size based on gpu_memory_utilization. However, users may want to
+@@ -234,6 +239,7 @@ class LLM:
+         chat_template: Path | str | None = None,
+         seed: int = 0,
+         gpu_memory_utilization: float = 0.9,
++        gpu_memory_utilization_gb: float | None = None,
+         cpu_offload_gb: float = 0,
+         offload_group_size: int = 0,
+         offload_num_in_group: int = 1,
+@@ -356,6 +362,7 @@ class LLM:
+             tokenizer_revision=tokenizer_revision,
+             seed=seed,
+             gpu_memory_utilization=gpu_memory_utilization,
++            gpu_memory_utilization_gb=gpu_memory_utilization_gb,
+             kv_cache_memory_bytes=kv_cache_memory_bytes,
+             cpu_offload_gb=cpu_offload_gb,
+             offload_group_size=offload_group_size,
+diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
+index 2ed7ef7e0..806830b17 100644
+--- a/vllm/v1/core/kv_cache_utils.py
++++ b/vllm/v1/core/kv_cache_utils.py
+@@ -622,7 +622,8 @@ def _check_enough_kv_cache_memory(
+     if available_memory <= 0:
+         raise ValueError(
+             "No available memory for the cache blocks. "
+-            "Try increasing `gpu_memory_utilization` when initializing the engine. "
++            "Try increasing `gpu_memory_utilization` or `gpu_memory_utilization_gb` "
++            "when initializing the engine. "
+             "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
+             "for more details."
+         )
+@@ -643,8 +644,8 @@ def _check_enough_kv_cache_memory(
+             f"({max_model_len}), ({format_gib(needed_memory)} GiB KV "
+             f"cache is needed, which is larger than the available KV cache "
+             f"memory ({format_gib(available_memory)} GiB). {estimated_msg}"
+-            f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
+-            f"when initializing the engine. "
++            f"Try increasing `gpu_memory_utilization` or `gpu_memory_utilization_gb`, "
++            f"or decreasing `max_model_len` when initializing the engine. "
+             f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
+             f"for more details."
+         )
+@@ -1438,7 +1439,8 @@ def _auto_fit_max_model_len(
+     if auto_fit_max <= 0:
+         raise ValueError(
+             "Cannot auto-fit max_model_len: not enough GPU memory available "
+-            "to serve even a single token. Try increasing `gpu_memory_utilization`."
++            "to serve even a single token. Try increasing `gpu_memory_utilization` "
++            "or `gpu_memory_utilization_gb`."
+         )
+ 
+     if auto_fit_max >= original_max:
+diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
+index 3d065927e..e8cef2ceb 100644
+--- a/vllm/v1/utils.py
++++ b/vllm/v1/utils.py
+@@ -358,6 +358,7 @@ def report_usage_stats(
+             "dtype": str(vllm_config.model_config.dtype),
+             "block_size": vllm_config.cache_config.block_size,
+             "gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization,
++            "gpu_memory_utilization_gb": vllm_config.cache_config.gpu_memory_utilization_gb,
+             "kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes,
+             # Quantization
+             "quantization": vllm_config.model_config.quantization,
+diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
+index b53bd71a1..d28821328 100644
+--- a/vllm/v1/worker/gpu_model_runner.py
++++ b/vllm/v1/worker/gpu_model_runner.py
+@@ -5355,8 +5355,8 @@ class GPUModelRunner(
+                 raise RuntimeError(
+                     "CUDA out of memory occurred when warming up sampler with "
+                     f"{num_reqs} dummy requests. Please try lowering "
+-                    "`max_num_seqs` or `gpu_memory_utilization` when "
+-                    "initializing the engine."
++                    "`max_num_seqs`, `gpu_memory_utilization`, or "
++                    "`gpu_memory_utilization_gb` when initializing the engine."
+                 ) from e
+             else:
+                 raise e
+@@ -5434,8 +5434,8 @@ class GPUModelRunner(
+                 raise RuntimeError(
+                     "CUDA out of memory occurred when warming up pooler "
+                     f"({task=}) with {num_reqs} dummy requests. Please try "
+-                    "lowering `max_num_seqs` or `gpu_memory_utilization` when "
+-                    "initializing the engine."
++                    "lowering `max_num_seqs`, `gpu_memory_utilization`, or "
++                    "`gpu_memory_utilization_gb` when initializing the engine."
+                 ) from e
+             else:
+                 raise e
+diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
+index 842e76549..bf3bb359b 100644
+--- a/vllm/v1/worker/gpu_worker.py
++++ b/vllm/v1/worker/gpu_worker.py
+@@ -357,7 +357,8 @@ class Worker(WorkerBase):
+ 
+         Tip:
+             You may limit the usage of GPU memory
+-            by adjusting the `gpu_memory_utilization` parameter.
++            by adjusting the `gpu_memory_utilization` or
++            `gpu_memory_utilization_gb` parameter.
+         """
+         if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes:
+             # still need a profile run which compiles the model for
+@@ -369,7 +370,8 @@ class Worker(WorkerBase):
+                 f"GiB, reserved {format_gib(kv_cache_memory_bytes)} GiB memory for "
+                 "KV Cache as specified by kv_cache_memory_bytes config and "
+                 "skipped memory profiling. This does not respect the "
+-                "gpu_memory_utilization config. Only use kv_cache_memory_bytes "
++                "gpu_memory_utilization or gpu_memory_utilization_gb config. "
++                "Only use kv_cache_memory_bytes "
+                 "config when you want manual control of KV cache memory "
+                 "size. If OOM'ed, check the difference of initial free "
+                 "memory between the current run and the previous run "
+diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
+index d06c40ed6..89c94e641 100644
+--- a/vllm/v1/worker/utils.py
++++ b/vllm/v1/worker/utils.py
+@@ -405,21 +405,43 @@ def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) ->
+     Calculate the amount of memory required by vLLM, then validate
+     that the current amount of free memory is sufficient for that.
+     """
+-    requested_memory = math.ceil(
+-        init_snapshot.total_memory * cache_config.gpu_memory_utilization
+-    )
+-
+-    if init_snapshot.free_memory < requested_memory:
+-        raise ValueError(
+-            f"Free memory on device {init_snapshot.device_} "
+-            f"({format_gib(init_snapshot.free_memory)}/"
+-            f"{format_gib(init_snapshot.total_memory)} GiB) on startup "
+-            f"is less than desired GPU memory utilization "
+-            f"({cache_config.gpu_memory_utilization}, "
+-            f"{format_gib(requested_memory)} GiB). Decrease GPU memory "
+-            f"utilization or reduce GPU memory used by other processes."
++    if cache_config.gpu_memory_utilization_gb is not None:
++        requested_memory = math.ceil(cache_config.gpu_memory_utilization_gb * 1024**3)
++        if requested_memory <= 0:
++            raise ValueError(
++                f"gpu_memory_utilization_gb must be positive, got "
++                f"{cache_config.gpu_memory_utilization_gb} GiB."
++            )
++        if requested_memory > init_snapshot.total_memory:
++            raise ValueError(
++                f"Requested memory ({format_gib(requested_memory)} GiB) exceeds "
++                f"total GPU memory ({format_gib(init_snapshot.total_memory)} GiB). "
++                f"Reduce gpu_memory_utilization_gb or use a smaller value."
++            )
++        safety_margin = 0.5 * 1024**3
++        if requested_memory > init_snapshot.free_memory + safety_margin:
++            raise ValueError(
++                f"Requested memory ({format_gib(requested_memory)} GiB) exceeds "
++                f"available memory ({format_gib(init_snapshot.free_memory)} GiB) "
++                f"with safety margin ({format_gib(safety_margin)} GiB). "
++                f"Reduce gpu_memory_utilization_gb or free up GPU memory."
++            )
++    else:
++        requested_memory = math.ceil(
++            init_snapshot.total_memory * cache_config.gpu_memory_utilization
+         )
+ 
++        if init_snapshot.free_memory < requested_memory:
++            raise ValueError(
++                f"Free memory on device {init_snapshot.device_} "
++                f"({format_gib(init_snapshot.free_memory)}/"
++                f"{format_gib(init_snapshot.total_memory)} GiB) on startup "
++                f"is less than desired GPU memory utilization "
++                f"({cache_config.gpu_memory_utilization}, "
++                f"{format_gib(requested_memory)} GiB). Decrease GPU memory "
++                f"utilization or reduce GPU memory used by other processes."
++            )
++
+     return requested_memory
+ 
+ 
diff --git a/mods/gpu-mem-util-gb/run.sh b/mods/gpu-mem-util-gb/run.sh
new file mode 100644
index 0000000..1db6efe
--- /dev/null
+++ b/mods/gpu-mem-util-gb/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -e
+
+patch -p1 -d /usr/local/lib/python3.12/dist-packages < gpu_mem.patch \
+  && echo "=====> You can now use --gpu-memory-utilization-gb parameter to specify reserved memory in GiB"
\ No newline at end of file