From f03b8bc51afa14fabd06412a16b27ee53d45b664 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 21 Nov 2025 11:13:36 -0800 Subject: [PATCH] ggml: Use max graph memory allocation when reserving When calculating the size of the memory required for a compute graph, we may test multiple graphs - for example a vision encoder and the text model. Since these graphs are never run at the same time, we just want the max size. Typically, a new graph only reallocates memory if it doesn't fit in the existing space, so the last graph reservation is the max size. However, the Vulkan backend imposes a 1G cap for a single allocation, which means that the graph may require multiple allocations. This results in a problem if: - There is an old graph with one small chunk and one big chunk - There is a new graph with one big chunk that is smaller than the total of the old graph. In this case, the big chunk of the new graph will trigger a reallocation, which will free the old second chunk. The total amount of memory reported will be lower than the max. To avoid this, we should explicitly take the max from each graph. 
--- ml/backend/ggml/ggml.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index ebcc1d86f..75fce7a68 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -851,19 +851,19 @@ func (c *Context) Reserve() { slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched)) - // Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations - for _, bt := range c.b.schedBufts { - c.b.btDeviceMemory[bt].Graph = 0 - } - + graphs := make(map[C.ggml_backend_buffer_type_t]uint64) for i := range c.b.schedBackends { bufferSize := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i]) - c.b.btDeviceMemory[c.b.schedBufts[i]].Graph += uint64(bufferSize) + graphs[c.b.schedBufts[i]] += uint64(bufferSize) logutil.Trace("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])), "size", format.HumanBytes2(uint64(bufferSize))) } + for bt, size := range graphs { + c.b.btDeviceMemory[bt].Graph = max(c.b.btDeviceMemory[bt].Graph, size) + } + if !reserved { panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory}) }