ggml: Avoid cudaMemsetAsync during memory fitting

We pass invalid pointers when we check the size of the required compute graph before fitting. Some CUDA APIs validate these pointers, but we can simply skip those calls during this phase. cudaMemsetAsync is one such API: we weren't skipping it, but until now we never took the code path that used it. Now that we have enabled op_offload, we can hit it in memory-pressured situations.
cpu: always ensure LibOllamaPath included (#12890)
2025-10-31 15:23:28 -07:00 · 2025-10-31 14:37:29 -07:00 · 2025-10-31 09:54:25 -07:00 · 2025-10-30 17:12:33 -07:00
12 changed files with 37 additions and 302 deletions
--- a/discover/runner.go
+++ b/discover/runner.go
@@ -53,7 +53,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 		if eval, err := filepath.EvalSymlinks(exe); err == nil {
 			exe = eval
 		}
-		files, err := filepath.Glob(filepath.Join(LibOllamaPath, "*", "*ggml-*"))
+		files, err := filepath.Glob(filepath.Join(ml.LibOllamaPath, "*", "*ggml-*"))
 		if err != nil {
 			slog.Debug("unable to lookup runner library directories", "error", err)
 		}
@@ -64,7 +64,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 		// Our current packaging model places ggml-hip in the main directory
 		// but keeps rocm in an isolated directory.  We have to add it to
 		// the [LD_LIBRARY_]PATH so ggml-hip will load properly
-		rocmDir = filepath.Join(LibOllamaPath, "rocm")
+		rocmDir = filepath.Join(ml.LibOllamaPath, "rocm")
 		if _, err := os.Stat(rocmDir); err != nil {
 			rocmDir = ""
 		}
@@ -95,9 +95,9 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 				}
 			}
 			if dir == "" {
-				dirs = []string{LibOllamaPath}
+				dirs = []string{ml.LibOllamaPath}
 			} else {
-				dirs = []string{LibOllamaPath, dir}
+				dirs = []string{ml.LibOllamaPath, dir}
 			}

 			// ROCm can take a long time on some systems, so give it more time before giving up
@@ -249,7 +249,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 		libDirs = make(map[string]struct{})
 		for _, dev := range devices {
 			dir := dev.LibraryPath[len(dev.LibraryPath)-1]
-			if dir != LibOllamaPath {
+			if dir != ml.LibOllamaPath {
 				libDirs[dir] = struct{}{}
 			}
 		}
@@ -339,7 +339,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 			devFilter := ml.GetVisibleDevicesEnv(devices)

 			for dir := range libDirs {
-				updatedDevices := bootstrapDevices(ctx, []string{LibOllamaPath, dir}, devFilter)
+				updatedDevices := bootstrapDevices(ctx, []string{ml.LibOllamaPath, dir}, devFilter)
 				for _, u := range updatedDevices {
 					for i := range devices {
 						if u.DeviceID == devices[i].DeviceID && u.PCIID == devices[i].PCIID {
--- a/integration/api_test.go
+++ b/integration/api_test.go
@@ -381,30 +381,3 @@ func TestAPIShowModel(t *testing.T) {
 		t.Errorf("%s missing modified_at: %#v", modelName, resp)
 	}
 }
-
-func TestAPIEmbeddings(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
-	defer cancel()
-	client, _, cleanup := InitServerConnection(ctx, t)
-	defer cleanup()
-	req := api.EmbeddingRequest{
-		Model:  libraryEmbedModels[0],
-		Prompt: "why is the sky blue?",
-		Options: map[string]interface{}{
-			"temperature": 0,
-			"seed":        123,
-		},
-	}
-
-	if err := PullIfMissing(ctx, client, req.Model); err != nil {
-		t.Fatalf("pull failed %s", err)
-	}
-
-	resp, err := client.Embeddings(ctx, &req)
-	if err != nil {
-		t.Fatalf("embeddings call failed %s", err)
-	}
-	if len(resp.Embedding) == 0 {
-		t.Errorf("zero length embedding response")
-	}
-}
--- a/integration/qwen3vl_test.go
+++ b/integration/qwen3vl_test.go
@@ -1,259 +0,0 @@
-//go:build integration
-
-package integration
-
-import (
-	"context"
-	"os"
-	"strings"
-	"testing"
-	"time"
-
-	"github.com/ollama/ollama/api"
-)
-
-// getTestConfig returns model and streaming mode based on environment variables or defaults
-func getTestConfig() (model string, stream bool) {
-	model = os.Getenv("QWEN3VL_MODEL")
-	if model == "" {
-		// model = "qwen3-vl:235b-cloud" // default
-		model = "qwen3vl-thinking-odc-dev"
-	}
-
-	streamStr := os.Getenv("QWEN3VL_STREAM")
-	stream = streamStr != "false" // default to true
-
-	return model, stream
-}
-
-func TestQwen3VL(t *testing.T) {
-	model, stream := getTestConfig()
-
-	tests := []struct {
-		name     string
-		messages []api.Message
-		tools    []api.Tool
-		images   []string
-	}{
-		{
-			name: "Text-Only Scenario",
-			messages: []api.Message{
-				{Role: "system", Content: "You are a helpful assistant."},
-				{Role: "user", Content: "Write a short haiku about autumn."},
-			},
-		},
-		{
-			name: "Single Image Scenario",
-			messages: []api.Message{
-				{
-					Role:    "system",
-					Content: "You are a helpful assistant that can see images.",
-				},
-				{
-					Role:    "user",
-					Content: "What is in this image?",
-				},
-			},
-			images: []string{"testdata/menu.png"},
-		},
-		{
-			name: "Multiple Images Scenario",
-			messages: []api.Message{
-				{
-					Role:    "system",
-					Content: "You are a helpful assistant that can see images.",
-				},
-				{
-					Role:    "user",
-					Content: "Use both images to answer the question.",
-				},
-			},
-			images: []string{"testdata/satmath1.png", "testdata/satmath2.png"},
-		},
-		{
-			name: "Tools Scenario",
-			messages: []api.Message{
-				{
-					Role:    "system",
-					Content: "You can call tools when needed. Return tool calls when actions are needed.",
-				},
-				{Role: "user", Content: "What's the weather in San Francisco now?"},
-			},
-			tools: []api.Tool{
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_weather",
-						Description: "Get current weather for a city.",
-						Parameters: api.ToolFunctionParameters{
-							Type: "object",
-							Properties: map[string]api.ToolProperty{
-								"city": {
-									Type:        api.PropertyType{"string"},
-									Description: "The city to get the weather for",
-								},
-							},
-							Required: []string{"city"},
-						},
-					},
-				},
-			},
-		},
-		{
-			name: "Multi-Turn Tools With Image",
-			messages: []api.Message{
-				{Role: "system", Content: "Use tools when actions are required."},
-				{Role: "user", Content: "What's the current temperature in San Francisco?"},
-				{Role: "assistant", Content: "", ToolCalls: []api.ToolCall{
-					{Function: api.ToolCallFunction{
-						Name: "get_weather",
-						Arguments: api.ToolCallFunctionArguments{
-							"city": "San Francisco",
-						},
-					}},
-				}},
-				{Role: "tool", ToolName: "get_weather", Content: "Sunny"},
-				{Role: "user", Content: "Given that weather, what are the top 10 activities to do in San Francisco? Consider this photo as context."},
-			},
-			tools: []api.Tool{
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_weather",
-						Description: "Get current weather for a city.",
-						Parameters: api.ToolFunctionParameters{
-							Type: "object",
-							Properties: map[string]api.ToolProperty{
-								"city": {
-									Type:        api.PropertyType{"string"},
-									Description: "The city to get the weather for",
-								},
-							},
-							Required: []string{"city"},
-						},
-					},
-				},
-				{
-					Type: "function",
-					Function: api.ToolFunction{
-						Name:        "get_top_10_activities",
-						Description: "Get the top 10 activities for a city given the weather.",
-						Parameters: api.ToolFunctionParameters{
-							Type: "object",
-							Properties: map[string]api.ToolProperty{
-								"weather": {
-									Type:        api.PropertyType{"string"},
-									Description: "The weather in the city",
-								},
-								"city": {
-									Type:        api.PropertyType{"string"},
-									Description: "The city to get the activities for",
-								},
-								"image": {
-									Type:        api.PropertyType{"base64"},
-									Description: "The image of the city",
-								},
-							},
-							Required: []string{"weather", "city", "image"},
-						},
-					},
-				},
-			},
-			images: []string{"testdata/sf-city.jpeg"},
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			// Load and attach images to last user message
-			messages := tt.messages
-			if len(tt.images) > 0 {
-				var imgs []api.ImageData
-				for _, path := range tt.images {
-					imgs = append(imgs, loadImageData(t, path))
-				}
-				// Find last user message and attach images
-				for i := len(messages) - 1; i >= 0; i-- {
-					if messages[i].Role == "user" {
-						messages[i].Images = imgs
-						break
-					}
-				}
-			}
-
-			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
-			defer cancel()
-			client, _, cleanup := InitServerConnection(ctx, t)
-			defer cleanup()
-
-			// Pull/preload model if not using remote server
-			if os.Getenv("OLLAMA_TEST_EXISTING") == "" {
-				if err := PullIfMissing(ctx, client, model); err != nil {
-					t.Fatal(err)
-				}
-				// Preload to reduce startup latency
-				_ = client.Generate(ctx, &api.GenerateRequest{Model: model}, func(api.GenerateResponse) error { return nil })
-			}
-
-			// Build and execute chat request
-			req := &api.ChatRequest{
-				Model:    model,
-				Messages: messages,
-				Tools:    tt.tools,
-				Stream:   &stream,
-				Options:  map[string]any{"seed": 42, "temperature": 0.0},
-			}
-
-			var contentBuf, thinkingBuf strings.Builder
-			var toolCalls []api.ToolCall
-
-			err := client.Chat(ctx, req, func(r api.ChatResponse) error {
-				contentBuf.WriteString(r.Message.Content)
-				thinkingBuf.WriteString(r.Message.Thinking)
-				toolCalls = append(toolCalls, r.Message.ToolCalls...)
-				return nil
-			})
-			if err != nil {
-				t.Fatalf("chat error: %v", err)
-			}
-
-			// Log truncated responses
-			logTruncated := func(label, text string) {
-				if text != "" {
-					if len(text) > 800 {
-						text = text[:800] + "... [truncated]"
-					}
-					t.Logf("%s: %s", label, text)
-				}
-			}
-			logTruncated("Thinking", thinkingBuf.String())
-			logTruncated("Content", contentBuf.String())
-
-			if len(toolCalls) > 0 {
-				t.Logf("Tool calls: %d", len(toolCalls))
-				for i, call := range toolCalls {
-					t.Logf("  [%d] %s(%+v)", i, call.Function.Name, call.Function.Arguments)
-				}
-			}
-
-			// Validate tool calls if tools were provided
-			if len(tt.tools) > 0 {
-				if len(toolCalls) == 0 {
-					t.Fatal("expected at least one tool call, got none")
-				}
-				if toolCalls[0].Function.Name == "" {
-					t.Fatalf("tool call missing function name: %#v", toolCalls[0])
-				}
-			}
-		})
-	}
-}
-
-// loadImageData loads image data from a file path
-func loadImageData(t *testing.T, imagePath string) []byte {
-	data, err := os.ReadFile(imagePath)
-	if err != nil {
-		t.Fatalf("Failed to load image %s: %v", imagePath, err)
-	}
-	return data
-}
--- a/integration/testdata/menu.png
+++ b/integration/testdata/menu.png
--- a/integration/testdata/satmath1.png
+++ b/integration/testdata/satmath1.png
--- a/integration/testdata/satmath2.png
+++ b/integration/testdata/satmath2.png
--- a/integration/testdata/sf-city.jpeg
+++ b/integration/testdata/sf-city.jpeg
--- a/llama/patches/0022-ggml-No-alloc-mode.patch
+++ b/llama/patches/0022-ggml-No-alloc-mode.patch
@@ -11,9 +11,9 @@ must be recreated with no-alloc set to false before loading data.
 ggml/include/ggml-backend.h     |   1 +
 ggml/src/ggml-backend-impl.h    |  16 +++
 ggml/src/ggml-backend.cpp       |  72 ++++++++++-
- ggml/src/ggml-cuda/common.cuh   |  48 ++++++-
+ ggml/src/ggml-cuda/common.cuh   |  58 ++++++++-
 ggml/src/ggml-cuda/ggml-cuda.cu | 217 ++++++++++++++++++++++++++------
- 5 files changed, 310 insertions(+), 44 deletions(-)
+ 5 files changed, 320 insertions(+), 44 deletions(-)

 diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
 index 2763f2bd6..b3b5b356a 100644
@@ -219,10 +219,10 @@ index 41eef3b5f..c81a2e48a 100644
 
 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
 diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
-index e0abde542..28d6bcd71 100644
+index e0abde542..e98044bd8 100644
 --- a/ggml/src/ggml-cuda/common.cuh
 +++ b/ggml/src/ggml-cuda/common.cuh
-@@ -35,6 +35,31 @@
+@@ -35,6 +35,41 @@
 #include "vendors/cuda.h"
 #endif // defined(GGML_USE_HIP)
 
@@ -246,15 +246,25 @@ index e0abde542..28d6bcd71 100644
 +    }
 +}
 +
+static cudaError_t cudaMemsetAsyncReserve ( void* devPtr, int value, size_t count, cudaStream_t stream = 0 ) {
+    if (!reserving_graph) {
+        return cudaMemsetAsync(devPtr, value, count, stream);
+    } else {
+        return cudaSuccess;
+    }
+}
+
 +#undef cudaMemcpyAsync
 +#define cudaMemcpyAsync cudaMemcpyAsyncReserve
 +#undef cudaMemcpy2DAsync
 +#define cudaMemcpy2DAsync cudaMemcpy2DAsyncReserve
+#undef cudaMemsetAsync
+#define cudaMemsetAsync cudaMemsetAsyncReserve
 +
 #define STRINGIZE_IMPL(...) #__VA_ARGS__
 #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
 
-@@ -856,6 +881,9 @@ struct ggml_cuda_pool {
+@@ -856,6 +891,9 @@ struct ggml_cuda_pool {
 
     virtual void * alloc(size_t size, size_t * actual_size) = 0;
     virtual void free(void * ptr, size_t size) = 0;
@@ -264,7 +274,7 @@ index e0abde542..28d6bcd71 100644
 };
 
 template<typename T>
-@@ -999,11 +1027,11 @@ struct ggml_backend_cuda_context {
+@@ -999,11 +1037,11 @@ struct ggml_backend_cuda_context {
     // pool
     std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
 
@@ -278,7 +288,7 @@ index e0abde542..28d6bcd71 100644
         }
         return *pools[device];
     }
-@@ -1011,4 +1039,20 @@ struct ggml_backend_cuda_context {
+@@ -1011,4 +1049,20 @@ struct ggml_backend_cuda_context {
     ggml_cuda_pool & pool() {
         return pool(device);
     }
@@ -300,7 +310,7 @@ index e0abde542..28d6bcd71 100644
 +    }
 };
 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index f4d4a4267..ac70dcac8 100644
+index c555cd30f..eb3db0f19 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -350,6 +350,8 @@ const ggml_cuda_device_info & ggml_cuda_info() {
--- a/llm/status.go
+++ b/llm/status.go
@@ -23,6 +23,7 @@ func NewStatusWriter(out *os.File) *StatusWriter {
 var errorPrefixes = []string{
 	"error:",
 	"CUDA error",
+	"ROCm error",
 	"cudaMalloc failed",
 	"\"ERR\"",
 	"error loading model",
--- a/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh
@@ -55,10 +55,20 @@ static cudaError_t cudaMemcpy2DAsyncReserve ( void* dst, size_t dpitch, const vo
    }
 }

+static cudaError_t cudaMemsetAsyncReserve ( void* devPtr, int value, size_t count, cudaStream_t stream = 0 ) {
+    if (!reserving_graph) {
+        return cudaMemsetAsync(devPtr, value, count, stream);
+    } else {
+        return cudaSuccess;
+    }
+}
+
 #undef cudaMemcpyAsync
 #define cudaMemcpyAsync cudaMemcpyAsyncReserve
 #undef cudaMemcpy2DAsync
 #define cudaMemcpy2DAsync cudaMemcpy2DAsyncReserve
+#undef cudaMemsetAsync
+#define cudaMemsetAsync cudaMemsetAsyncReserve

 #define STRINGIZE_IMPL(...) #__VA_ARGS__
 #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
--- a/ml/device.go
+++ b/ml/device.go
@@ -361,7 +361,7 @@ func ByLibrary(l []DeviceInfo) [][]DeviceInfo {
 }

 func LibraryPaths(l []DeviceInfo) []string {
-	var gpuLibs []string
+	gpuLibs := []string{LibOllamaPath}
 	for _, gpu := range l {
 		for _, dir := range gpu.LibraryPath {
 			needed := true
--- a/discover/path.go
+++ b/discover/path.go
@@ -1,4 +1,4 @@
-package discover
+package ml

 import (
 	"os"
Author	SHA1	Message	Date
Jesse Gross	392a270261	ggml: Avoid cudaMemsetAsync during memory fitting — We pass invalid pointers when we check the size of the required compute graph before fitting. Some CUDA APIs validate these pointers, but we can simply skip those calls during this phase. cudaMemsetAsync is one such API: we weren't skipping it, but until now we never took the code path that used it. Now that we have enabled op_offload, we can hit it in memory-pressured situations.	2025-10-31 15:23:28 -07:00
Daniel Hiltgen	3bee3af6ed	cpu: always ensure LibOllamaPath included (#12890) In CPU-only setups the LibOllamaPath was omitted, causing us not to load the ggml-cpu-XXX libraries during inference.	2025-10-31 14:37:29 -07:00
Daniel Hiltgen	83537993d7	logs: catch rocm errors (#12888) This will help bubble up more crash errors.	2025-10-31 09:54:25 -07:00
nicole pardal	7dd4862a89	embeddings: removed redundant TestAPIEmbeddings test (#12863) This PR removes the redundant TestAPIEmbeddings test. The contents of this test already exist in embed_test.go and model_arch_test.go.	2025-10-30 17:12:33 -07:00