From ea7657b54a000b9cf381e6e83463f50aaa40a161 Mon Sep 17 00:00:00 2001
From: Daniel Andersen
Date: Mon, 11 Aug 2025 22:59:38 +0200
Subject: [PATCH 1/4] sched: Add support for grouping GPUs (#10678)

This patch modifies Ollama to group GPUs so the requested model is
memory-fitted to the smallest sufficient set of GPUs, instead of the
former algorithm of either using one GPU or distributing over all
available GPUs.

Benefits:
- Less (PCIe) bus communication between GPUs, which matters especially
  when the links are not very fast
- Unallocated GPUs can drop into power-saving mode
- Significantly reduced VRAM allocation when using more than 2 GPUs in a
  system
- Due to the reduced memory allocation, more models can run
  simultaneously
---
 server/sched.go | 58 +++++++++++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 24 deletions(-)

diff --git a/server/sched.go b/server/sched.go
index 2842bb3a0..40e6e5f72 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -758,8 +758,6 @@ func (a ByDurationAndName) Less(i, j int) bool {
 // If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
 // opts.NumCtx accordingly
 func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
-    var estimatedVRAM uint64
-
     var numParallelToTry []int
     if *numParallel <= 0 {
         // If no specific parallel setting was provided, try larger then smaller, always end with 1
@@ -769,39 +767,51 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn
     }

     for _, gl := range gpus.ByLibrary() {
-        var ok bool
         sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)

         // TODO - potentially sort by performance capability, existing models loaded, etc.
         // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
-        // Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
+        // Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
         sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))

-        // First attempt to fit the model into a single GPU
-        for _, p := range numParallelToTry {
-            req.opts.NumCtx = req.origNumCtx * p
-            if !envconfig.SchedSpread() {
-                for _, g := range sgl {
-                    if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
-                        slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
+        if !envconfig.SchedSpread() {
+            for _, p := range numParallelToTry {
+                req.opts.NumCtx = req.origNumCtx * p
+                // Try to pack into as few GPUs as possible, starting from 1 GPU
+                for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
+                    gpuSubset := sgl[:numGPUs]
+                    ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p)
+
+                    if ok {
+                        slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
+                            "model", req.model.ModelPath,
+                            "library", sgl[0].Library,
+                            "parallel", p,
+                            "required", format.HumanBytes2(estimatedVRAM),
+                            "gpus", numGPUs)
                         *numParallel = p
-                        return []discover.GpuInfo{g}
+                        return gpuSubset
                     }
                 }
             }
-        }
+        } else {
+            // TODO future refinements
+            // - if multiple Libraries, see if any single GPU in any Library will fit
+            // - try subsets of GPUs instead of just falling back to 1 or all in a family

-        // TODO future refinements
-        // - if multiple Libraries, see if any single GPU in any Library will fit
-        // - try subsets of GPUs instead of just falling back to 1 or all in a family
-
-        // Now try all the GPUs
-        for _, p := range numParallelToTry {
-            req.opts.NumCtx = req.origNumCtx * p
-            if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
-                slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
-                *numParallel = p
-                return sgl
+            // Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
+            for _, p := range numParallelToTry {
+                req.opts.NumCtx = req.origNumCtx * p
+                if ok, estimatedVRAM := llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
+                    slog.Info("new model will fit in available VRAM, loading",
+                        "model", req.model.ModelPath,
+                        "library", sgl[0].Library,
+                        "parallel", p,
+                        "required", format.HumanBytes2(estimatedVRAM),
+                        "gpus", len(sgl))
+                    *numParallel = p
+                    return sgl
+                }
             }
         }
     }

From ee04dbba51a4299b6ff4bb19f758eeacbf2b35d8 Mon Sep 17 00:00:00 2001
From: Devon Rifkin
Date: Mon, 11 Aug 2025 14:09:13 -0700
Subject: [PATCH 2/4] server: fix error when parsing bad harmony tool calls

Thanks @moll for reporting!

Fixes: #11781
---
 server/routes.go | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/server/routes.go b/server/routes.go
index 991e92003..8d5ca12df 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -364,7 +364,8 @@ func (s *Server) GenerateHandler(c *gin.Context) {
                     *toolName = strings.TrimPrefix(*toolName, "functions.")
                     var args api.ToolCallFunctionArguments
                     if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
-                        ch <- gin.H{"error parsing tool call": err.Error()}
+                        errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
+                        ch <- gin.H{"error": errStr}
                         return
                     }

@@ -1655,7 +1656,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
                     *toolName = strings.TrimPrefix(*toolName, "functions.")
                     var args api.ToolCallFunctionArguments
                     if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
-                        ch <- gin.H{"error parsing tool call": err.Error()}
+                        errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
+                        ch <- gin.H{"error": errStr}
                         return
                     }
                     res.Message.ToolCalls = []api.ToolCall{{Function: api.ToolCallFunction{Name: *toolName, Arguments: args}}}

From 8f4ec9ab289fd2a1f96384926a7f7bfd888d4ef9 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Mon, 11 Aug 2025 14:45:45 -0700
Subject: [PATCH 3/4] discover: CPU supports flash attention

We already run flash attention on CPUs in cases where we have partial
offloading, but it was disabled when running purely on the CPU, which is
unnecessary.
---
 discover/types.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/discover/types.go b/discover/types.go
index c5212d94e..13a030fd5 100644
--- a/discover/types.go
+++ b/discover/types.go
@@ -171,7 +171,8 @@ func (si SystemInfo) GetOptimalThreadCount() int {
 // For each GPU, check if it does NOT support flash attention
 func (l GpuInfoList) FlashAttentionSupported() bool {
     for _, gpu := range l {
-        supportsFA := gpu.Library == "metal" ||
+        supportsFA := gpu.Library == "cpu" ||
+            gpu.Library == "metal" ||
             (gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
             gpu.Library == "rocm"


From d0cf6c82811c2268a396888347ff95087a618d56 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Tue, 12 Aug 2025 11:02:01 -0700
Subject: [PATCH 4/4] fix(openai): handle reasoning_effort (#11868)

---
 api/types.go     |  8 ++++----
 openai/openai.go | 11 +++++++----
 server/prompt.go |  8 ++++----
 server/routes.go | 16 ++++++++--------
 4 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/api/types.go b/api/types.go
index 0f99de18c..0309ebbe3 100644
--- a/api/types.go
+++ b/api/types.go
@@ -769,8 +769,8 @@ func (t *ThinkValue) IsString() bool {
     return ok
 }

-// AsBool returns the value as a bool (true if enabled in any way)
-func (t *ThinkValue) AsBool() bool {
+// Bool returns the value as a bool (true if enabled in any way)
+func (t *ThinkValue) Bool() bool {
     if t == nil || t.Value == nil {
         return false
     }
@@ -786,8 +786,8 @@ func (t *ThinkValue) AsBool() bool {
     }
 }

-// AsString returns the value as a string
-func (t *ThinkValue) AsString() string {
+// String returns the value as a string
+func (t *ThinkValue) String() string {
     if t == nil || t.Value == nil {
         return ""
     }
diff --git a/openai/openai.go b/openai/openai.go
index 50fdb81e9..13b9c425f 100644
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -103,6 +103,7 @@ type ChatCompletionRequest struct {
     ResponseFormat  *ResponseFormat `json:"response_format"`
     Tools           []api.Tool      `json:"tools"`
     Reasoning       *Reasoning      `json:"reasoning,omitempty"`
+    ReasoningEffort *string         `json:"reasoning_effort,omitempty"`
 }

 type ChatCompletion struct {
@@ -541,10 +542,6 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
         options["top_p"] = 1.0
     }

-    if r.Reasoning != nil {
-        options["reasoning"] = *r.Reasoning.Effort
-    }
-
     var format json.RawMessage
     if r.ResponseFormat != nil {
         switch strings.ToLower(strings.TrimSpace(r.ResponseFormat.Type)) {
@@ -560,9 +557,15 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {

     var think *api.ThinkValue
     if r.Reasoning != nil {
+        options["reasoning"] = *r.Reasoning.Effort
         think = &api.ThinkValue{
             Value: *r.Reasoning.Effort,
         }
+    } else if r.ReasoningEffort != nil {
+        options["reasoning"] = *r.ReasoningEffort
+        think = &api.ThinkValue{
+            Value: *r.ReasoningEffort,
+        }
     }

     return &api.ChatRequest{
diff --git a/server/prompt.go b/server/prompt.go
index 5d6c3e27c..f1d8020ea 100644
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -44,8 +44,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
         thinkVal := false
         thinkLevel := ""
         if think != nil {
-            thinkVal = think.AsBool()
-            thinkLevel = think.AsString()
+            thinkVal = think.Bool()
+            thinkLevel = think.String()
         }
         var b bytes.Buffer
         if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
@@ -105,8 +105,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
     thinkVal := false
     thinkLevel := ""
     if think != nil {
-        thinkVal = think.AsBool()
-        thinkLevel = think.AsString()
+        thinkVal = think.Bool()
+        thinkLevel = think.String()
     }
     if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
         return "", nil, err
diff --git a/server/routes.go b/server/routes.go
index d8d1e301c..3c044cd00 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -205,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {

     // Validate Think value: string values currently only allowed for gptoss models
     if req.Think != nil && req.Think.IsString() && !useHarmony {
-        c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())})
+        c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
         return
     }

@@ -213,7 +213,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
     if req.Suffix != "" {
         caps = append(caps, model.CapabilityInsert)
     }
-    if req.Think != nil && req.Think.AsBool() {
+    if req.Think != nil && req.Think.Bool() {
         caps = append(caps, model.CapabilityThinking)
         // TODO(drifkin): consider adding a warning if it's false and the model
         // doesn't support thinking. It's not strictly required, but it can be a
@@ -288,10 +288,10 @@ func (s *Server) GenerateHandler(c *gin.Context) {
         values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
     }

-    values.Think = req.Think != nil && req.Think.AsBool()
+    values.Think = req.Think != nil && req.Think.Bool()
     values.ThinkLevel = ""
     if req.Think != nil {
-        values.ThinkLevel = req.Think.AsString()
+        values.ThinkLevel = req.Think.String()
     }
     values.IsThinkSet = req.Think != nil

@@ -317,7 +317,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
     var thinkingState *thinking.Parser
     if !useHarmony {
         openingTag, closingTag := thinking.InferTags(m.Template.Template)
-        if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" {
+        if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
             thinkingState = &thinking.Parser{
                 OpeningTag: openingTag,
                 ClosingTag: closingTag,
@@ -1547,7 +1547,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
     if len(req.Tools) > 0 {
         caps = append(caps, model.CapabilityTools)
     }
-    if req.Think != nil && req.Think.AsBool() {
+    if req.Think != nil && req.Think.Bool() {
         caps = append(caps, model.CapabilityThinking)
     }

@@ -1601,7 +1601,7 @@ func (s *Server) ChatHandler(c *gin.Context) {

     // Validate Think value: string values currently only allowed for gptoss models
     if req.Think != nil && req.Think.IsString() && !useHarmony {
-        c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())})
+        c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
         return
     }

@@ -1620,7 +1620,7 @@ func (s *Server) ChatHandler(c *gin.Context) {

     var thinkingState *thinking.Parser
     openingTag, closingTag := thinking.InferTags(m.Template.Template)
-    if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" {
+    if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
         thinkingState = &thinking.Parser{
             OpeningTag: openingTag,
             ClosingTag: closingTag,
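
Editor's note on PATCH 4/4: with the flat reasoning_effort field now accepted alongside the nested reasoning object, an OpenAI-style client can set it directly on the chat completions request. Below is a minimal sketch (not part of the patch series) of such a request using only the Go standard library; the localhost:11434 endpoint is Ollama's default OpenAI-compatible address, and the model name and effort level are illustrative assumptions.

// Minimal sketch, assuming an Ollama server on the default port and an
// illustrative model name. After PATCH 4/4, the flat "reasoning_effort"
// field is honored in addition to the nested "reasoning" object.
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	body := []byte(`{
		"model": "gpt-oss:20b",
		"messages": [{"role": "user", "content": "Why is the sky blue?"}],
		"reasoning_effort": "low"
	}`)

	// POST to the OpenAI-compatible chat completions endpoint.
	resp, err := http.Post("http://localhost:11434/v1/chat/completions", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}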