From ea7657b54a000b9cf381e6e83463f50aaa40a161 Mon Sep 17 00:00:00 2001
From: Daniel Andersen
Date: Mon, 11 Aug 2025 22:59:38 +0200
Subject: [PATCH 1/4] sched: Add support for grouping GPUs (#10678)

This patch modifies Ollama to group GPUs so the requested model is
memory-fitted to the smallest sufficient set of GPUs, instead of the
former algorithm of either using one GPU or distributing over all
available GPUs.

Benefits:
- Less (PCIe) bus communication between GPUs, which matters especially
  when the links are not very fast
- Unallocated GPUs can drop into power-saving mode
- Significantly reduced VRAM allocation when using more than 2 GPUs in a
  system
- Due to the reduced memory allocation, more models can run
  simultaneously
---
 server/sched.go | 58 +++++++++++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 24 deletions(-)

diff --git a/server/sched.go b/server/sched.go
index 2842bb3a0..40e6e5f72 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -758,8 +758,6 @@ func (a ByDurationAndName) Less(i, j int) bool {
 // If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
 // opts.NumCtx accordingly
 func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
-    var estimatedVRAM uint64
-
     var numParallelToTry []int
     if *numParallel <= 0 {
         // If no specific parallel setting was provided, try larger then smaller, always end with 1
@@ -769,39 +767,51 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn
     }

     for _, gl := range gpus.ByLibrary() {
-        var ok bool
         sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)

         // TODO - potentially sort by performance capability, existing models loaded, etc.
         // TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
-        // Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
+        // Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
         sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))

-        // First attempt to fit the model into a single GPU
-        for _, p := range numParallelToTry {
-            req.opts.NumCtx = req.origNumCtx * p
-            if !envconfig.SchedSpread() {
-                for _, g := range sgl {
-                    if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
-                        slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
+        if !envconfig.SchedSpread() {
+            for _, p := range numParallelToTry {
+                req.opts.NumCtx = req.origNumCtx * p
+                // Try to pack into as few GPUs as possible, starting from 1 GPU
+                for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
+                    gpuSubset := sgl[:numGPUs]
+                    ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p)
+
+                    if ok {
+                        slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
+                            "model", req.model.ModelPath,
+                            "library", sgl[0].Library,
+                            "parallel", p,
+                            "required", format.HumanBytes2(estimatedVRAM),
+                            "gpus", numGPUs)
                         *numParallel = p
-                        return []discover.GpuInfo{g}
+                        return gpuSubset
                     }
                 }
             }
-        }
+        } else {
+            // TODO future refinements
+            // - if multiple Libraries, see if any single GPU in any Library will fit
+            // - try subsets of GPUs instead of just falling back to 1 or all in a family

-        // TODO future refinements
-        // - if multiple Libraries, see if any single GPU in any Library will fit
-        // - try subsets of GPUs instead of just falling back to 1 or all in a family
-
-        // Now try all the GPUs
-        for _, p := range numParallelToTry {
-            req.opts.NumCtx = req.origNumCtx * p
-            if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
-                slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
-                *numParallel = p
-                return sgl
+            // Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
+            for _, p := range numParallelToTry {
+                req.opts.NumCtx = req.origNumCtx * p
+                if ok, estimatedVRAM := llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
+                    slog.Info("new model will fit in available VRAM, loading",
+                        "model", req.model.ModelPath,
+                        "library", sgl[0].Library,
+                        "parallel", p,
+                        "required", format.HumanBytes2(estimatedVRAM),
+                        "gpus", len(sgl))
+                    *numParallel = p
+                    return sgl
+                }
             }
         }
     }

From ee04dbba51a4299b6ff4bb19f758eeacbf2b35d8 Mon Sep 17 00:00:00 2001
From: Devon Rifkin
Date: Mon, 11 Aug 2025 14:09:13 -0700
Subject: [PATCH 2/4] server: fix error when parsing bad harmony tool calls

Thanks @moll for reporting!

Fixes: #11781
---
 server/routes.go | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/server/routes.go b/server/routes.go
index 991e92003..8d5ca12df 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -364,7 +364,8 @@ func (s *Server) GenerateHandler(c *gin.Context) {
                     *toolName = strings.TrimPrefix(*toolName, "functions.")
                     var args api.ToolCallFunctionArguments
                     if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
-                        ch <- gin.H{"error parsing tool call": err.Error()}
+                        errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
+                        ch <- gin.H{"error": errStr}
                         return
                     }

@@ -1655,7 +1656,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
                     *toolName = strings.TrimPrefix(*toolName, "functions.")
                     var args api.ToolCallFunctionArguments
                     if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
-                        ch <- gin.H{"error parsing tool call": err.Error()}
+                        errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
+                        ch <- gin.H{"error": errStr}
                         return
                     }
                     res.Message.ToolCalls = []api.ToolCall{{Function: api.ToolCallFunction{Name: *toolName, Arguments: args}}}

From 8f4ec9ab289fd2a1f96384926a7f7bfd888d4ef9 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Mon, 11 Aug 2025 14:45:45 -0700
Subject: [PATCH 3/4] discover: CPU supports flash attention

We already run flash attention on CPUs in cases where we have partial
offloading, but it was disabled when running purely on the CPU, which is
unnecessary.
---
 discover/types.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/discover/types.go b/discover/types.go
index c5212d94e..13a030fd5 100644
--- a/discover/types.go
+++ b/discover/types.go
@@ -171,7 +171,8 @@ func (si SystemInfo) GetOptimalThreadCount() int {
 // For each GPU, check if it does NOT support flash attention
 func (l GpuInfoList) FlashAttentionSupported() bool {
     for _, gpu := range l {
-        supportsFA := gpu.Library == "metal" ||
+        supportsFA := gpu.Library == "cpu" ||
+            gpu.Library == "metal" ||
             (gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
             gpu.Library == "rocm"


From d0cf6c82811c2268a396888347ff95087a618d56 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Tue, 12 Aug 2025 11:02:01 -0700
Subject: [PATCH 4/4] fix(openai): handle reasoning_effort (#11868)

---
 api/types.go     |  8 ++++----
 openai/openai.go | 11 +++++++----
 server/prompt.go |  8 ++++----
 server/routes.go | 16 ++++++++--------
 4 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/api/types.go b/api/types.go
index 0f99de18c..0309ebbe3 100644
--- a/api/types.go
+++ b/api/types.go
@@ -769,8 +769,8 @@ func (t *ThinkValue) IsString() bool {
     return ok
 }

-// AsBool returns the value as a bool (true if enabled in any way)
-func (t *ThinkValue) AsBool() bool {
+// Bool returns the value as a bool (true if enabled in any way)
+func (t *ThinkValue) Bool() bool {
     if t == nil || t.Value == nil {
         return false
     }
@@ -786,8 +786,8 @@ func (t *ThinkValue) AsBool() bool {
     }
 }

-// AsString returns the value as a string
-func (t *ThinkValue) AsString() string {
+// String returns the value as a string
+func (t *ThinkValue) String() string {
     if t == nil || t.Value == nil {
         return ""
     }
diff --git a/openai/openai.go b/openai/openai.go
index 50fdb81e9..13b9c425f 100644
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -103,6 +103,7 @@ type ChatCompletionRequest struct {
     ResponseFormat  *ResponseFormat `json:"response_format"`
     Tools           []api.Tool      `json:"tools"`
     Reasoning       *Reasoning      `json:"reasoning,omitempty"`
+    ReasoningEffort *string         `json:"reasoning_effort,omitempty"`
 }

 type ChatCompletion struct {
@@ -541,10 +542,6 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
         options["top_p"] = 1.0
     }

-    if r.Reasoning != nil {
-        options["reasoning"] = *r.Reasoning.Effort
-    }
-
     var format json.RawMessage
     if r.ResponseFormat != nil {
         switch strings.ToLower(strings.TrimSpace(r.ResponseFormat.Type)) {
@@ -560,9 +557,15 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {

     var think *api.ThinkValue
     if r.Reasoning != nil {
+        options["reasoning"] = *r.Reasoning.Effort
         think = &api.ThinkValue{
             Value: *r.Reasoning.Effort,
         }
+    } else if r.ReasoningEffort != nil {
+        options["reasoning"] = *r.ReasoningEffort
+        think = &api.ThinkValue{
+            Value: *r.ReasoningEffort,
+        }
     }

     return &api.ChatRequest{
diff --git a/server/prompt.go b/server/prompt.go
index 5d6c3e27c..f1d8020ea 100644
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -44,8 +44,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
         thinkVal := false
         thinkLevel := ""
         if think != nil {
-            thinkVal = think.AsBool()
-            thinkLevel = think.AsString()
+            thinkVal = think.Bool()
+            thinkLevel = think.String()
         }
         var b bytes.Buffer
         if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
@@ -105,8 +105,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
     thinkVal := false
     thinkLevel := ""
     if think != nil {
-        thinkVal = think.AsBool()
-        thinkLevel = think.AsString()
+        thinkVal = think.Bool()
+        thinkLevel = think.String()
     }
     if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
         return "", nil, err
diff --git a/server/routes.go b/server/routes.go
index d8d1e301c..3c044cd00 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -205,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {

     // Validate Think value: string values currently only allowed for gptoss models
     if req.Think != nil && req.Think.IsString() && !useHarmony {
-        c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())})
+        c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
         return
     }

@@ -213,7 +213,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
     if req.Suffix != "" {
         caps = append(caps, model.CapabilityInsert)
     }
-    if req.Think != nil && req.Think.AsBool() {
+    if req.Think != nil && req.Think.Bool() {
         caps = append(caps, model.CapabilityThinking)
         // TODO(drifkin): consider adding a warning if it's false and the model
         // doesn't support thinking. It's not strictly required, but it can be a
@@ -288,10 +288,10 @@ func (s *Server) GenerateHandler(c *gin.Context) {
         values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
     }

-    values.Think = req.Think != nil && req.Think.AsBool()
+    values.Think = req.Think != nil && req.Think.Bool()
     values.ThinkLevel = ""
     if req.Think != nil {
-        values.ThinkLevel = req.Think.AsString()
+        values.ThinkLevel = req.Think.String()
     }
     values.IsThinkSet = req.Think != nil

@@ -317,7 +317,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
     var thinkingState *thinking.Parser
     if !useHarmony {
         openingTag, closingTag := thinking.InferTags(m.Template.Template)
-        if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" {
+        if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
             thinkingState = &thinking.Parser{
                 OpeningTag: openingTag,
                 ClosingTag: closingTag,
@@ -1547,7 +1547,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
     if len(req.Tools) > 0 {
         caps = append(caps, model.CapabilityTools)
     }
-    if req.Think != nil && req.Think.AsBool() {
+    if req.Think != nil && req.Think.Bool() {
         caps = append(caps, model.CapabilityThinking)
     }

@@ -1601,7 +1601,7 @@ func (s *Server) ChatHandler(c *gin.Context) {

     // Validate Think value: string values currently only allowed for gptoss models
     if req.Think != nil && req.Think.IsString() && !useHarmony {
-        c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())})
+        c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
         return
     }

@@ -1620,7 +1620,7 @@ func (s *Server) ChatHandler(c *gin.Context) {

     var thinkingState *thinking.Parser
     openingTag, closingTag := thinking.InferTags(m.Template.Template)
-    if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" {
+    if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
         thinkingState = &thinking.Parser{
             OpeningTag: openingTag,
             ClosingTag: closingTag,
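
Editor's note on PATCH 4/4: with the flat reasoning_effort field now accepted alongside the nested reasoning object, an OpenAI-style client can set it directly on the chat completions request. Below is a minimal sketch (not part of the patch series) of such a request using only the Go standard library; the localhost:11434 endpoint is Ollama's default OpenAI-compatible address, and the model name and effort level are illustrative assumptions.

// Minimal sketch, assuming an Ollama server on the default port and an
// illustrative model name. After PATCH 4/4, the flat "reasoning_effort"
// field is honored in addition to the nested "reasoning" object.
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	body := []byte(`{
		"model": "gpt-oss:20b",
		"messages": [{"role": "user", "content": "Why is the sky blue?"}],
		"reasoning_effort": "low"
	}`)

	// POST to the OpenAI-compatible chat completions endpoint.
	resp, err := http.Post("http://localhost:11434/v1/chat/completions", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}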