From 9714e38dd098a466d613edd87f3576ddd5c246f4 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 8 Sep 2025 09:53:59 -0700 Subject: [PATCH 01/13] fix: nil pointer dereference if cache is nil (#12215) --- runner/ollamarunner/cache.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/runner/ollamarunner/cache.go b/runner/ollamarunner/cache.go index 955ef9b3d..1d1ca5180 100644 --- a/runner/ollamarunner/cache.go +++ b/runner/ollamarunner/cache.go @@ -70,11 +70,9 @@ func kvCacheTypeFromStr(s string) ml.DType { } func (c *InputCache) Close() { - if c == nil { - return + if c != nil && c.cache != nil { + c.cache.Close() } - - c.cache.Close() } // Locking: Operations on InputCacheSlot (including finding one From 950d33aa3045581906c2db11f33d2a4c2bca3033 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 8 Sep 2025 11:39:00 -0700 Subject: [PATCH 02/13] docs: show how to debug nvidia init failures (#12216) This debug setting can help troubleshoot obscure initialization failures. --- docs/troubleshooting.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 6fdd3e85b..7647b12f9 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -92,6 +92,9 @@ If none of those resolve the problem, gather additional information and file an - Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs - Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia` +You may get more details for initialization failures by enabling debug prints in the uvm driver. You should only use this temporarily while troubleshooting +- `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm uvm_debug_prints=1` + ## AMD GPU Discovery From 7b91c9ce51feb6c056c0c3c3f43b4d4109d8683a Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Mon, 8 Sep 2025 15:53:22 -0600 Subject: [PATCH 03/13] Hybrid and recurrent memory estimates (#12186) This PR updates the memory size estimate logic to better handle recurrent and hybrid-recurrent models which are currently being badly overestimated because the default logic assumes full attention for all layers. The logic for the sizing of the recurrent layers comes from the llama.cpp implementation ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size); ggml_tensor * s = ggml_new_tensor_1d(ctx, type_s, hparams.n_embd_s()*mem_size); Signed-off-by: Gabe Goodhart --- fs/ggml/ggml.go | 135 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 121 insertions(+), 14 deletions(-) diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 3f4374cd0..56ad420e5 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -57,10 +57,28 @@ func (kv KV) EmbeddingLength() uint64 { return uint64(kv.Uint("embedding_length")) } +func (kv KV) HeadCount() []uint64 { + headCountDefault := uint32(1) + headCount := kv.UintOrArrayValueAsArray("attention.head_count", headCountDefault) + if len(headCount) == 1 { + headCountDefault = headCount[0] + } + nLayers := int(kv.BlockCount()) + if len(headCount) > nLayers { + slog.Warn("got more elements of attention.head_count than layers", "len(headCount)", len(headCount), "layers", nLayers) + } + out := make([]uint64, nLayers) + for i := range nLayers { + if i >= len(headCount) { + out[i] = uint64(headCountDefault) + } else { + out[i] = uint64(headCount[i]) + } + } + return out +} + func (kv KV) HeadCountMax() uint64 { - // TODO(drifkin): using the max value can cause an overestimation. 
In the - // future if array values become more popular, we can adapt the more invasive - // return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1)) } @@ -68,6 +86,27 @@ func (kv KV) HeadCountMin() uint64 { return uint64(kv.UintOrMinArrayValue("attention.head_count", 1)) } +func (kv KV) HeadCountKV() []uint64 { + headCountKVDefault := uint32(1) + headCountKV := kv.UintOrArrayValueAsArray("attention.head_count_kv", headCountKVDefault) + if len(headCountKV) == 1 { + headCountKVDefault = headCountKV[0] + } + nLayers := int(kv.BlockCount()) + if len(headCountKV) > nLayers { + slog.Warn("got more elements of attention.head_count than layers", "len(headCountKV)", len(headCountKV), "layers", nLayers) + } + out := make([]uint64, nLayers) + for i := range nLayers { + if i >= len(headCountKV) { + out[i] = uint64(headCountKVDefault) + } else { + out[i] = uint64(headCountKV[i]) + } + } + return out +} + func (kv KV) HeadCountKVMax() uint64 { return uint64(kv.UintOrMaxArrayValue("attention.head_count_kv", 1)) } @@ -100,6 +139,26 @@ func (kv KV) ChatTemplate() string { return kv.String("tokenizer.chat_template") } +// ssm architecture parameters + +func (kv KV) SSMConvKernel() uint64 { + return uint64(kv.Uint("ssm.conv_kernel")) +} + +func (kv KV) SSMInnerSize() uint64 { + return uint64(kv.Uint("ssm.inner_size")) +} + +func (kv KV) SSMStateSize() uint64 { + return uint64(kv.Uint("ssm.state_size")) +} + +func (kv KV) SSMGroupCount() uint64 { + return uint64(kv.Uint("ssm.group_count")) +} + +// general types + func (kv KV) String(key string, defaultValue ...string) string { val, _ := keyValue(kv, key, append(defaultValue, "")...) return val @@ -131,22 +190,27 @@ func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 { } func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) { + arrVal := kv.UintOrArrayValueAsArray(key, defaultValue) + return slices.Min(arrVal), slices.Max(arrVal) +} + +func (kv KV) UintOrArrayValueAsArray(key string, defaultValue uint32) []uint32 { if u32, ok := keyValue(kv, key, uint32(0)); ok { - return u32, u32 + return []uint32{u32} } else if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok { - min := slices.Min(u32s.values) - max := slices.Max(u32s.values) - return min, max + return u32s.values } else if i32s, ok := keyValue(kv, key, &array[int32]{}); ok { - min := slices.Min(i32s.values) - max := slices.Max(i32s.values) - if min < 0 || max < 0 { - slog.Warn("array values are unexpectedly negative", "key", key, "min", min, "max", max) + dst := make([]uint32, len(i32s.values)) + for i, v := range i32s.values { + if v < 0 { + slog.Warn("array values are unexpectedly negative", "key", key, "i", i, "v", v) + } + dst[i] = uint32(v) } - return uint32(min), uint32(max) + return dst } - return defaultValue, defaultValue + return []uint32{defaultValue} } func (kv KV) Strings(key string, defaultValue ...[]string) []string { @@ -486,7 +550,9 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri embedding := f.KV().EmbeddingLength() heads := f.KV().HeadCountMax() + headsArr := f.KV().HeadCount() headsKV := f.KV().HeadCountKVMax() + headsKVArr := f.KV().HeadCountKV() vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size) embeddingHeads := f.KV().EmbeddingHeadCountMax() @@ -496,12 +562,51 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri layers := f.Tensors().GroupLayers() bytesPerElement := kvCacheBytesPerElement(kvCacheType) + + // Default for models unless 
special-cased below. These defaults mirror the + // cache usage in llama.cpp under the assumption that models without special + // cases below will use the llamarunner and caching will be handled by the + // llama.cpp layer. + // + // This also assumes that a layer without heads or headsKV set is recurrent + // which is usually the case. Some models (eg nemotronh) use "blocks" in + // place of layers where some are MLP blocks that don't have any cache. + // Models like this will need a special case below to be accurately + // estimated. var kvTotal uint64 kv = make([]uint64, f.KV().BlockCount()) + kvSizeAttn := uint64(0) + kvSizeRecurrent := uint64(0) for i := range kv { - kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement) + headsL := headsArr[i] + headsKVL := headsKVArr[i] + if headsL > 0 && headsKVL > 0 { + // full attention layer + // NOTE: Assumes uniform values for all attn layers + kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKVL) * bytesPerElement) + kvSizeAttn += kv[i] + } else { + // recurrent layer + ssmDConv := f.KV().SSMConvKernel() + ssmDState := f.KV().SSMStateSize() + ssmDInner := f.KV().SSMInnerSize() + ssmNGroups := f.KV().SSMGroupCount() + nEmbdR := uint64(0) + if ssmDConv > 0 { + nEmbdR = (ssmDConv - 1) * (ssmDInner + 2*ssmNGroups*ssmDState) + } + nEmbdS := ssmDState * ssmDInner + + // recurrent always uses F32 in llama.cpp backend + // https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model.cpp#L18644 + bytesPerElementRecurrent := kvCacheBytesPerElement("f32") + + kv[i] = (nEmbdR + nEmbdS) * uint64(bytesPerElementRecurrent) + kvSizeRecurrent += kv[i] + } kvTotal += kv[i] } + slog.Debug("default cache size estimate", "attention MiB", float32(kvSizeAttn)/(1024.*1024.), "attention bytes", kvSizeAttn, "recurrent MiB", float32(kvSizeRecurrent)/(1024.*1024.), "recurrent bytes", kvSizeRecurrent) switch f.KV().Architecture() { case "llama", "llama4": @@ -794,6 +899,8 @@ func kvCacheBytesPerElement(cacheType string) float64 { return 1 // 1/2 of fp16 case "q4_0": return 0.5 // 1/4 of fp16 + case "f32": + return 4 // f32 (default for recurrent) default: return 2 // f16 (default) } From 1a558f98e2d07885efb6cf82943ae029c647f3d0 Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Mon, 8 Sep 2025 15:07:59 -0700 Subject: [PATCH 04/13] runner: move harmony to runner (#12052) --- harmony/harmonyparser.go | 46 +++-- harmony/harmonyparser_test.go | 200 ++++++++++++++++++++ llm/server.go | 37 ++-- runner/ollamarunner/runner.go | 55 +++++- server/routes.go | 131 +++++-------- server/routes_harmony_streaming_test.go | 237 ++---------------------- 6 files changed, 369 insertions(+), 337 deletions(-) diff --git a/harmony/harmonyparser.go b/harmony/harmonyparser.go index a51819dda..addce4c94 100644 --- a/harmony/harmonyparser.go +++ b/harmony/harmonyparser.go @@ -3,15 +3,29 @@ package harmony import ( "fmt" "log/slog" + "slices" "strings" "unicode" "github.com/ollama/ollama/api" "github.com/ollama/ollama/logutil" + "github.com/ollama/ollama/template" ) type harmonyParserState int +func ShouldUseHarmony(modelFamily string, template *template.Template) bool { + if slices.Contains([]string{"gptoss", "gpt-oss"}, modelFamily) { + // heuristic to check whether the template expects to be parsed via harmony: + // search for harmony tags that are nearly always used + if template.Contains("<|start|>") && template.Contains("<|end|>") { + return true + } + } + + return false +} + const ( harmonyParserState_LookingForMessageStart 
harmonyParserState = iota harmonyParserState_ParsingHeader @@ -75,18 +89,28 @@ func (s *HarmonyParser) AddImplicitStart() { s.acc.WriteString("<|start|>assistant") } -func (s *HarmonyParser) AddImplicitStartOrPrefill(lastMessage *api.Message) { - if lastMessage != nil && lastMessage.Role == "assistant" { - // handle prefilling conditions - if lastMessage.Content != "" { - s.acc.WriteString("<|start|>assistant<|channel|>final<|message|>") - return - } else if lastMessage.Thinking != "" { - s.acc.WriteString("<|start|>assistant<|channel|>analysis<|message|>") - return - } +func Prefill(lastMessage api.Message) string { + if lastMessage.Role != "assistant" { + return "" + } + + switch { + case strings.TrimSpace(lastMessage.Content) != "": + return "<|start|>assistant<|channel|>final<|message|>" + case strings.TrimSpace(lastMessage.Thinking) != "": + return "<|start|>assistant<|channel|>analysis<|message|>" + default: + return "" + } +} + +// AddImplicitStartOrPrefill adds an implicit start tag or prefill string if provided +func (s *HarmonyParser) AddImplicitStartOrPrefill(prefillString string) { + if strings.TrimSpace(prefillString) != "" { + s.acc.WriteString(prefillString) + } else { + s.AddImplicitStart() } - s.AddImplicitStart() } func (s *HarmonyParser) AddContent(content string) []HarmonyEvent { diff --git a/harmony/harmonyparser_test.go b/harmony/harmonyparser_test.go index b988a018f..dcf1af4e8 100644 --- a/harmony/harmonyparser_test.go +++ b/harmony/harmonyparser_test.go @@ -3,6 +3,7 @@ package harmony import ( "fmt" "reflect" + "strings" "testing" ) @@ -535,3 +536,202 @@ func TestFunctionConvertAndAdd(t *testing.T) { }) } } + +func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) { + t.Run("thinking_then_content_streams", func(t *testing.T) { + handler := NewHarmonyMessageHandler() + handler.HarmonyParser.AddImplicitStart() + tp := handler.CreateToolParser() + type step struct { + in string + wantContent string + wantThinking string + } + steps := []step{ + {in: "<|channel|>analysis<|message|>Thinking...", wantThinking: "Thinking..."}, + {in: "<|end|>", wantThinking: ""}, + {in: "<|start|>assistant<|message|>Answer", wantContent: "Answer"}, + {in: "<|end|>", wantContent: ""}, + } + for i, s := range steps { + content, thinking, tool := handler.AddContent(s.in, tp) + if tool != "" { + tp.Add(tool) + } + if content != s.wantContent || thinking != s.wantThinking { + t.Fatalf("step %d: got (content=%q thinking=%q), want (content=%q thinking=%q)", i, content, thinking, s.wantContent, s.wantThinking) + } + } + }) + + t.Run("content_streams_as_it_arrives", func(t *testing.T) { + handler := NewHarmonyMessageHandler() + handler.HarmonyParser.AddImplicitStart() + tp := handler.CreateToolParser() + inputs := []string{ + "<|start|>assistant<|message|>Hello", + ", world", + "!<|end|>", + } + var got []string + for _, in := range inputs { + content, thinking, tool := handler.AddContent(in, tp) + if tool != "" { + tp.Add(tool) + } + if thinking != "" { + t.Fatalf("unexpected thinking %q", thinking) + } + if content != "" { + got = append(got, content) + } + } + want := []string{"Hello", ", world", "!"} + if !reflect.DeepEqual(got, want) { + t.Fatalf("content pieces mismatch: got %v want %v", got, want) + } + }) + + t.Run("thinking_streams_separately_from_content", func(t *testing.T) { + handler := NewHarmonyMessageHandler() + handler.HarmonyParser.AddImplicitStart() + tp := handler.CreateToolParser() + inputs := []string{ + "<|channel|>analysis<|message|>Thinking...", + "<|end|>", + 
"<|start|>assistant<|message|>Answer", + "<|end|>", + } + var got []string + for _, in := range inputs { + content, thinking, tool := handler.AddContent(in, tp) + if tool != "" { + tp.Add(tool) + } + if thinking != "" { + got = append(got, thinking) + } + if content != "" { + got = append(got, content) + } + } + want := []string{"Thinking...", "Answer"} + if !reflect.DeepEqual(got, want) { + t.Fatalf("content pieces mismatch: got %v want %v", got, want) + } + }) + + t.Run("partial_tags_buffer_until_complete", func(t *testing.T) { + handler := NewHarmonyMessageHandler() + handler.HarmonyParser.AddImplicitStart() + tp := handler.CreateToolParser() + inputs := []string{ + "<|chan", + "nel|>analysis<|mess", + "age|>Deep ", + "thought", + "<|end|>", + "<|start|>assistant<|message|>Done", + "<|end|>", + } + var thinkingPieces []string + var contentPieces []string + for _, in := range inputs { + content, thinking, tool := handler.AddContent(in, tp) + if tool != "" { + tp.Add(tool) + } + if thinking != "" { + thinkingPieces = append(thinkingPieces, thinking) + } + if content != "" { + contentPieces = append(contentPieces, content) + } + } + if want := []string{"Deep ", "thought"}; !reflect.DeepEqual(thinkingPieces, want) { + t.Fatalf("thinking pieces mismatch: got %v want %v", thinkingPieces, want) + } + if want := []string{"Done"}; !reflect.DeepEqual(contentPieces, want) { + t.Fatalf("content pieces mismatch: got %v want %v", contentPieces, want) + } + }) + + t.Run("simple_assistant_after_analysis", func(t *testing.T) { + handler := NewHarmonyMessageHandler() + handler.HarmonyParser.AddImplicitStart() + tp := handler.CreateToolParser() + inputs := []string{ + "<|channel|>analysis<|message|>Think", + "<|end|>", + "<|start|>assistant<|message|>Answer", + "<|end|>", + } + var contentSb, thinkingSb strings.Builder + for _, in := range inputs { + content, thinking, tool := handler.AddContent(in, tp) + if tool != "" { + tp.Add(tool) + } + contentSb.WriteString(content) + thinkingSb.WriteString(thinking) + } + if contentSb.String() != "Answer" { + t.Fatalf("content mismatch: got %q want %q", contentSb.String(), "Answer") + } + if thinkingSb.String() != "Think" { + t.Fatalf("thinking mismatch: got %q want %q", thinkingSb.String(), "Think") + } + }) + + t.Run("tool_call_parsed_and_returned_correctly", func(t *testing.T) { + handler := NewHarmonyMessageHandler() + handler.HarmonyParser.AddImplicitStart() + tp := handler.CreateToolParser() + inputs := []string{ + "<|channel|>commentary to=functions.calculate<|message|>{\"expression\":\"2+2\"}<|end|>", + } + for _, in := range inputs { + content, thinking, tool := handler.AddContent(in, tp) + if content != "" || thinking != "" { + continue + } + if tool != "" { + tp.Add(tool) + } + } + name, args := tp.Drain() + if name == nil || *name != "functions.calculate" { + t.Fatalf("unexpected tool name: %v", name) + } + if got, want := args, "{\"expression\":\"2+2\"}"; got != want { + t.Fatalf("unexpected tool args: got %s want %s", got, want) + } + }) + + t.Run("tool_call_across_chunks", func(t *testing.T) { + handler := NewHarmonyMessageHandler() + handler.HarmonyParser.AddImplicitStart() + tp := handler.CreateToolParser() + inputs := []string{ + "<|channel|>commentary to=functions.calculate<|message|>{\"expression\":\"2+", + "2\"}", + "<|end|>", + } + for _, in := range inputs { + content, thinking, tool := handler.AddContent(in, tp) + if content != "" || thinking != "" { + continue + } + if tool != "" { + tp.Add(tool) + } + } + name, args := tp.Drain() + if name 
== nil || *name != "functions.calculate" { + t.Fatalf("unexpected tool name: %v", name) + } + if got, want := args, "{\"expression\":\"2+2\"}"; got != want { + t.Fatalf("unexpected tool args: got %s want %s", got, want) + } + }) +} diff --git a/llm/server.go b/llm/server.go index 664a69fb3..e0a652ec0 100644 --- a/llm/server.go +++ b/llm/server.go @@ -1347,7 +1347,9 @@ type CompletionRequest struct { Images []ImageData Options *api.Options - Grammar string // set before sending the request to the subprocess + Grammar string // set before sending the request to the subprocess + UseHarmony bool + PrefillString string } // DoneReason represents the reason why a completion response is done @@ -1360,6 +1362,8 @@ const ( DoneReasonLength // DoneReasonConnectionClosed indicates the completion stopped due to the connection being closed DoneReasonConnectionClosed + // DoneReasonTokenRepeatLimit indicates the completion stopped due to a token repeat limit + DoneReasonTokenRepeatLimit ) func (d DoneReason) String() string { @@ -1368,19 +1372,23 @@ func (d DoneReason) String() string { return "length" case DoneReasonStop: return "stop" + case DoneReasonTokenRepeatLimit: + return "token_repeat_limit" default: return "" // closed } } type CompletionResponse struct { - Content string `json:"content"` - DoneReason DoneReason `json:"done_reason"` - Done bool `json:"done"` - PromptEvalCount int `json:"prompt_eval_count"` - PromptEvalDuration time.Duration `json:"prompt_eval_duration"` - EvalCount int `json:"eval_count"` - EvalDuration time.Duration `json:"eval_duration"` + Content string `json:"content"` + Thinking string `json:"thinking"` + ToolCalls []api.ToolCall `json:"tool_calls"` + DoneReason DoneReason `json:"done_reason"` + Done bool `json:"done"` + PromptEvalCount int `json:"prompt_eval_count"` + PromptEvalDuration time.Duration `json:"prompt_eval_duration"` + EvalCount int `json:"eval_count"` + EvalDuration time.Duration `json:"eval_duration"` } func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error { @@ -1498,7 +1506,8 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu return fmt.Errorf("error unmarshalling llm prediction response: %v", err) } switch { - case strings.TrimSpace(c.Content) == lastToken: + // TODO(parthsareen): token repeat limit is now handled in the runner, this currently support legacy model and can be removed in the future + case strings.TrimSpace(c.Content) == lastToken && c.Content != "": tokenRepeat++ default: lastToken = strings.TrimSpace(c.Content) @@ -1511,16 +1520,14 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu return ctx.Err() } - if c.Content != "" { - fn(CompletionResponse{ - Content: c.Content, - }) - } - if c.Done { fn(c) return nil } + + if c.Content != "" || c.Thinking != "" || len(c.ToolCalls) > 0 { + fn(c) + } } } diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index df3ce1d9f..a40643ef2 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -30,6 +30,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" + "github.com/ollama/ollama/harmony" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/ml" @@ -781,6 +782,14 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { return } + var harmonyMessageHandler *harmony.HarmonyMessageHandler + var harmonyToolParser *harmony.HarmonyToolCallAccumulator + if 
req.UseHarmony { + harmonyMessageHandler = harmony.NewHarmonyMessageHandler() + harmonyMessageHandler.HarmonyParser.AddImplicitStartOrPrefill(req.PrefillString) + harmonyToolParser = harmonyMessageHandler.CreateToolParser() + } + if req.Options == nil { opts := api.DefaultOptions() req.Options = &opts @@ -863,6 +872,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { http.Error(w, "could not find an available sequence", http.StatusInternalServerError) return } + var lastToken string + tokenRepeat := 0 + const tokenRepeatLimit = 30 for { select { @@ -871,8 +883,27 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { return case content, ok := <-seq.responses: if ok { + if strings.TrimSpace(content) == lastToken { + tokenRepeat++ + } + if tokenRepeat == tokenRepeatLimit { + http.Error(w, "token repeat limit reached", http.StatusInternalServerError) + seq.doneReason = llm.DoneReasonTokenRepeatLimit + close(seq.quit) + return + } + lastToken = strings.TrimSpace(content) + + var thinking string + if harmonyMessageHandler != nil { + var toolContent string + content, thinking, toolContent = harmonyMessageHandler.AddContent(content, harmonyToolParser) + harmonyToolParser.Add(toolContent) + } + if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{ - Content: content, + Content: content, + Thinking: thinking, }); err != nil { http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError) close(seq.quit) @@ -881,7 +912,29 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { flusher.Flush() } else { + var toolCalls []api.ToolCall + if harmonyMessageHandler != nil { + // these tools still need to be transformed to the original function name + toolName, toolContent := harmonyToolParser.Drain() + if toolName != nil { + *toolName = strings.TrimPrefix(*toolName, "functions.") + var args api.ToolCallFunctionArguments + if err := json.Unmarshal([]byte(toolContent), &args); err != nil { + http.Error(w, fmt.Sprintf("failed to unmarshal tool call function arguments: %v", err), http.StatusInternalServerError) + close(seq.quit) + return + } + toolCalls = append(toolCalls, api.ToolCall{ + Function: api.ToolCallFunction{ + Name: *toolName, + Arguments: args, + }, + }) + } + } + if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{ + ToolCalls: toolCalls, Done: true, DoneReason: seq.doneReason, PromptEvalCount: seq.numPromptInputs, diff --git a/server/routes.go b/server/routes.go index e6e4e2c47..73ea5fea4 100644 --- a/server/routes.go +++ b/server/routes.go @@ -46,18 +46,6 @@ import ( "github.com/ollama/ollama/version" ) -func shouldUseHarmony(model *Model) bool { - if slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) { - // heuristic to check whether the template expects to be parsed via harmony: - // search for harmony tags that are nearly always used - if model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>") { - return true - } - } - - return false -} - func experimentEnabled(name string) bool { return slices.Contains(strings.Split(os.Getenv("OLLAMA_EXPERIMENT"), ","), name) } @@ -207,13 +195,11 @@ func (s *Server) GenerateHandler(c *gin.Context) { return } - useHarmony := shouldUseHarmony(m) && !req.Raw - var harmonyMessageHandler *harmony.HarmonyMessageHandler - var harmonyToolParser *harmony.HarmonyToolCallAccumulator + useHarmony := harmony.ShouldUseHarmony(m.Config.ModelFamily, m.Template) && !req.Raw + var functionNameMap *harmony.FunctionNameMap + if 
useHarmony { - harmonyMessageHandler = harmony.NewHarmonyMessageHandler() - harmonyMessageHandler.HarmonyParser.AddImplicitStart() - harmonyToolParser = harmonyMessageHandler.CreateToolParser() + functionNameMap = harmony.NewFunctionNameMap() } // Validate Think value: string values currently only allowed for gptoss models @@ -357,16 +343,19 @@ func (s *Server) GenerateHandler(c *gin.Context) { var sb strings.Builder defer close(ch) if err := r.Completion(c.Request.Context(), llm.CompletionRequest{ - Prompt: prompt, - Images: images, - Format: req.Format, - Options: opts, + Prompt: prompt, + Images: images, + Format: req.Format, + Options: opts, + UseHarmony: useHarmony, }, func(cr llm.CompletionResponse) { res := api.GenerateResponse{ Model: req.Model, CreatedAt: time.Now().UTC(), Response: cr.Content, Done: cr.Done, + Thinking: cr.Thinking, + ToolCalls: cr.ToolCalls, Metrics: api.Metrics{ PromptEvalCount: cr.PromptEvalCount, PromptEvalDuration: cr.PromptEvalDuration, @@ -375,12 +364,22 @@ func (s *Server) GenerateHandler(c *gin.Context) { }, } + if res.Done { + res.DoneReason = cr.DoneReason.String() + res.TotalDuration = time.Since(checkpointStart) + res.LoadDuration = checkpointLoaded.Sub(checkpointStart) + } + if useHarmony { - content, thinking, toolContent := harmonyMessageHandler.AddContent(cr.Content, harmonyToolParser) - res.Response = content - res.Thinking = thinking - harmonyToolParser.Add(toolContent) - } else if thinkingState != nil { + for i, tool := range res.ToolCalls { + res.ToolCalls[i].Function.Name = functionNameMap.OriginalFromConverted(tool.Function.Name) + } + if res.Response != "" || res.Thinking != "" || len(res.ToolCalls) > 0 || res.Done { + ch <- res + } + return + } + if thinkingState != nil { thinking, content := thinkingState.AddContent(cr.Content) res.Thinking = thinking res.Response = content @@ -391,30 +390,6 @@ func (s *Server) GenerateHandler(c *gin.Context) { } if cr.Done { - if useHarmony { - toolName, toolContent := harmonyToolParser.Drain() - if toolName != nil { - *toolName = strings.TrimPrefix(*toolName, "functions.") - var args api.ToolCallFunctionArguments - if err := json.Unmarshal([]byte(toolContent), &args); err != nil { - errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error()) - ch <- gin.H{"error": errStr} - return - } - - res.ToolCalls = append(res.ToolCalls, api.ToolCall{ - Function: api.ToolCallFunction{ - Name: *toolName, - Arguments: args, - }, - }) - } - } - - res.DoneReason = cr.DoneReason.String() - res.TotalDuration = time.Since(checkpointStart) - res.LoadDuration = checkpointLoaded.Sub(checkpointStart) - if !req.Raw { tokens, err := r.Tokenize(c.Request.Context(), prompt+sb.String()) if err != nil { @@ -1616,27 +1591,21 @@ func (s *Server) ChatHandler(c *gin.Context) { } msgs = filterThinkTags(msgs, m) - var harmonyMessageHandler *harmony.HarmonyMessageHandler - var harmonyToolParser *harmony.HarmonyToolCallAccumulator - - useHarmony := shouldUseHarmony(m) + useHarmony := harmony.ShouldUseHarmony(m.Config.ModelFamily, m.Template) processedTools := req.Tools + var functionNameMap *harmony.FunctionNameMap + var prefillString string + // TODO(parthsareen): this can be abstracted to not be model specific and potentially moved to the runner if useHarmony { - harmonyMessageHandler = harmony.NewHarmonyMessageHandler() - var lastMessage *api.Message - if len(msgs) > 0 { - lastMessage = &msgs[len(msgs)-1] - } - harmonyMessageHandler.HarmonyParser.AddImplicitStartOrPrefill(lastMessage) - 
harmonyToolParser = harmonyMessageHandler.CreateToolParser() - + prefillString = harmony.Prefill(msgs[len(msgs)-1]) + functionNameMap = harmony.NewFunctionNameMap() // make a copy of tools to pass to the chat prompt. Function names may be // renamed to be valid Harmony function names. processedTools = make([]api.Tool, len(req.Tools)) copy(processedTools, req.Tools) for i, tool := range processedTools { - processedTools[i].Function.Name = harmonyMessageHandler.FunctionNameMap.ConvertAndAdd(tool.Function.Name) + processedTools[i].Function.Name = functionNameMap.ConvertAndAdd(tool.Function.Name) } } @@ -1689,15 +1658,17 @@ func (s *Server) ChatHandler(c *gin.Context) { defer close(ch) if err := r.Completion(c.Request.Context(), llm.CompletionRequest{ - Prompt: prompt, - Images: images, - Format: req.Format, - Options: opts, + Prompt: prompt, + Images: images, + Format: req.Format, + Options: opts, + UseHarmony: useHarmony, + PrefillString: prefillString, }, func(r llm.CompletionResponse) { res := api.ChatResponse{ Model: req.Model, CreatedAt: time.Now().UTC(), - Message: api.Message{Role: "assistant", Content: r.Content}, + Message: api.Message{Role: "assistant", Content: r.Content, Thinking: r.Thinking, ToolCalls: r.ToolCalls}, Done: r.Done, Metrics: api.Metrics{ PromptEvalCount: r.PromptEvalCount, @@ -1713,31 +1684,13 @@ func (s *Server) ChatHandler(c *gin.Context) { } if useHarmony { - content, thinking, toolContent := harmonyMessageHandler.AddContent(r.Content, harmonyToolParser) - res.Message.Content = content - res.Message.Thinking = thinking - harmonyToolParser.Add(toolContent) - - if r.Done { - toolName, toolContent := harmonyToolParser.Drain() - if toolName != nil { - *toolName = strings.TrimPrefix(*toolName, "functions.") - *toolName = harmonyMessageHandler.FunctionNameMap.OriginalFromConverted(*toolName) - var args api.ToolCallFunctionArguments - if err := json.Unmarshal([]byte(toolContent), &args); err != nil { - errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error()) - ch <- gin.H{"error": errStr} - return - } - res.Message.ToolCalls = []api.ToolCall{{Function: api.ToolCallFunction{Name: *toolName, Arguments: args}}} - } + for i, tool := range res.Message.ToolCalls { + res.Message.ToolCalls[i].Function.Name = functionNameMap.OriginalFromConverted(tool.Function.Name) } - // only send messages with meaningful content (empty messages confuse clients) if res.Message.Content != "" || res.Message.Thinking != "" || len(res.Message.ToolCalls) > 0 || res.Done { ch <- res } - return } diff --git a/server/routes_harmony_streaming_test.go b/server/routes_harmony_streaming_test.go index b1ede4e39..bcb020886 100644 --- a/server/routes_harmony_streaming_test.go +++ b/server/routes_harmony_streaming_test.go @@ -7,7 +7,6 @@ import ( "bytes" "context" "encoding/json" - "net/http" "strings" "testing" "time" @@ -118,7 +117,7 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) { name: "content streams as it arrives", steps: []step{ { - input: llm.CompletionResponse{Content: "<|message|>Hello", Done: false}, + input: llm.CompletionResponse{Content: "Hello", Done: false}, wantContent: "Hello", }, { @@ -126,7 +125,7 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) { wantContent: ", world", }, { - input: llm.CompletionResponse{Content: "!<|end|>", Done: true, DoneReason: llm.DoneReasonStop}, + input: llm.CompletionResponse{Content: "!", Done: true, DoneReason: llm.DoneReasonStop}, wantContent: "!", }, }, @@ -135,20 +134,15 @@ func 
TestChatHarmonyParserStreamingRealtime(t *testing.T) { name: "thinking streams separately from content", steps: []step{ { - input: llm.CompletionResponse{Content: "<|channel|>analysis<|message|>Thinking...", Done: false}, + input: llm.CompletionResponse{Thinking: "Thinking...", Done: false}, wantThinking: "Thinking...", }, { - input: llm.CompletionResponse{Content: "<|end|>", Done: false}, - // No output expected - just closes the analysis message and resets state to normal + input: llm.CompletionResponse{Content: "Answer", Done: false}, + wantContent: "Answer", }, { - input: llm.CompletionResponse{Content: "<|start|>assistant<|message|>Answer", Done: false}, - wantContent: "Answer", // After message end, state is reset to normal - }, - { - input: llm.CompletionResponse{Content: "<|end|>", Done: true, DoneReason: llm.DoneReasonStop}, - // No output expected - just closes the assistant message + input: llm.CompletionResponse{Done: true, DoneReason: llm.DoneReasonStop}, }, }, }, @@ -156,24 +150,16 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) { name: "partial tags buffer until complete", steps: []step{ { - input: llm.CompletionResponse{Content: "<|chan", Done: false}, - // No output - partial tag - }, - { - input: llm.CompletionResponse{Content: "nel|>analysis<|mess", Done: false}, - // No output - still building tags - }, - { - input: llm.CompletionResponse{Content: "age|>Deep ", Done: false}, + input: llm.CompletionResponse{Thinking: "Deep ", Done: false}, wantThinking: "Deep ", }, { - input: llm.CompletionResponse{Content: "thought<|end|>", Done: false}, + input: llm.CompletionResponse{Thinking: "thought", Done: false}, wantThinking: "thought", }, { - input: llm.CompletionResponse{Content: "<|start|>assistant<|message|>Done<|end|>", Done: true, DoneReason: llm.DoneReasonStop}, - wantContent: "Done", // After message end, state is reset to normal + input: llm.CompletionResponse{Content: "Done", Done: true, DoneReason: llm.DoneReasonStop}, + wantContent: "Done", }, }, }, @@ -181,7 +167,7 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) { name: "simple assistant after analysis", steps: []step{ { - input: llm.CompletionResponse{Content: "<|channel|>analysis<|message|>Think<|end|><|start|>assistant<|message|>Answer<|end|>", Done: true, DoneReason: llm.DoneReasonStop}, + input: llm.CompletionResponse{Thinking: "Think", Content: "Answer", Done: true, DoneReason: llm.DoneReasonStop}, wantContent: "Answer", wantThinking: "Think", }, @@ -191,7 +177,7 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) { name: "tool call parsed and returned correctly", steps: []step{ { - input: llm.CompletionResponse{Content: "<|channel|>commentary to=functions.get_weather<|message|>{\"location\":\"San Francisco\"}<|end|><|start|>assistant<|message|>The weather is sunny<|end|>", Done: true, DoneReason: llm.DoneReasonStop}, + input: llm.CompletionResponse{Content: "The weather is sunny", ToolCalls: []api.ToolCall{{Function: api.ToolCallFunction{Name: "get_weather", Arguments: api.ToolCallFunctionArguments{"location": "San Francisco"}}}}, Done: true, DoneReason: llm.DoneReasonStop}, wantContent: "The weather is sunny", wantToolCalls: []api.ToolCall{ { @@ -210,15 +196,10 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) { name: "tool call with streaming JSON across chunks", steps: []step{ { - input: llm.CompletionResponse{Content: "<|channel|>commentary to=functions.calculate<|message|>{\"expr", Done: false}, - // No output yet - incomplete JSON + input: 
llm.CompletionResponse{Done: false}, }, { - input: llm.CompletionResponse{Content: "ession\":\"2+", Done: false}, - // Still no output - incomplete JSON - }, - { - input: llm.CompletionResponse{Content: "2\"}", Done: true}, + input: llm.CompletionResponse{ToolCalls: []api.ToolCall{{Function: api.ToolCallFunction{Name: "calculate", Arguments: api.ToolCallFunctionArguments{"expression": "2+2"}}}}, Done: true}, wantToolCalls: []api.ToolCall{ { Function: api.ToolCallFunction{ @@ -400,9 +381,9 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) { gin.SetMode(gin.TestMode) mockResponses := []llm.CompletionResponse{ - {Content: "<|message|>First ", Done: false}, + {Content: "First ", Done: false}, {Content: "chunk ", Done: false}, - {Content: "here<|end|>", Done: true, DoneReason: llm.DoneReasonStop}, + {Content: "here", Done: true, DoneReason: llm.DoneReasonStop}, } mock := mockRunner{ @@ -507,189 +488,3 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) { t.Errorf("expected at least 2 content chunks for streaming, got %d", contentChunks) } } - -func TestChatHarmonyParserStreaming(t *testing.T) { - gin.SetMode(gin.TestMode) - - type expectedChunk struct { - afterResponse int // Which mock response this chunk should appear after - content string // Expected content in this chunk - thinking string // Expected thinking in this chunk - } - - testCases := []struct { - name string - mockResponses []llm.CompletionResponse - expectedChunks []expectedChunk - wantContent string - wantThinking string - }{ - { - name: "simple message without thinking", - mockResponses: []llm.CompletionResponse{ - {Content: "<|start|>assistant<|message|>Hello, ", Done: false}, - {Content: "how can I help?", Done: false}, - {Content: "<|end|>", Done: true, DoneReason: llm.DoneReasonStop}, - }, - expectedChunks: []expectedChunk{ - {afterResponse: 1, content: "Hello, "}, - {afterResponse: 2, content: "how can I help?"}, - }, - wantContent: "Hello, how can I help?", - }, - { - name: "message with analysis channel for thinking", - mockResponses: []llm.CompletionResponse{ - {Content: "<|channel|>analysis<|message|>", Done: false}, - {Content: "Let me think ", Done: false}, - {Content: "about this problem...", Done: false}, - {Content: "<|end|>", Done: false}, - {Content: "<|start|>assistant<|message|>", Done: false}, - {Content: "The answer ", Done: false}, - {Content: "is 42", Done: false}, - {Content: "<|end|>", Done: true, DoneReason: llm.DoneReasonStop}, - }, - expectedChunks: []expectedChunk{ - {afterResponse: 2, thinking: "Let me think "}, - {afterResponse: 3, thinking: "about this problem..."}, - {afterResponse: 6, content: "The answer "}, - {afterResponse: 7, content: "is 42"}, - }, - wantContent: "The answer is 42", - wantThinking: "Let me think about this problem...", - }, - { - name: "streaming with partial tags across boundaries", - mockResponses: []llm.CompletionResponse{ - {Content: "<|chan", Done: false}, - {Content: "nel|>analy", Done: false}, - {Content: "sis<|mess", Done: false}, - {Content: "age|>Think", Done: false}, - {Content: "ing deeply...<|end|>", Done: false}, - {Content: "<|start|>assi", Done: false}, - {Content: "stant<|message|>Result ", Done: false}, - {Content: "computed<|e", Done: false}, - {Content: "nd|>", Done: true, DoneReason: llm.DoneReasonStop}, - }, - expectedChunks: []expectedChunk{ - {afterResponse: 4, thinking: "Think"}, - {afterResponse: 5, thinking: "ing deeply..."}, - {afterResponse: 7, content: "Result "}, - {afterResponse: 8, content: "computed"}, - }, - wantContent: 
"Result computed", - wantThinking: "Thinking deeply...", - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - // Channel to synchronize mock responses with chunk verification - responsesSent := make(chan int, len(tc.mockResponses)) - - mock := mockRunner{ - CompletionFn: func(ctx context.Context, r llm.CompletionRequest, fn func(llm.CompletionResponse)) error { - // Send mock responses one at a time, notifying when each is sent - for i, resp := range tc.mockResponses { - fn(resp) - responsesSent <- i + 1 - } - close(responsesSent) - return nil - }, - } - - s := Server{ - sched: &Scheduler{ - pendingReqCh: make(chan *LlmRequest, 1), - finishedReqCh: make(chan *LlmRequest, 1), - expiredCh: make(chan *runnerRef, 1), - unloadedCh: make(chan any, 1), - loaded: make(map[string]*runnerRef), - newServerFn: newMockServer(&mock), - getGpuFn: discover.GetGPUInfo, - getCpuFn: discover.GetCPUInfo, - reschedDelay: 250 * time.Millisecond, - loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool { - req.successCh <- &runnerRef{ - llama: &mock, - } - return false - }, - }, - } - - go s.sched.Run(t.Context()) - - // Create a minimal model - _, digest := createHarmonyTestModel(t) - - // Create model with passthrough template - stream := false - w := createRequest(t, s.CreateHandler, api.CreateRequest{ - Model: "harmony-test", - Files: map[string]string{"file.gguf": digest}, - Template: `<|start|><|end|>{{ with .Tools }}{{ end }}{{ .Prompt }}`, - Stream: &stream, - }) - - if w.Code != http.StatusOK { - t.Fatalf("failed to create model: %d", w.Code) - } - - // Test chat endpoint with streaming - streamTrue := true - w = createRequest(t, s.ChatHandler, api.ChatRequest{ - Model: "harmony-test", - Messages: []api.Message{{Role: "user", Content: "Hello"}}, - Stream: &streamTrue, - Tools: getTestTools(), - }) - - if w.Code != http.StatusOK { - t.Fatalf("chat request failed: %d - %s", w.Code, w.Body.String()) - } - - // Parse streaming response - var chunks []api.ChatResponse - var content, thinking strings.Builder - - decoder := json.NewDecoder(w.Body) - for decoder.More() { - var chunk api.ChatResponse - if err := decoder.Decode(&chunk); err != nil { - t.Fatalf("failed to decode chunk: %v", err) - } - chunks = append(chunks, chunk) - - // Accumulate content and thinking from each chunk - content.WriteString(chunk.Message.Content) - thinking.WriteString(chunk.Message.Thinking) - - // Debug output - t.Logf("Chunk %d: content=%q thinking=%q done=%v", len(chunks), chunk.Message.Content, chunk.Message.Thinking, chunk.Done) - } - - // Verify we got streaming chunks - if len(chunks) == 0 { - t.Fatal("expected streaming chunks, got none") - } - - gotContent := content.String() - gotThinking := thinking.String() - - if gotContent != tc.wantContent { - t.Errorf("content mismatch: got %q, want %q", gotContent, tc.wantContent) - } - if gotThinking != tc.wantThinking { - t.Errorf("thinking mismatch: got %q, want %q", gotThinking, tc.wantThinking) - } - - // Verify last chunk has done=true - lastChunk := chunks[len(chunks)-1] - if !lastChunk.Done { - t.Error("expected last chunk to have done=true") - } - }) - } -} From e119783e66335c861e253734f36166d1a99f2e78 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 8 Sep 2025 17:33:31 -0700 Subject: [PATCH 05/13] llm: Clamp batch size to context size The context must always be able to store the current batch, so if the user requests a small context then we should also shrink the batch to match. 
This also fixes the TestLongInputContext test on the new engine. (The old engine already has this behavior.) --- integration/context_test.go | 2 +- llm/server.go | 2 ++ runner/ollamarunner/cache.go | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/integration/context_test.go b/integration/context_test.go index 24c57dcf2..ca6f16087 100644 --- a/integration/context_test.go +++ b/integration/context_test.go @@ -36,7 +36,7 @@ func TestLongInputContext(t *testing.T) { if err := PullIfMissing(ctx, client, req.Model); err != nil { t.Fatalf("PullIfMissing failed: %v", err) } - DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second) + DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second) } func TestContextExhaustion(t *testing.T) { diff --git a/llm/server.go b/llm/server.go index e0a652ec0..7bc2ca13d 100644 --- a/llm/server.go +++ b/llm/server.go @@ -173,6 +173,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a opts.NumCtx = int(trainCtx) } + opts.NumBatch = min(opts.NumBatch, opts.NumCtx) + loadRequest := LoadRequest{LoraPath: adapters, KvSize: opts.NumCtx * numParallel, BatchSize: opts.NumBatch, Parallel: numParallel, MultiUserCache: envconfig.MultiUserCache()} defaultThreads := discover.GetSystemInfo().GetOptimalThreadCount() diff --git a/runner/ollamarunner/cache.go b/runner/ollamarunner/cache.go index 1d1ca5180..f558f7b87 100644 --- a/runner/ollamarunner/cache.go +++ b/runner/ollamarunner/cache.go @@ -34,8 +34,8 @@ type InputCache struct { func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, batchSize int, multiUserCache bool) (*InputCache, error) { numCtx := kvSize / int32(numSlots) - if numCtx < 1 { - return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots) + if int(numCtx) < batchSize { + return nil, fmt.Errorf("kv size must be at least as large as batch size * parallel (kv: %v batch: %v parallel: %v)", kvSize, batchSize, numSlots) } slots := make([]InputCacheSlot, numSlots) From f810ec741c5563a77d05719a1459b885aa641b27 Mon Sep 17 00:00:00 2001 From: Kashyap Tanuku Date: Tue, 9 Sep 2025 00:31:29 -0400 Subject: [PATCH 06/13] readme: add Clueless to community integrations (#12188) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 26742e8a0..481a29aea 100644 --- a/README.md +++ b/README.md @@ -413,6 +413,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.) - [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.) - [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models) +- [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. 
Also undetectable to screenshare) ### Cloud From 6745182885e798be81c70f277eaea81cc4f71524 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 9 Sep 2025 09:32:15 -0700 Subject: [PATCH 07/13] tests: reduce stress on CPU to 2 models (#12161) * tests: reduce stress on CPU to 2 models This should avoid flakes due to systems getting overloaded with 3 (or more) models running concurrently * tests: allow slow systems to pass on timeout If a slow system is still streaming a response, and the response will pass validation, don't fail just because the system is slow. * test: unload embedding models more quickly --- integration/concurrency_test.go | 8 ++++ integration/embed_test.go | 5 ++- integration/utils_test.go | 72 ++++++++++++++++++++------------- 3 files changed, 55 insertions(+), 30 deletions(-) diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go index 331bb6e75..3104eacca 100644 --- a/integration/concurrency_test.go +++ b/integration/concurrency_test.go @@ -121,6 +121,7 @@ func TestMultiModelStress(t *testing.T) { // The intent is to go 1 over what can fit so we force the scheduler to thrash targetLoadCount := 0 slog.Info("Loading models to find how many can fit in VRAM before overflowing") +chooseModels: for i, model := range chosenModels { req := &api.GenerateRequest{Model: model} slog.Info("loading", "model", model) @@ -142,6 +143,13 @@ func TestMultiModelStress(t *testing.T) { slog.Info("found model load capacity", "target", targetLoadCount, "current", loaded, "chosen", chosenModels[:targetLoadCount]) break } + // Effectively limit model count to 2 on CPU only systems to avoid thrashing and timeouts + for _, m := range models.Models { + if m.SizeVRAM == 0 { + slog.Info("model running on CPU", "name", m.Name, "target", targetLoadCount, "chosen", chosenModels[:targetLoadCount]) + break chooseModels + } + } } } if targetLoadCount == len(chosenModels) { diff --git a/integration/embed_test.go b/integration/embed_test.go index 09369dbb4..eb00f4ba6 100644 --- a/integration/embed_test.go +++ b/integration/embed_test.go @@ -38,8 +38,9 @@ func TestAllMiniLMEmbeddings(t *testing.T) { defer cleanup() req := api.EmbeddingRequest{ - Model: "all-minilm", - Prompt: "why is the sky blue?", + Model: "all-minilm", + Prompt: "why is the sky blue?", + KeepAlive: &api.Duration{Duration: 10 * time.Second}, } res, err := embeddingTestHelper(ctx, client, t, req) diff --git a/integration/utils_test.go b/integration/utils_test.go index 2bb6a1570..ec74b2e3d 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -502,6 +502,22 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap done <- 0 }() + var response string + verify := func() { + // Verify the response contains the expected data + response = buf.String() + atLeastOne := false + for _, resp := range anyResp { + if strings.Contains(strings.ToLower(response), resp) { + atLeastOne = true + break + } + } + if !atLeastOne { + t.Fatalf("%s: none of %v found in %s", genReq.Model, anyResp, response) + } + } + select { case <-stallTimer.C: if buf.Len() == 0 { @@ -517,21 +533,14 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap if genErr != nil { t.Fatalf("%s failed with %s request prompt %s", genErr, genReq.Model, genReq.Prompt) } - // Verify the response contains the expected data - response := buf.String() - atLeastOne := false - for _, resp := range anyResp { - if strings.Contains(strings.ToLower(response), resp) { - atLeastOne = true - break - } - } - 
if !atLeastOne { - t.Fatalf("%s: none of %v found in %s", genReq.Model, anyResp, response) - } + verify() slog.Info("test pass", "model", genReq.Model, "prompt", genReq.Prompt, "contains", anyResp, "response", response) case <-ctx.Done(): - t.Error("outer test context done while waiting for generate") + // On slow systems, we might timeout before some models finish rambling, so check what we have so far to see + // if it's considered a pass - the stallTimer will detect hangs, but we want to consider slow systems a pass + // if they are still generating valid responses + slog.Warn("outer test context done while waiting for generate") + verify() } return context } @@ -599,6 +608,22 @@ func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatR done <- 0 }() + var response string + verify := func() { + // Verify the response contains the expected data + response = buf.String() + atLeastOne := false + for _, resp := range anyResp { + if strings.Contains(strings.ToLower(response), resp) { + atLeastOne = true + break + } + } + if !atLeastOne { + t.Fatalf("%s: none of %v found in \"%s\" -- request was:%v", req.Model, anyResp, response, req.Messages) + } + } + select { case <-stallTimer.C: if buf.Len() == 0 { @@ -614,23 +639,14 @@ func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatR if genErr != nil { t.Fatalf("%s failed with %s request prompt %v", genErr, req.Model, req.Messages) } - - // Verify the response contains the expected data - response := buf.String() - atLeastOne := false - for _, resp := range anyResp { - if strings.Contains(strings.ToLower(response), resp) { - atLeastOne = true - break - } - } - if !atLeastOne { - t.Fatalf("%s: none of %v found in \"%s\" -- request was:%v", req.Model, anyResp, response, req.Messages) - } - + verify() slog.Info("test pass", "model", req.Model, "messages", req.Messages, "contains", anyResp, "response", response) case <-ctx.Done(): - t.Error("outer test context done while waiting for generate") + // On slow systems, we might timeout before some models finish rambling, so check what we have so far to see + // if it's considered a pass - the stallTimer will detect hangs, but we want to consider slow systems a pass + // if they are still generating valid responses + slog.Warn("outer test context done while waiting for chat") + verify() } return &api.Message{Role: role, Content: buf.String()} } From 20b53eaa726a4c08043c7af1fa6a322295edcde2 Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Tue, 9 Sep 2025 14:01:11 -0700 Subject: [PATCH 08/13] tests: add tool calling integration test (#12232) --- integration/api_test.go | 96 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/integration/api_test.go b/integration/api_test.go index 0baba8827..c39192c99 100644 --- a/integration/api_test.go +++ b/integration/api_test.go @@ -410,3 +410,99 @@ func TestAPIEmbeddings(t *testing.T) { t.Errorf("zero length embedding response") } } + +func TestAPIToolCalling(t *testing.T) { + initialTimeout := 60 * time.Second + streamTimeout := 30 * time.Second + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer cancel() + + client, _, cleanup := InitServerConnection(ctx, t) + defer cleanup() + + modelName := "qwen3:0.6b" + if err := PullIfMissing(ctx, client, modelName); err != nil { + t.Fatalf("pull failed %s", err) + } + + tools := []api.Tool{ + { + Type: "function", + Function: api.ToolFunction{ + Name: "get_weather", + Description: "Get the current weather in a given 
location", + Parameters: api.ToolFunctionParameters{ + Type: "object", + Required: []string{"location"}, + Properties: map[string]api.ToolProperty{ + "location": { + Type: api.PropertyType{"string"}, + Description: "The city and state, e.g. San Francisco, CA", + }, + }, + }, + }, + }, + } + + req := api.ChatRequest{ + Model: modelName, + Messages: []api.Message{ + { + Role: "user", + Content: "Call get_weather with location set to San Francisco.", + }, + }, + Tools: tools, + Options: map[string]any{ + "temperature": 0, + }, + } + + stallTimer := time.NewTimer(initialTimeout) + var gotToolCall bool + var lastToolCall api.ToolCall + + fn := func(response api.ChatResponse) error { + if len(response.Message.ToolCalls) > 0 { + gotToolCall = true + lastToolCall = response.Message.ToolCalls[len(response.Message.ToolCalls)-1] + } + if !stallTimer.Reset(streamTimeout) { + return fmt.Errorf("stall was detected while streaming response, aborting") + } + return nil + } + + stream := true + req.Stream = &stream + done := make(chan int) + var genErr error + go func() { + genErr = client.Chat(ctx, &req, fn) + done <- 0 + }() + + select { + case <-stallTimer.C: + t.Errorf("tool-calling chat never started. Timed out after: %s", initialTimeout.String()) + case <-done: + if genErr != nil { + t.Fatalf("chat failed: %v", genErr) + } + + if !gotToolCall { + t.Fatalf("expected at least one tool call, got none") + } + + if lastToolCall.Function.Name != "get_weather" { + t.Errorf("unexpected tool called: got %q want %q", lastToolCall.Function.Name, "get_weather") + } + + if _, ok := lastToolCall.Function.Arguments["location"]; !ok { + t.Errorf("expected tool arguments to include 'location', got: %s", lastToolCall.Function.Arguments.String()) + } + case <-ctx.Done(): + t.Error("outer test context done while waiting for tool-calling chat") + } +} From 8d6fffaead722d86622b17c059051796d2621858 Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Wed, 10 Sep 2025 11:24:42 -0700 Subject: [PATCH 09/13] runner: simplify parser entrypoints in runner (#12233) --- harmony/harmonyparser.go | 18 +++-- harmony/harmonyparser_test.go | 28 ++++---- llm/server.go | 7 +- parser/token_parser.go | 126 ++++++++++++++++++++++++++++++++++ runner/ollamarunner/runner.go | 54 +++------------ server/routes.go | 17 ++++- 6 files changed, 173 insertions(+), 77 deletions(-) create mode 100644 parser/token_parser.go diff --git a/harmony/harmonyparser.go b/harmony/harmonyparser.go index addce4c94..3ec2c21f1 100644 --- a/harmony/harmonyparser.go +++ b/harmony/harmonyparser.go @@ -289,6 +289,7 @@ type HarmonyMessageHandler struct { state harmonyMessageState HarmonyParser *HarmonyParser FunctionNameMap *FunctionNameMap + ToolParser *HarmonyToolCallAccumulator } // NewHarmonyMessageHandler creates a new message handler @@ -301,12 +302,16 @@ func NewHarmonyMessageHandler() *HarmonyMessageHandler { HeaderEndTag: "<|message|>", }, FunctionNameMap: NewFunctionNameMap(), + ToolParser: &HarmonyToolCallAccumulator{ + state: harmonyToolCallState_Normal, + currentToolName: nil, + }, } } // AddContent processes the content and returns the content, thinking, and tool content. 
// content and thinking are already fully parsed, but tool content still needs to be passed to the tool parser -func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyToolCallAccumulator) (string, string, string) { +func (h *HarmonyMessageHandler) AddContent(content string) (string, string, string) { contentSb := strings.Builder{} thinkingSb := strings.Builder{} toolContentSb := strings.Builder{} @@ -323,14 +328,14 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo // event.Header.Recipient is the tool name, something like // "browser.search" for a built-in, or "functions.calc" for a // custom one - toolParser.SetToolName(event.Header.Recipient) + h.ToolParser.SetToolName(event.Header.Recipient) } else { h.state = harmonyMessageState_Thinking } case "commentary": if event.Header.Recipient != "" { h.state = harmonyMessageState_ToolCalling - toolParser.SetToolName(event.Header.Recipient) + h.ToolParser.SetToolName(event.Header.Recipient) } else { h.state = harmonyMessageState_Normal } @@ -353,13 +358,6 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo return contentSb.String(), thinkingSb.String(), toolContentSb.String() } -func (h *HarmonyMessageHandler) CreateToolParser() *HarmonyToolCallAccumulator { - return &HarmonyToolCallAccumulator{ - state: harmonyToolCallState_Normal, - currentToolName: nil, - } -} - type harmonyToolCallState int const ( diff --git a/harmony/harmonyparser_test.go b/harmony/harmonyparser_test.go index dcf1af4e8..82bf5b2de 100644 --- a/harmony/harmonyparser_test.go +++ b/harmony/harmonyparser_test.go @@ -541,7 +541,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) { t.Run("thinking_then_content_streams", func(t *testing.T) { handler := NewHarmonyMessageHandler() handler.HarmonyParser.AddImplicitStart() - tp := handler.CreateToolParser() + tp := handler.ToolParser type step struct { in string wantContent string @@ -554,7 +554,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) { {in: "<|end|>", wantContent: ""}, } for i, s := range steps { - content, thinking, tool := handler.AddContent(s.in, tp) + content, thinking, tool := handler.AddContent(s.in) if tool != "" { tp.Add(tool) } @@ -567,7 +567,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) { t.Run("content_streams_as_it_arrives", func(t *testing.T) { handler := NewHarmonyMessageHandler() handler.HarmonyParser.AddImplicitStart() - tp := handler.CreateToolParser() + tp := handler.ToolParser inputs := []string{ "<|start|>assistant<|message|>Hello", ", world", @@ -575,7 +575,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) { } var got []string for _, in := range inputs { - content, thinking, tool := handler.AddContent(in, tp) + content, thinking, tool := handler.AddContent(in) if tool != "" { tp.Add(tool) } @@ -595,7 +595,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) { t.Run("thinking_streams_separately_from_content", func(t *testing.T) { handler := NewHarmonyMessageHandler() handler.HarmonyParser.AddImplicitStart() - tp := handler.CreateToolParser() + tp := handler.ToolParser inputs := []string{ "<|channel|>analysis<|message|>Thinking...", "<|end|>", @@ -604,7 +604,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) { } var got []string for _, in := range inputs { - content, thinking, tool := handler.AddContent(in, tp) + content, thinking, tool := handler.AddContent(in) if tool != "" { tp.Add(tool) } @@ -624,7 +624,7 @@ 
func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) { t.Run("partial_tags_buffer_until_complete", func(t *testing.T) { handler := NewHarmonyMessageHandler() handler.HarmonyParser.AddImplicitStart() - tp := handler.CreateToolParser() + tp := handler.ToolParser inputs := []string{ "<|chan", "nel|>analysis<|mess", @@ -637,7 +637,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) { var thinkingPieces []string var contentPieces []string for _, in := range inputs { - content, thinking, tool := handler.AddContent(in, tp) + content, thinking, tool := handler.AddContent(in) if tool != "" { tp.Add(tool) } @@ -659,7 +659,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) { t.Run("simple_assistant_after_analysis", func(t *testing.T) { handler := NewHarmonyMessageHandler() handler.HarmonyParser.AddImplicitStart() - tp := handler.CreateToolParser() + tp := handler.ToolParser inputs := []string{ "<|channel|>analysis<|message|>Think", "<|end|>", @@ -668,7 +668,7 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) { } var contentSb, thinkingSb strings.Builder for _, in := range inputs { - content, thinking, tool := handler.AddContent(in, tp) + content, thinking, tool := handler.AddContent(in) if tool != "" { tp.Add(tool) } @@ -686,12 +686,12 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) { t.Run("tool_call_parsed_and_returned_correctly", func(t *testing.T) { handler := NewHarmonyMessageHandler() handler.HarmonyParser.AddImplicitStart() - tp := handler.CreateToolParser() + tp := handler.ToolParser inputs := []string{ "<|channel|>commentary to=functions.calculate<|message|>{\"expression\":\"2+2\"}<|end|>", } for _, in := range inputs { - content, thinking, tool := handler.AddContent(in, tp) + content, thinking, tool := handler.AddContent(in) if content != "" || thinking != "" { continue } @@ -711,14 +711,14 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) { t.Run("tool_call_across_chunks", func(t *testing.T) { handler := NewHarmonyMessageHandler() handler.HarmonyParser.AddImplicitStart() - tp := handler.CreateToolParser() + tp := handler.ToolParser inputs := []string{ "<|channel|>commentary to=functions.calculate<|message|>{\"expression\":\"2+", "2\"}", "<|end|>", } for _, in := range inputs { - content, thinking, tool := handler.AddContent(in, tp) + content, thinking, tool := handler.AddContent(in) if content != "" || thinking != "" { continue } diff --git a/llm/server.go b/llm/server.go index 7bc2ca13d..4740a1fd4 100644 --- a/llm/server.go +++ b/llm/server.go @@ -35,6 +35,7 @@ import ( "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/model" + "github.com/ollama/ollama/parser" ) type filteredEnv []string @@ -1350,7 +1351,7 @@ type CompletionRequest struct { Options *api.Options Grammar string // set before sending the request to the subprocess - UseHarmony bool + ParserType parser.TokenParserType PrefillString string } @@ -1364,8 +1365,6 @@ const ( DoneReasonLength // DoneReasonConnectionClosed indicates the completion stopped due to the connection being closed DoneReasonConnectionClosed - // DoneReasonTokenRepeatLimit indicates the completion stopped due to a token repeat limit - DoneReasonTokenRepeatLimit ) func (d DoneReason) String() string { @@ -1374,8 +1373,6 @@ func (d DoneReason) String() string { return "length" case DoneReasonStop: return "stop" - case DoneReasonTokenRepeatLimit: - return "token_repeat_limit" default: return "" // closed } diff --git 
a/parser/token_parser.go b/parser/token_parser.go new file mode 100644 index 000000000..812458299 --- /dev/null +++ b/parser/token_parser.go @@ -0,0 +1,126 @@ +package parser + +import ( + "encoding/json" + "errors" + "strings" + + "github.com/ollama/ollama/api" + "github.com/ollama/ollama/harmony" +) + +type TokenParserType int + +const ( + TokenParserTypeDefault TokenParserType = iota + TokenParserTypeHarmony +) + +type TokenParser struct { + messageHandler MessageHandler + parserEngine ParserInternals + toolParser ToolParser + lastToken string + tokenRepeat int + repeatLimit int +} + +const defaultTokenRepeatLimit = 30 + +type MessageHandler interface { + AddContent(token string) (content, thinking string, toolContent string) +} + +type ParserInternals interface { + AddImplicitStartOrPrefill(prefillString string) +} + +type ToolParser interface { + Add(token string) + Drain() (toolName *string, toolContent string) +} + +// Default implementation for the TokenParser interface as a no-op passthrough +type defaultMessageHandler struct{} + +func (defaultMessageHandler) AddContent(token string) (string, string, string) { + return token, "", "" +} + +type defaultEngine struct{} + +func (defaultEngine) AddImplicitStartOrPrefill(prefillString string) {} + +type defaultToolParser struct{} + +func (defaultToolParser) Add(token string) {} + +func (defaultToolParser) Drain() (*string, string) { return nil, "" } + +func NewTokenParser(parserType TokenParserType, prefillString string) TokenParser { + switch parserType { + case TokenParserTypeHarmony: + harmonyMessageHandler := harmony.NewHarmonyMessageHandler() + harmonyMessageHandler.HarmonyParser.AddImplicitStartOrPrefill(prefillString) + return TokenParser{ + messageHandler: harmonyMessageHandler, + parserEngine: harmonyMessageHandler.HarmonyParser, + toolParser: harmonyMessageHandler.ToolParser, + repeatLimit: defaultTokenRepeatLimit, + } + + default: + return TokenParser{ + messageHandler: defaultMessageHandler{}, + parserEngine: defaultEngine{}, + toolParser: defaultToolParser{}, + repeatLimit: 30, + } + } +} + +func (p *TokenParser) AddContent(token string) (string, string, error) { + if p.repeatLimitReached(token) { + return "", "", errors.New("token repeat limit reached") + } + content, thinking, toolContent := p.messageHandler.AddContent(token) + p.toolParser.Add(toolContent) + return content, thinking, nil +} + +// repeatLimitReached updates repeat counters and returns true if the repeat limit is reached. 
+func (p *TokenParser) repeatLimitReached(token string) bool { + if p == nil { + return false + } + trimmed := strings.TrimSpace(token) + if trimmed == p.lastToken { + p.tokenRepeat++ + } else { + p.tokenRepeat = 0 + } + p.lastToken = trimmed + + return p.tokenRepeat >= p.repeatLimit +} + +// TODO: update to work with multiple toolcalls - unmarshalling should also happen on parser level +func (p *TokenParser) Drain() []api.ToolCall { + toolName, toolContent := p.toolParser.Drain() + if toolName != nil { + *toolName = strings.TrimPrefix(*toolName, "functions.") + var args api.ToolCallFunctionArguments + if err := json.Unmarshal([]byte(toolContent), &args); err != nil { + return nil + } + return []api.ToolCall{ + { + Function: api.ToolCallFunction{ + Name: *toolName, + Arguments: args, + }, + }, + } + } + return nil +} diff --git a/runner/ollamarunner/runner.go b/runner/ollamarunner/runner.go index a40643ef2..201d55a16 100644 --- a/runner/ollamarunner/runner.go +++ b/runner/ollamarunner/runner.go @@ -30,12 +30,12 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" - "github.com/ollama/ollama/harmony" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/ml" "github.com/ollama/ollama/model" "github.com/ollama/ollama/model/input" + "github.com/ollama/ollama/parser" "github.com/ollama/ollama/runner/common" "github.com/ollama/ollama/sample" @@ -782,13 +782,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { return } - var harmonyMessageHandler *harmony.HarmonyMessageHandler - var harmonyToolParser *harmony.HarmonyToolCallAccumulator - if req.UseHarmony { - harmonyMessageHandler = harmony.NewHarmonyMessageHandler() - harmonyMessageHandler.HarmonyParser.AddImplicitStartOrPrefill(req.PrefillString) - harmonyToolParser = harmonyMessageHandler.CreateToolParser() - } + tokenParser := parser.NewTokenParser(req.ParserType, req.PrefillString) if req.Options == nil { opts := api.DefaultOptions() @@ -872,9 +866,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { http.Error(w, "could not find an available sequence", http.StatusInternalServerError) return } - var lastToken string - tokenRepeat := 0 - const tokenRepeatLimit = 30 for { select { @@ -883,23 +874,14 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { return case content, ok := <-seq.responses: if ok { - if strings.TrimSpace(content) == lastToken { - tokenRepeat++ - } - if tokenRepeat == tokenRepeatLimit { - http.Error(w, "token repeat limit reached", http.StatusInternalServerError) - seq.doneReason = llm.DoneReasonTokenRepeatLimit + var thinking string + var err error + content, thinking, err = tokenParser.AddContent(content) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) close(seq.quit) return } - lastToken = strings.TrimSpace(content) - - var thinking string - if harmonyMessageHandler != nil { - var toolContent string - content, thinking, toolContent = harmonyMessageHandler.AddContent(content, harmonyToolParser) - harmonyToolParser.Add(toolContent) - } if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{ Content: content, @@ -912,27 +894,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) { flusher.Flush() } else { - var toolCalls []api.ToolCall - if harmonyMessageHandler != nil { - // these tools still need to be transformed to the original function name - toolName, toolContent := harmonyToolParser.Drain() - if toolName != nil { - *toolName = 
strings.TrimPrefix(*toolName, "functions.") - var args api.ToolCallFunctionArguments - if err := json.Unmarshal([]byte(toolContent), &args); err != nil { - http.Error(w, fmt.Sprintf("failed to unmarshal tool call function arguments: %v", err), http.StatusInternalServerError) - close(seq.quit) - return - } - toolCalls = append(toolCalls, api.ToolCall{ - Function: api.ToolCallFunction{ - Name: *toolName, - Arguments: args, - }, - }) - } - } - + toolCalls := tokenParser.Drain() if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{ ToolCalls: toolCalls, Done: true, diff --git a/server/routes.go b/server/routes.go index 73ea5fea4..ac4df4a46 100644 --- a/server/routes.go +++ b/server/routes.go @@ -36,6 +36,7 @@ import ( "github.com/ollama/ollama/llm" "github.com/ollama/ollama/logutil" "github.com/ollama/ollama/openai" + "github.com/ollama/ollama/parser" "github.com/ollama/ollama/server/internal/client/ollama" "github.com/ollama/ollama/server/internal/registry" "github.com/ollama/ollama/template" @@ -196,6 +197,12 @@ func (s *Server) GenerateHandler(c *gin.Context) { } useHarmony := harmony.ShouldUseHarmony(m.Config.ModelFamily, m.Template) && !req.Raw + var parserType parser.TokenParserType + if useHarmony { + parserType = parser.TokenParserTypeHarmony + } else { + parserType = parser.TokenParserTypeDefault + } var functionNameMap *harmony.FunctionNameMap if useHarmony { @@ -347,7 +354,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { Images: images, Format: req.Format, Options: opts, - UseHarmony: useHarmony, + ParserType: parserType, }, func(cr llm.CompletionResponse) { res := api.GenerateResponse{ Model: req.Model, @@ -1592,6 +1599,12 @@ func (s *Server) ChatHandler(c *gin.Context) { msgs = filterThinkTags(msgs, m) useHarmony := harmony.ShouldUseHarmony(m.Config.ModelFamily, m.Template) + var parserType parser.TokenParserType + if useHarmony { + parserType = parser.TokenParserTypeHarmony + } else { + parserType = parser.TokenParserTypeDefault + } processedTools := req.Tools var functionNameMap *harmony.FunctionNameMap @@ -1662,7 +1675,7 @@ func (s *Server) ChatHandler(c *gin.Context) { Images: images, Format: req.Format, Options: opts, - UseHarmony: useHarmony, + ParserType: parserType, PrefillString: prefillString, }, func(r llm.CompletionResponse) { res := api.ChatResponse{ From 17a023f34b9ec81afd39aca36f3dcd488c4d48ea Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 10 Sep 2025 12:05:18 -0700 Subject: [PATCH 10/13] Add v12 + v13 cuda support (#12000) * Add support for upcoming NVIDIA Jetsons The latest Jetsons with JetPack 7 are moving to an SBSA compatible model and will not require building a JetPack specific variant. * cuda: bring back dual versions This adds back dual CUDA versions for our releases, with v11 and v13 to cover a broad set of GPUs and driver versions. 
* win: break up native builds in build_windows.ps1 * v11 build working on windows and linux * switch to cuda v12.8 not JIT * Set CUDA compression to size * enhance manual install linux docs --- .github/workflows/release.yaml | 13 +++++- .github/workflows/test.yaml | 6 +-- CMakeLists.txt | 6 +-- CMakePresets.json | 26 ++++++++++++ Dockerfile | 30 ++++++++++++-- discover/cuda_common.go | 15 +++---- docs/linux.md | 3 +- scripts/build_windows.ps1 | 72 +++++++++++++++++++++++++++++++--- 8 files changed, 146 insertions(+), 25 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 40871e644..902fa9ccc 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -67,12 +67,21 @@ jobs: install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe cuda-version: '12.8' flags: '' + runner_dir: 'cuda_v12' + - os: windows + arch: amd64 + preset: 'CUDA 13' + install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe + cuda-version: '13.0' + flags: '' + runner_dir: 'cuda_v13' - os: windows arch: amd64 preset: 'ROCm 6' install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe rocm-version: '6.2' flags: '-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma" -DCMAKE_CXX_FLAGS="-parallel-jobs=4 -Wno-ignored-attributes -Wno-deprecated-pragma"' + runner_dir: '' runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }} environment: release env: @@ -138,7 +147,7 @@ jobs: run: | Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo' - cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} + cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} -DOLLAMA_RUNNER_DIR="${{ matrix.runner_dir }}" cmake --build --parallel --preset "${{ matrix.preset }}" cmake --install build --component "${{ startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || 'CPU' }}" --strip --parallel 8 env: @@ -232,7 +241,7 @@ jobs: case "$COMPONENT" in bin/ollama) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/cuda_sbsa) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; + lib/ollama/cuda_v*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;; lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;; lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;; diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4d8cf773c..a10ad37a9 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -46,7 +46,7 @@ jobs: include: - preset: CPU - preset: CUDA - container: nvidia/cuda:12.8.1-devel-ubuntu22.04 + container: nvidia/cuda:13.0.0-devel-ubuntu22.04 flags: '-DCMAKE_CUDA_ARCHITECTURES=87' - preset: ROCm container: rocm/dev-ubuntu-22.04:6.1.2 @@ -78,7 +78,7 @@ jobs: 
include: - preset: CPU - preset: CUDA - install: https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_571.96_windows.exe + install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe flags: '-DCMAKE_CUDA_ARCHITECTURES=80' - preset: ROCm install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe @@ -102,7 +102,7 @@ jobs: $ErrorActionPreference = "Stop" if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') { Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe" - Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_12.8", "nvcc_12.8", "cublas_12.8", "cublas_dev_12.8")) -NoNewWindow -Wait + Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_13.0", "nvcc_13.0", "cublas_13.0", "cublas_dev_13.0")) -NoNewWindow -Wait } $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path diff --git a/CMakeLists.txt b/CMakeLists.txt index d62c8f99f..7cce5e4b1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ set(GGML_LLAMAFILE ON) set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128) set(GGML_CUDA_GRAPHS ON) set(GGML_CUDA_FA ON) -set(GGML_CUDA_COMPRESSION_MODE default) +set(GGML_CUDA_COMPRESSION_MODE size) if((CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64|ARMv[0-9]+")) @@ -38,7 +38,7 @@ if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64") endif() set(OLLAMA_BUILD_DIR ${CMAKE_BINARY_DIR}/lib/ollama) -set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama) +set(OLLAMA_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib/ollama/${OLLAMA_RUNNER_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${OLLAMA_BUILD_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${OLLAMA_BUILD_DIR}) @@ -81,7 +81,7 @@ if(CMAKE_CUDA_COMPILER) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda) install(TARGETS ggml-cuda RUNTIME_DEPENDENCIES - DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_LIBRARY_DIR} + DIRECTORIES ${CUDAToolkit_BIN_DIR} ${CUDAToolkit_BIN_DIR}/x64 ${CUDAToolkit_LIBRARY_DIR} PRE_INCLUDE_REGEXES cublas cublasLt cudart PRE_EXCLUDE_REGEXES ".*" RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT CUDA diff --git a/CMakePresets.json b/CMakePresets.json index ab2cfe9d6..51190c719 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -18,6 +18,14 @@ "name": "CUDA", "inherits": [ "Default" ] }, + { + "name": "CUDA 11", + "inherits": [ "CUDA" ], + "cacheVariables": { + "CMAKE_CUDA_ARCHITECTURES": "50-virtual;60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual", + "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2" + } + }, { "name": "CUDA 12", "inherits": [ "CUDA" ], @@ -26,6 +34,14 @@ "CMAKE_CUDA_FLAGS": "-Wno-deprecated-gpu-targets -t 2" } }, + { + "name": "CUDA 13", + "inherits": [ "CUDA" ], + "cacheVariables": { + "CMAKE_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;87-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;110-virtual;120-virtual;121-virtual", + "CMAKE_CUDA_FLAGS": "-t 2" + } + }, { "name": "JetPack 5", "inherits": [ "CUDA" ], @@ -72,11 +88,21 @@ "configurePreset": "CUDA", "targets": [ "ggml-cuda" ] }, + { + "name": "CUDA 11", + "inherits": [ "CUDA" ], + "configurePreset": "CUDA 11" + }, { "name": "CUDA 12", "inherits": [ "CUDA" ], "configurePreset": "CUDA 12" }, + { + "name": "CUDA 13", + "inherits": [ "CUDA" 
], + "configurePreset": "CUDA 13" + }, { "name": "JetPack 5", "inherits": [ "CUDA" ], diff --git a/Dockerfile b/Dockerfile index 0dc3c1267..c84b52392 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,15 +39,35 @@ RUN --mount=type=cache,target=/root/.ccache \ && cmake --build --parallel --preset 'CPU' \ && cmake --install build --component CPU --strip --parallel 8 +FROM base AS cuda-11 +ARG CUDA11VERSION=11.8 +RUN dnf install -y cuda-toolkit-${CUDA11VERSION//./-} +ENV PATH=/usr/local/cuda-11/bin:$PATH +RUN --mount=type=cache,target=/root/.ccache \ + cmake --preset 'CUDA 11' -DOLLAMA_RUNNER_DIR="cuda_v11" \ + && cmake --build --parallel --preset 'CUDA 11' \ + && cmake --install build --component CUDA --strip --parallel 8 + FROM base AS cuda-12 ARG CUDA12VERSION=12.8 RUN dnf install -y cuda-toolkit-${CUDA12VERSION//./-} ENV PATH=/usr/local/cuda-12/bin:$PATH RUN --mount=type=cache,target=/root/.ccache \ - cmake --preset 'CUDA 12' \ + cmake --preset 'CUDA 12' -DOLLAMA_RUNNER_DIR="cuda_v12"\ && cmake --build --parallel --preset 'CUDA 12' \ && cmake --install build --component CUDA --strip --parallel 8 + +FROM base AS cuda-13 +ARG CUDA13VERSION=13.0 +RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-} +ENV PATH=/usr/local/cuda-13/bin:$PATH +RUN --mount=type=cache,target=/root/.ccache \ + cmake --preset 'CUDA 13' -DOLLAMA_RUNNER_DIR="cuda_v13" \ + && cmake --build --parallel --preset 'CUDA 13' \ + && cmake --install build --component CUDA --strip --parallel 8 + + FROM base AS rocm-6 ENV PATH=/opt/rocm/hcc/bin:/opt/rocm/hip/bin:/opt/rocm/bin:/opt/rocm/hcc/bin:$PATH RUN --mount=type=cache,target=/root/.ccache \ @@ -92,10 +112,14 @@ RUN --mount=type=cache,target=/root/.cache/go-build \ go build -trimpath -buildmode=pie -o /bin/ollama . FROM --platform=linux/amd64 scratch AS amd64 -COPY --from=cuda-12 dist/lib/ollama /lib/ollama +# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/ +COPY --from=cuda-12 dist/lib/ollama /lib/ollama/ +COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/ FROM --platform=linux/arm64 scratch AS arm64 -COPY --from=cuda-12 dist/lib/ollama /lib/ollama/cuda_sbsa +# COPY --from=cuda-11 dist/lib/ollama/ /lib/ollama/ +COPY --from=cuda-12 dist/lib/ollama /lib/ollama/ +COPY --from=cuda-13 dist/lib/ollama/ /lib/ollama/ COPY --from=jetpack-5 dist/lib/ollama /lib/ollama/cuda_jetpack5 COPY --from=jetpack-6 dist/lib/ollama /lib/ollama/cuda_jetpack6 diff --git a/discover/cuda_common.go b/discover/cuda_common.go index b539f6b32..ca008af63 100644 --- a/discover/cuda_common.go +++ b/discover/cuda_common.go @@ -43,14 +43,15 @@ func cudaVariant(gpuInfo CudaGPUInfo) string { } } } - return "sbsa" } - // driver 12.0 has problems with the cuda v12 library, so run v11 on those older drivers - if gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) { - // The detected driver is older than Feb 2023 - slog.Warn("old CUDA driver detected - please upgrade to a newer driver", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor)) - return "v11" + if gpuInfo.DriverMajor < 13 { + // The detected driver is older than 580 (Aug 2025) + // Warn if their CC is compatible with v13 and they should upgrade their driver to get better performance + if gpuInfo.computeMajor > 7 || (gpuInfo.computeMajor == 7 && gpuInfo.computeMinor >= 5) { + slog.Warn("old CUDA driver detected - please upgrade to a newer driver for best performance", "version", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor)) + } + return "v12" } - return "v12" + return "v13" } diff --git 
a/docs/linux.md b/docs/linux.md index 9a156d1dc..ce5ed860b 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -11,12 +11,13 @@ curl -fsSL https://ollama.com/install.sh | sh ## Manual install > [!NOTE] -> If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first. +> If you are upgrading from a prior version, you **MUST** remove the old libraries with `sudo rm -rf /usr/lib/ollama` first. Download and extract the package: ```shell curl -LO https://ollama.com/download/ollama-linux-amd64.tgz +sudo rm -rf /usr/lib/ollama sudo tar -C /usr -xzf ollama-linux-amd64.tgz ``` diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index 27f3eb9d4..37fe87961 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -78,7 +78,7 @@ function checkEnv() { } -function buildOllama() { +function buildCPU() { mkdir -Force -path "${script:DIST_DIR}\" if ($script:ARCH -ne "arm64") { Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}" @@ -90,20 +90,72 @@ function buildOllama() { if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} & cmake --install build --component CPU --strip if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + } +} +function buildCUDA11() { + # CUDA v11 claims to be compatible with MSVC 2022, but the latest updates are no longer compatible + # 19.40 is the last compiler version that works, but recent udpates are 19.43 + # So this pins to MSVC 2019 for best compatibility + mkdir -Force -path "${script:DIST_DIR}\" + if ($script:ARCH -ne "arm64") { $hashEnv = @{} Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value } - if ("$script:CUDA_DIRS".Contains("v12")) { - $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12")) { $v12="$_" }} - $env:CUDAToolkit_ROOT=$hashEnv[$v12] - write-host "Building CUDA v12 backend libraries" - & cmake --fresh --preset "CUDA 12" --install-prefix $script:DIST_DIR + if ("$script:CUDA_DIRS".Contains("v11")) { + $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V11")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }} + write-host "Building CUDA v11 backend libraries $cuda" + $env:CUDAToolkit_ROOT=$cuda + & cmake --fresh --preset "CUDA 11" -T cuda="$cuda" -DCMAKE_CUDA_COMPILER="$cuda\bin\nvcc.exe" -G "Visual Studio 16 2019" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v11" + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + & cmake --build --preset "CUDA 11" --config Release --parallel $script:JOBS + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + & cmake --install build --component "CUDA" --strip + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + } + } +} + +function buildCUDA12() { + mkdir -Force -path "${script:DIST_DIR}\" + if ($script:ARCH -ne "arm64") { + $hashEnv = @{} + Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value } + if ("$script:CUDA_DIRS".Contains("v12.8")) { + $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V12_8")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }} + write-host "Building CUDA v12 backend libraries $cuda" + $env:CUDAToolkit_ROOT=$cuda + & cmake --fresh --preset "CUDA 12" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v12" if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} & cmake --build --preset "CUDA 12" --config Release --parallel $script:JOBS if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} & cmake --install build --component "CUDA" --strip if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } + 
} +} + +function buildCUDA13() { + mkdir -Force -path "${script:DIST_DIR}\" + if ($script:ARCH -ne "arm64") { + $hashEnv = @{} + Get-ChildItem env: | foreach { $hashEnv[$_.Name] = $_.Value } + if ("$script:CUDA_DIRS".Contains("v13")) { + $hashEnv.Keys | foreach { if ($_.Contains("CUDA_PATH_V13")) { $x=$hashEnv[$_]; if (test-path -literalpath "$x\bin\nvcc.exe" ) { $cuda=$x} }} + $env:CUDAToolkit_ROOT=$cuda + write-host "Building CUDA v13 backend libraries $cuda" + & cmake --fresh --preset "CUDA 13" -T cuda="$cuda" --install-prefix $script:DIST_DIR -DOLLAMA_RUNNER_DIR="cuda_v13" + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + & cmake --build --preset "CUDA 13" --config Release --parallel $script:JOBS + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + & cmake --install build --component "CUDA" --strip + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + } + } +} + +function buildROCm() { + mkdir -Force -path "${script:DIST_DIR}\" + if ($script:ARCH -ne "arm64") { if ($env:HIP_PATH) { write-host "Building ROCm backend libraries" if (-Not (get-command -ErrorAction silent ninja)) { @@ -129,6 +181,10 @@ function buildOllama() { if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } } +} + +function buildOllama() { + mkdir -Force -path "${script:DIST_DIR}\" write-host "Building ollama CLI" & go build -trimpath -ldflags "-s -w -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" . if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} @@ -236,6 +292,10 @@ function distZip() { checkEnv try { if ($($args.count) -eq 0) { + buildCPU + buildCUDA12 + buildCUDA13 + buildROCm buildOllama buildApp gatherDependencies From 5198956372fb2098622fafe143b7026c8ce6ef2d Mon Sep 17 00:00:00 2001 From: "CarbonatedWater.org" Date: Wed, 10 Sep 2025 16:37:10 -0700 Subject: [PATCH 11/13] docs: add ollama-co2 to community integrations (#12230) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 481a29aea..5962f5b28 100644 --- a/README.md +++ b/README.md @@ -414,6 +414,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.) - [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models) - [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. 
Also undetectable to screenshare) +- [ollama-co2](https://github.com/carbonatedWaterOrg/ollama-co2) (FastAPI web interface for monitoring and managing local and remote Ollama servers with real-time model monitoring and concurrent downloads) ### Cloud From 71cb86af3e8b8006540550a8eeb9fed106b77eee Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 9 Sep 2025 10:37:28 -0700 Subject: [PATCH 12/13] llm: Remove unneeded warning with flash attention enabled If flash attention is enabled without KV cache quanitization, we will currently always get this warning: level=WARN source=server.go:226 msg="kv cache type not supported by model" type="" --- fs/ggml/ggml.go | 10 +++++++--- llm/memory.go | 2 +- llm/server.go | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 56ad420e5..57476a9a8 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -864,12 +864,16 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) { // SupportsKVCacheType checks if the requested cache type is supported func (f GGML) SupportsKVCacheType(cacheType string) bool { + if cacheType == "" || cacheType == "f16" { + return true + } + if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) { // gpt-oss uses attention with sinks which does not support quantized cache types - slog.Warn("model only supports non-quantized cache types ", "mode", arch) - return cacheType == "f16" + slog.Warn("model only supports non-quantized cache types", "model", arch) + return false } - return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType) + return slices.Contains([]string{"q8_0", "q4_0"}, cacheType) } // SupportsFlashAttention checks if the model supports flash attention diff --git a/llm/memory.go b/llm/memory.go index ce128eb58..7a87b28fe 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -202,7 +202,7 @@ func estimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin var kvct string if useFlashAttention { requested := strings.ToLower(envconfig.KvCacheType()) - if requested != "" && f.SupportsKVCacheType(requested) { + if f.SupportsKVCacheType(requested) { kvct = requested } } diff --git a/llm/server.go b/llm/server.go index 4740a1fd4..a22ae9722 100644 --- a/llm/server.go +++ b/llm/server.go @@ -221,7 +221,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a // Flash Attention also supports kv cache quantization // Enable if the requested and kv cache type is supported by the model - if kvct != "" && f.SupportsKVCacheType(kvct) { + if f.SupportsKVCacheType(kvct) { loadRequest.KvCacheType = kvct } else { slog.Warn("kv cache type not supported by model", "type", kvct) From 29ddfc2cab7f5a83a96c3133094f67b22e4f27d1 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 9 Sep 2025 10:48:34 -0700 Subject: [PATCH 13/13] ggml: Disable flash attention for gemma2 Our new engine implementation of gemma2 doesn't support flash attention, which means that it also doesn't support KV cache quantization. Currently, it is possible to turn these two on, which will result in a crash. 
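Patches 12 and 13 end up funneling through the same two capability checks, so it helps to see how a loader might compose them. The sketch below is illustrative only: `pickKVCacheType` is a hypothetical helper name, while `SupportsFlashAttention`, `SupportsKVCacheType`, and `envconfig.KvCacheType` are the functions touched by these patches; the real gating in `llm/server.go` and `llm/memory.go` also weighs GPU capabilities and other options.

```go
package example

import (
	"strings"

	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/fs/ggml"
)

// pickKVCacheType is a hypothetical helper: flash attention must be supported
// by the model (gemma2 now reports false), and a quantized KV cache type is
// only honored when the model accepts it. An empty result means "f16 default".
func pickKVCacheType(f *ggml.GGML, flashAttentionRequested bool) (useFlashAttention bool, kvCacheType string) {
	useFlashAttention = flashAttentionRequested && f.SupportsFlashAttention()
	if !useFlashAttention {
		return false, ""
	}
	requested := strings.ToLower(envconfig.KvCacheType())
	if f.SupportsKVCacheType(requested) {
		// "" and "f16" pass this check after the patch, so the default
		// (unquantized) configuration no longer logs a warning
		return true, requested
	}
	return true, ""
}
```

Because the empty string and "f16" are now accepted up front, the default configuration no longer trips the warning quoted above.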
--- fs/ggml/ggml.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 57476a9a8..6b582b499 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -883,6 +883,10 @@ func (f GGML) SupportsFlashAttention() bool { return false } + if arch := f.KV().Architecture(); slices.Contains([]string{"gemma2"}, arch) { + return false + } + // Check head counts match and are non-zero headCountK := f.KV().EmbeddingHeadCountK() headCountV := f.KV().EmbeddingHeadCountV()
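To close out, here is a small end-to-end illustration of the parser refactor from PATCH 09. The driver below is hypothetical: the chunked tool-call tokens are borrowed from the harmony tests, and nothing here is runner code. It shows the shape of the new API: the caller feeds raw tokens in, gets parsed content and thinking back, and drains accumulated tool calls once the stream ends.

```go
package example

import (
	"fmt"
	"log"

	"github.com/ollama/ollama/parser"
)

func main() {
	// In the runner, the parser type and prefill string come from the
	// completion request; here they are hard-coded for illustration.
	p := parser.NewTokenParser(parser.TokenParserTypeHarmony, "")

	tokens := []string{
		`<|channel|>commentary to=functions.calculate<|message|>{"expression":"2+`,
		`2"}`,
		`<|end|>`,
	}
	for _, tok := range tokens {
		content, thinking, err := p.AddContent(tok)
		if err != nil {
			// the repeat-limit guard now lives in the parser instead of the runner loop
			log.Fatal(err)
		}
		if content != "" || thinking != "" {
			fmt.Printf("content=%q thinking=%q\n", content, thinking)
		}
	}

	// at end of stream, any accumulated tool call is drained in one place
	for _, tc := range p.Drain() {
		fmt.Println("tool:", tc.Function.Name, "args:", tc.Function.Arguments.String())
	}
}
```

That is the payoff of the refactor: the runner loop no longer tracks harmony state, tool accumulation, or repeated tokens itself; it only calls `AddContent` per token and `Drain` at the end.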