Compare commits


12 Commits

Author SHA1 Message Date
Bruce MacDonald
f5c9eb5aa2 models: qwen3vl 2025-09-10 12:11:46 -07:00
Parth Sareen
20b53eaa72 tests: add tool calling integration test (#12232) 2025-09-09 14:01:11 -07:00
Daniel Hiltgen
6745182885 tests: reduce stress on CPU to 2 models (#12161)
* tests: reduce stress on CPU to 2 models

This should avoid flakes due to systems getting overloaded with 3 (or more) models running concurrently

* tests: allow slow systems to pass on timeout

If a slow system is still streaming a response, and the response
will pass validation, don't fail just because the system is slow.

* test: unload embedding models more quickly
2025-09-09 09:32:15 -07:00
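To sketch the "allow slow systems to pass" behavior in Go (hypothetical helper names, not the actual integration harness): a stall timer is reset on every streamed chunk and fails the run on a genuine hang, while hitting the outer deadline only fails if the output gathered so far does not already validate.

package main

import (
	"context"
	"fmt"
	"strings"
	"time"
)

// collectStream simulates a streaming generation by sending chunks on out.
func collectStream(out chan<- string) {
	for _, chunk := range []string{"The ", "sky ", "is ", "blue."} {
		time.Sleep(50 * time.Millisecond)
		out <- chunk
	}
	close(out)
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	stall := time.NewTimer(500 * time.Millisecond) // reset on every chunk; firing means a hang
	var buf strings.Builder
	out := make(chan string)
	go collectStream(out)

	// verify checks whether the accumulated response already contains an expected answer.
	verify := func() bool { return strings.Contains(strings.ToLower(buf.String()), "blue") }

	for {
		select {
		case chunk, ok := <-out:
			if !ok {
				fmt.Println("finished, valid:", verify())
				return
			}
			buf.WriteString(chunk)
			stall.Reset(500 * time.Millisecond)
		case <-stall.C:
			fmt.Println("stall detected, failing")
			return
		case <-ctx.Done():
			// Slow but still streaming: pass if what we have so far already validates.
			fmt.Println("deadline hit, valid so far:", verify())
			return
		}
	}
}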
Kashyap Tanuku
f810ec741c readme: add Clueless to community integrations (#12188) 2025-09-08 21:31:29 -07:00
Jesse Gross
e119783e66 llm: Clamp batch size to context size
The context must always be able to store the current batch, so
if the user requests a small context then we should also shrink
the batch to match. This also fixes the TestLongInputContext
test on the new engine. (The old engine already has this behavior.)
2025-09-08 20:40:11 -07:00
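A minimal Go sketch of the clamping invariant described above (hypothetical helper names, not the actual ollama functions): the batch is shrunk to fit the requested context, and the runner-side check rejects a KV cache whose per-slot share cannot hold one full batch.

package main

import "fmt"

// clampBatch shrinks the batch so it always fits in the requested context.
func clampBatch(numCtx, numBatch int) int {
	if numBatch > numCtx {
		return numCtx
	}
	return numBatch
}

// checkKVSize mirrors the runner-side check: each parallel slot's share of the
// KV cache must hold at least one full batch.
func checkKVSize(kvSize, numSlots, batchSize int) error {
	numCtx := kvSize / numSlots
	if numCtx < batchSize {
		return fmt.Errorf("kv size must be at least as large as batch size * parallel (kv: %d batch: %d parallel: %d)", kvSize, batchSize, numSlots)
	}
	return nil
}

func main() {
	numCtx, numBatch := 128, 512            // user requests a tiny context with a large batch
	numBatch = clampBatch(numCtx, numBatch) // batch shrinks to 128
	fmt.Println(numBatch, checkKVSize(numCtx, 1, numBatch))
}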
Parth Sareen
1a558f98e2 runner: move harmony to runner (#12052) 2025-09-08 15:07:59 -07:00
Gabe Goodhart
7b91c9ce51 Hybrid and recurrent memory estimates (#12186)
This PR updates the memory size estimate logic to better handle recurrent and hybrid-recurrent models, which are currently badly overestimated because the default logic assumes full attention for all layers.

The logic for the sizing of the recurrent layers comes from the llama.cpp implementation:

        ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size);
        ggml_tensor * s = ggml_new_tensor_1d(ctx, type_s, hparams.n_embd_s()*mem_size);

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2025-09-08 14:53:22 -07:00
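A worked Go sketch of that sizing, following the llama.cpp code quoted above (the hyperparameter values in main are illustrative, not taken from a particular model): the conv state contributes (d_conv - 1) * (d_inner + 2*n_groups*d_state) elements, the SSM state contributes d_state * d_inner elements, and both are stored as F32, so 4 bytes per element.

package main

import "fmt"

// recurrentLayerBytes estimates the per-layer recurrent cache size:
//   n_embd_r = (d_conv - 1) * (d_inner + 2*n_groups*d_state)   // conv state
//   n_embd_s = d_state * d_inner                               // ssm state
// Both are stored as F32 in the llama.cpp backend.
func recurrentLayerBytes(dConv, dInner, dState, nGroups uint64) uint64 {
	var nEmbdR uint64
	if dConv > 0 {
		nEmbdR = (dConv - 1) * (dInner + 2*nGroups*dState)
	}
	nEmbdS := dState * dInner
	const bytesPerElementF32 = 4
	return (nEmbdR + nEmbdS) * bytesPerElementF32
}

func main() {
	// Illustrative values: d_conv=4, d_inner=8192, d_state=128, n_groups=8
	b := recurrentLayerBytes(4, 8192, 128, 8)
	fmt.Printf("%d bytes (%.2f MiB) per recurrent layer\n", b, float64(b)/(1024*1024))
}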
Daniel Hiltgen
950d33aa30 docs: show how to debug nvidia init failures (#12216)
This debug setting can help troubleshoot obscure initialization failures.
2025-09-08 11:39:00 -07:00
Michael Yang
9714e38dd0 fix: nil pointer dereference if cache is nil (#12215) 2025-09-08 09:53:59 -07:00
frob
4378ae4ffa parser: don't check the file type of safetensors to prevent false negatives. (#12176)
* Don't check the file type of safetensor to prevent false negatives.

---------

Co-authored-by: Patrick Devine <patrick@infrahq.com>
2025-09-05 16:27:40 -07:00
Michael Yang
5994e8e8fd embedding gemma model (#12181)
* ollama: add embeddings
2025-09-04 09:09:07 -07:00
Michael Yang
b3e6120736 more logutil.Trace (#12177) 2025-09-03 17:24:39 -07:00
29 changed files with 730 additions and 254 deletions

View File

@@ -413,6 +413,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
- [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
- [Clueless](https://github.com/KashyapTan/clueless) (Open Source & Local Cluely: A desktop application LLM assistant to help you talk to anything on your screen using locally served Ollama models. Also undetectable to screenshare)
### Cloud

View File

@@ -92,6 +92,9 @@ If none of those resolve the problem, gather additional information and file an
- Set `CUDA_ERROR_LEVEL=50` and try again to get more diagnostic logs
- Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
You may get more details for initialization failures by enabling debug prints in the uvm driver. You should only use this temporarily while troubleshooting
- `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm uvm_debug_prints=1`
## AMD GPU Discovery

View File

@@ -57,10 +57,28 @@ func (kv KV) EmbeddingLength() uint64 {
return uint64(kv.Uint("embedding_length"))
}
func (kv KV) HeadCount() []uint64 {
headCountDefault := uint32(1)
headCount := kv.UintOrArrayValueAsArray("attention.head_count", headCountDefault)
if len(headCount) == 1 {
headCountDefault = headCount[0]
}
nLayers := int(kv.BlockCount())
if len(headCount) > nLayers {
slog.Warn("got more elements of attention.head_count than layers", "len(headCount)", len(headCount), "layers", nLayers)
}
out := make([]uint64, nLayers)
for i := range nLayers {
if i >= len(headCount) {
out[i] = uint64(headCountDefault)
} else {
out[i] = uint64(headCount[i])
}
}
return out
}
func (kv KV) HeadCountMax() uint64 {
// TODO(drifkin): using the max value can cause an overestimation. In the
// future if array values become more popular, we can adapt the more invasive
// <https://github.com/ollama/ollama/pull/10225>
return uint64(kv.UintOrMaxArrayValue("attention.head_count", 1))
}
@@ -68,6 +86,27 @@ func (kv KV) HeadCountMin() uint64 {
return uint64(kv.UintOrMinArrayValue("attention.head_count", 1))
}
func (kv KV) HeadCountKV() []uint64 {
headCountKVDefault := uint32(1)
headCountKV := kv.UintOrArrayValueAsArray("attention.head_count_kv", headCountKVDefault)
if len(headCountKV) == 1 {
headCountKVDefault = headCountKV[0]
}
nLayers := int(kv.BlockCount())
if len(headCountKV) > nLayers {
slog.Warn("got more elements of attention.head_count than layers", "len(headCountKV)", len(headCountKV), "layers", nLayers)
}
out := make([]uint64, nLayers)
for i := range nLayers {
if i >= len(headCountKV) {
out[i] = uint64(headCountKVDefault)
} else {
out[i] = uint64(headCountKV[i])
}
}
return out
}
func (kv KV) HeadCountKVMax() uint64 {
return uint64(kv.UintOrMaxArrayValue("attention.head_count_kv", 1))
}
@@ -100,6 +139,26 @@ func (kv KV) ChatTemplate() string {
return kv.String("tokenizer.chat_template")
}
// ssm architecture parameters
func (kv KV) SSMConvKernel() uint64 {
return uint64(kv.Uint("ssm.conv_kernel"))
}
func (kv KV) SSMInnerSize() uint64 {
return uint64(kv.Uint("ssm.inner_size"))
}
func (kv KV) SSMStateSize() uint64 {
return uint64(kv.Uint("ssm.state_size"))
}
func (kv KV) SSMGroupCount() uint64 {
return uint64(kv.Uint("ssm.group_count"))
}
// general types
func (kv KV) String(key string, defaultValue ...string) string {
val, _ := keyValue(kv, key, append(defaultValue, "")...)
return val
@@ -131,22 +190,27 @@ func (kv KV) UintOrMinArrayValue(key string, defaultValue uint32) uint32 {
}
func (kv KV) UintOrArrayValue(key string, defaultValue uint32) (uint32, uint32) {
arrVal := kv.UintOrArrayValueAsArray(key, defaultValue)
return slices.Min(arrVal), slices.Max(arrVal)
}
func (kv KV) UintOrArrayValueAsArray(key string, defaultValue uint32) []uint32 {
if u32, ok := keyValue(kv, key, uint32(0)); ok {
return u32, u32
return []uint32{u32}
} else if u32s, ok := keyValue(kv, key, &array[uint32]{}); ok {
min := slices.Min(u32s.values)
max := slices.Max(u32s.values)
return min, max
return u32s.values
} else if i32s, ok := keyValue(kv, key, &array[int32]{}); ok {
min := slices.Min(i32s.values)
max := slices.Max(i32s.values)
if min < 0 || max < 0 {
slog.Warn("array values are unexpectedly negative", "key", key, "min", min, "max", max)
dst := make([]uint32, len(i32s.values))
for i, v := range i32s.values {
if v < 0 {
slog.Warn("array values are unexpectedly negative", "key", key, "i", i, "v", v)
}
dst[i] = uint32(v)
}
return uint32(min), uint32(max)
return dst
}
return defaultValue, defaultValue
return []uint32{defaultValue}
}
func (kv KV) Strings(key string, defaultValue ...[]string) []string {
@@ -486,7 +550,9 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
embedding := f.KV().EmbeddingLength()
heads := f.KV().HeadCountMax()
headsArr := f.KV().HeadCount()
headsKV := f.KV().HeadCountKVMax()
headsKVArr := f.KV().HeadCountKV()
vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array[string]).size)
embeddingHeads := f.KV().EmbeddingHeadCountMax()
@@ -496,12 +562,51 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
layers := f.Tensors().GroupLayers()
bytesPerElement := kvCacheBytesPerElement(kvCacheType)
// Default for models unless special-cased below. These defaults mirror the
// cache usage in llama.cpp under the assumption that models without special
// cases below will use the llamarunner and caching will be handled by the
// llama.cpp layer.
//
// This also assumes that a layer without heads or headsKV set is recurrent
// which is usually the case. Some models (eg nemotronh) use "blocks" in
// place of layers where some are MLP blocks that don't have any cache.
// Models like this will need a special case below to be accurately
// estimated.
var kvTotal uint64
kv = make([]uint64, f.KV().BlockCount())
kvSizeAttn := uint64(0)
kvSizeRecurrent := uint64(0)
for i := range kv {
kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
headsL := headsArr[i]
headsKVL := headsKVArr[i]
if headsL > 0 && headsKVL > 0 {
// full attention layer
// NOTE: Assumes uniform values for all attn layers
kv[i] = uint64(float64(context*(embeddingHeadsK+embeddingHeadsV)*headsKVL) * bytesPerElement)
kvSizeAttn += kv[i]
} else {
// recurrent layer
ssmDConv := f.KV().SSMConvKernel()
ssmDState := f.KV().SSMStateSize()
ssmDInner := f.KV().SSMInnerSize()
ssmNGroups := f.KV().SSMGroupCount()
nEmbdR := uint64(0)
if ssmDConv > 0 {
nEmbdR = (ssmDConv - 1) * (ssmDInner + 2*ssmNGroups*ssmDState)
}
nEmbdS := ssmDState * ssmDInner
// recurrent always uses F32 in llama.cpp backend
// https://github.com/ggml-org/llama.cpp/blob/master/src/llama-model.cpp#L18644
bytesPerElementRecurrent := kvCacheBytesPerElement("f32")
kv[i] = (nEmbdR + nEmbdS) * uint64(bytesPerElementRecurrent)
kvSizeRecurrent += kv[i]
}
kvTotal += kv[i]
}
slog.Debug("default cache size estimate", "attention MiB", float32(kvSizeAttn)/(1024.*1024.), "attention bytes", kvSizeAttn, "recurrent MiB", float32(kvSizeRecurrent)/(1024.*1024.), "recurrent bytes", kvSizeRecurrent)
switch f.KV().Architecture() {
case "llama", "llama4":
@@ -794,6 +899,8 @@ func kvCacheBytesPerElement(cacheType string) float64 {
return 1 // 1/2 of fp16
case "q4_0":
return 0.5 // 1/4 of fp16
case "f32":
return 4 // f32 (default for recurrent)
default:
return 2 // f16 (default)
}

View File

@@ -1,14 +1,13 @@
package harmony
import (
"encoding/json"
"fmt"
"log/slog"
"maps"
"slices"
"strings"
"unicode"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/template"
)
@@ -48,13 +47,12 @@ func (s harmonyParserState) String() string {
}
type HarmonyParser struct {
state harmonyParserState
MessageStartTag string
MessageEndTag string
HeaderEndTag string
ConstrainAllowed bool
acc strings.Builder
lifetimeAcc strings.Builder
state harmonyParserState
MessageStartTag string
MessageEndTag string
HeaderEndTag string
acc strings.Builder
lifetimeAcc strings.Builder
}
type HarmonyEvent interface {
@@ -91,19 +89,28 @@ func (s *HarmonyParser) AddImplicitStart() {
s.acc.WriteString("<|start|>assistant")
}
// AddImplicitStartOrPrefill adds content or thinking to the accumulator else adds start tag
func (s *HarmonyParser) AddImplicitStartOrPrefill(prefillContentOrThinking *bool) {
if prefillContentOrThinking != nil {
if *prefillContentOrThinking {
s.acc.WriteString("<|start|>assistant<|channel|>final<|message|>")
return
} else {
s.acc.WriteString("<|start|>assistant<|channel|>analysis<|message|>")
return
}
func Prefill(lastMessage api.Message) string {
if lastMessage.Role != "assistant" {
return ""
}
s.AddImplicitStart()
switch {
case strings.TrimSpace(lastMessage.Content) != "":
return "<|start|>assistant<|channel|>final<|message|>"
case strings.TrimSpace(lastMessage.Thinking) != "":
return "<|start|>assistant<|channel|>analysis<|message|>"
default:
return ""
}
}
// AddImplicitStartOrPrefill adds an implicit start tag or prefill string if provided
func (s *HarmonyParser) AddImplicitStartOrPrefill(prefillString string) {
if strings.TrimSpace(prefillString) != "" {
s.acc.WriteString(prefillString)
} else {
s.AddImplicitStart()
}
}
func (s *HarmonyParser) AddContent(content string) []HarmonyEvent {
@@ -329,7 +336,6 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo
}
case "final":
h.state = harmonyMessageState_Normal
h.HarmonyParser.ConstrainAllowed = true
}
case HarmonyEventContentEmitted:
logutil.Trace("harmony event content", "content", event.Content, "state", h.state)
@@ -395,38 +401,6 @@ type FunctionNameMap struct {
harmonyToUser map[string]string
}
func (m FunctionNameMap) MarshalJSON() ([]byte, error) {
// necessary to avoid exposing map internals
type alias struct {
UserToHarmony map[string]string `json:"userToHarmony"`
HarmonyToUser map[string]string `json:"harmonyToUser"`
}
return json.Marshal(alias{
UserToHarmony: m.userToHarmony,
HarmonyToUser: m.harmonyToUser,
})
}
func (m *FunctionNameMap) UnmarshalJSON(b []byte) error {
type alias struct {
UserToHarmony map[string]string `json:"userToHarmony"`
HarmonyToUser map[string]string `json:"harmonyToUser"`
}
var a alias
if err := json.Unmarshal(b, &a); err != nil {
return err
}
if m.userToHarmony == nil {
m.userToHarmony = make(map[string]string)
}
if m.harmonyToUser == nil {
m.harmonyToUser = make(map[string]string)
}
maps.Copy(m.userToHarmony, a.UserToHarmony)
maps.Copy(m.harmonyToUser, a.HarmonyToUser)
return nil
}
func NewFunctionNameMap() *FunctionNameMap {
return &FunctionNameMap{
userToHarmony: make(map[string]string),

View File

@@ -1,7 +1,6 @@
package harmony
import (
"encoding/json"
"fmt"
"reflect"
"strings"
@@ -736,25 +735,3 @@ func TestHarmonyMessageHandlerStreamingScenarios(t *testing.T) {
}
})
}
func TestFunctionNameMapJSONRoundTrip(t *testing.T) {
m := NewFunctionNameMap()
gotConverted := m.ConvertAndAdd("get weather")
if gotConverted == "" {
t.Fatal("conversion returned empty")
}
b, err := json.Marshal(m)
if err != nil {
t.Fatalf("marshal: %v", err)
}
var m2 FunctionNameMap
if err := json.Unmarshal(b, &m2); err != nil {
t.Fatalf("unmarshal: %v", err)
}
if m2.userToHarmony["get weather"] != gotConverted {
t.Fatalf("userToHarmony lost: got %q want %q", m2.userToHarmony["get weather"], gotConverted)
}
if m2.harmonyToUser[gotConverted] != "get weather" {
t.Fatalf("harmonyToUser lost: got %q want %q", m2.harmonyToUser[gotConverted], "get weather")
}
}

View File

@@ -410,3 +410,99 @@ func TestAPIEmbeddings(t *testing.T) {
t.Errorf("zero length embedding response")
}
}
func TestAPIToolCalling(t *testing.T) {
initialTimeout := 60 * time.Second
streamTimeout := 30 * time.Second
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
modelName := "qwen3:0.6b"
if err := PullIfMissing(ctx, client, modelName); err != nil {
t.Fatalf("pull failed %s", err)
}
tools := []api.Tool{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
Description: "Get the current weather in a given location",
Parameters: api.ToolFunctionParameters{
Type: "object",
Required: []string{"location"},
Properties: map[string]api.ToolProperty{
"location": {
Type: api.PropertyType{"string"},
Description: "The city and state, e.g. San Francisco, CA",
},
},
},
},
},
}
req := api.ChatRequest{
Model: modelName,
Messages: []api.Message{
{
Role: "user",
Content: "Call get_weather with location set to San Francisco.",
},
},
Tools: tools,
Options: map[string]any{
"temperature": 0,
},
}
stallTimer := time.NewTimer(initialTimeout)
var gotToolCall bool
var lastToolCall api.ToolCall
fn := func(response api.ChatResponse) error {
if len(response.Message.ToolCalls) > 0 {
gotToolCall = true
lastToolCall = response.Message.ToolCalls[len(response.Message.ToolCalls)-1]
}
if !stallTimer.Reset(streamTimeout) {
return fmt.Errorf("stall was detected while streaming response, aborting")
}
return nil
}
stream := true
req.Stream = &stream
done := make(chan int)
var genErr error
go func() {
genErr = client.Chat(ctx, &req, fn)
done <- 0
}()
select {
case <-stallTimer.C:
t.Errorf("tool-calling chat never started. Timed out after: %s", initialTimeout.String())
case <-done:
if genErr != nil {
t.Fatalf("chat failed: %v", genErr)
}
if !gotToolCall {
t.Fatalf("expected at least one tool call, got none")
}
if lastToolCall.Function.Name != "get_weather" {
t.Errorf("unexpected tool called: got %q want %q", lastToolCall.Function.Name, "get_weather")
}
if _, ok := lastToolCall.Function.Arguments["location"]; !ok {
t.Errorf("expected tool arguments to include 'location', got: %s", lastToolCall.Function.Arguments.String())
}
case <-ctx.Done():
t.Error("outer test context done while waiting for tool-calling chat")
}
}

View File

@@ -121,6 +121,7 @@ func TestMultiModelStress(t *testing.T) {
// The intent is to go 1 over what can fit so we force the scheduler to thrash
targetLoadCount := 0
slog.Info("Loading models to find how many can fit in VRAM before overflowing")
chooseModels:
for i, model := range chosenModels {
req := &api.GenerateRequest{Model: model}
slog.Info("loading", "model", model)
@@ -142,6 +143,13 @@ func TestMultiModelStress(t *testing.T) {
slog.Info("found model load capacity", "target", targetLoadCount, "current", loaded, "chosen", chosenModels[:targetLoadCount])
break
}
// Effectively limit model count to 2 on CPU only systems to avoid thrashing and timeouts
for _, m := range models.Models {
if m.SizeVRAM == 0 {
slog.Info("model running on CPU", "name", m.Name, "target", targetLoadCount, "chosen", chosenModels[:targetLoadCount])
break chooseModels
}
}
}
}
if targetLoadCount == len(chosenModels) {

View File

@@ -36,7 +36,7 @@ func TestLongInputContext(t *testing.T) {
if err := PullIfMissing(ctx, client, req.Model); err != nil {
t.Fatalf("PullIfMissing failed: %v", err)
}
DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
DoGenerate(ctx, t, client, req, []string{"russia", "germany", "france", "england", "austria", "prussia", "europe", "individuals", "coalition", "conflict"}, 120*time.Second, 10*time.Second)
}
func TestContextExhaustion(t *testing.T) {

View File

@@ -38,8 +38,9 @@ func TestAllMiniLMEmbeddings(t *testing.T) {
defer cleanup()
req := api.EmbeddingRequest{
Model: "all-minilm",
Prompt: "why is the sky blue?",
Model: "all-minilm",
Prompt: "why is the sky blue?",
KeepAlive: &api.Duration{Duration: 10 * time.Second},
}
res, err := embeddingTestHelper(ctx, client, t, req)

View File

@@ -502,6 +502,22 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
done <- 0
}()
var response string
verify := func() {
// Verify the response contains the expected data
response = buf.String()
atLeastOne := false
for _, resp := range anyResp {
if strings.Contains(strings.ToLower(response), resp) {
atLeastOne = true
break
}
}
if !atLeastOne {
t.Fatalf("%s: none of %v found in %s", genReq.Model, anyResp, response)
}
}
select {
case <-stallTimer.C:
if buf.Len() == 0 {
@@ -517,21 +533,14 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
if genErr != nil {
t.Fatalf("%s failed with %s request prompt %s", genErr, genReq.Model, genReq.Prompt)
}
// Verify the response contains the expected data
response := buf.String()
atLeastOne := false
for _, resp := range anyResp {
if strings.Contains(strings.ToLower(response), resp) {
atLeastOne = true
break
}
}
if !atLeastOne {
t.Fatalf("%s: none of %v found in %s", genReq.Model, anyResp, response)
}
verify()
slog.Info("test pass", "model", genReq.Model, "prompt", genReq.Prompt, "contains", anyResp, "response", response)
case <-ctx.Done():
t.Error("outer test context done while waiting for generate")
// On slow systems, we might timeout before some models finish rambling, so check what we have so far to see
// if it's considered a pass - the stallTimer will detect hangs, but we want to consider slow systems a pass
// if they are still generating valid responses
slog.Warn("outer test context done while waiting for generate")
verify()
}
return context
}
@@ -599,6 +608,22 @@ func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatR
done <- 0
}()
var response string
verify := func() {
// Verify the response contains the expected data
response = buf.String()
atLeastOne := false
for _, resp := range anyResp {
if strings.Contains(strings.ToLower(response), resp) {
atLeastOne = true
break
}
}
if !atLeastOne {
t.Fatalf("%s: none of %v found in \"%s\" -- request was:%v", req.Model, anyResp, response, req.Messages)
}
}
select {
case <-stallTimer.C:
if buf.Len() == 0 {
@@ -614,23 +639,14 @@ func DoChat(ctx context.Context, t *testing.T, client *api.Client, req api.ChatR
if genErr != nil {
t.Fatalf("%s failed with %s request prompt %v", genErr, req.Model, req.Messages)
}
// Verify the response contains the expected data
response := buf.String()
atLeastOne := false
for _, resp := range anyResp {
if strings.Contains(strings.ToLower(response), resp) {
atLeastOne = true
break
}
}
if !atLeastOne {
t.Fatalf("%s: none of %v found in \"%s\" -- request was:%v", req.Model, anyResp, response, req.Messages)
}
verify()
slog.Info("test pass", "model", req.Model, "messages", req.Messages, "contains", anyResp, "response", response)
case <-ctx.Done():
t.Error("outer test context done while waiting for generate")
// On slow systems, we might timeout before some models finish rambling, so check what we have so far to see
// if it's considered a pass - the stallTimer will detect hangs, but we want to consider slow systems a pass
// if they are still generating valid responses
slog.Warn("outer test context done while waiting for chat")
verify()
}
return &api.Message{Role: role, Content: buf.String()}
}

View File

@@ -173,6 +173,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
opts.NumCtx = int(trainCtx)
}
opts.NumBatch = min(opts.NumBatch, opts.NumCtx)
loadRequest := LoadRequest{LoraPath: adapters, KvSize: opts.NumCtx * numParallel, BatchSize: opts.NumBatch, Parallel: numParallel, MultiUserCache: envconfig.MultiUserCache()}
defaultThreads := discover.GetSystemInfo().GetOptimalThreadCount()
@@ -1347,9 +1349,9 @@ type CompletionRequest struct {
Images []ImageData
Options *api.Options
Grammar string // set before sending the request to the subprocess
UseHarmony bool
PrefillContent *bool
Grammar string // set before sending the request to the subprocess
UseHarmony bool
PrefillString string
}
// DoneReason represents the reason why a completion response is done
@@ -1362,6 +1364,8 @@ const (
DoneReasonLength
// DoneReasonConnectionClosed indicates the completion stopped due to the connection being closed
DoneReasonConnectionClosed
// DoneReasonTokenRepeatLimit indicates the completion stopped due to a token repeat limit
DoneReasonTokenRepeatLimit
)
func (d DoneReason) String() string {
@@ -1370,6 +1374,8 @@ func (d DoneReason) String() string {
return "length"
case DoneReasonStop:
return "stop"
case DoneReasonTokenRepeatLimit:
return "token_repeat_limit"
default:
return "" // closed
}
@@ -1502,7 +1508,8 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
return fmt.Errorf("error unmarshalling llm prediction response: %v", err)
}
switch {
case lastToken != "" && (strings.TrimSpace(c.Content) == lastToken || strings.TrimSpace(c.Thinking) == lastToken):
// TODO(parthsareen): token repeat limit is now handled in the runner, this currently supports legacy models and can be removed in the future
case strings.TrimSpace(c.Content) == lastToken && c.Content != "":
tokenRepeat++
default:
lastToken = strings.TrimSpace(c.Content)

View File

@@ -201,12 +201,11 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
}
}
logutil.Trace("encoded", "string", s, "ids", ids)
if addSpecial && len(ids) > 0 {
ids = bpe.vocab.addSpecials(ids)
}
logutil.Trace("encoded", "string", s, "ids", ids)
return ids, nil
}

View File

@@ -5,6 +5,7 @@ import (
"fmt"
_ "image/jpeg"
_ "image/png"
"math"
"os"
"reflect"
"strconv"
@@ -103,6 +104,10 @@ func New(modelPath string, params ml.BackendParams) (Model, error) {
}
arch := b.Config().Architecture()
if b.Config().Uint("pooling_type", math.MaxUint32) != math.MaxUint32 {
arch = arch + "_embed"
}
f, ok := models[arch]
if !ok {
return nil, fmt.Errorf("unsupported model architecture %q", arch)

View File

@@ -0,0 +1,73 @@
package gemma3
import (
"errors"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
type embedModel struct {
model.Base
model.SentencePieceModel
*TextModel
PoolingType uint32
Dense [2]*nn.Linear `gguf:"dense"`
}
func (m *embedModel) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
batch.Outputs = batch.Positions // return all positions
hiddenStates := m.TextModel.Forward(ctx, batch, m.Cache)
switch m.PoolingType {
case 0: // None
case 1: // Mean
hiddenStates = hiddenStates.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx).Mean(ctx)
hiddenStates = hiddenStates.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
default:
return nil, errors.New("unsupported pooling type")
}
for _, dense := range m.Dense {
hiddenStates = dense.Forward(ctx, hiddenStates)
}
return hiddenStates, nil
}
func newEmbedModel(c fs.Config) (model.Model, error) {
m := &embedModel{
SentencePieceModel: model.NewSentencePieceModel(
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Scores: c.Floats("tokenizer.ggml.scores"),
Types: c.Ints("tokenizer.ggml.token_type"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{
int32(c.Uint("tokenizer.ggml.eos_token_id")),
int32(c.Uint("tokenizer.ggml.eot_token_id", 106)),
},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
TextModel: newTextModel(c),
PoolingType: c.Uint("pooling_type", 0),
}
m.Cache = kvcache.NewWrapperCache(
kvcache.NewSWACache(int32(c.Uint("attention.sliding_window")), m.Shift),
kvcache.NewCausalCache(m.Shift),
)
return m, nil
}

View File

@@ -141,12 +141,11 @@ func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
hiddenStates := m.TextModel.Forward(ctx, batch, m.Cache)
return m.Output.Forward(ctx, hiddenStates), nil
}
func init() {
model.Register("gemma3", New)
model.Register("gemma3_embed", newEmbedModel)
}

View File

@@ -159,8 +159,11 @@ func (l *TextLayer) Forward(ctx ml.Context, layer int, hiddenState, positionIDs,
return hiddenState.Add(ctx, residual)
}
func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor, batch input.Batch, cache kvcache.Cache) ml.Tensor {
hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
func (m *TextModel) Forward(ctx ml.Context, batch input.Batch, cache kvcache.Cache) ml.Tensor {
positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.TextConfig.hiddenSize)))
// set image embeddings
@@ -198,5 +201,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
}
hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
return m.Output.Forward(ctx, hiddenState)
return hiddenState
}

View File

@@ -12,4 +12,5 @@ import (
_ "github.com/ollama/ollama/model/models/qwen2"
_ "github.com/ollama/ollama/model/models/qwen25vl"
_ "github.com/ollama/ollama/model/models/qwen3"
_ "github.com/ollama/ollama/model/models/qwen3vl"
)

View File

@@ -44,8 +44,8 @@ func New(c fs.Config) (model.Model, error) {
},
),
TextModel: NewTextModel(c),
VisionModel: newVisionModel(c),
ImageProcessor: newImageProcessor(c),
VisionModel: NewVisionModel(c),
ImageProcessor: NewImageProcessor(c),
}
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
@@ -65,8 +65,8 @@ func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *
}
// Calculate tensor dimensions
patchDim := m.ImageProcessor.numChannels * m.ImageProcessor.temporalPatchSize *
m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
patchDim := m.ImageProcessor.NumChannels * m.ImageProcessor.TemporalPatchSize *
m.ImageProcessor.PatchSize * m.ImageProcessor.PatchSize
numPatches := grid.Temporal * grid.Height * grid.Width
pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)

View File

@@ -345,8 +345,8 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
return positionalEmbedding
}
// newVisionModel creates a new instance of the Qwen vision model
func newVisionModel(c fs.Config) *VisionModel {
// NewVisionModel creates a new instance of the Qwen vision model
func NewVisionModel(c fs.Config) *VisionModel {
patchSize := int(c.Uint("vision.patch_size", 14))
hiddenSize := int(c.Uint("vision.embedding_length", 1280))
numHeads := int(c.Uint("vision.attention.head_count", 16))

View File

@@ -11,40 +11,40 @@ import (
// ImageProcessor contains configuration for the Qwen 2.5 VL image processing
type ImageProcessor struct {
numChannels int
patchSize int
temporalPatchSize int
mergeSize int
minPixels int
maxPixels int
factor int
rescaleFactor float32
imageMean []float32
imageStd []float32
NumChannels int
PatchSize int
TemporalPatchSize int
MergeSize int
MinPixels int
MaxPixels int
Factor int
RescaleFactor float32
ImageMean []float32
ImageStd []float32
}
// newImageProcessor creates a new image processor with default values
func newImageProcessor(c fs.Config) ImageProcessor {
func NewImageProcessor(c fs.Config) ImageProcessor {
patchSize := int(c.Uint("vision.patch_size", 14))
mergeSize := int(c.Uint("vision.spatial_merge_size", 2))
return ImageProcessor{
numChannels: int(c.Uint("vision.num_channels", 3)), // not set
patchSize: patchSize,
temporalPatchSize: 2,
mergeSize: mergeSize,
minPixels: 56 * 56,
maxPixels: int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
factor: patchSize * mergeSize,
rescaleFactor: 1.0 / 255.0,
imageMean: imageproc.ClipDefaultMean[:],
imageStd: imageproc.ClipDefaultSTD[:],
NumChannels: int(c.Uint("vision.num_channels", 3)), // not set
PatchSize: patchSize,
TemporalPatchSize: 2,
MergeSize: mergeSize,
MinPixels: 56 * 56,
MaxPixels: int(c.Uint("vision.max_pixels", 28*28*1280)), // 1MP limit
Factor: patchSize * mergeSize,
RescaleFactor: 1.0 / 255.0,
ImageMean: imageproc.ClipDefaultMean[:],
ImageStd: imageproc.ClipDefaultSTD[:],
}
}
// SmartResize implements the smart resize algorithm
func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
factor := p.factor
factor := p.Factor
if height < factor || width < factor {
panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
@@ -57,13 +57,13 @@ func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
hBar := round(float64(height)/float64(factor)) * factor
wBar := round(float64(width)/float64(factor)) * factor
if hBar*wBar > p.maxPixels {
beta := math.Sqrt(float64(height*width) / float64(p.maxPixels))
if hBar*wBar > p.MaxPixels {
beta := math.Sqrt(float64(height*width) / float64(p.MaxPixels))
hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
} else if hBar*wBar < p.minPixels {
beta := math.Sqrt(float64(p.minPixels) / float64(height*width))
} else if hBar*wBar < p.MinPixels {
beta := math.Sqrt(float64(p.MinPixels) / float64(height*width))
hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
@@ -90,16 +90,16 @@ func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error)
normalizedPixels := imageproc.Normalize(
resizedImg,
[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
[3]float32{p.ImageMean[0], p.ImageMean[1], p.ImageMean[2]},
[3]float32{p.ImageStd[0], p.ImageStd[1], p.ImageStd[2]},
true, // rescale
true, // channelFirst
)
// Calculate grid dimensions
grid := &Grid{
Height: resizedHeight / p.patchSize,
Width: resizedWidth / p.patchSize,
Height: resizedHeight / p.PatchSize,
Width: resizedWidth / p.PatchSize,
Temporal: 1, // For single images, temporal dimension is 1
}
@@ -113,10 +113,10 @@ func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error)
}
func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
channels := p.numChannels
patchSize := p.patchSize
mergeSize := p.mergeSize
temporalPatchSize := p.temporalPatchSize
channels := p.NumChannels
patchSize := p.PatchSize
mergeSize := p.MergeSize
temporalPatchSize := p.TemporalPatchSize
// Calculate output dimensions
numPatches := grid.Temporal * grid.Height * grid.Width

View File

@@ -0,0 +1,153 @@
package qwen3vl
import (
"bytes"
"fmt"
"image"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/model/models/qwen25vl"
"github.com/ollama/ollama/model/models/qwen3"
)
type Model struct {
model.Base
model.BytePairEncoding
TextModel *qwen3.Model
*qwen25vl.VisionModel
qwen25vl.ImageProcessor
}
var _ model.MultimodalProcessor = (*Model)(nil)
func New(c fs.Config) (model.Model, error) {
textModel, err := qwen3.New(c)
if err != nil {
return nil, err
}
m := &Model{
BytePairEncoding: model.NewBytePairEncoding(
c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
),
TextModel: textModel.(*qwen3.Model),
VisionModel: qwen25vl.NewVisionModel(c),
ImageProcessor: qwen25vl.NewImageProcessor(c),
}
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
return m, nil
}
func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *qwen25vl.Grid, error) {
image, _, err := image.Decode(bytes.NewReader(multimodalData))
if err != nil {
return nil, nil, err
}
f32s, grid, err := m.ImageProcessor.ProcessImage(image)
if err != nil {
return nil, nil, err
}
// Calculate tensor dimensions
patchDim := m.ImageProcessor.NumChannels * m.ImageProcessor.TemporalPatchSize *
m.ImageProcessor.PatchSize * m.ImageProcessor.PatchSize
numPatches := grid.Temporal * grid.Height * grid.Width
pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
return pixelValues, grid, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
pixels, grid, err := m.PixelValues(ctx, multimodalData)
if err != nil {
return nil, err
}
visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
return []input.Multimodal{{Tensor: visionOutputs}}, nil
}
// PostTokenize arranges Qwen-3-VL's inputs for the forward pass
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
var result []*input.Input
var (
imageToken int32 = 151655
visionStartToken int32 = 151652
visionEndToken int32 = 151653
)
nImg := 0
for _, inp := range inputs {
if inp.Multimodal == nil {
// If not a multimodal input, add it to the result unchanged
result = append(result, inp)
} else {
// Adding the 'Picture' prefix is a hack, at the time of writing there is no way to prefix
// the image tokens with a prompt, so we add a prefix here
nImg++
pre, err := m.Encode(fmt.Sprintf(" Picture %d: ", nImg), true)
if err != nil {
return nil, fmt.Errorf("failed to encode image prompt: %w", err)
}
for i := range pre {
result = append(result, &input.Input{Token: pre[i]})
}
patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
// First add the vision start token
result = append(result, &input.Input{Token: visionStartToken})
// Add the image token with the multimodal tensor data at the first position
result = append(result, &input.Input{
Token: imageToken,
Multimodal: inp.Multimodal,
MultimodalHash: inp.MultimodalHash,
SameBatch: patchesPerChunk,
})
// Add the placeholder tokens for the remaining positions (tokensPerGrid-1)
result = append(result, slices.Repeat([]*input.Input{{Token: imageToken}}, patchesPerChunk-1)...)
result = append(result, &input.Input{Token: visionEndToken})
}
}
return result, nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
return m.TextModel.Forward(ctx, batch)
}
func init() {
model.Register("qwen3vl", New)
}

View File

@@ -181,12 +181,11 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error)
}
}
logutil.Trace("encoded", "string", s, "ids", ids)
if addSpecial && len(ids) > 0 {
ids = spm.vocab.addSpecials(ids)
}
logutil.Trace("encoded", "string", s, "ids", ids)
return ids, nil
}

View File

@@ -49,7 +49,7 @@ func (v *Vocabulary) addSpecials(ids []int32) []int32 {
slog.Warn("adding bos token to prompt which already has it", "id", v.BOS)
}
slog.Debug("adding bos token to prompt", "id", v.BOS)
slog.Debug("adding bos token to prompt", "id", v.BOS[0])
ids = append([]int32{v.BOS[0]}, ids...)
}
@@ -58,7 +58,7 @@ func (v *Vocabulary) addSpecials(ids []int32) []int32 {
slog.Warn("adding eos token to prompt which already has it", "id", v.EOS)
}
slog.Debug("adding eos token to prompt", "id", v.EOS)
slog.Debug("adding eos token to prompt", "id", v.EOS[0])
ids = append(ids, v.EOS[0])
}

View File

@@ -246,7 +246,7 @@ func filesForModel(path string) ([]string, error) {
for _, match := range matches {
if ct, err := detectContentType(match); err != nil {
return nil, err
} else if ct != contentType {
} else if len(contentType) > 0 && ct != contentType {
return nil, fmt.Errorf("invalid content type: expected %s for %s", ct, match)
}
}
@@ -255,7 +255,8 @@ func filesForModel(path string) ([]string, error) {
}
var files []string
if st, _ := glob(filepath.Join(path, "*.safetensors"), "application/octet-stream"); len(st) > 0 {
// some safetensors files do not properly match "application/octet-stream", so skip checking their contentType
if st, _ := glob(filepath.Join(path, "*.safetensors"), ""); len(st) > 0 {
// safetensors files might be unresolved git lfs references; skip if they are
// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
files = append(files, st...)

View File

@@ -34,8 +34,8 @@ type InputCache struct {
func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, batchSize int, multiUserCache bool) (*InputCache, error) {
numCtx := kvSize / int32(numSlots)
if numCtx < 1 {
return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
if int(numCtx) < batchSize {
return nil, fmt.Errorf("kv size must be at least as large as batch size * parallel (kv: %v batch: %v parallel: %v)", kvSize, batchSize, numSlots)
}
slots := make([]InputCacheSlot, numSlots)
@@ -70,11 +70,9 @@ func kvCacheTypeFromStr(s string) ml.DType {
}
func (c *InputCache) Close() {
if c == nil {
return
if c != nil && c.cache != nil {
c.cache.Close()
}
c.cache.Close()
}
// Locking: Operations on InputCacheSlot (including finding one
@@ -95,7 +93,7 @@ type InputCacheSlot struct {
lastUsed time.Time
}
func (c *InputCache) LoadCacheSlot(prompt []*input.Input) (*InputCacheSlot, []*input.Input, error) {
func (c *InputCache) LoadCacheSlot(prompt []*input.Input, cachePrompt bool) (*InputCacheSlot, []*input.Input, error) {
var slot *InputCacheSlot
var numPast int32
var err error
@@ -113,6 +111,10 @@ func (c *InputCache) LoadCacheSlot(prompt []*input.Input) (*InputCacheSlot, []*i
return nil, nil, err
}
if !cachePrompt {
numPast = 0
}
slot.InUse = true
slot.lastUsed = time.Now()

View File

@@ -393,7 +393,7 @@ func TestLoadCacheSlot(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
slot, remainingPrompt, err := tt.cache.LoadCacheSlot(tt.prompt)
slot, remainingPrompt, err := tt.cache.LoadCacheSlot(tt.prompt, true)
// Check error state
if (err != nil) != tt.wantErr {

View File

@@ -11,6 +11,7 @@ import (
"image"
"log"
"log/slog"
"math"
"net"
"net/http"
"os"
@@ -406,6 +407,8 @@ func (s *Server) removeSequence(seqIndex int, reason llm.DoneReason) {
func (s *Server) run(ctx context.Context) {
s.ready.Wait()
supportsAsync := s.model.Backend().Config().Uint("pooling_type", math.MaxUint32) == math.MaxUint32
var activeBatch batchState
for {
select {
@@ -419,7 +422,12 @@ func (s *Server) run(ctx context.Context) {
if err != nil {
panic(err)
}
go s.computeBatch(activeBatch)
if supportsAsync {
go s.computeBatch(activeBatch)
} else {
s.computeBatch(activeBatch)
}
}
}
}
@@ -430,12 +438,12 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er
// before setting up the next batch so the seqs inputs are ready to receive their
// token values and we get the correct input pointers for the batchInputs
if pendingBatch.ctx != nil {
slog.Log(context.TODO(), logutil.LevelTrace, "forwardBatch waiting for compute to start", "pendingBatch.id", pendingBatch.id)
logutil.Trace("forwardBatch waiting for compute to start", "pendingBatch.id", pendingBatch.id)
<-pendingBatch.computeStartedCh
slog.Log(context.TODO(), logutil.LevelTrace, "forwardBatch compute started, setting up next batch", "pendingBatch.id", pendingBatch.id, "id", s.batchID)
logutil.Trace("forwardBatch compute started, setting up next batch", "pendingBatch.id", pendingBatch.id, "id", s.batchID)
nextBatch.inputsReadyCh = pendingBatch.outputsReadyCh // Chain the outputs from the pending batch to the next inputs batch
} else {
slog.Log(context.TODO(), logutil.LevelTrace, "forwardBatch no pending batch detected", "batchID", s.batchID)
logutil.Trace("forwardBatch no pending batch detected", "batchID", s.batchID)
// No pendingBatch, so the inputs will be ready in the seqs immediately
nextBatch.inputsReadyCh = make(chan struct{}, 1)
nextBatch.inputsReadyCh <- struct{}{}
@@ -547,7 +555,7 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er
if i+1 == len(seq.inputs) {
batch.Outputs = append(batch.Outputs, int32(len(batchInputs)-1))
}
slog.Log(context.TODO(), logutil.LevelTrace, "forwardBatch iBatch", "batchID", s.batchID, "seqIdx", seqIdx, "seq.iBatch", seq.iBatch, "i+1", i+1, "len(seq.inputs)", len(seq.inputs))
logutil.Trace("forwardBatch iBatch", "batchID", s.batchID, "seqIdx", seqIdx, "seq.iBatch", seq.iBatch, "i+1", i+1, "len(seq.inputs)", len(seq.inputs))
seq.pendingInputs = append(seq.pendingInputs, inp)
}
@@ -561,7 +569,7 @@ func (s *Server) forwardBatch(pendingBatch batchState) (nextBatch batchState, er
}
if len(batchInputs) == 0 {
slog.Log(context.TODO(), logutil.LevelTrace, "forwardBatch no batchInputs, going idle", "batchID", s.batchID)
logutil.Trace("forwardBatch no batchInputs, going idle", "batchID", s.batchID)
nextBatch.ctx.Close()
nextBatch.ctx = nil
return
@@ -590,14 +598,14 @@ func (s *Server) computeBatch(activeBatch batchState) {
defer activeBatch.ctx.Close()
// Wait until inputs are ready
slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: waiting for inputs to be ready", "batchID", activeBatch.id)
logutil.Trace("computeBatch: waiting for inputs to be ready", "batchID", activeBatch.id)
<-activeBatch.inputsReadyCh
slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: inputs are ready", "batchID", activeBatch.id)
logutil.Trace("computeBatch: inputs are ready", "batchID", activeBatch.id)
// Once we complete, signal the next batch of inputs are ready
// This will unblock the next computeBatch, or forwardBatch if new seqs come in
defer func() {
slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: outputs are ready", "batchID", activeBatch.id)
logutil.Trace("computeBatch: outputs are ready", "batchID", activeBatch.id)
activeBatch.outputsReadyCh <- struct{}{}
}()
@@ -627,7 +635,7 @@ func (s *Server) computeBatch(activeBatch batchState) {
// Detect if the sequence we're processing has already been completed and replaced
// with a new sequence
if seq != activeBatch.seqs[i] {
slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: sequence replaced, discarding its results", "batchID", activeBatch.id, "seqIdx", i)
logutil.Trace("computeBatch: sequence replaced, discarding its results", "batchID", activeBatch.id, "seqIdx", i)
continue
}
@@ -667,18 +675,19 @@ func (s *Server) computeBatch(activeBatch batchState) {
activeBatch.batch.Inputs.SetValueFromIntSlice(batchInputs)
activeBatch.ctx.ComputeWithNotify(
func() {
slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: signaling computeStartedCh", "batchID", activeBatch.id)
logutil.Trace("computeBatch: signaling computeStartedCh", "batchID", activeBatch.id)
activeBatch.computeStartedCh <- struct{}{}
},
activeBatch.modelOutput)
logits := activeBatch.modelOutput.Floats()
slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: logits ready", "batchID", activeBatch.id)
outputs := activeBatch.modelOutput.Floats()
logutil.Trace("computeBatch: logits ready", "batchID", activeBatch.id)
s.mu.Lock()
defer s.mu.Unlock()
slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: decoding", "batchID", activeBatch.id)
logutil.Trace("computeBatch: decoding", "batchID", activeBatch.id)
for i, seq := range s.seqs {
if seq == nil || nextBatchTokens[i] == nil {
continue
@@ -690,16 +699,15 @@ func (s *Server) computeBatch(activeBatch batchState) {
// if done processing the prompt, generate an embedding and return
if seq.embeddingOnly {
// TODO(jessegross): Embedding support
slog.Warn("generation of embedding outputs not yet supported", "id", activeBatch.id, "seqIdx", i)
seq.embedding <- outputs
s.removeSequence(i, llm.DoneReasonStop)
continue
}
// sample a token
vocabSize := len(logits) / len(activeBatch.batch.Outputs)
slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: vocab details", "batchID", activeBatch.id, "seqIdx", i, "len(logits)", len(logits), "len(activeBatch.batch.Outputs)", len(activeBatch.batch.Outputs), "vocabSize", vocabSize, "iBatches", iBatches)
token, err := seq.sampler.Sample(logits[iBatches[i]*vocabSize : (iBatches[i]+1)*vocabSize])
vocabSize := len(outputs) / len(activeBatch.batch.Outputs)
logutil.Trace("computeBatch: vocab details", "batchID", activeBatch.id, "seqIdx", i, "len(logits)", len(outputs), "len(activeBatch.batch.Outputs)", len(activeBatch.batch.Outputs), "vocabSize", vocabSize, "iBatches", iBatches)
token, err := seq.sampler.Sample(outputs[iBatches[i]*vocabSize : (iBatches[i]+1)*vocabSize])
if err != nil {
s.hardErrCh <- fmt.Errorf("failed to sample token: %w", err)
return
@@ -712,7 +720,7 @@ func (s *Server) computeBatch(activeBatch batchState) {
// TODO (jmorganca): we should send this back
// as it's important for the /api/generate context
// seq.responses <- piece
slog.Log(context.TODO(), logutil.LevelTrace, "computeBatch: EOS", "batchID", activeBatch.id, "seqIdx", i)
logutil.Trace("computeBatch: EOS", "batchID", activeBatch.id, "seqIdx", i)
s.removeSequence(i, llm.DoneReasonStop)
continue
}
@@ -778,7 +786,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
var harmonyToolParser *harmony.HarmonyToolCallAccumulator
if req.UseHarmony {
harmonyMessageHandler = harmony.NewHarmonyMessageHandler()
harmonyMessageHandler.HarmonyParser.AddImplicitStartOrPrefill(req.PrefillContent)
harmonyMessageHandler.HarmonyParser.AddImplicitStartOrPrefill(req.PrefillString)
harmonyToolParser = harmonyMessageHandler.CreateToolParser()
}
@@ -814,7 +822,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
req.Options.TopP,
req.Options.MinP,
req.Options.Seed,
nil,
grammar,
)
seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
@@ -843,7 +851,7 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
found := false
for i, sq := range s.seqs {
if sq == nil {
seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs)
seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, true)
if err != nil {
s.mu.Unlock()
s.seqsSem.Release(1)
@@ -864,13 +872,10 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
return
}
var lastToken string
tokenRepeat := 0
const tokenRepeatLimit = 30
// TODO(parthsareen): generalize grammar enablement on the fly for all thinking models
if harmonyMessageHandler == nil {
seq.sampler.SetGrammar(grammar)
}
grammarSet := false
for {
select {
case <-r.Context().Done():
@@ -878,15 +883,22 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
return
case content, ok := <-seq.responses:
if ok {
if strings.TrimSpace(content) == lastToken {
tokenRepeat++
}
if tokenRepeat == tokenRepeatLimit {
http.Error(w, "token repeat limit reached", http.StatusInternalServerError)
seq.doneReason = llm.DoneReasonTokenRepeatLimit
close(seq.quit)
return
}
lastToken = strings.TrimSpace(content)
var thinking string
if harmonyMessageHandler != nil {
var toolContent string
content, thinking, toolContent = harmonyMessageHandler.AddContent(content, harmonyToolParser)
harmonyToolParser.Add(toolContent)
if harmonyMessageHandler.HarmonyParser.ConstrainAllowed && !grammarSet {
seq.sampler.SetGrammar(grammar)
grammarSet = true
}
}
if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
@@ -939,6 +951,67 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
}
}
func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
if s.model.Backend().Config().Uint("pooling_type", math.MaxUint32) == math.MaxUint32 {
http.Error(w, "this model does not support embeddings", http.StatusNotImplemented)
return
}
var req llm.EmbeddingRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, fmt.Sprintf("bad request: %s", err), http.StatusBadRequest)
return
}
w.Header().Set("Content-Type", "application/json")
seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{embedding: true})
if err != nil {
http.Error(w, fmt.Sprintf("failed to create new sequence: %v", err), http.StatusInternalServerError)
return
}
if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
if errors.Is(err, context.Canceled) {
slog.Info("aborting embedding request due to client closing the connection")
} else {
http.Error(w, fmt.Sprintf("failed to acquire semaphore: %v", err), http.StatusInternalServerError)
}
return
}
s.mu.Lock()
found := false
for i, sq := range s.seqs {
if sq == nil {
seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, false)
if err != nil {
s.mu.Unlock()
s.seqsSem.Release(1)
http.Error(w, fmt.Sprintf("failed to load cache: %v", err), http.StatusInternalServerError)
return
}
s.seqs[i] = seq
s.cond.Signal()
found = true
break
}
}
s.mu.Unlock()
if !found {
s.seqsSem.Release(1)
http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
return
}
if err := json.NewEncoder(w).Encode(&llm.EmbeddingResponse{
Embedding: <-seq.embedding,
}); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
}
}
func (s *Server) health(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
if err := json.NewEncoder(w).Encode(&llm.ServerStatusResponse{
@@ -1255,10 +1328,7 @@ func Execute(args []string) error {
mux := http.NewServeMux()
// TODO: support embeddings
mux.HandleFunc("POST /load", server.load)
mux.HandleFunc("POST /embedding", func(w http.ResponseWriter, r *http.Request) {
http.Error(w, "this model does not support embeddings", http.StatusNotImplemented)
})
mux.HandleFunc("POST /embedding", server.embeddings)
mux.HandleFunc("POST /completion", server.completion)
mux.HandleFunc("GET /health", server.health)

View File

@@ -25,10 +25,6 @@ type Sampler struct {
grammar *GrammarSampler
}
func (s *Sampler) SetGrammar(grammar *GrammarSampler) {
s.grammar = grammar
}
func (s *Sampler) Sample(logits []float32) (int32, error) {
if len(logits) == 0 {
return -1, errors.New("sample: no logits provided to sample")

View File

@@ -1595,26 +1595,11 @@ func (s *Server) ChatHandler(c *gin.Context) {
processedTools := req.Tools
var functionNameMap *harmony.FunctionNameMap
var prefillContentOrThinking *bool
var prefillString string
// TODO(parthsareen): this can be abstracted to not be model specific and potentially moved to the runner
if useHarmony {
prefillString = harmony.Prefill(msgs[len(msgs)-1])
functionNameMap = harmony.NewFunctionNameMap()
var lastMessage *api.Message
if len(msgs) > 0 {
lastMessage = &msgs[len(msgs)-1]
}
// prefill content or thinking flag if the last message is an assistant message
if lastMessage != nil && lastMessage.Role == "assistant" {
if lastMessage.Content != "" {
trueVal := true
// true sets content to be prefilled
prefillContentOrThinking = &trueVal
} else if lastMessage.Thinking != "" {
// false sets thinking to be prefilled
falseVal := false
prefillContentOrThinking = &falseVal
}
}
// make a copy of tools to pass to the chat prompt. Function names may be
// renamed to be valid Harmony function names.
processedTools = make([]api.Tool, len(req.Tools))
@@ -1673,12 +1658,12 @@ func (s *Server) ChatHandler(c *gin.Context) {
defer close(ch)
if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
Prompt: prompt,
Images: images,
Format: req.Format,
Options: opts,
UseHarmony: useHarmony,
PrefillContent: prefillContentOrThinking,
Prompt: prompt,
Images: images,
Format: req.Format,
Options: opts,
UseHarmony: useHarmony,
PrefillString: prefillString,
}, func(r llm.CompletionResponse) {
res := api.ChatResponse{
Model: req.Model,
@@ -1699,10 +1684,10 @@ func (s *Server) ChatHandler(c *gin.Context) {
}
if useHarmony {
// only send messages with meaningful content (empty messages confuse clients)
for i, tool := range res.Message.ToolCalls {
res.Message.ToolCalls[i].Function.Name = functionNameMap.OriginalFromConverted(tool.Function.Name)
}
// only send messages with meaningful content (empty messages confuse clients)
if res.Message.Content != "" || res.Message.Thinking != "" || len(res.Message.ToolCalls) > 0 || res.Done {
ch <- res
}