temp

2025-05-29 16:17:11 -07:00
69 changed files with 3187 additions and 3813 deletions
--- a/README.md
+++ b/README.md
@@ -40,10 +40,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla

 ## Quickstart

-To run and chat with [Gemma 3](https://ollama.com/library/gemma3):
+To run and chat with [Llama 3.2](https://ollama.com/library/llama3.2):

 ```shell
-ollama run gemma3
+ollama run llama3.2
 ```

 ## Model library
@@ -451,7 +451,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [orca-cli](https://github.com/molbal/orca-cli) Ollama Registry CLI Application - Browse, pull, and download models from Ollama Registry in your terminal.
 - [GGUF-to-Ollama](https://github.com/jonathanhecl/gguf-to-ollama) - Importing GGUF to Ollama made easy (multiplatform)
 - [AWS-Strands-With-Ollama](https://github.com/rapidarchitect/ollama_strands) - AWS Strands Agents with Ollama Examples
-- [ollama-multirun](https://github.com/attogram/ollama-multirun) - A bash shell script to run a single prompt against any or all of your locally installed ollama models, saving the output and performance statistics as easily navigable web pages. ([Demo](https://attogram.github.io/ai_test_zone/))

 ### Apple Vision Pro

@@ -588,7 +587,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Simple-Discord-AI](https://github.com/zyphixor/simple-discord-ai)
 - [LLM Telegram Bot](https://github.com/innightwolfsleep/llm_telegram_bot) (telegram bot, primary for RP. Oobabooga-like buttons, [A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) API integration e.t.c)
 - [mcp-llm](https://github.com/sammcj/mcp-llm) (MCP Server to allow LLMs to call other LLMs)
-- [SimpleOllamaUnity](https://github.com/HardCodeDev777/SimpleOllamaUnity) (Unity Engine extension for communicating with Ollama in a few lines of code. Also works at runtime)
 - [UnityCodeLama](https://github.com/HardCodeDev777/UnityCodeLama) (Unity Edtior tool to analyze scripts via Ollama)

 ### Supported backends
--- a/cmd/start_darwin.go
+++ b/cmd/start_darwin.go
@@ -5,7 +5,7 @@ import (
 	"errors"
 	"os"
 	"os/exec"
-	"regexp"
+	"strings"

 	"github.com/ollama/ollama/api"
 )
@@ -19,12 +19,11 @@ func startApp(ctx context.Context, client *api.Client) error {
 	if err != nil {
 		return err
 	}
-	r := regexp.MustCompile(`^.*/Ollama\s?\d*.app`)
-	m := r.FindStringSubmatch(link)
-	if len(m) != 1 {
+	if !strings.Contains(link, "Ollama.app") {
 		return errors.New("could not find ollama app")
 	}
-	if err := exec.Command("/usr/bin/open", "-j", "-a", m[0], "--args", "--fast-startup").Run(); err != nil {
+	path := strings.Split(link, "Ollama.app")
+	if err := exec.Command("/usr/bin/open", "-a", path[0]+"Ollama.app").Run(); err != nil {
 		return err
 	}
 	return waitForServer(ctx, client)
--- a/cmd/start_windows.go
+++ b/cmd/start_windows.go
@@ -45,11 +45,14 @@ func startApp(ctx context.Context, client *api.Client) error {
 			}
 		}
 	}
+	// log.Printf("XXX attempting to start app %s", appExe)

 	cmd_path := "c:\\Windows\\system32\\cmd.exe"
-	cmd := exec.Command(cmd_path, "/c", appExe, "--hide", "--fast-startup")
+	cmd := exec.Command(cmd_path, "/c", appExe)
+	// TODO - these hide flags aren't working - still pops up a command window for some reason
 	cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: 0x08000000, HideWindow: true}

+	// TODO this didn't help either...
 	cmd.Stdin = strings.NewReader("")
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
@@ -71,16 +74,7 @@ func isProcRunning(procName string) []uint32 {
 		slog.Debug("failed to check for running installers", "error", err)
 		return nil
 	}
-	if ret > uint32(len(pids)) {
-		pids = make([]uint32, ret+10)
-		if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
-			slog.Debug("failed to check for running installers", "error", err)
-			return nil
-		}
-	}
-	if ret < uint32(len(pids)) {
-		pids = pids[:ret]
-	}
+	pids = pids[:ret]
 	var matches []uint32
 	for _, pid := range pids {
 		if pid == 0 {
--- a/convert/convert_qwen25vl.go
+++ b/convert/convert_qwen25vl.go
@@ -65,17 +65,17 @@ func (q *qwen25VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	for _, t := range ts {
 		if strings.Contains(t.Name(), "patch_embed.proj") {
 			for t := range splitDim(t, 2,
-				split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_0")},
-				split{Replacer: strings.NewReplacer("patch_embed.proj", "patch_embd_1")},
+				strings.NewReplacer("patch_embed.proj", "patch_embd_0"),
+				strings.NewReplacer("patch_embed.proj", "patch_embd_1"),
 			) {
 				t.Shape = slices.DeleteFunc(t.Shape, func(i uint64) bool { return i == 1 })
 				out = append(out, t)
 			}
 		} else if strings.Contains(t.Name(), "attn.qkv") {
 			out = append(out, slices.Collect(splitDim(t, 0,
-				split{Replacer: strings.NewReplacer("attn.qkv", "attn_q")},
-				split{Replacer: strings.NewReplacer("attn.qkv", "attn_k")},
-				split{Replacer: strings.NewReplacer("attn.qkv", "attn_v")},
+				strings.NewReplacer("attn.qkv", "attn_q"),
+				strings.NewReplacer("attn.qkv", "attn_k"),
+				strings.NewReplacer("attn.qkv", "attn_v"),
 			))...)
 		} else {
 			out = append(out, &ggml.Tensor{
--- a/convert/tensor.go
+++ b/convert/tensor.go
@@ -1,73 +1,53 @@
 package convert

 import (
-	"cmp"
 	"iter"
 	"slices"
 	"strings"

+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
-
-	"github.com/ollama/ollama/fs/ggml"
 )

-type split struct {
-	*strings.Replacer
-	dim int
-
-	// fn is an optional function to apply to the tensor after slicing
-	fn func(tensor.Tensor) (tensor.Tensor, error)
-}
-
 // splitDim splits a tensor along a specified dimension into multiple tensors. The dimension
-// is split evenly based on the number of replacers provided unless a specific count is given.
-func splitDim(t Tensor, dim int, splits ...split) iter.Seq[*ggml.Tensor] {
+// is split evenly based on the number of replacers provided.
+func splitDim(t Tensor, dim int, replacers ...*strings.Replacer) iter.Seq[*ggml.Tensor] {
 	return func(yield func(*ggml.Tensor) bool) {
-		var offset int
-		for _, split := range splits {
-			t := t.Clone()
+		for i, replacer := range replacers {
 			shape := slices.Clone(t.Shape())
-			shape[dim] = cmp.Or(uint64(split.dim), shape[dim]/uint64(len(splits)))
+			shape[dim] = shape[dim] / uint64(len(replacers))

 			slice := slices.Repeat([]tensor.Slice{nil}, len(shape))
-			slice[dim] = tensor.S(offset, offset+int(shape[dim]))
-			offset += int(shape[dim])
+			slice[dim] = tensor.S(i*int(shape[dim]), (i+1)*int(shape[dim]))

-			t.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
+			tt := t.Clone()
+			tt.SetRepacker(func(_ string, data []float32, shape []uint64) ([]float32, error) {
 				dims := make([]int, len(shape))
 				for i := range shape {
 					dims[i] = int(shape[i])
 				}

-				var tt tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
-				tt, err := tt.Slice(slice...)
+				var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
+				t, err := t.Slice(slice...)
 				if err != nil {
 					return nil, err
 				}

-				tt = tensor.Materialize(tt)
-
-				if split.fn != nil {
-					tt, err = split.fn(tt)
-					if err != nil {
-						return nil, err
-					}
-				}
-
+				t = tensor.Materialize(t)
 				// flatten tensor so it can be written as a vector
-				if err := tt.Reshape(tt.Shape().TotalSize()); err != nil {
+				if err := t.Reshape(t.Shape().TotalSize()); err != nil {
 					return nil, err
 				}

-				return native.VectorF32(tt.(*tensor.Dense))
+				return native.VectorF32(t.(*tensor.Dense))
 			})

 			if !yield(&ggml.Tensor{
-				Name:     split.Replace(t.Name()),
+				Name:     replacer.Replace(t.Name()),
 				Kind:     t.Kind(),
 				Shape:    shape,
-				WriterTo: t,
+				WriterTo: tt,
 			}) {
 				break
 			}
--- a/convert/tensor_test.go
+++ b/convert/tensor_test.go
@@ -1,304 +0,0 @@
-package convert
-
-import (
-	"bytes"
-	"encoding/binary"
-	"io"
-	"iter"
-	"slices"
-	"strings"
-	"testing"
-
-	"github.com/pdevine/tensor"
-)
-
-type fakeTensor struct {
-	name  string
-	shape []uint64
-	data  []float32
-
-	repacker Repacker
-}
-
-func (f fakeTensor) Name() string {
-	return f.name
-}
-
-func (f fakeTensor) Shape() []uint64 {
-	return f.shape
-}
-
-func (f fakeTensor) Kind() uint32 {
-	return 0
-}
-
-func (f *fakeTensor) SetRepacker(fn Repacker) {
-	f.repacker = fn
-}
-
-func (f fakeTensor) Clone() Tensor {
-	return &fakeTensor{
-		name:     f.name,
-		shape:    slices.Clone(f.shape),
-		data:     slices.Clone(f.data),
-		repacker: f.repacker,
-	}
-}
-
-func (f fakeTensor) WriteTo(w io.Writer) (n int64, err error) {
-	data := f.data
-	if f.repacker != nil {
-		data, err = f.repacker(f.name, data, f.shape)
-		if err != nil {
-			return 0, err
-		}
-	}
-
-	if err := binary.Write(w, binary.LittleEndian, data); err != nil {
-		return 0, err
-	}
-
-	return int64(len(data) * 4), nil
-}
-
-func mul(shape []uint64) int {
-	n := 1
-	for _, dim := range shape {
-		n *= int(dim)
-	}
-	return n
-}
-
-func TestSplitDim(t *testing.T) {
-	r := fakeTensor{
-		name:  "a.b",
-		shape: []uint64{3, 4},
-		data:  []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
-	}
-
-	t.Run("no split", func(t *testing.T) {
-		for tt := range splitDim(&r, 0, split{Replacer: strings.NewReplacer("a", "x")}) {
-			if tt.Name != "x.b" {
-				t.Fatalf("expected name 'x', got '%s'", tt.Name)
-			}
-
-			if !slices.Equal(tt.Shape, []uint64{3, 4}) {
-				t.Fatalf("expected shape [3, 4], got %v", tt.Shape)
-			}
-
-			var b bytes.Buffer
-			if _, err := tt.WriteTo(&b); err != nil {
-				t.Fatal(err)
-			}
-
-			f32s := make([]float32, mul(tt.Shape))
-			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
-				t.Fatal(err)
-			}
-
-			if !slices.Equal(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}) {
-				t.Fatalf("expected data [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], got %v", f32s)
-			}
-		}
-	})
-
-	t.Run("even split", func(t *testing.T) {
-		next, stop := iter.Pull(splitDim(&r, 1,
-			split{Replacer: strings.NewReplacer("a", "x")},
-			split{Replacer: strings.NewReplacer("b", "y")},
-		))
-		defer stop()
-
-		{
-			tt, ok := next()
-			if !ok {
-				t.Fatal("expected at least one split")
-			}
-
-			if tt.Name != "x.b" {
-				t.Fatal("expected name 'x.b', got", tt.Name)
-			}
-
-			if !slices.Equal(tt.Shape, []uint64{3, 2}) {
-				t.Fatal("expected shape [3, 2], got", tt.Shape)
-			}
-
-			var b bytes.Buffer
-			if _, err := tt.WriteTo(&b); err != nil {
-				t.Fatal(err)
-			}
-
-			f32s := make([]float32, mul(tt.Shape))
-			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
-				t.Fatal(err)
-			}
-
-			if !slices.Equal(f32s, []float32{0, 1, 4, 5, 8, 9}) {
-				t.Fatal("expected data [0, 1, 4, 5, 8, 9], got", f32s)
-			}
-		}
-
-		{
-			tt, ok := next()
-			if !ok {
-				t.Fatal("expected at least one split")
-			}
-
-			if tt.Name != "a.y" {
-				t.Fatal("expected name 'a.y', got", tt.Name)
-			}
-
-			if !slices.Equal(tt.Shape, []uint64{3, 2}) {
-				t.Fatal("expected shape [3, 2], got", tt.Shape)
-			}
-
-			var b bytes.Buffer
-			if _, err := tt.WriteTo(&b); err != nil {
-				t.Fatal(err)
-			}
-
-			f32s := make([]float32, mul(tt.Shape))
-			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
-				t.Fatal(err)
-			}
-
-			if !slices.Equal(f32s, []float32{2, 3, 6, 7, 10, 11}) {
-				t.Fatal("expected data [2, 3, 6, 7, 10, 11], got", f32s)
-			}
-		}
-	})
-
-	t.Run("uneven split", func(t *testing.T) {
-		next, stop := iter.Pull(splitDim(&r, 0,
-			split{Replacer: strings.NewReplacer("a", "x"), dim: 2},
-			split{Replacer: strings.NewReplacer("b", "y"), dim: 1},
-		))
-		defer stop()
-
-		{
-			tt, ok := next()
-			if !ok {
-				t.Fatal("expected at least one split")
-			}
-
-			if tt.Name != "x.b" {
-				t.Fatal("expected name 'x.b', got", tt.Name)
-			}
-
-			if !slices.Equal(tt.Shape, []uint64{2, 4}) {
-				t.Fatal("expected shape [2, 4], got", tt.Shape)
-			}
-
-			var b bytes.Buffer
-			if _, err := tt.WriteTo(&b); err != nil {
-				t.Fatal(err)
-			}
-
-			f32s := make([]float32, mul(tt.Shape))
-			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
-				t.Fatal(err)
-			}
-
-			if !slices.Equal(f32s, []float32{0, 1, 2, 3, 4, 5, 6, 7}) {
-				t.Fatal("expected data [0, 1, 2, 3, 4, 5, 6, 7], got", f32s)
-			}
-		}
-
-		{
-			tt, ok := next()
-			if !ok {
-				t.Fatal("expected at least one split")
-			}
-
-			if tt.Name != "a.y" {
-				t.Fatal("expected name 'a.y', got", tt.Name)
-			}
-
-			if !slices.Equal(tt.Shape, []uint64{1, 4}) {
-				t.Fatal("expected shape [1, 4], got", tt.Shape)
-			}
-
-			var b bytes.Buffer
-			if _, err := tt.WriteTo(&b); err != nil {
-				t.Fatal(err)
-			}
-
-			f32s := make([]float32, mul(tt.Shape))
-			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
-				t.Fatal(err)
-			}
-
-			if !slices.Equal(f32s, []float32{8, 9, 10, 11}) {
-				t.Fatal("expected data [8, 9, 10, 11], got", f32s)
-			}
-		}
-	})
-
-	t.Run("split with transpose", func(t *testing.T) {
-		next, stop := iter.Pull(splitDim(&r, 1,
-			split{Replacer: strings.NewReplacer("a", "x")},
-			split{Replacer: strings.NewReplacer("b", "y"), fn: func(tt tensor.Tensor) (tensor.Tensor, error) {
-				return tensor.Transpose(tt, 1, 0)
-			}},
-		))
-		defer stop()
-
-		{
-			tt, ok := next()
-			if !ok {
-				t.Fatal("expected at least one split")
-			}
-
-			if tt.Name != "x.b" {
-				t.Fatal("expected name 'x.b', got", tt.Name)
-			}
-
-			if !slices.Equal(tt.Shape, []uint64{3, 2}) {
-				t.Fatal("expected shape [3, 2], got", tt.Shape)
-			}
-
-			var b bytes.Buffer
-			if _, err := tt.WriteTo(&b); err != nil {
-				t.Fatal(err)
-			}
-
-			f32s := make([]float32, mul(tt.Shape))
-			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
-				t.Fatal(err)
-			}
-
-			if !slices.Equal(f32s, []float32{0, 1, 4, 5, 8, 9}) {
-				t.Fatal("expected data [0, 1, 4, 5, 8, 9], got", f32s)
-			}
-		}
-
-		{
-			tt, ok := next()
-			if !ok {
-				t.Fatal("expected at least one split")
-			}
-
-			if tt.Name != "a.y" {
-				t.Fatal("expected name 'a.y', got", tt.Name)
-			}
-
-			if !slices.Equal(tt.Shape, []uint64{3, 2}) {
-				t.Fatal("expected shape [3, 2], got", tt.Shape)
-			}
-
-			var b bytes.Buffer
-			if _, err := tt.WriteTo(&b); err != nil {
-				t.Fatal(err)
-			}
-
-			f32s := make([]float32, mul(tt.Shape))
-			if err := binary.Read(&b, binary.LittleEndian, &f32s); err != nil {
-				t.Fatal(err)
-			}
-
-			if !slices.Equal(f32s, []float32{2, 6, 10, 3, 7, 11}) {
-				t.Fatal("expected data [2, 6, 10, 3, 7, 11], got", f32s)
-			}
-		}
-	})
-}
--- a/docs/development.md
+++ b/docs/development.md
@@ -118,7 +118,7 @@ To run tests, use `go test`:
 go test ./...
 ```

-> NOTE: In rare cirumstances, you may need to change a package using the new
+> NOTE: In rare circumstances, you may need to change a package using the new
 > "synctest" package in go1.24.
 >
 > If you do not have the "synctest" package enabled, you will not see build or
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -112,8 +112,8 @@ sudo systemctl status ollama
 > While AMD has contributed the `amdgpu` driver upstream to the official linux
 > kernel source, the version is older and may not support all ROCm features. We
 > recommend you install the latest driver from
-> [AMD](https://www.amd.com/en/support/download/linux-drivers.html) for best support
-> of your Radeon GPU.
+> https://www.amd.com/en/support/linux-drivers for best support of your Radeon
+> GPU.

 ## Customizing

--- a/fs/gguf/gguf.go
+++ b/fs/gguf/gguf.go
@@ -1,347 +0,0 @@
-package gguf
-
-import (
-	"bytes"
-	"cmp"
-	"encoding/binary"
-	"errors"
-	"fmt"
-	"io"
-	"iter"
-	"os"
-	"slices"
-	"strings"
-)
-
-const (
-	typeUint8 uint32 = iota
-	typeInt8
-	typeUint16
-	typeInt16
-	typeUint32
-	typeInt32
-	typeFloat32
-	typeBool
-	typeString
-	typeArray
-	typeUint64
-	typeInt64
-	typeFloat64
-)
-
-var ErrUnsupported = errors.New("unsupported")
-
-type File struct {
-	Magic   [4]byte
-	Version uint32
-
-	keyValues *lazy[KeyValue]
-	tensors   *lazy[TensorInfo]
-	offset    int64
-
-	file   *os.File
-	reader *bufferedReader
-	bts    []byte
-}
-
-func Open(path string) (f *File, err error) {
-	f = &File{bts: make([]byte, 4096)}
-	f.file, err = os.Open(path)
-	if err != nil {
-		return nil, err
-	}
-
-	f.reader = newBufferedReader(f.file, 32<<10)
-
-	if err := binary.Read(f.reader, binary.LittleEndian, &f.Magic); err != nil {
-		return nil, err
-	}
-
-	if bytes.Equal(f.Magic[:], []byte("gguf")) {
-		return nil, fmt.Errorf("%w file type %v", ErrUnsupported, f.Magic)
-	}
-
-	if err := binary.Read(f.reader, binary.LittleEndian, &f.Version); err != nil {
-		return nil, err
-	}
-
-	if f.Version != 3 {
-		return nil, fmt.Errorf("%w version %v", ErrUnsupported, f.Version)
-	}
-
-	f.tensors, err = newLazy(f, f.readTensor)
-	if err != nil {
-		return nil, err
-	}
-
-	f.tensors.successFunc = func() error {
-		offset := f.reader.offset
-
-		alignment := cmp.Or(f.KeyValue("general.alignment").Int(), 32)
-		f.offset = offset + (alignment-offset%alignment)%alignment
-		return nil
-	}
-
-	f.keyValues, err = newLazy(f, f.readKeyValue)
-	if err != nil {
-		return nil, err
-	}
-
-	return f, nil
-}
-
-func (f *File) readTensor() (TensorInfo, error) {
-	name, err := readString(f)
-	if err != nil {
-		return TensorInfo{}, err
-	}
-
-	dims, err := read[uint32](f)
-	if err != nil {
-		return TensorInfo{}, err
-	}
-
-	shape := make([]uint64, dims)
-	for i := range dims {
-		shape[i], err = read[uint64](f)
-		if err != nil {
-			return TensorInfo{}, err
-		}
-	}
-
-	type_, err := read[uint32](f)
-	if err != nil {
-		return TensorInfo{}, err
-	}
-
-	offset, err := read[uint64](f)
-	if err != nil {
-		return TensorInfo{}, err
-	}
-
-	return TensorInfo{
-		Name:   name,
-		Offset: offset,
-		Shape:  shape,
-		Type:   TensorType(type_),
-	}, nil
-}
-
-func (f *File) readKeyValue() (KeyValue, error) {
-	key, err := readString(f)
-	if err != nil {
-		return KeyValue{}, err
-	}
-
-	t, err := read[uint32](f)
-	if err != nil {
-		return KeyValue{}, err
-	}
-
-	value, err := func() (any, error) {
-		switch t {
-		case typeUint8:
-			return read[uint8](f)
-		case typeInt8:
-			return read[int8](f)
-		case typeUint16:
-			return read[uint16](f)
-		case typeInt16:
-			return read[int16](f)
-		case typeUint32:
-			return read[uint32](f)
-		case typeInt32:
-			return read[int32](f)
-		case typeUint64:
-			return read[uint64](f)
-		case typeInt64:
-			return read[int64](f)
-		case typeFloat32:
-			return read[float32](f)
-		case typeFloat64:
-			return read[float64](f)
-		case typeBool:
-			return read[bool](f)
-		case typeString:
-			return readString(f)
-		case typeArray:
-			return readArray(f)
-		default:
-			return nil, fmt.Errorf("%w type %d", ErrUnsupported, t)
-		}
-	}()
-	if err != nil {
-		return KeyValue{}, err
-	}
-
-	return KeyValue{
-		Key:   key,
-		Value: Value{value},
-	}, nil
-}
-
-func read[T any](f *File) (t T, err error) {
-	err = binary.Read(f.reader, binary.LittleEndian, &t)
-	return t, err
-}
-
-func readString(f *File) (string, error) {
-	n, err := read[uint64](f)
-	if err != nil {
-		return "", err
-	}
-
-	if int(n) > len(f.bts) {
-		f.bts = make([]byte, n)
-	}
-
-	bts := f.bts[:n]
-	if _, err := io.ReadFull(f.reader, bts); err != nil {
-		return "", err
-	}
-	defer clear(bts)
-
-	return string(bts), nil
-}
-
-func readArray(f *File) (any, error) {
-	t, err := read[uint32](f)
-	if err != nil {
-		return nil, err
-	}
-
-	n, err := read[uint64](f)
-	if err != nil {
-		return nil, err
-	}
-
-	switch t {
-	case typeUint8:
-		return readArrayData[uint8](f, n)
-	case typeInt8:
-		return readArrayData[int8](f, n)
-	case typeUint16:
-		return readArrayData[uint16](f, n)
-	case typeInt16:
-		return readArrayData[int16](f, n)
-	case typeUint32:
-		return readArrayData[uint32](f, n)
-	case typeInt32:
-		return readArrayData[int32](f, n)
-	case typeUint64:
-		return readArrayData[uint64](f, n)
-	case typeInt64:
-		return readArrayData[int64](f, n)
-	case typeFloat32:
-		return readArrayData[float32](f, n)
-	case typeFloat64:
-		return readArrayData[float64](f, n)
-	case typeBool:
-		return readArrayData[bool](f, n)
-	case typeString:
-		return readArrayString(f, n)
-	default:
-		return nil, fmt.Errorf("%w type %d", ErrUnsupported, t)
-	}
-}
-
-func readArrayData[T any](f *File, n uint64) (s []T, err error) {
-	s = make([]T, n)
-	for i := range n {
-		e, err := read[T](f)
-		if err != nil {
-			return nil, err
-		}
-
-		s[i] = e
-	}
-
-	return s, nil
-}
-
-func readArrayString(f *File, n uint64) (s []string, err error) {
-	s = make([]string, n)
-	for i := range n {
-		e, err := readString(f)
-		if err != nil {
-			return nil, err
-		}
-
-		s[i] = e
-	}
-
-	return s, nil
-}
-
-func (f *File) Close() error {
-	f.keyValues.stop()
-	f.tensors.stop()
-	return f.file.Close()
-}
-
-func (f *File) KeyValue(key string) KeyValue {
-	if !strings.HasPrefix(key, "general.") && !strings.HasPrefix(key, "tokenizer.") {
-		key = f.KeyValue("general.architecture").String() + "." + key
-	}
-
-	if index := slices.IndexFunc(f.keyValues.values, func(kv KeyValue) bool {
-		return kv.Key == key
-	}); index >= 0 {
-		return f.keyValues.values[index]
-	}
-
-	for keyValue, ok := f.keyValues.next(); ok; keyValue, ok = f.keyValues.next() {
-		if keyValue.Key == key {
-			return keyValue
-		}
-	}
-
-	return KeyValue{}
-}
-
-func (f *File) NumKeyValues() int {
-	return int(f.keyValues.count)
-}
-
-func (f *File) KeyValues() iter.Seq2[int, KeyValue] {
-	return f.keyValues.All()
-}
-
-func (f *File) TensorInfo(name string) TensorInfo {
-	if index := slices.IndexFunc(f.tensors.values, func(t TensorInfo) bool {
-		return t.Name == name
-	}); index >= 0 {
-		return f.tensors.values[index]
-	}
-
-	// fast-forward through key values if we haven't already
-	_ = f.keyValues.rest()
-	for tensor, ok := f.tensors.next(); ok; tensor, ok = f.tensors.next() {
-		if tensor.Name == name {
-			return tensor
-		}
-	}
-
-	return TensorInfo{}
-}
-
-func (f *File) NumTensors() int {
-	return int(f.tensors.count)
-}
-
-func (f *File) TensorInfos() iter.Seq2[int, TensorInfo] {
-	// fast forward through key values if we haven't already
-	f.keyValues.rest()
-	return f.tensors.All()
-}
-
-func (f *File) TensorReader(name string) (TensorInfo, io.Reader, error) {
-	t := f.TensorInfo(name)
-	if t.NumBytes() == 0 {
-		return TensorInfo{}, nil, fmt.Errorf("tensor %s not found", name)
-	}
-
-	// fast forward through tensor info if we haven't already
-	_ = f.tensors.rest()
-	return t, io.NewSectionReader(f.file, f.offset+int64(t.Offset), t.NumBytes()), nil
-}
--- a/fs/gguf/gguf_test.go
+++ b/fs/gguf/gguf_test.go
@@ -1,249 +0,0 @@
-package gguf_test
-
-import (
-	"bytes"
-	"os"
-	"strconv"
-	"strings"
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-	"github.com/google/go-cmp/cmp/cmpopts"
-	"github.com/ollama/ollama/fs/ggml"
-	"github.com/ollama/ollama/fs/gguf"
-)
-
-func createBinFile(tb testing.TB) string {
-	tb.Helper()
-	f, err := os.CreateTemp(tb.TempDir(), "")
-	if err != nil {
-		tb.Fatal(err)
-	}
-	defer f.Close()
-
-	kv := ggml.KV{
-		"general.architecture":                   "llama",
-		"llama.block_count":                      uint32(8),
-		"llama.embedding_length":                 uint32(3),
-		"llama.attention.head_count":             uint32(2),
-		"llama.attention.head_count_kv":          uint32(2),
-		"llama.attention.key_length":             uint32(3),
-		"llama.rope.dimension_count":             uint32(4),
-		"llama.rope.freq_base":                   float32(10000.0),
-		"llama.rope.freq_scale":                  float32(1.0),
-		"llama.attention.layer_norm_rms_epsilon": float32(1e-6),
-		"tokenizer.ggml.eos_token_id":            uint32(0),
-		"tokenizer.ggml.eos_token_ids":           []int32{1, 2, 3},
-		"tokenizer.ggml.tokens":                  []string{"hello", "world"},
-		"tokenizer.ggml.scores":                  []float32{0, 1},
-	}
-
-	tensors := []*ggml.Tensor{
-		{
-			Name:     "token_embd.weight",
-			Kind:     0,
-			Shape:    []uint64{2, 3},
-			WriterTo: bytes.NewBuffer(make([]byte, 4*2*3)),
-		},
-		{
-			Name:     "output.weight",
-			Kind:     0,
-			Shape:    []uint64{3, 2},
-			WriterTo: bytes.NewBuffer(make([]byte, 4*3*2)),
-		},
-	}
-
-	for i := range 8 {
-		tensors = append(tensors, &ggml.Tensor{
-			Name:     "blk." + strconv.Itoa(i) + ".attn_q.weight",
-			Kind:     0,
-			Shape:    []uint64{3, 3},
-			WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
-		}, &ggml.Tensor{
-			Name:     "blk." + strconv.Itoa(i) + ".attn_k.weight",
-			Kind:     0,
-			Shape:    []uint64{3, 3},
-			WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
-		}, &ggml.Tensor{
-			Name:     "blk." + strconv.Itoa(i) + ".attn_v.weight",
-			Kind:     0,
-			Shape:    []uint64{3, 3},
-			WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
-		}, &ggml.Tensor{
-			Name:     "blk." + strconv.Itoa(i) + ".attn_output.weight",
-			Kind:     0,
-			Shape:    []uint64{3, 3},
-			WriterTo: bytes.NewBuffer(make([]byte, 4*3*3)),
-		})
-	}
-
-	if err := ggml.WriteGGUF(f, kv, tensors); err != nil {
-		tb.Fatal(err)
-	}
-
-	return f.Name()
-}
-
-func TestRead(t *testing.T) {
-	f, err := gguf.Open(createBinFile(t))
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer f.Close()
-
-	if got := f.KeyValue("does.not.exist").Valid(); got {
-		t.Errorf(`KeyValue("does.not.exist").Exists() = %v, want false`, got)
-	}
-
-	if got := f.KeyValue("general.architecture").String(); got != "llama" {
-		t.Errorf(`KeyValue("general.architecture").String() = %q, want %q`, got, "llama")
-	}
-
-	if got := f.TensorInfo("token_embd.weight"); got.Name != "token_embd.weight" {
-		t.Errorf(`TensorInfo("token_embd.weight").Name = %q, want %q`, got.Name, "token_embd.weight")
-	} else if diff := cmp.Diff(got.Shape, []uint64{2, 3}); diff != "" {
-		t.Errorf(`TensorInfo("token_embd.weight").Shape mismatch (-got +want):\n%s`, diff)
-	} else if got.Type != gguf.TensorTypeF32 {
-		t.Errorf(`TensorInfo("token_embd.weight").Type = %d, want %d`, got.Type, gguf.TensorTypeF32)
-	}
-
-	if got := f.KeyValue("block_count").Uint(); got != 8 {
-		t.Errorf(`KeyValue("block_count").Uint() = %d, want %d`, got, 8)
-	}
-
-	if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.tokens").Strings(), []string{"hello", "world"}); diff != "" {
-		t.Errorf("KeyValue(\"tokenizer.ggml.tokens\").Strings() mismatch (-got +want):\n%s", diff)
-	}
-
-	if diff := cmp.Diff(f.KeyValue("tokenizer.ggml.scores").Floats(), []float64{0, 1}); diff != "" {
-		t.Errorf("KeyValue(\"tokenizer.ggml.scores\").Ints() mismatch (-got +want):\n%s", diff)
-	}
-
-	var kvs []string
-	for _, kv := range f.KeyValues() {
-		if !kv.Valid() {
-			t.Error("found invalid key-value pair:", kv)
-		}
-
-		kvs = append(kvs, kv.Key)
-	}
-
-	if len(kvs) != f.NumKeyValues() {
-		t.Errorf("iterated key count = %d, want %d", len(kvs), f.NumKeyValues())
-	}
-
-	if diff := cmp.Diff(kvs, []string{
-		"general.architecture",
-		"llama.block_count",
-		"llama.embedding_length",
-		"llama.attention.head_count",
-		"llama.attention.head_count_kv",
-		"llama.attention.key_length",
-		"llama.rope.dimension_count",
-		"llama.rope.freq_base",
-		"llama.rope.freq_scale",
-		"llama.attention.layer_norm_rms_epsilon",
-		"tokenizer.ggml.eos_token_id",
-		"tokenizer.ggml.eos_token_ids",
-		"tokenizer.ggml.tokens",
-		"tokenizer.ggml.scores",
-	}, cmpopts.SortSlices(strings.Compare)); diff != "" {
-		t.Errorf("KeyValues() mismatch (-got +want):\n%s", diff)
-	}
-
-	var tis []string
-	for _, ti := range f.TensorInfos() {
-		if !ti.Valid() {
-			t.Error("found invalid tensor info:", ti)
-		}
-
-		tis = append(tis, ti.Name)
-	}
-
-	if len(tis) != f.NumTensors() {
-		t.Errorf("iterated tensor count = %d, want %d", len(tis), f.NumTensors())
-	}
-
-	if diff := cmp.Diff(tis, []string{
-		"token_embd.weight",
-		"output.weight",
-		"blk.0.attn_q.weight",
-		"blk.0.attn_k.weight",
-		"blk.0.attn_v.weight",
-		"blk.0.attn_output.weight",
-		"blk.1.attn_q.weight",
-		"blk.1.attn_k.weight",
-		"blk.1.attn_v.weight",
-		"blk.1.attn_output.weight",
-		"blk.2.attn_q.weight",
-		"blk.2.attn_k.weight",
-		"blk.2.attn_v.weight",
-		"blk.2.attn_output.weight",
-		"blk.3.attn_q.weight",
-		"blk.3.attn_k.weight",
-		"blk.3.attn_v.weight",
-		"blk.3.attn_output.weight",
-		"blk.4.attn_q.weight",
-		"blk.4.attn_k.weight",
-		"blk.4.attn_v.weight",
-		"blk.4.attn_output.weight",
-		"blk.5.attn_q.weight",
-		"blk.5.attn_k.weight",
-		"blk.5.attn_v.weight",
-		"blk.5.attn_output.weight",
-		"blk.6.attn_q.weight",
-		"blk.6.attn_k.weight",
-		"blk.6.attn_v.weight",
-		"blk.6.attn_output.weight",
-		"blk.7.attn_q.weight",
-		"blk.7.attn_k.weight",
-		"blk.7.attn_v.weight",
-		"blk.7.attn_output.weight",
-	}, cmpopts.SortSlices(strings.Compare)); diff != "" {
-		t.Errorf("TensorInfos() mismatch (-got +want):\n%s", diff)
-	}
-
-	ti, r, err := f.TensorReader("output.weight")
-	if err != nil {
-		t.Fatalf(`TensorReader("output.weight") error: %v`, err)
-	}
-
-	if ti.Name != "output.weight" {
-		t.Errorf(`TensorReader("output.weight").Name = %q, want %q`, ti.Name, "output.weight")
-	} else if diff := cmp.Diff(ti.Shape, []uint64{3, 2}); diff != "" {
-		t.Errorf(`TensorReader("output.weight").Shape mismatch (-got +want):\n%s`, diff)
-	} else if ti.Type != gguf.TensorTypeF32 {
-		t.Errorf(`TensorReader("output.weight").Type = %d, want %d`, ti.Type, gguf.TensorTypeF32)
-	}
-
-	var b bytes.Buffer
-	if _, err := b.ReadFrom(r); err != nil {
-		t.Fatalf(`ReadFrom TensorReader("output.weight") error: %v`, err)
-	}
-
-	if b.Len() != int(ti.NumBytes()) {
-		t.Errorf(`ReadFrom TensorReader("output.weight") length = %d, want %d`, b.Len(), ti.NumBytes())
-	}
-}
-
-func BenchmarkRead(b *testing.B) {
-	b.ReportAllocs()
-
-	p := createBinFile(b)
-	for b.Loop() {
-		f, err := gguf.Open(p)
-		if err != nil {
-			b.Fatal(err)
-		}
-
-		if got := f.KeyValue("general.architecture").String(); got != "llama" {
-			b.Errorf("got = %q, want %q", got, "llama")
-		}
-
-		// Iterate through some tensors
-		for range f.TensorInfos() {
-		}
-
-		f.Close()
-	}
-}
--- a/fs/gguf/keyvalue.go
+++ b/fs/gguf/keyvalue.go
@@ -1,90 +0,0 @@
-package gguf
-
-import (
-	"reflect"
-	"slices"
-)
-
-type KeyValue struct {
-	Key string
-	Value
-}
-
-func (kv KeyValue) Valid() bool {
-	return kv.Key != "" && kv.Value.value != nil
-}
-
-type Value struct {
-	value any
-}
-
-func value[T any](v Value, kinds ...reflect.Kind) (t T) {
-	vv := reflect.ValueOf(v.value)
-	if slices.Contains(kinds, vv.Kind()) {
-		t = vv.Convert(reflect.TypeOf(t)).Interface().(T)
-	}
-	return
-}
-
-func values[T any](v Value, kinds ...reflect.Kind) (ts []T) {
-	switch vv := reflect.ValueOf(v.value); vv.Kind() {
-	case reflect.Slice:
-		if slices.Contains(kinds, vv.Type().Elem().Kind()) {
-			ts = make([]T, vv.Len())
-			for i := range vv.Len() {
-				ts[i] = vv.Index(i).Convert(reflect.TypeOf(ts[i])).Interface().(T)
-			}
-		}
-	}
-	return
-}
-
-// Int returns Value as a signed integer. If it is not a signed integer, it returns 0.
-func (v Value) Int() int64 {
-	return value[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
-}
-
-// Ints returns Value as a signed integer slice. If it is not a signed integer slice, it returns nil.
-func (v Value) Ints() (i64s []int64) {
-	return values[int64](v, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64)
-}
-
-// Uint converts an unsigned integer value to uint64. If the value is not a unsigned integer, it returns 0.
-func (v Value) Uint() uint64 {
-	return value[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
-}
-
-// Uints returns Value as a unsigned integer slice. If it is not a unsigned integer slice, it returns nil.
-func (v Value) Uints() (u64s []uint64) {
-	return values[uint64](v, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64)
-}
-
-// Float returns Value as a float. If it is not a float, it returns 0.
-func (v Value) Float() float64 {
-	return value[float64](v, reflect.Float32, reflect.Float64)
-}
-
-// Floats returns Value as a float slice. If it is not a float slice, it returns nil.
-func (v Value) Floats() (f64s []float64) {
-	return values[float64](v, reflect.Float32, reflect.Float64)
-}
-
-// Bool returns Value as a boolean. If it is not a boolean, it returns false.
-func (v Value) Bool() bool {
-	return value[bool](v, reflect.Bool)
-}
-
-// Bools returns Value as a boolean slice. If it is not a boolean slice, it returns nil.
-func (v Value) Bools() (bools []bool) {
-	return values[bool](v, reflect.Bool)
-}
-
-// String returns Value as a string. If it is not a string, it returns an empty string.
-func (v Value) String() string {
-	return value[string](v, reflect.String)
-}
-
-// Strings returns Value as a string slice. If it is not a string slice, it returns nil.
-func (v Value) Strings() (strings []string) {
-	return values[string](v, reflect.String)
-}
--- a/fs/gguf/keyvalue_test.go
+++ b/fs/gguf/keyvalue_test.go
@@ -1,208 +0,0 @@
-package gguf
-
-import (
-	"testing"
-
-	"github.com/google/go-cmp/cmp"
-)
-
-func split(name string, values map[string][]any) (matched []any, unmatched []any) {
-	for key, value := range values {
-		if key == name {
-			matched = value
-		} else {
-			unmatched = append(unmatched, value...)
-		}
-	}
-	return
-}
-
-func TestValue(t *testing.T) {
-	values := map[string][]any{
-		"int64":   {int(42), int8(42), int16(42), int32(42), int64(42)},
-		"uint64":  {uint(42), uint8(42), uint16(42), uint32(42), uint64(42)},
-		"float64": {float32(42), float64(42)},
-		"string":  {"42", "hello"},
-		"bool":    {true, false},
-	}
-
-	t.Run("int64", func(t *testing.T) {
-		matched, unmatched := split("int64", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if i64 := kv.Int(); i64 != 42 {
-				t.Errorf("expected 42, got %d", i64)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if i64 := kv.Int(); i64 != 0 {
-				t.Errorf("expected 42, got %d", i64)
-			}
-		}
-	})
-
-	t.Run("uint64", func(t *testing.T) {
-		matched, unmatched := split("uint64", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if u64 := kv.Uint(); u64 != 42 {
-				t.Errorf("expected 42, got %d", u64)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if u64 := kv.Uint(); u64 != 0 {
-				t.Errorf("expected 42, got %d", u64)
-			}
-		}
-	})
-
-	t.Run("float64", func(t *testing.T) {
-		matched, unmatched := split("float64", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if f64 := kv.Float(); f64 != 42 {
-				t.Errorf("expected 42, got %f", f64)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if f64 := kv.Float(); f64 != 0 {
-				t.Errorf("expected 42, got %f", f64)
-			}
-		}
-	})
-
-	t.Run("string", func(t *testing.T) {
-		matched, unmatched := split("string", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if s := kv.String(); s != v {
-				t.Errorf("expected 42, got %s", s)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if s := kv.String(); s != "" {
-				t.Errorf("expected 42, got %s", s)
-			}
-		}
-	})
-
-	t.Run("bool", func(t *testing.T) {
-		matched, unmatched := split("bool", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if b := kv.Bool(); b != v {
-				t.Errorf("expected true, got %v", b)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if b := kv.Bool(); b != false {
-				t.Errorf("expected false, got %v", b)
-			}
-		}
-	})
-}
-
-func TestValues(t *testing.T) {
-	values := map[string][]any{
-		"int64s":   {[]int{42}, []int8{42}, []int16{42}, []int32{42}, []int64{42}},
-		"uint64s":  {[]uint{42}, []uint8{42}, []uint16{42}, []uint32{42}, []uint64{42}},
-		"float64s": {[]float32{42}, []float64{42}},
-		"strings":  {[]string{"42"}, []string{"hello"}},
-		"bools":    {[]bool{true}, []bool{false}},
-	}
-
-	t.Run("int64s", func(t *testing.T) {
-		matched, unmatched := split("int64s", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if diff := cmp.Diff(kv.Ints(), []int64{42}); diff != "" {
-				t.Errorf("diff: %s", diff)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if i64s := kv.Ints(); i64s != nil {
-				t.Errorf("expected nil, got %v", i64s)
-			}
-		}
-	})
-
-	t.Run("uint64s", func(t *testing.T) {
-		matched, unmatched := split("uint64s", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if diff := cmp.Diff(kv.Uints(), []uint64{42}); diff != "" {
-				t.Errorf("diff: %s", diff)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if u64s := kv.Uints(); u64s != nil {
-				t.Errorf("expected nil, got %v", u64s)
-			}
-		}
-	})
-
-	t.Run("float64s", func(t *testing.T) {
-		matched, unmatched := split("float64s", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if diff := cmp.Diff(kv.Floats(), []float64{42}); diff != "" {
-				t.Errorf("diff: %s", diff)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if f64s := kv.Floats(); f64s != nil {
-				t.Errorf("expected nil, got %v", f64s)
-			}
-		}
-	})
-
-	t.Run("strings", func(t *testing.T) {
-		matched, unmatched := split("strings", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if diff := cmp.Diff(kv.Strings(), v); diff != "" {
-				t.Errorf("diff: %s", diff)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if s := kv.Strings(); s != nil {
-				t.Errorf("expected nil, got %v", s)
-			}
-		}
-	})
-
-	t.Run("bools", func(t *testing.T) {
-		matched, unmatched := split("bools", values)
-		for _, v := range matched {
-			kv := KeyValue{"key", Value{v}}
-			if diff := cmp.Diff(kv.Bools(), v); diff != "" {
-				t.Errorf("diff: %s", diff)
-			}
-		}
-
-		for _, v := range unmatched {
-			kv := KeyValue{"key", Value{v}}
-			if b := kv.Bools(); b != nil {
-				t.Errorf("expected nil, got %v", b)
-			}
-		}
-	})
-}
--- a/fs/gguf/lazy.go
+++ b/fs/gguf/lazy.go
@@ -1,89 +0,0 @@
-package gguf
-
-import (
-	"encoding/binary"
-	"iter"
-	"log/slog"
-)
-
-type lazy[T any] struct {
-	count  uint64
-	next   func() (T, bool)
-	stop   func()
-	values []T
-
-	// successFunc is called when all values have been successfully read.
-	successFunc func() error
-}
-
-func newLazy[T any](f *File, fn func() (T, error)) (*lazy[T], error) {
-	it := lazy[T]{}
-	if err := binary.Read(f.reader, binary.LittleEndian, &it.count); err != nil {
-		return nil, err
-	}
-
-	it.values = make([]T, 0)
-	it.next, it.stop = iter.Pull(func(yield func(T) bool) {
-		for i := range it.count {
-			t, err := fn()
-			if err != nil {
-				slog.Error("error reading tensor", "index", i, "error", err)
-				return
-			}
-
-			it.values = append(it.values, t)
-			if !yield(t) {
-				break
-			}
-		}
-
-		if it.successFunc != nil {
-			it.successFunc()
-		}
-	})
-
-	return &it, nil
-}
-
-func (g *lazy[T]) Values() iter.Seq[T] {
-	return func(yield func(T) bool) {
-		for _, v := range g.All() {
-			if !yield(v) {
-				break
-			}
-		}
-	}
-}
-
-func (g *lazy[T]) All() iter.Seq2[int, T] {
-	return func(yield func(int, T) bool) {
-		for i := range int(g.count) {
-			if i < len(g.values) {
-				if !yield(i, g.values[i]) {
-					break
-				}
-			} else {
-				t, ok := g.next()
-				if !ok {
-					break
-				}
-
-				if !yield(i, t) {
-					break
-				}
-			}
-		}
-	}
-}
-
-func (g *lazy[T]) rest() (collected bool) {
-	for {
-		_, ok := g.next()
-		collected = collected || ok
-		if !ok {
-			break
-		}
-	}
-
-	return collected
-}
--- a/fs/gguf/reader.go
+++ b/fs/gguf/reader.go
@@ -1,23 +0,0 @@
-package gguf
-
-import (
-	"bufio"
-	"io"
-)
-
-type bufferedReader struct {
-	offset int64
-	*bufio.Reader
-}
-
-func newBufferedReader(rs io.ReadSeeker, size int) *bufferedReader {
-	return &bufferedReader{
-		Reader: bufio.NewReaderSize(rs, size),
-	}
-}
-
-func (rs *bufferedReader) Read(p []byte) (n int, err error) {
-	n, err = rs.Reader.Read(p)
-	rs.offset += int64(n)
-	return n, err
-}
--- a/fs/gguf/tensor.go
+++ b/fs/gguf/tensor.go
@@ -1,288 +0,0 @@
-package gguf
-
-import (
-	"log/slog"
-	"strings"
-)
-
-type TensorInfo struct {
-	Name   string
-	Offset uint64
-	Shape  []uint64
-	Type   TensorType
-}
-
-func (ti TensorInfo) Valid() bool {
-	return ti.Name != "" && ti.NumBytes() > 0
-}
-
-func (ti TensorInfo) NumValues() int64 {
-	var numItems int64 = 1
-	for _, dim := range ti.Shape {
-		numItems *= int64(dim)
-	}
-	return numItems
-}
-
-// NumBytes returns the number of bytes in the tensor.
-func (ti TensorInfo) NumBytes() int64 {
-	return int64(float64(ti.NumValues()) * ti.Type.NumBytes())
-}
-
-func (ti TensorInfo) LogValue() slog.Value {
-	return slog.GroupValue(
-		slog.String("name", ti.Name),
-		slog.Int64("offset", int64(ti.Offset)),
-		slog.Any("shape", ti.Shape),
-		slog.Int64("num_values", ti.NumValues()),
-		slog.Int64("num_bytes", ti.NumBytes()),
-		slog.Any("type", ti.Type),
-	)
-}
-
-type TensorType uint32
-
-const (
-	TensorTypeF32 TensorType = iota
-	TensorTypeF16
-	TensorTypeQ4_0
-	TensorTypeQ4_1
-
-	// unexported // unused in gguf
-	tensorTypeQ4_2
-	tensorTypeQ4_3
-
-	TensorTypeQ5_0
-	TensorTypeQ5_1
-	TensorTypeQ8_0
-	TensorTypeQ8_1
-	TensorTypeQ2_K
-	TensorTypeQ3_K
-	TensorTypeQ4_K
-	TensorTypeQ5_K
-	TensorTypeQ6_K
-	TensorTypeQ8_K
-
-	// unexported // unquantizable by ollama
-	tensorTypeIQ2_XXS
-	tensorTypeIQ2_XS
-	tensorTypeIQ3_XXS
-	tensorTypeIQ1_S
-	tensorTypeIQ4_NL
-	tensorTypeIQ3_S
-	tensorTypeIQ2_S
-	tensorTypeIQ4_XS
-
-	TensorTypeI8
-	TensorTypeI16
-	TensorTypeI32
-	TensorTypeI64
-	TensorTypeF64
-
-	// unexported // unquantizable by ollama
-	tensorTypeIQ1_M
-
-	TensorTypeBF16
-
-	// unexported // unused in gguf
-	tensorTypeQ4_0_4_4
-	tensorTypeQ4_0_4_8
-	tensorTypeQ4_0_8_8
-
-	// unexported // unquantizable by ollama
-	tensorTypeTQ1_0
-	tensorTypeTQ2_0
-
-	// unexported // unused in gguf
-	tensorTypeIQ4_NL_4_4
-	tensorTypeIQ4_NL_4_8
-	tensorTypeIQ4_NL_8_8
-)
-
-func (tt TensorType) NumBytes() float64 {
-	return float64(tt.typeSize()) / float64(tt.blockSize())
-}
-
-func (tt TensorType) typeSize() int64 {
-	switch tt {
-	case TensorTypeF32:
-		return 4
-	case TensorTypeF16:
-		return 2
-	case TensorTypeQ4_0:
-		return 2 + tt.blockSize()/2
-	case TensorTypeQ4_1:
-		return 2 + 2 + tt.blockSize()/2
-	case TensorTypeQ5_0:
-		return 2 + 4 + tt.blockSize()/2
-	case TensorTypeQ5_1:
-		return 2 + 2 + 4 + tt.blockSize()/2
-	case TensorTypeQ8_0:
-		return 2 + tt.blockSize()
-	case TensorTypeQ8_1:
-		return 2 + 2 + tt.blockSize()
-	case TensorTypeQ2_K:
-		return tt.blockSize()/16 + tt.blockSize()/4 + 2 + 2
-	case TensorTypeQ3_K:
-		return tt.blockSize()/8 + tt.blockSize()/4 + 12 + 2
-	case TensorTypeQ4_K:
-		return 2 + 2 + 12 + tt.blockSize()/2
-	case TensorTypeQ5_K:
-		return 2 + 2 + 12 + tt.blockSize()/8 + tt.blockSize()/2
-	case TensorTypeQ6_K:
-		return tt.blockSize()/2 + tt.blockSize()/4 + tt.blockSize()/16 + 2
-	case TensorTypeQ8_K:
-		return 4 + tt.blockSize() + 2*tt.blockSize()/16
-	case tensorTypeIQ2_XXS:
-		return 2 + 2*tt.blockSize()/8
-	case tensorTypeIQ2_XS:
-		return 2 + 2*tt.blockSize()/8 + tt.blockSize()/32
-	case tensorTypeIQ3_XXS:
-		return 2 + tt.blockSize()/4 + tt.blockSize()/8
-	case tensorTypeIQ1_S:
-		return 2 + tt.blockSize()/8 + tt.blockSize()/16
-	case tensorTypeIQ4_NL:
-		return 2 + tt.blockSize()/2
-	case tensorTypeIQ3_S:
-		return 2 + tt.blockSize()/4 + tt.blockSize()/8 + tt.blockSize()/32 + 4
-	case tensorTypeIQ2_S:
-		return 2 + tt.blockSize()/4 + tt.blockSize()/16
-	case tensorTypeIQ4_XS:
-		return 2 + 2 + tt.blockSize()/2 + tt.blockSize()/64
-	case TensorTypeI8:
-		return 1
-	case TensorTypeI16:
-		return 2
-	case TensorTypeI32:
-		return 4
-	case TensorTypeI64:
-		return 8
-	case TensorTypeF64:
-		return 8
-	case tensorTypeIQ1_M:
-		return tt.blockSize()/8 + tt.blockSize()/16 + tt.blockSize()/32
-	case TensorTypeBF16:
-		return 2
-	default:
-		return 0
-	}
-}
-
-func (tt TensorType) blockSize() int64 {
-	switch tt {
-	case TensorTypeF32,
-		TensorTypeF16,
-		TensorTypeI8,
-		TensorTypeI16,
-		TensorTypeI32,
-		TensorTypeI64,
-		TensorTypeF64,
-		TensorTypeBF16:
-		return 1
-	case TensorTypeQ4_0,
-		TensorTypeQ4_1,
-		TensorTypeQ5_0,
-		TensorTypeQ5_1,
-		TensorTypeQ8_0,
-		TensorTypeQ8_1,
-		tensorTypeIQ4_NL:
-		return 32
-	default:
-		return 256
-	}
-}
-
-func (tt TensorType) String() string {
-	switch tt {
-	case TensorTypeF32:
-		return "f32"
-	case TensorTypeF16:
-		return "f16"
-	case TensorTypeQ4_0:
-		return "q4_0"
-	case TensorTypeQ4_1:
-		return "q4_1"
-	case tensorTypeQ4_2:
-		return "q4_2"
-	case tensorTypeQ4_3:
-		return "q4_3"
-	case TensorTypeQ5_0:
-		return "q5_0"
-	case TensorTypeQ5_1:
-		return "q5_1"
-	case TensorTypeQ8_0:
-		return "q8_0"
-	case TensorTypeQ8_1:
-		return "q8_1"
-	case TensorTypeQ2_K:
-		return "q2_k"
-	case TensorTypeQ3_K:
-		return "q3_k"
-	case TensorTypeQ4_K:
-		return "q4_k"
-	case TensorTypeQ5_K:
-		return "q5_k"
-	case TensorTypeQ6_K:
-		return "q6_k"
-	case TensorTypeQ8_K:
-		return "q8_k"
-	case tensorTypeIQ2_XXS:
-		return "iq2_xxs"
-	case tensorTypeIQ2_XS:
-		return "iq2_xs"
-	case tensorTypeIQ3_XXS:
-		return "iq3_xxs"
-	case tensorTypeIQ1_S:
-		return "iq1_s"
-	case tensorTypeIQ4_NL:
-		return "iq4_nl"
-	case tensorTypeIQ3_S:
-		return "iq3_s"
-	case tensorTypeIQ2_S:
-		return "iq2_s"
-	case tensorTypeIQ4_XS:
-		return "iq4_xs"
-	case TensorTypeI8:
-		return "i8"
-	case TensorTypeI16:
-		return "i16"
-	case TensorTypeI32:
-		return "i32"
-	case TensorTypeI64:
-		return "i64"
-	case TensorTypeF64:
-		return "f64"
-	case tensorTypeIQ1_M:
-		return "iq1_m"
-	case TensorTypeBF16:
-		return "bf16"
-	case tensorTypeQ4_0_4_4:
-		return "q4_0_4_4"
-	case tensorTypeQ4_0_4_8:
-		return "q4_0_4_8"
-	case tensorTypeQ4_0_8_8:
-		return "q4_0_8_8"
-	case tensorTypeTQ1_0:
-		return "tq1_0"
-	case tensorTypeTQ2_0:
-		return "tq2_0"
-	case tensorTypeIQ4_NL_4_4:
-		return "iq4_nl_4_4"
-	case tensorTypeIQ4_NL_4_8:
-		return "iq4_nl_4_8"
-	case tensorTypeIQ4_NL_8_8:
-		return "iq4_nl_8_8"
-	default:
-		return "unknown"
-	}
-}
-
-func (tt TensorType) LogValue() slog.Value {
-	return slog.GroupValue(
-		slog.Uint64("value", uint64(tt)),
-		slog.String("name", strings.ToUpper(tt.String())),
-		slog.Int64("size", tt.typeSize()),
-		slog.Int64("block_size", tt.blockSize()),
-		slog.Float64("num_bytes", tt.NumBytes()),
-	)
-}
--- a/go.mod
+++ b/go.mod
@@ -19,7 +19,7 @@ require (
 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
 	github.com/dlclark/regexp2 v1.11.4
 	github.com/emirpasic/gods/v2 v2.0.0-alpha
-	github.com/google/go-cmp v0.7.0
+	github.com/google/go-cmp v0.6.0
 	github.com/mattn/go-runewidth v0.0.14
 	github.com/nlpodyssey/gopickle v0.3.0
 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
--- a/go.sum
+++ b/go.sum
@@ -112,8 +112,8 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
 github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
-github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
-github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
+github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
+github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
 github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
--- a/llama/patches/0017-ggml-Export-GPU-UUIDs.patch
+++ b/llama/patches/0017-ggml-Export-GPU-UUIDs.patch
@@ -1,102 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Jesse Gross <jesse@ollama.com>
-Date: Thu, 24 Apr 2025 14:48:51 -0700
-Subject: [PATCH] ggml: Export GPU UUIDs
-
-This enables matching up devices and information reported by the backend
-with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
---
- ggml/include/ggml-backend.h      |  1 +
- ggml/src/ggml-cuda/ggml-cuda.cu  | 33 ++++++++++++++++++++++++++++++++
- ggml/src/ggml-metal/ggml-metal.m |  1 +
- 3 files changed, 35 insertions(+)
-
-diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
-index 74e46716..a880df33 100644
--- a/ggml/include/ggml-backend.h
-+++ b/ggml/include/ggml-backend.h
-@@ -152,6 +152,7 @@ extern "C" {
-     struct ggml_backend_dev_props {
-         const char * name;
-         const char * description;
-+        const char * uuid;
-         size_t memory_free;
-         size_t memory_total;
-         enum ggml_backend_dev_type type;
-diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index cb0d8528..4c829153 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
-+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
-@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
-     int device;
-     std::string name;
-     std::string description;
-+    std::string uuid;
- };
- 
- static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
-@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
-     return ctx->description.c_str();
- }
- 
-+static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
-+    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-+    return ctx->uuid.c_str();
-+}
-+
- static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-     ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-     ggml_cuda_set_device(ctx->device);
-@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
- static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
-     props->name        = ggml_backend_cuda_device_get_name(dev);
-     props->description = ggml_backend_cuda_device_get_description(dev);
-+    props->uuid        = ggml_backend_cuda_device_get_uuid(dev);
-     props->type        = ggml_backend_cuda_device_get_type(dev);
-     ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
- 
-@@ -3458,6 +3465,32 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
-                 CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
-                 dev_ctx->description = prop.name;
- 
-+                #if !defined(GGML_USE_HIP)
-+                char uuid[64];
-+                snprintf(uuid, sizeof(uuid),
-+                    "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-+                    (unsigned char)prop.uuid.bytes[0],
-+                    (unsigned char)prop.uuid.bytes[1],
-+                    (unsigned char)prop.uuid.bytes[2],
-+                    (unsigned char)prop.uuid.bytes[3],
-+                    (unsigned char)prop.uuid.bytes[4],
-+                    (unsigned char)prop.uuid.bytes[5],
-+                    (unsigned char)prop.uuid.bytes[6],
-+                    (unsigned char)prop.uuid.bytes[7],
-+                    (unsigned char)prop.uuid.bytes[8],
-+                    (unsigned char)prop.uuid.bytes[9],
-+                    (unsigned char)prop.uuid.bytes[10],
-+                    (unsigned char)prop.uuid.bytes[11],
-+                    (unsigned char)prop.uuid.bytes[12],
-+                    (unsigned char)prop.uuid.bytes[13],
-+                    (unsigned char)prop.uuid.bytes[14],
-+                    (unsigned char)prop.uuid.bytes[15]
-+                  );
-+                dev_ctx->uuid = uuid;
-+                #else
-+                dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
-+                #endif
-+
-                 ggml_backend_dev_t dev = new ggml_backend_device {
-                     /* .iface   = */ ggml_backend_cuda_device_interface,
-                     /* .reg     = */ &reg,
-diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
-index 1b56f858..ee4f2dcb 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
-+++ b/ggml/src/ggml-metal/ggml-metal.m
-@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
- static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
-     props->name        = ggml_backend_metal_device_get_name(dev);
-     props->description = ggml_backend_metal_device_get_description(dev);
-+    props->uuid        = "0";
-     props->type        = ggml_backend_metal_device_get_type(dev);
-     ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
-     props->caps = (struct ggml_backend_dev_caps) {
--- a/llm/server.go
+++ b/llm/server.go
@@ -22,7 +22,6 @@ import (
 	"strings"
 	"sync"
 	"time"
-	"unicode/utf8"

 	"golang.org/x/sync/semaphore"

@@ -726,68 +725,10 @@ type CompletionResponse struct {
 	EvalDuration       time.Duration `json:"eval_duration"`
 }

-// unicodeBufferHandler wraps a completion response callback to handle partial UTF-8 sequences.
-// This function creates a stateful closure that is NOT safe for concurrent use.
-// Each completion request should create its own handler instance.
-func unicodeBufferHandler(fn func(CompletionResponse)) func(CompletionResponse) {
-	var pendingUTF8 string
-
-	return func(resp CompletionResponse) {
-		if resp.Content == "" && !resp.Done {
-			// No content to process, just pass through
-			fn(resp)
-			return
-		}
-
-		// Combine any pending UTF-8 with current content
-		combinedContent := pendingUTF8 + resp.Content
-		pendingUTF8 = ""
-
-		// Check if combined content is valid UTF-8
-		if utf8.ValidString(combinedContent) {
-			// Valid UTF-8, send it
-			resp.Content = combinedContent
-			fn(resp)
-		} else {
-			// Invalid UTF-8
-			if resp.Done {
-				// This is the final response, trim incomplete UTF-8
-				trimmedContent := combinedContent
-				for !utf8.ValidString(trimmedContent) && len(trimmedContent) > 0 {
-					trimmedContent = trimmedContent[:len(trimmedContent)-1]
-				}
-				resp.Content = trimmedContent
-				fn(resp)
-			} else {
-				// Not final response, split valid and invalid parts
-				validPrefix := combinedContent
-				for !utf8.ValidString(validPrefix) && len(validPrefix) > 0 {
-					validPrefix = validPrefix[:len(validPrefix)-1]
-				}
-
-				if len(validPrefix) > 0 {
-					// Send valid prefix
-					resp.Content = validPrefix
-					fn(resp)
-					// Buffer the remainder
-					pendingUTF8 = combinedContent[len(validPrefix):]
-				} else {
-					// No valid prefix, buffer everything
-					pendingUTF8 = combinedContent
-					// Don't send this response
-				}
-			}
-		}
-	}
-}
-
 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
 	slog.Debug("completion request", "images", len(req.Images), "prompt", len(req.Prompt), "format", string(req.Format))
 	slog.Log(ctx, logutil.LevelTrace, "completion request", "prompt", req.Prompt)

-	// Wrap the callback with unicode buffer handling
-	unicodeFn := unicodeBufferHandler(fn)
-
 	if len(req.Format) > 0 {
 		switch string(req.Format) {
 		case `null`, `""`:
@@ -913,13 +854,13 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 			}

 			if c.Content != "" {
-				unicodeFn(CompletionResponse{
+				fn(CompletionResponse{
 					Content: c.Content,
 				})
 			}

 			if c.Done {
-				unicodeFn(c)
+				fn(c)
 				return nil
 			}
 		}
--- a/llm/server_test.go
+++ b/llm/server_test.go
@@ -70,152 +70,3 @@ func TestLLMServerCompletionFormat(t *testing.T) {
 	}, nil)
 	checkValid(err)
 }
-
-func TestUnicodeBufferHandler(t *testing.T) {
-	tests := []struct {
-		name              string
-		inputResponses    []CompletionResponse
-		expectedResponses []CompletionResponse
-		description       string
-	}{
-		{
-			name: "complete_unicode",
-			inputResponses: []CompletionResponse{
-				{Content: "Hello", Done: false},
-				{Content: " world", Done: false},
-				{Content: "!", Done: true},
-			},
-			expectedResponses: []CompletionResponse{
-				{Content: "Hello", Done: false},
-				{Content: " world", Done: false},
-				{Content: "!", Done: true},
-			},
-			description: "All responses with valid unicode should pass through unchanged",
-		},
-		{
-			name: "incomplete_unicode_at_end_with_done",
-			inputResponses: []CompletionResponse{
-				{Content: "Hello", Done: false},
-				{Content: string([]byte{0xF0, 0x9F}), Done: true}, // Incomplete emoji with Done=true
-			},
-			expectedResponses: []CompletionResponse{
-				{Content: "Hello", Done: false},
-				{Content: "", Done: true}, // Content is trimmed but response is still sent with Done=true
-			},
-			description: "When Done=true, incomplete Unicode at the end should be trimmed",
-		},
-		{
-			name: "split_unicode_across_responses",
-			inputResponses: []CompletionResponse{
-				{Content: "Hello " + string([]byte{0xF0, 0x9F}), Done: false}, // First part of 😀
-				{Content: string([]byte{0x98, 0x80}) + " world!", Done: true}, // Second part of 😀 and more text
-			},
-			expectedResponses: []CompletionResponse{
-				{Content: "Hello ", Done: false},  // Incomplete Unicode trimmed
-				{Content: "😀 world!", Done: true}, // Complete emoji in second response
-			},
-			description: "Unicode split across responses should be handled correctly",
-		},
-		{
-			name: "incomplete_unicode_buffered",
-			inputResponses: []CompletionResponse{
-				{Content: "Test " + string([]byte{0xF0, 0x9F}), Done: false}, // Incomplete emoji
-				{Content: string([]byte{0x98, 0x80}), Done: false},           // Complete the emoji
-				{Content: " done", Done: true},
-			},
-			expectedResponses: []CompletionResponse{
-				{Content: "Test ", Done: false}, // First part without incomplete unicode
-				{Content: "😀", Done: false},     // Complete emoji
-				{Content: " done", Done: true},
-			},
-			description: "Incomplete unicode should be buffered and combined with next response",
-		},
-		{
-			name: "empty_response_with_done",
-			inputResponses: []CompletionResponse{
-				{Content: "Complete response", Done: false},
-				{Content: "", Done: true}, // Empty response with Done=true
-			},
-			expectedResponses: []CompletionResponse{
-				{Content: "Complete response", Done: false},
-				{Content: "", Done: true}, // Should still be sent because Done=true
-			},
-			description: "Empty final response with Done=true should still be sent",
-		},
-		{
-			name: "done_reason_preserved",
-			inputResponses: []CompletionResponse{
-				{Content: "Response", Done: false},
-				{Content: " complete", Done: true, DoneReason: DoneReasonStop},
-			},
-			expectedResponses: []CompletionResponse{
-				{Content: "Response", Done: false},
-				{Content: " complete", Done: true, DoneReason: DoneReasonStop},
-			},
-			description: "DoneReason should be preserved in the final response",
-		},
-		{
-			name: "only_incomplete_unicode_not_done",
-			inputResponses: []CompletionResponse{
-				{Content: string([]byte{0xF0, 0x9F}), Done: false}, // Only incomplete unicode
-			},
-			expectedResponses: []CompletionResponse{
-				// No response expected - should be buffered
-			},
-			description: "Response with only incomplete unicode should be buffered if not done",
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			var actualResponses []CompletionResponse
-
-			// Create a callback that collects responses
-			callback := func(resp CompletionResponse) {
-				actualResponses = append(actualResponses, resp)
-			}
-
-			// Create the unicode buffer handler
-			handler := unicodeBufferHandler(callback)
-
-			// Send all input responses through the handler
-			for _, resp := range tt.inputResponses {
-				handler(resp)
-			}
-
-			// Verify the number of responses
-			if len(actualResponses) != len(tt.expectedResponses) {
-				t.Fatalf("%s: got %d responses, want %d responses",
-					tt.description, len(actualResponses), len(tt.expectedResponses))
-			}
-
-			// Verify each response matches the expected one
-			for i, expected := range tt.expectedResponses {
-				if i >= len(actualResponses) {
-					t.Fatalf("%s: missing response at index %d", tt.description, i)
-					continue
-				}
-
-				actual := actualResponses[i]
-
-				// Verify content
-				if actual.Content != expected.Content {
-					t.Errorf("%s: response[%d].Content = %q, want %q",
-						tt.description, i, actual.Content, expected.Content)
-				}
-
-				// Verify Done flag
-				if actual.Done != expected.Done {
-					t.Errorf("%s: response[%d].Done = %v, want %v",
-						tt.description, i, actual.Done, expected.Done)
-				}
-
-				// Verify DoneReason if specified
-				if actual.DoneReason != expected.DoneReason {
-					t.Errorf("%s: response[%d].DoneReason = %v, want %v",
-						tt.description, i, actual.DoneReason, expected.DoneReason)
-				}
-			}
-		})
-	}
-}
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -124,10 +124,6 @@ type DeviceMemory struct {
 	// may not be persistent across instances of the runner.
 	Name string

-	// UUID is a unique persistent identifier for the device for matching
-	// with system management libraries
-	UUID string
-
 	// Weights is the per-layer memory needed for the model weights.
 	Weights []Memory

@@ -156,10 +152,6 @@ func (m DeviceMemory) LogValue() slog.Value {
 		attrs = append(attrs, slog.Any("Graph", m.Graph))
 	}

-	if len(attrs) > 0 && m.UUID != "" {
-		attrs = append([]slog.Attr{slog.String("UUID", m.UUID)}, attrs...)
-	}
-
 	return slog.GroupValue(attrs...)
 }

--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -136,9 +136,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 	}

 	requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
-	var props C.struct_ggml_backend_dev_props
-	C.ggml_backend_dev_get_props(cpuDeviceBufferType.d, &props)
-	requiredMemory.CPU.UUID = C.GoString(props.uuid)
 	requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
 	requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)

@@ -153,9 +150,6 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 		})
 		btDeviceMemory[bt] = &requiredMemory.GPUs[i]
 		requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
-		var props C.struct_ggml_backend_dev_props
-		C.ggml_backend_dev_get_props(d, &props)
-		requiredMemory.GPUs[i].UUID = C.GoString(props.uuid)
 		requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
 		requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
 	}
--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@@ -152,7 +152,6 @@ extern "C" {
    struct ggml_backend_dev_props {
        const char * name;
        const char * description;
-        const char * uuid;
        size_t memory_free;
        size_t memory_total;
        enum ggml_backend_dev_type type;
--- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2884,7 +2884,6 @@ struct ggml_backend_cuda_device_context {
    int device;
    std::string name;
    std::string description;
-    std::string uuid;
 };

 static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -2897,11 +2896,6 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
    return ctx->description.c_str();
 }

-static const char * ggml_backend_cuda_device_get_uuid(ggml_backend_dev_t dev) {
-    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
-    return ctx->uuid.c_str();
-}
-
 static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
    ggml_cuda_set_device(ctx->device);
@@ -2916,7 +2910,6 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
 static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
    props->name        = ggml_backend_cuda_device_get_name(dev);
    props->description = ggml_backend_cuda_device_get_description(dev);
-    props->uuid        = ggml_backend_cuda_device_get_uuid(dev);
    props->type        = ggml_backend_cuda_device_get_type(dev);
    ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);

@@ -3465,32 +3458,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
                CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
                dev_ctx->description = prop.name;

-                #if !defined(GGML_USE_HIP)
-                char uuid[64];
-                snprintf(uuid, sizeof(uuid),
-                    "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-                    (unsigned char)prop.uuid.bytes[0],
-                    (unsigned char)prop.uuid.bytes[1],
-                    (unsigned char)prop.uuid.bytes[2],
-                    (unsigned char)prop.uuid.bytes[3],
-                    (unsigned char)prop.uuid.bytes[4],
-                    (unsigned char)prop.uuid.bytes[5],
-                    (unsigned char)prop.uuid.bytes[6],
-                    (unsigned char)prop.uuid.bytes[7],
-                    (unsigned char)prop.uuid.bytes[8],
-                    (unsigned char)prop.uuid.bytes[9],
-                    (unsigned char)prop.uuid.bytes[10],
-                    (unsigned char)prop.uuid.bytes[11],
-                    (unsigned char)prop.uuid.bytes[12],
-                    (unsigned char)prop.uuid.bytes[13],
-                    (unsigned char)prop.uuid.bytes[14],
-                    (unsigned char)prop.uuid.bytes[15]
-                  );
-                dev_ctx->uuid = uuid;
-                #else
-                dev_ctx->uuid = "GPU-" + std::string(prop.uuid.bytes, 16);
-                #endif
-
                ggml_backend_dev_t dev = new ggml_backend_device {
                    /* .iface   = */ ggml_backend_cuda_device_interface,
                    /* .reg     = */ &reg,
--- a/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
@@ -5703,7 +5703,6 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
 static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    props->name        = ggml_backend_metal_device_get_name(dev);
    props->description = ggml_backend_metal_device_get_description(dev);
-    props->uuid        = "0";
    props->type        = ggml_backend_metal_device_get_type(dev);
    ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
    props->caps = (struct ggml_backend_dev_caps) {
--- a/model/models/llama4/model_text.go
+++ b/model/models/llama4/model_text.go
@@ -63,9 +63,9 @@ func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOp
 }

 type TextExperts struct {
-	Gate *nn.Linear `gguf:"ffn_gate_exps"`
-	Up   *nn.Linear `gguf:"ffn_up_exps"`
-	Down *nn.Linear `gguf:"ffn_down_exps"`
+	Gate ml.Tensor `gguf:"ffn_gate_exps.weight"`
+	Up   ml.Tensor `gguf:"ffn_up_exps.weight"`
+	Down ml.Tensor `gguf:"ffn_down_exps.weight"`
 }

 func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tensor, opts *TextOptions) ml.Tensor {
@@ -76,9 +76,9 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens
 	hiddenStates = hiddenStates.Repeat(ctx, 1, opts.numExpertsUsed)
 	hiddenStates = hiddenStates.Mul(ctx, scores)

-	upStates := e.Up.Weight.MulmatID(ctx, hiddenStates, experts)
-	gateStates := e.Gate.Weight.MulmatID(ctx, hiddenStates, experts)
-	downStates := e.Down.Weight.MulmatID(ctx, upStates.Mul(ctx, gateStates.SILU(ctx)), experts)
+	upStates := e.Up.MulmatID(ctx, hiddenStates, experts)
+	gateStates := e.Gate.MulmatID(ctx, hiddenStates, experts)
+	downStates := e.Down.MulmatID(ctx, upStates.Mul(ctx, gateStates.SILU(ctx)), experts)

 	nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))
 	for i := 1; i < opts.numExpertsUsed; i++ {
--- a/model/models/qwen3/model.go
+++ b/model/models/qwen3/model.go
@@ -66,9 +66,9 @@ type MLP interface {

 type sparse struct {
 	Router *nn.Linear `gguf:"ffn_gate_inp"`
-	Gate   *nn.Linear `gguf:"ffn_gate_exps"`
-	Up     *nn.Linear `gguf:"ffn_up_exps"`
-	Down   *nn.Linear `gguf:"ffn_down_exps"`
+	Gate   ml.Tensor  `gguf:"ffn_gate_exps.weight"`
+	Up     ml.Tensor  `gguf:"ffn_up_exps.weight"`
+	Down   ml.Tensor  `gguf:"ffn_down_exps.weight"`
 }

 func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
@@ -87,13 +87,13 @@ func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options

 	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))

-	upStates := mlp.Up.Weight.MulmatID(ctx, hiddenStates, selectedExperts)
+	upStates := mlp.Up.MulmatID(ctx, hiddenStates, selectedExperts)

-	hiddenStates = mlp.Gate.Weight.MulmatID(ctx, hiddenStates, selectedExperts)
+	hiddenStates = mlp.Gate.MulmatID(ctx, hiddenStates, selectedExperts)
 	hiddenStates = hiddenStates.SILU(ctx)
 	hiddenStates = hiddenStates.Mul(ctx, upStates)

-	experts := mlp.Down.Weight.MulmatID(ctx, hiddenStates, selectedExperts)
+	experts := mlp.Down.MulmatID(ctx, hiddenStates, selectedExperts)
 	experts = experts.Mul(ctx, routingWeights)

 	nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
--- a/parser/parser.go
+++ b/parser/parser.go
@@ -292,18 +292,13 @@ func filesForModel(path string) ([]string, error) {
 	}
 	files = append(files, js...)

-	// only include tokenizer.model is tokenizer.json is not present
-	if !slices.ContainsFunc(files, func(s string) bool {
-		return slices.Contains(strings.Split(s, string(os.PathSeparator)), "tokenizer.json")
-	}) {
-		if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
-			// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
-			// tokenizer.model might be a unresolved git lfs reference; error if it is
-			files = append(files, tks...)
-		} else if tks, _ := glob(filepath.Join(path, "**/tokenizer.model"), "text/plain"); len(tks) > 0 {
-			// some times tokenizer.model is in a subdirectory (e.g. meta-llama/Meta-Llama-3-8B)
-			files = append(files, tks...)
-		}
+	if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
+		// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
+		// tokenizer.model might be a unresolved git lfs reference; error if it is
+		files = append(files, tks...)
+	} else if tks, _ := glob(filepath.Join(path, "**/tokenizer.model"), "text/plain"); len(tks) > 0 {
+		// some times tokenizer.model is in a subdirectory (e.g. meta-llama/Meta-Llama-3-8B)
+		files = append(files, tks...)
 	}

 	return files, nil
--- a/runner/common/stop.go
+++ b/runner/common/stop.go
@@ -2,8 +2,6 @@ package common

 import (
 	"strings"
-
-	"github.com/ollama/ollama/llm"
 )

 func FindStop(sequence string, stops []string) (bool, string) {
@@ -31,41 +29,68 @@ func ContainsStopSuffix(sequence string, stops []string) bool {
 // truncateStop removes the provided stop string from pieces,
 // returning the partial pieces with stop removed, including truncating
 // the last piece if required (and signalling if this was the case)
-func TruncateStop(resps []llm.CompletionResponse, stop string) ([]llm.CompletionResponse, bool) {
-	var sequence string
-	for _, resp := range resps {
-		sequence += resp.Content
+func TruncateStop(pieces []string, stop string) ([]string, bool) {
+	joined := strings.Join(pieces, "")
+
+	index := strings.Index(joined, stop)
+	if index == -1 {
+		return pieces, false
 	}

-	idx := strings.Index(sequence, stop)
-	if idx < 0 {
-		return resps, false
+	joined = joined[:index]
+
+	// Split truncated string back into pieces of original lengths
+	lengths := make([]int, len(pieces))
+	for i, piece := range pieces {
+		lengths[i] = len(piece)
 	}

-	truncated := sequence[:idx]
-	if len(truncated) == 0 {
-		return nil, true
-	}
-
-	result := make([]llm.CompletionResponse, 0, len(resps))
-
-	// Track position in truncated sequence
-	pos := 0
-	truncationHappened := false
-	for _, resp := range resps {
-		if pos >= len(truncated) {
+	var result []string
+	tokenTruncated := false
+	start := 0
+	for _, length := range lengths {
+		if start >= len(joined) {
 			break
 		}

-		chunk := truncated[pos:min(pos+len(resp.Content), len(truncated))]
-		if len(chunk) < len(resp.Content) {
-			truncationHappened = true
+		end := start + length
+		if end > len(joined) {
+			end = len(joined)
+			tokenTruncated = true
 		}
-		if len(chunk) > 0 {
-			result = append(result, llm.CompletionResponse{Content: chunk})
-		}
-		pos += len(resp.Content)
+		result = append(result, joined[start:end])
+		start = end
 	}

-	return result, truncationHappened
+	return result, tokenTruncated
+}
+
+func IncompleteUnicode(token string) bool {
+	incomplete := false
+
+	// check if there is incomplete UTF-8 character at the end
+	for i := 1; i < 5 && i <= len(token); i++ {
+		c := token[len(token)-i]
+
+		if (c & 0xc0) == 0x80 {
+			// continuation byte: 10xxxxxx
+			continue
+		}
+
+		if (c & 0xe0) == 0xc0 {
+			// 2-byte character: 110xxxxx ...
+			incomplete = i < 2
+		} else if (c & 0xf0) == 0xe0 {
+			// 3-byte character: 1110xxxx ...
+			incomplete = i < 3
+		} else if (c & 0xf8) == 0xf0 {
+			// 4-byte character: 11110xxx ...
+			incomplete = i < 4
+		}
+
+		// else 1-byte character or invalid byte
+		break
+	}
+
+	return incomplete
 }
--- a/runner/common/stop_test.go
+++ b/runner/common/stop_test.go
@@ -1,84 +1,51 @@
 package common

 import (
-	"fmt"
 	"reflect"
 	"testing"
-
-	"github.com/ollama/ollama/llm"
 )

 func TestTruncateStop(t *testing.T) {
 	tests := []struct {
 		name          string
-		pieces        []llm.CompletionResponse
+		pieces        []string
 		stop          string
-		expected      []llm.CompletionResponse
+		expected      []string
 		expectedTrunc bool
 	}{
 		{
-			name: "Single word",
-			pieces: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: "world"},
-			},
-			stop: "world",
-			expected: []llm.CompletionResponse{
-				{Content: "Hello"},
-			},
+			name:          "Single word",
+			pieces:        []string{"hello", "world"},
+			stop:          "world",
+			expected:      []string{"hello"},
 			expectedTrunc: false,
 		},
 		{
-			name: "Partial",
-			pieces: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: " wor"},
-			},
-			stop: "or",
-			expected: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: " w"},
-			},
+			name:          "Partial",
+			pieces:        []string{"hello", "wor"},
+			stop:          "or",
+			expected:      []string{"hello", "w"},
 			expectedTrunc: true,
 		},
 		{
-			name: "Suffix",
-			pieces: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: " there"},
-				{Content: "!"},
-			},
-			stop: "!",
-			expected: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: " there"},
-			},
+			name:          "Suffix",
+			pieces:        []string{"Hello", " there", "!"},
+			stop:          "!",
+			expected:      []string{"Hello", " there"},
 			expectedTrunc: false,
 		},
 		{
-			name: "Suffix partial",
-			pieces: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: " the"},
-				{Content: "re!"},
-			},
-			stop: "there!",
-			expected: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: " "},
-			},
+			name:          "Suffix partial",
+			pieces:        []string{"Hello", " the", "re!"},
+			stop:          "there!",
+			expected:      []string{"Hello", " "},
 			expectedTrunc: true,
 		},
 		{
-			name: "Middle",
-			pieces: []llm.CompletionResponse{
-				{Content: "Hello"},
-				{Content: " wo"},
-			},
-			stop: "llo w",
-			expected: []llm.CompletionResponse{
-				{Content: "He"},
-			},
+			name:          "Middle",
+			pieces:        []string{"hello", " wor"},
+			stop:          "llo w",
+			expected:      []string{"he"},
 			expectedTrunc: true,
 		},
 	}
@@ -87,23 +54,76 @@ func TestTruncateStop(t *testing.T) {
 		t.Run(tt.name, func(t *testing.T) {
 			result, resultTrunc := TruncateStop(tt.pieces, tt.stop)
 			if !reflect.DeepEqual(result, tt.expected) || resultTrunc != tt.expectedTrunc {
-				t.Errorf("truncateStop(%v, %v):\n%shave truncated %v\nwant truncated %v",
-					tt.pieces, tt.stop, formatContentDiff(result, tt.expected), resultTrunc, tt.expectedTrunc)
+				t.Errorf("truncateStop(%v, %s): have %v (%v); want %v (%v)", tt.pieces, tt.stop, result, resultTrunc, tt.expected, tt.expectedTrunc)
 			}
 		})
 	}
 }

-func formatContentDiff(result, expected []llm.CompletionResponse) string {
-	var s string
-	for i := 0; i < len(result) || i < len(expected); i++ {
-		if i < len(result) && i < len(expected) && result[i].Content != expected[i].Content {
-			s += fmt.Sprintf("[%d] %q vs %q\n", i, result[i].Content, expected[i].Content)
-		} else if i < len(result) && i >= len(expected) {
-			s += fmt.Sprintf("[%d] extra %q\n", i, result[i].Content)
-		} else if i >= len(result) && i < len(expected) {
-			s += fmt.Sprintf("[%d] missing %q\n", i, expected[i].Content)
-		}
+func TestIncompleteUnicode(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected bool
+	}{
+		{
+			name:     "Basic",
+			input:    "hi",
+			expected: false,
+		},
+		{
+			name:     "Two byte",
+			input:    "hi" + string([]byte{0xc2, 0xa3}),
+			expected: false,
+		},
+		{
+			name:     "Two byte - missing last",
+			input:    "hi" + string([]byte{0xc2}),
+			expected: true,
+		},
+		{
+			name:     "Three byte",
+			input:    "hi" + string([]byte{0xe0, 0xA0, 0x80}),
+			expected: false,
+		},
+		{
+			name:     "Three byte - missing last",
+			input:    "hi" + string([]byte{0xe0, 0xA0}),
+			expected: true,
+		},
+		{
+			name:     "Three byte - missing last 2",
+			input:    "hi" + string([]byte{0xe0}),
+			expected: true,
+		},
+		{
+			name:     "Four byte",
+			input:    "hi" + string([]byte{0xf0, 0x92, 0x8a, 0xb7}),
+			expected: false,
+		},
+		{
+			name:     "Four byte - missing last",
+			input:    "hi" + string([]byte{0xf0, 0x92, 0x8a}),
+			expected: true,
+		},
+		{
+			name:     "Four byte - missing last 2",
+			input:    "hi" + string([]byte{0xf0, 0x92}),
+			expected: true,
+		},
+		{
+			name:     "Four byte - missing last 3",
+			input:    "hi" + string([]byte{0xf0}),
+			expected: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := IncompleteUnicode(tt.input)
+			if result != tt.expected {
+				t.Errorf("incompleteUnicode(%s): have %v; want %v", tt.input, result, tt.expected)
+			}
+		})
 	}
-	return s
 }
--- a/runner/llamarunner/runner.go
+++ b/runner/llamarunner/runner.go
@@ -17,6 +17,7 @@ import (
 	"strings"
 	"sync"
 	"time"
+	"unicode/utf8"

 	"golang.org/x/sync/semaphore"

@@ -51,13 +52,13 @@ type Sequence struct {
 	pendingInputs []input

 	// tokens that have been generated but not returned yet (e.g. for stop sequences)
-	pendingResponses []llm.CompletionResponse
+	pendingResponses []string

 	// input cache being used by this sequence
 	cache *InputCacheSlot

 	// channel to send responses over
-	responses chan llm.CompletionResponse
+	responses chan string

 	// channel to stop decoding (such as if the remote connection is closed)
 	quit chan bool
@@ -88,19 +89,6 @@ type Sequence struct {
 	numPromptInputs     int
 }

-func (seq *Sequence) send(resp llm.CompletionResponse) bool {
-	if len(resp.Content) > 0 || resp.Done {
-		select {
-		case seq.responses <- resp:
-			// Successfully sent
-			return true
-		case <-seq.quit:
-			return false
-		}
-	}
-	return true
-}
-
 type NewSequenceParams struct {
 	numPredict     int
 	stop           []string
@@ -159,8 +147,8 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
 		numPromptInputs:     len(inputs),
 		startProcessingTime: startTime,
 		numPredict:          params.numPredict,
-		pendingResponses:    make([]llm.CompletionResponse, 0),
-		responses:           make(chan llm.CompletionResponse, 100),
+		pendingResponses:    make([]string, 0),
+		responses:           make(chan string, 100),
 		quit:                make(chan bool, 1),
 		embedding:           make(chan []float32, 1),
 		samplingCtx:         sc,
@@ -284,15 +272,36 @@ func (s *Server) allNil() bool {
 	return true
 }

+func flushPending(seq *Sequence) bool {
+	joined := strings.Join(seq.pendingResponses, "")
+	seq.pendingResponses = []string{}
+
+	// Check if there are any partial UTF-8 characters remaining.
+	// We already check and queue as we are generating but some may
+	// still make it here:
+	// - Sequence is ending, e.g. generation limit has been hit
+	// - Invalid characters in the middle of a string
+	// This is a stricter check to ensure we never output invalid Unicode.
+	for !utf8.ValidString(joined) {
+		joined = joined[:len(joined)-1]
+	}
+
+	if len(joined) == 0 {
+		return true
+	}
+
+	select {
+	case seq.responses <- joined:
+		return true
+	case <-seq.quit:
+		return false
+	}
+}
+
 func (s *Server) removeSequence(seqIndex int, reason llm.DoneReason) {
 	seq := s.seqs[seqIndex]

-	// Send any remaining pending responses
-	for _, resp := range seq.pendingResponses {
-		seq.send(resp)
-	}
-	seq.pendingResponses = []llm.CompletionResponse{}
-
+	flushPending(seq)
 	seq.doneReason = reason
 	close(seq.responses)
 	close(seq.embedding)
@@ -481,11 +490,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)

 		seq.inputs = []input{{token: token}}

-		seq.pendingResponses = append(seq.pendingResponses, llm.CompletionResponse{Content: piece})
-		sequence := ""
-		for _, r := range seq.pendingResponses {
-			sequence += r.Content
-		}
+		seq.pendingResponses = append(seq.pendingResponses, piece)
+		sequence := strings.Join(seq.pendingResponses, "")

 		if ok, stop := common.FindStop(sequence, seq.stop); ok {
 			slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)
@@ -517,13 +523,13 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			continue
 		}

-		for _, resp := range seq.pendingResponses {
-			if !seq.send(resp) {
-				s.removeSequence(i, llm.DoneReasonConnectionClosed)
-				break
-			}
+		if common.IncompleteUnicode(sequence) {
+			continue
+		}
+
+		if !flushPending(seq) {
+			s.removeSequence(i, llm.DoneReasonConnectionClosed)
 		}
-		seq.pendingResponses = []llm.CompletionResponse{}
 	}

 	return nil
@@ -621,7 +627,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			return
 		case content, ok := <-seq.responses:
 			if ok {
-				if err := json.NewEncoder(w).Encode(&content); err != nil {
+				if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
+					Content: content,
+				}); err != nil {
 					http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
 					close(seq.quit)
 					return
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -20,6 +20,7 @@ import (
 	"strings"
 	"sync"
 	"time"
+	"unicode/utf8"

 	"golang.org/x/image/bmp"
 	"golang.org/x/sync/semaphore"
@@ -55,13 +56,13 @@ type Sequence struct {
 	pendingInputs []input.Input

 	// tokens that have been generated but not returned yet (e.g. for stop sequences)
-	pendingResponses []llm.CompletionResponse
+	pendingResponses []string

 	// input cache being used by this sequence
 	cache *InputCacheSlot

 	// channel to send responses over
-	responses chan llm.CompletionResponse
+	responses chan string

 	// channel to stop decoding (such as if the remote connection is closed)
 	quit chan bool
@@ -93,19 +94,6 @@ type Sequence struct {
 	numPromptInputs     int
 }

-func (seq *Sequence) send(resp llm.CompletionResponse) bool {
-	if len(resp.Content) > 0 || resp.Done {
-		select {
-		case seq.responses <- resp:
-			// Successfully sent
-			return true
-		case <-seq.quit:
-			return false
-		}
-	}
-	return true
-}
-
 type NewSequenceParams struct {
 	numPredict int
 	stop       []string
@@ -179,8 +167,8 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
 		numPromptInputs:     len(inputs),
 		startProcessingTime: startTime,
 		numPredict:          params.numPredict,
-		pendingResponses:    make([]llm.CompletionResponse, 0),
-		responses:           make(chan llm.CompletionResponse, 100),
+		pendingResponses:    make([]string, 0),
+		responses:           make(chan string, 100),
 		quit:                make(chan bool, 1),
 		embedding:           make(chan []float32, 1),
 		sampler:             params.sampler,
@@ -325,15 +313,36 @@ func (s *Server) allNil() bool {
 	return true
 }

+func flushPending(seq *Sequence) bool {
+	joined := strings.Join(seq.pendingResponses, "")
+	seq.pendingResponses = []string{}
+
+	// Check if there are any partial UTF-8 characters remaining.
+	// We already check and queue as we are generating but some may
+	// still make it here:
+	// - Sequence is ending, e.g. generation limit has been hit
+	// - Invalid characters in the middle of a string
+	// This is a stricter check to ensure we never output invalid Unicode.
+	for !utf8.ValidString(joined) {
+		joined = joined[:len(joined)-1]
+	}
+
+	if len(joined) == 0 {
+		return true
+	}
+
+	select {
+	case seq.responses <- joined:
+		return true
+	case <-seq.quit:
+		return false
+	}
+}
+
 func (s *Server) removeSequence(seqIndex int, reason llm.DoneReason) {
 	seq := s.seqs[seqIndex]

-	// Send any remaining pending responses
-	for _, resp := range seq.pendingResponses {
-		seq.send(resp)
-	}
-	seq.pendingResponses = []llm.CompletionResponse{}
-
+	flushPending(seq)
 	seq.doneReason = reason
 	close(seq.responses)
 	close(seq.embedding)
@@ -532,11 +541,8 @@ func (s *Server) processBatch() error {

 		seq.inputs = []input.Input{{Token: token}}

-		seq.pendingResponses = append(seq.pendingResponses, llm.CompletionResponse{Content: piece})
-		sequence := ""
-		for _, r := range seq.pendingResponses {
-			sequence += r.Content
-		}
+		seq.pendingResponses = append(seq.pendingResponses, piece)
+		sequence := strings.Join(seq.pendingResponses, "")

 		if ok, stop := common.FindStop(sequence, seq.stop); ok {
 			slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)
@@ -568,14 +574,13 @@ func (s *Server) processBatch() error {
 			continue
 		}

-		// Send all pending responses directly without unicode checking
-		for _, resp := range seq.pendingResponses {
-			if !seq.send(resp) {
-				s.removeSequence(i, llm.DoneReasonConnectionClosed)
-				break
-			}
+		if common.IncompleteUnicode(sequence) {
+			continue
+		}
+
+		if !flushPending(seq) {
+			s.removeSequence(i, llm.DoneReasonConnectionClosed)
 		}
-		seq.pendingResponses = []llm.CompletionResponse{}
 	}

 	return nil
@@ -678,7 +683,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			return
 		case content, ok := <-seq.responses:
 			if ok {
-				if err := json.NewEncoder(w).Encode(&content); err != nil {
+				if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
+					Content: content,
+				}); err != nil {
 					http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
 					close(seq.quit)
 					return
--- a/server/images.go
+++ b/server/images.go
@@ -23,10 +23,9 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/fs/gguf"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/template"
-	"github.com/ollama/ollama/thinking"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
 )
@@ -73,18 +72,22 @@ func (m *Model) Capabilities() []model.Capability {
 	capabilities := []model.Capability{}

 	// Check for completion capability
-	f, err := gguf.Open(m.ModelPath)
+	r, err := os.Open(m.ModelPath)
 	if err == nil {
-		defer f.Close()
+		defer r.Close()

-		if f.KeyValue("pooling_type").Valid() {
-			capabilities = append(capabilities, model.CapabilityEmbedding)
+		f, err := ggml.Decode(r, 1024)
+		if err == nil {
+			if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
+				capabilities = append(capabilities, model.CapabilityEmbedding)
+			} else {
+				capabilities = append(capabilities, model.CapabilityCompletion)
+			}
+			if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok {
+				capabilities = append(capabilities, model.CapabilityVision)
+			}
 		} else {
-			// If no embedding is specified, we assume the model supports completion
-			capabilities = append(capabilities, model.CapabilityCompletion)
-		}
-		if f.KeyValue("vision.block_count").Valid() {
-			capabilities = append(capabilities, model.CapabilityVision)
+			slog.Error("couldn't decode ggml", "error", err)
 		}
 	} else {
 		slog.Error("couldn't open model file", "error", err)
@@ -110,7 +113,7 @@ func (m *Model) Capabilities() []model.Capability {
 	}

 	// Check for thinking capability
-	openingTag, closingTag := thinking.InferTags(m.Template.Template)
+	openingTag, closingTag := inferThinkingTags(m.Template.Template)
 	if openingTag != "" && closingTag != "" {
 		capabilities = append(capabilities, model.CapabilityThinking)
 	}
--- a/server/images_test.go
+++ b/server/images_test.go
@@ -1,42 +1,123 @@
 package server

 import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"os"
+	"path/filepath"
 	"strings"
 	"testing"

-	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/model"
 )

+// Constants for GGUF magic bytes and version
+var (
+	ggufMagic = []byte{0x47, 0x47, 0x55, 0x46} // "GGUF"
+	ggufVer   = uint32(3)                      // Version 3
+)
+
+// Helper function to create mock GGUF data
+func createMockGGUFData(architecture string, vision bool) []byte {
+	var buf bytes.Buffer
+
+	// Write GGUF header
+	buf.Write(ggufMagic)
+	binary.Write(&buf, binary.LittleEndian, ggufVer)
+
+	// Write tensor count (0 for our test)
+	var numTensors uint64 = 0
+	binary.Write(&buf, binary.LittleEndian, numTensors)
+
+	// Calculate number of metadata entries
+	numMetaEntries := uint64(1) // architecture entry
+	if vision {
+		numMetaEntries++
+	}
+	// Add embedding entry if architecture is "bert"
+	if architecture == "bert" {
+		numMetaEntries++
+	}
+	binary.Write(&buf, binary.LittleEndian, numMetaEntries)
+
+	// Write architecture metadata
+	archKey := "general.architecture"
+	keyLen := uint64(len(archKey))
+	binary.Write(&buf, binary.LittleEndian, keyLen)
+	buf.WriteString(archKey)
+
+	// String type (8)
+	var strType uint32 = 8
+	binary.Write(&buf, binary.LittleEndian, strType)
+
+	// String length
+	strLen := uint64(len(architecture))
+	binary.Write(&buf, binary.LittleEndian, strLen)
+	buf.WriteString(architecture)
+
+	if vision {
+		visionKey := architecture + ".vision.block_count"
+		keyLen = uint64(len(visionKey))
+		binary.Write(&buf, binary.LittleEndian, keyLen)
+		buf.WriteString(visionKey)
+
+		// uint32 type (4)
+		var uint32Type uint32 = 4
+		binary.Write(&buf, binary.LittleEndian, uint32Type)
+
+		// uint32 value (1)
+		var countVal uint32 = 1
+		binary.Write(&buf, binary.LittleEndian, countVal)
+	}
+	// Write embedding metadata if architecture is "bert"
+	if architecture == "bert" {
+		poolKey := architecture + ".pooling_type"
+		keyLen = uint64(len(poolKey))
+		binary.Write(&buf, binary.LittleEndian, keyLen)
+		buf.WriteString(poolKey)
+
+		// uint32 type (4)
+		var uint32Type uint32 = 4
+		binary.Write(&buf, binary.LittleEndian, uint32Type)
+
+		// uint32 value (1)
+		var poolingVal uint32 = 1
+		binary.Write(&buf, binary.LittleEndian, poolingVal)
+	}
+
+	return buf.Bytes()
+}
+
 func TestModelCapabilities(t *testing.T) {
-	// Create completion model (llama architecture without vision)
-	completionModelPath, _ := createBinFile(t, ggml.KV{
-		"general.architecture": "llama",
-	}, []*ggml.Tensor{})
+	// Create a temporary directory for test files
+	tempDir := t.TempDir()

-	// Create vision model (llama architecture with vision block count)
-	visionModelPath, _ := createBinFile(t, ggml.KV{
-		"general.architecture":     "llama",
-		"llama.vision.block_count": uint32(1),
-	}, []*ggml.Tensor{})
+	// Create different types of mock model files
+	completionModelPath := filepath.Join(tempDir, "model.bin")
+	visionModelPath := filepath.Join(tempDir, "vision_model.bin")
+	embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin")
+	// Create a simple model file for tests that don't depend on GGUF content
+	simpleModelPath := filepath.Join(tempDir, "simple_model.bin")

-	// Create embedding model (bert architecture with pooling type)
-	embeddingModelPath, _ := createBinFile(t, ggml.KV{
-		"general.architecture": "bert",
-		"bert.pooling_type":    uint32(1),
-	}, []*ggml.Tensor{})
+	if err := errors.Join(
+		os.WriteFile(completionModelPath, createMockGGUFData("llama", false), 0o644),
+		os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644),
+		os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644),
+		os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644),
+	); err != nil {
+		t.Fatalf("Failed to create model files: %v", err)
+	}

 	toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
 	}
-
 	chatTemplate, err := template.Parse("{{ .prompt }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
 	}
-
 	toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
@@ -64,13 +145,21 @@ func TestModelCapabilities(t *testing.T) {
 			},
 			expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools, model.CapabilityInsert},
 		},
+		{
+			name: "model with tools and insert capability",
+			model: Model{
+				ModelPath: simpleModelPath,
+				Template:  toolsInsertTemplate,
+			},
+			expectedCaps: []model.Capability{model.CapabilityTools, model.CapabilityInsert},
+		},
 		{
 			name: "model with tools capability",
 			model: Model{
-				ModelPath: completionModelPath,
+				ModelPath: simpleModelPath,
 				Template:  toolsTemplate,
 			},
-			expectedCaps: []model.Capability{model.CapabilityCompletion, model.CapabilityTools},
+			expectedCaps: []model.Capability{model.CapabilityTools},
 		},
 		{
 			name: "model with vision capability",
@@ -135,33 +224,29 @@ func TestModelCapabilities(t *testing.T) {
 }

 func TestModelCheckCapabilities(t *testing.T) {
-	// Create simple model file for tests that don't depend on GGUF content
-	completionModelPath, _ := createBinFile(t, ggml.KV{
-		"general.architecture": "llama",
-	}, []*ggml.Tensor{})
+	// Create a temporary directory for test files
+	tempDir := t.TempDir()

-	// Create vision model (llama architecture with vision block count)
-	visionModelPath, _ := createBinFile(t, ggml.KV{
-		"general.architecture":     "llama",
-		"llama.vision.block_count": uint32(1),
-	}, []*ggml.Tensor{})
+	visionModelPath := filepath.Join(tempDir, "vision_model.bin")
+	simpleModelPath := filepath.Join(tempDir, "model.bin")
+	embeddingModelPath := filepath.Join(tempDir, "embedding_model.bin")

-	// Create embedding model (bert architecture with pooling type)
-	embeddingModelPath, _ := createBinFile(t, ggml.KV{
-		"general.architecture": "bert",
-		"bert.pooling_type":    uint32(1),
-	}, []*ggml.Tensor{})
+	if err := errors.Join(
+		os.WriteFile(simpleModelPath, []byte("dummy model data"), 0o644),
+		os.WriteFile(visionModelPath, createMockGGUFData("llama", true), 0o644),
+		os.WriteFile(embeddingModelPath, createMockGGUFData("bert", false), 0o644),
+	); err != nil {
+		t.Fatalf("Failed to create model files: %v", err)
+	}

 	toolsInsertTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}{{ if .suffix }}{{ .suffix }}{{ end }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
 	}
-
 	chatTemplate, err := template.Parse("{{ .prompt }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
 	}
-
 	toolsTemplate, err := template.Parse("{{ .prompt }}{{ if .tools }}{{ .tools }}{{ end }}")
 	if err != nil {
 		t.Fatalf("Failed to parse template: %v", err)
@@ -176,7 +261,7 @@ func TestModelCheckCapabilities(t *testing.T) {
 		{
 			name: "completion model without tools capability",
 			model: Model{
-				ModelPath: completionModelPath,
+				ModelPath: simpleModelPath,
 				Template:  chatTemplate,
 			},
 			checkCaps:      []model.Capability{model.CapabilityTools},
@@ -185,7 +270,7 @@ func TestModelCheckCapabilities(t *testing.T) {
 		{
 			name: "model with all needed capabilities",
 			model: Model{
-				ModelPath: completionModelPath,
+				ModelPath: simpleModelPath,
 				Template:  toolsInsertTemplate,
 			},
 			checkCaps: []model.Capability{model.CapabilityTools, model.CapabilityInsert},
@@ -193,7 +278,7 @@ func TestModelCheckCapabilities(t *testing.T) {
 		{
 			name: "model missing insert capability",
 			model: Model{
-				ModelPath: completionModelPath,
+				ModelPath: simpleModelPath,
 				Template:  toolsTemplate,
 			},
 			checkCaps:      []model.Capability{model.CapabilityInsert},
@@ -202,7 +287,7 @@ func TestModelCheckCapabilities(t *testing.T) {
 		{
 			name: "model missing vision capability",
 			model: Model{
-				ModelPath: completionModelPath,
+				ModelPath: simpleModelPath,
 				Template:  toolsTemplate,
 			},
 			checkCaps:      []model.Capability{model.CapabilityVision},
@@ -227,7 +312,7 @@ func TestModelCheckCapabilities(t *testing.T) {
 		{
 			name: "unknown capability",
 			model: Model{
-				ModelPath: completionModelPath,
+				ModelPath: simpleModelPath,
 				Template:  chatTemplate,
 			},
 			checkCaps:      []model.Capability{"unknown"},
--- a/server/quantization_test.go
+++ b/server/quantization_test.go
@@ -257,8 +257,16 @@ func TestQuantizeModel(t *testing.T) {

 	for _, tt := range cases {
 		t.Run(tt.name, func(t *testing.T) {
-			p, _ := createBinFile(t, tt.kv, tt.tensors)
-			fp, err := os.Open(p)
+			f, err := os.CreateTemp(t.TempDir(), tt.name)
+			if err != nil {
+				t.Fatal(err.Error())
+			}
+			defer f.Close()
+			err = fsggml.WriteGGUF(f, tt.kv, tt.tensors)
+			if err != nil {
+				t.Fatalf("failed to create initial model: %s", err)
+			}
+			fp, err := os.Open(f.Name())
 			if err != nil {
 				t.Fatal(err.Error())
 			}
--- a/server/routes.go
+++ b/server/routes.go
@@ -37,7 +37,6 @@ import (
 	"github.com/ollama/ollama/server/internal/client/ollama"
 	"github.com/ollama/ollama/server/internal/registry"
 	"github.com/ollama/ollama/template"
-	"github.com/ollama/ollama/thinking"
 	"github.com/ollama/ollama/tools"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
@@ -283,12 +282,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		prompt = b.String()
 	}

-	var thinkingState *thinking.Parser
-	openingTag, closingTag := thinking.InferTags(m.Template.Template)
+	var thinkingState *thinkingParser
+	openingTag, closingTag := inferThinkingTags(m.Template.Template)
 	if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" {
-		thinkingState = &thinking.Parser{
-			OpeningTag: openingTag,
-			ClosingTag: closingTag,
+		thinkingState = &thinkingParser{
+			openingTag: openingTag,
+			closingTag: closingTag,
 		}
 	}

@@ -317,7 +316,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			}

 			if thinkingState != nil {
-				thinking, content := thinkingState.AddContent(cr.Content)
+				thinking, content := thinkingState.addContent(cr.Content)
 				res.Thinking = thinking
 				res.Response = content
 			}
@@ -1515,18 +1514,35 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		return
 	}

-	var thinkingState *thinking.Parser
-	openingTag, closingTag := thinking.InferTags(m.Template.Template)
+	var thinkingState *thinkingParser
+	openingTag, closingTag := inferThinkingTags(m.Template.Template)
 	if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" {
-		thinkingState = &thinking.Parser{
-			OpeningTag: openingTag,
-			ClosingTag: closingTag,
+		thinkingState = &thinkingParser{
+			openingTag: openingTag,
+			closingTag: closingTag,
 		}
 	}

-	var toolParser *tools.Parser
-	if len(req.Tools) > 0 {
-		toolParser = tools.NewParser(m.Template.Template, req.Tools)
+	var toolParser tools.ToolParser
+
+	fmt.Println("m.Config.ModelFamily", m.Config.ModelFamily)
+	if m.Config.ModelFamily == "qwen" {
+		slog.Info("using deepseek tool parser")
+		fmt.Println("m.Template.Template", m.Template.Template)
+		toolParser, err = tools.NewDeepSeekToolParser(m.Template.Template)
+		if err != nil {
+			slog.Error("failed to create tool parser", "error", err)
+			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+			return
+		}
+	} else if len(req.Tools) > 0 {
+		slog.Info("using default tool parser")
+		toolParser, err = tools.NewParser(m.Template.Template)
+		if err != nil {
+			slog.Error("failed to create tool parser", "error", err)
+			c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+			return
+		}
 	}

 	ch := make(chan any)
@@ -1553,7 +1569,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			}

 			if thinkingState != nil {
-				thinkingContent, remainingContent := thinkingState.AddContent(res.Message.Content)
+				thinkingContent, remainingContent := thinkingState.addContent(res.Message.Content)
 				if thinkingContent == "" && remainingContent == "" && !r.Done {
 					// need to accumulate more to decide what to send
 					return
@@ -1579,7 +1595,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
 					// don't return
 				} else {
 					if r.Done {
-						res.Message.Content = toolParser.Content()
 						ch <- res
 					}
 					return
@@ -1665,11 +1680,11 @@ func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
 				// change the user output), we should probably perform this filtering
 				// for all thinking models (not just qwen3 & deepseek-r1) since it tends
 				// to save tokens and improve quality.
-				thinkingState := &thinking.Parser{
-					OpeningTag: "<think>",
-					ClosingTag: "</think>",
+				thinkingState := &thinkingParser{
+					openingTag: "<think>",
+					closingTag: "</think>",
 				}
-				_, content := thinkingState.AddContent(msg.Content)
+				_, content := thinkingState.addContent(msg.Content)
 				msgs[i].Content = content
 			}
 		}
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -112,7 +112,11 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 	b.ctx, b.ctxDone = context.WithCancel(ctx)
 	t.Helper()

-	p, _ := createBinFile(t, ggml.KV{
+	f, err := os.CreateTemp(t.TempDir(), modelName)
+	require.NoError(t, err)
+	defer f.Close()
+
+	require.NoError(t, ggml.WriteGGUF(f, ggml.KV{
 		"general.architecture":          "llama",
 		"llama.context_length":          uint32(32),
 		"llama.embedding_length":        uint32(4096),
@@ -125,14 +129,14 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 	}, []*ggml.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
-	})
+	}))
+	require.NoError(t, err)
+
+	fname := f.Name()
+	model := &Model{Name: modelName, ModelPath: fname}
+	b.f, err = llm.LoadModel(model.ModelPath, 0)
+	require.NoError(t, err)

-	model := &Model{Name: modelName, ModelPath: p}
-	f, err := llm.LoadModel(model.ModelPath, 0)
-	if err != nil {
-		t.Fatal(err)
-	}
-	b.f = f
 	if duration == nil {
 		duration = &api.Duration{Duration: 5 * time.Millisecond}
 	}
--- a/server/thinking.go
+++ b/server/thinking.go
@@ -1,7 +1,9 @@
-package thinking
+package server

 import (
 	"strings"
+	"text/template"
+	"text/template/parse"
 	"unicode"
 )

@@ -44,17 +46,17 @@ func (s thinkingState) String() string {
 	}
 }

-type Parser struct {
+type thinkingParser struct {
 	state      thinkingState
-	OpeningTag string
-	ClosingTag string
+	openingTag string
+	closingTag string
 	acc        strings.Builder
 }

-// AddContent returns the thinking content and the non-thinking content that
+// addContent returns the thinking content and the non-thinking content that
 // should be immediately sent to the user. It will internally buffer if it needs
 // to see more raw content to disambiguate
-func (s *Parser) AddContent(content string) (string, string) {
+func (s *thinkingParser) addContent(content string) (string, string) {
 	s.acc.WriteString(content)

 	var thinkingSb, remainingSb strings.Builder
@@ -74,12 +76,12 @@ func (s *Parser) AddContent(content string) (string, string) {
 }

 // the additional bool return is true iff we should continue eating
-func eat(s *Parser) (string, string, bool) {
+func eat(s *thinkingParser) (string, string, bool) {
 	switch s.state {
 	case thinkingState_LookingForOpening:
 		trimmed := strings.TrimLeftFunc(s.acc.String(), unicode.IsSpace)
-		if strings.HasPrefix(trimmed, s.OpeningTag) {
-			after := strings.Join(strings.Split(trimmed, s.OpeningTag)[1:], s.OpeningTag)
+		if strings.HasPrefix(trimmed, s.openingTag) {
+			after := strings.Join(strings.Split(trimmed, s.openingTag)[1:], s.openingTag)
 			after = strings.TrimLeftFunc(after, unicode.IsSpace)
 			// after might contain more than just thinking tokens, so we continue
 			// parsing instead of returning it as thinking tokens here
@@ -91,7 +93,7 @@ func eat(s *Parser) (string, string, bool) {
 				s.state = thinkingState_Thinking
 			}
 			return "", "", true
-		} else if strings.HasPrefix(s.OpeningTag, trimmed) {
+		} else if strings.HasPrefix(s.openingTag, trimmed) {
 			// partial opening seen, so let's keep accumulating
 			return "", "", false
 		} else if trimmed == "" {
@@ -117,10 +119,10 @@ func eat(s *Parser) (string, string, bool) {
 		}
 	case thinkingState_Thinking:
 		acc := s.acc.String()
-		if strings.Contains(acc, s.ClosingTag) {
-			split := strings.Split(acc, s.ClosingTag)
+		if strings.Contains(acc, s.closingTag) {
+			split := strings.Split(acc, s.closingTag)
 			thinking := split[0]
-			remaining := strings.Join(split[1:], s.ClosingTag)
+			remaining := strings.Join(split[1:], s.closingTag)
 			remaining = strings.TrimLeftFunc(remaining, unicode.IsSpace)
 			s.acc.Reset()
 			if remaining == "" {
@@ -129,7 +131,7 @@ func eat(s *Parser) (string, string, bool) {
 				s.state = thinkingState_ThinkingDone
 			}
 			return thinking, remaining, false
-		} else if overlapLen := overlap(acc, s.ClosingTag); overlapLen > 0 {
+		} else if overlapLen := overlap(acc, s.closingTag); overlapLen > 0 {
 			thinking := acc[:len(acc)-overlapLen]
 			remaining := acc[len(acc)-overlapLen:]
 			s.acc.Reset()
@@ -169,3 +171,130 @@ func overlap(s, delim string) int {
 	}
 	return 0
 }
+
+// templateVisit walks the template parse tree rooted at n depth-first.
+//
+// enterFn is called on every node before its children. When it returns false
+// the node's children are skipped and exitFn is not called for that node.
+// exitFn, when non-nil, is called after a node's children have been visited.
+//
+// Action and template nodes without a pipeline (for example
+// `{{ template "name" }}`) are treated as leaves. Their Pipe field is a nil
+// *parse.PipeNode, and passing it on as a parse.Node would produce a non-nil
+// interface that slips past the nil check below and is then dereferenced.
+func templateVisit(n parse.Node, enterFn func(parse.Node) bool, exitFn func(parse.Node)) {
+	if n == nil {
+		return
+	}
+	if !enterFn(n) {
+		return
+	}
+	switch x := n.(type) {
+	case *parse.ListNode:
+		for _, c := range x.Nodes {
+			templateVisit(c, enterFn, exitFn)
+		}
+	case *parse.BranchNode:
+		if x.Pipe != nil {
+			templateVisit(x.Pipe, enterFn, exitFn)
+		}
+		if x.List != nil {
+			templateVisit(x.List, enterFn, exitFn)
+		}
+		if x.ElseList != nil {
+			templateVisit(x.ElseList, enterFn, exitFn)
+		}
+	case *parse.ActionNode:
+		if x.Pipe != nil {
+			templateVisit(x.Pipe, enterFn, exitFn)
+		}
+	case *parse.WithNode:
+		templateVisit(&x.BranchNode, enterFn, exitFn)
+	case *parse.RangeNode:
+		templateVisit(&x.BranchNode, enterFn, exitFn)
+	case *parse.IfNode:
+		templateVisit(&x.BranchNode, enterFn, exitFn)
+	case *parse.TemplateNode:
+		// Pipe is nil when the template is invoked without an argument
+		if x.Pipe != nil {
+			templateVisit(x.Pipe, enterFn, exitFn)
+		}
+	case *parse.PipeNode:
+		for _, c := range x.Cmds {
+			templateVisit(c, enterFn, exitFn)
+		}
+	case *parse.CommandNode:
+		for _, a := range x.Args {
+			templateVisit(a, enterFn, exitFn)
+		}
+		// text, field, number, etc. are leaves – nothing to recurse into
+	}
+	if exitFn != nil {
+		exitFn(n)
+	}
+}
+
+// inferThinkingTags uses a heuristic to infer the tags that surround thinking
+// traces: we look for a range node that iterates over "Messages" and then look
+// for a reference to "Thinking" like `{{.Thinking}}`. We then go up to the
+// nearest ListNode and take the first and last TextNodes as the opening and
+// closing tags.
+//
+// It returns the opening and closing tag, each with surrounding whitespace
+// trimmed. Either may be empty when nothing matched. A nil or never-parsed
+// template yields two empty strings.
+func inferThinkingTags(t *template.Template) (string, string) {
+	// a template without a parse tree has nothing to walk; reading t.Root
+	// would dereference the nil embedded tree
+	if t == nil || t.Tree == nil {
+		return "", ""
+	}
+
+	// stack of nodes from the root down to the node currently being visited
+	ancestors := []parse.Node{}
+
+	openingTag := ""
+	closingTag := ""
+
+	enterFn := func(n parse.Node) bool {
+		ancestors = append(ancestors, n)
+
+		field, ok := n.(*parse.FieldNode)
+		if !ok || len(field.Ident) == 0 || field.Ident[0] != "Thinking" {
+			return true
+		}
+
+		// only the innermost enclosing range counts, and it must iterate
+		// over "Messages"
+		var mostRecentRange *parse.RangeNode
+		for i := len(ancestors) - 1; i >= 0; i-- {
+			if r, ok := ancestors[i].(*parse.RangeNode); ok {
+				mostRecentRange = r
+				break
+			}
+		}
+		if mostRecentRange == nil || !rangeUsesField(mostRecentRange, "Messages") {
+			return true
+		}
+
+		// TODO(drifkin): to be more robust, check that it's in the action
+		// part, not the `if`'s pipeline part. We do match on the nearest list
+		// that starts and ends with text nodes, which makes this not strictly
+		// necessary for our heuristic
+
+		// go up to the nearest ancestor that is a *parse.ListNode
+		for i := len(ancestors) - 1; i >= 0; i-- {
+			l, ok := ancestors[i].(*parse.ListNode)
+			if !ok {
+				continue
+			}
+			if text, ok := l.Nodes[0].(*parse.TextNode); ok {
+				openingTag = strings.TrimSpace(text.String())
+			}
+			if text, ok := l.Nodes[len(l.Nodes)-1].(*parse.TextNode); ok {
+				closingTag = strings.TrimSpace(text.String())
+			}
+			break
+		}
+
+		return true
+	}
+
+	exitFn := func(n parse.Node) {
+		ancestors = ancestors[:len(ancestors)-1]
+	}
+
+	templateVisit(t.Root, enterFn, exitFn)
+
+	return openingTag, closingTag
+}
+
+// rangeUsesField reports whether a field whose first identifier is field
+// appears anywhere in the pipeline of the given range node (the expression
+// being ranged over, not the loop body).
+func rangeUsesField(rangeNode *parse.RangeNode, field string) bool {
+	found := false
+	enterFn := func(n parse.Node) bool {
+		// guard the index the same way inferThinkingTags does
+		if x, ok := n.(*parse.FieldNode); ok && len(x.Ident) > 0 && x.Ident[0] == field {
+			found = true
+		}
+		return true
+	}
+	templateVisit(rangeNode.BranchNode.Pipe, enterFn, nil)
+	return found
+}
--- a/server/thinking_test.go
+++ b/server/thinking_test.go
@@ -1,7 +1,8 @@
-package thinking
+package server

 import (
 	"testing"
+	"text/template"
 )

 func TestExtractThinking(t *testing.T) {
@@ -25,11 +26,11 @@ func TestExtractThinking(t *testing.T) {
 		},
 	}
 	for i, tt := range tests {
-		parser := Parser{
-			OpeningTag: "<think>",
-			ClosingTag: "</think>",
+		parser := thinkingParser{
+			openingTag: "<think>",
+			closingTag: "</think>",
 		}
-		gotThinking, gotContent := parser.AddContent(tt.in)
+		gotThinking, gotContent := parser.addContent(tt.in)
 		if gotContent != tt.wantContent || gotThinking != tt.wantThink {
 			t.Errorf("case %d: got (%q,%q), want (%q,%q)", i, gotThinking, gotContent, tt.wantThink, tt.wantContent)
 		}
@@ -258,15 +259,15 @@ func TestThinkingStreaming(t *testing.T) {
 	}

 	for _, c := range cases {
-		parser := Parser{
-			OpeningTag: "<think>",
-			ClosingTag: "</think>",
+		parser := thinkingParser{
+			openingTag: "<think>",
+			closingTag: "</think>",
 		}
 		if c.skip {
 			continue
 		}
 		for i, step := range c.steps {
-			thinking, content := parser.AddContent(step.input)
+			thinking, content := parser.addContent(step.input)
 			if content != step.wantContent || thinking != step.wantThinking {
 				t.Errorf("case %q (step %d): got (%q,%q), want (%q,%q)", c.desc, i, content, thinking, step.wantContent, step.wantThinking)
 			}
@@ -276,3 +277,127 @@ func TestThinkingStreaming(t *testing.T) {
 		}
 	}
 }
+
+// TestInferThinkingTags parses each template and checks the opening and
+// closing thinking tags that inferThinkingTags derives from it.
+func TestInferThinkingTags(t *testing.T) {
+	type testCase struct {
+		desc           string
+		tmplString     string
+		wantOpeningTag string
+		wantClosingTag string
+	}
+
+	tests := []testCase{
+		{
+			desc: "basic",
+			tmplString: `
+			{{ if .Thinking}}
+				/think
+			{{ end }}
+			{{- range $i, $_ := .Messages }}
+				{{- $last := eq (len (slice $.Messages $i)) 1 -}}
+				{{ if and $last .Thinking }}
+					<think>{{ .Thinking }}</think>
+				{{ end }}
+			{{ end }}
+		`,
+			wantOpeningTag: "<think>",
+			wantClosingTag: "</think>",
+		},
+		{
+			desc: "doubly nested range",
+			tmplString: `
+			{{ if .Thinking}}
+				/think
+			{{ end }}
+			{{- range $i, $_ := .Messages }}
+				{{- range $j, $_ := .NotMessages }}
+					{{- $last := eq (len (slice $.Messages $i)) 1 -}}
+					{{ if and $last .Thinking }}
+						<think>{{ .Thinking }}</think>
+					{{ end }}
+				{{ end }}
+			{{ end }}
+		`,
+			wantOpeningTag: "",
+			wantClosingTag: "",
+		},
+		{
+			desc: "whitespace is trimmed",
+			tmplString: `
+			{{ if .Thinking}}
+				/think
+			{{ end }}
+			{{- range $i, $_ := .Messages }}
+				{{- $last := eq (len (slice $.Messages $i)) 1 -}}
+				{{ if and $last .Thinking }}
+					Some text before   {{ .Thinking }}    Some text after
+				{{ end }}
+			{{ end }}
+		`,
+			wantOpeningTag: "Some text before",
+			wantClosingTag: "Some text after",
+		},
+		{
+			desc: "qwen3",
+			tmplString: `
+{{- if or .System .Tools .Thinking }}<|im_start|>system
+{{- if .System }}
+{{ .System }}
+{{- end }}
+{{- if .Tools }}
+
+# Tools
+
+You may call one or more functions to assist with the user query.
+
+You are provided with function signatures within <tools></tools> XML tags:
+<tools>
+{{- range .Tools }}
+{"type": "function", "function": {{ .Function }}}
+{{- end }}
+</tools>
+
+For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
+<tool_call>
+{"name": <function-name>, "arguments": <args-json-object>}
+</tool_call>
+{{- end }}
+{{- if .Thinking }}
+/think
+{{- else }}
+/no_think
+{{- end }}<|im_end|>
+{{ end }}
+{{- range $i, $_ := .Messages }}
+{{- $last := eq (len (slice $.Messages $i)) 1 -}}
+{{- if eq .Role "user" }}<|im_start|>user
+{{ .Content }}<|im_end|>
+{{ else if eq .Role "assistant" }}<|im_start|>assistant
+{{ if and $last .Thinking }}
+<think>{{ .Thinking }}</think>
+{{ end }}
+{{ if .Content }}{{ .Content }}
+{{- else if .ToolCalls }}<tool_call>
+{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
+{{ end }}</tool_call>
+{{- end }}{{ if not $last }}<|im_end|>
+{{ end }}
+{{- else if eq .Role "tool" }}<|im_start|>user
+<tool_response>
+{{ .Content }}
+</tool_response><|im_end|>
+{{ end }}
+{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant
+{{ end }}
+{{- end }}
+			`,
+			wantOpeningTag: "<think>",
+			wantClosingTag: "</think>",
+		},
+	}
+
+	for _, tc := range tests {
+		parsed := template.Must(template.New("test").Parse(tc.tmplString))
+		gotOpen, gotClose := inferThinkingTags(parsed)
+		if gotOpen == tc.wantOpeningTag && gotClose == tc.wantClosingTag {
+			continue
+		}
+		t.Errorf("case %q: got (%q,%q), want (%q,%q)", tc.desc, gotOpen, gotClose, tc.wantOpeningTag, tc.wantClosingTag)
+	}
+}
--- a/thinking/template.go
+++ b/thinking/template.go
@@ -1,134 +0,0 @@
-package thinking
-
-import (
-	"strings"
-	"text/template"
-	"text/template/parse"
-)
-
-func templateVisit(n parse.Node, enterFn func(parse.Node) bool, exitFn func(parse.Node)) {
-	if n == nil {
-		return
-	}
-	shouldContinue := enterFn(n)
-	if !shouldContinue {
-		return
-	}
-	switch x := n.(type) {
-	case *parse.ListNode:
-		for _, c := range x.Nodes {
-			templateVisit(c, enterFn, exitFn)
-		}
-	case *parse.BranchNode:
-		if x.Pipe != nil {
-			templateVisit(x.Pipe, enterFn, exitFn)
-		}
-		if x.List != nil {
-			templateVisit(x.List, enterFn, exitFn)
-		}
-		if x.ElseList != nil {
-			templateVisit(x.ElseList, enterFn, exitFn)
-		}
-	case *parse.ActionNode:
-		templateVisit(x.Pipe, enterFn, exitFn)
-	case *parse.WithNode:
-		templateVisit(&x.BranchNode, enterFn, exitFn)
-	case *parse.RangeNode:
-		templateVisit(&x.BranchNode, enterFn, exitFn)
-	case *parse.IfNode:
-		templateVisit(&x.BranchNode, enterFn, exitFn)
-	case *parse.TemplateNode:
-		templateVisit(x.Pipe, enterFn, exitFn)
-	case *parse.PipeNode:
-		for _, c := range x.Cmds {
-			templateVisit(c, enterFn, exitFn)
-		}
-	case *parse.CommandNode:
-		for _, a := range x.Args {
-			templateVisit(a, enterFn, exitFn)
-		}
-		// text, field, number, etc. are leaves – nothing to recurse into
-	}
-	if exitFn != nil {
-		exitFn(n)
-	}
-}
-
-// InferTags uses a heuristic to infer the tags that surround thinking traces:
-// We look for a range node that iterates over "Messages" and then look for a
-// reference to "Thinking" like `{{.Thinking}}`. We then go up to the nearest
-// ListNode and take the first and last TextNodes as the opening and closing
-// tags.
-func InferTags(t *template.Template) (string, string) {
-	ancestors := []parse.Node{}
-
-	openingTag := ""
-	closingTag := ""
-
-	enterFn := func(n parse.Node) bool {
-		ancestors = append(ancestors, n)
-
-		switch x := n.(type) {
-		case *parse.FieldNode:
-			if len(x.Ident) > 0 && x.Ident[0] == "Thinking" {
-				var mostRecentRange *parse.RangeNode
-				for i := len(ancestors) - 1; i >= 0; i-- {
-					if r, ok := ancestors[i].(*parse.RangeNode); ok {
-						mostRecentRange = r
-						break
-					}
-				}
-				if mostRecentRange == nil || !rangeUsesField(mostRecentRange, "Messages") {
-					return true
-				}
-
-				// TODO(drifkin): to be more robust, check that it's in the action
-				// part, not the `if`'s pipeline part. We do match on the nearest list
-				// that starts and ends with text nodes, which makes this not strictly
-				// necessary for our heuristic
-
-				// go up to the nearest ancestor that is a *parse.ListNode
-				for i := len(ancestors) - 1; i >= 0; i-- {
-					if l, ok := ancestors[i].(*parse.ListNode); ok {
-						firstNode := l.Nodes[0]
-						if t, ok := firstNode.(*parse.TextNode); ok {
-							openingTag = strings.TrimSpace(t.String())
-						}
-						lastNode := l.Nodes[len(l.Nodes)-1]
-						if t, ok := lastNode.(*parse.TextNode); ok {
-							closingTag = strings.TrimSpace(t.String())
-						}
-
-						break
-					}
-				}
-			}
-		}
-
-		return true
-	}
-
-	exitFn := func(n parse.Node) {
-		ancestors = ancestors[:len(ancestors)-1]
-	}
-
-	templateVisit(t.Root, enterFn, exitFn)
-
-	return openingTag, closingTag
-}
-
-// checks to see if the given field name is present in the pipeline of the given range node
-func rangeUsesField(rangeNode *parse.RangeNode, field string) bool {
-	found := false
-	enterFn := func(n parse.Node) bool {
-		switch x := n.(type) {
-		case *parse.FieldNode:
-			if x.Ident[0] == field {
-				found = true
-			}
-		}
-		return true
-	}
-	templateVisit(rangeNode.BranchNode.Pipe, enterFn, nil)
-	return found
-}
--- a/thinking/template_test.go
+++ b/thinking/template_test.go
@@ -1,130 +0,0 @@
-package thinking
-
-import (
-	"testing"
-	"text/template"
-)
-
-func TestInferThinkingTags(t *testing.T) {
-	cases := []struct {
-		desc           string
-		tmplString     string
-		wantOpeningTag string
-		wantClosingTag string
-	}{
-		{
-			desc: "basic",
-			tmplString: `
-			{{ if .Thinking}}
-				/think
-			{{ end }}
-			{{- range $i, $_ := .Messages }}
-				{{- $last := eq (len (slice $.Messages $i)) 1 -}}
-				{{ if and $last .Thinking }}
-					<think>{{ .Thinking }}</think>
-				{{ end }}
-			{{ end }}
-		`,
-			wantOpeningTag: "<think>",
-			wantClosingTag: "</think>",
-		},
-		{
-			desc: "doubly nested range",
-			tmplString: `
-			{{ if .Thinking}}
-				/think
-			{{ end }}
-			{{- range $i, $_ := .Messages }}
-				{{- range $j, $_ := .NotMessages }}
-					{{- $last := eq (len (slice $.Messages $i)) 1 -}}
-					{{ if and $last .Thinking }}
-						<think>{{ .Thinking }}</think>
-					{{ end }}
-				{{ end }}
-			{{ end }}
-		`,
-			wantOpeningTag: "",
-			wantClosingTag: "",
-		},
-		{
-			desc: "whitespace is trimmed",
-			tmplString: `
-			{{ if .Thinking}}
-				/think
-			{{ end }}
-			{{- range $i, $_ := .Messages }}
-				{{- $last := eq (len (slice $.Messages $i)) 1 -}}
-				{{ if and $last .Thinking }}
-					Some text before   {{ .Thinking }}    Some text after
-				{{ end }}
-			{{ end }}
-		`,
-			wantOpeningTag: "Some text before",
-			wantClosingTag: "Some text after",
-		},
-		{
-			desc: "qwen3",
-			tmplString: `
-{{- if or .System .Tools .Thinking }}<|im_start|>system
-{{- if .System }}
-{{ .System }}
-{{- end }}
-{{- if .Tools }}
-
-# Tools
-
-You may call one or more functions to assist with the user query.
-
-You are provided with function signatures within <tools></tools> XML tags:
-<tools>
-{{- range .Tools }}
-{"type": "function", "function": {{ .Function }}}
-{{- end }}
-</tools>
-
-For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
-<tool_call>
-{"name": <function-name>, "arguments": <args-json-object>}
-</tool_call>
-{{- end }}
-{{- if .Thinking }}
-/think
-{{- else }}
-/no_think
-{{- end }}<|im_end|>
-{{ end }}
-{{- range $i, $_ := .Messages }}
-{{- $last := eq (len (slice $.Messages $i)) 1 -}}
-{{- if eq .Role "user" }}<|im_start|>user
-{{ .Content }}<|im_end|>
-{{ else if eq .Role "assistant" }}<|im_start|>assistant
-{{ if and $last .Thinking }}
-<think>{{ .Thinking }}</think>
-{{ end }}
-{{ if .Content }}{{ .Content }}
-{{- else if .ToolCalls }}<tool_call>
-{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
-{{ end }}</tool_call>
-{{- end }}{{ if not $last }}<|im_end|>
-{{ end }}
-{{- else if eq .Role "tool" }}<|im_start|>user
-<tool_response>
-{{ .Content }}
-</tool_response><|im_end|>
-{{ end }}
-{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant
-{{ end }}
-{{- end }}
-			`,
-			wantOpeningTag: "<think>",
-			wantClosingTag: "</think>",
-		},
-	}
-	for _, c := range cases {
-		tmpl := template.Must(template.New("test").Parse(c.tmplString))
-		openingTag, closingTag := InferTags(tmpl)
-		if openingTag != c.wantOpeningTag || closingTag != c.wantClosingTag {
-			t.Errorf("case %q: got (%q,%q), want (%q,%q)", c.desc, openingTag, closingTag, c.wantOpeningTag, c.wantClosingTag)
-		}
-	}
-}
--- a/tools/deepseek_tools.go
+++ b/tools/deepseek_tools.go
@@ -0,0 +1,179 @@
+package tools
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"strings"
+	gotmpl "text/template"
+
+	"github.com/ollama/ollama/api"
+)
+
+// DeepSeekToolParser parses DeepSeek-style tool calls. It relies on a wrapped
+// base Parser for input buffering and prefix detection.
+type DeepSeekToolParser struct {
+	parser *Parser // base parser, held as a named field (not embedded)
+}
+
+// Add appends s to the buffered input and tries to extract DeepSeek-style
+// tool calls from everything buffered so far.
+//
+// It returns the tool calls that were parsed (nil when there are none) and the
+// text that should be passed through as regular content. When more input is
+// needed to finish a tool call, both results are empty and the buffer is kept.
+// Each returned tool call is stamped with a sequential Function.Index that
+// continues across calls to Add.
+func (p *DeepSeekToolParser) Add(s string) (tools []api.ToolCall, content string) {
+	p.parser.sb.WriteString(s)
+	s = p.parser.sb.String()
+
+	// Check for prefix pattern in input
+	s, err := p.parser.checkPrefix(s)
+	if err != nil {
+		// Need more input to complete prefix
+		return nil, s
+	}
+
+	// Without a tool-call prefix this is plain content: pass it through
+	if !p.parser.prefixFound {
+		p.parser.sb.Reset()
+		return nil, s
+	}
+
+	toolCalls, err := parseDeepSeekToolCalls(s)
+	if err != nil {
+		if errors.Is(err, errAccumulateMore) {
+			return nil, ""
+		}
+		p.parser.sb.Reset()
+		// Only do greedy JSON parsing if there is no prefix from template
+		if p.parser.prefix != "" {
+			p.parser.greedyParseJSON = false
+		}
+		if p.parser.index != 0 && p.parser.prefix == "" {
+			return nil, ""
+		}
+		if p.parser.prefixFound {
+			// Drop tokens since prefix was found
+			return nil, ""
+		}
+		return nil, s
+	}
+
+	// Index into the slice: ranging by value would update a copy and the
+	// assigned index would be lost
+	for i := range toolCalls {
+		toolCalls[i].Function.Index = p.parser.index
+		p.parser.index++
+	}
+
+	p.parser.sb.Reset()
+	return toolCalls, ""
+}
+
+// NewParser returns a new DeepSeek tool parser for templateToProcess. The
+// receiver is not used; the call is forwarded to NewDeepSeekToolParser.
+func (*DeepSeekToolParser) NewParser(templateToProcess *gotmpl.Template) (ToolParser, error) {
+	return NewDeepSeekToolParser(templateToProcess)
+}
+
+// NewDeepSeekToolParser builds a DeepSeekToolParser around a base Parser
+// created from templateToProcess. If the base parser cannot be created, the
+// underlying error is returned wrapped.
+func NewDeepSeekToolParser(templateToProcess *gotmpl.Template) (ToolParser, error) {
+	base, err := NewParser(templateToProcess)
+	if err == nil {
+		return &DeepSeekToolParser{parser: base}, nil
+	}
+	return nil, fmt.Errorf("failed to create base parser: %w", err)
+}
+
+// parseDeepSeekToolCalls extracts tool calls from s.
+//
+// Function names are read from the whitespace-separated fields of s that
+// contain the literal "function": the name is whatever follows it, with a
+// leading "<｜tool▁sep｜>" removed when present. Arguments are the top-level
+// JSON objects found in s; the i-th object is paired with the i-th name.
+//
+// It returns errAccumulateMore when no function name has been seen yet or a
+// JSON object has been opened but not closed, errInvalidToolCall when there is
+// no complete object or there are more objects than names, and the
+// json.Unmarshal error when an object is not valid JSON.
+func parseDeepSeekToolCalls(s string) ([]api.ToolCall, error) {
+	const (
+		marker = "function"
+		sep    = "<｜tool▁sep｜>"
+	)
+
+	var functionNames []string
+	for _, field := range strings.Fields(s) {
+		// TODO: check if brittle
+		idx := strings.Index(field, marker)
+		if idx == -1 {
+			continue
+		}
+		// the separator doesn't always come down, so a missing one is fine
+		name := strings.TrimPrefix(field[idx+len(marker):], sep)
+		functionNames = append(functionNames, name)
+	}
+
+	if len(functionNames) == 0 {
+		return nil, errAccumulateMore
+	}
+
+	// Collect every top-level {...} span. Braces inside JSON string literals
+	// are skipped so that argument values containing '{' or '}' do not upset
+	// the nesting depth.
+	var rawToolArgs []string
+	depth, start := 0, -1
+	inString, escaped := false, false
+	for i, c := range s {
+		if inString {
+			switch {
+			case escaped:
+				escaped = false
+			case c == '\\':
+				escaped = true
+			case c == '"':
+				inString = false
+			}
+			continue
+		}
+
+		switch c {
+		case '"':
+			// quotes only matter inside an object
+			if depth > 0 {
+				inString = true
+			}
+		case '{':
+			if depth == 0 {
+				start = i
+			}
+			depth++
+		case '}':
+			if depth == 0 {
+				// stray closing brace outside any object
+				continue
+			}
+			depth--
+			if depth == 0 {
+				rawToolArgs = append(rawToolArgs, s[start:i+1])
+				start = -1
+			}
+		}
+	}
+
+	// an object is still open: the rest of it has not arrived yet
+	if depth > 0 {
+		return nil, errAccumulateMore
+	}
+
+	// every argument object needs a function name to go with it
+	if len(rawToolArgs) > len(functionNames) {
+		return nil, errInvalidToolCall
+	}
+
+	toolCalls := make([]api.ToolCall, 0, len(rawToolArgs))
+	for i, raw := range rawToolArgs {
+		// A fresh map per call: json.Unmarshal reuses a non-nil map and
+		// keeps its entries, which would leak arguments between tool calls
+		// and make them all share one map.
+		var args map[string]any
+		if err := json.Unmarshal([]byte(raw), &args); err != nil {
+			return nil, err
+		}
+
+		toolCalls = append(toolCalls, api.ToolCall{
+			Function: api.ToolCallFunction{
+				Name:      functionNames[i],
+				Arguments: args,
+			},
+		})
+	}
+
+	if len(toolCalls) == 0 {
+		// todo: check err here
+		return nil, errInvalidToolCall
+	}
+
+	return toolCalls, nil
+}
+
+// ! use as prefix
+// {{"<｜tool▁call▁begin｜>
+// ! send to tc parser
+// * function<｜tool▁sep｜><function_name>\n```json\n<function_arguments_in_json_format>\n```<｜tool▁call▁end｜>"}}
--- a/tools/deepseek_tools_test.go
+++ b/tools/deepseek_tools_test.go
@@ -0,0 +1,86 @@
+package tools
+
+import (
+	"fmt"
+	"path/filepath"
+	"testing"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/template"
+	"github.com/stretchr/testify/assert"
+)
+
+// TestDeepSeekToolParser builds a parser from testdata/deepseek-r1.gotmpl,
+// feeds it DeepSeek-style tool-call output, and compares the tool calls and
+// the content string returned by Add against the expected values.
+func TestDeepSeekToolParser(t *testing.T) {
+	p := filepath.Join("testdata")
+	t1 := api.ToolCall{
+		Function: api.ToolCallFunction{
+			Name: "get_current_weather",
+			Arguments: map[string]any{
+				"format":   "fahrenheit",
+				"location": "San Francisco, CA",
+			},
+			Index: 0,
+		},
+	}
+
+	// t2 is the second expected call of the disabled "multiple tool calls" case below.
+	// t2 := api.ToolCall{
+	// 	Function: api.ToolCallFunction{
+	// 		Name: "get_current_weather",
+	// 		Arguments: map[string]any{
+	// 			"format":   "celsius",
+	// 			"location": "Toronto, Canada",
+	// 		},
+	// 		Index: 1,
+	// 	},
+	// }
+
+	tests := []struct {
+		name             string
+		template         string
+		output           string
+		expectedToolCall []api.ToolCall
+		expectedTokens   string
+	}{
+		{
+			name: "single tool call",
+			output: `<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>get_current_weather
+` + "```json\n" + `{"format":"fahrenheit","location":"San Francisco, CA"}` + "\n```" + `<｜tool▁call▁end｜>`,
+			expectedToolCall: []api.ToolCall{t1},
+			expectedTokens:   "",
+		},
+		// 		{
+		// 			name:     "multiple tool calls",
+		// 			template: `"<｜tool▁call▁begin｜>function<｜tool▁sep｜>get_current_weather\n` + "```json\n" + `{"format":"fahrenheit","location":"San Francisco, CA"}` + "\n```" + `<｜tool▁call▁end｜>"`,
+		// 			output: `<｜tool▁call▁begin｜>function<｜tool▁sep｜>get_current_weather
+		// ` + "```json\n" + `{"format":"fahrenheit","location":"San Francisco, CA"}` + "\n```" + `<｜tool▁call▁end｜>
+		// <｜tool▁call▁begin｜>function<｜tool▁sep｜>get_current_weather
+		// ` + "```json\n" + `{"format":"celsius","location":"Toronto, Canada"}` + "\n```" + `<｜tool▁call▁end｜>`,
+		// 			expectedToolCall: []api.ToolCall{t1, t2},
+		// 			expectedTokens:   "",
+		// 		},
+		// 		{
+		// 			name:             "invalid tool call format",
+		// 			template:         `{{"<｜tool▁call▁begin｜>function<｜tool▁sep｜>get_current_weather\n` + "```json\n" + `{"format":"fahrenheit","location":"San Francisco, CA"}` + "\n```" + `<｜tool▁call▁end｜>"}}`,
+		// 			output:           "This is just some text without a tool call",
+		// 			expectedToolCall: nil,
+		// 			expectedTokens:   "This is just some text without a tool call",
+		// 		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			tmpl, err := template.Parse(readFile(t, p, "deepseek-r1.gotmpl").String())
+			if err != nil {
+				t.Fatal(err)
+			}
+			// Dump the parsed template tree to stdout to aid debugging.
+			fmt.Println(tmpl.Template.Root.String())
+
+			parser, err := NewDeepSeekToolParser(tmpl.Template)
+			// Stop here on failure: assert.NoError would only record the error
+			// and let parser.Add run on a parser that was never built.
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			tools, content := parser.Add(tt.output)
+			assert.Equal(t, tt.expectedToolCall, tools)
+			assert.Equal(t, tt.expectedTokens, content)
+		})
+	}
+}
--- a/tools/template.go
+++ b/tools/template.go
@@ -1,156 +0,0 @@
-package tools
-
-import (
-	"bytes"
-	"log/slog"
-	"slices"
-	"strings"
-	"text/template"
-	"text/template/parse"
-)
-
-// parseTag finds the tool calling tag from a Go template
-// often <tool_call> [TOOL_CALL] or similar by finding the
-// first text node after .ToolCalls and returning the content
-// if no tag is found, return "{" to indicate that json objects
-// should be attempted to be parsed as tool calls
-func parseTag(tmpl *template.Template) string {
-	if tmpl == nil || tmpl.Tree == nil {
-		slog.Debug("template or tree is nil")
-		return "{"
-	}
-
-	tc := findToolCallNode(tmpl.Tree.Root.Nodes)
-	if tc == nil {
-		return "{"
-	}
-
-	tn := findTextNode(tc.List.Nodes)
-	if tn == nil {
-		return "{"
-	}
-
-	tag := string(tn.Text)
-	tag = strings.ReplaceAll(tag, "\r\n", "\n")
-
-	// avoid parsing { onwards as this may be a tool call
-	// however keep '{' as a prefix if there is no tag
-	// so that all json objects will be attempted to
-	// be parsed as tool calls
-	tag, _, _ = strings.Cut(tag, "{")
-	tag = strings.TrimSpace(tag)
-	if tag == "" {
-		tag = "{"
-	}
-
-	return tag
-}
-
-// findToolCallNode searches for and returns an IfNode with .ToolCalls
-func findToolCallNode(nodes []parse.Node) *parse.IfNode {
-	isToolCallsNode := func(n *parse.IfNode) bool {
-		for _, cmd := range n.Pipe.Cmds {
-			for _, arg := range cmd.Args {
-				if field, ok := arg.(*parse.FieldNode); ok {
-					if slices.Contains(field.Ident, "ToolCalls") {
-						return true
-					}
-				}
-			}
-		}
-		return false
-	}
-
-	for _, node := range nodes {
-		switch n := node.(type) {
-		case *parse.IfNode:
-			if isToolCallsNode(n) {
-				return n
-			}
-			// Recursively search in nested IfNodes
-			if result := findToolCallNode(n.List.Nodes); result != nil {
-				return result
-			}
-			if n.ElseList != nil {
-				if result := findToolCallNode(n.ElseList.Nodes); result != nil {
-					return result
-				}
-			}
-		case *parse.ListNode:
-			if result := findToolCallNode(n.Nodes); result != nil {
-				return result
-			}
-		case *parse.RangeNode:
-			if result := findToolCallNode(n.List.Nodes); result != nil {
-				return result
-			}
-			if n.ElseList != nil {
-				if result := findToolCallNode(n.ElseList.Nodes); result != nil {
-					return result
-				}
-			}
-		case *parse.WithNode:
-			if result := findToolCallNode(n.List.Nodes); result != nil {
-				return result
-			}
-			if n.ElseList != nil {
-				if result := findToolCallNode(n.ElseList.Nodes); result != nil {
-					return result
-				}
-			}
-		}
-	}
-	return nil
-}
-
-// findTextNode does a depth-first search for the first text content in nodes,
-// stopping at template constructs to avoid parsing text after the tool calls
-func findTextNode(nodes []parse.Node) *parse.TextNode {
-	for _, node := range nodes {
-		switch n := node.(type) {
-		case *parse.TextNode:
-			// skip whitespace-only text nodes
-			if len(bytes.TrimSpace(n.Text)) == 0 {
-				continue
-			}
-			return n
-		case *parse.IfNode:
-			if text := findTextNode(n.List.Nodes); text != nil {
-				return text
-			}
-			if n.ElseList != nil {
-				if text := findTextNode(n.ElseList.Nodes); text != nil {
-					return text
-				}
-			}
-			return nil
-		case *parse.ListNode:
-			if text := findTextNode(n.Nodes); text != nil {
-				return text
-			}
-		case *parse.RangeNode:
-			if text := findTextNode(n.List.Nodes); text != nil {
-				return text
-			}
-			if n.ElseList != nil {
-				if text := findTextNode(n.ElseList.Nodes); text != nil {
-					return text
-				}
-			}
-			return nil
-		case *parse.WithNode:
-			if text := findTextNode(n.List.Nodes); text != nil {
-				return text
-			}
-			if n.ElseList != nil {
-				if text := findTextNode(n.ElseList.Nodes); text != nil {
-					return text
-				}
-			}
-			return nil
-		case *parse.ActionNode:
-			return nil
-		}
-	}
-	return nil
-}
--- a/tools/template_test.go
+++ b/tools/template_test.go
@@ -1,139 +0,0 @@
-package tools
-
-import (
-	"testing"
-	"text/template"
-)
-
-func TestParseTag(t *testing.T) {
-	cases := []struct {
-		name     string
-		template string
-		want     string
-	}{
-		{
-			name:     "empty",
-			template: "",
-			want:     "{",
-		},
-		{
-			name:     "no tag",
-			template: "{{if .ToolCalls}}{{end}}",
-			want:     "{",
-		},
-		{
-			name:     "no tag with range",
-			template: "{{if .ToolCalls}}{{range .ToolCalls}}{{ . }}{{end}}{{end}}",
-			want:     "{",
-		},
-		{
-			name:     "tool call with json format",
-			template: "{{if .ToolCalls}}```json\n{{end}}",
-			want:     "```json",
-		},
-		{
-			name:     "square brackets",
-			template: "{{if .ToolCalls}}[{{range .ToolCalls}}{{ . }}{{end}}]{{end}}",
-			want:     "[",
-		},
-		{
-			name:     "square brackets with whitespace",
-			template: "{{if .ToolCalls}}\n [ {{range .ToolCalls}}{{ . }}{{end}}]{{end}}",
-			want:     "[",
-		},
-		{
-			name:     "tailing ]",
-			template: "{{if .ToolCalls}}{{range .ToolCalls}}{{ . }}{{end}}]{{end}}",
-			want:     "{",
-		},
-		{
-			name:     "whitespace only",
-			template: "{{if .ToolCalls}} {{range .ToolCalls}}{{ . }}{{end}}{{end}}",
-			want:     "{",
-		},
-		{
-			name:     "whitespace only in range",
-			template: "{{if .ToolCalls}}{{range .ToolCalls}}\n{{ . }}\n{{end}}{{end}}",
-			want:     "{",
-		},
-		{
-			name:     "json objects",
-			template: `{{if .ToolCalls}}{{range .ToolCalls}}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{end}}{{end}}`,
-			want:     "{",
-		},
-		{
-			name:     "json objects with whitespace",
-			template: "{{if .ToolCalls}}{{range .ToolCalls}}\n{\"name\": \"{{ .Function.Name }}\", \"arguments\": {{ .Function.Arguments }}}{{end}}{{end}}",
-			want:     "{",
-		},
-		{
-			name:     "json objects with CRLF",
-			template: "{{if .ToolCalls}}{{range .ToolCalls}}\r\n{\"name\": \"{{ .Function.Name }}\", \"arguments\": {{ .Function.Arguments }}}{{end}}{{end}}",
-			want:     "{",
-		},
-		{
-			name:     "json objects with whitespace before and after range",
-			template: "{{if .ToolCalls}}\n{{range .ToolCalls}}\n{\"name\": \"{{ .Function.Name }}\", \"arguments\": {{ .Function.Arguments }}}\r\n{{end}}\r\n{{end}}",
-			want:     "{",
-		},
-		{
-			name:     "before and after range",
-			template: "{{if .ToolCalls}}<|tool▁calls▁begin|>{{range .ToolCalls}}<|tool▁call▁begin|>functionget_current_weather\n```json\n{\"location\": \"Tokyo\"}\n```<|tool▁call▁end|>\n{{end}}<|tool▁calls▁end|>{{end}}",
-			want:     "<|tool▁calls▁begin|>",
-		},
-		{
-			name:     "after range",
-			template: "{{if .ToolCalls}}{{range .ToolCalls}}<tool_call>{\"name\": \"{{ .Function.Name }}\", \"arguments\": {{ .Function.Arguments }}}</tool_call>{{end}}{{end}}",
-			want:     "<tool_call>",
-		},
-		{
-			name:     "after range with leading whitespace before range",
-			template: "{{if .ToolCalls}}\n{{range .ToolCalls}}<tool_call>{\"name\": \"{{ .Function.Name }}\", \"arguments\": {{ .Function.Arguments }}}</tool_call>{{end}}{{end}}",
-			want:     "<tool_call>",
-		},
-		{
-			name:     "tool call in range with {",
-			template: `{{if .ToolCalls}}{{range .ToolCalls}}<tool_call>{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}<tool_call>{{end}}{{end}}`,
-			want:     "<tool_call>",
-		},
-		{
-			name:     "tool call with multiple text nodes",
-			template: "{{if .ToolCalls}}First text{{if .Something}}inner{{end}}Second text{{end}}",
-			want:     "First text",
-		},
-		{
-			name:     "action tag",
-			template: "{{if .ToolCalls}}Action: ```json{{end}}",
-			want:     "Action: ```json",
-		},
-		{
-			name:     "incomplete functools bracket",
-			template: "{{if .ToolCalls}}functools[{{end}}",
-			want:     "functools[",
-		},
-		{
-			name:     "uppercase tool call with incomplete bracket",
-			template: "{{if .ToolCalls}}[TOOL_CALL] [{{end}}",
-			want:     "[TOOL_CALL] [",
-		},
-		{
-			name:     "uppercase tool call with adjacent bracket",
-			template: "{{if .ToolCalls}}[TOOL_CALL][{{end}}",
-			want:     "[TOOL_CALL][",
-		},
-	}
-
-	for _, tc := range cases {
-		t.Run(tc.name, func(t *testing.T) {
-			tmpl, err := template.New("test").Parse(tc.template)
-			if err != nil && tc.template != "" {
-				t.Fatalf("failed to parse template: %v", err)
-			}
-
-			got := parseTag(tmpl)
-			if got != tc.want {
-				t.Errorf("got text %q, want %q", got, tc.want)
-			}
-		})
-	}
-}
--- a/tools/testdata/command-r-plus.gotmpl
+++ b/tools/testdata/command-r-plus.gotmpl
@@ -0,0 +1,67 @@
+{{- if or .Tools .System }}<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
+{{- if .Tools }}# Safety Preamble
+The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.
+
+# System Preamble
+## Basic Rules
+You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.
+
+{{ if .System }}# User Preamble
+{{ .System }}
+{{- end }}
+
+## Available Tools
+Here is a list of tools that you have available to you:
+{{- range .Tools }}
+
+```python
+def {{ .Function.Name }}(
+{{- range $name, $property := .Function.Parameters.Properties }}{{ $name }}: {{ $property.Type }}, {{ end }}) -> List[Dict]:
+    """{{ .Function.Description }}
+
+{{- if .Function.Parameters.Properties }}
+
+    Args:
+{{- range $name, $property := .Function.Parameters.Properties }}
+        {{ $name }} ({{ $property.Type }}): {{ $property.Description }}
+{{- end }}
+{{- end }}
+    """
+    pass
+```
+{{- end }}
+{{- else if .System }}{{ .System }}
+{{- end }}<|END_OF_TURN_TOKEN|>
+{{- end }}
+{{- range .Messages }}
+{{- if eq .Role "system" }}
+{{- continue }}
+{{- end }}<|START_OF_TURN_TOKEN|>
+{{- if eq .Role "user" }}<|USER_TOKEN|>{{ .Content }}
+{{- else if eq .Role "assistant" }}<|CHATBOT_TOKEN|>
+{{- if .Content }}{{ .Content }}
+{{- else if .ToolCalls }}
+Action: ```json
+[
+{{- range .ToolCalls }}
+    {
+        "tool_name": "{{ .Function.Name }}",
+        "parameters": {{ .Function.Arguments }}
+    }
+{{- end }}
+]```
+{{ continue }}
+{{ end }}
+{{- else if eq .Role "tool" }}<|SYSTEM_TOKEN|><results>
+{{ .Content }}</results>
+{{- end }}<|END_OF_TURN_TOKEN|>
+{{- end }}
+{{- if .Tools }}<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write 'Action:' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user's last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example:
+```json
+[
+    {
+        "tool_name": title of the tool in the specification,
+        "parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters
+    }
+]```
+{{- end }}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
--- a/tools/testdata/command-r-plus.out
+++ b/tools/testdata/command-r-plus.out
@@ -0,0 +1,39 @@
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble
+The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.
+
+# System Preamble
+## Basic Rules
+You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.
+
+# User Preamble
+You are a knowledgeable assistant. You can answer questions and perform tasks.
+
+## Available Tools
+Here is a list of tools that you have available to you:
+
+```python
+def get_current_weather(format: string, location: string, ) -> List[Dict]:
+    """Get the current weather
+
+    Args:
+        format (string): The temperature unit to use. Infer this from the user's location.
+        location (string): The city and state, e.g. San Francisco, CA
+    """
+    pass
+```<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>What's the weather like today in Paris?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
+Action: ```json
+[
+    {
+        "tool_name": "get_current_weather",
+        "parameters": {"format":"celsius","location":"Paris, France"}
+    }
+]```
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><results>
+22</results><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>The current temperature in Paris, France is 22 degrees Celsius.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>What's the weather like today in San Francisco and Toronto?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write 'Action:' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user's last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example:
+```json
+[
+    {
+        "tool_name": title of the tool in the specification,
+        "parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters
+    }
+]```<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
--- a/tools/testdata/firefunction.gotmpl
+++ b/tools/testdata/firefunction.gotmpl
@@ -0,0 +1,31 @@
+{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|>
+{{- if .System }}
+{{ .System }}
+{{- end }}
+In addition to plain text responses, you can chose to call one or more of the provided functions.
+
+Use the following rule to decide when to call a function:
+  * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
+  * if you need external information that can be obtained by calling one or more of the provided functions, generate a function calls
+
+If you decide to call functions:
+  * prefix function calls with functools marker (no closing marker required)
+  * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
+  * follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples
+  * respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0
+  * make sure you pick the right functions that match the user intent
+
+Available functions as JSON spec:
+{{- if .Tools }}
+{{ .Tools }}
+{{- end }}<|eot_id|>
+{{- end }}
+{{- range .Messages }}<|start_header_id|>
+{{- if or (eq .Role "user") (eq .Role "assistant") (eq .Role "tool") }}{{ .Role }}
+{{- end }}<|end_header_id|>
+{{- if .Content }}{{ .Content }}
+{{- else if .ToolCalls }} functools[
+{{- range .ToolCalls }}{{ "{" }}"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}{{ "}" }}
+{{- end }}]
+{{- end }}<|eot_id|>
+{{- end }}<|start_header_id|>assistant<|end_header_id|>
--- a/tools/testdata/firefunction.out
+++ b/tools/testdata/firefunction.out
@@ -0,0 +1,17 @@
+<|start_header_id|>system<|end_header_id|>
+You are a knowledgeable assistant. You can answer questions and perform tasks.
+In addition to plain text responses, you can chose to call one or more of the provided functions.
+
+Use the following rule to decide when to call a function:
+  * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
+  * if you need external information that can be obtained by calling one or more of the provided functions, generate a function calls
+
+If you decide to call functions:
+  * prefix function calls with functools marker (no closing marker required)
+  * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
+  * follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples
+  * respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0
+  * make sure you pick the right functions that match the user intent
+
+Available functions as JSON spec:
+[{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}]<|eot_id|><|start_header_id|><|end_header_id|>You are a knowledgeable assistant. You can answer questions and perform tasks.<|eot_id|><|start_header_id|>user<|end_header_id|>What's the weather like today in Paris?<|eot_id|><|start_header_id|>assistant<|end_header_id|> functools[{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}]<|eot_id|><|start_header_id|>tool<|end_header_id|>22<|eot_id|><|start_header_id|>assistant<|end_header_id|>The current temperature in Paris, France is 22 degrees Celsius.<|eot_id|><|start_header_id|>user<|end_header_id|>What's the weather like today in San Francisco and Toronto?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
--- a/tools/testdata/llama3-groq-tool-use.gotmpl
+++ b/tools/testdata/llama3-groq-tool-use.gotmpl
@@ -0,0 +1,43 @@
+{{- if .Messages }}
+{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|>
+
+{{ .System }}
+{{- if .Tools }} You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
+<tool_call>
+{"name": <function-name>,"arguments": <args-dict>}
+</tool_call>
+
+Here are the available tools:
+<tools>
+{{- range .Tools }} {{ .Function }}
+{{- end }} </tools>
+{{- end }}
+{{- end }}<|eot_id|>
+{{- range .Messages }}
+{{- if ne .Role "system" }}<|start_header_id|>{{ .Role }}<|end_header_id|>
+
+{{ if eq .Role "user" }}{{ .Content }}
+{{- else if eq .Role "assistant" }}
+{{- if .Content }}{{ .Content }}
+{{- else if .ToolCalls }}<tool_call>
+{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
+{{- end }}
+</tool_call>
+{{- end }}
+{{- else if eq .Role "tool" }}<tool_response>
+{{ .Content }}
+</tool_response>
+{{- end }}<|eot_id|>
+{{- end }}
+{{- end }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ else }}
+{{ if .System }}<|start_header_id|>system<|end_header_id|>
+
+{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
+
+{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ end }}{{ .Response }}
+{{- if .Response }}<|eot_id|>
+{{- end }}
--- a/tools/testdata/llama3-groq-tool-use.out
+++ b/tools/testdata/llama3-groq-tool-use.out
@@ -0,0 +1,24 @@
+<|start_header_id|>system<|end_header_id|>
+
+You are a knowledgeable assistant. You can answer questions and perform tasks. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
+<tool_call>
+{"name": <function-name>,"arguments": <args-dict>}
+</tool_call>
+
+Here are the available tools:
+<tools> {"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}} </tools><|eot_id|><|start_header_id|>user<|end_header_id|>
+
+What's the weather like today in Paris?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+<tool_call>
+{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}
+</tool_call><|eot_id|><|start_header_id|>tool<|end_header_id|>
+
+<tool_response>
+22
+</tool_response><|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+The current temperature in Paris, France is 22 degrees Celsius.<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+What's the weather like today in San Francisco and Toronto?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
--- a/tools/testdata/llama3.2.gotmpl
+++ b/tools/testdata/llama3.2.gotmpl
@@ -0,0 +1,44 @@
+<|start_header_id|>system<|end_header_id|>
+
+Cutting Knowledge Date: December 2023
+
+{{ if .System }}{{ .System }}
+{{- end }}
+{{- if .Tools }}When you receive a tool call response, use the output to format an answer to the orginal user question.
+
+You are a helpful assistant with tool calling capabilities.
+{{- end }}<|eot_id|>
+{{- range $i, $_ := .Messages }}
+{{- $last := eq (len (slice $.Messages $i)) 1 }}
+{{- if eq .Role "user" }}<|start_header_id|>user<|end_header_id|>
+{{- if and $.Tools $last }}
+
+Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.
+
+Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables.
+
+{{ range $.Tools }}
+{{- . }}
+{{ end }}
+{{ .Content }}<|eot_id|>
+{{- else }}
+
+{{ .Content }}<|eot_id|>
+{{- end }}{{ if $last }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ end }}
+{{- else if eq .Role "assistant" }}<|start_header_id|>assistant<|end_header_id|>
+{{- if .ToolCalls }}
+{{ range .ToolCalls }}
+{"name": "{{ .Function.Name }}", "parameters": {{ .Function.Arguments }}}{{ end }}
+{{- else }}
+
+{{ .Content }}
+{{- end }}{{ if not $last }}<|eot_id|>{{ end }}
+{{- else if eq .Role "tool" }}<|start_header_id|>ipython<|end_header_id|>
+
+{{ .Content }}<|eot_id|>{{ if $last }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ end }}
+{{- end }}
+{{- end }}
--- a/tools/testdata/llama3.2.out
+++ b/tools/testdata/llama3.2.out
@@ -0,0 +1,24 @@
+<|start_header_id|>system<|end_header_id|>
+
+Cutting Knowledge Date: December 2023
+
+You are a knowledgeable assistant. You can answer questions and perform tasks.When you receive a tool call response, use the output to format an answer to the orginal user question.
+
+You are a helpful assistant with tool calling capabilities.<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+What's the weather like today in Paris?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+{"name": "get_current_weather", "parameters": {"format":"celsius","location":"Paris, France"}}<|eot_id|><|start_header_id|>ipython<|end_header_id|>
+
+22<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+The current temperature in Paris, France is 22 degrees Celsius.<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.
+
+Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables.
+
+{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}
+
+What's the weather like today in San Francisco and Toronto?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
--- a/tools/testdata/messages.json
+++ b/tools/testdata/messages.json
@@ -0,0 +1,39 @@
+[
+  {
+    "role": "system",
+    "content": "You are a knowledgeable assistant. You can answer questions and perform tasks."
+  },
+  {
+    "role": "user",
+    "content": "What's the weather like today in Paris?"
+  },
+  {
+    "role": "assistant",
+    "tool_calls": [
+      {
+        "id": "89a1e453-0bce-4de3-a456-c54bed09c520",
+        "type": "function",
+        "function": {
+          "name": "get_current_weather",
+          "arguments": {
+            "location": "Paris, France",
+            "format": "celsius"
+          }
+        }
+      }
+    ]
+  },
+  {
+    "role": "tool",
+    "tool_call_id": "89a1e453-0bce-4de3-a456-c54bed09c520",
+    "content": "22"
+  },
+  {
+    "role": "assistant",
+    "content": "The current temperature in Paris, France is 22 degrees Celsius."
+  },
+  {
+    "role": "user",
+    "content": "What's the weather like today in San Francisco and Toronto?"
+  }
+]
--- a/tools/testdata/mistral.gotmpl
+++ b/tools/testdata/mistral.gotmpl
@@ -0,0 +1,15 @@
+{{- range $index, $_ := .Messages }}
+{{- if eq .Role "user" }}
+{{- if and (eq (len (slice $.Messages $index)) 1) $.Tools }}[AVAILABLE_TOOLS] {{ $.Tools }}[/AVAILABLE_TOOLS]
+{{- end }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }}
+
+{{ end }}{{ .Content }}[/INST]
+{{- else if eq .Role "assistant" }}
+{{- if .Content }} {{ .Content }}</s>
+{{- else if .ToolCalls }}[TOOL_CALLS] [
+{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
+{{- end }}]</s>
+{{- end }}
+{{- else if eq .Role "tool" }}[TOOL_RESULTS] {"content": {{ .Content }}}[/TOOL_RESULTS]
+{{- end }}
+{{- end }}
--- a/tools/testdata/mistral.out
+++ b/tools/testdata/mistral.out
@@ -0,0 +1,3 @@
+[INST] What's the weather like today in Paris?[/INST][TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}]</s>[TOOL_RESULTS] {"content": 22}[/TOOL_RESULTS] The current temperature in Paris, France is 22 degrees Celsius.</s>[AVAILABLE_TOOLS] [{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}][/AVAILABLE_TOOLS][INST] You are a knowledgeable assistant. You can answer questions and perform tasks.
+
+What's the weather like today in San Francisco and Toronto?[/INST]
--- a/tools/testdata/nemotron.gotmpl
+++ b/tools/testdata/nemotron.gotmpl
@@ -0,0 +1,33 @@
+{{- if (or .Tools .System) }}<extra_id_0>System
+{{ if .System }}{{ .System }}
+
+
+{{ end }}
+{{- if .Tools }}
+{{- range .Tools }}<tool> {{ . }} </tool>{{ end }}
+
+
+{{ end }}
+{{- end }}
+{{- range $i, $m := .Messages }}
+{{- $last := eq (len (slice $.Messages $i)) 1 -}}
+{{- if eq .Role "user" }}<extra_id_1>User
+{{ .Content }}
+{{- if $last }}
+<extra_id_1>Assistant
+{{- end }}
+{{ else if eq .Role "tool" }}<extra_id_1>Tool
+{{ .Content }}
+{{- if $last }}
+<extra_id_1>Assistant
+{{- end }}
+{{ else if eq .Role "assistant" }}<extra_id_1>Assistant
+{{- if .ToolCalls }}
+{{ range .ToolCalls }}<toolcall> {"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} </toolcall> {{ end }}
+{{ else }}
+{{ .Content }}
+{{- if not $last }}
+{{ end }}
+{{- end }}
+{{- end }}
+{{- end }}
--- a/tools/testdata/nemotron.out
+++ b/tools/testdata/nemotron.out
@@ -0,0 +1,18 @@
+<extra_id_0>System
+You are a knowledgeable assistant. You can answer questions and perform tasks.
+
+
+<tool> {"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}} </tool>
+
+
+<extra_id_1>User
+What's the weather like today in Paris?
+<extra_id_1>Assistant
+<toolcall> {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}} </toolcall> 
+<extra_id_1>Tool
+22
+<extra_id_1>Assistant
+The current temperature in Paris, France is 22 degrees Celsius.
+<extra_id_1>User
+What's the weather like today in San Francisco and Toronto?
+<extra_id_1>Assistant
--- a/tools/testdata/qwen2.5.gotmpl
+++ b/tools/testdata/qwen2.5.gotmpl
@@ -0,0 +1,51 @@
+{{- if .Suffix }}<|fim_prefix|>{{ .Prompt }}<|fim_suffix|>{{ .Suffix }}<|fim_middle|>
+{{- else if .Messages }}
+{{- if or .System .Tools }}<|im_start|>system
+{{- if .System }}
+{{ .System }}
+{{- end }}
+{{- if .Tools }}
+
+# Tools
+
+You may call one or more functions to assist with the user query.
+
+You are provided with function signatures within <tools></tools> XML tags:
+<tools>
+{{- range .Tools }}
+{"type": "function", "function": {{ .Function }}}
+{{- end }}
+</tools>
+
+For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
+<tool_call>
+{"name": <function-name>, "arguments": <args-json-object>}
+</tool_call>
+{{- end }}<|im_end|>
+{{ end }}
+{{- range $i, $_ := .Messages }}
+{{- $last := eq (len (slice $.Messages $i)) 1 -}}
+{{- if eq .Role "user" }}<|im_start|>user
+{{ .Content }}<|im_end|>
+{{ else if eq .Role "assistant" }}<|im_start|>assistant
+{{ if .Content }}{{ .Content }}
+{{- else if .ToolCalls }}<tool_call>
+{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
+{{ end }}</tool_call>
+{{- end }}{{ if not $last }}<|im_end|>
+{{ end }}
+{{- else if eq .Role "tool" }}<|im_start|>user
+<tool_response>
+{{ .Content }}
+</tool_response><|im_end|>
+{{ end }}
+{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant
+{{ end }}
+{{- end }}
+{{- else }}
+{{- if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+{{ end }}{{ if .Prompt }}<|im_start|>user
+{{ .Prompt }}<|im_end|>
+{{ end }}<|im_start|>assistant
+{{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }}
--- a/tools/testdata/qwen2.5.out
+++ b/tools/testdata/qwen2.5.out
@@ -0,0 +1,31 @@
+<|im_start|>system
+You are a knowledgeable assistant. You can answer questions and perform tasks.
+
+# Tools
+
+You may call one or more functions to assist with the user query.
+
+You are provided with function signatures within <tools></tools> XML tags:
+<tools>
+{"type": "function", "function": {"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}
+</tools>
+
+For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
+<tool_call>
+{"name": <function-name>, "arguments": <args-json-object>}
+</tool_call><|im_end|>
+<|im_start|>user
+What's the weather like today in Paris?<|im_end|>
+<|im_start|>assistant
+<tool_call>
+{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}
+</tool_call><|im_end|>
+<|im_start|>user
+<tool_response>
+22
+</tool_response><|im_end|>
+<|im_start|>assistant
+The current temperature in Paris, France is 22 degrees Celsius.<|im_end|>
+<|im_start|>user
+What's the weather like today in San Francisco and Toronto?<|im_end|>
+<|im_start|>assistant
--- a/tools/testdata/qwen3.gotmpl
+++ b/tools/testdata/qwen3.gotmpl
@@ -0,0 +1,50 @@
+{{- if .Messages }}
+{{- if or .System .Tools }}<|im_start|>system
+{{- if .System }}
+{{ .System }}
+{{- end }}
+{{- if .Tools }}
+
+# Tools
+
+You may call one or more functions to assist with the user query.
+
+You are provided with function signatures within <tools></tools> XML tags:
+<tools>
+{{- range .Tools }}
+{"type": "function", "function": {{ .Function }}}
+{{- end }}
+</tools>
+
+For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
+<tool_call>
+{"name": <function-name>, "arguments": <args-json-object>}
+</tool_call>
+{{- end }}<|im_end|>
+{{ end }}
+{{- range $i, $_ := .Messages }}
+{{- $last := eq (len (slice $.Messages $i)) 1 -}}
+{{- if eq .Role "user" }}<|im_start|>user
+{{ .Content }}<|im_end|>
+{{ else if eq .Role "assistant" }}<|im_start|>assistant
+{{ if .Content }}{{ .Content }}
+{{- else if .ToolCalls }}<tool_call>
+{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
+{{ end }}</tool_call>
+{{- end }}{{ if not $last }}<|im_end|>
+{{ end }}
+{{- else if eq .Role "tool" }}<|im_start|>user
+<tool_response>
+{{ .Content }}
+</tool_response><|im_end|>
+{{ end }}
+{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant
+{{ end }}
+{{- end }}
+{{- else }}
+{{- if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+{{ end }}{{ if .Prompt }}<|im_start|>user
+{{ .Prompt }}<|im_end|>
+{{ end }}<|im_start|>assistant
+{{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }}
--- a/tools/testdata/qwen3.out
+++ b/tools/testdata/qwen3.out
@@ -0,0 +1,31 @@
+<|im_start|>system
+You are a knowledgeable assistant. You can answer questions and perform tasks.
+
+# Tools
+
+You may call one or more functions to assist with the user query.
+
+You are provided with function signatures within <tools></tools> XML tags:
+<tools>
+{"type": "function", "function": {"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}
+</tools>
+
+For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
+<tool_call>
+{"name": <function-name>, "arguments": <args-json-object>}
+</tool_call><|im_end|>
+<|im_start|>user
+What's the weather like today in Paris?<|im_end|>
+<|im_start|>assistant
+<tool_call>
+{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}
+</tool_call><|im_end|>
+<|im_start|>user
+<tool_response>
+22
+</tool_response><|im_end|>
+<|im_start|>assistant
+The current temperature in Paris, France is 22 degrees Celsius.<|im_end|>
+<|im_start|>user
+What's the weather like today in San Francisco and Toronto?<|im_end|>
+<|im_start|>assistant
--- a/tools/testdata/tools.json
+++ b/tools/testdata/tools.json
@@ -0,0 +1,30 @@
+[
+  {
+    "type": "function",
+    "function": {
+      "name": "get_current_weather",
+      "description": "Get the current weather",
+      "parameters": {
+        "type": "object",
+        "properties": {
+          "location": {
+            "type": "string",
+            "description": "The city and state, e.g. San Francisco, CA"
+          },
+          "format": {
+            "type": "string",
+            "enum": [
+              "celsius",
+              "fahrenheit"
+            ],
+            "description": "The temperature unit to use. Infer this from the user's location."
+          }
+        },
+        "required": [
+          "location",
+          "format"
+        ]
+      }
+    }
+  }
+]
--- a/tools/testdata/xlam.gotmpl
+++ b/tools/testdata/xlam.gotmpl
@@ -0,0 +1,45 @@
+{{- if .System }}{{ .System }}
+{{ end }}
+{{- range $i, $_ := .Messages }}
+{{- if eq .Role "user" }}### Instruction:
+{{- if and $.Tools (le (len (slice $.Messages $i)) 2) }}
+[BEGIN OF TASK INSTRUCTION]
+You are an expert in composing functions. You are given a question and a set of possible functions. 
+Based on the question, you will need to make one or more function/tool calls to achieve the purpose. 
+If none of the functions can be used, point it out and refuse to answer. 
+If the given question lacks the parameters required by the function, also point it out.
+[END OF TASK INSTRUCTION]
+
+[BEGIN OF AVAILABLE TOOLS]
+{{ $.Tools }}
+[END OF AVAILABLE TOOLS]
+
+[BEGIN OF FORMAT INSTRUCTION]
+The output MUST strictly adhere to the following JSON format, and NO other text MUST be included.
+The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'.
+```
+{
+    "tool_calls": [
+    {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},
+    ... (more tool calls as required)
+    ]
+}
+```
+[END OF FORMAT INSTRUCTION]
+
+[BEGIN OF QUERY]
+{{ .Content }}
+[END OF QUERY]
+
+
+{{ else }}
+{{ .Content }}
+{{ end }}
+{{- else if .ToolCalls }}### Response:
+{"tool_calls": [{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{ end }}]}
+<|EOT|>
+{{ else if eq .Role "assistant" }}### Response:
+{{ .Content }}
+<|EOT|>
+{{ end }}
+{{- end }}### Response:
--- a/tools/testdata/xlam.out
+++ b/tools/testdata/xlam.out
@@ -0,0 +1,40 @@
+You are a knowledgeable assistant. You can answer questions and perform tasks.
+### Instruction:
+What's the weather like today in Paris?
+### Response:
+{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}]}
+<|EOT|>
+### Response:
+The current temperature in Paris, France is 22 degrees Celsius.
+<|EOT|>
+### Instruction:
+[BEGIN OF TASK INSTRUCTION]
+You are an expert in composing functions. You are given a question and a set of possible functions. 
+Based on the question, you will need to make one or more function/tool calls to achieve the purpose. 
+If none of the functions can be used, point it out and refuse to answer. 
+If the given question lacks the parameters required by the function, also point it out.
+[END OF TASK INSTRUCTION]
+
+[BEGIN OF AVAILABLE TOOLS]
+[{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the user's location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}]
+[END OF AVAILABLE TOOLS]
+
+[BEGIN OF FORMAT INSTRUCTION]
+The output MUST strictly adhere to the following JSON format, and NO other text MUST be included.
+The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'.
+```
+{
+    "tool_calls": [
+    {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},
+    ... (more tool calls as required)
+    ]
+}
+```
+[END OF FORMAT INSTRUCTION]
+
+[BEGIN OF QUERY]
+What's the weather like today in San Francisco and Toronto?
+[END OF QUERY]
+
+
+### Response:
--- a/tools/tools.go
+++ b/tools/tools.go
@@ -1,287 +1,281 @@
 package tools

 import (
-	"bytes"
 	"encoding/json"
+	"errors"
+	"fmt"
+	"log/slog"
 	"strings"
-	"text/template"
+	gotmpl "text/template"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/template"
 )

-type toolsState int
-
-const (
-	toolsState_LookingForTag toolsState = iota
-	toolsState_ToolCalling
-	toolsState_Done
+// Sentinel errors returned by the parsing helpers in this file:
+// errInvalidToolCall means the input cannot be interpreted as a tool call,
+// and errAccumulateMore means the input seen so far is incomplete and more
+// content must be buffered before parsing is attempted again.
+var (
+	errInvalidToolCall = errors.New("invalid tool call format")
+	errAccumulateMore  = errors.New("need to accumulate more content")
 )

+// ToolParser is the interface for incremental tool call parsers.
+type ToolParser interface {
+	// Add consumes the next piece of input and returns any tool calls
+	// parsed from it together with content that is not part of a tool call.
+	Add(s string) (tools []api.ToolCall, content string)
+	// NewParser builds a ToolParser from templateToProcess.
+	NewParser(templateToProcess *gotmpl.Template) (ToolParser, error)
+}
+
+// Parser incrementally extracts tool calls from text passed to Add.
+//
+// Fields:
+//   - greedyParseJSON: when true, Add attempts JSON parsing even if the prefix has not been seen
+//   - prefix: text taken from the template that precedes tool calls; may be empty
+//   - prefixFound: set once input has been seen that starts with prefix
+//   - tmpl: the tool call sub-template stored by NewParser
+//   - sb: input buffered across calls to Add
+//   - index: the index assigned to the next parsed tool call
+//   - name: the JSON key that holds a tool call's name
+//   - arguments: the JSON key that holds a tool call's arguments
 type Parser struct {
-	tag        string
-	names      []string
-	properties []string
-
-	state  toolsState
-	buffer []byte
-	n      int
+	greedyParseJSON bool
+	prefix          string
+	prefixFound     bool
+	tmpl            gotmpl.Template
+	sb              strings.Builder
+	index           int
+	name            string
+	arguments       string
 }

-// NewParser creates a new tool call parser from a model's chat
-// template and a list of provided tools.
-func NewParser(tmpl *template.Template, tools []api.Tool) *Parser {
-	return NewParserWithTag(tools, parseTag(tmpl))
-}
+// parseJSONToolCalls attempts to parse a JSON string into a slice of ToolCalls.
+//
+// NOTE(review): braces and brackets are counted without regard to JSON string
+// literals, so a "{" or "[" inside a string value unbalances the counts.
+//
+// Parameters:
+//   - s: The string to parse
+//   - name: The field name from template that identifies the tool call name
+//   - arguments: The field name from template that identifies the tool call arguments
+//   - prefix: The tool call prefix from the template, used to decide whether square brackets are balance-checked
+//
+// Returns:
+//   - []api.ToolCall: The parsed tool calls if successful
+//   - error: errAccumulateMore if braces/brackets are unbalanced or s is empty, errInvalidToolCall if invalid, or nil if successful
+func parseJSONToolCalls(s string, name, arguments string, prefix string) ([]api.ToolCall, error) {
+	// Check for balanced braces before attempting to parse
+	braceCount := 0
+	squareCount := 0
+	startIndex := -1
+	var rawToolCalls []string
+	s = strings.TrimSpace(s)

-func NewParserWithTag(tools []api.Tool, tag string) *Parser {
-	var p Parser
-	for _, t := range tools {
-		p.names = append(p.names, t.Function.Name)
-		for r := range t.Function.Parameters.Properties {
-			p.properties = append(p.properties, r)
+	// Square brackets are balance-checked unless the prefix ends in "[" and s
+	// does not itself start with "[": in that case the opening bracket is not
+	// part of s, so only its closing partner would be seen.
+	trackSquareBrackets := prefix == "" || !strings.HasSuffix(prefix, "[") || strings.HasPrefix(s, "[")
+	for i, c := range s {
+		switch c {
+		case '{':
+			braceCount++
+			if startIndex == -1 {
+				startIndex = i
+			}
+		case '}':
+			braceCount--
+			if braceCount == 0 {
+				// A top-level object just closed; keep its raw text for unmarshalling
+				rawToolCalls = append(rawToolCalls, s[startIndex:i+1])
+				startIndex = -1
+			}
+		case '[':
+			if trackSquareBrackets {
+				squareCount++
+			}
+		case ']':
+			if trackSquareBrackets {
+				squareCount--
+			}
+		}
+
+		// Negative means we have an extra closing brace/bracket
+		if braceCount < 0 || squareCount < 0 {
+			return nil, errInvalidToolCall
 		}
 	}
-	p.tag = tag
-	return &p
+
+	// If braces/brackets aren't balanced, need more input
+	if braceCount > 0 || squareCount > 0 {
+		return nil, errAccumulateMore
+	}
+
+	// s was trimmed above; empty input or a lone "[" cannot hold a tool call yet
+	if s == "" || s == "[" {
+		return nil, errAccumulateMore
+	}
+
+	// Attempt full unmarshal of the JSON
+	var toolCalls []api.ToolCall
+	for _, rawToolCall := range rawToolCalls {
+		var resp map[string]any
+		if err := json.Unmarshal([]byte(rawToolCall), &resp); err != nil {
+			continue
+		}
+
+		// Collect nested objects that could contain tool calls
+		objs := collect(resp)
+		if len(objs) == 0 {
+			continue
+		}
+
+		// Extract tool calls from objects
+		for _, kv := range objs {
+			n, nok := kv[name].(string)
+			a, aok := kv[arguments].(map[string]any)
+			if nok && aok {
+				toolCalls = append(toolCalls, api.ToolCall{
+					Function: api.ToolCallFunction{
+						Name:      n,
+						Arguments: a,
+					},
+				})
+			} else {
+				slog.Debug("No valid tool call found in object.", "object", kv)
+			}
+		}
+	}
+
+	// Valid JSON, no tool calls found
+	if len(toolCalls) == 0 {
+		return nil, errInvalidToolCall
+	}
+
+	return toolCalls, nil
 }

-// Add processes a string input to parse tool calls and content that
-// should be sent back to the user.
-func (p *Parser) Add(s string) (calls []api.ToolCall, content string) {
-	if p.state == toolsState_Done {
+// checkPrefix looks for the parser's tool call prefix in s.
+//
+// Returns:
+//   - s with the prefix stripped when s starts with it, the text ahead of a
+//     partial or embedded prefix when one is found, or s unchanged otherwise
+//   - error: errAccumulateMore when the remainder was buffered for the next pass, or nil
+func (p *Parser) checkPrefix(s string) (string, error) {
+	if p.prefix == "" || s == "" {
+		return s, nil
+	}
+
+	// Input begins with the prefix: strip it and remember that it was seen
+	if strings.HasPrefix(s, p.prefix) {
+		p.prefixFound = true
+		return s[len(p.prefix):], nil
+	}
+
+	// The tail of s may be the beginning of the prefix: buffer only that tail
+	if at := suffixOverlap(s, p.prefix); at != -1 {
+		head, tail := s[:at], s[at:]
+		p.sb.Reset()
+		p.sb.WriteString(tail)
+		return head, errAccumulateMore
+	}
+
+	at := strings.Index(s, p.prefix)
+	if at == -1 {
+		// Neither a full nor a partial prefix is present
+		return s, nil
+	}
+
+	// The prefix sits inside s: buffer from the prefix onward for the next pass
+	head, rest := s[:at], strings.TrimSpace(s[at:])
+	p.sb.Reset()
+	p.sb.WriteString(rest)
+	return head, errAccumulateMore
+}
+
+// Add processes a string input to parse tool calls and content.
+// The input is appended to the parser's buffer, the buffer is checked for the
+// tool call prefix, and the remainder is parsed as JSON tool calls.
+//
+// Returns:
+//   - tools: Any parsed tool calls, each carrying the next sequential index
+//   - content: Non-tool call content
+func (p *Parser) Add(s string) (tools []api.ToolCall, content string) {
+	p.sb.WriteString(s)
+	s = p.sb.String()
+
+	// Check for prefix pattern in input
+	s, err := p.checkPrefix(s)
+	if err != nil {
+		// Need more input to complete prefix
 		return nil, s
 	}

-	p.buffer = append(p.buffer, s...)
-
-	if p.state == toolsState_LookingForTag {
-		i, found := p.findTag()
-		if i == -1 {
-			content = string(p.buffer)
-			p.buffer = []byte{}
-		} else {
-			content = string(p.buffer[:i])
-			p.buffer = p.buffer[i:]
-		}
-
-		// for models where { or [ are used as tool calling
-		// tags, we only support parsing tools if the first non-
-		// whitespace character is { or [
-		if p.tag == "{" || p.tag == "[" {
-			if strings.TrimSpace(content) != "" {
-				p.state = toolsState_Done
-				return nil, content + string(p.buffer)
-			}
-		}
-
-		if !found {
-			return nil, content
-		}
-
-		p.state = toolsState_ToolCalling
+	// Exit if prefix exists in template, greedy parsing is off, and prefix not found
+	if !p.greedyParseJSON && !p.prefixFound {
+		p.sb.Reset()
+		return nil, s
 	}

-	for {
-		call := p.parseToolCall()
-		if call == nil {
-			break
+	toolCalls, err := parseJSONToolCalls(s, p.name, p.arguments, p.prefix)
+	if err != nil {
+		if errors.Is(err, errAccumulateMore) {
+			return nil, ""
 		}
-
-		calls = append(calls, *call)
+		p.sb.Reset()
+		// Only do greedy JSON parsing if there is no prefix from template
+		if p.prefix != "" {
+			p.greedyParseJSON = false
+		}
+		if p.index != 0 && p.prefix == "" {
+			return nil, ""
+		}
+		if p.prefixFound {
+			// Drop tokens since prefix was found
+			return nil, ""
+		}
+		return nil, s
 	}

-	if p.done() {
-		p.state = toolsState_Done
-		content = string(p.buffer)
-		p.buffer = []byte{}
+	// Assign indices through the slice: ranging by value would only update a copy
+	for i := range toolCalls {
+		toolCalls[i].Function.Index = p.index
+		p.index++
 	}

-	return calls, content
+	p.sb.Reset()
+	return toolCalls, ""
 }

-// findTag searches the buffer to find and handle a tool calling tag
-// returning true if the tag was found and false otherwise, and
-// a string content signaling any content that should be sent back to the user
-func (p *Parser) findTag() (int, bool) {
-	// First check for complete substring anywhere in s
-	if i := bytes.Index(p.buffer, []byte(p.tag)); i > -1 {
-		return i, true
+// NewParser creates a new tool call parser from a template. It extracts the tool call format,
+// prefix, and field names from the template to use for parsing tool calls from model output.
+//
+// Returns an error if the template is nil or unparsed, or does not contain valid tool call formatting.
+func NewParser(templateToProcess *gotmpl.Template) (*Parser, error) {
+	// Root is reached through the embedded parse tree, so guard against a missing tree
+	if templateToProcess == nil || templateToProcess.Tree == nil {
+		return nil, errors.New("template is nil or has not been parsed")
+	}
+
+	parsed, err := template.Parse(templateToProcess.Root.String())
+	if err != nil {
+		return nil, fmt.Errorf("parsing template: %w", err)
 	}

-	// Then check for partial suffix overlap
-	max := min(len(p.buffer), len(p.tag))
-	for i := max; i > 0; i-- {
-		if bytes.HasSuffix(p.buffer, []byte(p.tag[:i])) {
-			return len(p.buffer) - i, false
-		}
+	// Check the error before touching tt: toolTemplate returns a nil template on failure
+	tt, err := toolTemplate(parsed)
+	if err != nil {
+		return nil, fmt.Errorf("locating tool call template: %w", err)
 	}
-	return -1, false
+
+	tp := toolPrefix(templateToProcess)
+
+	name, arguments, err := extractToolArgs(tt)
+	if err != nil {
+		return nil, fmt.Errorf("extracting tool call fields: %w", err)
+	}
+
+	// Greedy JSON parsing starts enabled; Add turns it off after a failed parse when a prefix exists
+	return &Parser{
+		tmpl:            *tt,
+		prefix:          tp,
+		greedyParseJSON: true,
+		name:            name,
+		arguments:       arguments,
+	}, nil
 }

-// parseToolCall finds the next complete tool call in the buffer
-// incrementing n and advancing the buffer.
-func (p *Parser) parseToolCall() *api.ToolCall {
-	var name string
-	var args map[string]any
-	var end int = len(p.buffer)
-
-	// find tool name
-	var i int
-	for _, n := range p.names {
-		if i = bytes.Index(p.buffer, []byte(n)); i != -1 {
-			if i+len(n) < end {
-				name = n
-				end = i + len(n)
-			}
-		}
-	}
-
-	if name == "" {
-		return nil
-	}
-
-	if args, i = p.findArguments(); args == nil {
-		return nil
-	}
-
-	if i > end {
-		end = i
-	}
-
-	tc := &api.ToolCall{
-		Function: api.ToolCallFunction{
-			Name:      name,
-			Arguments: args,
-			Index:     p.n,
-		},
-	}
-
-	p.n++
-	p.buffer = p.buffer[end:]
-	return tc
-}
-
-// findArguments returns the first object that appears to be
-// arguments and the position where the arguments end, returning nil and 0 if
-// an invalid JSON object or non-arguments object is found first
-func (p *Parser) findArguments() (map[string]any, int) {
-	if len(p.buffer) == 0 {
-		return nil, 0
-	}
-
-	var braces int
-	var start int = -1
-	var end int
-	var object []byte
-
-	// find any outer json object
-	for i, c := range p.buffer {
-		if c == '{' {
-			braces++
-			if start == -1 {
-				start = i
-			}
-		}
-
-		if c == '}' {
-			braces--
-			if braces == 0 && start != -1 {
-				end = i + 1
-				object = p.buffer[start:end]
-				break
-			}
-		}
-	}
-
-	if braces > 0 {
-		return nil, 0
-	}
-
-	var data map[string]any
-
-	// not valid json
-	if err := json.Unmarshal(object, &data); err != nil {
-		return nil, 0
-	}
-
-	var find func(obj any) map[string]any
-	find = func(obj any) map[string]any {
-		switch v := obj.(type) {
-		case map[string]any:
-			// check if the object keys are valid tool properties
-			// TODO (jmorganca): check only sets of properties that
-			// go together instead of the entire set
-			for _, prop := range p.properties {
-				if _, exists := v[prop]; exists {
-					return v
-				}
-			}
-
-			for _, value := range v {
-				if result := find(value); result != nil {
-					return result
-				}
-			}
-		case []any:
-			for _, item := range v {
-				if result := find(item); result != nil {
-					return result
-				}
-			}
-		}
-
-		return nil
-	}
-
-	result := find(data)
-	if result != nil {
-		return result, end
-	}
-
-	return nil, 0
-}
-
-// done checks if the parser is done parsing by looking
-// for closing tag. currently only } and ] are supported
-// for closing tags as {} or [] pairs may not always
-// represent tool calls and we need to send the content back
-func (p *Parser) done() bool {
-	var open, close rune
-	switch p.tag {
-	case "{":
-		open, close = '{', '}'
-	case "[":
-		open, close = '[', ']'
-	default:
-		return false
-	}
-
-	var count int
-	for _, c := range p.buffer {
-		if c == byte(open) {
-			count++
-		} else if c == byte(close) {
-			count--
-			if count == 0 {
-				return true
-			}
-		}
-	}
-
-	return false
-}
-
-// Content returns any remaining content that
-// should be sent to the user. This should be the empty string
-// string unless the tag is { or [ and a tool call was not found
-func (p *Parser) Content() string {
-	if p.n > 0 {
-		return ""
-	}
-
-	if p.tag == "{" || p.tag == "[" {
-		return string(p.buffer)
-	}
-
-	return ""
+// NewParser implements the ToolParser interface by delegating to the
+// package-level NewParser.
+func (p *Parser) NewParser(templateToProcess *gotmpl.Template) (ToolParser, error) {
+	parser, err := NewParser(templateToProcess)
+	if err != nil {
+		// Return a literal nil: a nil *Parser stored in the interface would compare non-nil
+		return nil, err
+	}
+	return parser, nil
 }
--- a/tools/tools_test.go
+++ b/tools/tools_test.go
--- a/tools/tools_utils.go
+++ b/tools/tools_utils.go
@@ -0,0 +1,229 @@
+package tools
+
+import (
+	"bytes"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"log/slog"
+	"slices"
+	"strings"
+	gotmpl "text/template"
+	"text/template/parse"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/template"
+)
+
+// extractToolCallsFormat walks a template's parse tree in document order and, at the first
+// if-statement whose condition references ".ToolCalls", gathers the text nodes that open
+// that statement's body. The result is used to identify tool call prefixes and formatting.
+//
+// Returns:
+//   - string: The leading text of the first ".ToolCalls" if-body (empty if it does not start with text)
+//   - bool: Whether a ".ToolCalls" condition was found in the template
+func extractToolCallsFormat(tmpl *gotmpl.Template) (string, bool) {
+	if tmpl == nil || tmpl.Tree == nil {
+		slog.Debug("template or tree is nil")
+		return "", false
+	}
+
+	var (
+		text    string
+		matched bool
+		visit   func(nodes []parse.Node)
+	)
+
+	// descend visits a body and then, when present, its else branch
+	descend := func(list, elseList *parse.ListNode) {
+		visit(list.Nodes)
+		if elseList != nil {
+			visit(elseList.Nodes)
+		}
+	}
+
+	visit = func(nodes []parse.Node) {
+		for _, node := range nodes {
+			if matched {
+				return
+			}
+
+			switch n := node.(type) {
+			case *parse.ListNode:
+				visit(n.Nodes)
+			case *parse.RangeNode:
+				descend(n.List, n.ElseList)
+			case *parse.WithNode:
+				descend(n.List, n.ElseList)
+			case *parse.IfNode:
+				if !isToolCallsNode(n) {
+					descend(n.List, n.ElseList)
+					continue
+				}
+
+				// Gather the run of text nodes that opens the body, stopping at the first other node
+				var leading strings.Builder
+				for _, child := range n.List.Nodes {
+					tn, isText := child.(*parse.TextNode)
+					if !isText {
+						break
+					}
+					leading.Write(tn.Text)
+				}
+				text, matched = leading.String(), true
+				return
+			}
+		}
+	}
+
+	visit(tmpl.Tree.Root.Nodes)
+	return text, matched
+}
+
+// isToolCallsNode reports whether any direct field argument in the
+// if-statement's condition has "ToolCalls" among its identifiers.
+func isToolCallsNode(n *parse.IfNode) bool {
+	return slices.ContainsFunc(n.Pipe.Cmds, func(cmd *parse.CommandNode) bool {
+		return slices.ContainsFunc(cmd.Args, func(arg parse.Node) bool {
+			field, ok := arg.(*parse.FieldNode)
+			return ok && slices.Contains(field.Ident, "ToolCalls")
+		})
+	})
+}
+
+// toolPrefix returns the text that opens the template's first ".ToolCalls"
+// if-body, normalized for matching: surrounding whitespace is trimmed,
+// carriage returns are dropped, and newlines become single spaces.
+// It returns "" when the template has no ".ToolCalls" condition.
+func toolPrefix(tmpl *gotmpl.Template) string {
+	tokenText, ok := extractToolCallsFormat(tmpl)
+	// Log at debug level instead of printing to stdout; %q keeps whitespace
+	// and newlines in the raw template text visible.
+	slog.Debug("tool call prefix text", "text", fmt.Sprintf("%q", tokenText), "found", ok)
+	if !ok {
+		return ""
+	}
+	tokenText = strings.TrimSpace(tokenText)
+	tokenText = strings.ReplaceAll(tokenText, "\r", "")
+	tokenText = strings.ReplaceAll(tokenText, "\n", " ")
+
+	return tokenText
+}
+
+// toolTemplate extracts the subtree rooted at the range node that iterates over .ToolCalls
+//
+// Returns:
+//   - *gotmpl.Template: The subtree containing the .ToolCalls range
+//   - error: Error if no such range node was found
+func toolTemplate(t *template.Template) (*gotmpl.Template, error) {
+	// rangesOverToolCalls matches range nodes whose pipeline mentions ToolCalls
+	rangesOverToolCalls := func(n parse.Node) bool {
+		rn, ok := n.(*parse.RangeNode)
+		return ok && slices.Contains(template.Identifiers(rn.Pipe), "ToolCalls")
+	}
+
+	if sub := t.Subtree(rangesOverToolCalls); sub != nil {
+		return sub, nil
+	}
+
+	return nil, errors.New("failed to find tool template")
+}
+
+// suffixOverlap finds the longest leading portion of prefix that s ends with
+//
+// Returns:
+//   - int: The index in s where that overlap begins, or -1 if s does not end with any leading portion of prefix
+func suffixOverlap(s, prefix string) int {
+	// Try the longest candidate first so the earliest possible start index wins
+	for n := min(len(s), len(prefix)); n > 0; n-- {
+		if start := len(s) - n; s[start:] == prefix[:n] {
+			return start
+		}
+	}
+	return -1
+}
+
+// extractToolArgs executes a template with a known tool call format to extract the name and arguments
+//
+// The template is rendered with one tool call whose name and argument key are
+// marker values; the rendered JSON is then inspected to learn which keys the
+// template uses for the tool name and for the arguments object.
+//
+// Returns:
+//   - string: The key that holds the tool call name
+//   - string: The key that holds the tool call arguments
+//   - error: Error if the template fails to execute, does not render JSON, or lacks either field
+func extractToolArgs(tmpl *gotmpl.Template) (name, arguments string, err error) {
+	const (
+		nameMarker     = "@@name@@"
+		argumentMarker = "@@argument@@"
+	)
+
+	var b bytes.Buffer
+	if err := tmpl.Execute(&b, map[string][]api.ToolCall{
+		"ToolCalls": {
+			{
+				Function: api.ToolCallFunction{
+					Name: nameMarker,
+					Arguments: api.ToolCallFunctionArguments{
+						argumentMarker: 1,
+					},
+				},
+			},
+		},
+	}); err != nil {
+		return "", "", err
+	}
+
+	var obj any
+	if err := json.Unmarshal(b.Bytes(), &obj); err != nil {
+		return "", "", err
+	}
+
+	// Decoding into an any yields map[string]any for objects and []any for
+	// arrays, so those are the only container shapes that need handling.
+	var objs []map[string]any
+	switch v := obj.(type) {
+	case map[string]any:
+		objs = []map[string]any{v}
+	case []any:
+		objs = collect(v)
+	}
+	if len(objs) == 0 {
+		return "", "", errors.New("no template objects found")
+	}
+
+	// find the keys that correspond to the name and arguments fields.
+	// Map iteration order is random, so a field carrying the marker always
+	// wins; any other string/object field is only a fallback.
+	for k, v := range objs[0] {
+		switch v := v.(type) {
+		case string:
+			if v == nameMarker || name == "" {
+				name = k
+			}
+		case map[string]any:
+			if _, marked := v[argumentMarker]; marked || arguments == "" {
+				arguments = k
+			}
+		}
+	}
+
+	if name == "" || arguments == "" {
+		slog.Debug("missing required fields in tool call template", "name", name, "arguments", arguments)
+		return "", "", errors.New("missing required fields in tool call template")
+	}
+
+	return name, arguments, nil
+}
+
+// collect gathers every map found in obj, descending through maps and slices
+//
+// Returns:
+//   - []map[string]any: Each map encountered, parents before their nested maps; nil if there are none
+func collect(obj any) []map[string]any {
+	var found []map[string]any
+
+	var visit func(v any)
+	visit = func(v any) {
+		switch node := v.(type) {
+		case map[string]any:
+			// Record the map itself before looking inside its values
+			found = append(found, node)
+			for _, child := range node {
+				visit(child)
+			}
+		case []any:
+			for _, child := range node {
+				visit(child)
+			}
+		}
+	}
+
+	visit(obj)
+	return found
+}
--- a/tools/tools_utils_test.go
+++ b/tools/tools_utils_test.go
@@ -0,0 +1,464 @@
+package tools
+
+import (
+	"testing"
+	gotmpl "text/template"
+
+	"github.com/ollama/ollama/template"
+)
+
+// TestExtractToolCallsFormat runs extractToolCallsFormat over a table of
+// templates and checks both the returned text and the found flag.
+func TestExtractToolCallsFormat(t *testing.T) {
+	type testCase struct {
+		name      string
+		template  string
+		wantText  string
+		wantFound bool
+	}
+
+	tests := []testCase{
+		{name: "nil template", template: "", wantText: "", wantFound: false},
+		{name: "basic tool call with text", template: "{{if .ToolCalls}}Hello world{{end}}", wantText: "Hello world", wantFound: true},
+		{name: "tool call with json format", template: "{{if .ToolCalls}}```json\n{{end}}", wantText: "```json\n", wantFound: true},
+		{name: "tool call in range", template: "{{range .ToolCalls}}tool: {{.}}{{end}}", wantText: "", wantFound: false},
+		{name: "tool call with multiple text nodes", template: "{{if .ToolCalls}}First text{{if .Something}}inner{{end}}Second text{{end}}", wantText: "First text", wantFound: true},
+		{name: "nested if without tool calls", template: "{{if .Something}}{{if .OtherThing}}text{{end}}{{end}}", wantText: "", wantFound: false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			parsed, parseErr := gotmpl.New("test").Parse(tt.template)
+			// A parse error is only treated as fatal for non-empty templates.
+			if tt.template != "" && parseErr != nil {
+				t.Fatalf("failed to parse template: %v", parseErr)
+			}
+
+			text, found := extractToolCallsFormat(parsed)
+			if text != tt.wantText {
+				t.Errorf("got text %q, want %q", text, tt.wantText)
+			}
+			if found != tt.wantFound {
+				t.Errorf("got found %v, want %v", found, tt.wantFound)
+			}
+		})
+	}
+}
+
+// TestToolPrefix checks the prefix string toolPrefix derives from templates
+// that guard their tool call output with {{if .ToolCalls}}.
+func TestToolPrefix(t *testing.T) {
+	cases := []struct {
+		name     string
+		template string
+		want     string
+	}{
+		{name: "basic tool call with action prefix", template: "{{if .ToolCalls}}Action: ```json{{end}}", want: "Action: ```json"},
+		{name: "incomplete functools bracket", template: "{{if .ToolCalls}}functools[{{end}}", want: "functools["},
+		{name: "tool call with angle brackets", template: "{{if .ToolCalls}}Hello, world! <tool_call>{{end}}", want: "Hello, world! <tool_call>"},
+		{name: "multiple tool call formats", template: "{{if .ToolCalls}}[tool_call] <tool_call>{{end}}", want: "[tool_call] <tool_call>"},
+		{name: "single angle bracket tool call", template: "{{if .ToolCalls}}<tool_call>{{end}}", want: "<tool_call>"},
+		{name: "incomplete angle bracket after tool call", template: "{{if .ToolCalls}}[tool_call] <{{end}}", want: "[tool_call] <"},
+		{name: "angle bracket prefix with tool call", template: "{{if .ToolCalls}}> <tool_call>{{end}}", want: "> <tool_call>"},
+		{name: "uppercase tool call with incomplete bracket", template: "{{if .ToolCalls}}[TOOL_CALL] [{{end}}", want: "[TOOL_CALL] ["},
+		{name: "uppercase tool call with adjacent bracket", template: "{{if .ToolCalls}}[TOOL_CALL][{{end}}", want: "[TOOL_CALL]["},
+		{name: "tool call with pipe delimiters", template: "{{if .ToolCalls}}<|tool_call|>{{end}}", want: "<|tool_call|>"},
+		{name: "tool with no prefix", template: "{{if .ToolCalls}}{{end}}", want: ""},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			tmpl, err := gotmpl.New("test").Parse(tt.template)
+			if err != nil {
+				t.Fatalf("failed to parse template: %v", err)
+			}
+
+			// Report failures under the name of the function actually under test.
+			if got := toolPrefix(tmpl); got != tt.want {
+				t.Errorf("toolPrefix(%q) = %q; want %q", tt.template, got, tt.want)
+			}
+		})
+	}
+}
+
+// TestToolTemplate checks whether toolTemplate succeeds for a given template.
+//
+// want is true when toolTemplate is expected to return a nil error and false
+// when it is expected to fail. Both directions are asserted, so a template
+// that unexpectedly yields a tool template is reported as well.
+func TestToolTemplate(t *testing.T) {
+	cases := []struct {
+		name     string
+		template string
+		want     bool
+	}{
+		{name: "basic tool call range", template: "{{range .ToolCalls}}test{{end}}", want: true},
+		{name: "no tool calls", template: "{{range .Other}}test{{end}}", want: false},
+		{name: "nested tool calls", template: "{{range .Outer}}{{range .ToolCalls}}test{{end}}{{end}}", want: true},
+		{name: "empty template", template: "", want: false},
+		{name: "tool calls in if statement", template: "{{if .ToolCalls}}test{{end}}", want: false},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			tmpl, err := gotmpl.New("test").Parse(tt.template)
+			if err != nil {
+				t.Fatalf("failed to parse template: %v", err)
+			}
+
+			// Round-trip through the project's template package, which is the
+			// type toolTemplate accepts.
+			parsed, err := template.Parse(tmpl.Root.String())
+			if err != nil {
+				t.Fatalf("failed to parse template: %v", err)
+			}
+
+			_, err = toolTemplate(parsed)
+			if found := err == nil; found != tt.want {
+				t.Errorf("toolTemplate() found = %v (err = %v); want %v", found, err, tt.want)
+			}
+		})
+	}
+}
+
+// TestSuffixOverlap feeds suffixOverlap pairs of (string, delimiter) and
+// compares the returned index against the expected one; -1 is the value the
+// table expects when there is no overlap.
+func TestSuffixOverlap(t *testing.T) {
+	type testCase struct {
+		name      string
+		input     string
+		delimiter string
+		wantIndex int
+	}
+
+	tests := []testCase{
+		{name: "no overlap", input: "hello world", delimiter: "<tool_call>", wantIndex: -1},
+		{name: "full overlap", input: "<tool_call>", delimiter: "<tool_call>", wantIndex: 0},
+		{name: "partial overlap", input: "text <tool_call>", delimiter: "<tool_call>", wantIndex: 5},
+		{name: "delimiter longer than string", input: "<tool>", delimiter: "<tool_call>", wantIndex: -1},
+		{name: "empty string", input: "", delimiter: "<tool_call>", wantIndex: -1},
+		{name: "empty delimiter", input: "<tool_call>", delimiter: "", wantIndex: -1},
+		{name: "single char overlap", input: "test<", delimiter: "<tool_call>", wantIndex: 4},
+		{name: "partial tool call", input: "hello <tool_", delimiter: "<tool_call>", wantIndex: 6},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			if idx := suffixOverlap(tc.input, tc.delimiter); idx != tc.wantIndex {
+				t.Errorf("suffixOverlap(%q, %q) = %d; want %d", tc.input, tc.delimiter, idx, tc.wantIndex)
+			}
+		})
+	}
+}
+
+// TestExtractToolArgs, despite its name, exercises extractToolCallsFormat: it
+// checks the text returned for templates guarded by {{if .ToolCalls}} and
+// whether such a section was found at all.
+func TestExtractToolArgs(t *testing.T) {
+	cases := []struct {
+		name     string
+		template string
+		want     string
+		ok       bool
+	}{
+		{name: "basic tool call with text after", template: `{{if .ToolCalls}}tool response{{end}}`, want: "tool response", ok: true},
+		{name: "tool call with mixed content after", template: `{{if .ToolCalls}}<tool_call>{{.Something}}{{end}}`, want: "<tool_call>", ok: true},
+		{name: "tool call with no text after", template: `{{if .ToolCalls}}{{.Something}}{{end}}`, want: "", ok: true},
+		{name: "nested tool call", template: `{{if .Something}}{{if .ToolCalls}}[TOOL_CALL]{{end}}{{end}}`, want: "[TOOL_CALL]", ok: true},
+		{name: "no tool calls", template: `{{if .Something}}no tools here{{end}}`, want: "", ok: false},
+		{name: "empty template", template: ``, want: "", ok: false},
+		{name: "multiple tool calls sections", template: `{{if .ToolCalls}}first{{end}}{{if .ToolCalls}}second{{end}}`, want: "first", ok: true},
+		{name: "range over tool calls", template: `{{if .ToolCalls}}{{range .ToolCalls}}tool{{end}}{{end}}`, want: "", ok: true},
+		{name: "tool calls with pipe delimiters", template: `{{if .ToolCalls}}<|tool|>{{end}}`, want: "<|tool|>", ok: true},
+		{name: "tool calls with nested template", template: `{{if .ToolCalls}}{{template "tool" .}}{{end}}`, want: "", ok: true},
+		{name: "tool calls with whitespace variations", template: `{{if .ToolCalls}}  tool  {{end}}`, want: "  tool  ", ok: true},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			tmpl, err := gotmpl.New("test").Parse(tt.template)
+			if err != nil {
+				t.Fatalf("failed to parse template: %v", err)
+			}
+
+			// Failure messages name the function actually being called.
+			got, ok := extractToolCallsFormat(tmpl)
+			if got != tt.want {
+				t.Errorf("extractToolCallsFormat() got = %q, want %q", got, tt.want)
+			}
+			if ok != tt.ok {
+				t.Errorf("extractToolCallsFormat() ok = %v, want %v", ok, tt.ok)
+			}
+		})
+	}
+}
+
+// TestCollect verifies that collect returns the expected maps, in the expected
+// order, for plain, nested and slice-wrapped inputs, and nil for a non-map
+// value.
+func TestCollect(t *testing.T) {
+	type testCase struct {
+		name     string
+		input    any
+		wantMaps []map[string]any
+	}
+
+	tests := []testCase{
+		{
+			name:     "simple map",
+			input:    map[string]any{"key": "value"},
+			wantMaps: []map[string]any{{"key": "value"}},
+		},
+		{
+			name:  "nested map",
+			input: map[string]any{"outer": map[string]any{"inner": "value"}},
+			wantMaps: []map[string]any{
+				{"outer": map[string]any{"inner": "value"}},
+				{"inner": "value"},
+			},
+		},
+		{
+			name: "array of maps",
+			input: []any{
+				map[string]any{"key1": "val1"},
+				map[string]any{"key2": "val2"},
+			},
+			wantMaps: []map[string]any{
+				{"key1": "val1"},
+				{"key2": "val2"},
+			},
+		},
+		{
+			name:  "deeply nested",
+			input: map[string]any{"l1": map[string]any{"l2": map[string]any{"l3": "value"}}},
+			wantMaps: []map[string]any{
+				{"l1": map[string]any{"l2": map[string]any{"l3": "value"}}},
+				{"l2": map[string]any{"l3": "value"}},
+				{"l3": "value"},
+			},
+		},
+		{
+			name:     "non-map value",
+			input:    "string",
+			wantMaps: nil,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			result := collect(tc.input)
+
+			// A length mismatch makes the element-wise comparison meaningless,
+			// so stop this subtest right away.
+			if len(result) != len(tc.wantMaps) {
+				t.Fatalf("collect() got %d maps, want %d", len(result), len(tc.wantMaps))
+			}
+
+			for i, want := range tc.wantMaps {
+				if mapsEqual(result[i], want) {
+					continue
+				}
+				t.Errorf("collect() map[%d] = %v, want %v", i, result[i], want)
+			}
+		})
+	}
+}
+
+// mapsEqual compares two maps for deep equality
+//
+// Two maps are equal when they have the same number of entries and every key
+// of m1 is present in m2 with an equal value (see valuesEqual). A nil map and
+// an empty map compare equal.
+func mapsEqual(m1, m2 map[string]any) bool {
+	if len(m1) != len(m2) {
+		return false
+	}
+	for k, v1 := range m1 {
+		v2, ok := m2[k]
+		if !ok || !valuesEqual(v1, v2) {
+			return false
+		}
+	}
+	return true
+}
+
+// valuesEqual compares two values of the kinds produced by decoding JSON into
+// any: nested maps and slices are compared recursively, everything else with ==.
+//
+// Slices need their own branch because comparing two interface values that
+// both hold a []any with == panics at runtime. Other uncomparable dynamic
+// types (e.g. []string) are not expected here and would still panic.
+func valuesEqual(a, b any) bool {
+	switch av := a.(type) {
+	case map[string]any:
+		bv, ok := b.(map[string]any)
+		return ok && mapsEqual(av, bv)
+	case []any:
+		bv, ok := b.([]any)
+		if !ok || len(av) != len(bv) {
+			return false
+		}
+		for i := range av {
+			if !valuesEqual(av[i], bv[i]) {
+				return false
+			}
+		}
+		return true
+	default:
+		return a == b
+	}
+}