add docs.json

2025-08-17 13:12:39 -07:00
27 changed files with 227 additions and 1340 deletions
--- a/README.md
+++ b/README.md
@@ -411,8 +411,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
 - [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
 - [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
- [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)

 ### Cloud

@@ -539,8 +537,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
 - [Ollama for D](https://github.com/kassane/ollama-d)
 - [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)
- [any-llm](https://github.com/mozilla-ai/any-llm) (A single interface to use different llm providers by [mozilla.ai](https://www.mozilla.ai/))
- [any-agent](https://github.com/mozilla-ai/any-agent) (A single interface to use and evaluate different agent frameworks by [mozilla.ai](https://www.mozilla.ai/))

 ### Mobile

--- a/api/types.go
+++ b/api/types.go
@@ -90,10 +90,6 @@ type GenerateRequest struct {
 	// (request that thinking _not_ be used) and unset (use the old behavior
 	// before this option was introduced)
 	Think *ThinkValue `json:"think,omitempty"`
-
-	// DebugRenderOnly is a debug option that, when set to true, returns the rendered
-	// template instead of calling the model.
-	DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
 }

 // ChatRequest describes a request sent by [Client.Chat].
@@ -124,10 +120,6 @@ type ChatRequest struct {
 	// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
 	// for supported models.
 	Think *ThinkValue `json:"think,omitempty"`
-
-	// DebugRenderOnly is a debug option that, when set to true, returns the rendered
-	// template instead of calling the model.
-	DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
 }

 type Tools []Tool
@@ -286,23 +278,16 @@ func mapToTypeScriptType(jsonType string) string {
 	}
 }

-type ToolFunctionParameters struct {
-	Type       string                  `json:"type"`
-	Defs       any                     `json:"$defs,omitempty"`
-	Items      any                     `json:"items,omitempty"`
-	Required   []string                `json:"required"`
-	Properties map[string]ToolProperty `json:"properties"`
-}
-
-func (t *ToolFunctionParameters) String() string {
-	bts, _ := json.Marshal(t)
-	return string(bts)
-}
-
 type ToolFunction struct {
-	Name        string                 `json:"name"`
-	Description string                 `json:"description"`
-	Parameters  ToolFunctionParameters `json:"parameters"`
+	Name        string `json:"name"`
+	Description string `json:"description"`
+	Parameters  struct {
+		Type       string                  `json:"type"`
+		Defs       any                     `json:"$defs,omitempty"`
+		Items      any                     `json:"items,omitempty"`
+		Required   []string                `json:"required"`
+		Properties map[string]ToolProperty `json:"properties"`
+	} `json:"parameters"`
 }

 func (t *ToolFunction) String() string {
@@ -323,19 +308,6 @@ type ChatResponse struct {
 	Metrics
 }

-// DebugInfo contains debug information for template rendering
-type DebugInfo struct {
-	RenderedTemplate string `json:"rendered_template"`
-	ImageCount       int    `json:"image_count,omitempty"`
-}
-
-// DebugTemplateResponse is returned when _debug_render_only is set to true
-type DebugTemplateResponse struct {
-	Model     string    `json:"model"`
-	CreatedAt time.Time `json:"created_at"`
-	DebugInfo DebugInfo `json:"_debug_info"`
-}
-
 type Metrics struct {
 	TotalDuration      time.Duration `json:"total_duration,omitempty"`
 	LoadDuration       time.Duration `json:"load_duration,omitempty"`
--- a/api/types_test.go
+++ b/api/types_test.go
@@ -436,50 +436,3 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
 		})
 	}
 }
-
-func TestToolFunctionParameters_String(t *testing.T) {
-	tests := []struct {
-		name     string
-		params   ToolFunctionParameters
-		expected string
-	}{
-		{
-			name: "simple object with string property",
-			params: ToolFunctionParameters{
-				Type:     "object",
-				Required: []string{"name"},
-				Properties: map[string]ToolProperty{
-					"name": {
-						Type:        PropertyType{"string"},
-						Description: "The name of the person",
-					},
-				},
-			},
-			expected: `{"type":"object","required":["name"],"properties":{"name":{"type":"string","description":"The name of the person"}}}`,
-		},
-		{
-			name: "marshal failure returns empty string",
-			params: ToolFunctionParameters{
-				Type: "object",
-				Defs: func() any {
-					// Create a cycle that will cause json.Marshal to fail
-					type selfRef struct {
-						Self *selfRef
-					}
-					s := &selfRef{}
-					s.Self = s
-					return s
-				}(),
-				Properties: map[string]ToolProperty{},
-			},
-			expected: "",
-		},
-	}
-
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			result := test.params.String()
-			assert.Equal(t, test.expected, result)
-		})
-	}
-}
--- a/convert/convert_gptoss.go
+++ b/convert/convert_gptoss.go
@@ -15,24 +15,19 @@ import (

 type gptossModel struct {
 	ModelParameters
-	HiddenLayers          uint32  `json:"num_hidden_layers"`
-	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
-	HiddenSize            uint32  `json:"hidden_size"`
-	IntermediateSize      uint32  `json:"intermediate_size"`
-	AttentionHeads        uint32  `json:"num_attention_heads"`
-	KeyValueHeads         uint32  `json:"num_key_value_heads"`
-	HeadDim               uint32  `json:"head_dim"`
-	Experts               uint32  `json:"num_experts"`
-	LocalExperts          uint32  `json:"num_local_experts"`
-	ExpertsPerToken       uint32  `json:"experts_per_token"`
-	RMSNormEpsilon        float32 `json:"rms_norm_eps"`
-	InitialContextLength  uint32  `json:"initial_context_length"`
-	RopeTheta             float32 `json:"rope_theta"`
-	RopeScalingFactor     float32 `json:"rope_scaling_factor"`
-	RopeScaling           struct {
-		Factor float32 `json:"factor"`
-	} `json:"rope_scaling"`
-	SlidingWindow uint32 `json:"sliding_window"`
+	HiddenLayers         uint32  `json:"num_hidden_layers"`
+	HiddenSize           uint32  `json:"hidden_size"`
+	IntermediateSize     uint32  `json:"intermediate_size"`
+	AttentionHeads       uint32  `json:"num_attention_heads"`
+	KeyValueHeads        uint32  `json:"num_key_value_heads"`
+	HeadDim              uint32  `json:"head_dim"`
+	Experts              uint32  `json:"num_experts"`
+	ExpertsPerToken      uint32  `json:"experts_per_token"`
+	RMSNormEpsilon       float32 `json:"rms_norm_eps"`
+	InitialContextLength uint32  `json:"initial_context_length"`
+	RopeTheta            float32 `json:"rope_theta"`
+	RopeScalingFactor    float32 `json:"rope_scaling_factor"`
+	SlidingWindow        uint32  `json:"sliding_window"`
 }

 var _ ModelConverter = (*gptossModel)(nil)
@@ -41,11 +36,11 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
 	kv := m.ModelParameters.KV(t)
 	kv["general.architecture"] = "gptoss"
 	kv["general.file_type"] = uint32(4)
-	kv["gptoss.context_length"] = cmp.Or(m.MaxPositionEmbeddings, uint32(m.RopeScalingFactor*float32(m.InitialContextLength)))
+	kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
 	kv["gptoss.block_count"] = m.HiddenLayers
 	kv["gptoss.embedding_length"] = m.HiddenSize
 	kv["gptoss.feed_forward_length"] = m.IntermediateSize
-	kv["gptoss.expert_count"] = cmp.Or(m.Experts, m.LocalExperts)
+	kv["gptoss.expert_count"] = m.Experts
 	kv["gptoss.expert_used_count"] = m.ExpertsPerToken
 	kv["gptoss.attention.head_count"] = m.AttentionHeads
 	kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
@@ -54,7 +49,7 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
 	kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5)
 	kv["gptoss.attention.sliding_window"] = m.SlidingWindow
 	kv["gptoss.rope.freq_base"] = m.RopeTheta
-	kv["gptoss.rope.scaling.factor"] = cmp.Or(m.RopeScalingFactor, m.RopeScaling.Factor)
+	kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor
 	kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
 	kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
 	kv["tokenizer.ggml.add_bos_token"] = false
@@ -97,11 +92,6 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {

 	for name, mxfp4 := range mxfp4s {
 		dims := mxfp4.blocks.Shape()
-
-		if !strings.HasSuffix(name, ".weight") {
-			name += ".weight"
-		}
-
 		out = append(out, &ggml.Tensor{
 			Name:     name,
 			Kind:     uint32(ggml.TensorTypeMXFP4),
@@ -114,47 +104,25 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
 }

 func (m *gptossModel) Replacements() []string {
-	var replacements []string
-	if m.MaxPositionEmbeddings > 0 {
-		// hf flavored model
-		replacements = []string{
-			"lm_head", "output",
-			"model.embed_tokens", "token_embd",
-			"model.layers", "blk",
-			"input_layernorm", "attn_norm",
-			"self_attn.q_proj", "attn_q",
-			"self_attn.k_proj", "attn_k",
-			"self_attn.v_proj", "attn_v",
-			"self_attn.o_proj", "attn_out",
-			"self_attn.sinks", "attn_sinks",
-			"post_attention_layernorm", "ffn_norm",
-			"mlp.router", "ffn_gate_inp",
-			"mlp.experts.gate_up_proj_", "ffn_gate_up_exps.",
-			"mlp.experts.down_proj_", "ffn_down_exps.",
-			"model.norm", "output_norm",
-		}
-	} else {
-		replacements = []string{
-			// noop replacements so other replacements will not be applied
-			".blocks", ".blocks",
-			".scales", ".scales",
-			// real replacements
-			"block", "blk",
-			"attn.norm", "attn_norm",
-			"attn.qkv", "attn_qkv",
-			"attn.sinks", "attn_sinks",
-			"attn.out", "attn_out",
-			"mlp.norm", "ffn_norm",
-			"mlp.gate", "ffn_gate_inp",
-			"mlp.mlp1_", "ffn_gate_up_exps.",
-			"mlp.mlp2_", "ffn_down_exps.",
-			"embedding", "token_embd",
-			"norm", "output_norm",
-			"unembedding", "output",
-			"scale", "weight",
-		}
+	return []string{
+		// noop replacements so other replacements will not be applied
+		".blocks", ".blocks",
+		".scales", ".scales",
+		// real replacements
+		"block", "blk",
+		"attn.norm", "attn_norm",
+		"attn.qkv", "attn_qkv",
+		"attn.sinks", "attn_sinks",
+		"attn.out", "attn_out",
+		"mlp.norm", "ffn_norm",
+		"mlp.gate", "ffn_gate_inp",
+		"mlp.mlp1_", "ffn_gate_up_exps.",
+		"mlp.mlp2_", "ffn_down_exps.",
+		"embedding", "token_embd",
+		"norm", "output_norm",
+		"unembedding", "output",
+		"scale", "weight",
 	}
-	return replacements
 }

 type mxfp4 struct {
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -0,0 +1,75 @@
+{
+  "$schema": "https://mintlify.com/docs.json",
+  "theme": "mint",
+  "background": {
+    "color": {
+      "light": "#ffffff",
+      "dark": "#000000"
+    }
+  },
+  "appearance": {
+    "default": "light"
+  },
+  "styling": {
+    "codeblocks": "system"
+  },
+  "contextual": {
+    "options": ["copy", "chatgpt", "claude", "view"]
+  },
+  "fonts": {
+    "heading": {
+      "family": "Inter"
+    },
+    "body": {
+      "family": "Inter"
+    }
+  },
+  "name": "Ollama",
+  "colors": {
+    "primary": "#000",
+    "light": "#b5b5b5",
+    "dark": "#fff"
+  },
+  "favicon": "/ollama.png",
+  "logo": {
+    "light": "/ollama.png",
+    "dark": "/favicon.svg"
+  },
+  "navigation": {
+    "tabs": [
+      {
+        "tab": "Documentation",
+        "groups": [
+          {
+            "group": "Home",
+            "pages": ["index", "quickstart", "faq", "troubleshooting"]
+          },
+          {
+            "group": "Platforms",
+            "pages": ["linux", "windows", "docker"]
+          },
+          {
+            "group": "Features",
+            "pages": [
+              "modelfile",
+              "apis",
+              "openai",
+              "import",
+              "gpu",
+              "benchmark"
+            ]
+          }
+        ]
+      },
+      {
+        "tab": "Development",
+        "groups": [
+          {
+            "group": " ",
+            "pages": ["development", "examples", "template"]
+          }
+        ]
+      }
+    ]
+  }
+}
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -378,7 +378,9 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 	maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)

 	if c.config.MaskDType != ml.DTypeF32 {
-		maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
+		out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
+		ctx.Forward(maskTensor.Copy(ctx, out))
+		maskTensor = out
 	}

 	return maskTensor
--- a/llama/llama.cpp/src/llama-context.cpp
+++ b/llama/llama.cpp/src/llama-context.cpp
@@ -962,7 +962,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
    const int64_t n_vocab = vocab.n_tokens();
    const int64_t n_embd  = hparams.n_embd;

-    const bool output_all = false;
+    // when computing embeddings, all tokens are output
+    const bool output_all = cparams.embeddings;

    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
--- a/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch
+++ b/llama/patches/0019-Enable-CUDA-Graphs-for-gemma3n.patch
@@ -13,7 +13,7 @@ checks.
 1 file changed, 18 insertions(+)

 diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 57eae461..c7f9dc3a 100644
+index 57eae461..9db0c8b5 100644
 --- a/ggml/src/ggml-cuda/ggml-cuda.cu
 +++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2671,12 +2671,24 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
--- a/llama/patches/0023-decode-disable-output_all.patch
+++ b/llama/patches/0023-decode-disable-output_all.patch
@@ -1,23 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Michael Yang <git@mxy.ng>
-Date: Mon, 18 Aug 2025 16:58:39 -0700
-Subject: [PATCH] decode: disable output_all
-
---
- src/llama-context.cpp | 3 +--
- 1 file changed, 1 insertion(+), 2 deletions(-)
-
-diff --git a/src/llama-context.cpp b/src/llama-context.cpp
-index 26a5cf9c..6ece5263 100644
--- a/src/llama-context.cpp
-+++ b/src/llama-context.cpp
-@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
-     const int64_t n_vocab = vocab.n_tokens();
-     const int64_t n_embd  = hparams.n_embd;
- 
-    // when computing embeddings, all tokens are output
-    const bool output_all = cparams.embeddings;
-+    const bool output_all = false;
- 
-     if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
-         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -30,7 +30,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
 			// Try to pack into as few GPUs as possible, starting from 1 GPU
 			for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
 				gpuSubset := sgl[:numGPUs]
-				ok, estimatedVRAM := predictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
+				ok, estimatedVRAM := PredictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)

 				if ok {
 					slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
@@ -48,7 +48,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
 			// - try subsets of GPUs instead of just falling back to 1 or all in a family

 			// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
-			if ok, estimatedVRAM := predictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
+			if ok, estimatedVRAM := PredictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
 				slog.Info("new model will fit in available VRAM, loading",
 					"model", modelPath,
 					"library", sgl[0].Library,
@@ -71,7 +71,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
 	var bestEstimate uint64
 	var bestFit int
 	for i, gl := range byLibrary {
-		_, estimatedVRAM := predictServerFit(gl, f, adapters, projectors, opts, numParallel)
+		_, estimatedVRAM := PredictServerFit(gl, f, adapters, projectors, opts, numParallel)
 		if estimatedVRAM > bestEstimate {
 			bestEstimate = estimatedVRAM
 			bestFit = i
@@ -81,7 +81,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
 }

 // This algorithm looks for a complete fit to determine if we need to unload other models
-func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
+func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
 	// Split up the GPUs by type and try them
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
@@ -97,10 +97,6 @@ func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, proj
 				return true, estimatedVRAM
 			}
 		}
-
-		if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory {
-			return true, estimatedVRAM
-		}
 	}
 	return false, estimatedVRAM
 }
--- a/llm/server.go
+++ b/llm/server.go
@@ -492,7 +492,6 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
 		if !requireFull {
 			g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
 		} else {
-			slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
 			return ErrLoadRequiredFull
 		}
 	}
@@ -525,6 +524,10 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
 		}
 	}

+	if requireFull && len(gpus) == 1 && gpus[0].Library == "cpu" && s.estimate.TotalSize > gpus[0].FreeMemory {
+		return ErrLoadRequiredFull
+	}
+
 	slog.Info("offload", "", s.estimate)

 	s.gpus = gpus
@@ -648,9 +651,7 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
 		if !success {
 			s.initModel(ctx, LoadRequest{}, LoadOperationClose)
 		}
-		if s.mem != nil {
-			s.mem.Log(slog.LevelInfo)
-		}
+		s.mem.Log(slog.LevelInfo)
 	}()

 	slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU)
--- a/ml/backend.go
+++ b/ml/backend.go
@@ -396,7 +396,6 @@ type Tensor interface {

 	Shape() []int
 	DType() DType
-	Cast(ctx Context, dtype DType) Tensor

 	Bytes() []byte
 	Floats() []float32
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -843,7 +843,23 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
 		panic("set Input or Layer before creating tensors")
 	}

-	cdtype := ggmlDType(dtype)
+	var cdtype uint32
+	switch dtype {
+	case ml.DTypeF32:
+		cdtype = C.GGML_TYPE_F32
+	case ml.DTypeF16:
+		cdtype = C.GGML_TYPE_F16
+	case ml.DTypeQ80:
+		cdtype = C.GGML_TYPE_Q8_0
+	case ml.DTypeQ40:
+		cdtype = C.GGML_TYPE_Q4_0
+	case ml.DTypeI32:
+		cdtype = C.GGML_TYPE_I32
+	case ml.DTypeMXFP4:
+		cdtype = C.GGML_TYPE_MXFP4
+	default:
+		panic("unsupported dtype")
+	}

 	if len(shape) < 1 || shape[0] == 0 {
 		var shape C.int64_t = 0
@@ -1040,32 +1056,6 @@ func (t *Tensor) DType() ml.DType {
 	}
 }

-func ggmlDType(dtype ml.DType) uint32 {
-	switch dtype {
-	case ml.DTypeF32:
-		return C.GGML_TYPE_F32
-	case ml.DTypeF16:
-		return C.GGML_TYPE_F16
-	case ml.DTypeQ80:
-		return C.GGML_TYPE_Q8_0
-	case ml.DTypeQ40:
-		return C.GGML_TYPE_Q4_0
-	case ml.DTypeI32:
-		return C.GGML_TYPE_I32
-	case ml.DTypeMXFP4:
-		return C.GGML_TYPE_MXFP4
-	default:
-		panic("unsupported dtype")
-	}
-}
-
-func (t *Tensor) Cast(ctx ml.Context, dtype ml.DType) ml.Tensor {
-	return &Tensor{
-		b: t.b,
-		t: C.ggml_cast(ctx.(*Context).ctx, t.t, ggmlDType(dtype)),
-	}
-}
-
 func (t *Tensor) Neg(ctx ml.Context) ml.Tensor {
 	return &Tensor{
 		b: t.b,
--- a/model/bytepairencoding.go
+++ b/model/bytepairencoding.go
@@ -109,7 +109,7 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
 					r = 0x0143
 				case r <= 0x0020:
 					r = r + 0x0100
-				case r >= 0x007f && r <= 0x00a0:
+				case r >= 0x007e && r <= 0x00a0:
 					r = r + 0x00a2
 				}

--- a/model/bytepairencoding_test.go
+++ b/model/bytepairencoding_test.go
@@ -207,36 +207,6 @@ func TestLlama(t *testing.T) {
 			}
 		}
 	})
-
-	t.Run("roundtriping 0x00-0xFF", func(t *testing.T) {
-		t.Parallel()
-
-		for b := 0x00; b <= 0xFF; b++ {
-			input := string(rune(b))
-			ids, err := tokenizer.Encode(input, false)
-			if err != nil {
-				t.Errorf("failed to encode rune 0x%02X: %v", b, err)
-				continue
-			}
-
-			decoded, err := tokenizer.Decode(ids)
-			if err != nil {
-				t.Errorf("failed to decode rune 0x%02X: %v", b, err)
-				continue
-			}
-
-			if b == 0x00 {
-				if len(decoded) != 0 {
-					t.Errorf("Decode(Encode(0x00)) should be empty, got %v", ids)
-				}
-				continue
-			}
-
-			if decoded != input {
-				t.Errorf("rune 0x%02X failed roundtrip: got %q, want %q", b, decoded, input)
-			}
-		}
-	})
 }

 func BenchmarkBytePairEncoding(b *testing.B) {
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -557,10 +557,12 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {

 	var think *api.ThinkValue
 	if r.Reasoning != nil {
+		options["reasoning"] = *r.Reasoning.Effort
 		think = &api.ThinkValue{
 			Value: *r.Reasoning.Effort,
 		}
 	} else if r.ReasoningEffort != nil {
+		options["reasoning"] = *r.ReasoningEffort
 		think = &api.ThinkValue{
 			Value: *r.ReasoningEffort,
 		}
--- a/runner/llamarunner/cache.go
+++ b/runner/llamarunner/cache.go
@@ -46,7 +46,7 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
 }

 // Locking: Operations on InputCacheSlot (including finding one
-// through LoadCacheSlot) require a lock to be held that serializes
+// through LoadCacheSlot) require a lock to be held that serializes
 // these operations with each other and llama.Decode

 type InputCacheSlot struct {
--- a/runner/ollamarunner/cache.go
+++ b/runner/ollamarunner/cache.go
@@ -78,7 +78,7 @@ func (c *InputCache) Close() {
 }

 // Locking: Operations on InputCacheSlot (including finding one
-// through LoadCacheSlot) require a lock to be held that serializes
+// through LoadCacheSlot) require a lock to be held that serializes
 // these operations with each other and processBatch

 type InputCacheSlot struct {
--- a/harmony/harmonyparser.go
+++ b/harmony/harmonyparser.go
@@ -1,9 +1,9 @@
-package harmony
+package server

 import (
 	"context"
-	"fmt"
 	"log/slog"
+	"slices"
 	"strings"
 	"unicode"

@@ -19,6 +19,18 @@ const (
 	harmonyParserState_ParsingContent
 )

+func shouldUseHarmony(model Model) bool {
+	if slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
+		// heuristic to check whether the template expects to be parsed via harmony:
+		// search for harmony tags that are nearly always used
+		if model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>") {
+			return true
+		}
+	}
+
+	return false
+}
+
 func (s harmonyParserState) String() string {
 	switch s {
 	// we're looking for the message start tag
@@ -263,21 +275,19 @@ const (
 // HarmonyMessageHandler processes harmony events and accumulates content appropriately.
 // This is a higher level interface that maps harmony concepts into ollama concepts
 type HarmonyMessageHandler struct {
-	state           harmonyMessageState
-	HarmonyParser   *HarmonyParser
-	FunctionNameMap *FunctionNameMap
+	state         harmonyMessageState
+	harmonyParser *HarmonyParser
 }

 // NewHarmonyMessageHandler creates a new message handler
 func NewHarmonyMessageHandler() *HarmonyMessageHandler {
 	return &HarmonyMessageHandler{
 		state: harmonyMessageState_Normal,
-		HarmonyParser: &HarmonyParser{
+		harmonyParser: &HarmonyParser{
 			MessageStartTag: "<|start|>",
 			MessageEndTag:   "<|end|>",
 			HeaderEndTag:    "<|message|>",
 		},
-		FunctionNameMap: NewFunctionNameMap(),
 	}
 }

@@ -288,7 +298,7 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo
 	thinkingSb := strings.Builder{}
 	toolContentSb := strings.Builder{}

-	events := h.HarmonyParser.AddContent(content)
+	events := h.harmonyParser.AddContent(content)
 	for _, event := range events {
 		switch event := event.(type) {
 		case HarmonyEventHeaderComplete:
@@ -368,97 +378,3 @@ func (a *HarmonyToolCallAccumulator) Drain() (*string, string) {
 func (a *HarmonyToolCallAccumulator) Content() string {
 	return a.acc.String()
 }
-
-// FunctionNameMap maps a user-specified function name to a valid function
-// name for harmony (which look like TypeScript identifiers). This is needed to
-// transform user-specified function names, which might contain characters that
-// are not allowed in TypeScript identifiers
-type FunctionNameMap struct {
-	userToHarmony map[string]string
-	harmonyToUser map[string]string
-}
-
-func NewFunctionNameMap() *FunctionNameMap {
-	return &FunctionNameMap{
-		userToHarmony: make(map[string]string),
-		harmonyToUser: make(map[string]string),
-	}
-}
-
-func (m *FunctionNameMap) ConvertAndAdd(userFunctionName string) string {
-	harmonyFunctionName := m.deriveName(userFunctionName)
-	m.userToHarmony[userFunctionName] = harmonyFunctionName
-	m.harmonyToUser[harmonyFunctionName] = userFunctionName
-	return harmonyFunctionName
-}
-
-// OriginalFromConverted looks up the reverse-mapping of a previously-converted
-// user->harmony function name. To unmap reliably, the mapping must exist, as
-// the conversion process is not reversible without the appropriate state
-func (m *FunctionNameMap) OriginalFromConverted(harmonyFunctionName string) string {
-	if userFunctionName, ok := m.harmonyToUser[harmonyFunctionName]; ok {
-		return userFunctionName
-	}
-	slog.Warn("harmony parser: no reverse mapping found for function name", "harmonyFunctionName", harmonyFunctionName)
-	// fallback to the original function name if we can't find a mapping
-	return harmonyFunctionName
-}
-
-// convertToValidChars converts a user-specified function name to a valid
-// TypeScript identifier.
-//
-// Limitations:
-//
-//   - This doesn't restrict reserved TypeScript keywords.
-//   - We don't perform a real ID_Start/ID_Continue check, and instead use the more
-//     restrictive unicode.IsLetter/unicode.IsDigit check. Unclear what kind of
-//     identifiers these models were trained on, so in the end we might want to
-//     convert unicode-heavy identifiers to their closest ASCII equivalents.
-func (m *FunctionNameMap) convertToValidChars(userFunctionName string) string {
-	mapper := func(r rune) rune {
-		// first, replace certain characters with underscores
-		if r == ' ' || r == '-' || r == '.' {
-			return '_'
-		}
-
-		if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' || r == '$' {
-			return r
-		}
-
-		// finally, remove any other characters
-		return -1
-	}
-	candidate := strings.Map(mapper, userFunctionName)
-
-	// set a default name if we end up with nothing left
-	if candidate == "" {
-		return "unnamed"
-	}
-
-	// if the candidate starts with a number, prepend an underscore to make it a
-	// valid identifier
-	if unicode.IsDigit(rune(candidate[0])) {
-		candidate = "_" + candidate
-	}
-
-	return candidate
-}
-
-func (m *FunctionNameMap) deriveName(userFunctionName string) string {
-	originalCandidate := m.convertToValidChars(userFunctionName)
-	candidate := originalCandidate
-
-	// Check for dupes, and if so, add a number to the end.
-	// We start at 2 because if we have dupes and the first is never renamed, it
-	// makes sense for them to be named, say, `f`, `f_2`, `f_3`
-	count := 2
-	for {
-		if _, exists := m.harmonyToUser[candidate]; !exists {
-			break
-		}
-		candidate = fmt.Sprintf("%s_%d", originalCandidate, count)
-		count++
-	}
-
-	return candidate
-}
--- a/harmony/harmonyparser_test.go
+++ b/harmony/harmonyparser_test.go
@@ -1,4 +1,4 @@
-package harmony
+package server

 import (
 	"fmt"
@@ -467,71 +467,3 @@ func TestHarmonyParserStreaming(t *testing.T) {
 		})
 	}
 }
-
-// TestFunctionConvertToValidChars tests only FunctionNameMap.convert(), which doesn't
-// handle any saving (and therefore no dupe handling)
-func TestFunctionConvertToValidChars(t *testing.T) {
-	tests := []struct {
-		name string
-		in   string
-		want string
-	}{
-		{name: "replace spaces with underscores", in: "get weather", want: "get_weather"},
-		{name: "replace hyphens with underscores", in: "get-weather", want: "get_weather"},
-		{name: "replace periods with underscores", in: "get.weather", want: "get_weather"},
-		{name: "disallow non-word characters", in: "get weather!", want: "get_weather"},
-		{name: "strip out invalid non-alphanumeric unicode characters", in: "a🫠bc", want: "abc"},
-		{name: "names that only contain invalid characters", in: "🫠", want: "unnamed"},
-		{name: "leading number", in: "123", want: "_123"},
-		{name: "$ allowed", in: "$", want: "$"},
-		// show that we allow weird unicode letter characters, though we might want
-		// to convert them to their closest ASCII equivalents in the future
-		{name: "allow weird unicode letter characters", in: "𝓸𝓵𝓵𝓪𝓶𝓪", want: "𝓸𝓵𝓵𝓪𝓶𝓪"},
-		// names that look like words but are invalid (i.e., not ID_Start/ID_Continue)
-		{name: "disallow non-word characters that look like words", in: "ⓞⓛⓛⓐⓜⓐ123", want: "_123"},
-	}
-
-	for i, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			parser := NewFunctionNameMap()
-			got := parser.convertToValidChars(tt.in)
-			if got != tt.want {
-				t.Errorf("case %d: got %q, want %q", i, got, tt.want)
-			}
-		})
-	}
-}
-
-func TestFunctionConvertAndAdd(t *testing.T) {
-	// make a fresh map for each test, but within a test use the same map so we can test for dupe handling
-	tests := []struct {
-		name string
-		in   []string
-		want []string
-	}{
-		{name: "basic dupe handling", in: []string{"get weather", "get weather"}, want: []string{"get_weather", "get_weather_2"}},
-		{name: "dupes from different user-specified names", in: []string{"get weather", "get_weather", "get-weather"}, want: []string{"get_weather", "get_weather_2", "get_weather_3"}},
-		{name: "non dupes after dupes", in: []string{"get weather", "get_weather", "get-weather", "something-different"}, want: []string{"get_weather", "get_weather_2", "get_weather_3", "something_different"}},
-		{name: "multiple sets of dupes", in: []string{"a", "a", "b", "a", "a", "b", "a"}, want: []string{"a", "a_2", "b", "a_3", "a_4", "b_2", "a_5"}},
-	}
-
-	for i, tt := range tests {
-		parser := NewFunctionNameMap()
-		t.Run(tt.name, func(t *testing.T) {
-			for j, in := range tt.in {
-				got := parser.ConvertAndAdd(in)
-				want := tt.want[j]
-				if got != want {
-					t.Errorf("case %d: got %q, want %q", i, got, want)
-				}
-				// check that the maps are correct
-				if parser.userToHarmony[in] != want {
-					t.Errorf("case %d: userToHarmony[%q] = %q, want %q", i, in, parser.userToHarmony[in], want)
-				}
-				if parser.harmonyToUser[want] != in {
-					t.Errorf("case %d: harmonyToUser[%q] = %q, want %q", i, want, parser.harmonyToUser[want], in)
-				}
-			}
-		})
-	}
-}
--- a/server/routes.go
+++ b/server/routes.go
@@ -32,7 +32,6 @@ import (
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
-	"github.com/ollama/ollama/harmony"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/openai"
@@ -46,18 +45,6 @@ import (
 	"github.com/ollama/ollama/version"
 )

-func shouldUseHarmony(model *Model) bool {
-	if slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
-		// heuristic to check whether the template expects to be parsed via harmony:
-		// search for harmony tags that are nearly always used
-		if model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>") {
-			return true
-		}
-	}
-
-	return false
-}
-
 func experimentEnabled(name string) bool {
 	return slices.Contains(strings.Split(os.Getenv("OLLAMA_EXPERIMENT"), ","), name)
 }
@@ -207,12 +194,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}

-	useHarmony := shouldUseHarmony(m) && !req.Raw
-	var harmonyMessageHandler *harmony.HarmonyMessageHandler
-	var harmonyToolParser *harmony.HarmonyToolCallAccumulator
+	useHarmony := shouldUseHarmony(*m) && !req.Raw
+	var harmonyMessageHandler *HarmonyMessageHandler
+	var harmonyToolParser *HarmonyToolCallAccumulator
 	if useHarmony {
-		harmonyMessageHandler = harmony.NewHarmonyMessageHandler()
-		harmonyMessageHandler.HarmonyParser.AddImplicitStart()
+		harmonyMessageHandler = NewHarmonyMessageHandler()
+		harmonyMessageHandler.harmonyParser.AddImplicitStart()
 		harmonyToolParser = harmonyMessageHandler.CreateToolParser()
 	}

@@ -327,19 +314,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		prompt = b.String()
 	}

-	// If debug mode is enabled, return the rendered template instead of calling the model
-	if req.DebugRenderOnly {
-		c.JSON(http.StatusOK, api.DebugTemplateResponse{
-			Model:     req.Model,
-			CreatedAt: time.Now().UTC(),
-			DebugInfo: api.DebugInfo{
-				RenderedTemplate: prompt,
-				ImageCount:       len(images),
-			},
-		})
-		return
-	}
-
 	var thinkingState *thinking.Parser
 	if !useHarmony {
 		openingTag, closingTag := thinking.InferTags(m.Template.Template)
@@ -1616,49 +1590,14 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	}
 	msgs = filterThinkTags(msgs, m)

-	var harmonyMessageHandler *harmony.HarmonyMessageHandler
-	var harmonyToolParser *harmony.HarmonyToolCallAccumulator
-
-	useHarmony := shouldUseHarmony(m)
-
-	processedTools := req.Tools
-	if useHarmony {
-		harmonyMessageHandler = harmony.NewHarmonyMessageHandler()
-		var lastMessage *api.Message
-		if len(msgs) > 0 {
-			lastMessage = &msgs[len(msgs)-1]
-		}
-		harmonyMessageHandler.HarmonyParser.AddImplicitStartOrPrefill(lastMessage)
-		harmonyToolParser = harmonyMessageHandler.CreateToolParser()
-
-		// make a copy of tools to pass to the chat prompt. Function names may be
-		// renamed to be valid Harmony function names.
-		processedTools = make([]api.Tool, len(req.Tools))
-		copy(processedTools, req.Tools)
-		for i, tool := range processedTools {
-			processedTools[i].Function.Name = harmonyMessageHandler.FunctionNameMap.ConvertAndAdd(tool.Function.Name)
-		}
-	}
-
-	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, processedTools, req.Think)
+	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools, req.Think)
 	if err != nil {
 		slog.Error("chat prompt error", "error", err)
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}

-	// If debug mode is enabled, return the rendered template instead of calling the model
-	if req.DebugRenderOnly {
-		c.JSON(http.StatusOK, api.DebugTemplateResponse{
-			Model:     req.Model,
-			CreatedAt: time.Now().UTC(),
-			DebugInfo: api.DebugInfo{
-				RenderedTemplate: prompt,
-				ImageCount:       len(images),
-			},
-		})
-		return
-	}
+	useHarmony := shouldUseHarmony(*m)

 	// Validate Think value: string values currently only allowed for gptoss models
 	if req.Think != nil && req.Think.IsString() && !useHarmony {
@@ -1666,6 +1605,19 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		return
 	}

+	var harmonyMessageHandler *HarmonyMessageHandler
+	var harmonyToolParser *HarmonyToolCallAccumulator
+
+	if useHarmony {
+		harmonyMessageHandler = NewHarmonyMessageHandler()
+		var lastMessage *api.Message
+		if len(msgs) > 0 {
+			lastMessage = &msgs[len(msgs)-1]
+		}
+		harmonyMessageHandler.harmonyParser.AddImplicitStartOrPrefill(lastMessage)
+		harmonyToolParser = harmonyMessageHandler.CreateToolParser()
+	}
+
 	var thinkingState *thinking.Parser
 	openingTag, closingTag := thinking.InferTags(m.Template.Template)
 	if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
@@ -1673,10 +1625,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			OpeningTag: openingTag,
 			ClosingTag: closingTag,
 		}
-
-		if strings.HasSuffix(strings.TrimSpace(prompt), openingTag) {
-			thinkingState.AddContent(openingTag)
-		}
 	}

 	var toolParser *tools.Parser
@@ -1722,7 +1670,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
 					toolName, toolContent := harmonyToolParser.Drain()
 					if toolName != nil {
 						*toolName = strings.TrimPrefix(*toolName, "functions.")
-						*toolName = harmonyMessageHandler.FunctionNameMap.OriginalFromConverted(*toolName)
 						var args api.ToolCallFunctionArguments
 						if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
 							errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
--- a/server/routes_debug_test.go
+++ b/server/routes_debug_test.go
@@ -1,413 +0,0 @@
-package server
-
-import (
-	"bytes"
-	"encoding/json"
-	"net/http"
-	"testing"
-	"time"
-
-	"github.com/gin-gonic/gin"
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/discover"
-	"github.com/ollama/ollama/fs/ggml"
-	"github.com/ollama/ollama/llm"
-)
-
-func TestGenerateDebugRenderOnly(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	mock := mockRunner{
-		CompletionResponse: llm.CompletionResponse{
-			Done:               true,
-			DoneReason:         llm.DoneReasonStop,
-			PromptEvalCount:    1,
-			PromptEvalDuration: 1,
-			EvalCount:          1,
-			EvalDuration:       1,
-		},
-	}
-
-	s := Server{
-		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(&mock),
-			getGpuFn:      discover.GetGPUInfo,
-			getCpuFn:      discover.GetCPUInfo,
-			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
-				// add small delay to simulate loading
-				time.Sleep(time.Millisecond)
-				req.successCh <- &runnerRef{
-					llama: &mock,
-				}
-				return false
-			},
-		},
-	}
-
-	go s.sched.Run(t.Context())
-
-	// Create a test model
-	stream := false
-	_, digest := createBinFile(t, ggml.KV{
-		"general.architecture":          "llama",
-		"llama.block_count":             uint32(1),
-		"llama.context_length":          uint32(8192),
-		"llama.embedding_length":        uint32(4096),
-		"llama.attention.head_count":    uint32(32),
-		"llama.attention.head_count_kv": uint32(8),
-		"tokenizer.ggml.tokens":         []string{""},
-		"tokenizer.ggml.scores":         []float32{0},
-		"tokenizer.ggml.token_type":     []int32{0},
-	}, []*ggml.Tensor{
-		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-	})
-
-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
-		Model:    "test-model",
-		Files:    map[string]string{"file.gguf": digest},
-		Template: "{{ .Prompt }}",
-		Stream:   &stream,
-	})
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d", w.Code)
-	}
-
-	tests := []struct {
-		name            string
-		request         api.GenerateRequest
-		expectDebug     bool
-		expectTemplate  string
-		expectNumImages int
-	}{
-		{
-			name: "debug render only enabled",
-			request: api.GenerateRequest{
-				Model:           "test-model",
-				Prompt:          "Hello, world!",
-				DebugRenderOnly: true,
-			},
-			expectDebug:    true,
-			expectTemplate: "Hello, world!",
-		},
-		{
-			name: "debug render only disabled",
-			request: api.GenerateRequest{
-				Model:           "test-model",
-				Prompt:          "Hello, world!",
-				DebugRenderOnly: false,
-			},
-			expectDebug: false,
-		},
-		{
-			name: "debug render only with system prompt",
-			request: api.GenerateRequest{
-				Model:           "test-model",
-				Prompt:          "User question",
-				System:          "You are a helpful assistant",
-				DebugRenderOnly: true,
-			},
-			expectDebug:    true,
-			expectTemplate: "User question",
-		},
-		{
-			name: "debug render only with template",
-			request: api.GenerateRequest{
-				Model:           "test-model",
-				Prompt:          "Hello",
-				Template:        "PROMPT: {{ .Prompt }}",
-				DebugRenderOnly: true,
-			},
-			expectDebug:    true,
-			expectTemplate: "PROMPT: Hello",
-		},
-		{
-			name: "debug render only with images",
-			request: api.GenerateRequest{
-				Model:           "test-model",
-				Prompt:          "Describe this image",
-				Images:          []api.ImageData{[]byte("fake-image-data")},
-				DebugRenderOnly: true,
-			},
-			expectDebug:     true,
-			expectTemplate:  "[img-0]\n\nDescribe this image",
-			expectNumImages: 1,
-		},
-		{
-			name: "debug render only with raw mode",
-			request: api.GenerateRequest{
-				Model:           "test-model",
-				Prompt:          "Raw prompt text",
-				Raw:             true,
-				DebugRenderOnly: true,
-			},
-			expectDebug:    true,
-			expectTemplate: "Raw prompt text",
-		},
-	}
-
-	for _, tt := range tests {
-		// Test both with and without streaming
-		streamValues := []bool{false, true}
-		for _, stream := range streamValues {
-			streamSuffix := ""
-			if stream {
-				streamSuffix = " (streaming)"
-			}
-			t.Run(tt.name+streamSuffix, func(t *testing.T) {
-				req := tt.request
-				req.Stream = &stream
-				w := createRequest(t, s.GenerateHandler, req)
-
-				if tt.expectDebug {
-					if w.Code != http.StatusOK {
-						t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
-					}
-
-					var response api.DebugTemplateResponse
-					if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
-						t.Fatalf("failed to unmarshal response: %v", err)
-					}
-
-					if response.Model != tt.request.Model {
-						t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
-					}
-
-					if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
-						t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
-					}
-
-					if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
-						t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
-					}
-				} else {
-					// When debug is disabled, it should attempt normal processing
-					if w.Code != http.StatusOK {
-						t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
-					}
-				}
-			})
-		}
-	}
-}
-
-func TestChatDebugRenderOnly(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	mock := mockRunner{
-		CompletionResponse: llm.CompletionResponse{
-			Done:               true,
-			DoneReason:         llm.DoneReasonStop,
-			PromptEvalCount:    1,
-			PromptEvalDuration: 1,
-			EvalCount:          1,
-			EvalDuration:       1,
-		},
-	}
-
-	s := Server{
-		sched: &Scheduler{
-			pendingReqCh:  make(chan *LlmRequest, 1),
-			finishedReqCh: make(chan *LlmRequest, 1),
-			expiredCh:     make(chan *runnerRef, 1),
-			unloadedCh:    make(chan any, 1),
-			loaded:        make(map[string]*runnerRef),
-			newServerFn:   newMockServer(&mock),
-			getGpuFn:      discover.GetGPUInfo,
-			getCpuFn:      discover.GetCPUInfo,
-			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
-				// add small delay to simulate loading
-				time.Sleep(time.Millisecond)
-				req.successCh <- &runnerRef{
-					llama: &mock,
-				}
-				return false
-			},
-		},
-	}
-
-	go s.sched.Run(t.Context())
-
-	// Create a test model
-	stream := false
-	_, digest := createBinFile(t, ggml.KV{
-		"general.architecture":          "llama",
-		"llama.block_count":             uint32(1),
-		"llama.context_length":          uint32(8192),
-		"llama.embedding_length":        uint32(4096),
-		"llama.attention.head_count":    uint32(32),
-		"llama.attention.head_count_kv": uint32(8),
-		"tokenizer.ggml.tokens":         []string{""},
-		"tokenizer.ggml.scores":         []float32{0},
-		"tokenizer.ggml.token_type":     []int32{0},
-	}, []*ggml.Tensor{
-		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-	})
-
-	w := createRequest(t, s.CreateHandler, api.CreateRequest{
-		Model:    "test-model",
-		Files:    map[string]string{"file.gguf": digest},
-		Template: "{{ if .Tools }}{{ .Tools }}{{ end }}{{ range .Messages }}{{ .Role }}: {{ .Content }}\n{{ end }}",
-		Stream:   &stream,
-	})
-
-	if w.Code != http.StatusOK {
-		t.Fatalf("expected status 200, got %d", w.Code)
-	}
-
-	tests := []struct {
-		name            string
-		request         api.ChatRequest
-		expectDebug     bool
-		expectTemplate  string
-		expectNumImages int
-	}{
-		{
-			name: "chat debug render only enabled",
-			request: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{Role: "system", Content: "You are a helpful assistant"},
-					{Role: "user", Content: "Hello"},
-				},
-				DebugRenderOnly: true,
-			},
-			expectDebug:    true,
-			expectTemplate: "system: You are a helpful assistant\nuser: Hello\n",
-		},
-		{
-			name: "chat debug render only disabled",
-			request: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{Role: "user", Content: "Hello"},
-				},
-				DebugRenderOnly: false,
-			},
-			expectDebug: false,
-		},
-		{
-			name: "chat debug with assistant message",
-			request: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{Role: "user", Content: "Hello"},
-					{Role: "assistant", Content: "Hi there!"},
-					{Role: "user", Content: "How are you?"},
-				},
-				DebugRenderOnly: true,
-			},
-			expectDebug:    true,
-			expectTemplate: "user: Hello\nassistant: Hi there!\nuser: How are you?\n",
-		},
-		{
-			name: "chat debug with images",
-			request: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{
-						Role:    "user",
-						Content: "What's in this image?",
-						Images:  []api.ImageData{[]byte("fake-image-data")},
-					},
-				},
-				DebugRenderOnly: true,
-			},
-			expectDebug:     true,
-			expectTemplate:  "user: [img-0]What's in this image?\n",
-			expectNumImages: 1,
-		},
-		{
-			name: "chat debug with tools",
-			request: api.ChatRequest{
-				Model: "test-model",
-				Messages: []api.Message{
-					{Role: "user", Content: "Get the weather"},
-				},
-				Tools: api.Tools{
-					{
-						Type: "function",
-						Function: api.ToolFunction{
-							Name:        "get_weather",
-							Description: "Get weather information",
-						},
-					},
-				},
-				DebugRenderOnly: true,
-			},
-			expectDebug:    true,
-			expectTemplate: "[{\"type\":\"function\",\"function\":{\"name\":\"get_weather\",\"description\":\"Get weather information\",\"parameters\":{\"type\":\"\",\"required\":null,\"properties\":null}}}]user: Get the weather\n",
-		},
-	}
-
-	for _, tt := range tests {
-		// Test both with and without streaming
-		streamValues := []bool{false, true}
-		for _, stream := range streamValues {
-			streamSuffix := ""
-			if stream {
-				streamSuffix = " (streaming)"
-			}
-			t.Run(tt.name+streamSuffix, func(t *testing.T) {
-				req := tt.request
-				req.Stream = &stream
-				w := createRequest(t, s.ChatHandler, req)
-
-				if tt.expectDebug {
-					if w.Code != http.StatusOK {
-						t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
-					}
-
-					var response api.DebugTemplateResponse
-					if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
-						t.Fatalf("failed to unmarshal response: %v", err)
-					}
-
-					if response.Model != tt.request.Model {
-						t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
-					}
-
-					if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
-						t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
-					}
-
-					if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
-						t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
-					}
-				} else {
-					// When debug is disabled, it should attempt normal processing
-					if w.Code != http.StatusOK {
-						t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
-					}
-				}
-			})
-		}
-	}
-}
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -969,233 +969,3 @@ func TestGenerate(t *testing.T) {
 		}
 	})
 }
-
-func TestChatWithPromptEndingInThinkTag(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	// Helper to create a standard thinking test setup
-	setupThinkingTest := func(t *testing.T) (*mockRunner, *Server) {
-		mock := &mockRunner{
-			CompletionResponse: llm.CompletionResponse{
-				Done:               true,
-				DoneReason:         llm.DoneReasonStop,
-				PromptEvalCount:    1,
-				PromptEvalDuration: 1,
-				EvalCount:          1,
-				EvalDuration:       1,
-			},
-		}
-
-		s := &Server{
-			sched: &Scheduler{
-				pendingReqCh:  make(chan *LlmRequest, 1),
-				finishedReqCh: make(chan *LlmRequest, 1),
-				expiredCh:     make(chan *runnerRef, 1),
-				unloadedCh:    make(chan any, 1),
-				loaded:        make(map[string]*runnerRef),
-				newServerFn:   newMockServer(mock),
-				getGpuFn:      discover.GetGPUInfo,
-				getCpuFn:      discover.GetCPUInfo,
-				reschedDelay:  250 * time.Millisecond,
-				loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
-					time.Sleep(time.Millisecond)
-					req.successCh <- &runnerRef{llama: mock}
-					return false
-				},
-			},
-		}
-
-		go s.sched.Run(t.Context())
-
-		// Create a model with thinking support
-		_, digest := createBinFile(t, ggml.KV{
-			"general.architecture":          "llama",
-			"llama.block_count":             uint32(1),
-			"llama.context_length":          uint32(8192),
-			"llama.embedding_length":        uint32(4096),
-			"llama.attention.head_count":    uint32(32),
-			"llama.attention.head_count_kv": uint32(8),
-			"tokenizer.ggml.tokens":         []string{""},
-			"tokenizer.ggml.scores":         []float32{0},
-			"tokenizer.ggml.token_type":     []int32{0},
-		}, []*ggml.Tensor{
-			{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		})
-
-		// Create model with thinking template that adds <think> at the end
-		w := createRequest(t, s.CreateHandler, api.CreateRequest{
-			Model: "test-thinking",
-			Files: map[string]string{"file.gguf": digest},
-			Template: `{{- range .Messages }}
-{{- if eq .Role "user" }}user: {{ .Content }}
-{{ else if eq .Role "assistant" }}assistant: {{ if .Thinking }}<think>{{ .Thinking }}</think>{{ end }}{{ .Content }}
-{{ end }}{{ end }}<think>`,
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Fatalf("expected status 200, got %d", w.Code)
-		}
-
-		return mock, s
-	}
-
-	mock, s := setupThinkingTest(t)
-
-	// Helper to test chat responses
-	testChatRequest := func(t *testing.T, name string, userContent string, modelResponse string, expectedThinking string, expectedContent string, think bool) {
-		t.Run(name, func(t *testing.T) {
-			mock.CompletionResponse = llm.CompletionResponse{
-				Content:            modelResponse,
-				Done:               true,
-				DoneReason:         llm.DoneReasonStop,
-				PromptEvalCount:    1,
-				PromptEvalDuration: 1,
-				EvalCount:          1,
-				EvalDuration:       1,
-			}
-			mock.CompletionFn = nil
-
-			streamRequest := false
-			req := api.ChatRequest{
-				Model: "test-thinking",
-				Messages: []api.Message{
-					{Role: "user", Content: userContent},
-				},
-				Stream: &streamRequest,
-			}
-			if think {
-				req.Think = &api.ThinkValue{Value: think}
-			}
-
-			w := createRequest(t, s.ChatHandler, req)
-			if w.Code != http.StatusOK {
-				t.Fatalf("expected status 200, got %d", w.Code)
-			}
-
-			var resp api.ChatResponse
-			if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
-				t.Fatal(err)
-			}
-
-			if resp.Message.Thinking != expectedThinking {
-				t.Errorf("expected thinking %q, got %q", expectedThinking, resp.Message.Thinking)
-			}
-
-			if resp.Message.Content != expectedContent {
-				t.Errorf("expected content %q, got %q", expectedContent, resp.Message.Content)
-			}
-		})
-	}
-
-	// Test cases - Note: Template adds <think> at the end, and leading whitespace after <think> is eaten by the parser
-	testChatRequest(t, "basic thinking response",
-		"Help me solve this problem",
-		" Let me think about this step by step... </think> The answer is 42.",
-		"Let me think about this step by step... ",
-		"The answer is 42.",
-		true)
-
-	testChatRequest(t, "thinking with multiple sentences",
-		"Explain quantum computing",
-		" First, I need to understand the basics. Quantum bits can be in superposition. </think> Quantum computing uses quantum mechanics principles.",
-		"First, I need to understand the basics. Quantum bits can be in superposition. ",
-		"Quantum computing uses quantum mechanics principles.",
-		true)
-
-	testChatRequest(t, "no thinking content",
-		"What is 2+2?",
-		"</think> The answer is 4.",
-		"",
-		"The answer is 4.",
-		true)
-
-	testChatRequest(t, "thinking disabled but template still adds think tag",
-		"Simple question",
-		" My thoughts </think> The answer.",
-		"",
-		" My thoughts </think> The answer.",
-		false)
-
-	// Test streaming response with template-added <think>
-	t.Run("streaming with thinking", func(t *testing.T) {
-		var wg sync.WaitGroup
-		wg.Add(1)
-
-		mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
-			defer wg.Done()
-
-			// Verify the prompt ends with <think> due to template
-			if !strings.HasSuffix(r.Prompt, "<think>") {
-				t.Errorf("expected prompt to end with <think>, got: %q", r.Prompt)
-			}
-
-			// Simulate streaming chunks
-			responses := []llm.CompletionResponse{
-				{Content: " I need to consider", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
-				{Content: " multiple factors here...", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
-				{Content: " </think> Based on my analysis,", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
-				{Content: " the solution is straightforward.", Done: true, DoneReason: llm.DoneReasonStop, PromptEvalCount: 1, PromptEvalDuration: 1, EvalCount: 1, EvalDuration: 1},
-			}
-
-			for _, resp := range responses {
-				select {
-				case <-ctx.Done():
-					return ctx.Err()
-				default:
-					fn(resp)
-					time.Sleep(10 * time.Millisecond)
-				}
-			}
-			return nil
-		}
-
-		think := true
-		w := createRequest(t, s.ChatHandler, api.ChatRequest{
-			Model:    "test-thinking",
-			Messages: []api.Message{{Role: "user", Content: "Analyze this complex problem"}},
-			Think:    &api.ThinkValue{Value: think},
-			Stream:   &stream,
-		})
-
-		wg.Wait()
-
-		if w.Code != http.StatusOK {
-			t.Fatalf("expected status 200, got %d", w.Code)
-		}
-
-		// Parse streaming responses
-		decoder := json.NewDecoder(w.Body)
-		var allThinking, allContent strings.Builder
-
-		for {
-			var resp api.ChatResponse
-			if err := decoder.Decode(&resp); err == io.EOF {
-				break
-			} else if err != nil {
-				t.Fatal(err)
-			}
-			allThinking.WriteString(resp.Message.Thinking)
-			allContent.WriteString(resp.Message.Content)
-		}
-
-		// Note: Leading whitespace after <think> is eaten by the parser
-		if got := allThinking.String(); got != "I need to consider multiple factors here... " {
-			t.Errorf("expected thinking %q, got %q", "I need to consider multiple factors here... ", got)
-		}
-
-		if got := allContent.String(); got != "Based on my analysis, the solution is straightforward." {
-			t.Errorf("expected content %q, got %q", "Based on my analysis, the solution is straightforward.", got)
-		}
-	})
-}
--- a/thinking/parser.go
+++ b/thinking/parser.go
@@ -103,9 +103,7 @@ func eat(s *Parser) (string, string, bool) {
 			// note that we use the original content, not the trimmed one because we
 			// don't want to eat any whitespace in the real content if there were no
 			// thinking tags
-			untrimmed := s.acc.String()
-			s.acc.Reset()
-			return "", untrimmed, false
+			return "", s.acc.String(), false
 		}
 	case thinkingState_ThinkingStartedEatingWhitespace:
 		trimmed := strings.TrimLeftFunc(s.acc.String(), unicode.IsSpace)
--- a/thinking/parser_test.go
+++ b/thinking/parser_test.go
@@ -58,15 +58,6 @@ func TestThinkingStreaming(t *testing.T) {
 					wantContent:    "  abc",
 					wantStateAfter: thinkingState_ThinkingDone,
 				},
-				// regression test for a bug where we were transitioning directly to
-				// ThinkingDone without clearing the buffer. This would cuase the first
-				// step to be outputted twice
-				{
-					input:          "def",
-					wantThinking:   "",
-					wantContent:    "def",
-					wantStateAfter: thinkingState_ThinkingDone,
-				},
 			},
 		},
 		{
--- a/tools/tools.go
+++ b/tools/tools.go
@@ -224,45 +224,22 @@ func findArguments(buffer []byte) (map[string]any, int) {
 		return nil, 0
 	}

-	start := -1
 	var braces int
-	var inString, escaped bool
-
-	for i := range buffer {
-		c := buffer[i]
-
-		if escaped {
-			escaped = false
-			continue
-		}
-
-		if c == '\\' {
-			escaped = true
-			continue
-		}
-
-		if c == '"' {
-			inString = !inString
-			continue
-		}
-
-		if inString {
-			continue
-		}
+	var start int = -1

+	for i, c := range buffer {
 		if c == '{' {
 			if braces == 0 {
 				start = i
 			}
 			braces++
-		} else if c == '}' {
+		} else if c == '}' && braces > 0 {
 			braces--
 			if braces == 0 && start != -1 {
 				object := buffer[start : i+1]

 				var data map[string]any
 				if err := json.Unmarshal(object, &data); err != nil {
-					// not a valid object, keep looking
 					start = -1
 					continue
 				}
@@ -305,10 +282,6 @@ func findArguments(buffer []byte) (map[string]any, int) {

 				return data, i
 			}
-
-			if braces < 0 {
-				braces = 0
-			}
 		}
 	}

--- a/tools/tools_test.go
+++ b/tools/tools_test.go
@@ -1,7 +1,6 @@
 package tools

 import (
-	"strings"
 	"testing"
 	"text/template"

@@ -41,7 +40,13 @@ func TestParser(t *testing.T) {
 			Function: api.ToolFunction{
 				Name:        "get_temperature",
 				Description: "Retrieve the temperature for a given location",
-				Parameters: api.ToolFunctionParameters{
+				Parameters: struct {
+					Type       string                      `json:"type"`
+					Defs       any                         `json:"$defs,omitempty"`
+					Items      any                         `json:"items,omitempty"`
+					Required   []string                    `json:"required"`
+					Properties map[string]api.ToolProperty `json:"properties"`
+				}{
 					Type:     "object",
 					Required: []string{"city"},
 					Properties: map[string]api.ToolProperty{
@@ -63,7 +68,13 @@ func TestParser(t *testing.T) {
 			Function: api.ToolFunction{
 				Name:        "get_conditions",
 				Description: "Retrieve the current weather conditions for a given location",
-				Parameters: api.ToolFunctionParameters{
+				Parameters: struct {
+					Type       string                      `json:"type"`
+					Defs       any                         `json:"$defs,omitempty"`
+					Items      any                         `json:"items,omitempty"`
+					Required   []string                    `json:"required"`
+					Properties map[string]api.ToolProperty `json:"properties"`
+				}{
 					Type: "object",
 					Properties: map[string]api.ToolProperty{
 						"location": {
@@ -93,7 +104,13 @@ func TestParser(t *testing.T) {
 			Function: api.ToolFunction{
 				Name:        "get_address",
 				Description: "Get the address of a given location",
-				Parameters: api.ToolFunctionParameters{
+				Parameters: struct {
+					Type       string                      `json:"type"`
+					Defs       any                         `json:"$defs,omitempty"`
+					Items      any                         `json:"items,omitempty"`
+					Required   []string                    `json:"required"`
+					Properties map[string]api.ToolProperty `json:"properties"`
+				}{
 					Type: "object",
 					Properties: map[string]api.ToolProperty{
 						"location": {
@@ -109,7 +126,13 @@ func TestParser(t *testing.T) {
 			Function: api.ToolFunction{
 				Name:        "add",
 				Description: "Add two numbers",
-				Parameters: api.ToolFunctionParameters{
+				Parameters: struct {
+					Type       string                      `json:"type"`
+					Defs       any                         `json:"$defs,omitempty"`
+					Items      any                         `json:"items,omitempty"`
+					Required   []string                    `json:"required"`
+					Properties map[string]api.ToolProperty `json:"properties"`
+				}{
 					Type: "object",
 					Properties: map[string]api.ToolProperty{
 						"a": {
@@ -1117,163 +1140,11 @@ func TestFindArguments(t *testing.T) {
 		},
 		{
 			name:   "deepseek",
-			buffer: []byte(`"arguments": {"location": "Tokyo"}}</tool_call>`),
+			buffer: []byte(`", "arguments": {"location": "Tokyo"}}</tool_call>`),
 			want: map[string]any{
 				"location": "Tokyo",
 			},
 		},
-		{
-			name:   "string with braces",
-			buffer: []byte(`{"name": "process_code", "arguments": {"code": "if (x > 0) { return true; }"}}`),
-			want: map[string]any{
-				"code": "if (x > 0) { return true; }",
-			},
-		},
-		{
-			name:   "string with nested json",
-			buffer: []byte(`{"name": "send_data", "arguments": {"payload": "{\"nested\": {\"key\": \"value\"}}"}}`),
-			want: map[string]any{
-				"payload": `{"nested": {"key": "value"}}`,
-			},
-		},
-		{
-			name:   "string with escaped quotes and braces",
-			buffer: []byte(`{"name": "analyze", "arguments": {"text": "The JSON is: {\"key\": \"val{ue}\"}"}}`),
-			want: map[string]any{
-				"text": `The JSON is: {"key": "val{ue}"}`,
-			},
-		},
-		{
-			name:   "multiple objects with string containing braces",
-			buffer: []byte(`{"name": "test", "arguments": {"query": "find } in text"}} {"name": "other"}`),
-			want: map[string]any{
-				"query": "find } in text",
-			},
-		},
-		{
-			name:   "unmatched closing brace in string",
-			buffer: []byte(`{"name": "search", "arguments": {"pattern": "regex: }"}}`),
-			want: map[string]any{
-				"pattern": "regex: }",
-			},
-		},
-		{
-			name:   "complex nested with mixed braces",
-			buffer: []byte(`{"name": "analyze", "arguments": {"data": "{\"items\": [{\"value\": \"}\"}, {\"code\": \"if (x) { return y; }\"}]}"}}`),
-			want: map[string]any{
-				"data": `{"items": [{"value": "}"}, {"code": "if (x) { return y; }"}]}`,
-			},
-		},
-		{
-			name:   "string with newline and braces",
-			buffer: []byte(`{"name": "format", "arguments": {"template": "{\n  \"key\": \"value\"\n}"}}`),
-			want: map[string]any{
-				"template": "{\n  \"key\": \"value\"\n}",
-			},
-		},
-		{
-			name:   "string with unicode escape",
-			buffer: []byte(`{"name": "test", "arguments": {"text": "Unicode: \u007B and \u007D"}}`),
-			want: map[string]any{
-				"text": "Unicode: { and }",
-			},
-		},
-		{
-			name:   "array arguments",
-			buffer: []byte(`{"name": "batch", "arguments": ["item1", "item2", "{\"nested\": true}"]}`),
-			want:   nil, // This should return nil because arguments is not a map
-		},
-		{
-			name:   "escaped backslash before quote",
-			buffer: []byte(`{"name": "path", "arguments": {"dir": "C:\\Program Files\\{App}\\"}}`),
-			want: map[string]any{
-				"dir": `C:\Program Files\{App}\`,
-			},
-		},
-		{
-			name:   "single quotes not treated as string delimiters",
-			buffer: []byte(`{"name": "query", "arguments": {"sql": "SELECT * FROM users WHERE name = '{admin}'"}}`),
-			want: map[string]any{
-				"sql": "SELECT * FROM users WHERE name = '{admin}'",
-			},
-		},
-		{
-			name:   "incomplete json at buffer end",
-			buffer: []byte(`{"name": "test", "arguments": {"data": "some {"`),
-			want:   nil,
-		},
-		{
-			name:   "multiple escaped quotes",
-			buffer: []byte(`{"name": "echo", "arguments": {"msg": "He said \"Hello {World}\" loudly"}}`),
-			want: map[string]any{
-				"msg": `He said "Hello {World}" loudly`,
-			},
-		},
-		{
-			name:   "json with comments style string",
-			buffer: []byte(`{"name": "code", "arguments": {"snippet": "// This is a comment with { and }"}}`),
-			want: map[string]any{
-				"snippet": "// This is a comment with { and }",
-			},
-		},
-		{
-			name:   "consecutive escaped backslashes",
-			buffer: []byte(`{"name": "test", "arguments": {"path": "C:\\\\{folder}\\\\"}}`),
-			want: map[string]any{
-				"path": `C:\\{folder}\\`,
-			},
-		},
-		{
-			name:   "empty string with braces after",
-			buffer: []byte(`{"name": "test", "arguments": {"a": "", "b": "{value}"}}`),
-			want: map[string]any{
-				"a": "",
-				"b": "{value}",
-			},
-		},
-		{
-			name:   "unicode in key names",
-			buffer: []byte(`{"name": "test", "arguments": {"key{": "value", "key}": "value2"}}`),
-			want: map[string]any{
-				"key{": "value",
-				"key}": "value2",
-			},
-		},
-		{
-			name:   "very long string with braces",
-			buffer: []byte(`{"name": "test", "arguments": {"data": "` + strings.Repeat("a{b}c", 100) + `"}}`),
-			want: map[string]any{
-				"data": strings.Repeat("a{b}c", 100),
-			},
-		},
-		{
-			name:   "tab characters and braces",
-			buffer: []byte(`{"name": "test", "arguments": {"code": "\tif (true) {\n\t\treturn;\n\t}"}}`),
-			want: map[string]any{
-				"code": "\tif (true) {\n\t\treturn;\n\t}",
-			},
-		},
-		{
-			name:   "null byte in string",
-			buffer: []byte(`{"name": "test", "arguments": {"data": "before\u0000{after}"}}`),
-			want: map[string]any{
-				"data": "before\x00{after}",
-			},
-		},
-		{
-			name:   "escaped quote at end of string",
-			buffer: []byte(`{"name": "test", "arguments": {"data": "text with quote at end\\\""}}`),
-			want: map[string]any{
-				"data": `text with quote at end\"`,
-			},
-		},
-		{
-			name:   "mixed array and object in arguments",
-			buffer: []byte(`{"name": "test", "arguments": {"items": ["{", "}", {"key": "value"}]}}`),
-			want: map[string]any{
-				"items": []any{"{", "}", map[string]any{"key": "value"}},
-			},
-		},
 	}

 	for _, tt := range tests {