gemma3: scale in attention

gemma2: use fast attention
2025-08-19 13:43:47 -07:00 · 2025-08-19 13:33:12 -07:00
20 changed files with 138 additions and 676 deletions
--- a/api/types.go
+++ b/api/types.go
@@ -286,23 +286,16 @@ func mapToTypeScriptType(jsonType string) string {
 	}
 }

-type ToolFunctionParameters struct {
-	Type       string                  `json:"type"`
-	Defs       any                     `json:"$defs,omitempty"`
-	Items      any                     `json:"items,omitempty"`
-	Required   []string                `json:"required"`
-	Properties map[string]ToolProperty `json:"properties"`
-}
-
-func (t *ToolFunctionParameters) String() string {
-	bts, _ := json.Marshal(t)
-	return string(bts)
-}
-
 type ToolFunction struct {
-	Name        string                 `json:"name"`
-	Description string                 `json:"description"`
-	Parameters  ToolFunctionParameters `json:"parameters"`
+	Name        string `json:"name"`
+	Description string `json:"description"`
+	Parameters  struct {
+		Type       string                  `json:"type"`
+		Defs       any                     `json:"$defs,omitempty"`
+		Items      any                     `json:"items,omitempty"`
+		Required   []string                `json:"required"`
+		Properties map[string]ToolProperty `json:"properties"`
+	} `json:"parameters"`
 }

 func (t *ToolFunction) String() string {
--- a/api/types_test.go
+++ b/api/types_test.go
@@ -436,50 +436,3 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
 		})
 	}
 }
-
-func TestToolFunctionParameters_String(t *testing.T) {
-	tests := []struct {
-		name     string
-		params   ToolFunctionParameters
-		expected string
-	}{
-		{
-			name: "simple object with string property",
-			params: ToolFunctionParameters{
-				Type:     "object",
-				Required: []string{"name"},
-				Properties: map[string]ToolProperty{
-					"name": {
-						Type:        PropertyType{"string"},
-						Description: "The name of the person",
-					},
-				},
-			},
-			expected: `{"type":"object","required":["name"],"properties":{"name":{"type":"string","description":"The name of the person"}}}`,
-		},
-		{
-			name: "marshal failure returns empty string",
-			params: ToolFunctionParameters{
-				Type: "object",
-				Defs: func() any {
-					// Create a cycle that will cause json.Marshal to fail
-					type selfRef struct {
-						Self *selfRef
-					}
-					s := &selfRef{}
-					s.Self = s
-					return s
-				}(),
-				Properties: map[string]ToolProperty{},
-			},
-			expected: "",
-		},
-	}
-
-	for _, test := range tests {
-		t.Run(test.name, func(t *testing.T) {
-			result := test.params.String()
-			assert.Equal(t, test.expected, result)
-		})
-	}
-}
--- a/convert/convert_gptoss.go
+++ b/convert/convert_gptoss.go
@@ -15,24 +15,19 @@ import (

 type gptossModel struct {
 	ModelParameters
-	HiddenLayers          uint32  `json:"num_hidden_layers"`
-	MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
-	HiddenSize            uint32  `json:"hidden_size"`
-	IntermediateSize      uint32  `json:"intermediate_size"`
-	AttentionHeads        uint32  `json:"num_attention_heads"`
-	KeyValueHeads         uint32  `json:"num_key_value_heads"`
-	HeadDim               uint32  `json:"head_dim"`
-	Experts               uint32  `json:"num_experts"`
-	LocalExperts          uint32  `json:"num_local_experts"`
-	ExpertsPerToken       uint32  `json:"experts_per_token"`
-	RMSNormEpsilon        float32 `json:"rms_norm_eps"`
-	InitialContextLength  uint32  `json:"initial_context_length"`
-	RopeTheta             float32 `json:"rope_theta"`
-	RopeScalingFactor     float32 `json:"rope_scaling_factor"`
-	RopeScaling           struct {
-		Factor float32 `json:"factor"`
-	} `json:"rope_scaling"`
-	SlidingWindow uint32 `json:"sliding_window"`
+	HiddenLayers         uint32  `json:"num_hidden_layers"`
+	HiddenSize           uint32  `json:"hidden_size"`
+	IntermediateSize     uint32  `json:"intermediate_size"`
+	AttentionHeads       uint32  `json:"num_attention_heads"`
+	KeyValueHeads        uint32  `json:"num_key_value_heads"`
+	HeadDim              uint32  `json:"head_dim"`
+	Experts              uint32  `json:"num_experts"`
+	ExpertsPerToken      uint32  `json:"experts_per_token"`
+	RMSNormEpsilon       float32 `json:"rms_norm_eps"`
+	InitialContextLength uint32  `json:"initial_context_length"`
+	RopeTheta            float32 `json:"rope_theta"`
+	RopeScalingFactor    float32 `json:"rope_scaling_factor"`
+	SlidingWindow        uint32  `json:"sliding_window"`
 }

 var _ ModelConverter = (*gptossModel)(nil)
@@ -41,11 +36,11 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
 	kv := m.ModelParameters.KV(t)
 	kv["general.architecture"] = "gptoss"
 	kv["general.file_type"] = uint32(4)
-	kv["gptoss.context_length"] = cmp.Or(m.MaxPositionEmbeddings, uint32(m.RopeScalingFactor*float32(m.InitialContextLength)))
+	kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
 	kv["gptoss.block_count"] = m.HiddenLayers
 	kv["gptoss.embedding_length"] = m.HiddenSize
 	kv["gptoss.feed_forward_length"] = m.IntermediateSize
-	kv["gptoss.expert_count"] = cmp.Or(m.Experts, m.LocalExperts)
+	kv["gptoss.expert_count"] = m.Experts
 	kv["gptoss.expert_used_count"] = m.ExpertsPerToken
 	kv["gptoss.attention.head_count"] = m.AttentionHeads
 	kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
@@ -54,7 +49,7 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
 	kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5)
 	kv["gptoss.attention.sliding_window"] = m.SlidingWindow
 	kv["gptoss.rope.freq_base"] = m.RopeTheta
-	kv["gptoss.rope.scaling.factor"] = cmp.Or(m.RopeScalingFactor, m.RopeScaling.Factor)
+	kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor
 	kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
 	kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
 	kv["tokenizer.ggml.add_bos_token"] = false
@@ -97,11 +92,6 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {

 	for name, mxfp4 := range mxfp4s {
 		dims := mxfp4.blocks.Shape()
-
-		if !strings.HasSuffix(name, ".weight") {
-			name += ".weight"
-		}
-
 		out = append(out, &ggml.Tensor{
 			Name:     name,
 			Kind:     uint32(ggml.TensorTypeMXFP4),
@@ -114,47 +104,25 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
 }

 func (m *gptossModel) Replacements() []string {
-	var replacements []string
-	if m.MaxPositionEmbeddings > 0 {
-		// hf flavored model
-		replacements = []string{
-			"lm_head", "output",
-			"model.embed_tokens", "token_embd",
-			"model.layers", "blk",
-			"input_layernorm", "attn_norm",
-			"self_attn.q_proj", "attn_q",
-			"self_attn.k_proj", "attn_k",
-			"self_attn.v_proj", "attn_v",
-			"self_attn.o_proj", "attn_out",
-			"self_attn.sinks", "attn_sinks",
-			"post_attention_layernorm", "ffn_norm",
-			"mlp.router", "ffn_gate_inp",
-			"mlp.experts.gate_up_proj_", "ffn_gate_up_exps.",
-			"mlp.experts.down_proj_", "ffn_down_exps.",
-			"model.norm", "output_norm",
-		}
-	} else {
-		replacements = []string{
-			// noop replacements so other replacements will not be applied
-			".blocks", ".blocks",
-			".scales", ".scales",
-			// real replacements
-			"block", "blk",
-			"attn.norm", "attn_norm",
-			"attn.qkv", "attn_qkv",
-			"attn.sinks", "attn_sinks",
-			"attn.out", "attn_out",
-			"mlp.norm", "ffn_norm",
-			"mlp.gate", "ffn_gate_inp",
-			"mlp.mlp1_", "ffn_gate_up_exps.",
-			"mlp.mlp2_", "ffn_down_exps.",
-			"embedding", "token_embd",
-			"norm", "output_norm",
-			"unembedding", "output",
-			"scale", "weight",
-		}
+	return []string{
+		// noop replacements so other replacements will not be applied
+		".blocks", ".blocks",
+		".scales", ".scales",
+		// real replacements
+		"block", "blk",
+		"attn.norm", "attn_norm",
+		"attn.qkv", "attn_qkv",
+		"attn.sinks", "attn_sinks",
+		"attn.out", "attn_out",
+		"mlp.norm", "ffn_norm",
+		"mlp.gate", "ffn_gate_inp",
+		"mlp.mlp1_", "ffn_gate_up_exps.",
+		"mlp.mlp2_", "ffn_down_exps.",
+		"embedding", "token_embd",
+		"norm", "output_norm",
+		"unembedding", "output",
+		"scale", "weight",
 	}
-	return replacements
 }

 type mxfp4 struct {
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -30,7 +30,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
 			// Try to pack into as few GPUs as possible, starting from 1 GPU
 			for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
 				gpuSubset := sgl[:numGPUs]
-				ok, estimatedVRAM := predictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
+				ok, estimatedVRAM := PredictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)

 				if ok {
 					slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
@@ -48,7 +48,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
 			// - try subsets of GPUs instead of just falling back to 1 or all in a family

 			// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
-			if ok, estimatedVRAM := predictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
+			if ok, estimatedVRAM := PredictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
 				slog.Info("new model will fit in available VRAM, loading",
 					"model", modelPath,
 					"library", sgl[0].Library,
@@ -71,7 +71,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
 	var bestEstimate uint64
 	var bestFit int
 	for i, gl := range byLibrary {
-		_, estimatedVRAM := predictServerFit(gl, f, adapters, projectors, opts, numParallel)
+		_, estimatedVRAM := PredictServerFit(gl, f, adapters, projectors, opts, numParallel)
 		if estimatedVRAM > bestEstimate {
 			bestEstimate = estimatedVRAM
 			bestFit = i
@@ -81,7 +81,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
 }

 // This algorithm looks for a complete fit to determine if we need to unload other models
-func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
+func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
 	// Split up the GPUs by type and try them
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
@@ -97,10 +97,6 @@ func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, proj
 				return true, estimatedVRAM
 			}
 		}
-
-		if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory {
-			return true, estimatedVRAM
-		}
 	}
 	return false, estimatedVRAM
 }
--- a/llm/server.go
+++ b/llm/server.go
@@ -492,7 +492,6 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
 		if !requireFull {
 			g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
 		} else {
-			slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
 			return ErrLoadRequiredFull
 		}
 	}
@@ -525,6 +524,10 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
 		}
 	}

+	if requireFull && len(gpus) == 1 && gpus[0].Library == "cpu" && s.estimate.TotalSize > gpus[0].FreeMemory {
+		return ErrLoadRequiredFull
+	}
+
 	slog.Info("offload", "", s.estimate)

 	s.gpus = gpus
--- a/model/bytepairencoding.go
+++ b/model/bytepairencoding.go
@@ -109,7 +109,7 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
 					r = 0x0143
 				case r <= 0x0020:
 					r = r + 0x0100
-				case r >= 0x007f && r <= 0x00a0:
+				case r >= 0x007e && r <= 0x00a0:
 					r = r + 0x00a2
 				}

--- a/model/bytepairencoding_test.go
+++ b/model/bytepairencoding_test.go
@@ -207,36 +207,6 @@ func TestLlama(t *testing.T) {
 			}
 		}
 	})
-
-	t.Run("roundtriping 0x00-0xFF", func(t *testing.T) {
-		t.Parallel()
-
-		for b := 0x00; b <= 0xFF; b++ {
-			input := string(rune(b))
-			ids, err := tokenizer.Encode(input, false)
-			if err != nil {
-				t.Errorf("failed to encode rune 0x%02X: %v", b, err)
-				continue
-			}
-
-			decoded, err := tokenizer.Decode(ids)
-			if err != nil {
-				t.Errorf("failed to decode rune 0x%02X: %v", b, err)
-				continue
-			}
-
-			if b == 0x00 {
-				if len(decoded) != 0 {
-					t.Errorf("Decode(Encode(0x00)) should be empty, got %v", ids)
-				}
-				continue
-			}
-
-			if decoded != input {
-				t.Errorf("rune 0x%02X failed roundtrip: got %q, want %q", b, decoded, input)
-			}
-		}
-	})
 }

 func BenchmarkBytePairEncoding(b *testing.B) {
--- a/model/models/gemma2/model.go
+++ b/model/models/gemma2/model.go
@@ -69,10 +69,10 @@ func New(c fs.Config) (model.Model, error) {
 		},
 	}

-	slidingWindowLen := int32(c.Uint("attention.sliding_window"))
-	m.Cache = kvcache.NewWrapperCache(kvcache.NewSWACache(slidingWindowLen, m.Shift), kvcache.NewCausalCache(m.Shift))
-	m.Cache.SetConfig(ml.CacheConfig{})
-
+	m.Cache = kvcache.NewWrapperCache(
+		kvcache.NewSWACache(int32(c.Uint("attention.sliding_window")), m.Shift),
+		kvcache.NewCausalCache(m.Shift),
+	)
 	return &m, nil
 }

@@ -90,12 +90,6 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
 	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())

-	if opts.largeModelScaling {
-		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
-	} else {
-		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.attnKeyLen)))
-	}
-
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
 	k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
@@ -103,28 +97,14 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)

-	cache.Put(ctx, k, v)
-	k, v, mask := cache.Get(ctx)
+	scale := 1.0 / math.Sqrt(float64(opts.attnKeyLen))
+	if opts.largeModelScaling {
+		scale = 1.0 / math.Sqrt(float64(opts.hiddenSize/opts.numHeads))
+	}

-	q = q.Permute(ctx, 0, 2, 1, 3)
-	k = k.Permute(ctx, 0, 2, 1, 3)
-	v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
-
-	kq := k.Mulmat(ctx, q)
-
-	// logit softcap
-	kq = kq.Scale(ctx, 1.0/float64(opts.attnLogitSoftcap))
-	kq = kq.Tanh(ctx)
-	kq = kq.Scale(ctx, float64(opts.attnLogitSoftcap))
-
-	kq = kq.Add(ctx, mask)
-	kq = kq.Softmax(ctx)
-
-	kqv := v.Mulmat(ctx, kq)
-	kqv = kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
-	kqv = kqv.Reshape(ctx, opts.attnValLen*opts.numHeads, batchSize)
-
-	return sa.Output.Forward(ctx, kqv)
+	attn := nn.Attention(ctx, q, k, v, scale, cache)
+	attn = attn.Reshape(ctx, opts.attnValLen*opts.numHeads, batchSize)
+	return sa.Output.Forward(ctx, attn)
 }

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
--- a/model/models/gemma3/model_text.go
+++ b/model/models/gemma3/model_text.go
@@ -86,12 +86,6 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
 	q = sa.QueryNorm.Forward(ctx, q, opts.eps)
 	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())

-	if opts.largeModelScaling {
-		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
-	} else {
-		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.attnKeyLen)))
-	}
-
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
 	k = sa.KeyNorm.Forward(ctx, k, opts.eps)
@@ -100,8 +94,12 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)

-	scaleFactor := 1.0
-	kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
+	scale := 1.0 / math.Sqrt(float64(opts.attnKeyLen))
+	if opts.largeModelScaling {
+		scale = 1.0 / math.Sqrt(float64(opts.hiddenSize/opts.numHeads))
+	}
+
+	kqv := nn.Attention(ctx, q, k, v, scale, cache)
 	kqv = kqv.Reshape(ctx, opts.attnValLen*opts.numHeads, batchSize)

 	return sa.Output.Forward(ctx, kqv)
--- a/openai/openai.go
+++ b/openai/openai.go
@@ -557,10 +557,12 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {

 	var think *api.ThinkValue
 	if r.Reasoning != nil {
+		options["reasoning"] = *r.Reasoning.Effort
 		think = &api.ThinkValue{
 			Value: *r.Reasoning.Effort,
 		}
 	} else if r.ReasoningEffort != nil {
+		options["reasoning"] = *r.ReasoningEffort
 		think = &api.ThinkValue{
 			Value: *r.ReasoningEffort,
 		}
--- a/runner/llamarunner/cache.go
+++ b/runner/llamarunner/cache.go
@@ -46,7 +46,7 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
 }

 // Locking: Operations on InputCacheSlot (including finding one
-// through LoadCacheSlot) require a lock to be held that serializes
+// through LoadCacheSlot) require a lock to be held that serializes
 // these operations with each other and llama.Decode

 type InputCacheSlot struct {
--- a/runner/ollamarunner/cache.go
+++ b/runner/ollamarunner/cache.go
@@ -78,7 +78,7 @@ func (c *InputCache) Close() {
 }

 // Locking: Operations on InputCacheSlot (including finding one
-// through LoadCacheSlot) require a lock to be held that serializes
+// through LoadCacheSlot) require a lock to be held that serializes
 // these operations with each other and processBatch

 type InputCacheSlot struct {
--- a/harmony/harmonyparser.go
+++ b/harmony/harmonyparser.go
@@ -1,9 +1,10 @@
-package harmony
+package server

 import (
 	"context"
 	"fmt"
 	"log/slog"
+	"slices"
 	"strings"
 	"unicode"

@@ -19,6 +20,18 @@ const (
 	harmonyParserState_ParsingContent
 )

+func shouldUseHarmony(model Model) bool {
+	if slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
+		// heuristic to check whether the template expects to be parsed via harmony:
+		// search for harmony tags that are nearly always used
+		if model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>") {
+			return true
+		}
+	}
+
+	return false
+}
+
 func (s harmonyParserState) String() string {
 	switch s {
 	// we're looking for the message start tag
@@ -264,20 +277,20 @@ const (
 // This is a higher level interface that maps harmony concepts into ollama concepts
 type HarmonyMessageHandler struct {
 	state           harmonyMessageState
-	HarmonyParser   *HarmonyParser
-	FunctionNameMap *FunctionNameMap
+	harmonyParser   *HarmonyParser
+	functionNameMap *FunctionNameMap
 }

 // NewHarmonyMessageHandler creates a new message handler
 func NewHarmonyMessageHandler() *HarmonyMessageHandler {
 	return &HarmonyMessageHandler{
 		state: harmonyMessageState_Normal,
-		HarmonyParser: &HarmonyParser{
+		harmonyParser: &HarmonyParser{
 			MessageStartTag: "<|start|>",
 			MessageEndTag:   "<|end|>",
 			HeaderEndTag:    "<|message|>",
 		},
-		FunctionNameMap: NewFunctionNameMap(),
+		functionNameMap: NewFunctionNameMap(),
 	}
 }

@@ -288,7 +301,7 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo
 	thinkingSb := strings.Builder{}
 	toolContentSb := strings.Builder{}

-	events := h.HarmonyParser.AddContent(content)
+	events := h.harmonyParser.AddContent(content)
 	for _, event := range events {
 		switch event := event.(type) {
 		case HarmonyEventHeaderComplete:
--- a/harmony/harmonyparser_test.go
+++ b/harmony/harmonyparser_test.go
@@ -1,4 +1,4 @@
-package harmony
+package server

 import (
 	"fmt"
--- a/server/routes.go
+++ b/server/routes.go
@@ -32,7 +32,6 @@ import (
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/fs/ggml"
-	"github.com/ollama/ollama/harmony"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/openai"
@@ -46,18 +45,6 @@ import (
 	"github.com/ollama/ollama/version"
 )

-func shouldUseHarmony(model *Model) bool {
-	if slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
-		// heuristic to check whether the template expects to be parsed via harmony:
-		// search for harmony tags that are nearly always used
-		if model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>") {
-			return true
-		}
-	}
-
-	return false
-}
-
 func experimentEnabled(name string) bool {
 	return slices.Contains(strings.Split(os.Getenv("OLLAMA_EXPERIMENT"), ","), name)
 }
@@ -207,12 +194,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}

-	useHarmony := shouldUseHarmony(m) && !req.Raw
-	var harmonyMessageHandler *harmony.HarmonyMessageHandler
-	var harmonyToolParser *harmony.HarmonyToolCallAccumulator
+	useHarmony := shouldUseHarmony(*m) && !req.Raw
+	var harmonyMessageHandler *HarmonyMessageHandler
+	var harmonyToolParser *HarmonyToolCallAccumulator
 	if useHarmony {
-		harmonyMessageHandler = harmony.NewHarmonyMessageHandler()
-		harmonyMessageHandler.HarmonyParser.AddImplicitStart()
+		harmonyMessageHandler = NewHarmonyMessageHandler()
+		harmonyMessageHandler.harmonyParser.AddImplicitStart()
 		harmonyToolParser = harmonyMessageHandler.CreateToolParser()
 	}

@@ -1616,19 +1603,19 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	}
 	msgs = filterThinkTags(msgs, m)

-	var harmonyMessageHandler *harmony.HarmonyMessageHandler
-	var harmonyToolParser *harmony.HarmonyToolCallAccumulator
+	var harmonyMessageHandler *HarmonyMessageHandler
+	var harmonyToolParser *HarmonyToolCallAccumulator

-	useHarmony := shouldUseHarmony(m)
+	useHarmony := shouldUseHarmony(*m)

 	processedTools := req.Tools
 	if useHarmony {
-		harmonyMessageHandler = harmony.NewHarmonyMessageHandler()
+		harmonyMessageHandler = NewHarmonyMessageHandler()
 		var lastMessage *api.Message
 		if len(msgs) > 0 {
 			lastMessage = &msgs[len(msgs)-1]
 		}
-		harmonyMessageHandler.HarmonyParser.AddImplicitStartOrPrefill(lastMessage)
+		harmonyMessageHandler.harmonyParser.AddImplicitStartOrPrefill(lastMessage)
 		harmonyToolParser = harmonyMessageHandler.CreateToolParser()

 		// make a copy of tools to pass to the chat prompt. Function names may be
@@ -1636,7 +1623,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		processedTools = make([]api.Tool, len(req.Tools))
 		copy(processedTools, req.Tools)
 		for i, tool := range processedTools {
-			processedTools[i].Function.Name = harmonyMessageHandler.FunctionNameMap.ConvertAndAdd(tool.Function.Name)
+			processedTools[i].Function.Name = harmonyMessageHandler.functionNameMap.ConvertAndAdd(tool.Function.Name)
 		}
 	}

@@ -1673,10 +1660,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			OpeningTag: openingTag,
 			ClosingTag: closingTag,
 		}
-
-		if strings.HasSuffix(strings.TrimSpace(prompt), openingTag) {
-			thinkingState.AddContent(openingTag)
-		}
 	}

 	var toolParser *tools.Parser
@@ -1722,7 +1705,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 					toolName, toolContent := harmonyToolParser.Drain()
 					if toolName != nil {
 						*toolName = strings.TrimPrefix(*toolName, "functions.")
-						*toolName = harmonyMessageHandler.FunctionNameMap.OriginalFromConverted(*toolName)
+						*toolName = harmonyMessageHandler.functionNameMap.OriginalFromConverted(*toolName)
 						var args api.ToolCallFunctionArguments
 						if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
 							errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
--- a/server/routes_generate_test.go
+++ b/server/routes_generate_test.go
@@ -969,233 +969,3 @@ func TestGenerate(t *testing.T) {
 		}
 	})
 }
-
-func TestChatWithPromptEndingInThinkTag(t *testing.T) {
-	gin.SetMode(gin.TestMode)
-
-	// Helper to create a standard thinking test setup
-	setupThinkingTest := func(t *testing.T) (*mockRunner, *Server) {
-		mock := &mockRunner{
-			CompletionResponse: llm.CompletionResponse{
-				Done:               true,
-				DoneReason:         llm.DoneReasonStop,
-				PromptEvalCount:    1,
-				PromptEvalDuration: 1,
-				EvalCount:          1,
-				EvalDuration:       1,
-			},
-		}
-
-		s := &Server{
-			sched: &Scheduler{
-				pendingReqCh:  make(chan *LlmRequest, 1),
-				finishedReqCh: make(chan *LlmRequest, 1),
-				expiredCh:     make(chan *runnerRef, 1),
-				unloadedCh:    make(chan any, 1),
-				loaded:        make(map[string]*runnerRef),
-				newServerFn:   newMockServer(mock),
-				getGpuFn:      discover.GetGPUInfo,
-				getCpuFn:      discover.GetCPUInfo,
-				reschedDelay:  250 * time.Millisecond,
-				loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
-					time.Sleep(time.Millisecond)
-					req.successCh <- &runnerRef{llama: mock}
-					return false
-				},
-			},
-		}
-
-		go s.sched.Run(t.Context())
-
-		// Create a model with thinking support
-		_, digest := createBinFile(t, ggml.KV{
-			"general.architecture":          "llama",
-			"llama.block_count":             uint32(1),
-			"llama.context_length":          uint32(8192),
-			"llama.embedding_length":        uint32(4096),
-			"llama.attention.head_count":    uint32(32),
-			"llama.attention.head_count_kv": uint32(8),
-			"tokenizer.ggml.tokens":         []string{""},
-			"tokenizer.ggml.scores":         []float32{0},
-			"tokenizer.ggml.token_type":     []int32{0},
-		}, []*ggml.Tensor{
-			{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-			{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
-		})
-
-		// Create model with thinking template that adds <think> at the end
-		w := createRequest(t, s.CreateHandler, api.CreateRequest{
-			Model: "test-thinking",
-			Files: map[string]string{"file.gguf": digest},
-			Template: `{{- range .Messages }}
-{{- if eq .Role "user" }}user: {{ .Content }}
-{{ else if eq .Role "assistant" }}assistant: {{ if .Thinking }}<think>{{ .Thinking }}</think>{{ end }}{{ .Content }}
-{{ end }}{{ end }}<think>`,
-			Stream: &stream,
-		})
-
-		if w.Code != http.StatusOK {
-			t.Fatalf("expected status 200, got %d", w.Code)
-		}
-
-		return mock, s
-	}
-
-	mock, s := setupThinkingTest(t)
-
-	// Helper to test chat responses
-	testChatRequest := func(t *testing.T, name string, userContent string, modelResponse string, expectedThinking string, expectedContent string, think bool) {
-		t.Run(name, func(t *testing.T) {
-			mock.CompletionResponse = llm.CompletionResponse{
-				Content:            modelResponse,
-				Done:               true,
-				DoneReason:         llm.DoneReasonStop,
-				PromptEvalCount:    1,
-				PromptEvalDuration: 1,
-				EvalCount:          1,
-				EvalDuration:       1,
-			}
-			mock.CompletionFn = nil
-
-			streamRequest := false
-			req := api.ChatRequest{
-				Model: "test-thinking",
-				Messages: []api.Message{
-					{Role: "user", Content: userContent},
-				},
-				Stream: &streamRequest,
-			}
-			if think {
-				req.Think = &api.ThinkValue{Value: think}
-			}
-
-			w := createRequest(t, s.ChatHandler, req)
-			if w.Code != http.StatusOK {
-				t.Fatalf("expected status 200, got %d", w.Code)
-			}
-
-			var resp api.ChatResponse
-			if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
-				t.Fatal(err)
-			}
-
-			if resp.Message.Thinking != expectedThinking {
-				t.Errorf("expected thinking %q, got %q", expectedThinking, resp.Message.Thinking)
-			}
-
-			if resp.Message.Content != expectedContent {
-				t.Errorf("expected content %q, got %q", expectedContent, resp.Message.Content)
-			}
-		})
-	}
-
-	// Test cases - Note: Template adds <think> at the end, and leading whitespace after <think> is eaten by the parser
-	testChatRequest(t, "basic thinking response",
-		"Help me solve this problem",
-		" Let me think about this step by step... </think> The answer is 42.",
-		"Let me think about this step by step... ",
-		"The answer is 42.",
-		true)
-
-	testChatRequest(t, "thinking with multiple sentences",
-		"Explain quantum computing",
-		" First, I need to understand the basics. Quantum bits can be in superposition. </think> Quantum computing uses quantum mechanics principles.",
-		"First, I need to understand the basics. Quantum bits can be in superposition. ",
-		"Quantum computing uses quantum mechanics principles.",
-		true)
-
-	testChatRequest(t, "no thinking content",
-		"What is 2+2?",
-		"</think> The answer is 4.",
-		"",
-		"The answer is 4.",
-		true)
-
-	testChatRequest(t, "thinking disabled but template still adds think tag",
-		"Simple question",
-		" My thoughts </think> The answer.",
-		"",
-		" My thoughts </think> The answer.",
-		false)
-
-	// Test streaming response with template-added <think>
-	t.Run("streaming with thinking", func(t *testing.T) {
-		var wg sync.WaitGroup
-		wg.Add(1)
-
-		mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
-			defer wg.Done()
-
-			// Verify the prompt ends with <think> due to template
-			if !strings.HasSuffix(r.Prompt, "<think>") {
-				t.Errorf("expected prompt to end with <think>, got: %q", r.Prompt)
-			}
-
-			// Simulate streaming chunks
-			responses := []llm.CompletionResponse{
-				{Content: " I need to consider", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
-				{Content: " multiple factors here...", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
-				{Content: " </think> Based on my analysis,", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
-				{Content: " the solution is straightforward.", Done: true, DoneReason: llm.DoneReasonStop, PromptEvalCount: 1, PromptEvalDuration: 1, EvalCount: 1, EvalDuration: 1},
-			}
-
-			for _, resp := range responses {
-				select {
-				case <-ctx.Done():
-					return ctx.Err()
-				default:
-					fn(resp)
-					time.Sleep(10 * time.Millisecond)
-				}
-			}
-			return nil
-		}
-
-		think := true
-		w := createRequest(t, s.ChatHandler, api.ChatRequest{
-			Model:    "test-thinking",
-			Messages: []api.Message{{Role: "user", Content: "Analyze this complex problem"}},
-			Think:    &api.ThinkValue{Value: think},
-			Stream:   &stream,
-		})
-
-		wg.Wait()
-
-		if w.Code != http.StatusOK {
-			t.Fatalf("expected status 200, got %d", w.Code)
-		}
-
-		// Parse streaming responses
-		decoder := json.NewDecoder(w.Body)
-		var allThinking, allContent strings.Builder
-
-		for {
-			var resp api.ChatResponse
-			if err := decoder.Decode(&resp); err == io.EOF {
-				break
-			} else if err != nil {
-				t.Fatal(err)
-			}
-			allThinking.WriteString(resp.Message.Thinking)
-			allContent.WriteString(resp.Message.Content)
-		}
-
-		// Note: Leading whitespace after <think> is eaten by the parser
-		if got := allThinking.String(); got != "I need to consider multiple factors here... " {
-			t.Errorf("expected thinking %q, got %q", "I need to consider multiple factors here... ", got)
-		}
-
-		if got := allContent.String(); got != "Based on my analysis, the solution is straightforward." {
-			t.Errorf("expected content %q, got %q", "Based on my analysis, the solution is straightforward.", got)
-		}
-	})
-}
--- a/thinking/parser.go
+++ b/thinking/parser.go
@@ -103,9 +103,7 @@ func eat(s *Parser) (string, string, bool) {
 			// note that we use the original content, not the trimmed one because we
 			// don't want to eat any whitespace in the real content if there were no
 			// thinking tags
-			untrimmed := s.acc.String()
-			s.acc.Reset()
-			return "", untrimmed, false
+			return "", s.acc.String(), false
 		}
 	case thinkingState_ThinkingStartedEatingWhitespace:
 		trimmed := strings.TrimLeftFunc(s.acc.String(), unicode.IsSpace)
--- a/thinking/parser_test.go
+++ b/thinking/parser_test.go
@@ -58,15 +58,6 @@ func TestThinkingStreaming(t *testing.T) {
 					wantContent:    "  abc",
 					wantStateAfter: thinkingState_ThinkingDone,
 				},
-				// regression test for a bug where we were transitioning directly to
-				// ThinkingDone without clearing the buffer. This would cuase the first
-				// step to be outputted twice
-				{
-					input:          "def",
-					wantThinking:   "",
-					wantContent:    "def",
-					wantStateAfter: thinkingState_ThinkingDone,
-				},
 			},
 		},
 		{
--- a/tools/tools.go
+++ b/tools/tools.go
@@ -224,45 +224,22 @@ func findArguments(buffer []byte) (map[string]any, int) {
 		return nil, 0
 	}

-	start := -1
 	var braces int
-	var inString, escaped bool
-
-	for i := range buffer {
-		c := buffer[i]
-
-		if escaped {
-			escaped = false
-			continue
-		}
-
-		if c == '\\' {
-			escaped = true
-			continue
-		}
-
-		if c == '"' {
-			inString = !inString
-			continue
-		}
-
-		if inString {
-			continue
-		}
+	var start int = -1

+	for i, c := range buffer {
 		if c == '{' {
 			if braces == 0 {
 				start = i
 			}
 			braces++
-		} else if c == '}' {
+		} else if c == '}' && braces > 0 {
 			braces--
 			if braces == 0 && start != -1 {
 				object := buffer[start : i+1]

 				var data map[string]any
 				if err := json.Unmarshal(object, &data); err != nil {
-					// not a valid object, keep looking
 					start = -1
 					continue
 				}
@@ -305,10 +282,6 @@ func findArguments(buffer []byte) (map[string]any, int) {

 				return data, i
 			}
-
-			if braces < 0 {
-				braces = 0
-			}
 		}
 	}

--- a/tools/tools_test.go
+++ b/tools/tools_test.go
@@ -1,7 +1,6 @@
 package tools

 import (
-	"strings"
 	"testing"
 	"text/template"

@@ -41,7 +40,13 @@ func TestParser(t *testing.T) {
 			Function: api.ToolFunction{
 				Name:        "get_temperature",
 				Description: "Retrieve the temperature for a given location",
-				Parameters: api.ToolFunctionParameters{
+				Parameters: struct {
+					Type       string                      `json:"type"`
+					Defs       any                         `json:"$defs,omitempty"`
+					Items      any                         `json:"items,omitempty"`
+					Required   []string                    `json:"required"`
+					Properties map[string]api.ToolProperty `json:"properties"`
+				}{
 					Type:     "object",
 					Required: []string{"city"},
 					Properties: map[string]api.ToolProperty{
@@ -63,7 +68,13 @@ func TestParser(t *testing.T) {
 			Function: api.ToolFunction{
 				Name:        "get_conditions",
 				Description: "Retrieve the current weather conditions for a given location",
-				Parameters: api.ToolFunctionParameters{
+				Parameters: struct {
+					Type       string                      `json:"type"`
+					Defs       any                         `json:"$defs,omitempty"`
+					Items      any                         `json:"items,omitempty"`
+					Required   []string                    `json:"required"`
+					Properties map[string]api.ToolProperty `json:"properties"`
+				}{
 					Type: "object",
 					Properties: map[string]api.ToolProperty{
 						"location": {
@@ -93,7 +104,13 @@ func TestParser(t *testing.T) {
 			Function: api.ToolFunction{
 				Name:        "get_address",
 				Description: "Get the address of a given location",
-				Parameters: api.ToolFunctionParameters{
+				Parameters: struct {
+					Type       string                      `json:"type"`
+					Defs       any                         `json:"$defs,omitempty"`
+					Items      any                         `json:"items,omitempty"`
+					Required   []string                    `json:"required"`
+					Properties map[string]api.ToolProperty `json:"properties"`
+				}{
 					Type: "object",
 					Properties: map[string]api.ToolProperty{
 						"location": {
@@ -109,7 +126,13 @@ func TestParser(t *testing.T) {
 			Function: api.ToolFunction{
 				Name:        "add",
 				Description: "Add two numbers",
-				Parameters: api.ToolFunctionParameters{
+				Parameters: struct {
+					Type       string                      `json:"type"`
+					Defs       any                         `json:"$defs,omitempty"`
+					Items      any                         `json:"items,omitempty"`
+					Required   []string                    `json:"required"`
+					Properties map[string]api.ToolProperty `json:"properties"`
+				}{
 					Type: "object",
 					Properties: map[string]api.ToolProperty{
 						"a": {
@@ -1117,163 +1140,11 @@ func TestFindArguments(t *testing.T) {
 		},
 		{
 			name:   "deepseek",
-			buffer: []byte(`"arguments": {"location": "Tokyo"}}</tool_call>`),
+			buffer: []byte(`", "arguments": {"location": "Tokyo"}}</tool_call>`),
 			want: map[string]any{
 				"location": "Tokyo",
 			},
 		},
-		{
-			name:   "string with braces",
-			buffer: []byte(`{"name": "process_code", "arguments": {"code": "if (x > 0) { return true; }"}}`),
-			want: map[string]any{
-				"code": "if (x > 0) { return true; }",
-			},
-		},
-		{
-			name:   "string with nested json",
-			buffer: []byte(`{"name": "send_data", "arguments": {"payload": "{\"nested\": {\"key\": \"value\"}}"}}`),
-			want: map[string]any{
-				"payload": `{"nested": {"key": "value"}}`,
-			},
-		},
-		{
-			name:   "string with escaped quotes and braces",
-			buffer: []byte(`{"name": "analyze", "arguments": {"text": "The JSON is: {\"key\": \"val{ue}\"}"}}`),
-			want: map[string]any{
-				"text": `The JSON is: {"key": "val{ue}"}`,
-			},
-		},
-		{
-			name:   "multiple objects with string containing braces",
-			buffer: []byte(`{"name": "test", "arguments": {"query": "find } in text"}} {"name": "other"}`),
-			want: map[string]any{
-				"query": "find } in text",
-			},
-		},
-		{
-			name:   "unmatched closing brace in string",
-			buffer: []byte(`{"name": "search", "arguments": {"pattern": "regex: }"}}`),
-			want: map[string]any{
-				"pattern": "regex: }",
-			},
-		},
-		{
-			name:   "complex nested with mixed braces",
-			buffer: []byte(`{"name": "analyze", "arguments": {"data": "{\"items\": [{\"value\": \"}\"}, {\"code\": \"if (x) { return y; }\"}]}"}}`),
-			want: map[string]any{
-				"data": `{"items": [{"value": "}"}, {"code": "if (x) { return y; }"}]}`,
-			},
-		},
-		{
-			name:   "string with newline and braces",
-			buffer: []byte(`{"name": "format", "arguments": {"template": "{\n  \"key\": \"value\"\n}"}}`),
-			want: map[string]any{
-				"template": "{\n  \"key\": \"value\"\n}",
-			},
-		},
-		{
-			name:   "string with unicode escape",
-			buffer: []byte(`{"name": "test", "arguments": {"text": "Unicode: \u007B and \u007D"}}`),
-			want: map[string]any{
-				"text": "Unicode: { and }",
-			},
-		},
-		{
-			name:   "array arguments",
-			buffer: []byte(`{"name": "batch", "arguments": ["item1", "item2", "{\"nested\": true}"]}`),
-			want:   nil, // This should return nil because arguments is not a map
-		},
-		{
-			name:   "escaped backslash before quote",
-			buffer: []byte(`{"name": "path", "arguments": {"dir": "C:\\Program Files\\{App}\\"}}`),
-			want: map[string]any{
-				"dir": `C:\Program Files\{App}\`,
-			},
-		},
-		{
-			name:   "single quotes not treated as string delimiters",
-			buffer: []byte(`{"name": "query", "arguments": {"sql": "SELECT * FROM users WHERE name = '{admin}'"}}`),
-			want: map[string]any{
-				"sql": "SELECT * FROM users WHERE name = '{admin}'",
-			},
-		},
-		{
-			name:   "incomplete json at buffer end",
-			buffer: []byte(`{"name": "test", "arguments": {"data": "some {"`),
-			want:   nil,
-		},
-		{
-			name:   "multiple escaped quotes",
-			buffer: []byte(`{"name": "echo", "arguments": {"msg": "He said \"Hello {World}\" loudly"}}`),
-			want: map[string]any{
-				"msg": `He said "Hello {World}" loudly`,
-			},
-		},
-		{
-			name:   "json with comments style string",
-			buffer: []byte(`{"name": "code", "arguments": {"snippet": "// This is a comment with { and }"}}`),
-			want: map[string]any{
-				"snippet": "// This is a comment with { and }",
-			},
-		},
-		{
-			name:   "consecutive escaped backslashes",
-			buffer: []byte(`{"name": "test", "arguments": {"path": "C:\\\\{folder}\\\\"}}`),
-			want: map[string]any{
-				"path": `C:\\{folder}\\`,
-			},
-		},
-		{
-			name:   "empty string with braces after",
-			buffer: []byte(`{"name": "test", "arguments": {"a": "", "b": "{value}"}}`),
-			want: map[string]any{
-				"a": "",
-				"b": "{value}",
-			},
-		},
-		{
-			name:   "unicode in key names",
-			buffer: []byte(`{"name": "test", "arguments": {"key{": "value", "key}": "value2"}}`),
-			want: map[string]any{
-				"key{": "value",
-				"key}": "value2",
-			},
-		},
-		{
-			name:   "very long string with braces",
-			buffer: []byte(`{"name": "test", "arguments": {"data": "` + strings.Repeat("a{b}c", 100) + `"}}`),
-			want: map[string]any{
-				"data": strings.Repeat("a{b}c", 100),
-			},
-		},
-		{
-			name:   "tab characters and braces",
-			buffer: []byte(`{"name": "test", "arguments": {"code": "\tif (true) {\n\t\treturn;\n\t}"}}`),
-			want: map[string]any{
-				"code": "\tif (true) {\n\t\treturn;\n\t}",
-			},
-		},
-		{
-			name:   "null byte in string",
-			buffer: []byte(`{"name": "test", "arguments": {"data": "before\u0000{after}"}}`),
-			want: map[string]any{
-				"data": "before\x00{after}",
-			},
-		},
-		{
-			name:   "escaped quote at end of string",
-			buffer: []byte(`{"name": "test", "arguments": {"data": "text with quote at end\\\""}}`),
-			want: map[string]any{
-				"data": `text with quote at end\"`,
-			},
-		},
-		{
-			name:   "mixed array and object in arguments",
-			buffer: []byte(`{"name": "test", "arguments": {"items": ["{", "}", {"key": "value"}]}}`),
-			want: map[string]any{
-				"items": []any{"{", "}", map[string]any{"key": "value"}},
-			},
-		},
 	}

 	for _, tt := range tests {
Author	SHA1	Message	Date
Michael Yang	12a7e5ec46	gemma3: scale in attention	2025-08-19 13:43:47 -07:00
Michael Yang	b323cfe731	gemma2: use fast attention	2025-08-19 13:33:12 -07:00