Compare commits


14 Commits

Author SHA1 Message Date
Jeffrey Morgan
d3450dd52e api: implement stringer for ToolFunctionParameters (#12038) 2025-08-22 16:26:48 -07:00
Jeffrey Morgan
4bcb04ad88 tools: avoid matching braces that are part of tool content (#12039) 2025-08-22 15:22:14 -07:00
Devon Rifkin
e3d5708754 Merge pull request #12021 from ollama/drifkin/thinking-double-emit
thinking: fix double emit when no opening tag
2025-08-22 12:01:37 -07:00
Jeffrey Morgan
4be4dc8717 server: skip parsing initial <think> if provided in the prompt (#12024) 2025-08-22 12:00:16 -07:00
zoupingshi
109d4fc3b4 chore: remove redundant words in comment (#12028)
Signed-off-by: zoupingshi <hangfachang@outlook.com>
2025-08-22 11:00:27 -07:00
Devon Rifkin
2cb0a580f3 thinking: fix double emit when no opening tag
The thinking parser will automatically transition to being a
pass-through if non-whitespace is seen before an opening tag. However,
we weren't clearing the buffer after the first non-whitespace input, so
in practice the first token would be emitted twice.

Added a test that demonstrated this, and then fixed the bug.
2025-08-21 21:03:12 -07:00
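
A minimal sketch of the buffering behavior described above (not the repository's parser; names here are illustrative): a parser that buffers input while waiting for an opening tag must flush that buffer exactly once when it gives up and becomes a pass-through.

```go
package main

import (
	"fmt"
	"strings"
	"unicode"
)

// passthroughParser buffers input while it is still only whitespace and
// therefore might precede an opening <think> tag. On the first
// non-whitespace input with no tag seen, it flushes the buffer and must
// reset it; skipping the Reset is the double-emit bug described above.
type passthroughParser struct {
	acc strings.Builder
}

func (p *passthroughParser) add(s string) string {
	p.acc.WriteString(s)
	if strings.TrimLeftFunc(p.acc.String(), unicode.IsSpace) == "" {
		return "" // still only whitespace: keep buffering
	}
	out := p.acc.String()
	p.acc.Reset() // without this, the next call re-emits this prefix
	return out
}

func main() {
	p := &passthroughParser{}
	fmt.Printf("%q\n", p.add(" abc")) // " abc"
	fmt.Printf("%q\n", p.add("def"))  // "def", not " abcdef"
}
```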
Parth Sareen
7cce5aac76 harmony: move harmony parsing into a package (#12016) 2025-08-21 13:56:22 -07:00
Michael Yang
4ae4f47b16 gpt-oss: convert from hugging face format (#11907) 2025-08-20 15:39:18 -07:00
Jesse Gross
073fa31df5 llm: Don't always evict models in CPU-only mode
With old memory estimates, it's currently impossible to load more
than one model at a time when no GPUs are available. This is because
the check for whether we need to evict a model looks to see if all
layers of the new model can be loaded onto GPUs, which is never true
if there are no GPUs. Before the memory management changes, there
was a special code path for CPU-only systems.

This problem does not exist with new memory estimates.

Fixes #11974
2025-08-20 14:31:02 -07:00
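
A minimal sketch of the fit decision described above, using a simplified GPU descriptor; the type, field, and function names here are illustrative, not the repository's:

```go
package main

import "fmt"

// gpuInfo is a pared-down stand-in for the scheduler's GPU descriptor.
type gpuInfo struct {
	Library    string
	FreeMemory uint64
}

// fitsFully reports whether a new model fully fits without evicting others.
// The GPU-layer check alone can never succeed on a CPU-only system, so the
// added branch decides by total estimated size vs. free system memory.
func fitsFully(gpus []gpuInfo, allLayersFitOnGPU bool, totalSize uint64) bool {
	if allLayersFitOnGPU {
		return true
	}
	if len(gpus) == 1 && gpus[0].Library == "cpu" && totalSize <= gpus[0].FreeMemory {
		return true // CPU-only: no layers to place, so size is the only test
	}
	return false
}

func main() {
	cpuOnly := []gpuInfo{{Library: "cpu", FreeMemory: 8 << 30}}
	fmt.Println(fitsFully(cpuOnly, false, 4<<30))  // true: no eviction needed
	fmt.Println(fitsFully(cpuOnly, false, 16<<30)) // false: would still evict
}
```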
Michael Yang
91fc3c48e3 openai: remove reasoning as an api.Options (#11993) 2025-08-20 12:21:42 -07:00
Devon Rifkin
6de62664d9 Merge pull request #11973 from ollama/drifkin/bpe
model: fix boundary in bpe
2025-08-19 22:58:33 -07:00
Devon Rifkin
463a6caad8 model: add bpe roundtripping tests 2025-08-19 22:05:48 -07:00
Devon Rifkin
fc5fb09f51 model: fix boundary in bpe
0x007e is a tilde and was getting adjusted (+0x00a2) to 0x0120 in the
encode, but then in the decode it was getting adjusted down (-0x0100) to
0x0020. The boundary for the +0x00a2 case has been adjusted to fix this.

Fixes: #11966
2025-08-19 18:34:49 -07:00
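
The arithmetic behind the fix, as a small sketch: the encode boundaries match the diff further down, while the decode side here is an assumed inverse of that map rather than code from the repo.

```go
package main

import "fmt"

// encodeByte remaps a raw byte into the printable rune space used by the
// BPE vocabulary (encode boundaries as in the diff below).
func encodeByte(r rune) rune {
	switch {
	case r <= 0x0020:
		return r + 0x0100 // control chars and space -> 0x0100..0x0120
	case r >= 0x007f && r <= 0x00a0: // was `r >= 0x007e`, which caught '~'
		return r + 0x00a2
	}
	return r
}

// decodeRune is the assumed inverse of encodeByte.
func decodeRune(r rune) rune {
	switch {
	case r >= 0x0100 && r <= 0x0120:
		return r - 0x0100
	case r >= 0x0121 && r <= 0x0142:
		return r - 0x00a2
	}
	return r
}

func main() {
	// With the old boundary, '~' (0x7E) became 0x7E+0xA2 = 0x120, and the
	// decoder mapped 0x120 back down by 0x100 to 0x20 (a space). With the
	// boundary at 0x7F, '~' is left alone and roundtrips.
	fmt.Printf("%q\n", decodeRune(encodeByte('~'))) // '~'
}
```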
Jesse Gross
05ccb17c6e kvcache: Use Cast instead of Copy for flash attention masks
Flash attention kernels require the mask of the KV cache to be an F16
rather than an F32. We can use the GGML operation ggml_cast to do
this rather than doing it ourselves, which allows reuse of a
preallocated buffer in the graph rather than allocating a new one
for each batch. This improves token generation performance with
flash attention by 10-30% (with gpt-oss). This also makes performance
with flash attention better than without it, as expected.
2025-08-19 12:36:28 -07:00
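
The change itself is small; restated here with comments, using the ml.Tensor and ml.Context calls as they appear in the kvcache hunk below:

```go
// Before: allocate a fresh tensor each batch and schedule a copy into it.
out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
ctx.Forward(maskTensor.Copy(ctx, out))
maskTensor = out

// After: a single cast node (ggml_cast underneath). The graph can reuse a
// preallocated buffer for the result instead of allocating per batch.
maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
```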
21 changed files with 666 additions and 141 deletions

View File

@@ -286,16 +286,23 @@ func mapToTypeScriptType(jsonType string) string {
}
}
type ToolFunctionParameters struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required"`
Properties map[string]ToolProperty `json:"properties"`
}
func (t *ToolFunctionParameters) String() string {
bts, _ := json.Marshal(t)
return string(bts)
}
type ToolFunction struct {
Name string `json:"name"`
Description string `json:"description"`
Parameters struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required"`
Properties map[string]ToolProperty `json:"properties"`
} `json:"parameters"`
Name string `json:"name"`
Description string `json:"description"`
Parameters ToolFunctionParameters `json:"parameters"`
}
func (t *ToolFunction) String() string {

View File

@@ -436,3 +436,50 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
})
}
}
func TestToolFunctionParameters_String(t *testing.T) {
tests := []struct {
name string
params ToolFunctionParameters
expected string
}{
{
name: "simple object with string property",
params: ToolFunctionParameters{
Type: "object",
Required: []string{"name"},
Properties: map[string]ToolProperty{
"name": {
Type: PropertyType{"string"},
Description: "The name of the person",
},
},
},
expected: `{"type":"object","required":["name"],"properties":{"name":{"type":"string","description":"The name of the person"}}}`,
},
{
name: "marshal failure returns empty string",
params: ToolFunctionParameters{
Type: "object",
Defs: func() any {
// Create a cycle that will cause json.Marshal to fail
type selfRef struct {
Self *selfRef
}
s := &selfRef{}
s.Self = s
return s
}(),
Properties: map[string]ToolProperty{},
},
expected: "",
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
result := test.params.String()
assert.Equal(t, test.expected, result)
})
}
}

View File

@@ -15,19 +15,24 @@ import (
type gptossModel struct {
ModelParameters
HiddenLayers uint32 `json:"num_hidden_layers"`
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
AttentionHeads uint32 `json:"num_attention_heads"`
KeyValueHeads uint32 `json:"num_key_value_heads"`
HeadDim uint32 `json:"head_dim"`
Experts uint32 `json:"num_experts"`
ExpertsPerToken uint32 `json:"experts_per_token"`
RMSNormEpsilon float32 `json:"rms_norm_eps"`
InitialContextLength uint32 `json:"initial_context_length"`
RopeTheta float32 `json:"rope_theta"`
RopeScalingFactor float32 `json:"rope_scaling_factor"`
SlidingWindow uint32 `json:"sliding_window"`
HiddenLayers uint32 `json:"num_hidden_layers"`
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
AttentionHeads uint32 `json:"num_attention_heads"`
KeyValueHeads uint32 `json:"num_key_value_heads"`
HeadDim uint32 `json:"head_dim"`
Experts uint32 `json:"num_experts"`
LocalExperts uint32 `json:"num_local_experts"`
ExpertsPerToken uint32 `json:"experts_per_token"`
RMSNormEpsilon float32 `json:"rms_norm_eps"`
InitialContextLength uint32 `json:"initial_context_length"`
RopeTheta float32 `json:"rope_theta"`
RopeScalingFactor float32 `json:"rope_scaling_factor"`
RopeScaling struct {
Factor float32 `json:"factor"`
} `json:"rope_scaling"`
SlidingWindow uint32 `json:"sliding_window"`
}
var _ ModelConverter = (*gptossModel)(nil)
@@ -36,11 +41,11 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
kv := m.ModelParameters.KV(t)
kv["general.architecture"] = "gptoss"
kv["general.file_type"] = uint32(4)
kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
kv["gptoss.context_length"] = cmp.Or(m.MaxPositionEmbeddings, uint32(m.RopeScalingFactor*float32(m.InitialContextLength)))
kv["gptoss.block_count"] = m.HiddenLayers
kv["gptoss.embedding_length"] = m.HiddenSize
kv["gptoss.feed_forward_length"] = m.IntermediateSize
kv["gptoss.expert_count"] = m.Experts
kv["gptoss.expert_count"] = cmp.Or(m.Experts, m.LocalExperts)
kv["gptoss.expert_used_count"] = m.ExpertsPerToken
kv["gptoss.attention.head_count"] = m.AttentionHeads
kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
@@ -49,7 +54,7 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5)
kv["gptoss.attention.sliding_window"] = m.SlidingWindow
kv["gptoss.rope.freq_base"] = m.RopeTheta
kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor
kv["gptoss.rope.scaling.factor"] = cmp.Or(m.RopeScalingFactor, m.RopeScaling.Factor)
kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
kv["tokenizer.ggml.add_bos_token"] = false
@@ -92,6 +97,11 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
for name, mxfp4 := range mxfp4s {
dims := mxfp4.blocks.Shape()
if !strings.HasSuffix(name, ".weight") {
name += ".weight"
}
out = append(out, &ggml.Tensor{
Name: name,
Kind: uint32(ggml.TensorTypeMXFP4),
@@ -104,25 +114,47 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
}
func (m *gptossModel) Replacements() []string {
return []string{
// noop replacements so other replacements will not be applied
".blocks", ".blocks",
".scales", ".scales",
// real replacements
"block", "blk",
"attn.norm", "attn_norm",
"attn.qkv", "attn_qkv",
"attn.sinks", "attn_sinks",
"attn.out", "attn_out",
"mlp.norm", "ffn_norm",
"mlp.gate", "ffn_gate_inp",
"mlp.mlp1_", "ffn_gate_up_exps.",
"mlp.mlp2_", "ffn_down_exps.",
"embedding", "token_embd",
"norm", "output_norm",
"unembedding", "output",
"scale", "weight",
var replacements []string
if m.MaxPositionEmbeddings > 0 {
// hf flavored model
replacements = []string{
"lm_head", "output",
"model.embed_tokens", "token_embd",
"model.layers", "blk",
"input_layernorm", "attn_norm",
"self_attn.q_proj", "attn_q",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.o_proj", "attn_out",
"self_attn.sinks", "attn_sinks",
"post_attention_layernorm", "ffn_norm",
"mlp.router", "ffn_gate_inp",
"mlp.experts.gate_up_proj_", "ffn_gate_up_exps.",
"mlp.experts.down_proj_", "ffn_down_exps.",
"model.norm", "output_norm",
}
} else {
replacements = []string{
// noop replacements so other replacements will not be applied
".blocks", ".blocks",
".scales", ".scales",
// real replacements
"block", "blk",
"attn.norm", "attn_norm",
"attn.qkv", "attn_qkv",
"attn.sinks", "attn_sinks",
"attn.out", "attn_out",
"mlp.norm", "ffn_norm",
"mlp.gate", "ffn_gate_inp",
"mlp.mlp1_", "ffn_gate_up_exps.",
"mlp.mlp2_", "ffn_down_exps.",
"embedding", "token_embd",
"norm", "output_norm",
"unembedding", "output",
"scale", "weight",
}
}
return replacements
}
type mxfp4 struct {

View File

@@ -1,10 +1,9 @@
package server
package harmony
import (
"context"
"fmt"
"log/slog"
"slices"
"strings"
"unicode"
@@ -20,18 +19,6 @@ const (
harmonyParserState_ParsingContent
)
func shouldUseHarmony(model Model) bool {
if slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
// heuristic to check whether the template expects to be parsed via harmony:
// search for harmony tags that are nearly always used
if model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>") {
return true
}
}
return false
}
func (s harmonyParserState) String() string {
switch s {
// we're looking for the message start tag
@@ -277,20 +264,20 @@ const (
// This is a higher level interface that maps harmony concepts into ollama concepts
type HarmonyMessageHandler struct {
state harmonyMessageState
harmonyParser *HarmonyParser
functionNameMap *FunctionNameMap
HarmonyParser *HarmonyParser
FunctionNameMap *FunctionNameMap
}
// NewHarmonyMessageHandler creates a new message handler
func NewHarmonyMessageHandler() *HarmonyMessageHandler {
return &HarmonyMessageHandler{
state: harmonyMessageState_Normal,
harmonyParser: &HarmonyParser{
HarmonyParser: &HarmonyParser{
MessageStartTag: "<|start|>",
MessageEndTag: "<|end|>",
HeaderEndTag: "<|message|>",
},
functionNameMap: NewFunctionNameMap(),
FunctionNameMap: NewFunctionNameMap(),
}
}
@@ -301,7 +288,7 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo
thinkingSb := strings.Builder{}
toolContentSb := strings.Builder{}
events := h.harmonyParser.AddContent(content)
events := h.HarmonyParser.AddContent(content)
for _, event := range events {
switch event := event.(type) {
case HarmonyEventHeaderComplete:

View File

@@ -1,4 +1,4 @@
package server
package harmony
import (
"fmt"

View File

@@ -378,9 +378,7 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)
if c.config.MaskDType != ml.DTypeF32 {
out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
ctx.Forward(maskTensor.Copy(ctx, out))
maskTensor = out
maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
}
return maskTensor

View File

@@ -30,7 +30,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
// Try to pack into as few GPUs as possible, starting from 1 GPU
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
gpuSubset := sgl[:numGPUs]
ok, estimatedVRAM := PredictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
ok, estimatedVRAM := predictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
if ok {
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
@@ -48,7 +48,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
if ok, estimatedVRAM := PredictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
if ok, estimatedVRAM := predictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
slog.Info("new model will fit in available VRAM, loading",
"model", modelPath,
"library", sgl[0].Library,
@@ -71,7 +71,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := PredictServerFit(gl, f, adapters, projectors, opts, numParallel)
_, estimatedVRAM := predictServerFit(gl, f, adapters, projectors, opts, numParallel)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
@@ -81,7 +81,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
}
// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
// Split up the GPUs by type and try them
var estimatedVRAM uint64
for _, gpus := range allGpus.ByLibrary() {
@@ -97,6 +97,10 @@ func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, proj
return true, estimatedVRAM
}
}
if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory {
return true, estimatedVRAM
}
}
return false, estimatedVRAM
}

View File

@@ -492,6 +492,7 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
if !requireFull {
g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
} else {
slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
return ErrLoadRequiredFull
}
}
@@ -524,10 +525,6 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
}
}
if requireFull && len(gpus) == 1 && gpus[0].Library == "cpu" && s.estimate.TotalSize > gpus[0].FreeMemory {
return ErrLoadRequiredFull
}
slog.Info("offload", "", s.estimate)
s.gpus = gpus

View File

@@ -396,6 +396,7 @@ type Tensor interface {
Shape() []int
DType() DType
Cast(ctx Context, dtype DType) Tensor
Bytes() []byte
Floats() []float32

View File

@@ -843,23 +843,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
panic("set Input or Layer before creating tensors")
}
var cdtype uint32
switch dtype {
case ml.DTypeF32:
cdtype = C.GGML_TYPE_F32
case ml.DTypeF16:
cdtype = C.GGML_TYPE_F16
case ml.DTypeQ80:
cdtype = C.GGML_TYPE_Q8_0
case ml.DTypeQ40:
cdtype = C.GGML_TYPE_Q4_0
case ml.DTypeI32:
cdtype = C.GGML_TYPE_I32
case ml.DTypeMXFP4:
cdtype = C.GGML_TYPE_MXFP4
default:
panic("unsupported dtype")
}
cdtype := ggmlDType(dtype)
if len(shape) < 1 || shape[0] == 0 {
var shape C.int64_t = 0
@@ -1056,6 +1040,32 @@ func (t *Tensor) DType() ml.DType {
}
}
func ggmlDType(dtype ml.DType) uint32 {
switch dtype {
case ml.DTypeF32:
return C.GGML_TYPE_F32
case ml.DTypeF16:
return C.GGML_TYPE_F16
case ml.DTypeQ80:
return C.GGML_TYPE_Q8_0
case ml.DTypeQ40:
return C.GGML_TYPE_Q4_0
case ml.DTypeI32:
return C.GGML_TYPE_I32
case ml.DTypeMXFP4:
return C.GGML_TYPE_MXFP4
default:
panic("unsupported dtype")
}
}
func (t *Tensor) Cast(ctx ml.Context, dtype ml.DType) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_cast(ctx.(*Context).ctx, t.t, ggmlDType(dtype)),
}
}
func (t *Tensor) Neg(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,

View File

@@ -109,7 +109,7 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
r = 0x0143
case r <= 0x0020:
r = r + 0x0100
case r >= 0x007e && r <= 0x00a0:
case r >= 0x007f && r <= 0x00a0:
r = r + 0x00a2
}

View File

@@ -207,6 +207,36 @@ func TestLlama(t *testing.T) {
}
}
})
t.Run("roundtriping 0x00-0xFF", func(t *testing.T) {
t.Parallel()
for b := 0x00; b <= 0xFF; b++ {
input := string(rune(b))
ids, err := tokenizer.Encode(input, false)
if err != nil {
t.Errorf("failed to encode rune 0x%02X: %v", b, err)
continue
}
decoded, err := tokenizer.Decode(ids)
if err != nil {
t.Errorf("failed to decode rune 0x%02X: %v", b, err)
continue
}
if b == 0x00 {
if len(decoded) != 0 {
t.Errorf("Decode(Encode(0x00)) should be empty, got %v", ids)
}
continue
}
if decoded != input {
t.Errorf("rune 0x%02X failed roundtrip: got %q, want %q", b, decoded, input)
}
}
})
}
func BenchmarkBytePairEncoding(b *testing.B) {

View File

@@ -557,12 +557,10 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
var think *api.ThinkValue
if r.Reasoning != nil {
options["reasoning"] = *r.Reasoning.Effort
think = &api.ThinkValue{
Value: *r.Reasoning.Effort,
}
} else if r.ReasoningEffort != nil {
options["reasoning"] = *r.ReasoningEffort
think = &api.ThinkValue{
Value: *r.ReasoningEffort,
}

View File

@@ -46,7 +46,7 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
}
// Locking: Operations on InputCacheSlot (including finding one
// through LoadCacheSlot) require a lock to be be held that serializes
// through LoadCacheSlot) require a lock to be held that serializes
// these operations with each other and llama.Decode
type InputCacheSlot struct {

View File

@@ -78,7 +78,7 @@ func (c *InputCache) Close() {
}
// Locking: Operations on InputCacheSlot (including finding one
// through LoadCacheSlot) require a lock to be be held that serializes
// through LoadCacheSlot) require a lock to be held that serializes
// these operations with each other and processBatch
type InputCacheSlot struct {

View File

@@ -32,6 +32,7 @@ import (
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/harmony"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/openai"
@@ -45,6 +46,18 @@ import (
"github.com/ollama/ollama/version"
)
func shouldUseHarmony(model *Model) bool {
if slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
// heuristic to check whether the template expects to be parsed via harmony:
// search for harmony tags that are nearly always used
if model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>") {
return true
}
}
return false
}
func experimentEnabled(name string) bool {
return slices.Contains(strings.Split(os.Getenv("OLLAMA_EXPERIMENT"), ","), name)
}
@@ -194,12 +207,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}
useHarmony := shouldUseHarmony(*m) && !req.Raw
var harmonyMessageHandler *HarmonyMessageHandler
var harmonyToolParser *HarmonyToolCallAccumulator
useHarmony := shouldUseHarmony(m) && !req.Raw
var harmonyMessageHandler *harmony.HarmonyMessageHandler
var harmonyToolParser *harmony.HarmonyToolCallAccumulator
if useHarmony {
harmonyMessageHandler = NewHarmonyMessageHandler()
harmonyMessageHandler.harmonyParser.AddImplicitStart()
harmonyMessageHandler = harmony.NewHarmonyMessageHandler()
harmonyMessageHandler.HarmonyParser.AddImplicitStart()
harmonyToolParser = harmonyMessageHandler.CreateToolParser()
}
@@ -1603,19 +1616,19 @@ func (s *Server) ChatHandler(c *gin.Context) {
}
msgs = filterThinkTags(msgs, m)
var harmonyMessageHandler *HarmonyMessageHandler
var harmonyToolParser *HarmonyToolCallAccumulator
var harmonyMessageHandler *harmony.HarmonyMessageHandler
var harmonyToolParser *harmony.HarmonyToolCallAccumulator
useHarmony := shouldUseHarmony(*m)
useHarmony := shouldUseHarmony(m)
processedTools := req.Tools
if useHarmony {
harmonyMessageHandler = NewHarmonyMessageHandler()
harmonyMessageHandler = harmony.NewHarmonyMessageHandler()
var lastMessage *api.Message
if len(msgs) > 0 {
lastMessage = &msgs[len(msgs)-1]
}
harmonyMessageHandler.harmonyParser.AddImplicitStartOrPrefill(lastMessage)
harmonyMessageHandler.HarmonyParser.AddImplicitStartOrPrefill(lastMessage)
harmonyToolParser = harmonyMessageHandler.CreateToolParser()
// make a copy of tools to pass to the chat prompt. Function names may be
@@ -1623,7 +1636,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
processedTools = make([]api.Tool, len(req.Tools))
copy(processedTools, req.Tools)
for i, tool := range processedTools {
processedTools[i].Function.Name = harmonyMessageHandler.functionNameMap.ConvertAndAdd(tool.Function.Name)
processedTools[i].Function.Name = harmonyMessageHandler.FunctionNameMap.ConvertAndAdd(tool.Function.Name)
}
}
@@ -1660,6 +1673,10 @@ func (s *Server) ChatHandler(c *gin.Context) {
OpeningTag: openingTag,
ClosingTag: closingTag,
}
if strings.HasSuffix(strings.TrimSpace(prompt), openingTag) {
thinkingState.AddContent(openingTag)
}
}
var toolParser *tools.Parser
@@ -1705,7 +1722,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
toolName, toolContent := harmonyToolParser.Drain()
if toolName != nil {
*toolName = strings.TrimPrefix(*toolName, "functions.")
*toolName = harmonyMessageHandler.functionNameMap.OriginalFromConverted(*toolName)
*toolName = harmonyMessageHandler.FunctionNameMap.OriginalFromConverted(*toolName)
var args api.ToolCallFunctionArguments
if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())

View File

@@ -969,3 +969,233 @@ func TestGenerate(t *testing.T) {
}
})
}
func TestChatWithPromptEndingInThinkTag(t *testing.T) {
gin.SetMode(gin.TestMode)
// Helper to create a standard thinking test setup
setupThinkingTest := func(t *testing.T) (*mockRunner, *Server) {
mock := &mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
},
}
s := &Server{
sched: &Scheduler{
pendingReqCh: make(chan *LlmRequest, 1),
finishedReqCh: make(chan *LlmRequest, 1),
expiredCh: make(chan *runnerRef, 1),
unloadedCh: make(chan any, 1),
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(mock),
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{llama: mock}
return false
},
},
}
go s.sched.Run(t.Context())
// Create a model with thinking support
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
"llama.embedding_length": uint32(4096),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
})
// Create model with thinking template that adds <think> at the end
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test-thinking",
Files: map[string]string{"file.gguf": digest},
Template: `{{- range .Messages }}
{{- if eq .Role "user" }}user: {{ .Content }}
{{ else if eq .Role "assistant" }}assistant: {{ if .Thinking }}<think>{{ .Thinking }}</think>{{ end }}{{ .Content }}
{{ end }}{{ end }}<think>`,
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
return mock, s
}
mock, s := setupThinkingTest(t)
// Helper to test chat responses
testChatRequest := func(t *testing.T, name string, userContent string, modelResponse string, expectedThinking string, expectedContent string, think bool) {
t.Run(name, func(t *testing.T) {
mock.CompletionResponse = llm.CompletionResponse{
Content: modelResponse,
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
}
mock.CompletionFn = nil
streamRequest := false
req := api.ChatRequest{
Model: "test-thinking",
Messages: []api.Message{
{Role: "user", Content: userContent},
},
Stream: &streamRequest,
}
if think {
req.Think = &api.ThinkValue{Value: think}
}
w := createRequest(t, s.ChatHandler, req)
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
var resp api.ChatResponse
if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
t.Fatal(err)
}
if resp.Message.Thinking != expectedThinking {
t.Errorf("expected thinking %q, got %q", expectedThinking, resp.Message.Thinking)
}
if resp.Message.Content != expectedContent {
t.Errorf("expected content %q, got %q", expectedContent, resp.Message.Content)
}
})
}
// Test cases - Note: Template adds <think> at the end, and leading whitespace after <think> is eaten by the parser
testChatRequest(t, "basic thinking response",
"Help me solve this problem",
" Let me think about this step by step... </think> The answer is 42.",
"Let me think about this step by step... ",
"The answer is 42.",
true)
testChatRequest(t, "thinking with multiple sentences",
"Explain quantum computing",
" First, I need to understand the basics. Quantum bits can be in superposition. </think> Quantum computing uses quantum mechanics principles.",
"First, I need to understand the basics. Quantum bits can be in superposition. ",
"Quantum computing uses quantum mechanics principles.",
true)
testChatRequest(t, "no thinking content",
"What is 2+2?",
"</think> The answer is 4.",
"",
"The answer is 4.",
true)
testChatRequest(t, "thinking disabled but template still adds think tag",
"Simple question",
" My thoughts </think> The answer.",
"",
" My thoughts </think> The answer.",
false)
// Test streaming response with template-added <think>
t.Run("streaming with thinking", func(t *testing.T) {
var wg sync.WaitGroup
wg.Add(1)
mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
defer wg.Done()
// Verify the prompt ends with <think> due to template
if !strings.HasSuffix(r.Prompt, "<think>") {
t.Errorf("expected prompt to end with <think>, got: %q", r.Prompt)
}
// Simulate streaming chunks
responses := []llm.CompletionResponse{
{Content: " I need to consider", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
{Content: " multiple factors here...", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
{Content: " </think> Based on my analysis,", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
{Content: " the solution is straightforward.", Done: true, DoneReason: llm.DoneReasonStop, PromptEvalCount: 1, PromptEvalDuration: 1, EvalCount: 1, EvalDuration: 1},
}
for _, resp := range responses {
select {
case <-ctx.Done():
return ctx.Err()
default:
fn(resp)
time.Sleep(10 * time.Millisecond)
}
}
return nil
}
think := true
w := createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "test-thinking",
Messages: []api.Message{{Role: "user", Content: "Analyze this complex problem"}},
Think: &api.ThinkValue{Value: think},
Stream: &stream,
})
wg.Wait()
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
// Parse streaming responses
decoder := json.NewDecoder(w.Body)
var allThinking, allContent strings.Builder
for {
var resp api.ChatResponse
if err := decoder.Decode(&resp); err == io.EOF {
break
} else if err != nil {
t.Fatal(err)
}
allThinking.WriteString(resp.Message.Thinking)
allContent.WriteString(resp.Message.Content)
}
// Note: Leading whitespace after <think> is eaten by the parser
if got := allThinking.String(); got != "I need to consider multiple factors here... " {
t.Errorf("expected thinking %q, got %q", "I need to consider multiple factors here... ", got)
}
if got := allContent.String(); got != "Based on my analysis, the solution is straightforward." {
t.Errorf("expected content %q, got %q", "Based on my analysis, the solution is straightforward.", got)
}
})
}

View File

@@ -103,7 +103,9 @@ func eat(s *Parser) (string, string, bool) {
// note that we use the original content, not the trimmed one because we
// don't want to eat any whitespace in the real content if there were no
// thinking tags
return "", s.acc.String(), false
untrimmed := s.acc.String()
s.acc.Reset()
return "", untrimmed, false
}
case thinkingState_ThinkingStartedEatingWhitespace:
trimmed := strings.TrimLeftFunc(s.acc.String(), unicode.IsSpace)

View File

@@ -58,6 +58,15 @@ func TestThinkingStreaming(t *testing.T) {
wantContent: " abc",
wantStateAfter: thinkingState_ThinkingDone,
},
// regression test for a bug where we were transitioning directly to
// ThinkingDone without clearing the buffer. This would cause the first
// token to be output twice
{
input: "def",
wantThinking: "",
wantContent: "def",
wantStateAfter: thinkingState_ThinkingDone,
},
},
},
{

View File

@@ -224,22 +224,45 @@ func findArguments(buffer []byte) (map[string]any, int) {
return nil, 0
}
start := -1
var braces int
var start int = -1
var inString, escaped bool
for i := range buffer {
c := buffer[i]
if escaped {
escaped = false
continue
}
if c == '\\' {
escaped = true
continue
}
if c == '"' {
inString = !inString
continue
}
if inString {
continue
}
for i, c := range buffer {
if c == '{' {
if braces == 0 {
start = i
}
braces++
} else if c == '}' && braces > 0 {
} else if c == '}' {
braces--
if braces == 0 && start != -1 {
object := buffer[start : i+1]
var data map[string]any
if err := json.Unmarshal(object, &data); err != nil {
// not a valid object, keep looking
start = -1
continue
}
@@ -282,6 +305,10 @@ func findArguments(buffer []byte) (map[string]any, int) {
return data, i
}
if braces < 0 {
braces = 0
}
}
}

View File

@@ -1,6 +1,7 @@
package tools
import (
"strings"
"testing"
"text/template"
@@ -40,13 +41,7 @@ func TestParser(t *testing.T) {
Function: api.ToolFunction{
Name: "get_temperature",
Description: "Retrieve the temperature for a given location",
Parameters: struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required"`
Properties map[string]api.ToolProperty `json:"properties"`
}{
Parameters: api.ToolFunctionParameters{
Type: "object",
Required: []string{"city"},
Properties: map[string]api.ToolProperty{
@@ -68,13 +63,7 @@ func TestParser(t *testing.T) {
Function: api.ToolFunction{
Name: "get_conditions",
Description: "Retrieve the current weather conditions for a given location",
Parameters: struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required"`
Properties map[string]api.ToolProperty `json:"properties"`
}{
Parameters: api.ToolFunctionParameters{
Type: "object",
Properties: map[string]api.ToolProperty{
"location": {
@@ -104,13 +93,7 @@ func TestParser(t *testing.T) {
Function: api.ToolFunction{
Name: "get_address",
Description: "Get the address of a given location",
Parameters: struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required"`
Properties map[string]api.ToolProperty `json:"properties"`
}{
Parameters: api.ToolFunctionParameters{
Type: "object",
Properties: map[string]api.ToolProperty{
"location": {
@@ -126,13 +109,7 @@ func TestParser(t *testing.T) {
Function: api.ToolFunction{
Name: "add",
Description: "Add two numbers",
Parameters: struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required"`
Properties map[string]api.ToolProperty `json:"properties"`
}{
Parameters: api.ToolFunctionParameters{
Type: "object",
Properties: map[string]api.ToolProperty{
"a": {
@@ -1140,11 +1117,163 @@ func TestFindArguments(t *testing.T) {
},
{
name: "deepseek",
buffer: []byte(`", "arguments": {"location": "Tokyo"}}</tool_call>`),
buffer: []byte(`"arguments": {"location": "Tokyo"}}</tool_call>`),
want: map[string]any{
"location": "Tokyo",
},
},
{
name: "string with braces",
buffer: []byte(`{"name": "process_code", "arguments": {"code": "if (x > 0) { return true; }"}}`),
want: map[string]any{
"code": "if (x > 0) { return true; }",
},
},
{
name: "string with nested json",
buffer: []byte(`{"name": "send_data", "arguments": {"payload": "{\"nested\": {\"key\": \"value\"}}"}}`),
want: map[string]any{
"payload": `{"nested": {"key": "value"}}`,
},
},
{
name: "string with escaped quotes and braces",
buffer: []byte(`{"name": "analyze", "arguments": {"text": "The JSON is: {\"key\": \"val{ue}\"}"}}`),
want: map[string]any{
"text": `The JSON is: {"key": "val{ue}"}`,
},
},
{
name: "multiple objects with string containing braces",
buffer: []byte(`{"name": "test", "arguments": {"query": "find } in text"}} {"name": "other"}`),
want: map[string]any{
"query": "find } in text",
},
},
{
name: "unmatched closing brace in string",
buffer: []byte(`{"name": "search", "arguments": {"pattern": "regex: }"}}`),
want: map[string]any{
"pattern": "regex: }",
},
},
{
name: "complex nested with mixed braces",
buffer: []byte(`{"name": "analyze", "arguments": {"data": "{\"items\": [{\"value\": \"}\"}, {\"code\": \"if (x) { return y; }\"}]}"}}`),
want: map[string]any{
"data": `{"items": [{"value": "}"}, {"code": "if (x) { return y; }"}]}`,
},
},
{
name: "string with newline and braces",
buffer: []byte(`{"name": "format", "arguments": {"template": "{\n \"key\": \"value\"\n}"}}`),
want: map[string]any{
"template": "{\n \"key\": \"value\"\n}",
},
},
{
name: "string with unicode escape",
buffer: []byte(`{"name": "test", "arguments": {"text": "Unicode: \u007B and \u007D"}}`),
want: map[string]any{
"text": "Unicode: { and }",
},
},
{
name: "array arguments",
buffer: []byte(`{"name": "batch", "arguments": ["item1", "item2", "{\"nested\": true}"]}`),
want: nil, // This should return nil because arguments is not a map
},
{
name: "escaped backslash before quote",
buffer: []byte(`{"name": "path", "arguments": {"dir": "C:\\Program Files\\{App}\\"}}`),
want: map[string]any{
"dir": `C:\Program Files\{App}\`,
},
},
{
name: "single quotes not treated as string delimiters",
buffer: []byte(`{"name": "query", "arguments": {"sql": "SELECT * FROM users WHERE name = '{admin}'"}}`),
want: map[string]any{
"sql": "SELECT * FROM users WHERE name = '{admin}'",
},
},
{
name: "incomplete json at buffer end",
buffer: []byte(`{"name": "test", "arguments": {"data": "some {"`),
want: nil,
},
{
name: "multiple escaped quotes",
buffer: []byte(`{"name": "echo", "arguments": {"msg": "He said \"Hello {World}\" loudly"}}`),
want: map[string]any{
"msg": `He said "Hello {World}" loudly`,
},
},
{
name: "json with comments style string",
buffer: []byte(`{"name": "code", "arguments": {"snippet": "// This is a comment with { and }"}}`),
want: map[string]any{
"snippet": "// This is a comment with { and }",
},
},
{
name: "consecutive escaped backslashes",
buffer: []byte(`{"name": "test", "arguments": {"path": "C:\\\\{folder}\\\\"}}`),
want: map[string]any{
"path": `C:\\{folder}\\`,
},
},
{
name: "empty string with braces after",
buffer: []byte(`{"name": "test", "arguments": {"a": "", "b": "{value}"}}`),
want: map[string]any{
"a": "",
"b": "{value}",
},
},
{
name: "unicode in key names",
buffer: []byte(`{"name": "test", "arguments": {"key{": "value", "key}": "value2"}}`),
want: map[string]any{
"key{": "value",
"key}": "value2",
},
},
{
name: "very long string with braces",
buffer: []byte(`{"name": "test", "arguments": {"data": "` + strings.Repeat("a{b}c", 100) + `"}}`),
want: map[string]any{
"data": strings.Repeat("a{b}c", 100),
},
},
{
name: "tab characters and braces",
buffer: []byte(`{"name": "test", "arguments": {"code": "\tif (true) {\n\t\treturn;\n\t}"}}`),
want: map[string]any{
"code": "\tif (true) {\n\t\treturn;\n\t}",
},
},
{
name: "null byte in string",
buffer: []byte(`{"name": "test", "arguments": {"data": "before\u0000{after}"}}`),
want: map[string]any{
"data": "before\x00{after}",
},
},
{
name: "escaped quote at end of string",
buffer: []byte(`{"name": "test", "arguments": {"data": "text with quote at end\\\""}}`),
want: map[string]any{
"data": `text with quote at end\"`,
},
},
{
name: "mixed array and object in arguments",
buffer: []byte(`{"name": "test", "arguments": {"items": ["{", "}", {"key": "value"}]}}`),
want: map[string]any{
"items": []any{"{", "}", map[string]any{"key": "value"}},
},
},
}
for _, tt := range tests {