Compare commits

..

1 Commits

Author SHA1 Message Date
jmorganca
19638cec55 add docs.json 2025-08-17 13:12:39 -07:00
27 changed files with 227 additions and 1340 deletions

View File

@@ -411,8 +411,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
- [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
- [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
### Cloud
@@ -539,8 +537,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
- [Ollama for D](https://github.com/kassane/ollama-d)
- [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)
- [any-llm](https://github.com/mozilla-ai/any-llm) (A single interface to use different llm providers by [mozilla.ai](https://www.mozilla.ai/))
- [any-agent](https://github.com/mozilla-ai/any-agent) (A single interface to use and evaluate different agent frameworks by [mozilla.ai](https://www.mozilla.ai/))
### Mobile

View File

@@ -90,10 +90,6 @@ type GenerateRequest struct {
// (request that thinking _not_ be used) and unset (use the old behavior
// before this option was introduced)
Think *ThinkValue `json:"think,omitempty"`
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
// template instead of calling the model.
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
}
// ChatRequest describes a request sent by [Client.Chat].
@@ -124,10 +120,6 @@ type ChatRequest struct {
// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
// for supported models.
Think *ThinkValue `json:"think,omitempty"`
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
// template instead of calling the model.
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
}
type Tools []Tool
@@ -286,23 +278,16 @@ func mapToTypeScriptType(jsonType string) string {
}
}
type ToolFunctionParameters struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required"`
Properties map[string]ToolProperty `json:"properties"`
}
func (t *ToolFunctionParameters) String() string {
bts, _ := json.Marshal(t)
return string(bts)
}
type ToolFunction struct {
Name string `json:"name"`
Description string `json:"description"`
Parameters ToolFunctionParameters `json:"parameters"`
Name string `json:"name"`
Description string `json:"description"`
Parameters struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required"`
Properties map[string]ToolProperty `json:"properties"`
} `json:"parameters"`
}
func (t *ToolFunction) String() string {
@@ -323,19 +308,6 @@ type ChatResponse struct {
Metrics
}
// DebugInfo contains debug information for template rendering
type DebugInfo struct {
RenderedTemplate string `json:"rendered_template"`
ImageCount int `json:"image_count,omitempty"`
}
// DebugTemplateResponse is returned when _debug_render_only is set to true
type DebugTemplateResponse struct {
Model string `json:"model"`
CreatedAt time.Time `json:"created_at"`
DebugInfo DebugInfo `json:"_debug_info"`
}
type Metrics struct {
TotalDuration time.Duration `json:"total_duration,omitempty"`
LoadDuration time.Duration `json:"load_duration,omitempty"`

View File

@@ -436,50 +436,3 @@ func TestThinking_UnmarshalJSON(t *testing.T) {
})
}
}
func TestToolFunctionParameters_String(t *testing.T) {
tests := []struct {
name string
params ToolFunctionParameters
expected string
}{
{
name: "simple object with string property",
params: ToolFunctionParameters{
Type: "object",
Required: []string{"name"},
Properties: map[string]ToolProperty{
"name": {
Type: PropertyType{"string"},
Description: "The name of the person",
},
},
},
expected: `{"type":"object","required":["name"],"properties":{"name":{"type":"string","description":"The name of the person"}}}`,
},
{
name: "marshal failure returns empty string",
params: ToolFunctionParameters{
Type: "object",
Defs: func() any {
// Create a cycle that will cause json.Marshal to fail
type selfRef struct {
Self *selfRef
}
s := &selfRef{}
s.Self = s
return s
}(),
Properties: map[string]ToolProperty{},
},
expected: "",
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
result := test.params.String()
assert.Equal(t, test.expected, result)
})
}
}

View File

@@ -15,24 +15,19 @@ import (
type gptossModel struct {
ModelParameters
HiddenLayers uint32 `json:"num_hidden_layers"`
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
AttentionHeads uint32 `json:"num_attention_heads"`
KeyValueHeads uint32 `json:"num_key_value_heads"`
HeadDim uint32 `json:"head_dim"`
Experts uint32 `json:"num_experts"`
LocalExperts uint32 `json:"num_local_experts"`
ExpertsPerToken uint32 `json:"experts_per_token"`
RMSNormEpsilon float32 `json:"rms_norm_eps"`
InitialContextLength uint32 `json:"initial_context_length"`
RopeTheta float32 `json:"rope_theta"`
RopeScalingFactor float32 `json:"rope_scaling_factor"`
RopeScaling struct {
Factor float32 `json:"factor"`
} `json:"rope_scaling"`
SlidingWindow uint32 `json:"sliding_window"`
HiddenLayers uint32 `json:"num_hidden_layers"`
HiddenSize uint32 `json:"hidden_size"`
IntermediateSize uint32 `json:"intermediate_size"`
AttentionHeads uint32 `json:"num_attention_heads"`
KeyValueHeads uint32 `json:"num_key_value_heads"`
HeadDim uint32 `json:"head_dim"`
Experts uint32 `json:"num_experts"`
ExpertsPerToken uint32 `json:"experts_per_token"`
RMSNormEpsilon float32 `json:"rms_norm_eps"`
InitialContextLength uint32 `json:"initial_context_length"`
RopeTheta float32 `json:"rope_theta"`
RopeScalingFactor float32 `json:"rope_scaling_factor"`
SlidingWindow uint32 `json:"sliding_window"`
}
var _ ModelConverter = (*gptossModel)(nil)
@@ -41,11 +36,11 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
kv := m.ModelParameters.KV(t)
kv["general.architecture"] = "gptoss"
kv["general.file_type"] = uint32(4)
kv["gptoss.context_length"] = cmp.Or(m.MaxPositionEmbeddings, uint32(m.RopeScalingFactor*float32(m.InitialContextLength)))
kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
kv["gptoss.block_count"] = m.HiddenLayers
kv["gptoss.embedding_length"] = m.HiddenSize
kv["gptoss.feed_forward_length"] = m.IntermediateSize
kv["gptoss.expert_count"] = cmp.Or(m.Experts, m.LocalExperts)
kv["gptoss.expert_count"] = m.Experts
kv["gptoss.expert_used_count"] = m.ExpertsPerToken
kv["gptoss.attention.head_count"] = m.AttentionHeads
kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
@@ -54,7 +49,7 @@ func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5)
kv["gptoss.attention.sliding_window"] = m.SlidingWindow
kv["gptoss.rope.freq_base"] = m.RopeTheta
kv["gptoss.rope.scaling.factor"] = cmp.Or(m.RopeScalingFactor, m.RopeScaling.Factor)
kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor
kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
kv["tokenizer.ggml.add_bos_token"] = false
@@ -97,11 +92,6 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
for name, mxfp4 := range mxfp4s {
dims := mxfp4.blocks.Shape()
if !strings.HasSuffix(name, ".weight") {
name += ".weight"
}
out = append(out, &ggml.Tensor{
Name: name,
Kind: uint32(ggml.TensorTypeMXFP4),
@@ -114,47 +104,25 @@ func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
}
func (m *gptossModel) Replacements() []string {
var replacements []string
if m.MaxPositionEmbeddings > 0 {
// hf flavored model
replacements = []string{
"lm_head", "output",
"model.embed_tokens", "token_embd",
"model.layers", "blk",
"input_layernorm", "attn_norm",
"self_attn.q_proj", "attn_q",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.o_proj", "attn_out",
"self_attn.sinks", "attn_sinks",
"post_attention_layernorm", "ffn_norm",
"mlp.router", "ffn_gate_inp",
"mlp.experts.gate_up_proj_", "ffn_gate_up_exps.",
"mlp.experts.down_proj_", "ffn_down_exps.",
"model.norm", "output_norm",
}
} else {
replacements = []string{
// noop replacements so other replacements will not be applied
".blocks", ".blocks",
".scales", ".scales",
// real replacements
"block", "blk",
"attn.norm", "attn_norm",
"attn.qkv", "attn_qkv",
"attn.sinks", "attn_sinks",
"attn.out", "attn_out",
"mlp.norm", "ffn_norm",
"mlp.gate", "ffn_gate_inp",
"mlp.mlp1_", "ffn_gate_up_exps.",
"mlp.mlp2_", "ffn_down_exps.",
"embedding", "token_embd",
"norm", "output_norm",
"unembedding", "output",
"scale", "weight",
}
return []string{
// noop replacements so other replacements will not be applied
".blocks", ".blocks",
".scales", ".scales",
// real replacements
"block", "blk",
"attn.norm", "attn_norm",
"attn.qkv", "attn_qkv",
"attn.sinks", "attn_sinks",
"attn.out", "attn_out",
"mlp.norm", "ffn_norm",
"mlp.gate", "ffn_gate_inp",
"mlp.mlp1_", "ffn_gate_up_exps.",
"mlp.mlp2_", "ffn_down_exps.",
"embedding", "token_embd",
"norm", "output_norm",
"unembedding", "output",
"scale", "weight",
}
return replacements
}
type mxfp4 struct {

75
docs/docs.json Normal file
View File

@@ -0,0 +1,75 @@
{
"$schema": "https://mintlify.com/docs.json",
"theme": "mint",
"background": {
"color": {
"light": "#ffffff",
"dark": "#000000"
}
},
"appearance": {
"default": "light"
},
"styling": {
"codeblocks": "system"
},
"contextual": {
"options": ["copy", "chatgpt", "claude", "view"]
},
"fonts": {
"heading": {
"family": "Inter"
},
"body": {
"family": "Inter"
}
},
"name": "Ollama",
"colors": {
"primary": "#000",
"light": "#b5b5b5",
"dark": "#fff"
},
"favicon": "/ollama.png",
"logo": {
"light": "/ollama.png",
"dark": "/favicon.svg"
},
"navigation": {
"tabs": [
{
"tab": "Documentation",
"groups": [
{
"group": "Home",
"pages": ["index", "quickstart", "faq", "troubleshooting"]
},
{
"group": "Platforms",
"pages": ["linux", "windows", "docker"]
},
{
"group": "Features",
"pages": [
"modelfile",
"apis",
"openai",
"import",
"gpu",
"benchmark"
]
}
]
},
{
"tab": "Development",
"groups": [
{
"group": " ",
"pages": ["development", "examples", "template"]
}
]
}
]
}
}

View File

@@ -378,7 +378,9 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)
if c.config.MaskDType != ml.DTypeF32 {
maskTensor = maskTensor.Cast(ctx, c.config.MaskDType)
out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
ctx.Forward(maskTensor.Copy(ctx, out))
maskTensor = out
}
return maskTensor

View File

@@ -962,7 +962,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;
const bool output_all = false;
// when computing embeddings, all tokens are output
const bool output_all = cparams.embeddings;
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);

View File

@@ -13,7 +13,7 @@ checks.
1 file changed, 18 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 57eae461..c7f9dc3a 100644
index 57eae461..9db0c8b5 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2671,12 +2671,24 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud

View File

@@ -1,23 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <git@mxy.ng>
Date: Mon, 18 Aug 2025 16:58:39 -0700
Subject: [PATCH] decode: disable output_all
---
src/llama-context.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 26a5cf9c..6ece5263 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;
- // when computing embeddings, all tokens are output
- const bool output_all = cparams.embeddings;
+ const bool output_all = false;
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);

View File

@@ -30,7 +30,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
// Try to pack into as few GPUs as possible, starting from 1 GPU
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
gpuSubset := sgl[:numGPUs]
ok, estimatedVRAM := predictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
ok, estimatedVRAM := PredictServerFit(gpuSubset, f, adapters, projectors, opts, numParallel)
if ok {
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
@@ -48,7 +48,7 @@ func pickBestFullFitByLibrary(f *ggml.GGML, modelPath string, projectors []strin
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
if ok, estimatedVRAM := predictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
if ok, estimatedVRAM := PredictServerFit(sgl, f, adapters, projectors, opts, numParallel); ok {
slog.Info("new model will fit in available VRAM, loading",
"model", modelPath,
"library", sgl[0].Library,
@@ -71,7 +71,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := predictServerFit(gl, f, adapters, projectors, opts, numParallel)
_, estimatedVRAM := PredictServerFit(gl, f, adapters, projectors, opts, numParallel)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
@@ -81,7 +81,7 @@ func pickBestPartialFitByLibrary(f *ggml.GGML, projectors []string, adapters []s
}
// This algorithm looks for a complete fit to determine if we need to unload other models
func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
// Split up the GPUs by type and try them
var estimatedVRAM uint64
for _, gpus := range allGpus.ByLibrary() {
@@ -97,10 +97,6 @@ func predictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, proj
return true, estimatedVRAM
}
}
if len(gpus) == 1 && gpus[0].Library == "cpu" && estimate.TotalSize <= gpus[0].FreeMemory {
return true, estimatedVRAM
}
}
return false, estimatedVRAM
}

View File

@@ -492,7 +492,6 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
if !requireFull {
g = pickBestPartialFitByLibrary(s.ggml, []string{s.loadRequest.ProjectorPath}, s.loadRequest.LoraPath, s.options, gpus, s.numParallel)
} else {
slog.Info("model requires more memory than is currently available, evicting a model to make space", "estimate", s.estimate)
return ErrLoadRequiredFull
}
}
@@ -525,6 +524,10 @@ func (s *llamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requi
}
}
if requireFull && len(gpus) == 1 && gpus[0].Library == "cpu" && s.estimate.TotalSize > gpus[0].FreeMemory {
return ErrLoadRequiredFull
}
slog.Info("offload", "", s.estimate)
s.gpus = gpus
@@ -648,9 +651,7 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
if !success {
s.initModel(ctx, LoadRequest{}, LoadOperationClose)
}
if s.mem != nil {
s.mem.Log(slog.LevelInfo)
}
s.mem.Log(slog.LevelInfo)
}()
slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU)

View File

@@ -396,7 +396,6 @@ type Tensor interface {
Shape() []int
DType() DType
Cast(ctx Context, dtype DType) Tensor
Bytes() []byte
Floats() []float32

View File

@@ -843,7 +843,23 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
panic("set Input or Layer before creating tensors")
}
cdtype := ggmlDType(dtype)
var cdtype uint32
switch dtype {
case ml.DTypeF32:
cdtype = C.GGML_TYPE_F32
case ml.DTypeF16:
cdtype = C.GGML_TYPE_F16
case ml.DTypeQ80:
cdtype = C.GGML_TYPE_Q8_0
case ml.DTypeQ40:
cdtype = C.GGML_TYPE_Q4_0
case ml.DTypeI32:
cdtype = C.GGML_TYPE_I32
case ml.DTypeMXFP4:
cdtype = C.GGML_TYPE_MXFP4
default:
panic("unsupported dtype")
}
if len(shape) < 1 || shape[0] == 0 {
var shape C.int64_t = 0
@@ -1040,32 +1056,6 @@ func (t *Tensor) DType() ml.DType {
}
}
func ggmlDType(dtype ml.DType) uint32 {
switch dtype {
case ml.DTypeF32:
return C.GGML_TYPE_F32
case ml.DTypeF16:
return C.GGML_TYPE_F16
case ml.DTypeQ80:
return C.GGML_TYPE_Q8_0
case ml.DTypeQ40:
return C.GGML_TYPE_Q4_0
case ml.DTypeI32:
return C.GGML_TYPE_I32
case ml.DTypeMXFP4:
return C.GGML_TYPE_MXFP4
default:
panic("unsupported dtype")
}
}
func (t *Tensor) Cast(ctx ml.Context, dtype ml.DType) ml.Tensor {
return &Tensor{
b: t.b,
t: C.ggml_cast(ctx.(*Context).ctx, t.t, ggmlDType(dtype)),
}
}
func (t *Tensor) Neg(ctx ml.Context) ml.Tensor {
return &Tensor{
b: t.b,

View File

@@ -109,7 +109,7 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
r = 0x0143
case r <= 0x0020:
r = r + 0x0100
case r >= 0x007f && r <= 0x00a0:
case r >= 0x007e && r <= 0x00a0:
r = r + 0x00a2
}

View File

@@ -207,36 +207,6 @@ func TestLlama(t *testing.T) {
}
}
})
t.Run("roundtriping 0x00-0xFF", func(t *testing.T) {
t.Parallel()
for b := 0x00; b <= 0xFF; b++ {
input := string(rune(b))
ids, err := tokenizer.Encode(input, false)
if err != nil {
t.Errorf("failed to encode rune 0x%02X: %v", b, err)
continue
}
decoded, err := tokenizer.Decode(ids)
if err != nil {
t.Errorf("failed to decode rune 0x%02X: %v", b, err)
continue
}
if b == 0x00 {
if len(decoded) != 0 {
t.Errorf("Decode(Encode(0x00)) should be empty, got %v", ids)
}
continue
}
if decoded != input {
t.Errorf("rune 0x%02X failed roundtrip: got %q, want %q", b, decoded, input)
}
}
})
}
func BenchmarkBytePairEncoding(b *testing.B) {

View File

@@ -557,10 +557,12 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
var think *api.ThinkValue
if r.Reasoning != nil {
options["reasoning"] = *r.Reasoning.Effort
think = &api.ThinkValue{
Value: *r.Reasoning.Effort,
}
} else if r.ReasoningEffort != nil {
options["reasoning"] = *r.ReasoningEffort
think = &api.ThinkValue{
Value: *r.ReasoningEffort,
}

View File

@@ -46,7 +46,7 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
}
// Locking: Operations on InputCacheSlot (including finding one
// through LoadCacheSlot) require a lock to be held that serializes
// through LoadCacheSlot) require a lock to be be held that serializes
// these operations with each other and llama.Decode
type InputCacheSlot struct {

View File

@@ -78,7 +78,7 @@ func (c *InputCache) Close() {
}
// Locking: Operations on InputCacheSlot (including finding one
// through LoadCacheSlot) require a lock to be held that serializes
// through LoadCacheSlot) require a lock to be be held that serializes
// these operations with each other and processBatch
type InputCacheSlot struct {

View File

@@ -1,9 +1,9 @@
package harmony
package server
import (
"context"
"fmt"
"log/slog"
"slices"
"strings"
"unicode"
@@ -19,6 +19,18 @@ const (
harmonyParserState_ParsingContent
)
func shouldUseHarmony(model Model) bool {
if slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
// heuristic to check whether the template expects to be parsed via harmony:
// search for harmony tags that are nearly always used
if model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>") {
return true
}
}
return false
}
func (s harmonyParserState) String() string {
switch s {
// we're looking for the message start tag
@@ -263,21 +275,19 @@ const (
// HarmonyMessageHandler processes harmony events and accumulates content appropriately.
// This is a higher level interface that maps harmony concepts into ollama concepts
type HarmonyMessageHandler struct {
state harmonyMessageState
HarmonyParser *HarmonyParser
FunctionNameMap *FunctionNameMap
state harmonyMessageState
harmonyParser *HarmonyParser
}
// NewHarmonyMessageHandler creates a new message handler
func NewHarmonyMessageHandler() *HarmonyMessageHandler {
return &HarmonyMessageHandler{
state: harmonyMessageState_Normal,
HarmonyParser: &HarmonyParser{
harmonyParser: &HarmonyParser{
MessageStartTag: "<|start|>",
MessageEndTag: "<|end|>",
HeaderEndTag: "<|message|>",
},
FunctionNameMap: NewFunctionNameMap(),
}
}
@@ -288,7 +298,7 @@ func (h *HarmonyMessageHandler) AddContent(content string, toolParser *HarmonyTo
thinkingSb := strings.Builder{}
toolContentSb := strings.Builder{}
events := h.HarmonyParser.AddContent(content)
events := h.harmonyParser.AddContent(content)
for _, event := range events {
switch event := event.(type) {
case HarmonyEventHeaderComplete:
@@ -368,97 +378,3 @@ func (a *HarmonyToolCallAccumulator) Drain() (*string, string) {
func (a *HarmonyToolCallAccumulator) Content() string {
return a.acc.String()
}
// FunctionNameMap maps a user-specified function name to a valid function
// name for harmony (which look like TypeScript identifiers). This is needed to
// transform user-specified function names, which might contain characters that
// are not allowed in TypeScript identifiers
type FunctionNameMap struct {
userToHarmony map[string]string
harmonyToUser map[string]string
}
func NewFunctionNameMap() *FunctionNameMap {
return &FunctionNameMap{
userToHarmony: make(map[string]string),
harmonyToUser: make(map[string]string),
}
}
func (m *FunctionNameMap) ConvertAndAdd(userFunctionName string) string {
harmonyFunctionName := m.deriveName(userFunctionName)
m.userToHarmony[userFunctionName] = harmonyFunctionName
m.harmonyToUser[harmonyFunctionName] = userFunctionName
return harmonyFunctionName
}
// OriginalFromConverted looks up the reverse-mapping of a previously-converted
// user->harmony function name. To unmap reliably, the mapping must exist, as
// the conversion process is not reversible without the appropriate state
func (m *FunctionNameMap) OriginalFromConverted(harmonyFunctionName string) string {
if userFunctionName, ok := m.harmonyToUser[harmonyFunctionName]; ok {
return userFunctionName
}
slog.Warn("harmony parser: no reverse mapping found for function name", "harmonyFunctionName", harmonyFunctionName)
// fallback to the original function name if we can't find a mapping
return harmonyFunctionName
}
// convertToValidChars converts a user-specified function name to a valid
// TypeScript identifier.
//
// Limitations:
//
// - This doesn't restrict reserved TypeScript keywords.
// - We don't perform a real ID_Start/ID_Continue check, and instead use the more
// restrictive unicode.IsLetter/unicode.IsDigit check. Unclear what kind of
// identifiers these models were trained on, so in the end we might want to
// convert unicode-heavy identifiers to their closest ASCII equivalents.
func (m *FunctionNameMap) convertToValidChars(userFunctionName string) string {
mapper := func(r rune) rune {
// first, replace certain characters with underscores
if r == ' ' || r == '-' || r == '.' {
return '_'
}
if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' || r == '$' {
return r
}
// finally, remove any other characters
return -1
}
candidate := strings.Map(mapper, userFunctionName)
// set a default name if we end up with nothing left
if candidate == "" {
return "unnamed"
}
// if the candidate starts with a number, prepend an underscore to make it a
// valid identifier
if unicode.IsDigit(rune(candidate[0])) {
candidate = "_" + candidate
}
return candidate
}
func (m *FunctionNameMap) deriveName(userFunctionName string) string {
originalCandidate := m.convertToValidChars(userFunctionName)
candidate := originalCandidate
// Check for dupes, and if so, add a number to the end.
// We start at 2 because if we have dupes and the first is never renamed, it
// makes sense for them to be named, say, `f`, `f_2`, `f_3`
count := 2
for {
if _, exists := m.harmonyToUser[candidate]; !exists {
break
}
candidate = fmt.Sprintf("%s_%d", originalCandidate, count)
count++
}
return candidate
}

View File

@@ -1,4 +1,4 @@
package harmony
package server
import (
"fmt"
@@ -467,71 +467,3 @@ func TestHarmonyParserStreaming(t *testing.T) {
})
}
}
// TestFunctionConvertToValidChars tests only FunctionNameMap.convert(), which doesn't
// handle any saving (and therefore no dupe handling)
func TestFunctionConvertToValidChars(t *testing.T) {
tests := []struct {
name string
in string
want string
}{
{name: "replace spaces with underscores", in: "get weather", want: "get_weather"},
{name: "replace hyphens with underscores", in: "get-weather", want: "get_weather"},
{name: "replace periods with underscores", in: "get.weather", want: "get_weather"},
{name: "disallow non-word characters", in: "get weather!", want: "get_weather"},
{name: "strip out invalid non-alphanumeric unicode characters", in: "a🫠bc", want: "abc"},
{name: "names that only contain invalid characters", in: "🫠", want: "unnamed"},
{name: "leading number", in: "123", want: "_123"},
{name: "$ allowed", in: "$", want: "$"},
// show that we allow weird unicode letter characters, though we might want
// to convert them to their closest ASCII equivalents in the future
{name: "allow weird unicode letter characters", in: "𝓸𝓵𝓵𝓪𝓶𝓪", want: "𝓸𝓵𝓵𝓪𝓶𝓪"},
// names that look like words but are invalid (i.e., not ID_Start/ID_Continue)
{name: "disallow non-word characters that look like words", in: "ⓞⓛⓛⓐⓜⓐ123", want: "_123"},
}
for i, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
parser := NewFunctionNameMap()
got := parser.convertToValidChars(tt.in)
if got != tt.want {
t.Errorf("case %d: got %q, want %q", i, got, tt.want)
}
})
}
}
func TestFunctionConvertAndAdd(t *testing.T) {
// make a fresh map for each test, but within a test use the same map so we can test for dupe handling
tests := []struct {
name string
in []string
want []string
}{
{name: "basic dupe handling", in: []string{"get weather", "get weather"}, want: []string{"get_weather", "get_weather_2"}},
{name: "dupes from different user-specified names", in: []string{"get weather", "get_weather", "get-weather"}, want: []string{"get_weather", "get_weather_2", "get_weather_3"}},
{name: "non dupes after dupes", in: []string{"get weather", "get_weather", "get-weather", "something-different"}, want: []string{"get_weather", "get_weather_2", "get_weather_3", "something_different"}},
{name: "multiple sets of dupes", in: []string{"a", "a", "b", "a", "a", "b", "a"}, want: []string{"a", "a_2", "b", "a_3", "a_4", "b_2", "a_5"}},
}
for i, tt := range tests {
parser := NewFunctionNameMap()
t.Run(tt.name, func(t *testing.T) {
for j, in := range tt.in {
got := parser.ConvertAndAdd(in)
want := tt.want[j]
if got != want {
t.Errorf("case %d: got %q, want %q", i, got, want)
}
// check that the maps are correct
if parser.userToHarmony[in] != want {
t.Errorf("case %d: userToHarmony[%q] = %q, want %q", i, in, parser.userToHarmony[in], want)
}
if parser.harmonyToUser[want] != in {
t.Errorf("case %d: harmonyToUser[%q] = %q, want %q", i, want, parser.harmonyToUser[want], in)
}
}
})
}
}

View File

@@ -32,7 +32,6 @@ import (
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/harmony"
"github.com/ollama/ollama/llm"
"github.com/ollama/ollama/logutil"
"github.com/ollama/ollama/openai"
@@ -46,18 +45,6 @@ import (
"github.com/ollama/ollama/version"
)
func shouldUseHarmony(model *Model) bool {
if slices.Contains([]string{"gptoss", "gpt-oss"}, model.Config.ModelFamily) {
// heuristic to check whether the template expects to be parsed via harmony:
// search for harmony tags that are nearly always used
if model.Template.Contains("<|start|>") && model.Template.Contains("<|end|>") {
return true
}
}
return false
}
func experimentEnabled(name string) bool {
return slices.Contains(strings.Split(os.Getenv("OLLAMA_EXPERIMENT"), ","), name)
}
@@ -207,12 +194,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}
useHarmony := shouldUseHarmony(m) && !req.Raw
var harmonyMessageHandler *harmony.HarmonyMessageHandler
var harmonyToolParser *harmony.HarmonyToolCallAccumulator
useHarmony := shouldUseHarmony(*m) && !req.Raw
var harmonyMessageHandler *HarmonyMessageHandler
var harmonyToolParser *HarmonyToolCallAccumulator
if useHarmony {
harmonyMessageHandler = harmony.NewHarmonyMessageHandler()
harmonyMessageHandler.HarmonyParser.AddImplicitStart()
harmonyMessageHandler = NewHarmonyMessageHandler()
harmonyMessageHandler.harmonyParser.AddImplicitStart()
harmonyToolParser = harmonyMessageHandler.CreateToolParser()
}
@@ -327,19 +314,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
prompt = b.String()
}
// If debug mode is enabled, return the rendered template instead of calling the model
if req.DebugRenderOnly {
c.JSON(http.StatusOK, api.DebugTemplateResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
DebugInfo: api.DebugInfo{
RenderedTemplate: prompt,
ImageCount: len(images),
},
})
return
}
var thinkingState *thinking.Parser
if !useHarmony {
openingTag, closingTag := thinking.InferTags(m.Template.Template)
@@ -1616,49 +1590,14 @@ func (s *Server) ChatHandler(c *gin.Context) {
}
msgs = filterThinkTags(msgs, m)
var harmonyMessageHandler *harmony.HarmonyMessageHandler
var harmonyToolParser *harmony.HarmonyToolCallAccumulator
useHarmony := shouldUseHarmony(m)
processedTools := req.Tools
if useHarmony {
harmonyMessageHandler = harmony.NewHarmonyMessageHandler()
var lastMessage *api.Message
if len(msgs) > 0 {
lastMessage = &msgs[len(msgs)-1]
}
harmonyMessageHandler.HarmonyParser.AddImplicitStartOrPrefill(lastMessage)
harmonyToolParser = harmonyMessageHandler.CreateToolParser()
// make a copy of tools to pass to the chat prompt. Function names may be
// renamed to be valid Harmony function names.
processedTools = make([]api.Tool, len(req.Tools))
copy(processedTools, req.Tools)
for i, tool := range processedTools {
processedTools[i].Function.Name = harmonyMessageHandler.FunctionNameMap.ConvertAndAdd(tool.Function.Name)
}
}
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, processedTools, req.Think)
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools, req.Think)
if err != nil {
slog.Error("chat prompt error", "error", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
// If debug mode is enabled, return the rendered template instead of calling the model
if req.DebugRenderOnly {
c.JSON(http.StatusOK, api.DebugTemplateResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
DebugInfo: api.DebugInfo{
RenderedTemplate: prompt,
ImageCount: len(images),
},
})
return
}
useHarmony := shouldUseHarmony(*m)
// Validate Think value: string values currently only allowed for gptoss models
if req.Think != nil && req.Think.IsString() && !useHarmony {
@@ -1666,6 +1605,19 @@ func (s *Server) ChatHandler(c *gin.Context) {
return
}
var harmonyMessageHandler *HarmonyMessageHandler
var harmonyToolParser *HarmonyToolCallAccumulator
if useHarmony {
harmonyMessageHandler = NewHarmonyMessageHandler()
var lastMessage *api.Message
if len(msgs) > 0 {
lastMessage = &msgs[len(msgs)-1]
}
harmonyMessageHandler.harmonyParser.AddImplicitStartOrPrefill(lastMessage)
harmonyToolParser = harmonyMessageHandler.CreateToolParser()
}
var thinkingState *thinking.Parser
openingTag, closingTag := thinking.InferTags(m.Template.Template)
if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
@@ -1673,10 +1625,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
OpeningTag: openingTag,
ClosingTag: closingTag,
}
if strings.HasSuffix(strings.TrimSpace(prompt), openingTag) {
thinkingState.AddContent(openingTag)
}
}
var toolParser *tools.Parser
@@ -1722,7 +1670,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
toolName, toolContent := harmonyToolParser.Drain()
if toolName != nil {
*toolName = strings.TrimPrefix(*toolName, "functions.")
*toolName = harmonyMessageHandler.FunctionNameMap.OriginalFromConverted(*toolName)
var args api.ToolCallFunctionArguments
if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())

View File

@@ -1,413 +0,0 @@
package server
import (
"bytes"
"encoding/json"
"net/http"
"testing"
"time"
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
)
func TestGenerateDebugRenderOnly(t *testing.T) {
gin.SetMode(gin.TestMode)
mock := mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
},
}
s := Server{
sched: &Scheduler{
pendingReqCh: make(chan *LlmRequest, 1),
finishedReqCh: make(chan *LlmRequest, 1),
expiredCh: make(chan *runnerRef, 1),
unloadedCh: make(chan any, 1),
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}
go s.sched.Run(t.Context())
// Create a test model
stream := false
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
"llama.embedding_length": uint32(4096),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
})
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test-model",
Files: map[string]string{"file.gguf": digest},
Template: "{{ .Prompt }}",
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
tests := []struct {
name string
request api.GenerateRequest
expectDebug bool
expectTemplate string
expectNumImages int
}{
{
name: "debug render only enabled",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Hello, world!",
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "Hello, world!",
},
{
name: "debug render only disabled",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Hello, world!",
DebugRenderOnly: false,
},
expectDebug: false,
},
{
name: "debug render only with system prompt",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "User question",
System: "You are a helpful assistant",
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "User question",
},
{
name: "debug render only with template",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Hello",
Template: "PROMPT: {{ .Prompt }}",
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "PROMPT: Hello",
},
{
name: "debug render only with images",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Describe this image",
Images: []api.ImageData{[]byte("fake-image-data")},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "[img-0]\n\nDescribe this image",
expectNumImages: 1,
},
{
name: "debug render only with raw mode",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Raw prompt text",
Raw: true,
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "Raw prompt text",
},
}
for _, tt := range tests {
// Test both with and without streaming
streamValues := []bool{false, true}
for _, stream := range streamValues {
streamSuffix := ""
if stream {
streamSuffix = " (streaming)"
}
t.Run(tt.name+streamSuffix, func(t *testing.T) {
req := tt.request
req.Stream = &stream
w := createRequest(t, s.GenerateHandler, req)
if tt.expectDebug {
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
}
var response api.DebugTemplateResponse
if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
t.Fatalf("failed to unmarshal response: %v", err)
}
if response.Model != tt.request.Model {
t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
}
if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
}
if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
}
} else {
// When debug is disabled, it should attempt normal processing
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
}
}
})
}
}
}
func TestChatDebugRenderOnly(t *testing.T) {
gin.SetMode(gin.TestMode)
mock := mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
},
}
s := Server{
sched: &Scheduler{
pendingReqCh: make(chan *LlmRequest, 1),
finishedReqCh: make(chan *LlmRequest, 1),
expiredCh: make(chan *runnerRef, 1),
unloadedCh: make(chan any, 1),
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}
go s.sched.Run(t.Context())
// Create a test model
stream := false
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
"llama.embedding_length": uint32(4096),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
})
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test-model",
Files: map[string]string{"file.gguf": digest},
Template: "{{ if .Tools }}{{ .Tools }}{{ end }}{{ range .Messages }}{{ .Role }}: {{ .Content }}\n{{ end }}",
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
tests := []struct {
name string
request api.ChatRequest
expectDebug bool
expectTemplate string
expectNumImages int
}{
{
name: "chat debug render only enabled",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "system", Content: "You are a helpful assistant"},
{Role: "user", Content: "Hello"},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "system: You are a helpful assistant\nuser: Hello\n",
},
{
name: "chat debug render only disabled",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "user", Content: "Hello"},
},
DebugRenderOnly: false,
},
expectDebug: false,
},
{
name: "chat debug with assistant message",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "user", Content: "Hello"},
{Role: "assistant", Content: "Hi there!"},
{Role: "user", Content: "How are you?"},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "user: Hello\nassistant: Hi there!\nuser: How are you?\n",
},
{
name: "chat debug with images",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{
Role: "user",
Content: "What's in this image?",
Images: []api.ImageData{[]byte("fake-image-data")},
},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "user: [img-0]What's in this image?\n",
expectNumImages: 1,
},
{
name: "chat debug with tools",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "user", Content: "Get the weather"},
},
Tools: api.Tools{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
Description: "Get weather information",
},
},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "[{\"type\":\"function\",\"function\":{\"name\":\"get_weather\",\"description\":\"Get weather information\",\"parameters\":{\"type\":\"\",\"required\":null,\"properties\":null}}}]user: Get the weather\n",
},
}
for _, tt := range tests {
// Test both with and without streaming
streamValues := []bool{false, true}
for _, stream := range streamValues {
streamSuffix := ""
if stream {
streamSuffix = " (streaming)"
}
t.Run(tt.name+streamSuffix, func(t *testing.T) {
req := tt.request
req.Stream = &stream
w := createRequest(t, s.ChatHandler, req)
if tt.expectDebug {
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
}
var response api.DebugTemplateResponse
if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
t.Fatalf("failed to unmarshal response: %v", err)
}
if response.Model != tt.request.Model {
t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
}
if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
}
if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
}
} else {
// When debug is disabled, it should attempt normal processing
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
}
}
})
}
}
}

View File

@@ -969,233 +969,3 @@ func TestGenerate(t *testing.T) {
}
})
}
func TestChatWithPromptEndingInThinkTag(t *testing.T) {
gin.SetMode(gin.TestMode)
// Helper to create a standard thinking test setup
setupThinkingTest := func(t *testing.T) (*mockRunner, *Server) {
mock := &mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
},
}
s := &Server{
sched: &Scheduler{
pendingReqCh: make(chan *LlmRequest, 1),
finishedReqCh: make(chan *LlmRequest, 1),
expiredCh: make(chan *runnerRef, 1),
unloadedCh: make(chan any, 1),
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(mock),
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{llama: mock}
return false
},
},
}
go s.sched.Run(t.Context())
// Create a model with thinking support
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
"llama.embedding_length": uint32(4096),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
})
// Create model with thinking template that adds <think> at the end
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test-thinking",
Files: map[string]string{"file.gguf": digest},
Template: `{{- range .Messages }}
{{- if eq .Role "user" }}user: {{ .Content }}
{{ else if eq .Role "assistant" }}assistant: {{ if .Thinking }}<think>{{ .Thinking }}</think>{{ end }}{{ .Content }}
{{ end }}{{ end }}<think>`,
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
return mock, s
}
mock, s := setupThinkingTest(t)
// Helper to test chat responses
testChatRequest := func(t *testing.T, name string, userContent string, modelResponse string, expectedThinking string, expectedContent string, think bool) {
t.Run(name, func(t *testing.T) {
mock.CompletionResponse = llm.CompletionResponse{
Content: modelResponse,
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
}
mock.CompletionFn = nil
streamRequest := false
req := api.ChatRequest{
Model: "test-thinking",
Messages: []api.Message{
{Role: "user", Content: userContent},
},
Stream: &streamRequest,
}
if think {
req.Think = &api.ThinkValue{Value: think}
}
w := createRequest(t, s.ChatHandler, req)
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
var resp api.ChatResponse
if err := json.NewDecoder(w.Body).Decode(&resp); err != nil {
t.Fatal(err)
}
if resp.Message.Thinking != expectedThinking {
t.Errorf("expected thinking %q, got %q", expectedThinking, resp.Message.Thinking)
}
if resp.Message.Content != expectedContent {
t.Errorf("expected content %q, got %q", expectedContent, resp.Message.Content)
}
})
}
// Test cases - Note: Template adds <think> at the end, and leading whitespace after <think> is eaten by the parser
testChatRequest(t, "basic thinking response",
"Help me solve this problem",
" Let me think about this step by step... </think> The answer is 42.",
"Let me think about this step by step... ",
"The answer is 42.",
true)
testChatRequest(t, "thinking with multiple sentences",
"Explain quantum computing",
" First, I need to understand the basics. Quantum bits can be in superposition. </think> Quantum computing uses quantum mechanics principles.",
"First, I need to understand the basics. Quantum bits can be in superposition. ",
"Quantum computing uses quantum mechanics principles.",
true)
testChatRequest(t, "no thinking content",
"What is 2+2?",
"</think> The answer is 4.",
"",
"The answer is 4.",
true)
testChatRequest(t, "thinking disabled but template still adds think tag",
"Simple question",
" My thoughts </think> The answer.",
"",
" My thoughts </think> The answer.",
false)
// Test streaming response with template-added <think>
t.Run("streaming with thinking", func(t *testing.T) {
var wg sync.WaitGroup
wg.Add(1)
mock.CompletionFn = func(ctx context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
defer wg.Done()
// Verify the prompt ends with <think> due to template
if !strings.HasSuffix(r.Prompt, "<think>") {
t.Errorf("expected prompt to end with <think>, got: %q", r.Prompt)
}
// Simulate streaming chunks
responses := []llm.CompletionResponse{
{Content: " I need to consider", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
{Content: " multiple factors here...", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
{Content: " </think> Based on my analysis,", Done: false, PromptEvalCount: 1, PromptEvalDuration: 1},
{Content: " the solution is straightforward.", Done: true, DoneReason: llm.DoneReasonStop, PromptEvalCount: 1, PromptEvalDuration: 1, EvalCount: 1, EvalDuration: 1},
}
for _, resp := range responses {
select {
case <-ctx.Done():
return ctx.Err()
default:
fn(resp)
time.Sleep(10 * time.Millisecond)
}
}
return nil
}
think := true
w := createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "test-thinking",
Messages: []api.Message{{Role: "user", Content: "Analyze this complex problem"}},
Think: &api.ThinkValue{Value: think},
Stream: &stream,
})
wg.Wait()
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
// Parse streaming responses
decoder := json.NewDecoder(w.Body)
var allThinking, allContent strings.Builder
for {
var resp api.ChatResponse
if err := decoder.Decode(&resp); err == io.EOF {
break
} else if err != nil {
t.Fatal(err)
}
allThinking.WriteString(resp.Message.Thinking)
allContent.WriteString(resp.Message.Content)
}
// Note: Leading whitespace after <think> is eaten by the parser
if got := allThinking.String(); got != "I need to consider multiple factors here... " {
t.Errorf("expected thinking %q, got %q", "I need to consider multiple factors here... ", got)
}
if got := allContent.String(); got != "Based on my analysis, the solution is straightforward." {
t.Errorf("expected content %q, got %q", "Based on my analysis, the solution is straightforward.", got)
}
})
}

View File

@@ -103,9 +103,7 @@ func eat(s *Parser) (string, string, bool) {
// note that we use the original content, not the trimmed one because we
// don't want to eat any whitespace in the real content if there were no
// thinking tags
untrimmed := s.acc.String()
s.acc.Reset()
return "", untrimmed, false
return "", s.acc.String(), false
}
case thinkingState_ThinkingStartedEatingWhitespace:
trimmed := strings.TrimLeftFunc(s.acc.String(), unicode.IsSpace)

View File

@@ -58,15 +58,6 @@ func TestThinkingStreaming(t *testing.T) {
wantContent: " abc",
wantStateAfter: thinkingState_ThinkingDone,
},
// regression test for a bug where we were transitioning directly to
// ThinkingDone without clearing the buffer. This would cuase the first
// step to be outputted twice
{
input: "def",
wantThinking: "",
wantContent: "def",
wantStateAfter: thinkingState_ThinkingDone,
},
},
},
{

View File

@@ -224,45 +224,22 @@ func findArguments(buffer []byte) (map[string]any, int) {
return nil, 0
}
start := -1
var braces int
var inString, escaped bool
for i := range buffer {
c := buffer[i]
if escaped {
escaped = false
continue
}
if c == '\\' {
escaped = true
continue
}
if c == '"' {
inString = !inString
continue
}
if inString {
continue
}
var start int = -1
for i, c := range buffer {
if c == '{' {
if braces == 0 {
start = i
}
braces++
} else if c == '}' {
} else if c == '}' && braces > 0 {
braces--
if braces == 0 && start != -1 {
object := buffer[start : i+1]
var data map[string]any
if err := json.Unmarshal(object, &data); err != nil {
// not a valid object, keep looking
start = -1
continue
}
@@ -305,10 +282,6 @@ func findArguments(buffer []byte) (map[string]any, int) {
return data, i
}
if braces < 0 {
braces = 0
}
}
}

View File

@@ -1,7 +1,6 @@
package tools
import (
"strings"
"testing"
"text/template"
@@ -41,7 +40,13 @@ func TestParser(t *testing.T) {
Function: api.ToolFunction{
Name: "get_temperature",
Description: "Retrieve the temperature for a given location",
Parameters: api.ToolFunctionParameters{
Parameters: struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required"`
Properties map[string]api.ToolProperty `json:"properties"`
}{
Type: "object",
Required: []string{"city"},
Properties: map[string]api.ToolProperty{
@@ -63,7 +68,13 @@ func TestParser(t *testing.T) {
Function: api.ToolFunction{
Name: "get_conditions",
Description: "Retrieve the current weather conditions for a given location",
Parameters: api.ToolFunctionParameters{
Parameters: struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required"`
Properties map[string]api.ToolProperty `json:"properties"`
}{
Type: "object",
Properties: map[string]api.ToolProperty{
"location": {
@@ -93,7 +104,13 @@ func TestParser(t *testing.T) {
Function: api.ToolFunction{
Name: "get_address",
Description: "Get the address of a given location",
Parameters: api.ToolFunctionParameters{
Parameters: struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required"`
Properties map[string]api.ToolProperty `json:"properties"`
}{
Type: "object",
Properties: map[string]api.ToolProperty{
"location": {
@@ -109,7 +126,13 @@ func TestParser(t *testing.T) {
Function: api.ToolFunction{
Name: "add",
Description: "Add two numbers",
Parameters: api.ToolFunctionParameters{
Parameters: struct {
Type string `json:"type"`
Defs any `json:"$defs,omitempty"`
Items any `json:"items,omitempty"`
Required []string `json:"required"`
Properties map[string]api.ToolProperty `json:"properties"`
}{
Type: "object",
Properties: map[string]api.ToolProperty{
"a": {
@@ -1117,163 +1140,11 @@ func TestFindArguments(t *testing.T) {
},
{
name: "deepseek",
buffer: []byte(`"arguments": {"location": "Tokyo"}}</tool_call>`),
buffer: []byte(`", "arguments": {"location": "Tokyo"}}</tool_call>`),
want: map[string]any{
"location": "Tokyo",
},
},
{
name: "string with braces",
buffer: []byte(`{"name": "process_code", "arguments": {"code": "if (x > 0) { return true; }"}}`),
want: map[string]any{
"code": "if (x > 0) { return true; }",
},
},
{
name: "string with nested json",
buffer: []byte(`{"name": "send_data", "arguments": {"payload": "{\"nested\": {\"key\": \"value\"}}"}}`),
want: map[string]any{
"payload": `{"nested": {"key": "value"}}`,
},
},
{
name: "string with escaped quotes and braces",
buffer: []byte(`{"name": "analyze", "arguments": {"text": "The JSON is: {\"key\": \"val{ue}\"}"}}`),
want: map[string]any{
"text": `The JSON is: {"key": "val{ue}"}`,
},
},
{
name: "multiple objects with string containing braces",
buffer: []byte(`{"name": "test", "arguments": {"query": "find } in text"}} {"name": "other"}`),
want: map[string]any{
"query": "find } in text",
},
},
{
name: "unmatched closing brace in string",
buffer: []byte(`{"name": "search", "arguments": {"pattern": "regex: }"}}`),
want: map[string]any{
"pattern": "regex: }",
},
},
{
name: "complex nested with mixed braces",
buffer: []byte(`{"name": "analyze", "arguments": {"data": "{\"items\": [{\"value\": \"}\"}, {\"code\": \"if (x) { return y; }\"}]}"}}`),
want: map[string]any{
"data": `{"items": [{"value": "}"}, {"code": "if (x) { return y; }"}]}`,
},
},
{
name: "string with newline and braces",
buffer: []byte(`{"name": "format", "arguments": {"template": "{\n \"key\": \"value\"\n}"}}`),
want: map[string]any{
"template": "{\n \"key\": \"value\"\n}",
},
},
{
name: "string with unicode escape",
buffer: []byte(`{"name": "test", "arguments": {"text": "Unicode: \u007B and \u007D"}}`),
want: map[string]any{
"text": "Unicode: { and }",
},
},
{
name: "array arguments",
buffer: []byte(`{"name": "batch", "arguments": ["item1", "item2", "{\"nested\": true}"]}`),
want: nil, // This should return nil because arguments is not a map
},
{
name: "escaped backslash before quote",
buffer: []byte(`{"name": "path", "arguments": {"dir": "C:\\Program Files\\{App}\\"}}`),
want: map[string]any{
"dir": `C:\Program Files\{App}\`,
},
},
{
name: "single quotes not treated as string delimiters",
buffer: []byte(`{"name": "query", "arguments": {"sql": "SELECT * FROM users WHERE name = '{admin}'"}}`),
want: map[string]any{
"sql": "SELECT * FROM users WHERE name = '{admin}'",
},
},
{
name: "incomplete json at buffer end",
buffer: []byte(`{"name": "test", "arguments": {"data": "some {"`),
want: nil,
},
{
name: "multiple escaped quotes",
buffer: []byte(`{"name": "echo", "arguments": {"msg": "He said \"Hello {World}\" loudly"}}`),
want: map[string]any{
"msg": `He said "Hello {World}" loudly`,
},
},
{
name: "json with comments style string",
buffer: []byte(`{"name": "code", "arguments": {"snippet": "// This is a comment with { and }"}}`),
want: map[string]any{
"snippet": "// This is a comment with { and }",
},
},
{
name: "consecutive escaped backslashes",
buffer: []byte(`{"name": "test", "arguments": {"path": "C:\\\\{folder}\\\\"}}`),
want: map[string]any{
"path": `C:\\{folder}\\`,
},
},
{
name: "empty string with braces after",
buffer: []byte(`{"name": "test", "arguments": {"a": "", "b": "{value}"}}`),
want: map[string]any{
"a": "",
"b": "{value}",
},
},
{
name: "unicode in key names",
buffer: []byte(`{"name": "test", "arguments": {"key{": "value", "key}": "value2"}}`),
want: map[string]any{
"key{": "value",
"key}": "value2",
},
},
{
name: "very long string with braces",
buffer: []byte(`{"name": "test", "arguments": {"data": "` + strings.Repeat("a{b}c", 100) + `"}}`),
want: map[string]any{
"data": strings.Repeat("a{b}c", 100),
},
},
{
name: "tab characters and braces",
buffer: []byte(`{"name": "test", "arguments": {"code": "\tif (true) {\n\t\treturn;\n\t}"}}`),
want: map[string]any{
"code": "\tif (true) {\n\t\treturn;\n\t}",
},
},
{
name: "null byte in string",
buffer: []byte(`{"name": "test", "arguments": {"data": "before\u0000{after}"}}`),
want: map[string]any{
"data": "before\x00{after}",
},
},
{
name: "escaped quote at end of string",
buffer: []byte(`{"name": "test", "arguments": {"data": "text with quote at end\\\""}}`),
want: map[string]any{
"data": `text with quote at end\"`,
},
},
{
name: "mixed array and object in arguments",
buffer: []byte(`{"name": "test", "arguments": {"items": ["{", "}", {"key": "value"}]}}`),
want: map[string]any{
"items": []any{"{", "}", map[string]any{"key": "value"}},
},
},
}
for _, tt := range tests {