Compare commits


1 Commit

Author SHA1 Message Date
jmorganca  19638cec55  add docs.json  2025-08-17 13:12:39 -07:00
11 changed files with 96 additions and 684 deletions

View File

@@ -411,8 +411,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
- [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
- [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
### Cloud
@@ -539,8 +537,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
- [Ollama for D](https://github.com/kassane/ollama-d)
- [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)
- [any-llm](https://github.com/mozilla-ai/any-llm) (A single interface to use different llm providers by [mozilla.ai](https://www.mozilla.ai/))
- [any-agent](https://github.com/mozilla-ai/any-agent) (A single interface to use and evaluate different agent frameworks by [mozilla.ai](https://www.mozilla.ai/))
### Mobile

View File

@@ -90,10 +90,6 @@ type GenerateRequest struct {
// (request that thinking _not_ be used) and unset (use the old behavior
// before this option was introduced)
Think *ThinkValue `json:"think,omitempty"`
-// DebugRenderOnly is a debug option that, when set to true, returns the rendered
-// template instead of calling the model.
-DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
}
// ChatRequest describes a request sent by [Client.Chat].
@@ -124,10 +120,6 @@ type ChatRequest struct {
// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
// for supported models.
Think *ThinkValue `json:"think,omitempty"`
-// DebugRenderOnly is a debug option that, when set to true, returns the rendered
-// template instead of calling the model.
-DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
}
type Tools []Tool
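For context on the `think` field kept here: per the comment above, it accepts a boolean or, for supported models, an effort string ("high", "medium", "low"). A minimal sketch of a chat request carrying a string think level, sent to the standard /api/chat endpoint (the model name is a placeholder):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// "think" may be a boolean or, for supported models, a string
	// effort level, matching the ThinkValue field shown above.
	body, _ := json.Marshal(map[string]any{
		"model": "gpt-oss", // placeholder model name
		"messages": []map[string]string{
			{"role": "user", "content": "Why is the sky blue?"},
		},
		"think": "high",
	})
	resp, err := http.Post("http://localhost:11434/api/chat", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}
```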
@@ -316,19 +308,6 @@ type ChatResponse struct {
Metrics
}
-// DebugInfo contains debug information for template rendering
-type DebugInfo struct {
-RenderedTemplate string `json:"rendered_template"`
-ImageCount int `json:"image_count,omitempty"`
-}
-// DebugTemplateResponse is returned when _debug_render_only is set to true
-type DebugTemplateResponse struct {
-Model string `json:"model"`
-CreatedAt time.Time `json:"created_at"`
-DebugInfo DebugInfo `json:"_debug_info"`
-}
type Metrics struct {
TotalDuration time.Duration `json:"total_duration,omitempty"`
LoadDuration time.Duration `json:"load_duration,omitempty"`

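For reference, the removed `_debug_render_only` flag returned the rendered prompt template instead of running the model. A sketch of the request and response shapes against a build that still has the feature, with the struct definitions copied from the removed types above:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"time"
)

// Response shapes copied from the removed api types above.
type DebugInfo struct {
	RenderedTemplate string `json:"rendered_template"`
	ImageCount       int    `json:"image_count,omitempty"`
}

type DebugTemplateResponse struct {
	Model     string    `json:"model"`
	CreatedAt time.Time `json:"created_at"`
	DebugInfo DebugInfo `json:"_debug_info"`
}

func main() {
	body, _ := json.Marshal(map[string]any{
		"model":              "test-model", // placeholder
		"prompt":             "Hello, world!",
		"_debug_render_only": true,
	})
	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out DebugTemplateResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println(out.DebugInfo.RenderedTemplate)
}
```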
docs/docs.json (new file, 75 additions)
View File

@@ -0,0 +1,75 @@
{
"$schema": "https://mintlify.com/docs.json",
"theme": "mint",
"background": {
"color": {
"light": "#ffffff",
"dark": "#000000"
}
},
"appearance": {
"default": "light"
},
"styling": {
"codeblocks": "system"
},
"contextual": {
"options": ["copy", "chatgpt", "claude", "view"]
},
"fonts": {
"heading": {
"family": "Inter"
},
"body": {
"family": "Inter"
}
},
"name": "Ollama",
"colors": {
"primary": "#000",
"light": "#b5b5b5",
"dark": "#fff"
},
"favicon": "/ollama.png",
"logo": {
"light": "/ollama.png",
"dark": "/favicon.svg"
},
"navigation": {
"tabs": [
{
"tab": "Documentation",
"groups": [
{
"group": "Home",
"pages": ["index", "quickstart", "faq", "troubleshooting"]
},
{
"group": "Platforms",
"pages": ["linux", "windows", "docker"]
},
{
"group": "Features",
"pages": [
"modelfile",
"apis",
"openai",
"import",
"gpu",
"benchmark"
]
}
]
},
{
"tab": "Development",
"groups": [
{
"group": " ",
"pages": ["development", "examples", "template"]
}
]
}
]
}
}
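docs.json is a Mintlify site configuration; the navigation block maps tabs to groups of page slugs. A sketch of reading that section back in Go (the struct names are illustrative, not any Mintlify API):

```go
package main

import (
	"encoding/json"
	"fmt"
	"os"
)

// docsConfig is a minimal mirror of the navigation section of the
// docs.json above; type names are illustrative.
type docsConfig struct {
	Name       string `json:"name"`
	Navigation struct {
		Tabs []struct {
			Tab    string `json:"tab"`
			Groups []struct {
				Group string   `json:"group"`
				Pages []string `json:"pages"`
			} `json:"groups"`
		} `json:"tabs"`
	} `json:"navigation"`
}

func main() {
	raw, err := os.ReadFile("docs/docs.json")
	if err != nil {
		panic(err)
	}
	var cfg docsConfig
	if err := json.Unmarshal(raw, &cfg); err != nil {
		panic(err)
	}
	for _, tab := range cfg.Navigation.Tabs {
		for _, g := range tab.Groups {
			fmt.Printf("%s / %s: %v\n", tab.Tab, g.Group, g.Pages)
		}
	}
}
```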

View File

@@ -962,7 +962,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;
-const bool output_all = false;
+// when computing embeddings, all tokens are output
+const bool output_all = cparams.embeddings;
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
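The restored line ties output selection to the embeddings flag: embedding requests need an output row for every token, while plain generation only needs the final position's logits. A small Go sketch of that selection rule (names are illustrative, not llama.cpp's API):

```go
package main

import "fmt"

// outputMask mirrors the idea behind output_all: when computing
// embeddings, every position of an n-token batch produces an
// output; for generation, only the last position does.
func outputMask(n int, embeddings bool) []bool {
	mask := make([]bool, n)
	if embeddings {
		for i := range mask {
			mask[i] = true // output_all
		}
		return mask
	}
	if n > 0 {
		mask[n-1] = true // only the last token's logits are needed
	}
	return mask
}

func main() {
	fmt.Println(outputMask(4, true))  // [true true true true]
	fmt.Println(outputMask(4, false)) // [false false false true]
}
```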

View File

@@ -13,7 +13,7 @@ checks.
1 file changed, 18 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 57eae461..c7f9dc3a 100644
+index 57eae461..9db0c8b5 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2671,12 +2671,24 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud

View File

@@ -1,23 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <git@mxy.ng>
Date: Mon, 18 Aug 2025 16:58:39 -0700
Subject: [PATCH] decode: disable output_all
---
src/llama-context.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 26a5cf9c..6ece5263 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;
- // when computing embeddings, all tokens are output
- const bool output_all = cparams.embeddings;
+ const bool output_all = false;
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);

View File

@@ -651,9 +651,7 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
if !success {
s.initModel(ctx, LoadRequest{}, LoadOperationClose)
}
-if s.mem != nil {
-s.mem.Log(slog.LevelInfo)
-}
+s.mem.Log(slog.LevelInfo)
}()
slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU)

View File

@@ -2,7 +2,6 @@ package server
import (
"context"
"fmt"
"log/slog"
"slices"
"strings"
@@ -276,9 +275,8 @@ const (
// HarmonyMessageHandler processes harmony events and accumulates content appropriately.
// This is a higher level interface that maps harmony concepts into ollama concepts
type HarmonyMessageHandler struct {
-state harmonyMessageState
-harmonyParser *HarmonyParser
-functionNameMap *FunctionNameMap
+state harmonyMessageState
+harmonyParser *HarmonyParser
}
// NewHarmonyMessageHandler creates a new message handler
@@ -290,7 +288,6 @@ func NewHarmonyMessageHandler() *HarmonyMessageHandler {
MessageEndTag: "<|end|>",
HeaderEndTag: "<|message|>",
},
-functionNameMap: NewFunctionNameMap(),
}
}
@@ -381,97 +378,3 @@ func (a *HarmonyToolCallAccumulator) Drain() (*string, string) {
func (a *HarmonyToolCallAccumulator) Content() string {
return a.acc.String()
}
// FunctionNameMap maps a user-specified function name to a valid function
// name for harmony (which look like TypeScript identifiers). This is needed to
// transform user-specified function names, which might contain characters that
// are not allowed in TypeScript identifiers
type FunctionNameMap struct {
userToHarmony map[string]string
harmonyToUser map[string]string
}
func NewFunctionNameMap() *FunctionNameMap {
return &FunctionNameMap{
userToHarmony: make(map[string]string),
harmonyToUser: make(map[string]string),
}
}
func (m *FunctionNameMap) ConvertAndAdd(userFunctionName string) string {
harmonyFunctionName := m.deriveName(userFunctionName)
m.userToHarmony[userFunctionName] = harmonyFunctionName
m.harmonyToUser[harmonyFunctionName] = userFunctionName
return harmonyFunctionName
}
// OriginalFromConverted looks up the reverse-mapping of a previously-converted
// user->harmony function name. To unmap reliably, the mapping must exist, as
// the conversion process is not reversible without the appropriate state
func (m *FunctionNameMap) OriginalFromConverted(harmonyFunctionName string) string {
if userFunctionName, ok := m.harmonyToUser[harmonyFunctionName]; ok {
return userFunctionName
}
slog.Warn("harmony parser: no reverse mapping found for function name", "harmonyFunctionName", harmonyFunctionName)
// fallback to the original function name if we can't find a mapping
return harmonyFunctionName
}
// convertToValidChars converts a user-specified function name to a valid
// TypeScript identifier.
//
// Limitations:
//
// - This doesn't restrict reserved TypeScript keywords.
// - We don't perform a real ID_Start/ID_Continue check, and instead use the more
// restrictive unicode.IsLetter/unicode.IsDigit check. Unclear what kind of
// identifiers these models were trained on, so in the end we might want to
// convert unicode-heavy identifiers to their closest ASCII equivalents.
func (m *FunctionNameMap) convertToValidChars(userFunctionName string) string {
mapper := func(r rune) rune {
// first, replace certain characters with underscores
if r == ' ' || r == '-' || r == '.' {
return '_'
}
if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' || r == '$' {
return r
}
// finally, remove any other characters
return -1
}
candidate := strings.Map(mapper, userFunctionName)
// set a default name if we end up with nothing left
if candidate == "" {
return "unnamed"
}
// if the candidate starts with a number, prepend an underscore to make it a
// valid identifier
if unicode.IsDigit(rune(candidate[0])) {
candidate = "_" + candidate
}
return candidate
}
func (m *FunctionNameMap) deriveName(userFunctionName string) string {
originalCandidate := m.convertToValidChars(userFunctionName)
candidate := originalCandidate
// Check for dupes, and if so, add a number to the end.
// We start at 2 because if we have dupes and the first is never renamed, it
// makes sense for them to be named, say, `f`, `f_2`, `f_3`
count := 2
for {
if _, exists := m.harmonyToUser[candidate]; !exists {
break
}
candidate = fmt.Sprintf("%s_%d", originalCandidate, count)
count++
}
return candidate
}
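For reference, a short round-trip sketch of how the removed FunctionNameMap was used, relying on the types shown above (the model call in between is elided):

```go
// Round trip through the FunctionNameMap defined above.
func exampleRoundTrip() {
	m := NewFunctionNameMap()

	// Outbound: rewrite the user-supplied name into a valid
	// harmony (TypeScript-like) identifier before prompting.
	harmonyName := m.ConvertAndAdd("get current weather") // "get_current_weather"

	// Inbound: map the name in the model's tool call back to
	// the name the user originally registered.
	userName := m.OriginalFromConverted(harmonyName) // "get current weather"
	_ = userName
}
```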

View File

@@ -467,71 +467,3 @@ func TestHarmonyParserStreaming(t *testing.T) {
})
}
}
// TestFunctionConvertToValidChars tests only FunctionNameMap.convert(), which doesn't
// handle any saving (and therefore no dupe handling)
func TestFunctionConvertToValidChars(t *testing.T) {
tests := []struct {
name string
in string
want string
}{
{name: "replace spaces with underscores", in: "get weather", want: "get_weather"},
{name: "replace hyphens with underscores", in: "get-weather", want: "get_weather"},
{name: "replace periods with underscores", in: "get.weather", want: "get_weather"},
{name: "disallow non-word characters", in: "get weather!", want: "get_weather"},
{name: "strip out invalid non-alphanumeric unicode characters", in: "a🫠bc", want: "abc"},
{name: "names that only contain invalid characters", in: "🫠", want: "unnamed"},
{name: "leading number", in: "123", want: "_123"},
{name: "$ allowed", in: "$", want: "$"},
// show that we allow weird unicode letter characters, though we might want
// to convert them to their closest ASCII equivalents in the future
{name: "allow weird unicode letter characters", in: "𝓸𝓵𝓵𝓪𝓶𝓪", want: "𝓸𝓵𝓵𝓪𝓶𝓪"},
// names that look like words but are invalid (i.e., not ID_Start/ID_Continue)
{name: "disallow non-word characters that look like words", in: "ⓞⓛⓛⓐⓜⓐ123", want: "_123"},
}
for i, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
parser := NewFunctionNameMap()
got := parser.convertToValidChars(tt.in)
if got != tt.want {
t.Errorf("case %d: got %q, want %q", i, got, tt.want)
}
})
}
}
func TestFunctionConvertAndAdd(t *testing.T) {
// make a fresh map for each test, but within a test use the same map so we can test for dupe handling
tests := []struct {
name string
in []string
want []string
}{
{name: "basic dupe handling", in: []string{"get weather", "get weather"}, want: []string{"get_weather", "get_weather_2"}},
{name: "dupes from different user-specified names", in: []string{"get weather", "get_weather", "get-weather"}, want: []string{"get_weather", "get_weather_2", "get_weather_3"}},
{name: "non dupes after dupes", in: []string{"get weather", "get_weather", "get-weather", "something-different"}, want: []string{"get_weather", "get_weather_2", "get_weather_3", "something_different"}},
{name: "multiple sets of dupes", in: []string{"a", "a", "b", "a", "a", "b", "a"}, want: []string{"a", "a_2", "b", "a_3", "a_4", "b_2", "a_5"}},
}
for i, tt := range tests {
parser := NewFunctionNameMap()
t.Run(tt.name, func(t *testing.T) {
for j, in := range tt.in {
got := parser.ConvertAndAdd(in)
want := tt.want[j]
if got != want {
t.Errorf("case %d: got %q, want %q", i, got, want)
}
// check that the maps are correct
if parser.userToHarmony[in] != want {
t.Errorf("case %d: userToHarmony[%q] = %q, want %q", i, in, parser.userToHarmony[in], want)
}
if parser.harmonyToUser[want] != in {
t.Errorf("case %d: harmonyToUser[%q] = %q, want %q", i, want, parser.harmonyToUser[want], in)
}
}
})
}
}

View File

@@ -314,19 +314,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
prompt = b.String()
}
// If debug mode is enabled, return the rendered template instead of calling the model
if req.DebugRenderOnly {
c.JSON(http.StatusOK, api.DebugTemplateResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
DebugInfo: api.DebugInfo{
RenderedTemplate: prompt,
ImageCount: len(images),
},
})
return
}
var thinkingState *thinking.Parser
if !useHarmony {
openingTag, closingTag := thinking.InferTags(m.Template.Template)
@@ -1603,12 +1590,24 @@ func (s *Server) ChatHandler(c *gin.Context) {
}
msgs = filterThinkTags(msgs, m)
var harmonyMessageHandler *HarmonyMessageHandler
var harmonyToolParser *HarmonyToolCallAccumulator
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools, req.Think)
if err != nil {
slog.Error("chat prompt error", "error", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
useHarmony := shouldUseHarmony(*m)
processedTools := req.Tools
// Validate Think value: string values currently only allowed for gptoss models
if req.Think != nil && req.Think.IsString() && !useHarmony {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
return
}
var harmonyMessageHandler *HarmonyMessageHandler
var harmonyToolParser *HarmonyToolCallAccumulator
if useHarmony {
harmonyMessageHandler = NewHarmonyMessageHandler()
var lastMessage *api.Message
@@ -1617,40 +1616,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
}
harmonyMessageHandler.harmonyParser.AddImplicitStartOrPrefill(lastMessage)
harmonyToolParser = harmonyMessageHandler.CreateToolParser()
// make a copy of tools to pass to the chat prompt. Function names may be
// renamed to be valid Harmony function names.
processedTools = make([]api.Tool, len(req.Tools))
copy(processedTools, req.Tools)
for i, tool := range processedTools {
processedTools[i].Function.Name = harmonyMessageHandler.functionNameMap.ConvertAndAdd(tool.Function.Name)
}
}
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, processedTools, req.Think)
if err != nil {
slog.Error("chat prompt error", "error", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
// If debug mode is enabled, return the rendered template instead of calling the model
if req.DebugRenderOnly {
c.JSON(http.StatusOK, api.DebugTemplateResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
DebugInfo: api.DebugInfo{
RenderedTemplate: prompt,
ImageCount: len(images),
},
})
return
}
// Validate Think value: string values currently only allowed for gptoss models
if req.Think != nil && req.Think.IsString() && !useHarmony {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
return
}
var thinkingState *thinking.Parser
@@ -1705,7 +1670,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
toolName, toolContent := harmonyToolParser.Drain()
if toolName != nil {
*toolName = strings.TrimPrefix(*toolName, "functions.")
*toolName = harmonyMessageHandler.functionNameMap.OriginalFromConverted(*toolName)
var args api.ToolCallFunctionArguments
if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())

View File

@@ -1,413 +0,0 @@
package server
import (
"bytes"
"encoding/json"
"net/http"
"testing"
"time"
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
)
func TestGenerateDebugRenderOnly(t *testing.T) {
gin.SetMode(gin.TestMode)
mock := mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
},
}
s := Server{
sched: &Scheduler{
pendingReqCh: make(chan *LlmRequest, 1),
finishedReqCh: make(chan *LlmRequest, 1),
expiredCh: make(chan *runnerRef, 1),
unloadedCh: make(chan any, 1),
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}
go s.sched.Run(t.Context())
// Create a test model
stream := false
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
"llama.embedding_length": uint32(4096),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
})
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test-model",
Files: map[string]string{"file.gguf": digest},
Template: "{{ .Prompt }}",
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
tests := []struct {
name string
request api.GenerateRequest
expectDebug bool
expectTemplate string
expectNumImages int
}{
{
name: "debug render only enabled",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Hello, world!",
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "Hello, world!",
},
{
name: "debug render only disabled",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Hello, world!",
DebugRenderOnly: false,
},
expectDebug: false,
},
{
name: "debug render only with system prompt",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "User question",
System: "You are a helpful assistant",
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "User question",
},
{
name: "debug render only with template",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Hello",
Template: "PROMPT: {{ .Prompt }}",
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "PROMPT: Hello",
},
{
name: "debug render only with images",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Describe this image",
Images: []api.ImageData{[]byte("fake-image-data")},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "[img-0]\n\nDescribe this image",
expectNumImages: 1,
},
{
name: "debug render only with raw mode",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Raw prompt text",
Raw: true,
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "Raw prompt text",
},
}
for _, tt := range tests {
// Test both with and without streaming
streamValues := []bool{false, true}
for _, stream := range streamValues {
streamSuffix := ""
if stream {
streamSuffix = " (streaming)"
}
t.Run(tt.name+streamSuffix, func(t *testing.T) {
req := tt.request
req.Stream = &stream
w := createRequest(t, s.GenerateHandler, req)
if tt.expectDebug {
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
}
var response api.DebugTemplateResponse
if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
t.Fatalf("failed to unmarshal response: %v", err)
}
if response.Model != tt.request.Model {
t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
}
if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
}
if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
}
} else {
// When debug is disabled, it should attempt normal processing
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
}
}
})
}
}
}
func TestChatDebugRenderOnly(t *testing.T) {
gin.SetMode(gin.TestMode)
mock := mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
},
}
s := Server{
sched: &Scheduler{
pendingReqCh: make(chan *LlmRequest, 1),
finishedReqCh: make(chan *LlmRequest, 1),
expiredCh: make(chan *runnerRef, 1),
unloadedCh: make(chan any, 1),
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}
go s.sched.Run(t.Context())
// Create a test model
stream := false
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
"llama.embedding_length": uint32(4096),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
})
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test-model",
Files: map[string]string{"file.gguf": digest},
Template: "{{ if .Tools }}{{ .Tools }}{{ end }}{{ range .Messages }}{{ .Role }}: {{ .Content }}\n{{ end }}",
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
tests := []struct {
name string
request api.ChatRequest
expectDebug bool
expectTemplate string
expectNumImages int
}{
{
name: "chat debug render only enabled",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "system", Content: "You are a helpful assistant"},
{Role: "user", Content: "Hello"},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "system: You are a helpful assistant\nuser: Hello\n",
},
{
name: "chat debug render only disabled",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "user", Content: "Hello"},
},
DebugRenderOnly: false,
},
expectDebug: false,
},
{
name: "chat debug with assistant message",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "user", Content: "Hello"},
{Role: "assistant", Content: "Hi there!"},
{Role: "user", Content: "How are you?"},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "user: Hello\nassistant: Hi there!\nuser: How are you?\n",
},
{
name: "chat debug with images",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{
Role: "user",
Content: "What's in this image?",
Images: []api.ImageData{[]byte("fake-image-data")},
},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "user: [img-0]What's in this image?\n",
expectNumImages: 1,
},
{
name: "chat debug with tools",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "user", Content: "Get the weather"},
},
Tools: api.Tools{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
Description: "Get weather information",
},
},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "[{\"type\":\"function\",\"function\":{\"name\":\"get_weather\",\"description\":\"Get weather information\",\"parameters\":{\"type\":\"\",\"required\":null,\"properties\":null}}}]user: Get the weather\n",
},
}
for _, tt := range tests {
// Test both with and without streaming
streamValues := []bool{false, true}
for _, stream := range streamValues {
streamSuffix := ""
if stream {
streamSuffix = " (streaming)"
}
t.Run(tt.name+streamSuffix, func(t *testing.T) {
req := tt.request
req.Stream = &stream
w := createRequest(t, s.ChatHandler, req)
if tt.expectDebug {
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
}
var response api.DebugTemplateResponse
if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
t.Fatalf("failed to unmarshal response: %v", err)
}
if response.Model != tt.request.Model {
t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
}
if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
}
if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
}
} else {
// When debug is disabled, it should attempt normal processing
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
}
}
})
}
}
}