Compare commits


1 Commit

Author SHA1 Message Date
jmorganca  19638cec55  add docs.json  2025-08-17 13:12:39 -07:00
11 changed files with 96 additions and 684 deletions

View File

@@ -411,8 +411,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [ollama launcher](https://github.com/NGC13009/ollama-launcher) (A launcher for Ollama, aiming to provide users with convenient functions such as ollama server launching, management, or configuration.)
- [ai-hub](https://github.com/Aj-Seven/ai-hub) (AI Hub supports multiple models via API keys and Chat support via Ollama API.)
- [Mayan EDMS](https://gitlab.com/mayan-edms/mayan-edms) (Open source document management system to organize, tag, search, and automate your files with powerful Ollama driven workflows.)
- [Serene Pub](https://github.com/doolijb/serene-pub) (Beginner friendly, open source AI Roleplaying App for Windows, Mac OS and Linux. Search, download and use models with Ollama all inside the app.)
- [Andes](https://github.com/aqerd/andes) (A Visual Studio Code extension that provides a local UI interface for Ollama models)
### Cloud
@@ -539,8 +537,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Nichey](https://github.com/goodreasonai/nichey) is a Python package for generating custom wikis for your research topic
- [Ollama for D](https://github.com/kassane/ollama-d)
- [OllamaPlusPlus](https://github.com/HardCodeDev777/OllamaPlusPlus) (Very simple C++ library for Ollama)
- [any-llm](https://github.com/mozilla-ai/any-llm) (A single interface to use different llm providers by [mozilla.ai](https://www.mozilla.ai/))
- [any-agent](https://github.com/mozilla-ai/any-agent) (A single interface to use and evaluate different agent frameworks by [mozilla.ai](https://www.mozilla.ai/))
### Mobile

View File

@@ -90,10 +90,6 @@ type GenerateRequest struct {
// (request that thinking _not_ be used) and unset (use the old behavior
// before this option was introduced)
Think *ThinkValue `json:"think,omitempty"`
-// DebugRenderOnly is a debug option that, when set to true, returns the rendered
-// template instead of calling the model.
-DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
}
// ChatRequest describes a request sent by [Client.Chat].
@@ -124,10 +120,6 @@ type ChatRequest struct {
// responding. Can be a boolean (true/false) or a string ("high", "medium", "low")
// for supported models.
Think *ThinkValue `json:"think,omitempty"`
-// DebugRenderOnly is a debug option that, when set to true, returns the rendered
-// template instead of calling the model.
-DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
}
type Tools []Tool
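For context on the `think` field kept here: per the comment above, it accepts a boolean or, for supported models, an effort string ("high", "medium", "low"). A minimal sketch of a chat request carrying a string think level, sent to the standard /api/chat endpoint (the model name is a placeholder):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// "think" may be a boolean or, for supported models, a string
	// effort level, matching the ThinkValue field shown above.
	body, _ := json.Marshal(map[string]any{
		"model": "gpt-oss", // placeholder model name
		"messages": []map[string]string{
			{"role": "user", "content": "Why is the sky blue?"},
		},
		"think": "high",
	})
	resp, err := http.Post("http://localhost:11434/api/chat", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}
```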
@@ -316,19 +308,6 @@ type ChatResponse struct {
Metrics
}
-// DebugInfo contains debug information for template rendering
-type DebugInfo struct {
-RenderedTemplate string `json:"rendered_template"`
-ImageCount int `json:"image_count,omitempty"`
-}
-// DebugTemplateResponse is returned when _debug_render_only is set to true
-type DebugTemplateResponse struct {
-Model string `json:"model"`
-CreatedAt time.Time `json:"created_at"`
-DebugInfo DebugInfo `json:"_debug_info"`
-}
type Metrics struct {
TotalDuration time.Duration `json:"total_duration,omitempty"`
LoadDuration time.Duration `json:"load_duration,omitempty"`

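For reference, the removed `_debug_render_only` flag returned the rendered prompt template instead of running the model. A sketch of the request and response shapes against a build that still has the feature, with the struct definitions copied from the removed types above:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"time"
)

// Response shapes copied from the removed api types above.
type DebugInfo struct {
	RenderedTemplate string `json:"rendered_template"`
	ImageCount       int    `json:"image_count,omitempty"`
}

type DebugTemplateResponse struct {
	Model     string    `json:"model"`
	CreatedAt time.Time `json:"created_at"`
	DebugInfo DebugInfo `json:"_debug_info"`
}

func main() {
	body, _ := json.Marshal(map[string]any{
		"model":              "test-model", // placeholder
		"prompt":             "Hello, world!",
		"_debug_render_only": true,
	})
	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out DebugTemplateResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println(out.DebugInfo.RenderedTemplate)
}
```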
docs/docs.json (new file, 75 additions)
View File

@@ -0,0 +1,75 @@
{
"$schema": "https://mintlify.com/docs.json",
"theme": "mint",
"background": {
"color": {
"light": "#ffffff",
"dark": "#000000"
}
},
"appearance": {
"default": "light"
},
"styling": {
"codeblocks": "system"
},
"contextual": {
"options": ["copy", "chatgpt", "claude", "view"]
},
"fonts": {
"heading": {
"family": "Inter"
},
"body": {
"family": "Inter"
}
},
"name": "Ollama",
"colors": {
"primary": "#000",
"light": "#b5b5b5",
"dark": "#fff"
},
"favicon": "/ollama.png",
"logo": {
"light": "/ollama.png",
"dark": "/favicon.svg"
},
"navigation": {
"tabs": [
{
"tab": "Documentation",
"groups": [
{
"group": "Home",
"pages": ["index", "quickstart", "faq", "troubleshooting"]
},
{
"group": "Platforms",
"pages": ["linux", "windows", "docker"]
},
{
"group": "Features",
"pages": [
"modelfile",
"apis",
"openai",
"import",
"gpu",
"benchmark"
]
}
]
},
{
"tab": "Development",
"groups": [
{
"group": " ",
"pages": ["development", "examples", "template"]
}
]
}
]
}
}
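docs.json is a Mintlify site configuration; the navigation block maps tabs to groups of page slugs. A sketch of reading that section back in Go (the struct names are illustrative, not any Mintlify API):

```go
package main

import (
	"encoding/json"
	"fmt"
	"os"
)

// docsConfig is a minimal mirror of the navigation section of the
// docs.json above; type names are illustrative.
type docsConfig struct {
	Name       string `json:"name"`
	Navigation struct {
		Tabs []struct {
			Tab    string `json:"tab"`
			Groups []struct {
				Group string   `json:"group"`
				Pages []string `json:"pages"`
			} `json:"groups"`
		} `json:"tabs"`
	} `json:"navigation"`
}

func main() {
	raw, err := os.ReadFile("docs/docs.json")
	if err != nil {
		panic(err)
	}
	var cfg docsConfig
	if err := json.Unmarshal(raw, &cfg); err != nil {
		panic(err)
	}
	for _, tab := range cfg.Navigation.Tabs {
		for _, g := range tab.Groups {
			fmt.Printf("%s / %s: %v\n", tab.Tab, g.Group, g.Pages)
		}
	}
}
```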

View File

@@ -962,7 +962,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;
-const bool output_all = false;
+// when computing embeddings, all tokens are output
+const bool output_all = cparams.embeddings;
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
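The restored line ties output selection to the embeddings flag: embedding requests need an output row for every token, while plain generation only needs the final position's logits. A small Go sketch of that selection rule (names are illustrative, not llama.cpp's API):

```go
package main

import "fmt"

// outputMask mirrors the idea behind output_all: when computing
// embeddings, every position of an n-token batch produces an
// output; for generation, only the last position does.
func outputMask(n int, embeddings bool) []bool {
	mask := make([]bool, n)
	if embeddings {
		for i := range mask {
			mask[i] = true // output_all
		}
		return mask
	}
	if n > 0 {
		mask[n-1] = true // only the last token's logits are needed
	}
	return mask
}

func main() {
	fmt.Println(outputMask(4, true))  // [true true true true]
	fmt.Println(outputMask(4, false)) // [false false false true]
}
```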

View File

@@ -13,7 +13,7 @@ checks.
1 file changed, 18 insertions(+)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
-index 57eae461..c7f9dc3a 100644
+index 57eae461..9db0c8b5 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2671,12 +2671,24 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud

View File

@@ -1,23 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <git@mxy.ng>
Date: Mon, 18 Aug 2025 16:58:39 -0700
Subject: [PATCH] decode: disable output_all
---
src/llama-context.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 26a5cf9c..6ece5263 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -962,8 +962,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;
- // when computing embeddings, all tokens are output
- const bool output_all = cparams.embeddings;
+ const bool output_all = false;
if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);

View File

@@ -651,9 +651,7 @@ func (s *ollamaServer) Load(ctx context.Context, gpus discover.GpuInfoList, requ
if !success {
s.initModel(ctx, LoadRequest{}, LoadOperationClose)
}
-if s.mem != nil {
-s.mem.Log(slog.LevelInfo)
-}
+s.mem.Log(slog.LevelInfo)
}()
slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU)

View File

@@ -2,7 +2,6 @@ package server
import (
"context"
"fmt"
"log/slog"
"slices"
"strings"
@@ -276,9 +275,8 @@ const (
// HarmonyMessageHandler processes harmony events and accumulates content appropriately.
// This is a higher level interface that maps harmony concepts into ollama concepts
type HarmonyMessageHandler struct {
-state harmonyMessageState
-harmonyParser *HarmonyParser
-functionNameMap *FunctionNameMap
+state harmonyMessageState
+harmonyParser *HarmonyParser
}
// NewHarmonyMessageHandler creates a new message handler
@@ -290,7 +288,6 @@ func NewHarmonyMessageHandler() *HarmonyMessageHandler {
MessageEndTag: "<|end|>",
HeaderEndTag: "<|message|>",
},
-functionNameMap: NewFunctionNameMap(),
}
}
@@ -381,97 +378,3 @@ func (a *HarmonyToolCallAccumulator) Drain() (*string, string) {
func (a *HarmonyToolCallAccumulator) Content() string {
return a.acc.String()
}
// FunctionNameMap maps a user-specified function name to a valid function
// name for harmony (which look like TypeScript identifiers). This is needed to
// transform user-specified function names, which might contain characters that
// are not allowed in TypeScript identifiers
type FunctionNameMap struct {
userToHarmony map[string]string
harmonyToUser map[string]string
}
func NewFunctionNameMap() *FunctionNameMap {
return &FunctionNameMap{
userToHarmony: make(map[string]string),
harmonyToUser: make(map[string]string),
}
}
func (m *FunctionNameMap) ConvertAndAdd(userFunctionName string) string {
harmonyFunctionName := m.deriveName(userFunctionName)
m.userToHarmony[userFunctionName] = harmonyFunctionName
m.harmonyToUser[harmonyFunctionName] = userFunctionName
return harmonyFunctionName
}
// OriginalFromConverted looks up the reverse-mapping of a previously-converted
// user->harmony function name. To unmap reliably, the mapping must exist, as
// the conversion process is not reversible without the appropriate state
func (m *FunctionNameMap) OriginalFromConverted(harmonyFunctionName string) string {
if userFunctionName, ok := m.harmonyToUser[harmonyFunctionName]; ok {
return userFunctionName
}
slog.Warn("harmony parser: no reverse mapping found for function name", "harmonyFunctionName", harmonyFunctionName)
// fallback to the original function name if we can't find a mapping
return harmonyFunctionName
}
// convertToValidChars converts a user-specified function name to a valid
// TypeScript identifier.
//
// Limitations:
//
// - This doesn't restrict reserved TypeScript keywords.
// - We don't perform a real ID_Start/ID_Continue check, and instead use the more
// restrictive unicode.IsLetter/unicode.IsDigit check. Unclear what kind of
// identifiers these models were trained on, so in the end we might want to
// convert unicode-heavy identifiers to their closest ASCII equivalents.
func (m *FunctionNameMap) convertToValidChars(userFunctionName string) string {
mapper := func(r rune) rune {
// first, replace certain characters with underscores
if r == ' ' || r == '-' || r == '.' {
return '_'
}
if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '_' || r == '$' {
return r
}
// finally, remove any other characters
return -1
}
candidate := strings.Map(mapper, userFunctionName)
// set a default name if we end up with nothing left
if candidate == "" {
return "unnamed"
}
// if the candidate starts with a number, prepend an underscore to make it a
// valid identifier
if unicode.IsDigit(rune(candidate[0])) {
candidate = "_" + candidate
}
return candidate
}
func (m *FunctionNameMap) deriveName(userFunctionName string) string {
originalCandidate := m.convertToValidChars(userFunctionName)
candidate := originalCandidate
// Check for dupes, and if so, add a number to the end.
// We start at 2 because if we have dupes and the first is never renamed, it
// makes sense for them to be named, say, `f`, `f_2`, `f_3`
count := 2
for {
if _, exists := m.harmonyToUser[candidate]; !exists {
break
}
candidate = fmt.Sprintf("%s_%d", originalCandidate, count)
count++
}
return candidate
}
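For reference, a short round-trip sketch of how the removed FunctionNameMap was used, relying on the types shown above (the model call in between is elided):

```go
// Round trip through the FunctionNameMap defined above.
func exampleRoundTrip() {
	m := NewFunctionNameMap()

	// Outbound: rewrite the user-supplied name into a valid
	// harmony (TypeScript-like) identifier before prompting.
	harmonyName := m.ConvertAndAdd("get current weather") // "get_current_weather"

	// Inbound: map the name in the model's tool call back to
	// the name the user originally registered.
	userName := m.OriginalFromConverted(harmonyName) // "get current weather"
	_ = userName
}
```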

View File

@@ -467,71 +467,3 @@ func TestHarmonyParserStreaming(t *testing.T) {
})
}
}
// TestFunctionConvertToValidChars tests only FunctionNameMap.convert(), which doesn't
// handle any saving (and therefore no dupe handling)
func TestFunctionConvertToValidChars(t *testing.T) {
tests := []struct {
name string
in string
want string
}{
{name: "replace spaces with underscores", in: "get weather", want: "get_weather"},
{name: "replace hyphens with underscores", in: "get-weather", want: "get_weather"},
{name: "replace periods with underscores", in: "get.weather", want: "get_weather"},
{name: "disallow non-word characters", in: "get weather!", want: "get_weather"},
{name: "strip out invalid non-alphanumeric unicode characters", in: "a🫠bc", want: "abc"},
{name: "names that only contain invalid characters", in: "🫠", want: "unnamed"},
{name: "leading number", in: "123", want: "_123"},
{name: "$ allowed", in: "$", want: "$"},
// show that we allow weird unicode letter characters, though we might want
// to convert them to their closest ASCII equivalents in the future
{name: "allow weird unicode letter characters", in: "𝓸𝓵𝓵𝓪𝓶𝓪", want: "𝓸𝓵𝓵𝓪𝓶𝓪"},
// names that look like words but are invalid (i.e., not ID_Start/ID_Continue)
{name: "disallow non-word characters that look like words", in: "ⓞⓛⓛⓐⓜⓐ123", want: "_123"},
}
for i, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
parser := NewFunctionNameMap()
got := parser.convertToValidChars(tt.in)
if got != tt.want {
t.Errorf("case %d: got %q, want %q", i, got, tt.want)
}
})
}
}
func TestFunctionConvertAndAdd(t *testing.T) {
// make a fresh map for each test, but within a test use the same map so we can test for dupe handling
tests := []struct {
name string
in []string
want []string
}{
{name: "basic dupe handling", in: []string{"get weather", "get weather"}, want: []string{"get_weather", "get_weather_2"}},
{name: "dupes from different user-specified names", in: []string{"get weather", "get_weather", "get-weather"}, want: []string{"get_weather", "get_weather_2", "get_weather_3"}},
{name: "non dupes after dupes", in: []string{"get weather", "get_weather", "get-weather", "something-different"}, want: []string{"get_weather", "get_weather_2", "get_weather_3", "something_different"}},
{name: "multiple sets of dupes", in: []string{"a", "a", "b", "a", "a", "b", "a"}, want: []string{"a", "a_2", "b", "a_3", "a_4", "b_2", "a_5"}},
}
for i, tt := range tests {
parser := NewFunctionNameMap()
t.Run(tt.name, func(t *testing.T) {
for j, in := range tt.in {
got := parser.ConvertAndAdd(in)
want := tt.want[j]
if got != want {
t.Errorf("case %d: got %q, want %q", i, got, want)
}
// check that the maps are correct
if parser.userToHarmony[in] != want {
t.Errorf("case %d: userToHarmony[%q] = %q, want %q", i, in, parser.userToHarmony[in], want)
}
if parser.harmonyToUser[want] != in {
t.Errorf("case %d: harmonyToUser[%q] = %q, want %q", i, want, parser.harmonyToUser[want], in)
}
}
})
}
}

View File

@@ -314,19 +314,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
prompt = b.String()
}
// If debug mode is enabled, return the rendered template instead of calling the model
if req.DebugRenderOnly {
c.JSON(http.StatusOK, api.DebugTemplateResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
DebugInfo: api.DebugInfo{
RenderedTemplate: prompt,
ImageCount: len(images),
},
})
return
}
var thinkingState *thinking.Parser
if !useHarmony {
openingTag, closingTag := thinking.InferTags(m.Template.Template)
@@ -1603,12 +1590,24 @@ func (s *Server) ChatHandler(c *gin.Context) {
}
msgs = filterThinkTags(msgs, m)
var harmonyMessageHandler *HarmonyMessageHandler
var harmonyToolParser *HarmonyToolCallAccumulator
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools, req.Think)
if err != nil {
slog.Error("chat prompt error", "error", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
useHarmony := shouldUseHarmony(*m)
processedTools := req.Tools
// Validate Think value: string values currently only allowed for gptoss models
if req.Think != nil && req.Think.IsString() && !useHarmony {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
return
}
var harmonyMessageHandler *HarmonyMessageHandler
var harmonyToolParser *HarmonyToolCallAccumulator
if useHarmony {
harmonyMessageHandler = NewHarmonyMessageHandler()
var lastMessage *api.Message
@@ -1617,40 +1616,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
}
harmonyMessageHandler.harmonyParser.AddImplicitStartOrPrefill(lastMessage)
harmonyToolParser = harmonyMessageHandler.CreateToolParser()
// make a copy of tools to pass to the chat prompt. Function names may be
// renamed to be valid Harmony function names.
processedTools = make([]api.Tool, len(req.Tools))
copy(processedTools, req.Tools)
for i, tool := range processedTools {
processedTools[i].Function.Name = harmonyMessageHandler.functionNameMap.ConvertAndAdd(tool.Function.Name)
}
}
prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, processedTools, req.Think)
if err != nil {
slog.Error("chat prompt error", "error", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return
}
// If debug mode is enabled, return the rendered template instead of calling the model
if req.DebugRenderOnly {
c.JSON(http.StatusOK, api.DebugTemplateResponse{
Model: req.Model,
CreatedAt: time.Now().UTC(),
DebugInfo: api.DebugInfo{
RenderedTemplate: prompt,
ImageCount: len(images),
},
})
return
}
// Validate Think value: string values currently only allowed for gptoss models
if req.Think != nil && req.Think.IsString() && !useHarmony {
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
return
}
var thinkingState *thinking.Parser
@@ -1705,7 +1670,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
toolName, toolContent := harmonyToolParser.Drain()
if toolName != nil {
*toolName = strings.TrimPrefix(*toolName, "functions.")
*toolName = harmonyMessageHandler.functionNameMap.OriginalFromConverted(*toolName)
var args api.ToolCallFunctionArguments
if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())

View File

@@ -1,413 +0,0 @@
package server
import (
"bytes"
"encoding/json"
"net/http"
"testing"
"time"
"github.com/gin-gonic/gin"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/discover"
"github.com/ollama/ollama/fs/ggml"
"github.com/ollama/ollama/llm"
)
func TestGenerateDebugRenderOnly(t *testing.T) {
gin.SetMode(gin.TestMode)
mock := mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
},
}
s := Server{
sched: &Scheduler{
pendingReqCh: make(chan *LlmRequest, 1),
finishedReqCh: make(chan *LlmRequest, 1),
expiredCh: make(chan *runnerRef, 1),
unloadedCh: make(chan any, 1),
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}
go s.sched.Run(t.Context())
// Create a test model
stream := false
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
"llama.embedding_length": uint32(4096),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
})
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test-model",
Files: map[string]string{"file.gguf": digest},
Template: "{{ .Prompt }}",
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
tests := []struct {
name string
request api.GenerateRequest
expectDebug bool
expectTemplate string
expectNumImages int
}{
{
name: "debug render only enabled",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Hello, world!",
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "Hello, world!",
},
{
name: "debug render only disabled",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Hello, world!",
DebugRenderOnly: false,
},
expectDebug: false,
},
{
name: "debug render only with system prompt",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "User question",
System: "You are a helpful assistant",
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "User question",
},
{
name: "debug render only with template",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Hello",
Template: "PROMPT: {{ .Prompt }}",
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "PROMPT: Hello",
},
{
name: "debug render only with images",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Describe this image",
Images: []api.ImageData{[]byte("fake-image-data")},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "[img-0]\n\nDescribe this image",
expectNumImages: 1,
},
{
name: "debug render only with raw mode",
request: api.GenerateRequest{
Model: "test-model",
Prompt: "Raw prompt text",
Raw: true,
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "Raw prompt text",
},
}
for _, tt := range tests {
// Test both with and without streaming
streamValues := []bool{false, true}
for _, stream := range streamValues {
streamSuffix := ""
if stream {
streamSuffix = " (streaming)"
}
t.Run(tt.name+streamSuffix, func(t *testing.T) {
req := tt.request
req.Stream = &stream
w := createRequest(t, s.GenerateHandler, req)
if tt.expectDebug {
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
}
var response api.DebugTemplateResponse
if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
t.Fatalf("failed to unmarshal response: %v", err)
}
if response.Model != tt.request.Model {
t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
}
if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
}
if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
}
} else {
// When debug is disabled, it should attempt normal processing
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
}
}
})
}
}
}
func TestChatDebugRenderOnly(t *testing.T) {
gin.SetMode(gin.TestMode)
mock := mockRunner{
CompletionResponse: llm.CompletionResponse{
Done: true,
DoneReason: llm.DoneReasonStop,
PromptEvalCount: 1,
PromptEvalDuration: 1,
EvalCount: 1,
EvalDuration: 1,
},
}
s := Server{
sched: &Scheduler{
pendingReqCh: make(chan *LlmRequest, 1),
finishedReqCh: make(chan *LlmRequest, 1),
expiredCh: make(chan *runnerRef, 1),
unloadedCh: make(chan any, 1),
loaded: make(map[string]*runnerRef),
newServerFn: newMockServer(&mock),
getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
// add small delay to simulate loading
time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{
llama: &mock,
}
return false
},
},
}
go s.sched.Run(t.Context())
// Create a test model
stream := false
_, digest := createBinFile(t, ggml.KV{
"general.architecture": "llama",
"llama.block_count": uint32(1),
"llama.context_length": uint32(8192),
"llama.embedding_length": uint32(4096),
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
"tokenizer.ggml.tokens": []string{""},
"tokenizer.ggml.scores": []float32{0},
"tokenizer.ggml.token_type": []int32{0},
}, []*ggml.Tensor{
{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_gate.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_up.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.ffn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_k.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_q.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "blk.0.attn_v.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
{Name: "output.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
})
w := createRequest(t, s.CreateHandler, api.CreateRequest{
Model: "test-model",
Files: map[string]string{"file.gguf": digest},
Template: "{{ if .Tools }}{{ .Tools }}{{ end }}{{ range .Messages }}{{ .Role }}: {{ .Content }}\n{{ end }}",
Stream: &stream,
})
if w.Code != http.StatusOK {
t.Fatalf("expected status 200, got %d", w.Code)
}
tests := []struct {
name string
request api.ChatRequest
expectDebug bool
expectTemplate string
expectNumImages int
}{
{
name: "chat debug render only enabled",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "system", Content: "You are a helpful assistant"},
{Role: "user", Content: "Hello"},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "system: You are a helpful assistant\nuser: Hello\n",
},
{
name: "chat debug render only disabled",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "user", Content: "Hello"},
},
DebugRenderOnly: false,
},
expectDebug: false,
},
{
name: "chat debug with assistant message",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "user", Content: "Hello"},
{Role: "assistant", Content: "Hi there!"},
{Role: "user", Content: "How are you?"},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "user: Hello\nassistant: Hi there!\nuser: How are you?\n",
},
{
name: "chat debug with images",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{
Role: "user",
Content: "What's in this image?",
Images: []api.ImageData{[]byte("fake-image-data")},
},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "user: [img-0]What's in this image?\n",
expectNumImages: 1,
},
{
name: "chat debug with tools",
request: api.ChatRequest{
Model: "test-model",
Messages: []api.Message{
{Role: "user", Content: "Get the weather"},
},
Tools: api.Tools{
{
Type: "function",
Function: api.ToolFunction{
Name: "get_weather",
Description: "Get weather information",
},
},
},
DebugRenderOnly: true,
},
expectDebug: true,
expectTemplate: "[{\"type\":\"function\",\"function\":{\"name\":\"get_weather\",\"description\":\"Get weather information\",\"parameters\":{\"type\":\"\",\"required\":null,\"properties\":null}}}]user: Get the weather\n",
},
}
for _, tt := range tests {
// Test both with and without streaming
streamValues := []bool{false, true}
for _, stream := range streamValues {
streamSuffix := ""
if stream {
streamSuffix = " (streaming)"
}
t.Run(tt.name+streamSuffix, func(t *testing.T) {
req := tt.request
req.Stream = &stream
w := createRequest(t, s.ChatHandler, req)
if tt.expectDebug {
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d, body: %s", http.StatusOK, w.Code, w.Body.String())
}
var response api.DebugTemplateResponse
if err := json.Unmarshal(w.Body.Bytes(), &response); err != nil {
t.Fatalf("failed to unmarshal response: %v", err)
}
if response.Model != tt.request.Model {
t.Errorf("expected model %s, got %s", tt.request.Model, response.Model)
}
if tt.expectTemplate != "" && response.DebugInfo.RenderedTemplate != tt.expectTemplate {
t.Errorf("expected template %q, got %q", tt.expectTemplate, response.DebugInfo.RenderedTemplate)
}
if tt.expectNumImages > 0 && response.DebugInfo.ImageCount != tt.expectNumImages {
t.Errorf("expected image count %d, got %d", tt.expectNumImages, response.DebugInfo.ImageCount)
}
} else {
// When debug is disabled, it should attempt normal processing
if w.Code != http.StatusOK {
t.Errorf("expected status %d, got %d", http.StatusOK, w.Code)
}
}
})
}
}
}