Compare commits
10 Commits
mxyng/expa...hoyyeva/ad

| Author | SHA1 | Date |
|---|---|---|
|  | 5aee34db9f |  |
|  | 32393f11d7 |  |
|  | 604e43b28d |  |
|  | 53985b3c4d |  |
|  | b6e02cbbd2 |  |
|  | 91935631ac |  |
|  | 8de30b568a |  |
|  | 485da9fd35 |  |
|  | 0796d79d19 |  |
|  | 92981ae3f2 |  |
```diff
@@ -25,7 +25,7 @@ declare module "@/gotypes" {
   }
 
   Model.prototype.isCloud = function (): boolean {
-    return this.model.endsWith("cloud");
+    return this.model.endsWith("cloud") || this.model === "gemini-3-pro-preview";
   };
 
   // Helper function to convert Uint8Array to base64
```
```diff
@@ -14,8 +14,8 @@ describe("Model merging logic", () => {
     const merged = mergeModels(localModels);
 
     // First verify cloud models are first and in FEATURED_MODELS order
-    const cloudModels = FEATURED_MODELS.filter((m: string) =>
-      m.endsWith("cloud"),
+    const cloudModels = FEATURED_MODELS.filter(
+      (m: string) => m.endsWith("cloud") || m === "gemini-3-pro-preview",
     );
     for (let i = 0; i < cloudModels.length; i++) {
       expect(merged[i].model).toBe(cloudModels[i]);
@@ -24,7 +24,7 @@ describe("Model merging logic", () => {
 
     // Then verify non-cloud featured models are next and in FEATURED_MODELS order
     const nonCloudFeatured = FEATURED_MODELS.filter(
-      (m: string) => !m.endsWith("cloud"),
+      (m: string) => !m.endsWith("cloud") && m !== "gemini-3-pro-preview",
     );
     for (let i = 0; i < nonCloudFeatured.length; i++) {
       const model = merged[i + cloudModels.length];
@@ -54,9 +54,9 @@ describe("Model merging logic", () => {
     const cloudModels = merged.filter((m) => m.isCloud());
     expect(cloudModels.length).toBe(0);
 
-    // Should have non-cloud featured models
+    // Should have non-cloud featured models (excluding gemini-3-pro-preview which is treated as cloud)
     const nonCloudFeatured = FEATURED_MODELS.filter(
-      (m) => !m.endsWith("cloud"),
+      (m) => !m.endsWith("cloud") && m !== "gemini-3-pro-preview",
     );
     for (let i = 0; i < nonCloudFeatured.length; i++) {
       const model = merged[i];
@@ -74,7 +74,9 @@ describe("Model merging logic", () => {
     const merged = mergeModels([]);
 
     // First verify cloud models are first and in FEATURED_MODELS order
-    const cloudModels = FEATURED_MODELS.filter((m) => m.endsWith("cloud"));
+    const cloudModels = FEATURED_MODELS.filter(
+      (m) => m.endsWith("cloud") || m === "gemini-3-pro-preview",
+    );
     for (let i = 0; i < cloudModels.length; i++) {
       expect(merged[i].model).toBe(cloudModels[i]);
       expect(merged[i].isCloud()).toBe(true);
@@ -82,7 +84,7 @@ describe("Model merging logic", () => {
 
     // Then verify non-cloud featured models are next and in FEATURED_MODELS order
     const nonCloudFeatured = FEATURED_MODELS.filter(
-      (m) => !m.endsWith("cloud"),
+      (m) => !m.endsWith("cloud") && m !== "gemini-3-pro-preview",
     );
     for (let i = 0; i < nonCloudFeatured.length; i++) {
       const model = merged[i + cloudModels.length];
@@ -104,7 +106,9 @@ describe("Model merging logic", () => {
     const merged = mergeModels(localModels);
 
     // First verify cloud models are first and in FEATURED_MODELS order
-    const cloudModels = FEATURED_MODELS.filter((m) => m.endsWith("cloud"));
+    const cloudModels = FEATURED_MODELS.filter(
+      (m) => m.endsWith("cloud") || m === "gemini-3-pro-preview",
+    );
     for (let i = 0; i < cloudModels.length; i++) {
       expect(merged[i].model).toBe(cloudModels[i]);
       expect(merged[i].isCloud()).toBe(true);
@@ -112,7 +116,7 @@ describe("Model merging logic", () => {
 
     // Then verify non-cloud featured models are next and in FEATURED_MODELS order
     const nonCloudFeatured = FEATURED_MODELS.filter(
-      (m) => !m.endsWith("cloud"),
+      (m) => !m.endsWith("cloud") && m !== "gemini-3-pro-preview",
     );
     for (let i = 0; i < nonCloudFeatured.length; i++) {
       const model = merged[i + cloudModels.length];
```
```diff
@@ -4,6 +4,7 @@ import { Model } from "@/gotypes";
 export const FEATURED_MODELS = [
   "gpt-oss:120b-cloud",
   "gpt-oss:20b-cloud",
+  "gemini-3-pro-preview",
   "deepseek-v3.1:671b-cloud",
   "qwen3-coder:480b-cloud",
   "qwen3-vl:235b-cloud",
@@ -40,7 +41,9 @@ export function mergeModels(
   const cloudModels = [...allModels.filter((m) => m.isCloud())];
 
   // Add any cloud models from FEATURED_MODELS that aren't in local models
-  FEATURED_MODELS.filter((f) => f.endsWith("cloud")).forEach((cloudModel) => {
+  FEATURED_MODELS.filter(
+    (f) => f.endsWith("cloud") || f === "gemini-3-pro-preview",
+  ).forEach((cloudModel) => {
     if (!cloudModels.some((m) => m.model === cloudModel)) {
       cloudModels.push(new Model({ model: cloudModel }));
     }
@@ -48,7 +51,7 @@ export function mergeModels(
 
   // 2. Get other featured models (non-cloud)
   const featuredModels = FEATURED_MODELS.filter(
-    (f) => !f.endsWith("cloud"),
+    (f) => !f.endsWith("cloud") && f !== "gemini-3-pro-preview",
   ).map((model) => {
     // Check if this model exists in local models
     const localMatch = allModels.find(
```
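Taken together, FEATURED_MODELS and mergeModels now treat gemini-3-pro-preview as an honorary cloud model throughout. A standalone sketch of the grouping rule, written in Go for consistency with the rest of this comparison (the app code above is TypeScript, and the non-cloud entry "llama3.2" below is hypothetical, standing in for featured entries not shown in the excerpt):

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical featured list; "llama3.2" is a stand-in non-cloud entry.
	featured := []string{
		"gpt-oss:120b-cloud", "gemini-3-pro-preview", "llama3.2",
	}

	// gemini-3-pro-preview is grouped with the cloud models even though
	// its name lacks the "cloud" suffix.
	isCloudGroup := func(m string) bool {
		return strings.HasSuffix(m, "cloud") || m == "gemini-3-pro-preview"
	}

	var cloud, nonCloud []string
	for _, m := range featured {
		if isCloudGroup(m) {
			cloud = append(cloud, m)
		} else {
			nonCloud = append(nonCloud, m)
		}
	}
	fmt.Println(cloud)    // [gpt-oss:120b-cloud gemini-3-pro-preview]
	fmt.Println(nonCloud) // [llama3.2]
}
```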
```diff
@@ -206,6 +206,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
 		conv = &commandrModel{}
 	case "GptOssForCausalLM":
 		conv = &gptossModel{}
+	case "DeepseekOCRForCausalLM":
+		conv = &deepseekocr{}
 	default:
 		return fmt.Errorf("unsupported architecture %q", p.Architectures[0])
 	}
```
convert/convert_deepseekocr.go (new file, 136 lines)

```go
package convert

import (
	"fmt"

	"github.com/ollama/ollama/fs/ggml"
)

type deepseekocr struct {
	ModelParameters
	LanguageConfig struct {
		MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
		HiddenSize            uint32 `json:"hidden_size"`
		HiddenLayers          uint32 `json:"num_hidden_layers"`
		IntermediateSize      uint32 `json:"intermediate_size"`
		NumAttentionHeads     uint32 `json:"num_attention_heads"`
		NumKeyValueHeads      uint32 `json:"num_key_value_heads"`
		NumRoutedExperts      uint32 `json:"n_routed_experts"`
		NumSharedExperts      uint32 `json:"n_shared_experts"`
		NumExpertsPerToken    uint32 `json:"num_experts_per_tok"`
		FirstKDenseReplace    uint32 `json:"first_k_dense_replace"`
	} `json:"language_config"`

	VisionConfig struct {
		ImageSize uint32 `json:"image_size"`
		Width     struct {
			Vision struct {
				Heads     uint32 `json:"heads"`
				ImageSize uint32 `json:"image_size"`
				Layers    uint32 `json:"layers"`
				PatchSize uint32 `json:"patch_size"`
				Width     uint32 `json:"width"`
			} `json:"clip-l-14-224"`
			Sam struct {
				GlobalAttentionIndexes []int32 `json:"global_attn_indexes"`
				Heads                  uint32  `json:"heads"`
				Layers                 uint32  `json:"layers"`
				Width                  uint32  `json:"width"`
			} `json:"sam_vit_b"`
		}
	} `json:"vision_config"`
}

func (m *deepseekocr) KV(t *Tokenizer) ggml.KV {
	kv := m.ModelParameters.KV(t)
	kv["general.architecture"] = "deepseekocr"
	kv["block_count"] = m.LanguageConfig.HiddenLayers
	kv["context_length"] = m.LanguageConfig.MaxPositionEmbeddings
	kv["embedding_length"] = m.LanguageConfig.HiddenSize
	kv["feed_forward_length"] = m.LanguageConfig.IntermediateSize
	kv["attention.head_count"] = m.LanguageConfig.NumAttentionHeads
	kv["attention.head_count_kv"] = m.LanguageConfig.NumKeyValueHeads
	kv["expert_count"] = m.LanguageConfig.NumRoutedExperts
	kv["expert_used_count"] = m.LanguageConfig.NumExpertsPerToken
	kv["leading_dense_block_count"] = m.LanguageConfig.FirstKDenseReplace

	kv["vision.block_count"] = m.VisionConfig.Width.Vision.Layers
	kv["vision.embedding_length"] = m.VisionConfig.Width.Vision.Width
	kv["vision.head_count"] = m.VisionConfig.Width.Vision.Heads
	kv["vision.image_size"] = m.VisionConfig.Width.Vision.ImageSize
	kv["vision.patch_size"] = m.VisionConfig.Width.Vision.PatchSize

	kv["sam.block_count"] = m.VisionConfig.Width.Sam.Layers
	kv["sam.embedding_length"] = m.VisionConfig.Width.Sam.Width
	kv["sam.head_count"] = m.VisionConfig.Width.Sam.Heads
	kv["sam.global_attention_indexes"] = m.VisionConfig.Width.Sam.GlobalAttentionIndexes
	return kv
}

func (m *deepseekocr) Tensors(s []Tensor) (out []*ggml.Tensor) {
	merges := make([]merge, m.LanguageConfig.HiddenLayers*3)
	for i := range m.LanguageConfig.HiddenLayers {
		merges[i*3+0] = merge{
			fmt.Sprintf("blk.%d.mlp.experts.*.gate_proj.weight", i),
			fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
		}
		merges[i*3+1] = merge{
			fmt.Sprintf("blk.%d.mlp.experts.*.up_proj.weight", i),
			fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
		}
		merges[i*3+2] = merge{
			fmt.Sprintf("blk.%d.mlp.experts.*.down_proj.weight", i),
			fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
		}
	}

	out, s = mergeTensors(s, merges...)
	for _, t := range s {
		out = append(out, &ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),
			WriterTo: t,
		})
	}
	return out
}

func (m *deepseekocr) Replacements() []string {
	return []string{
		"model.embed_tokens", "token_embd",
		"model.layers", "blk",
		"input_layernorm", "attn_norm",
		"self_attn.q_proj", "attn_q",
		"self_attn.k_proj", "attn_k",
		"self_attn.v_proj", "attn_v",
		"self_attn.o_proj", "attn_output",
		"post_attention_layernorm", "ffn_norm",
		"mlp.gate_proj", "ffn_gate",
		"mlp.up_proj", "ffn_up",
		"mlp.down_proj", "ffn_down",
		"mlp.gate", "ffn_gate_inp",
		"mlp.shared_experts.gate_proj", "ffn_gate_shexp",
		"mlp.shared_experts.up_proj", "ffn_up_shexp",
		"mlp.shared_experts.down_proj", "ffn_down_shexp",
		"model.norm", "output_norm",
		"lm_head", "output",

		"model.vision_model", "v",
		"embeddings.patch_embedding", "patch_embd",
		"embeddings.class_embedding", "class_embd",
		"embeddings.position_embedding", "position_embd",
		"transformer.layers", "blk",

		"model.projector", "mm",
		"model.image_newline", "mm.image_newline",
		//nolint:misspell // this misspelling is upstream. fixing it breaks the model
		"model.view_seperator", "mm.view_seperator",

		"model.sam_model.patch_embed.proj", "s.patch_embd",
		"model.sam_model.pos_embed", "s.position_embd",
		"model.sam_model.blocks", "s.blk",
		"model.sam_model.neck", "s.neck",
		"model.sam_model.net_", "s.net_",
	}
}
```
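The Replacements pairs map upstream safetensors tensor names onto the GGUF naming scheme. As a rough illustration of the intended effect, assuming plain substring substitution (the converter's actual application of these pairs may differ):

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// A few of the pairs from Replacements() above, applied as
	// substring-for-substring rewrites of a tensor name.
	r := strings.NewReplacer(
		"model.layers", "blk",
		"self_attn.q_proj", "attn_q",
		"mlp.shared_experts.gate_proj", "ffn_gate_shexp",
	)
	fmt.Println(r.Replace("model.layers.3.self_attn.q_proj.weight"))
	// Output: blk.3.attn_q.weight
}
```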
```diff
@@ -44,7 +44,10 @@ func (t tensorBase) Kind() uint32 {
 		t.name == "v.positional_embedding_vlm" ||
 		t.name == "v.tile_position_embd.weight" ||
 		t.name == "v.pre_tile_position_embd.weight" ||
-		t.name == "v.post_tile_position_embd.weight" {
+		t.name == "v.post_tile_position_embd.weight" ||
+		t.name == "s.position_embd" ||
+		strings.HasSuffix(t.name, "rel_pos_h") ||
+		strings.HasSuffix(t.name, "rel_pos_w") {
 		// these tensors are always F32
 		return tensorKindFP32
 	}
```
```diff
@@ -96,7 +96,10 @@ type safetensor struct {
 
 func (st safetensor) Kind() uint32 {
 	kind := st.tensorBase.Kind()
-	if !strings.HasPrefix(st.name, "v.") && st.dtype == "BF16" && kind != tensorKindFP32 {
+	if st.dtype == "BF16" &&
+		!strings.HasPrefix(st.name, "v.") &&
+		!strings.HasPrefix(st.name, "s.") &&
+		kind != tensorKindFP32 {
 		kind = tensorKindBF16
 	}
 
```
```diff
@@ -249,6 +249,8 @@ func (kv KV) OllamaEngineRequired() bool {
 		"qwen25vl",
 		"qwen3", "qwen3moe",
 		"qwen3vl", "qwen3vlmoe",
+		"deepseekocr",
+		"deepseek2",
 	}, kv.Architecture())
 }
```
```diff
@@ -3,7 +3,6 @@ package kvcache
 
 import (
-	"errors"
 	"fmt"
 	"log/slog"
 	"math"
 	"slices"
```
```diff
@@ -40,18 +39,18 @@ type Causal struct {
 
 	// ** current forward pass **
 
-	// the active layer for Get and Put
-	curLayer int
-
-	// starting location for data storage for this batch
-	curLoc int
-
 	// size of the current batch
 	curBatchSize int
 
+	// locations for data storage for this batch
+	curLoc ml.Tensor
+
 	// mask of the cache as used by this batch
 	curMask ml.Tensor
 
+	// the active layer for Get and Put
+	curLayer int
+
 	// locations in the cache that are needed for this batch
 	curCellRange cellRange
```
```diff
@@ -206,45 +205,47 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
 	c.curPositions = batch.Positions
 	c.opts.Except = nil
 
+	var locs []int32
 	if !reserve {
 		c.updateSlidingWindow()
 
 		var err error
-		c.curLoc, err = c.findStartLoc()
-		if errors.Is(err, ErrKvCacheFull) {
-			c.defrag()
-			c.curLoc, err = c.findStartLoc()
-		}
+		locs, err = c.findLocs()
 		if err != nil {
 			return err
 		}
 
 		for i, pos := range batch.Positions {
 			seq := batch.Sequences[i]
+			loc := int(locs[i])
 
-			c.cells[c.curLoc+i] = cacheCell{pos: pos, sequences: []int{seq}}
+			c.cells[loc] = cacheCell{pos: pos, sequences: []int{seq}}
 
 			seqRange, ok := c.cellRanges[seq]
 			if !ok {
 				seqRange = newRange()
 			}
 
-			seqRange.min = min(seqRange.min, c.curLoc+i)
-			c.curCellRange.min = min(c.curCellRange.min, c.curLoc+i)
+			seqRange.min = min(seqRange.min, loc)
+			c.curCellRange.min = min(c.curCellRange.min, loc)
 
-			seqRange.max = max(seqRange.max, c.curLoc+i)
-			c.curCellRange.max = max(c.curCellRange.max, c.curLoc+i)
+			seqRange.max = max(seqRange.max, loc)
+			c.curCellRange.max = max(c.curCellRange.max, loc)
 
 			c.cellRanges[seq] = seqRange
 		}
 	} else {
 		// If we are reserving memory, don't update any of the cache metadata but set the size
 		// to the worst case.
-		c.curLoc = 0
+		locs = make([]int32, c.curBatchSize)
+		for i := range locs {
+			locs[i] = int32(i)
+		}
 		c.curCellRange.min = 0
 		c.curCellRange.max = len(c.cells) - 1
 	}
 
+	c.curLoc = ctx.Input().FromInts(locs, len(locs))
 	c.curMask = c.buildMask(ctx)
 
 	return nil
```
```diff
@@ -257,22 +258,20 @@ func newRange() cellRange {
 	}
 }
 
-// Find the first contiguous block of at least curBatchSize
-func (c *Causal) findStartLoc() (int, error) {
-	var start, count int
+// Returns a slice of locations where each token in the batch should be stored
+func (c *Causal) findLocs() ([]int32, error) {
+	loc := make([]int32, 0, c.curBatchSize)
 
 	for i := range c.cells {
 		if len(c.cells[i].sequences) == 0 {
-			count++
-			if count >= c.curBatchSize {
-				return start, nil
+			loc = append(loc, int32(i))
+			if len(loc) >= c.curBatchSize {
+				return loc, nil
 			}
-		} else {
-			start = i + 1
-			count = 0
 		}
 	}
 
-	return 0, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
+	return nil, fmt.Errorf("%w (cache: %v batch: %v)", ErrKvCacheFull, len(c.cells), c.curBatchSize)
 }
 
 func (c *Causal) updateSlidingWindow() {
```
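The shift from findStartLoc to findLocs is the core of this refactor: a batch no longer needs one contiguous run of free cells, so the ErrKvCacheFull-then-defrag dance above becomes unnecessary. A toy illustration of the new allocation behavior, outside the cache types:

```go
package main

import "fmt"

func main() {
	// With occupancy [used, free, used, free, free], the old
	// findStartLoc could not seat a 3-token batch without
	// defragmenting; findLocs-style scanning simply collects
	// the indexes of the first 3 free cells, wherever they are.
	occupied := []bool{true, false, true, false, false}
	batchSize := 3

	locs := make([]int32, 0, batchSize)
	for i, used := range occupied {
		if !used {
			locs = append(locs, int32(i))
			if len(locs) >= batchSize {
				break
			}
		}
	}
	fmt.Println(locs) // [1 3 4]
}
```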
```diff
@@ -402,145 +401,6 @@ func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 	return maskTensor
 }
 
-func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
-	for i, key := range c.keys {
-		if key == nil {
-			continue
-		}
-
-		kHeadDim := key.Dim(0)
-		numKVHeads := key.Dim(1)
-		rowSize := key.Stride(2)
-
-		kSrcView := key.View(ctx, rowSize*src, kHeadDim*numKVHeads*length)
-		kDstView := key.View(ctx, rowSize*dst, kHeadDim*numKVHeads*length)
-
-		value := c.values[i]
-		var vSrcView, vDstView ml.Tensor
-		if c.config.PermutedV {
-			vHeadDim := value.Dim(1)
-			elemSize := value.Stride(0)
-
-			vSrcView = value.View(ctx, elemSize*src, length, len(c.cells)*elemSize, vHeadDim*numKVHeads)
-			vDstView = value.View(ctx, elemSize*dst, length, len(c.cells)*elemSize, vHeadDim*numKVHeads)
-		} else {
-			vHeadDim := value.Dim(0)
-			rowSize := value.Stride(2)
-
-			vSrcView = value.View(ctx, rowSize*src, vHeadDim*numKVHeads*length)
-			vDstView = value.View(ctx, rowSize*dst, vHeadDim*numKVHeads*length)
-		}
-
-		ctx.Forward(
-			kSrcView.Copy(ctx, kDstView),
-			vSrcView.Copy(ctx, vDstView),
-		)
-	}
-}
-
-func (c *Causal) defrag() {
-	slog.Debug("defragmenting kv cache")
-
-	// Defrag strategy:
-	// - Search for empty holes at the beginning of the cache,
-	//   filling them with active data starting at the end
-	// - If there are contiguous elements that need to be moved,
-	//   combine them into a single operation by holding new moves
-	//   until we see that the next one is non-contiguous
-	// - Fill up the context with the maximum number of operations it
-	//   can hold then compute that and continue with a new context
-	//
-	// We could try to optimize placement by grouping blocks from
-	// the same sequences together but most likely the next forward
-	// pass will disrupt this anyways, so the real world benefit
-	// seems limited as this time.
-
-	ctx := c.backend.NewContext()
-
-	// For every move, 6 tensors are required per layer (2 views and a
-	// copy for each of k and v). We also need to refer to the original
-	// k and v cache tensors - once per layer, not per move.
-	layers := 0
-	for _, key := range c.keys {
-		if key == nil {
-			continue
-		}
-		layers++
-	}
-
-	maxMoves := (ctx.MaxGraphNodes() - 2*layers) / (6 * layers)
-	moves := 0
-
-	var pendingSrc, pendingDst, pendingLen int
-	src := len(c.cells) - 1
-
-	for dst := 0; dst < src; dst++ {
-		if len(c.cells[dst].sequences) == 0 {
-			for ; src > dst; src-- {
-				if len(c.cells[src].sequences) != 0 {
-					c.cells[dst] = c.cells[src]
-					c.cells[src] = cacheCell{}
-
-					if pendingLen > 0 {
-						if src == pendingSrc-pendingLen && dst == pendingDst+pendingLen {
-							pendingSrc = src
-							pendingLen++
-							break
-						} else {
-							c.moveCells(ctx, pendingSrc, pendingDst, pendingLen)
-							moves++
-						}
-					}
-
-					pendingSrc = src
-					pendingDst = dst
-					pendingLen = 1
-
-					break
-				}
-			}
-		}
-
-		if moves >= maxMoves {
-			ctx.Compute()
-			ctx.Close()
-			ctx = c.backend.NewContext()
-
-			moves = 0
-		}
-	}
-
-	if pendingLen > 0 {
-		c.moveCells(ctx, pendingSrc, pendingDst, pendingLen)
-		moves++
-	}
-
-	if moves > 0 {
-		ctx.Compute()
-	}
-	ctx.Close()
-
-	// Reset range metadata
-	for seq := range c.cellRanges {
-		seqRange := newRange()
-
-		for i, cell := range c.cells {
-			if slices.Contains(cell.sequences, seq) {
-				if i < seqRange.min {
-					seqRange.min = i
-				}
-				if i > seqRange.max {
-					seqRange.max = i
-				}
-			}
-		}
-
-		c.cellRanges[seq] = seqRange
-	}
-
-	c.updateSlidingWindow()
-}
-
 func (c *Causal) SetLayer(layer int) {
 	c.curLayer = layer
 }
```
```diff
@@ -625,18 +485,25 @@ func (c *Causal) Put(ctx ml.Context, key, value ml.Tensor) {
 		}
 	}
 
-	rowSize := c.keys[c.curLayer].Stride(2)
-	ctx.Forward(key.Copy(ctx, c.keys[c.curLayer].View(ctx, rowSize*c.curLoc, kHeadDim*numKVHeads*batchSize)))
+	key = key.Reshape(ctx, kHeadDim*numKVHeads, batchSize)
+	keyCache := c.keys[c.curLayer]
+	keyCache = keyCache.Reshape(ctx, kHeadDim*numKVHeads, len(c.cells))
+	ctx.Forward(keyCache.SetRows(ctx, key, c.curLoc))
 
 	if c.config.PermutedV {
-		elemSize := c.values[c.curLayer].Stride(0)
+		value = value.Reshape(ctx, vHeadDim*numKVHeads, 1, batchSize)
+		value = value.Permute(ctx, 2, 0, 1, 3)
 
-		value = value.Permute(ctx, 1, 2, 0, 3)
-		ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, elemSize*c.curLoc, batchSize, len(c.cells)*elemSize, vHeadDim*numKVHeads)))
+		valueCache := c.values[c.curLayer]
+		valueCache = valueCache.Reshape(ctx, 1, len(c.cells), vHeadDim*numKVHeads)
+		ctx.Forward(valueCache.SetRows(ctx, value, c.curLoc))
 	} else {
-		rowSize := c.values[c.curLayer].Stride(2)
+		value = value.Reshape(ctx, vHeadDim*numKVHeads, batchSize)
+		valueCache := c.values[c.curLayer]
+		valueCache = valueCache.Reshape(ctx, vHeadDim*numKVHeads, len(c.cells))
 
-		ctx.Forward(value.Copy(ctx, c.values[c.curLayer].View(ctx, rowSize*c.curLoc, vHeadDim*numKVHeads*batchSize)))
+		ctx.Forward(valueCache.SetRows(ctx, value, c.curLoc))
 	}
 }
```
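With curLoc now a tensor of row indexes, Put scatters instead of copying into a contiguous view: row r of the incoming batch lands in cache row curLoc[r]. The row-scatter semantics, sketched on plain slices (compare the testTensor.SetRows implementation later in this diff):

```go
package main

import "fmt"

func main() {
	// dst plays the role of the reshaped cache: one row per cell.
	dst := [][]float32{{0, 0}, {0, 0}, {0, 0}, {0, 0}}
	// src is the batch being stored; idxs are the locations from findLocs.
	src := [][]float32{{1, 2}, {3, 4}}
	idxs := []int32{3, 1}

	// SetRows semantics: dst[idxs[r]] = src[r].
	for r := range src {
		copy(dst[idxs[r]], src[r])
	}
	fmt.Println(dst) // [[0 0] [3 4] [0 0] [1 2]]
}
```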
```diff
@@ -207,11 +207,11 @@ func TestSWAMem(t *testing.T) {
 			inShape:       []int{1, 1, 2},
 			seqs:          []int{0, 0},
 			pos:           []int32{4, 5},
-			expected:      []float32{4, 5, 6},
-			expectedShape: []int{1, 1, 3},
+			expected:      []float32{5, 2, 3, 4, 6},
+			expectedShape: []int{1, 1, 5},
 			expectedMask: []float32{
-				0, 0, x,
-				x, 0, 0,
+				0, x, x, 0, x,
+				0, x, x, x, 0,
 			},
 		},
 	}
```
```diff
@@ -319,6 +319,8 @@ func TestRemove(t *testing.T) {
 	cache.Init(backend, ml.DTypeF16, 1, 16, 16)
 
+	x := float32(math.Inf(-1))
+
 	tests := []testCase{
 		{
 			name: "FirstBatch",
```
```diff
@@ -328,7 +330,12 @@
 			pos:           []int32{0, 1, 0, 1},
 			expected:      []float32{1, 2, 3, 4},
 			expectedShape: []int{1, 1, 4},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0},
+			expectedMask: []float32{
+				0, x, x, x,
+				0, 0, x, x,
+				x, x, 0, x,
+				x, x, 0, 0,
+			},
 		},
 	}
```
```diff
@@ -346,9 +353,12 @@
 			inShape:       []int{1, 1, 2},
 			seqs:          []int{0, 1},
 			pos:           []int32{1, 2},
-			expected:      []float32{1, 2, 3, 4, 5, 6},
-			expectedShape: []int{1, 1, 6},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), 0},
+			expected:      []float32{1, 5, 3, 4, 6},
+			expectedShape: []int{1, 1, 5},
+			expectedMask: []float32{
+				0, 0, x, x, x,
+				x, x, 0, 0, 0,
+			},
 		},
 	}
```
```diff
@@ -366,59 +376,12 @@
 			inShape:       []int{1, 1, 2},
 			seqs:          []int{0, 0},
 			pos:           []int32{1, 2},
-			expected:      []float32{7, 8, 3, 4, 4},
-			expectedShape: []int{1, 1, 5},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0},
-		},
-	}
-
-	testCache(t, backend, cache, tests)
-}
-
-func TestDefrag(t *testing.T) {
-	backend := &testBackend{}
-	cache := NewCausalCache(func(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-		return key.Add(ctx, shift), nil
-	})
-	defer cache.Close()
-
-	cache.Init(backend, ml.DTypeF16, 1, 16, 16)
-
-	tests := []testCase{
-		{
-			name:          "FirstBatch",
-			in:            []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-			inShape:       []int{1, 1, 16},
-			seqs:          []int{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-			pos:           []int32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
-			expected:      []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-			expectedShape: []int{1, 1, 16},
-			expectedMask:  []float32{0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-		},
-	}
-
-	testCache(t, backend, cache, tests)
-
-	err := cache.Remove(0, 2, 4)
-	if err != nil {
-		panic(err)
-	}
-
-	err = cache.Remove(0, 13, math.MaxInt32)
-	if err != nil {
-		panic(err)
-	}
-
-	tests = []testCase{
-		{
-			name:          "Defrag",
-			in:            []float32{17, 18, 19},
-			inShape:       []int{1, 1, 3},
-			seqs:          []int{0, 0, 0},
-			pos:           []int32{16, 17, 18},
-			expected:      []float32{1, 2, 12, 13, 3, 4, 5, 6, 7, 8, 9, 10, 11, 17, 18, 19},
-			expectedShape: []int{1, 1, 16},
-			expectedMask:  []float32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, float32(math.Inf(-1)), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+			expected:      []float32{7, 4, 3, 4, 6, 8},
+			expectedShape: []int{1, 1, 6},
+			expectedMask: []float32{
+				0, 0, x, x, x, x,
+				0, 0, x, x, x, 0,
+			},
 		},
 	}
```
```diff
@@ -770,6 +733,15 @@ func (t *testTensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	return out
 }
 
+func (t *testTensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
+	return &testTensor{
+		dtype:       t.dtype,
+		elementSize: t.elementSize,
+		data:        t.data,
+		shape:       shape,
+	}
+}
+
 func (t *testTensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 	offset /= t.elementSize
```
```diff
@@ -778,6 +750,8 @@ func (t *testTensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 	switch len(shape) {
 	case 1:
 		s = []int{shape[0]}
 	case 3:
 		s = []int{shape[0], shape[2]}
+	case 5:
+		s = []int{shape[0], shape[2], shape[4]}
 	default:
```
```diff
@@ -792,6 +766,86 @@ func (t *testTensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 	return view
 }
 
+func (t *testTensor) SetRows(ctx ml.Context, src ml.Tensor, idxs ml.Tensor) ml.Tensor {
+	dst := t
+	srcTensor := src.(*testTensor)
+	idxTensor := idxs.(*testTensor)
+
+	shapeTo4D := func(shape []int) [4]int {
+		out := [4]int{1, 1, 1, 1}
+		for i := 0; i < len(shape) && i < 4; i++ {
+			out[i] = shape[i]
+		}
+		return out
+	}
+
+	computeStrides := func(shape [4]int) [4]int {
+		out := [4]int{1, 1, 1, 1}
+		for i := 1; i < 4; i++ {
+			out[i] = out[i-1] * shape[i-1]
+		}
+		return out
+	}
+
+	dstShape4D := shapeTo4D(dst.shape)
+	srcShape4D := shapeTo4D(srcTensor.shape)
+	idxShape4D := shapeTo4D(idxTensor.shape)
+
+	if dstShape4D[0] != srcShape4D[0] || dstShape4D[2] != srcShape4D[2] || dstShape4D[3] != srcShape4D[3] {
+		panic("SetRows requires matching tensor shapes")
+	}
+
+	if srcShape4D[1] != idxShape4D[0] {
+		panic("SetRows rows/index mismatch")
+	}
+
+	if srcShape4D[2]%idxShape4D[1] != 0 || srcShape4D[3]%idxShape4D[2] != 0 {
+		panic("SetRows cannot broadcast indices")
+	}
+
+	if idxShape4D[3] != 1 {
+		panic("SetRows expects 1D or 2D index tensors")
+	}
+
+	dstStride := computeStrides(dstShape4D)
+	srcStride := computeStrides(srcShape4D)
+	idxStride := computeStrides(idxShape4D)
+
+	numColumns := srcShape4D[0]
+	numRows := srcShape4D[1]
+
+	for dim3Index := range dstShape4D[3] {
+		for dim2Index := range dstShape4D[2] {
+			idxDim2 := 0
+			idxDim3 := 0
+			if idxShape4D[1] > 0 {
+				idxDim2 = dim2Index % idxShape4D[1]
+			}
+			if idxShape4D[2] > 0 {
+				idxDim3 = dim3Index % idxShape4D[2]
+			}
+
+			idxBase := idxDim3*idxStride[2] + idxDim2*idxStride[1]
+			srcBase := dim3Index*srcStride[3] + dim2Index*srcStride[2]
+			dstBase := dim3Index*dstStride[3] + dim2Index*dstStride[2]
+
+			for row := range numRows {
+				idx := int(idxTensor.data[idxBase+row*idxStride[0]])
+				if idx < 0 || idx >= dstShape4D[1] {
+					panic("SetRows index out of range")
+				}
+
+				srcOffset := srcBase + row*srcStride[1]
+				dstOffset := dstBase + idx*dstStride[1]
+
+				copy(dst.data[dstOffset:dstOffset+numColumns], srcTensor.data[srcOffset:srcOffset+numColumns])
+			}
+		}
+	}
+
+	return dst
+}
+
 func (t *testTensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	copy(t2.(*testTensor).data, t.data)
 	return nil
```
```diff
@@ -38,7 +38,7 @@ index 44ae76d66..639d551a2 100644
  #ifdef __cplusplus
  }
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index d2c278a35..221e29509 100644
+index ca02ea079..c12b069e5 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -73,6 +73,7 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();

@@ -11,7 +11,7 @@ vidmem optimization.
  1 file changed, 1 insertion(+), 4 deletions(-)
 
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 221e29509..18b7cbccf 100644
+index c12b069e5..76c78c2ea 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -5654,14 +5654,11 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr

@@ -50,7 +50,7 @@ Subject: [PATCH] Vulkan MMQ Integer Dot Refactor and K-Quant support (#16536)
  create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl
 
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 18b7cbccf..53b57c179 100644
+index 76c78c2ea..7669ed206 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -488,6 +488,7 @@ struct vk_device_struct {

@@ -58,7 +58,7 @@ index 639d551a2..e5c446d1d 100644
  GGML_API size_t gguf_type_size(enum gguf_type type);
  GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 53b57c179..b2855b078 100644
+index 7669ed206..63a762ec2 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -387,12 +387,76 @@ static constexpr uint32_t num_argsort_pipelines = 11;

@@ -31,7 +31,7 @@ Add new backend tests.
  6 files changed, 371 insertions(+), 117 deletions(-)
 
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index b2855b078..aaf4334b5 100644
+index 63a762ec2..db92a7901 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -458,6 +458,11 @@ static topk_moe_mode ggml_vk_num_additional_ops_to_topk_moe_mode(uint32_t num) {

@@ -9,7 +9,7 @@ Subject: [PATCH] vulkan: Handle argsort with a large number of rows (#16851)
  2 files changed, 16 insertions(+), 4 deletions(-)
 
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index aaf4334b5..3604ceb04 100644
+index db92a7901..e959674d1 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -1084,6 +1084,7 @@ struct vk_op_soft_max_push_constants {

@@ -20,7 +20,7 @@ Subject: [PATCH] vulkan: Fix crash when FP16 mul_mat accumulation is not
  1 file changed, 13 insertions(+), 7 deletions(-)
 
 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
-index 3604ceb04..80185d9f0 100644
+index e959674d1..903050b0b 100644
 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
 @@ -146,8 +146,13 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline);
```
llama/patches/0036-ggml-cuda-skip-large-batches.patch (new file, 25 lines)

```diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <git@mxy.ng>
Date: Tue, 18 Nov 2025 11:13:04 -0800
Subject: [PATCH] ggml-cuda: skip large batches

cuda panics on batches larger than 1024 so mark it as unsupported to
fallback to cpu
---
 ggml/src/ggml-cuda/ggml-cuda.cu | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index f1a20e7fe..1a71e07c9 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3677,6 +3677,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
             return false;
         }
+        if (op->op == GGML_OP_MUL_MAT && b->ne[2] * b->ne[3] > 1024) {
+            return false;
+        }
 #ifdef GGML_USE_MUSA
         const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
         if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
```
llama/patches/0036-win-exit-instead-of-abort.patch (new file, 28 lines)

```diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Tue, 18 Nov 2025 09:58:23 -0800
Subject: [PATCH] win: exit instead of abort

---
 ggml/src/ggml.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 9be35c1be..923c33d05 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -229,8 +229,13 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
         fprintf(stderr, "%s\n", message);
         ggml_print_backtrace();
     }
-
+#if defined(_WIN32)
+    fflush(stderr);
+    fflush(stdout);
+    exit(1);
+#else
     abort();
+#endif
 }

 // ggml_print_backtrace is registered with std::set_terminate by ggml.cpp
```
```diff
@@ -173,6 +173,7 @@ type Tensor interface {
 	Cos(ctx Context) Tensor
 	Tanh(ctx Context) Tensor
 	GELU(ctx Context, up ...Tensor) Tensor
+	QuickGELU(ctx Context, up ...Tensor) Tensor
 	SILU(ctx Context, up ...Tensor) Tensor
 	RELU(ctx Context, up ...Tensor) Tensor
 	Sigmoid(ctx Context) Tensor
@@ -193,6 +194,7 @@ type Tensor interface {
 	Repeat(ctx Context, dim, n int) Tensor
 	Concat(ctx Context, t2 Tensor, dim int) Tensor
 	Rows(ctx Context, t2 Tensor) Tensor
+	SetRows(ctx Context, src Tensor, idxs Tensor) Tensor
 	Copy(ctx Context, t2 Tensor) Tensor
 	Duplicate(ctx Context) Tensor
 
@@ -207,6 +209,8 @@ type Tensor interface {
 	Stddev(ctx Context) Tensor
 	Sqr(ctx Context) Tensor
 	Sqrt(ctx Context) Tensor
+
+	Interpolate(ctx Context, dims [4]int, samplingMode SamplingMode) Tensor
 }
 
 // ScaledDotProductAttention implements a fused attention
@@ -372,3 +376,10 @@ const (
 	DTypeI32
 	DTypeMXFP4
 )
+
+type SamplingMode int
+
+const (
+	SamplingModeNearest SamplingMode = iota
+	SamplingModeBilinear
+)
```
```diff
@@ -314,7 +314,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
 			"altup_proj", "altup_unembd_proj",
 			"per_layer_token_embd", "per_layer_model_proj", "per_layer_proj_norm"):
 			createTensor(tensor{source: t}, output.bts, blocks)
-		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
+		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm.") || strings.HasPrefix(t.Name, "s."):
 			// TODO: assign vision tensors to the gpu if possible
 			createTensor(tensor{source: t}, output.bts, blocks)
 		case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
```
```diff
@@ -1338,6 +1338,13 @@ func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	}
 }
 
+func (t *Tensor) SetRows(ctx ml.Context, src ml.Tensor, idxs ml.Tensor) ml.Tensor {
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_set_rows(ctx.(*Context).ctx, t.t, src.(*Tensor).t, idxs.(*Tensor).t),
+	}
+}
+
 func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	return &Tensor{
 		b: t.b,
@@ -1378,6 +1385,10 @@ func inferShape(t *Tensor, shape []int) {
 }
 
 func (t *Tensor) Reshape(ctx ml.Context, shape ...int) ml.Tensor {
+	if !C.ggml_is_contiguous(t.t) {
+		return t.Contiguous(ctx, shape...)
+	}
+
 	if slices.Contains(shape, -1) {
 		inferShape(t, shape)
 	}
@@ -1567,6 +1578,16 @@ func (t *Tensor) GELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
 	}
 }
 
+func (t *Tensor) QuickGELU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
+	var tt *C.struct_ggml_tensor
+	if len(t2) > 0 {
+		tt = C.ggml_geglu_quick_split(ctx.(*Context).ctx, t.t, t2[0].(*Tensor).t)
+	} else {
+		tt = C.ggml_gelu_quick_inplace(ctx.(*Context).ctx, t.t)
+	}
+	return &Tensor{b: t.b, t: tt}
+}
+
 func (t *Tensor) SILU(ctx ml.Context, t2 ...ml.Tensor) ml.Tensor {
 	if len(t2) > 0 {
 		return &Tensor{
@@ -1724,6 +1745,23 @@ func (t *Tensor) Sqrt(ctx ml.Context) ml.Tensor {
 	}
 }
 
+func (t *Tensor) Interpolate(ctx ml.Context, dims [4]int, samplingMode ml.SamplingMode) ml.Tensor {
+	var mode C.uint32_t
+	switch samplingMode {
+	case ml.SamplingModeNearest:
+		mode = C.GGML_SCALE_MODE_NEAREST
+	case ml.SamplingModeBilinear:
+		mode = C.GGML_SCALE_MODE_BILINEAR
+	default:
+		panic("unsupported interpolate mode")
+	}
+
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_interpolate(ctx.(*Context).ctx, t.t, C.int64_t(dims[0]), C.int64_t(dims[1]), C.int64_t(dims[2]), C.int64_t(dims[3]), mode),
+	}
+}
+
 // Slice returns a view of the tensor sliced along dim from low to high in step steps.
 // Slice panics if the dimension is invalid or the slice parameters are out of range.
 // If dim=0 and step>1, the tensor is a copy rather than a view to ensure proper shape.
```
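One note on the Reshape change above: when the requested shape contains -1, inferShape fills that dimension so the element count is preserved, the usual reshape convention. A standalone sketch of the rule (not the inferShape code itself):

```go
package main

import "fmt"

func main() {
	// Infer the -1 dimension for a tensor of 24 elements reshaped
	// to (-1, 4): the known dims multiply to 4, so -1 becomes 6.
	total := 24
	shape := []int{-1, 4}

	known := 1
	for _, d := range shape {
		if d != -1 {
			known *= d
		}
	}
	for i, d := range shape {
		if d == -1 {
			shape[i] = total / known
		}
	}
	fmt.Println(shape) // [6 4]
}
```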
```diff
@@ -3677,6 +3677,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
             return false;
         }
+        if (op->op == GGML_OP_MUL_MAT && b->ne[2] * b->ne[3] > 1024) {
+            return false;
+        }
 #ifdef GGML_USE_MUSA
         const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
         if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
```
ml/backend/ggml/ggml/src/ggml.c (vendored)

```diff
@@ -229,8 +229,13 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
         fprintf(stderr, "%s\n", message);
         ggml_print_backtrace();
     }
-
+#if defined(_WIN32)
+    fflush(stderr);
+    fflush(stdout);
+    exit(1);
+#else
     abort();
+#endif
 }
 
 // ggml_print_backtrace is registered with std::set_terminate by ggml.cpp
```
```diff
@@ -25,12 +25,15 @@ const (
 
 // Composite returns an image with the alpha channel removed by drawing over a white background.
 func Composite(img image.Image) image.Image {
-	dst := image.NewRGBA(img.Bounds())
-
 	white := color.RGBA{255, 255, 255, 255}
-	draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
-	draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
+	return CompositeColor(img, white)
+}
+
+// CompositeColor returns an image with the alpha channel removed by drawing over a background of the given color.
+func CompositeColor(img image.Image, color color.Color) image.Image {
+	dst := image.NewRGBA(img.Bounds())
+	draw.Draw(dst, dst.Bounds(), &image.Uniform{color}, image.Point{}, draw.Src)
+	draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
 	return dst
 }
@@ -55,6 +58,31 @@ func Resize(img image.Image, newSize image.Point, method int) image.Image {
 	return dst
 }
 
+// Pad returns an image which has been resized to fit within a new size, preserving aspect ratio, and padded with a color.
+func Pad(img image.Image, newSize image.Point, color color.Color, kernel draw.Interpolator) image.Image {
+	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
+	draw.Draw(dst, dst.Bounds(), &image.Uniform{color}, image.Point{}, draw.Src)
+
+	var minPoint, maxPoint image.Point
+	if img.Bounds().Dx() > img.Bounds().Dy() {
+		// landscape
+		height := newSize.X * img.Bounds().Dy() / img.Bounds().Dx()
+		minPoint = image.Point{0, (newSize.Y - height) / 2}
+		maxPoint = image.Point{newSize.X, height + minPoint.Y}
+	} else {
+		// portrait
+		width := newSize.Y * img.Bounds().Dx() / img.Bounds().Dy()
+		minPoint = image.Point{(newSize.X - width) / 2, 0}
+		maxPoint = image.Point{minPoint.X + width, newSize.Y}
+	}
+
+	kernel.Scale(dst, image.Rectangle{
+		Min: minPoint,
+		Max: maxPoint,
+	}, img, img.Bounds(), draw.Over, nil)
+	return dst
+}
+
 // Normalize returns a slice of float32 containing each of the r, g, b values for an image normalized around a value.
 func Normalize(img image.Image, mean, std [3]float32, rescale bool, channelFirst bool) []float32 {
 	var pixelVals []float32
```
```diff
@@ -156,6 +156,7 @@ func New(c fs.Config) (model.Model, error) {
 				)),
 			},
 		},
+		true,
 	)
 	default:
 		return nil, model.ErrUnsupportedTokenizer
```

```diff
@@ -236,6 +236,11 @@ type Model struct {
 }
 
 func New(c fs.Config) (model.Model, error) {
+	if c.Uint("attention.key_length_mla") == 0 {
+		// non-MLA models aren't yet supported
+		return nil, model.ErrUnsupportedModel
+	}
+
 	layers := make([]Layer, c.Uint("block_count"))
 
 	firstDenseLayerIndex := int(c.Uint("leading_dense_block_count"))
```
model/models/deepseekocr/imageprocessor.go (new file, 83 lines)

```go
package deepseekocr

import (
	"bytes"
	"image"
	"image/color"
	"math"
	"slices"

	"golang.org/x/image/draw"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model/imageproc"
)

type ratio struct {
	x, y int
}

func ProcessImage(ctx ml.Context, bts []byte) (ml.Tensor, ml.Tensor, []int, error) {
	img, _, err := image.Decode(bytes.NewReader(bts))
	if err != nil {
		return nil, nil, nil, err
	}

	minNum, maxNum, imageSize, baseSize := 2, 9, 640, 1024
	var targetRatios []ratio
	for n := minNum; n <= maxNum; n++ {
		for i := 1; i <= n; i++ {
			for j := 1; j <= n; j++ {
				if i*j <= maxNum && i*j >= minNum && !slices.Contains(targetRatios, ratio{i, j}) {
					targetRatios = append(targetRatios, ratio{i, j})
				}
			}
		}
	}

	targetRatio := findBestAspectRatio(targetRatios, img.Bounds().Dx(), img.Bounds().Dy(), imageSize)
	targetWidth, targetHeight := imageSize*targetRatio.x, imageSize*targetRatio.y
	blocks := targetRatio.x * targetRatio.y

	mean := imageproc.ImageNetStandardMean
	std := imageproc.ImageNetStandardSTD

	var patches []float32
	resized := imageproc.Resize(img, image.Point{X: targetWidth, Y: targetHeight}, imageproc.ResizeBilinear)
	for i := range blocks {
		patch := image.NewRGBA(image.Rect(0, 0, imageSize, imageSize))
		draw.Draw(patch, patch.Bounds(), resized, image.Point{
			X: i % (targetWidth / imageSize) * imageSize,
			Y: i / (targetWidth / imageSize) * imageSize,
		}, draw.Over)

		patches = append(patches, imageproc.Normalize(patch, mean, std, true, true)...)
	}

	img = imageproc.CompositeColor(img, color.Gray{})
	img = imageproc.Pad(img, image.Point{X: baseSize, Y: baseSize}, color.Gray{127}, draw.BiLinear)

	return ctx.Input().FromFloats(patches, imageSize, imageSize, 3, blocks),
		ctx.Input().FromFloats(imageproc.Normalize(img, mean, std, true, true), baseSize, baseSize, 3),
		[]int{targetRatio.x, targetRatio.y},
		nil
}

func findBestAspectRatio(targetRatios []ratio, width, height, imageSize int) ratio {
	bestDiff := math.MaxFloat64
	best := ratio{1, 1}
	realRatio := float64(width) / float64(height)
	for _, target := range targetRatios {
		targetRatio := float64(target.x) / float64(target.y)
		diff := math.Abs(realRatio - targetRatio)
		if diff < bestDiff {
			bestDiff = diff
			best = target
		} else if diff == bestDiff {
			if float64(width*height) > 0.5*float64(imageSize*imageSize*best.x*best.y) {
				best = target
			}
		}
	}
	return best
}
```
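A worked pass through ProcessImage for a hypothetical 1280×640 input: its aspect ratio is 2.0, findBestAspectRatio selects {x: 2, y: 1} from the candidate grid, so the image is resized to 1280×640 and cut into two 640×640 local patches, while the gray-padded 1024×1024 copy becomes the global view. The arithmetic:

```go
package main

import "fmt"

type ratio struct{ x, y int }

func main() {
	// Hypothetical 1280x640 input with imageSize = 640 (as above).
	imageSize := 640
	best := ratio{x: 2, y: 1} // exact match for aspect ratio 2.0

	targetWidth := imageSize * best.x  // 1280
	targetHeight := imageSize * best.y // 640
	blocks := best.x * best.y          // 2 local patches

	fmt.Println(targetWidth, targetHeight, blocks) // 1280 640 2
}
```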
192
model/models/deepseekocr/model.go
Normal file
192
model/models/deepseekocr/model.go
Normal file
@@ -0,0 +1,192 @@
|
||||
package deepseekocr
|
||||
|
||||
import (
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.TextProcessor
|
||||
|
||||
Sam *samModel `gguf:"s"`
|
||||
Vision *visionModel `gguf:"v"`
|
||||
Text *textModel
|
||||
|
||||
ImageNewline ml.Tensor `gguf:"mm.image_newline"`
|
||||
//nolint:misspell // this misspelling is upstream. fixing it breaks the model
|
||||
ViewSeperator ml.Tensor `gguf:"mm.view_seperator"`
|
||||
|
||||
Projector *nn.Linear `gguf:"mm.layers"`
|
||||
}
|
||||
|
||||
func (m *Model) EncodeMultimodal(ctx ml.Context, bts []byte) ([]input.Multimodal, error) {
|
||||
patches, original, crop, err := ProcessImage(ctx, bts)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var outputs []ml.Tensor
|
||||
if true { // TODO: local features if sum(patches) != 0
|
||||
samOutputs := m.Sam.Forward(ctx, patches)
|
||||
visionOutputs := m.Vision.Forward(ctx, patches, samOutputs)
|
||||
|
||||
samOutputs = samOutputs.Reshape(ctx, -1, samOutputs.Dim(2), samOutputs.Dim(3)).Permute(ctx, 1, 0, 2, 3)
|
||||
visionOutputs = visionOutputs.Slice(ctx, 1, 1, visionOutputs.Dim(1), 1)
|
||||
localOutputs := visionOutputs.Concat(ctx, samOutputs, 0)
|
||||
localOutputs = m.Projector.Forward(ctx, localOutputs)
|
||||
|
||||
hw := int(math.Sqrt(float64(localOutputs.Dim(1))))
|
||||
localOutputs = localOutputs.Reshape(ctx, -1, hw, crop[0], crop[1])
|
||||
localOutputs = localOutputs.Permute(ctx, 0, 2, 1, 3)
|
||||
localOutputs = localOutputs.Contiguous(ctx, -1, crop[0]*hw, crop[1]*hw)
|
||||
localOutputs = localOutputs.Concat(ctx, m.ImageNewline.Repeat(ctx, 2, localOutputs.Dim(2)), 1)
|
||||
localOutputs = localOutputs.Reshape(ctx, localOutputs.Dim(0), -1)
|
||||
|
||||
outputs = append(outputs, localOutputs)
|
||||
}
|
||||
|
||||
samOutputs := m.Sam.Forward(ctx, original)
|
||||
visionOutputs := m.Vision.Forward(ctx, original, samOutputs)
|
||||
|
||||
samOutputs = samOutputs.Reshape(ctx, -1, samOutputs.Dim(2), samOutputs.Dim(3)).Permute(ctx, 1, 0, 2, 3)
|
||||
visionOutputs = visionOutputs.Slice(ctx, 1, 1, visionOutputs.Dim(1), 1)
|
||||
globalOutputs := visionOutputs.Concat(ctx, samOutputs, 0)
|
||||
globalOutputs = m.Projector.Forward(ctx, globalOutputs)
|
||||
|
||||
hw := int(math.Sqrt(float64(globalOutputs.Dim(1))))
|
||||
globalOutputs = globalOutputs.Reshape(ctx, -1, hw, hw)
|
||||
globalOutputs = globalOutputs.Concat(ctx, m.ImageNewline.Repeat(ctx, 2, globalOutputs.Dim(2)), 1)
|
||||
globalOutputs = globalOutputs.Reshape(ctx, globalOutputs.Dim(0), -1)
|
||||
|
||||
outputs = append(outputs, globalOutputs, m.ViewSeperator)
|
||||
return []input.Multimodal{
|
||||
{Tensor: outputs[0].Stack(ctx, 1, outputs[1:]...)},
|
	}, nil
}

func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
	outputs := make([]*input.Input, 0, len(inputs))
	for i := range inputs {
		if inputs[i].Multimodal == nil {
			outputs = append(outputs, inputs[i])
			continue
		}

		// reserve one slot per column of the projected image tensor: the first
		// placeholder carries the image data; SameBatch keeps the rest in the
		// same batch
		t := inputs[i].Multimodal[0].Tensor
		outputs = append(outputs, &input.Input{
			Token:          128815,
			Multimodal:     inputs[i].Multimodal,
			MultimodalHash: inputs[i].MultimodalHash,
			SameBatch:      t.Dim(1) - 1,
		})

		outputs = slices.Grow(outputs, t.Dim(1)-1)
		outputs = append(outputs, slices.Repeat([]*input.Input{{Token: 128815}}, t.Dim(1)-1)...)
	}
	return outputs, nil
}

func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
	inputsEmbeds := m.Text.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))

	for _, mm := range batch.Multimodal {
		t := mm.Multimodal[0].Tensor
		ctx.Forward(t.Copy(ctx, inputsEmbeds.View(ctx, mm.Index*inputsEmbeds.Stride(1), t.Dim(0)*t.Dim(1))))
	}

	hiddenStates := inputsEmbeds
	for i, block := range m.Text.Blocks {
		if m.Cache != nil {
			m.Cache.SetLayer(i)
		}

		var outputs ml.Tensor
		if i == len(m.Text.Blocks)-1 {
			outputs = batch.Outputs
		}

		hiddenStates = block.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Text.Options)
	}

	hiddenStates = m.Text.OutputNorm.Forward(ctx, hiddenStates, m.Text.Options.eps)
	return m.Text.Output.Forward(ctx, hiddenStates), nil
}

func init() {
	model.Register("deepseekocr", func(c fs.Config) (model.Model, error) {
		textBlocks := make([]textBlock, c.Uint("block_count"))
		leadingDenseBlockCount := int(c.Uint("leading_dense_block_count", 1))
		for i := range textBlocks {
			if i >= leadingDenseBlockCount {
				textBlocks[i].FeedForward = &textMoe{}
			} else {
				textBlocks[i].FeedForward = &textMLP{}
			}
		}

		m := Model{
			TextProcessor: model.NewBytePairEncoding(
				&model.Vocabulary{
					Values: c.Strings("tokenizer.ggml.tokens"),
					Types:  c.Ints("tokenizer.ggml.token_type"),
					Merges: c.Strings("tokenizer.ggml.merges"),
					AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
					BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
					AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
					EOS: append(
						[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
						c.Ints("tokenizer.ggml.eos_token_ids")...,
					),
				},
				// Split regex into multiple parts (according to DeepSeek3's regex)
				"\\p{N}{1,3}",
				`[一-龥぀-ゟ゠-ヿ]+`,
				"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
			),
			Text: &textModel{
				Blocks: textBlocks,
				Options: textOptions{
					hiddenSize:     int(c.Uint("embedding_length")),
					numHeads:       int(c.Uint("attention.head_count")),
					numKVHeads:     int(c.Uint("attention.head_count_kv")),
					numExperts:     int(c.Uint("expert_count")),
					numExpertsUsed: int(c.Uint("expert_used_count")),
					ropeBase:       c.Float("rope.freq_base", 10_000),
					ropeScale:      c.Float("rope.scaling.factor", 1.0),
					eps:            c.Float("attention.layer_norm_rms_epsilon", 1e-6),
				},
			},
			Vision: &visionModel{
				Blocks: make([]visionBlock, c.Uint("vision.block_count")),
				Options: visionOptions{
					hiddenSize: int(c.Uint("vision.embedding_length")),
					numHeads:   int(c.Uint("vision.head_count")),
					imageSize:  int(c.Uint("vision.image_size", 224)),
					patchSize:  int(c.Uint("vision.patch_size", 14)),
					eps:        c.Float("vision.attention.layer_norm_epsilon", 1e-5),
				},
			},
			Sam: &samModel{
				Blocks: make([]samBlock, c.Uint("sam.block_count")),
				Options: samOptions{
					hiddenSize:            int(c.Uint("sam.embedding_length")),
					numHeads:              int(c.Uint("sam.head_count")),
					eps:                   c.Float("sam.attention.layer_norm_epsilon", 1e-6),
					globalAttentionLayers: c.Ints("sam.global_attention_indexes"),
				},
			},
		}

		m.Cache = kvcache.NewCausalCache(m.Text.Shift)
		return &m, nil
	})
}
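Reviewer note (annotation, not part of the diff): PostTokenize above reserves one sequence slot per column of the projected image tensor. The first placeholder carries the multimodal payload with SameBatch = cols-1 so the remaining cols-1 plain placeholders are scheduled into the same batch. A minimal, dependency-free sketch of that bookkeeping; expand and imageAt are illustrative names, not part of the package:

package main

import "fmt"

const imageToken = 128815 // placeholder token id used by the model above

// expand mimics PostTokenize: the token at imageAt stands for an image whose
// embedding occupies cols columns, so cols-1 extra placeholders follow it.
func expand(tokens []int32, imageAt, cols int) []int32 {
	out := make([]int32, 0, len(tokens)+cols-1)
	for i, tok := range tokens {
		out = append(out, tok)
		if i == imageAt {
			for range cols - 1 {
				out = append(out, imageToken)
			}
		}
	}
	return out
}

func main() {
	// a 4-column image at position 1 expands into 4 placeholder slots
	fmt.Println(expand([]int32{1, imageToken, 42}, 1, 4))
	// Output: [1 128815 128815 128815 128815 42]
}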
225 model/models/deepseekocr/model_sam.go Normal file
@@ -0,0 +1,225 @@
package deepseekocr

import (
	"math"
	"slices"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
)

type samModel struct {
	PatchEmbedding    *nn.Conv2D `gguf:"patch_embd"`
	PositionEmbedding ml.Tensor  `gguf:"position_embd"`

	Blocks []samBlock `gguf:"blk"`

	Neck *samNeck   `gguf:"neck"`
	Net2 *nn.Conv2D `gguf:"net_2"`
	Net3 *nn.Conv2D `gguf:"net_3"`

	Options samOptions
}

func (m *samModel) absolutePositionEmbedding(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
	source := m.PositionEmbedding.Dim(1)
	target := hiddenStates.Dim(2)
	if source != target {
		positionEmbed := m.PositionEmbedding.Permute(ctx, 2, 0, 1, 3)
		positionEmbed = positionEmbed.Interpolate(ctx, [4]int{target, target, hiddenStates.Dim(0), 1}, ml.SamplingModeBilinear)
		return positionEmbed.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
	}

	return m.PositionEmbedding
}

func (m *samModel) Forward(ctx ml.Context, t ml.Tensor) ml.Tensor {
	hiddenStates := m.PatchEmbedding.Forward(ctx, t, 16, 16, 0, 0, 1, 1)
	hiddenStates = hiddenStates.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

	if m.PositionEmbedding != nil {
		hiddenStates = hiddenStates.Add(ctx, m.absolutePositionEmbedding(ctx, hiddenStates))
	}

	for i, block := range m.Blocks {
		var windowSize int
		if !slices.Contains(m.Options.globalAttentionLayers, int32(i)) {
			windowSize = 14
		}

		hiddenStates = block.Forward(ctx, hiddenStates, windowSize, m.Options)
	}

	hiddenStates = hiddenStates.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
	hiddenStates = m.Neck.Forward(ctx, hiddenStates, m.Options)
	hiddenStates = m.Net2.Forward(ctx, hiddenStates, 2, 2, 1, 1, 1, 1)
	hiddenStates = m.Net3.Forward(ctx, hiddenStates, 2, 2, 1, 1, 1, 1)
	return hiddenStates
}

type samOptions struct {
	hiddenSize,
	numHeads int
	eps                   float32
	globalAttentionLayers []int32
}

func (o samOptions) headDim() int {
	return o.hiddenSize / o.numHeads
}

type samBlock struct {
	Norm1       *nn.LayerNorm `gguf:"norm1"`
	Attention   *samAttention `gguf:"attn"`
	Norm2       *nn.LayerNorm `gguf:"norm2"`
	FeedForward *samMLP       `gguf:"mlp"`
}

func (m *samBlock) Forward(ctx ml.Context, hiddenStates ml.Tensor, windowSize int, opts samOptions) ml.Tensor {
	c, w, h := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)

	residual := hiddenStates
	hiddenStates = m.Norm1.Forward(ctx, hiddenStates, opts.eps)

	var pw, ph int
	if windowSize > 0 {
		pw = (windowSize - hiddenStates.Dim(1)%windowSize) % windowSize
		ph = (windowSize - hiddenStates.Dim(2)%windowSize) % windowSize
		if pw > 0 || ph > 0 {
			hiddenStates = hiddenStates.Pad(ctx, 0, pw, ph, 0)
		}

		hiddenStates = hiddenStates.Reshape(ctx, c*windowSize, (w+pw)/windowSize, windowSize, -1)
		hiddenStates = hiddenStates.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, c, windowSize, windowSize, -1)
	}

	hiddenStates = m.Attention.Forward(ctx, hiddenStates, opts)

	if windowSize > 0 {
		hiddenStates = hiddenStates.Reshape(ctx, c*windowSize, windowSize, (w+pw)/windowSize, -1)
		hiddenStates = hiddenStates.Permute(ctx, 0, 2, 1, 3)
		hiddenStates = hiddenStates.Contiguous(ctx, c, w+pw, h+ph, -1)
		hiddenStates = hiddenStates.Pad(ctx, 0, -pw, -ph, 0)
	}

	hiddenStates = hiddenStates.Add(ctx, residual)

	residual = hiddenStates
	hiddenStates = m.Norm2.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = m.FeedForward.Forward(ctx, hiddenStates, opts)
	return hiddenStates.Add(ctx, residual)
}

type samAttention struct {
	QKV    *nn.Linear `gguf:"qkv"`
	Output *nn.Linear `gguf:"proj"`

	RelativePosition *struct {
		Height ml.Tensor `gguf:"h"`
		Width  ml.Tensor `gguf:"w"`
	} `gguf:",pre:rel_pos_"`
}

func relativeCoordinates(ctx ml.Context, qn, kn int) ml.Tensor {
	s := make([]int32, qn*kn)
	for i := range qn {
		for j := range kn {
			q := i * max(kn/qn, 1)
			k := j * max(qn/kn, 1)
			s[i*kn+j] = int32(q - k + (kn-1)*max(qn/kn, 1))
		}
	}
	return ctx.Input().FromInts(s, qn*kn)
}

func relativePositions(ctx ml.Context, positions ml.Tensor, qn, kn int) ml.Tensor {
	maxRelativeDistance := 2*max(qn, kn) - 1
	if positions.Dim(1) != maxRelativeDistance {
		// linear interpolation kernel not available so approx. with bilinear interpolation
		positions = positions.Interpolate(ctx, [4]int{positions.Dim(0), maxRelativeDistance, 1, 1}, ml.SamplingModeBilinear)
	}

	rc := relativeCoordinates(ctx, qn, kn)
	return positions.Rows(ctx, rc).Reshape(ctx, positions.Dim(0), kn, qn)
}

func (m *samAttention) decomposedRelativePositions(ctx ml.Context, query ml.Tensor, qn, kn []int) (ml.Tensor, ml.Tensor) {
	qh, qw := qn[0], qn[1]
	kh, kw := kn[0], kn[1]

	rh := relativePositions(ctx, m.RelativePosition.Height, qh, kh)
	rw := relativePositions(ctx, m.RelativePosition.Width, qw, kw)

	query = query.Contiguous(ctx, query.Dim(0), qw, qh, -1)
	rh = rh.Mulmat(ctx, query).Reshape(ctx, 1, kh, qh*qw, -1)
	rw = rw.Mulmat(ctx, query.Permute(ctx, 0, 2, 1, 3)).Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, kw, 1, qh*qw, -1)
	return rh, rw
}

func (m *samAttention) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts samOptions) ml.Tensor {
	w, h, b := hiddenStates.Dim(1), hiddenStates.Dim(2), hiddenStates.Dim(3)

	qkv := m.QKV.Forward(ctx, hiddenStates)
	qkv = qkv.Reshape(ctx, opts.headDim(), -1, w*h, b)
	chunks := qkv.Chunk(ctx, 1, opts.numHeads)
	query, key, value := chunks[0], chunks[1], chunks[2]

	ctx.Forward(query, key, value)

	query = query.Permute(ctx, 0, 2, 1, 3)
	rh, rw := m.decomposedRelativePositions(ctx, query, []int{h, w}, []int{h, w})
	mask := rh.Repeat(ctx, 0, rw.Dim(0)).Add(ctx, rw)
	mask = mask.Reshape(ctx, h*w, -1, opts.numHeads, b)

	key = key.Permute(ctx, 0, 2, 1, 3)
	scores := key.MulmatFullPrec(ctx, query)
	scores = scores.Scale(ctx, 1/math.Sqrt(float64(opts.headDim())))

	scores = scores.Add(ctx, mask)
	scores = scores.Softmax(ctx)

	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
	attention := value.Mulmat(ctx, scores)
	attention = attention.Permute(ctx, 0, 2, 1, 3)
	attention = attention.Contiguous(ctx, -1, w, h, b)
	return m.Output.Forward(ctx, attention)
}

type samMLP struct {
	Lin1 *nn.Linear `gguf:"lin1"`
	Lin2 *nn.Linear `gguf:"lin2"`
}

func (m *samMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts samOptions) ml.Tensor {
	return m.Lin2.Forward(ctx, m.Lin1.Forward(ctx, hiddenStates).GELU(ctx))
}

type LayerNorm2D struct {
	Weight ml.Tensor `gguf:"weight"`
	Bias   ml.Tensor `gguf:"bias"`
}

func (ln *LayerNorm2D) Forward(ctx ml.Context, x ml.Tensor, eps float32) ml.Tensor {
	x = x.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
	u := x.Mean(ctx)
	d := x.Sub(ctx, u)
	s := d.Sqr(ctx).Mean(ctx)
	x = d.Div(ctx, s.Add(ctx, ctx.Input().FromFloats([]float32{eps}, 1)).Sqrt(ctx))
	x = x.Mul(ctx, ln.Weight).Add(ctx, ln.Bias)
	return x.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
}

type samNeck struct {
	C1  *nn.Conv2D   `gguf:"0"`
	LN1 *LayerNorm2D `gguf:"1"`
	C2  *nn.Conv2D   `gguf:"2"`
	LN2 *LayerNorm2D `gguf:"3"`
}

func (m *samNeck) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts samOptions) ml.Tensor {
	hiddenStates = m.C1.Forward(ctx, hiddenStates, 1, 1, 0, 0, 1, 1)
	hiddenStates = m.LN1.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = m.C2.Forward(ctx, hiddenStates, 1, 1, 1, 1, 1, 1)
	hiddenStates = m.LN2.Forward(ctx, hiddenStates, opts.eps)
	return hiddenStates
}
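Reviewer note (annotation, not part of the diff): the window-attention path in samBlock.Forward pads width and height up to the next multiple of the window size before partitioning, then crops with negative padding afterwards. The pad arithmetic in isolation, as a sketch:

package main

import "fmt"

// pad returns the smallest non-negative amount that makes d divisible by the
// window size w, matching (windowSize - dim%windowSize) % windowSize above.
func pad(d, w int) int { return (w - d%w) % w }

func main() {
	fmt.Println(pad(64, 14)) // 6: 64+6 = 70, i.e. 5 windows of 14
	fmt.Println(pad(70, 14)) // 0: already aligned, no padding needed
}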
140 model/models/deepseekocr/model_text.go Normal file
@@ -0,0 +1,140 @@
package deepseekocr

import (
	"math"

	"github.com/ollama/ollama/kvcache"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
	"github.com/ollama/ollama/ml/nn/fast"
	"github.com/ollama/ollama/ml/nn/rope"
)

type textModel struct {
	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
	Blocks         []textBlock   `gguf:"blk"`
	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
	Output         *nn.Linear    `gguf:"output"`

	Options textOptions
}

func (m *textModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
	return m.Options.applyRotaryPositionalEmbedding(ctx, key, shift), nil
}

type textOptions struct {
	hiddenSize,
	numHeads,
	numKVHeads,
	numExperts,
	numExpertsUsed int
	ropeBase,
	ropeScale,
	eps float32
}

func (o textOptions) headDim() int {
	return o.hiddenSize / o.numHeads
}

func (o textOptions) applyRotaryPositionalEmbedding(ctx ml.Context, t, p ml.Tensor) ml.Tensor {
	return fast.RoPE(ctx, t, p, o.headDim(), o.ropeBase, 1/o.ropeScale, rope.WithTypeNeoX())
}

type textBlock struct {
	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
	Attention     *textAttention
	MLPNNorm      *nn.RMSNorm `gguf:"ffn_norm"`
	FeedForward   textFeedForward
}

func (m *textBlock) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts textOptions) ml.Tensor {
	residual := hiddenStates
	hiddenStates = m.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = m.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
	if outputs != nil {
		hiddenStates = hiddenStates.Rows(ctx, outputs)
		residual = residual.Rows(ctx, outputs)
	}

	hiddenStates = hiddenStates.Add(ctx, residual)

	residual = hiddenStates
	hiddenStates = m.MLPNNorm.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = m.FeedForward.Forward(ctx, hiddenStates, opts)
	return hiddenStates.Add(ctx, residual)
}

type textAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_output"`
}

func (m *textAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts textOptions) ml.Tensor {
	query := m.Query.Forward(ctx, hiddenStates)
	query = query.Reshape(ctx, opts.headDim(), opts.numHeads, -1)

	key := m.Key.Forward(ctx, hiddenStates)
	key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, -1)

	value := m.Value.Forward(ctx, hiddenStates)
	value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, -1)

	query = opts.applyRotaryPositionalEmbedding(ctx, query, positions)
	key = opts.applyRotaryPositionalEmbedding(ctx, key, positions)

	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
	attention = attention.Reshape(ctx, -1, attention.Dim(2))
	return m.Output.Forward(ctx, attention)
}

type textFeedForward interface {
	Forward(ml.Context, ml.Tensor, textOptions) ml.Tensor
}

type textMoe struct {
	Router        *nn.Linear      `gguf:"ffn_gate_inp"`
	Gate          *nn.LinearBatch `gguf:"ffn_gate_exps"`
	Up            *nn.LinearBatch `gguf:"ffn_up_exps"`
	Down          *nn.LinearBatch `gguf:"ffn_down_exps"`
	SharedExperts *textMLP        `gguf:",suf:_shexp"`
}

func (m *textMoe) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts textOptions) ml.Tensor {
	scores := m.Router.Forward(ctx, hiddenStates).Softmax(ctx)
	indices := scores.TopK(ctx, opts.numExpertsUsed)
	weights := scores.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, indices)

	experts := hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
	experts = m.Gate.Forward(ctx, experts, indices).SILU(ctx, m.Up.Forward(ctx, experts, indices))
	experts = m.Down.Forward(ctx, experts, indices)
	experts = experts.Mul(ctx, weights)

	expert := func(i int) ml.Tensor {
		return experts.View(
			ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2),
		)
	}

	routedStates := expert(0)
	for i := 1; i < opts.numExpertsUsed; i++ {
		routedStates = routedStates.Add(ctx, expert(i))
	}

	sharedStates := m.SharedExperts.Forward(ctx, hiddenStates, opts)
	return routedStates.Add(ctx, sharedStates)
}

type textMLP struct {
	Gate *nn.Linear `gguf:"ffn_gate"`
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
}

func (m *textMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ textOptions) ml.Tensor {
	hiddenStates = m.Gate.Forward(ctx, hiddenStates).SILU(ctx, m.Up.Forward(ctx, hiddenStates))
	return m.Down.Forward(ctx, hiddenStates)
}
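Reviewer note (annotation, not part of the diff): textMoe.Forward routes each token through the top-k experts and weights them by their raw softmax scores, with no renormalization over the selected subset (the weights tensor is gathered straight from the softmax output). A sketch of the same arithmetic on plain floats; route is an illustrative name:

package main

import (
	"fmt"
	"math"
	"sort"
)

// route mirrors the routing in textMoe.Forward: softmax over the router
// logits, select the top-k experts, and reuse their softmax scores directly
// as mixture weights.
func route(logits []float64, k int) (idx []int, w []float64) {
	p := make([]float64, len(logits))
	var sum float64
	for i, l := range logits {
		p[i] = math.Exp(l)
		sum += p[i]
	}
	for i := range p {
		p[i] /= sum
	}

	idx = make([]int, len(p))
	for i := range idx {
		idx[i] = i
	}
	sort.Slice(idx, func(a, b int) bool { return p[idx[a]] > p[idx[b]] })
	idx = idx[:k]

	for _, i := range idx {
		w = append(w, p[i])
	}
	return idx, w
}

func main() {
	idx, w := route([]float64{0.1, 2.0, -1.0, 1.5}, 2)
	fmt.Println(idx, w) // top experts 1 and 3, weighted by their softmax scores
}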
117 model/models/deepseekocr/model_vision.go Normal file
@@ -0,0 +1,117 @@
package deepseekocr

import (
	"math"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
)

type visionModel struct {
	PatchEmbedding    *nn.Conv2D    `gguf:"patch_embd"`
	ClassEmbedding    ml.Tensor     `gguf:"class_embd"`
	PositionEmbedding *nn.Embedding `gguf:"position_embd"`

	PreLayerNorm *nn.LayerNorm `gguf:"pre_layrnorm"`
	Blocks       []visionBlock `gguf:"blk"`

	Options visionOptions
}

func (m *visionModel) absolutePositionEmbedding(ctx ml.Context, embeds ml.Tensor) ml.Tensor {
	numPatches := m.Options.imageSize / m.Options.patchSize * m.Options.imageSize / m.Options.patchSize
	positions := ctx.Arange(0, float32(numPatches+1), 1, ml.DTypeI32)
	positionEmbeds := m.PositionEmbedding.Forward(ctx, positions)

	source := int(math.Sqrt(float64(positionEmbeds.Dim(1) - 1)))
	target := int(math.Sqrt(float64(embeds.Dim(1) - 1)))
	if source != target {
		newPositionEmbeds := positionEmbeds.Slice(ctx, 1, 1, positionEmbeds.Dim(1), 1)
		newPositionEmbeds = newPositionEmbeds.Reshape(ctx, -1, source, source)
		newPositionEmbeds = newPositionEmbeds.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
		newPositionEmbeds = newPositionEmbeds.Interpolate(ctx, [4]int{target, target, embeds.Dim(0), 1}, ml.SamplingModeBilinear)
		newPositionEmbeds = newPositionEmbeds.Permute(ctx, 1, 2, 0, 3)
		newPositionEmbeds = newPositionEmbeds.Contiguous(ctx, -1, target*target)

		positionEmbeds = positionEmbeds.Slice(ctx, 1, 0, 1, 1).Concat(ctx, newPositionEmbeds, 1)
	}

	return positionEmbeds
}

func (m *visionModel) Forward(ctx ml.Context, pixelValues, patchEmbeds ml.Tensor) ml.Tensor {
	if patchEmbeds == nil {
		patchEmbeds = m.PatchEmbedding.Forward(ctx, pixelValues, m.Options.patchSize, m.Options.patchSize, 0, 0, 1, 1)
	}

	patchEmbeds = patchEmbeds.Reshape(ctx, -1, patchEmbeds.Dim(2), patchEmbeds.Dim(3))
	patchEmbeds = patchEmbeds.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

	classEmbeds := m.ClassEmbedding.Repeat(ctx, 2, patchEmbeds.Dim(2))
	embeds := classEmbeds.Concat(ctx, patchEmbeds, 1)
	embeds = embeds.Add(ctx, m.absolutePositionEmbedding(ctx, embeds))

	hiddenStates := m.PreLayerNorm.Forward(ctx, embeds, m.Options.eps)
	for _, block := range m.Blocks {
		hiddenStates = block.Forward(ctx, hiddenStates, m.Options)
	}

	return hiddenStates
}

type visionOptions struct {
	hiddenSize,
	numHeads int
	eps float32

	imageSize, patchSize int
}

func (o visionOptions) headDim() int {
	return o.hiddenSize / o.numHeads
}

type visionBlock struct {
	Norm1       *nn.LayerNorm    `gguf:"layer_norm1"`
	Attention   *visionAttention `gguf:"self_attn"`
	Norm2       *nn.LayerNorm    `gguf:"layer_norm2"`
	FeedForward *visionMLP       `gguf:"mlp"`
}

func (m *visionBlock) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts visionOptions) ml.Tensor {
	residual := hiddenStates
	hiddenStates = m.Norm1.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = m.Attention.Forward(ctx, hiddenStates, opts)
	hiddenStates = hiddenStates.Add(ctx, residual)

	residual = hiddenStates
	hiddenStates = m.Norm2.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = m.FeedForward.Forward(ctx, hiddenStates)
	hiddenStates = hiddenStates.Add(ctx, residual)
	return hiddenStates
}

type visionAttention struct {
	QKV    *nn.Linear `gguf:"qkv_proj"`
	Output *nn.Linear `gguf:"out_proj"`
}

func (m *visionAttention) Forward(ctx ml.Context, t ml.Tensor, opts visionOptions) ml.Tensor {
	qkv := m.QKV.Forward(ctx, t)
	qkv = qkv.Reshape(ctx, opts.headDim(), -1, qkv.Dim(1), qkv.Dim(2))
	chunks := qkv.Chunk(ctx, 1, opts.numHeads)
	query, key, value := chunks[0], chunks[1], chunks[2]

	attention := nn.Attention(ctx, query, key, value, 1/math.Sqrt(float64(opts.headDim())), nil)
	attention = attention.Reshape(ctx, -1, attention.Dim(2), attention.Dim(3))
	return m.Output.Forward(ctx, attention)
}

type visionMLP struct {
	FC1 *nn.Linear `gguf:"fc1"`
	FC2 *nn.Linear `gguf:"fc2"`
}

func (m *visionMLP) Forward(ctx ml.Context, t ml.Tensor) ml.Tensor {
	return m.FC2.Forward(ctx, m.FC1.Forward(ctx, t).QuickGELU(ctx))
}
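Reviewer note (annotation, not part of the diff): absolutePositionEmbedding infers the trained and runtime patch-grid sides from the embedding counts, subtracting one slot for the class token, and bilinearly resizes the table only when they differ. The side-length arithmetic, as a sketch:

package main

import (
	"fmt"
	"math"
)

// gridSide recovers the patch-grid side length from a position-embedding
// table that carries one extra slot for the class token.
func gridSide(numPositions int) int {
	return int(math.Sqrt(float64(numPositions - 1)))
}

func main() {
	fmt.Println(gridSide(257))  // 16 = 224/14, the training-time grid
	fmt.Println(gridSide(1025)) // 32, e.g. a 448px input at patch size 14
}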
@@ -3,6 +3,7 @@ package models
 import (
 	_ "github.com/ollama/ollama/model/models/bert"
 	_ "github.com/ollama/ollama/model/models/deepseek2"
+	_ "github.com/ollama/ollama/model/models/deepseekocr"
 	_ "github.com/ollama/ollama/model/models/gemma2"
 	_ "github.com/ollama/ollama/model/models/gemma3"
 	_ "github.com/ollama/ollama/model/models/gemma3n"
@@ -11,6 +12,7 @@ import (
 	_ "github.com/ollama/ollama/model/models/llama4"
 	_ "github.com/ollama/ollama/model/models/mistral3"
 	_ "github.com/ollama/ollama/model/models/mllama"
+	_ "github.com/ollama/ollama/model/models/nomicbert"
 	_ "github.com/ollama/ollama/model/models/qwen2"
 	_ "github.com/ollama/ollama/model/models/qwen25vl"
 	_ "github.com/ollama/ollama/model/models/qwen3"
170 model/models/nomicbert/model.go Normal file
@@ -0,0 +1,170 @@
package nomicbert

import (
	"cmp"
	"math"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
	"github.com/ollama/ollama/ml/nn/fast"
	"github.com/ollama/ollama/ml/nn/pooling"
	"github.com/ollama/ollama/ml/nn/rope"
	"github.com/ollama/ollama/model"
	"github.com/ollama/ollama/model/input"
)

type Model struct {
	model.Base
	model.TextProcessor

	TokenEmbedding     *nn.Embedding `gguf:"token_embd"`
	TypeEmbedding      *nn.Embedding `gguf:"token_types"`
	TokenEmbeddingNorm *nn.LayerNorm `gguf:"token_embd_norm"`

	Layers []EncoderLayer `gguf:"blk"`

	Options
}

type Options struct {
	hiddenSize   int
	numHeads     int
	headDim      int
	eps          float32
	poolingType  pooling.Type
	normalize    bool
	ropeFreqBase float32
}

// Single Encoder Layer
type EncoderLayer struct {
	*Attention

	AttentionNorm *nn.LayerNorm `gguf:"attn_output_norm"`

	*MLP

	MLPNorm *nn.LayerNorm `gguf:"layer_output_norm"`
}

type Attention struct {
	QKV    *nn.Linear `gguf:"attn_qkv"`
	Output *nn.Linear `gguf:"attn_output"`
}

type MLP struct {
	Gate *nn.Linear `gguf:"ffn_gate"`
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
}

func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)

	typeEmbed := m.TypeEmbedding.Weight.Slice(ctx, 1, 0, 1, 1)
	hiddenStates = hiddenStates.Add(ctx, typeEmbed)

	hiddenStates = m.TokenEmbeddingNorm.Forward(ctx, hiddenStates, m.eps)

	positions := ctx.Input().FromInts(batch.Positions, len(batch.Positions))

	for _, layer := range m.Layers {
		hiddenStates = layer.Forward(ctx, hiddenStates, positions, &m.Options)
	}

	hiddenStates = m.poolingType.Forward(ctx, hiddenStates)

	if m.normalize {
		hiddenStates = hiddenStates.L2Norm(ctx, 1e-12)
	}

	return hiddenStates, nil
}

func (e *EncoderLayer) Forward(ctx ml.Context, hiddenStates ml.Tensor, positions ml.Tensor, opts *Options) ml.Tensor {
	residual := hiddenStates
	hiddenStates = e.Attention.Forward(ctx, hiddenStates, positions, opts)
	hiddenStates = hiddenStates.Add(ctx, residual)
	hiddenStates = e.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)

	residual = hiddenStates
	hiddenStates = e.MLP.Forward(ctx, hiddenStates)
	hiddenStates = hiddenStates.Add(ctx, residual)
	hiddenStates = e.MLPNorm.Forward(ctx, hiddenStates, opts.eps)

	return hiddenStates
}

func (a *Attention) Forward(ctx ml.Context, hiddenStates ml.Tensor, positions ml.Tensor, opts *Options) ml.Tensor {
	batchSize := hiddenStates.Dim(1)

	qkv := a.QKV.Forward(ctx, hiddenStates)

	qkv = qkv.Reshape(ctx, opts.headDim, opts.numHeads*3, batchSize)
	chunks := qkv.Chunk(ctx, 1, opts.numHeads)
	query, key, value := chunks[0], chunks[1], chunks[2]

	query = fast.RoPE(ctx, query, positions, opts.headDim, opts.ropeFreqBase, 1.0, rope.WithTypeNeoX())
	key = fast.RoPE(ctx, key, positions, opts.headDim, opts.ropeFreqBase, 1.0, rope.WithTypeNeoX())

	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(opts.headDim)), nil)

	attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)

	return a.Output.Forward(ctx, attention)
}

func (m *MLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
	hidden := m.Gate.Forward(ctx, hiddenStates).SILU(ctx, m.Up.Forward(ctx, hiddenStates))

	return m.Down.Forward(ctx, hidden)
}

func New(c fs.Config) (model.Model, error) {
	hiddenSize := int(c.Uint("embedding_length"))
	numHeads := int(c.Uint("attention.head_count"))
	headDim := hiddenSize / numHeads

	processor := model.NewWordPiece(
		&model.Vocabulary{
			Values: c.Strings("tokenizer.ggml.tokens"),
			Scores: c.Floats("tokenizer.ggml.scores"),
			Types:  c.Ints("tokenizer.ggml.token_type"),
			AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
			BOS: []int32{
				int32(cmp.Or(
					c.Uint("tokenizer.ggml.cls_token_id"),
					c.Uint("tokenizer.ggml.bos_token_id"),
				)),
			},
			AddEOS: c.Bool("tokenizer.ggml.add_eos_token", true),
			EOS: []int32{
				int32(cmp.Or(
					c.Uint("tokenizer.ggml.separator_token_id"),
					c.Uint("tokenizer.ggml.eos_token_id"),
				)),
			},
		},
		false,
	)

	return &Model{
		TextProcessor: processor,
		Layers:        make([]EncoderLayer, c.Uint("block_count")),
		Options: Options{
			hiddenSize:   hiddenSize,
			numHeads:     numHeads,
			headDim:      headDim,
			eps:          c.Float("attention.layer_norm_epsilon"),
			poolingType:  pooling.Type(c.Uint("pooling_type")),
			normalize:    c.Bool("normalize_embeddings", false),
			ropeFreqBase: c.Float("rope.freq_base", 1000.0),
		},
	}, nil
}

func init() {
	model.Register("nomic-bert", New)
	model.Register("nomic-bert_embed", New)
}
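Reviewer note (annotation, not part of the diff): Attention.Forward reshapes the fused QKV projection to (headDim, 3*numHeads, tokens) and chunks dim 1 into groups of numHeads, yielding query, key, and value. A flat-slice sketch of the same split:

package main

import "fmt"

func main() {
	const headDim, numHeads = 2, 3
	// one token's fused QKV activations, laid out q|k|v per head group
	fused := make([]int, headDim*3*numHeads)
	for i := range fused {
		fused[i] = i
	}
	size := headDim * numHeads // elements per chunk
	q, k, v := fused[:size], fused[size:2*size], fused[2*size:]
	fmt.Println(q, k, v) // [0..5] [6..11] [12..17]
}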
129 model/renderers/cogito.go Normal file
@@ -0,0 +1,129 @@
package renderers

import (
	"encoding/json"
	"strings"

	"github.com/ollama/ollama/api"
)

type CogitoRenderer struct {
	isThinking bool
}

func (r *CogitoRenderer) Render(messages []api.Message, tools []api.Tool, thinkValue *api.ThinkValue) (string, error) {
	var sb strings.Builder

	defaultPrompt := "You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco."

	// thinking is enabled: model must support it AND user must request it (true)
	enableThinking := r.isThinking && (thinkValue != nil && thinkValue.Bool())

	var systemPrompt string
	var conversationMessages []api.Message

	if len(messages) > 0 && messages[0].Role == "system" {
		systemPrompt = messages[0].Content
		conversationMessages = messages[1:]
	} else {
		conversationMessages = messages
	}

	var finalSystemPrompt string
	if enableThinking {
		finalSystemPrompt = "Enable deep thinking subroutine.\n\n" + defaultPrompt
		if systemPrompt != "" {
			finalSystemPrompt += "\n\n" + systemPrompt + "\n\n"
		}
	} else {
		finalSystemPrompt = defaultPrompt
		if systemPrompt != "" {
			finalSystemPrompt += "\n\n" + systemPrompt
		}
	}

	if len(tools) > 0 {
		if finalSystemPrompt != "" {
			finalSystemPrompt += "\nYou have the following functions available:\n"
		} else {
			finalSystemPrompt = "You have the following functions available:\n"
		}

		for _, tool := range tools {
			toolJSON, _ := json.MarshalIndent(tool, "", " ") // TODO(gguo): double check json format
			finalSystemPrompt += "```json\n" + string(toolJSON) + "\n```\n"
		}
	}

	sb.WriteString("<|begin▁of▁sentence|>" + finalSystemPrompt)

	outputsOpen := false
	isLastUser := false

	for i, message := range conversationMessages {
		switch message.Role {
		case "user":
			isLastUser = true
			sb.WriteString("<|User|>" + message.Content + "<|Assistant|>")

		case "assistant":
			isLastUser = false

			if len(message.ToolCalls) > 0 {
				if message.Content != "" {
					sb.WriteString(message.Content)
				}

				sb.WriteString("<|tool▁calls▁begin|>")

				for j, toolCall := range message.ToolCalls {
					sb.WriteString("<|tool▁call▁begin|>function<|tool▁sep|>" + toolCall.Function.Name)

					argsJSON, _ := json.Marshal(toolCall.Function.Arguments)
					sb.WriteString("\n```json\n" + string(argsJSON) + "\n```")
					sb.WriteString("<|tool▁call▁end|>")

					if j < len(message.ToolCalls)-1 {
						sb.WriteString("\n")
					}
				}

				sb.WriteString("<|tool▁calls▁end|><|end▁of▁sentence|>")
			} else {
				sb.WriteString(message.Content + "<|end▁of▁sentence|>")
			}

		case "tool":
			isLastUser = false

			if !outputsOpen {
				sb.WriteString("<|tool▁outputs▁begin|>")
				outputsOpen = true
			}

			sb.WriteString("<|tool▁output▁begin|>" + message.Content + "<|tool▁output▁end|>")

			hasNextTool := i+1 < len(conversationMessages) && conversationMessages[i+1].Role == "tool"
			if hasNextTool {
				sb.WriteString("\n")
			} else {
				sb.WriteString("<|tool▁outputs▁end|>")
				outputsOpen = false
			}
		}
	}

	if outputsOpen {
		sb.WriteString("<|tool▁outputs▁end|>")
	}

	if !isLastUser {
		sb.WriteString("<|Assistant|>")
	}

	if enableThinking {
		sb.WriteString("<think>\n")
	}

	return sb.String(), nil
}
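Reviewer note (annotation, not part of the diff): consecutive "tool" messages are wrapped in a single outputs block — the block opens at the first tool message and closes after the last consecutive one, with newlines only between outputs. A sketch of just that grouping rule, using simplified stand-in tags rather than the renderer's special tokens:

package main

import (
	"fmt"
	"strings"
)

// groupToolOutputs mirrors the outputsOpen bookkeeping in Render: one block
// wraps each run of consecutive tool messages. The <outputs>/<out> tags are
// simplified stand-ins for the renderer's special tokens.
func groupToolOutputs(messages []struct{ Role, Content string }) string {
	var sb strings.Builder
	open := false
	for i, m := range messages {
		if m.Role != "tool" {
			continue
		}
		if !open {
			sb.WriteString("<outputs>")
			open = true
		}
		sb.WriteString("<out>" + m.Content + "</out>")
		if i+1 < len(messages) && messages[i+1].Role == "tool" {
			sb.WriteString("\n")
		} else {
			sb.WriteString("</outputs>")
			open = false
		}
	}
	return sb.String()
}

func main() {
	fmt.Println(groupToolOutputs([]struct{ Role, Content string }{
		{"user", "weather?"},
		{"tool", "Paris: 22°C"},
		{"tool", "London: 18°C"},
		{"assistant", "done"},
	}))
}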
491 model/renderers/cogito_test.go Normal file
@@ -0,0 +1,491 @@
package renderers

import (
	"testing"

	"github.com/google/go-cmp/cmp"

	"github.com/ollama/ollama/api"
)

func TestCogitoRenderer(t *testing.T) {
	tests := []struct {
		name       string
		messages   []api.Message
		tools      []api.Tool
		thinkValue *api.ThinkValue
		expected   string
	}{
		{
			name: "basic user message",
			messages: []api.Message{
				{Role: "user", Content: "Hello, how are you?"},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected:   `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>Hello, how are you?<|Assistant|>`,
		},
		{
			name: "basic with system message",
			messages: []api.Message{
				{Role: "system", Content: "You are a helpful assistant."},
				{Role: "user", Content: "Hello, how are you?"},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected: `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.

You are a helpful assistant.<|User|>Hello, how are you?<|Assistant|>`,
		},
		{
			name: "conversation with assistant response",
			messages: []api.Message{
				{Role: "user", Content: "What is the capital of France?"},
				{Role: "assistant", Content: "The capital of France is Paris."},
				{Role: "user", Content: "Fantastic!"},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected:   `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>What is the capital of France?<|Assistant|>The capital of France is Paris.<|end▁of▁sentence|><|User|>Fantastic!<|Assistant|>`,
		},
		{
			name: "thinking enabled without system",
			messages: []api.Message{
				{Role: "user", Content: "Hello, how are you?"},
			},
			thinkValue: &api.ThinkValue{Value: true},
			expected: `<|begin▁of▁sentence|>Enable deep thinking subroutine.

You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>Hello, how are you?<|Assistant|><think>
`,
		},
		{
			name: "thinking enabled with system",
			messages: []api.Message{
				{Role: "system", Content: "You are a helpful assistant."},
				{Role: "user", Content: "Hello, how are you?"},
			},
			thinkValue: &api.ThinkValue{Value: true},
			expected: `<|begin▁of▁sentence|>Enable deep thinking subroutine.

You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.

You are a helpful assistant.

<|User|>Hello, how are you?<|Assistant|><think>
`,
		},
		{
			name: "thinking disabled",
			messages: []api.Message{
				{Role: "user", Content: "Hello, how are you?"},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected:   `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>Hello, how are you?<|Assistant|>`,
		},
		{
			name: "with tools",
			messages: []api.Message{
				{Role: "user", Content: "What's the weather like?"},
			},
			thinkValue: &api.ThinkValue{Value: false},
			tools: []api.Tool{
				{
					Type: "function",
					Function: api.ToolFunction{
						Name:        "get_weather",
						Description: "Get current weather",
						Parameters: api.ToolFunctionParameters{
							Type: "object",
							Properties: map[string]api.ToolProperty{
								"location": {
									Type:        api.PropertyType{"string"},
									Description: "City name",
								},
							},
							Required: []string{"location"},
						},
					},
				},
			},
			expected: `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.
You have the following functions available:
` + "```json\n" + `{
 "type": "function",
 "function": {
  "name": "get_weather",
  "description": "Get current weather",
  "parameters": {
   "type": "object",
   "required": [
    "location"
   ],
   "properties": {
    "location": {
     "type": "string",
     "description": "City name"
    }
   }
  }
 }
}
` + "```\n" + `<|User|>What's the weather like?<|Assistant|>`,
		},
		{
			name: "assistant with tool calls",
			messages: []api.Message{
				{Role: "user", Content: "What's the weather in Paris?"},
				{
					Role:    "assistant",
					Content: "I'll check the weather in Paris for you.",
					ToolCalls: []api.ToolCall{
						{
							Function: api.ToolCallFunction{
								Name: "get_weather",
								Arguments: api.ToolCallFunctionArguments{
									"location": "Paris",
								},
							},
						},
					},
				},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected: `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>What's the weather in Paris?<|Assistant|>I'll check the weather in Paris for you.<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_weather
` + "```json\n" + `{"location":"Paris"}
` + "```" + `<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|><|Assistant|>`,
		},
		{
			name: "tool response",
			messages: []api.Message{
				{Role: "user", Content: "What's the weather in Paris?"},
				{
					Role: "assistant",
					ToolCalls: []api.ToolCall{
						{
							Function: api.ToolCallFunction{
								Name: "get_weather",
								Arguments: api.ToolCallFunctionArguments{
									"location": "Paris",
								},
							},
						},
					},
				},
				{Role: "tool", Content: "Temperature: 22°C, Sunny"},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected: `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>What's the weather in Paris?<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_weather
` + "```json\n" + `{"location":"Paris"}
` + "```" + `<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|><|tool▁outputs▁begin|><|tool▁output▁begin|>Temperature: 22°C, Sunny<|tool▁output▁end|><|tool▁outputs▁end|><|Assistant|>`,
		},
		{
			name: "multiple tool responses",
			messages: []api.Message{
				{Role: "user", Content: "Get weather for Paris and London"},
				{
					Role: "assistant",
					ToolCalls: []api.ToolCall{
						{
							Function: api.ToolCallFunction{
								Name: "get_weather",
								Arguments: api.ToolCallFunctionArguments{
									"location": "Paris",
								},
							},
						},
						{
							Function: api.ToolCallFunction{
								Name: "get_weather",
								Arguments: api.ToolCallFunctionArguments{
									"location": "London",
								},
							},
						},
					},
				},
				{Role: "tool", Content: "Paris: 22°C, Sunny"},
				{Role: "tool", Content: "London: 18°C, Cloudy"},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected: `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>Get weather for Paris and London<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_weather
` + "```json\n" + `{"location":"Paris"}
` + "```" + `<|tool▁call▁end|>
<|tool▁call▁begin|>function<|tool▁sep|>get_weather
` + "```json\n" + `{"location":"London"}
` + "```" + `<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|><|tool▁outputs▁begin|><|tool▁output▁begin|>Paris: 22°C, Sunny<|tool▁output▁end|>
<|tool▁output▁begin|>London: 18°C, Cloudy<|tool▁output▁end|><|tool▁outputs▁end|><|Assistant|>`,
		},
		{
			name: "thinking with tools",
			messages: []api.Message{
				{Role: "user", Content: "What's the weather like?"},
			},
			tools: []api.Tool{
				{
					Type: "function",
					Function: api.ToolFunction{
						Name:        "get_weather",
						Description: "Get current weather",
						Parameters: api.ToolFunctionParameters{
							Type: "object",
							Properties: map[string]api.ToolProperty{
								"location": {
									Type:        api.PropertyType{"string"},
									Description: "City name",
								},
							},
							Required: []string{"location"},
						},
					},
				},
			},
			thinkValue: &api.ThinkValue{Value: true},
			expected: `<|begin▁of▁sentence|>Enable deep thinking subroutine.

You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.
You have the following functions available:
` + "```json\n" + `{
 "type": "function",
 "function": {
  "name": "get_weather",
  "description": "Get current weather",
  "parameters": {
   "type": "object",
   "required": [
    "location"
   ],
   "properties": {
    "location": {
     "type": "string",
     "description": "City name"
    }
   }
  }
 }
}
` + "```\n" + `<|User|>What's the weather like?<|Assistant|><think>
`,
		},
		// test cases based on cogito
		{
			name: "single_turn_thinking_false",
			messages: []api.Message{
				{Role: "user", Content: "Hello"},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected:   `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>Hello<|Assistant|>`,
		},
		{
			name: "single_turn_thinking_true",
			messages: []api.Message{
				{Role: "user", Content: "Hello"},
			},
			thinkValue: &api.ThinkValue{Value: true},
			expected: `<|begin▁of▁sentence|>Enable deep thinking subroutine.

You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>Hello<|Assistant|><think>
`,
		},
		{
			name: "multi_turn_thinking_false",
			messages: []api.Message{
				{Role: "user", Content: "Hello"},
				{Role: "assistant", Content: "Hi there!"},
				{Role: "user", Content: "How are you?"},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected:   `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>Hello<|Assistant|>Hi there!<|end▁of▁sentence|><|User|>How are you?<|Assistant|>`,
		},
		{
			name: "multi_turn_thinking_true",
			messages: []api.Message{
				{Role: "user", Content: "Hello"},
				{Role: "assistant", Content: "Hi there!"},
				{Role: "user", Content: "How are you?"},
			},
			thinkValue: &api.ThinkValue{Value: true},
			expected: `<|begin▁of▁sentence|>Enable deep thinking subroutine.

You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>Hello<|Assistant|>Hi there!<|end▁of▁sentence|><|User|>How are you?<|Assistant|><think>
`,
		},
		{
			name: "multi_with_system_thinking_false",
			messages: []api.Message{
				{Role: "system", Content: "You are a helpful assistant"},
				{Role: "user", Content: "Start"},
				{Role: "assistant", Content: "Okay"},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected: `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.

You are a helpful assistant<|User|>Start<|Assistant|>Okay<|end▁of▁sentence|><|Assistant|>`,
		},
		{
			name: "multi_with_system_thinking_true",
			messages: []api.Message{
				{Role: "system", Content: "You are a helpful assistant"},
				{Role: "user", Content: "Start"},
				{Role: "assistant", Content: "Okay"},
			},
			thinkValue: &api.ThinkValue{Value: true},
			expected: `<|begin▁of▁sentence|>Enable deep thinking subroutine.

You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.

You are a helpful assistant

<|User|>Start<|Assistant|>Okay<|end▁of▁sentence|><|Assistant|><think>
`,
		},
		{
			name: "multi_with_system2_thinking_false",
			messages: []api.Message{
				{Role: "system", Content: "You are a pirate chatbot who always responds in pirate speak!"},
				{Role: "user", Content: "Give me a short introduction to LLMs."},
				{Role: "assistant", Content: "Arrr! I'm a pirate"},
				{Role: "user", Content: "Tell me more about LLMs."},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected: `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.

You are a pirate chatbot who always responds in pirate speak!<|User|>Give me a short introduction to LLMs.<|Assistant|>Arrr! I'm a pirate<|end▁of▁sentence|><|User|>Tell me more about LLMs.<|Assistant|>`,
		},
		{
			name: "multi_with_system2_thinking_true",
			messages: []api.Message{
				{Role: "system", Content: "You are a pirate chatbot who always responds in pirate speak!"},
				{Role: "user", Content: "Give me a short introduction to LLMs."},
				{Role: "assistant", Content: "Arrr! I'm a pirate"},
				{Role: "user", Content: "Tell me more about LLMs."},
			},
			thinkValue: &api.ThinkValue{Value: true},
			expected: `<|begin▁of▁sentence|>Enable deep thinking subroutine.

You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.

You are a pirate chatbot who always responds in pirate speak!

<|User|>Give me a short introduction to LLMs.<|Assistant|>Arrr! I'm a pirate<|end▁of▁sentence|><|User|>Tell me more about LLMs.<|Assistant|><think>
`,
		},
		// tools
		{
			name: "tool_calls_only_no_content",
			messages: []api.Message{
				{Role: "user", Content: "Get weather for Paris"},
				{
					Role: "assistant",
					ToolCalls: []api.ToolCall{
						{
							Function: api.ToolCallFunction{
								Name: "get_weather",
								Arguments: api.ToolCallFunctionArguments{
									"location": "Paris",
								},
							},
						},
					},
				},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected: `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>Get weather for Paris<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_weather
` + "```json\n" + `{"location":"Paris"}
` + "```" + `<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|><|Assistant|>`,
		},
		{
			name: "complex_tool_arguments",
			messages: []api.Message{
				{Role: "user", Content: "Process complex data"},
				{
					Role: "assistant",
					ToolCalls: []api.ToolCall{
						{
							Function: api.ToolCallFunction{
								Name: "process_data",
								Arguments: api.ToolCallFunctionArguments{
									"items": []any{"item1", "item2", "item3"},
									"config": map[string]any{
										"enabled":   true,
										"threshold": 0.95,
										"tags":      []string{"important", "urgent"},
									},
								},
							},
						},
					},
				},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected: `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>Process complex data<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>process_data
` + "```json\n" + `{"config":{"enabled":true,"tags":["important","urgent"],"threshold":0.95},"items":["item1","item2","item3"]}
` + "```" + `<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|><|Assistant|>`,
		},
		{
			name: "empty_messages",
			messages: []api.Message{
				{Role: "system", Content: ""},
				{Role: "user", Content: "Hello"},
				{Role: "assistant", Content: ""},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected:   `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>Hello<|Assistant|><|end▁of▁sentence|><|Assistant|>`,
		},
		{
			name: "thinking_with_empty_assistant_content",
			messages: []api.Message{
				{Role: "user", Content: "Think about this"},
				{Role: "assistant", Content: ""},
			},
			thinkValue: &api.ThinkValue{Value: true},
			expected: `<|begin▁of▁sentence|>Enable deep thinking subroutine.

You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>Think about this<|Assistant|><|end▁of▁sentence|><|Assistant|><think>
`,
		},
		{
			name: "multiple_system_messages",
			messages: []api.Message{
				{Role: "system", Content: "First instruction"},
				{Role: "system", Content: "Second instruction"},
				{Role: "user", Content: "Hello"},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected: `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.

First instruction<|User|>Hello<|Assistant|>`,
		},
		{
			name: "special_characters_in_content",
			messages: []api.Message{
				{Role: "user", Content: "What about <|special|> tokens and \"quotes\"?"},
				{Role: "assistant", Content: "They're handled normally in content."},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected:   `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>What about <|special|> tokens and "quotes"?<|Assistant|>They're handled normally in content.<|end▁of▁sentence|><|Assistant|>`,
		},
		{
			name: "long_conversation_multiple_rounds",
			messages: []api.Message{
				{Role: "user", Content: "Hi"},
				{Role: "assistant", Content: "Hello!"},
				{Role: "user", Content: "How are you?"},
				{Role: "assistant", Content: "Good, thanks!"},
				{Role: "user", Content: "What's the weather?"},
			},
			thinkValue: &api.ThinkValue{Value: false},
			expected:   `<|begin▁of▁sentence|>You are Cogito, an AI assistant created by Deep Cogito, which is an AI research lab based in San Francisco.<|User|>Hi<|Assistant|>Hello!<|end▁of▁sentence|><|User|>How are you?<|Assistant|>Good, thanks!<|end▁of▁sentence|><|User|>What's the weather?<|Assistant|>`,
		},
	}

	renderer := &CogitoRenderer{isThinking: true}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			rendered, err := renderer.Render(tt.messages, tt.tools, tt.thinkValue)
			if err != nil {
				t.Fatalf("Render() error = %v", err)
			}
			if diff := cmp.Diff(tt.expected, rendered); diff != "" {
				t.Errorf("Render() mismatch (-want +got):\n%s", diff)
			}
		})
	}
}
@@ -56,6 +56,9 @@ func rendererForName(name string) Renderer {
 	case "qwen3-vl-thinking":
 		renderer := &Qwen3VLRenderer{isThinking: true, useImgTags: RenderImgTags}
 		return renderer
+	case "cogito":
+		renderer := &CogitoRenderer{isThinking: true}
+		return renderer
 	default:
 		return nil
 	}
@@ -10,7 +10,8 @@ import (
 )
 
 type WordPiece struct {
-	vocab *Vocabulary
+	vocab     *Vocabulary
+	lowercase bool
 }
 
 // ggmlPrefix is the prefix used by GGML vocabularies to indicate word boundaries.
@@ -114,8 +115,10 @@ func (wpm WordPiece) Encode(s string, addSpecial bool) ([]int32, error) {
 				subword = ggmlPrefix + subword
 			}
 
-			// TODO: some models might not want [ToLower]
-			piece = wpm.vocab.Encode(strings.ToLower(subword))
+			if wpm.lowercase {
+				subword = strings.ToLower(subword)
+			}
+			piece = wpm.vocab.Encode(subword)
 			if piece >= 0 {
 				break
 			}
@@ -160,8 +163,9 @@ func (wpm WordPiece) Vocabulary() *Vocabulary {
 
 var _ TextProcessor = (*WordPiece)(nil)
 
-func NewWordPiece(vocab *Vocabulary) WordPiece {
+func NewWordPiece(vocab *Vocabulary, lowercase bool) WordPiece {
 	return WordPiece{
-		vocab: vocab,
+		vocab:     vocab,
+		lowercase: lowercase,
 	}
 }
@@ -15,7 +15,9 @@ func TestWordPiece(t *testing.T) {
 			AddEOS: true,
 			BOS:    []int32{1},
 			EOS:    []int32{2},
-		})
+		},
+		true, // lowercase
+	)
 
 	ids, err := wpm.Encode("Hello world!", true)
 	if err != nil {
123 parser/expandpath_test.go Normal file
@@ -0,0 +1,123 @@
package parser

import (
	"os"
	"os/user"
	"path/filepath"
	"runtime"
	"testing"
)

func TestExpandPath(t *testing.T) {
	mockCurrentUser := func() (*user.User, error) {
		return &user.User{
			Username: "testuser",
			HomeDir: func() string {
				if os.PathSeparator == '\\' {
					return filepath.FromSlash("D:/home/testuser")
				}
				return "/home/testuser"
			}(),
		}, nil
	}

	mockLookupUser := func(username string) (*user.User, error) {
		fakeUsers := map[string]string{
			"testuser": func() string {
				if os.PathSeparator == '\\' {
					return filepath.FromSlash("D:/home/testuser")
				}
				return "/home/testuser"
			}(),
			"anotheruser": func() string {
				if os.PathSeparator == '\\' {
					return filepath.FromSlash("D:/home/anotheruser")
				}
				return "/home/anotheruser"
			}(),
		}

		if homeDir, ok := fakeUsers[username]; ok {
			return &user.User{
				Username: username,
				HomeDir:  homeDir,
			}, nil
		}
		return nil, os.ErrNotExist
	}

	pwd, err := os.Getwd()
	if err != nil {
		t.Fatal(err)
	}

	t.Run("unix tests", func(t *testing.T) {
		if runtime.GOOS == "windows" {
			return
		}

		tests := []struct {
			path        string
			relativeDir string
			expected    string
			shouldErr   bool
		}{
			{"~", "", "/home/testuser", false},
			{"~/myfolder/myfile.txt", "", "/home/testuser/myfolder/myfile.txt", false},
			{"~anotheruser/docs/file.txt", "", "/home/anotheruser/docs/file.txt", false},
			{"~nonexistentuser/file.txt", "", "", true},
			{"relative/path/to/file", "", filepath.Join(pwd, "relative/path/to/file"), false},
			{"/absolute/path/to/file", "", "/absolute/path/to/file", false},
			{"/absolute/path/to/file", "someotherdir/", "/absolute/path/to/file", false},
			{".", pwd, pwd, false},
			{".", "", pwd, false},
			{"somefile", "somedir", filepath.Join(pwd, "somedir", "somefile"), false},
		}

		for _, test := range tests {
			result, err := expandPathImpl(test.path, test.relativeDir, mockCurrentUser, mockLookupUser)
			if (err != nil) != test.shouldErr {
				t.Errorf("expandPathImpl(%q) returned error: %v, expected error: %v", test.path, err != nil, test.shouldErr)
			}

			if result != test.expected && !test.shouldErr {
				t.Errorf("expandPathImpl(%q) = %q, want %q", test.path, result, test.expected)
			}
		}
	})

	t.Run("windows tests", func(t *testing.T) {
		if runtime.GOOS != "windows" {
			return
		}

		tests := []struct {
			path        string
			relativeDir string
			expected    string
			shouldErr   bool
		}{
			{"~", "", "D:\\home\\testuser", false},
			{"~/myfolder/myfile.txt", "", "D:\\home\\testuser\\myfolder\\myfile.txt", false},
			{"~anotheruser/docs/file.txt", "", "D:\\home\\anotheruser\\docs\\file.txt", false},
			{"~nonexistentuser/file.txt", "", "", true},
			{"relative\\path\\to\\file", "", filepath.Join(pwd, "relative\\path\\to\\file"), false},
			{"D:\\absolute\\path\\to\\file", "", "D:\\absolute\\path\\to\\file", false},
			{"D:\\absolute\\path\\to\\file", "someotherdir/", "D:\\absolute\\path\\to\\file", false},
			{".", pwd, pwd, false},
			{".", "", pwd, false},
			{"somefile", "somedir", filepath.Join(pwd, "somedir", "somefile"), false},
		}

		for _, test := range tests {
			result, err := expandPathImpl(test.path, test.relativeDir, mockCurrentUser, mockLookupUser)
			if (err != nil) != test.shouldErr {
				t.Errorf("expandPathImpl(%q) returned error: %v, expected error: %v", test.path, err != nil, test.shouldErr)
			}

			if result != test.expected && !test.shouldErr {
				t.Errorf("expandPathImpl(%q) = %q, want %q", test.path, result, test.expected)
			}
		}
	})
}
@@ -620,43 +620,43 @@ func isValidCommand(cmd string) bool {
	}
}

func expandPath(path, dir string) (string, error) {
	if filepath.IsAbs(path) {
		return path, nil
	}
func expandPathImpl(path, relativeDir string, currentUserFunc func() (*user.User, error), lookupUserFunc func(string) (*user.User, error)) (string, error) {
	if filepath.IsAbs(path) || strings.HasPrefix(path, "\\") || strings.HasPrefix(path, "/") {
		return filepath.Abs(path)
	} else if strings.HasPrefix(path, "~") {
		var homeDir string

	path, found := strings.CutPrefix(path, "~")
	switch {
	case !found:
		// make path relative to dir
		if !filepath.IsAbs(dir) {
			// if dir is relative, make it absolute relative to cwd
			cwd, err := os.Getwd()
		if path == "~" || strings.HasPrefix(path, "~/") {
			// Current user's home directory
			currentUser, err := currentUserFunc()
			if err != nil {
				return "", err
				return "", fmt.Errorf("failed to get current user: %w", err)
			}
			homeDir = currentUser.HomeDir
			path = strings.TrimPrefix(path, "~")
		} else {
			// Specific user's home directory
			parts := strings.SplitN(path[1:], "/", 2)
			userInfo, err := lookupUserFunc(parts[0])
			if err != nil {
				return "", fmt.Errorf("failed to find user '%s': %w", parts[0], err)
			}
			homeDir = userInfo.HomeDir
			if len(parts) > 1 {
				path = "/" + parts[1]
			} else {
				path = ""
			}
			dir = filepath.Join(cwd, dir)
		}
		path = filepath.Join(dir, path)
	case filepath.IsLocal(path):
		// ~<user>/...
		// make path relative to specified user's home
		split := strings.SplitN(path, string(os.PathSeparator), 2)
		u, err := user.Lookup(split[0])
		if err != nil {
			return "", err
		}
		split[0] = u.HomeDir
		path = filepath.Join(split...)
	default:
		// ~ or ~/...
		// make path relative to current user's home
		home, err := os.UserHomeDir()
		if err != nil {
			return "", err
		}
		path = filepath.Join(home, path)

		path = filepath.Join(homeDir, path)
	} else {
		path = filepath.Join(relativeDir, path)
	}

	return filepath.Clean(path), nil
	return filepath.Abs(path)
}

func expandPath(path, relativeDir string) (string, error) {
	return expandPathImpl(path, relativeDir, user.Current, user.Lookup)
}
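Net effect of this hunk: expandPathImpl resolves three cases — "~" and "~/..." expand to the current user's home, "~user/..." expands to that user's home via lookup, and everything else is joined to relativeDir — with filepath.Abs resolving any remaining relative path against the working directory. A minimal usage sketch, assuming the calls run inside the same package as expandPath and fmt is imported (the demo function itself is hypothetical; expected values follow the unix test table above):

func demoExpandPath() {
	// "~/..." resolves against the current user's home directory.
	p, err := expandPath("~/myfolder/myfile.txt", "")
	fmt.Println(p, err) // e.g. /home/testuser/myfolder/myfile.txt <nil>

	// Plain relative paths are joined to relativeDir, then made absolute
	// against the working directory by filepath.Abs.
	p, err = expandPath("somefile", "somedir")
	fmt.Println(p, err) // e.g. <cwd>/somedir/somefile <nil>

	// Absolute paths pass through unchanged; relativeDir is ignored.
	p, err = expandPath("/absolute/path/to/file", "someotherdir/")
	fmt.Println(p, err) // /absolute/path/to/file <nil>
}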
@@ -9,9 +9,7 @@ import (
	"io"
	"maps"
	"os"
	"os/user"
	"path/filepath"
	"runtime"
	"strings"
	"testing"
	"unicode/utf16"
@@ -1128,62 +1126,3 @@ func TestFilesForModel(t *testing.T) {
		})
	}
}

func TestExpandPath(t *testing.T) {
	home := t.TempDir()
	t.Setenv("HOME", home)
	t.Setenv("USERPROFILE", home)

	cwd, err := os.Getwd()
	if err != nil {
		t.Fatal(err)
	}

	u, err := user.Current()
	if err != nil {
		t.Fatal(err)
	}

	volume := ""
	if runtime.GOOS == "windows" {
		volume = "D:"
	}

	cases := []struct {
		input,
		dir,
		want string
		err error
	}{
		{"~", "", home, nil},
		{"~/path/to/file", "", filepath.Join(home, filepath.ToSlash("path/to/file")), nil},
		{"~" + u.Username + "/path/to/file", "", filepath.Join(u.HomeDir, filepath.ToSlash("path/to/file")), nil},
		{"~nonexistentuser/path/to/file", "", "", user.UnknownUserError("nonexistentuser")},
		{"relative/path/to/file", "", filepath.Join(cwd, filepath.ToSlash("relative/path/to/file")), nil},
		{volume + "/absolute/path/to/file", "", filepath.ToSlash(volume + "/absolute/path/to/file"), nil},
		{volume + "/absolute/path/to/file", filepath.ToSlash("another/path"), filepath.ToSlash(volume + "/absolute/path/to/file"), nil},
		{".", cwd, cwd, nil},
		{".", "", cwd, nil},
		{"", cwd, cwd, nil},
		{"", "", cwd, nil},
		{"file", "path/to", filepath.Join(cwd, filepath.ToSlash("path/to/file")), nil},
	}

	for _, tt := range cases {
		t.Run(tt.input, func(t *testing.T) {
			got, err := expandPath(tt.input, tt.dir)
			// On Windows, user.Lookup does not map syscall errors to user.UnknownUserError
			// so we special case the test to just check for an error.
			// See https://cs.opensource.google/go/go/+/refs/tags/go1.25.1:src/os/user/lookup_windows.go;l=455
			if runtime.GOOS != "windows" && !errors.Is(err, tt.err) {
				t.Fatalf("expandPath(%q) error = %v, wantErr %v", tt.input, err, tt.err)
			} else if tt.err != nil && err == nil {
				t.Fatal("test case expected to fail on windows")
			}

			if got != tt.want {
				t.Errorf("expandPath(%q) = %v, want %v", tt.input, got, tt.want)
			}
		})
	}
}
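The platform caveat noted in the removed test is worth spelling out: on unix, user.Lookup returns a user.UnknownUserError directly, and since that type is a comparable string, errors.Is can match it by value; on Windows the underlying syscall error is not mapped to that type, so the match fails and only a generic error check is possible. A small illustrative snippet, assuming errors, fmt, and os/user are imported:

	// user.Lookup on a missing account; the error shape differs by platform.
	if _, err := user.Lookup("nonexistentuser"); err != nil {
		matched := errors.Is(err, user.UnknownUserError("nonexistentuser"))
		fmt.Println(matched) // true on unix; false on windows (unmapped syscall error)
	}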